
I am supporting a system that uses code generated by Rattle to make predictions. Looking closely at the output, however, the code does not appear to predict anything: it simply returns the input file unchanged, even though I expect predictions to be made on new data. Could an expert please have a look at the attached code and tell me what could be wrong? The person who created it has since left the company, and I am absolutely new to R, so I would appreciate any help. The code is given below:

# Rattle is Copyright (c) 2006-2014 Togaware Pty Ltd.

#============================================================
# Rattle timestamp: 2015-08-03 08:32:05 x86_64-w64-mingw32 

# Rattle version 3.4.1 user 'vikashs'

# Export this log textview to a file using the Export button or the Tools 
# menu to save a log of all activity. This facilitates repeatability. Exporting 
# to file 'myrf01.R', for example, allows us to type in the R Console 
# the command source('myrf01.R') to repeat the process automatically. 
# Generally, we may want to edit the file to suit our needs. We can also directly 
# edit this current log textview to record additional information before exporting. 

# Saving and loading projects also retains this log.

library(rattle)
library(pROC)
library(ggplot2)
library(randomForest)
library(ROCR)

# This log generally records the process of building a model. However, with very 
# little effort the log can be used to score a new dataset. The logical variable 
# 'building' is used to toggle between generating transformations, as when building 
# a model, and simply using the transformations, as when scoring a dataset.

predictFraud <- function(inputVector) {

building <- TRUE
scoring  <- ! building

# The colorspace package is used to generate the colours used in plots, if available.

library(colorspace)

# A pre-defined value is used to reset the random seed so that results are repeatable.
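# 'crs' and 'crv' are environments created when the rattle package is loaded;
# the generated script keeps all of its state (data, models, predictions) in them.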

crv$seed <- 42 

#============================================================
# Rattle timestamp: 2015-08-03 08:32:18 x86_64-w64-mingw32 

# Load the data.

crs$dataset <- read.csv(java_file1, na.strings=c(".", "NA", "", "?"), strip.white=TRUE, encoding="UTF-8")
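# NOTE: 'java_file1' is not defined anywhere in this script; it must already exist
# in the environment that calls predictFraud (presumably set by the Java/UI layer).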

#============================================================
# Rattle timestamp: 2015-08-03 08:32:30 x86_64-w64-mingw32 

# Note the user selections. 

# Build the training/validate/test datasets.

set.seed(crv$seed) 
crs$nobs <- nrow(crs$dataset) # 1000 observations 
crs$sample <- crs$train <- sample(nrow(crs$dataset), 0.7*crs$nobs) # 700 observations
crs$validate <- NULL
crs$test <- setdiff(setdiff(seq_len(nrow(crs$dataset)), crs$train), crs$validate) # 300 observations

# The following variable selections have been noted.

crs$input <- inputVector

crs$numeric <- c("TRANS_AMOUNT", "CUST_ACC_LIABLTY", "CUST_AGE", "CUST_SALARY")

crs$categoric <- c("ACTVTY_TYP_NAME", "TRNS_TYP_NAME", "TRNS_CTGRY_NAME", "TRNS_CRRNCY_NAME",
     "CUST_ACC_DEF_STATUS", "CUST_ACC_RGSTN_TYPE", "CUST_ACC_OWNRSHP_TYPE", "CUST_ACC_BRNCH",
     "CUST_GENDR", "CUST_MARITAL_STS", "CUST_EMPLYMNT_PRFLE", "CUST_TOT_FMLY_MMBRS",
     "CUST_DEPNDNTS", "CUST_EXSTNG_LOAN", "CUST_EXSTNG_INSRNCE", "LOAN_TYPE",
     "LOAN_STATUS", "LOAN_DFAULT", "LOAN_FRCLSRE_FLG", "LOAN_TRNSFR_FLG",
     "INS_STATUS", "INS_DFAULT", "INS_SRRNDR_FLG")

crs$target  <- colnames(crs$dataset)[ncol(crs$dataset)]
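# NOTE: the target is taken to be the last column of the dataset, but the model
# formulas below hardcode 'Response', so the last column must be named Response.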
crs$risk    <- NULL
crs$ident   <- "ID"
crs$ignore  <- NULL
crs$weights <- NULL

#============================================================
# Rattle timestamp: 2015-08-03 08:32:36 x86_64-w64-mingw32 

# The 'randomForest' package provides the 'randomForest' function.

require(randomForest, quietly=TRUE)

# Build the Random Forest model.

set.seed(crv$seed)
crs$rf <- randomForest(Response ~ .,
      data=crs$dataset[crs$sample,c(crs$input, crs$target)], 
      ntree=500,
      mtry=5,
      importance=TRUE,
      na.action=na.roughfix,
      replace=FALSE)
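# NOTE: this random forest is used only to rank variables by importance below
# (column 4 of importance() is MeanDecreaseGini for a two-class forest); all
# scoring in this script is done with the glm model built later.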

rn <- round(importance(crs$rf), 2)
itm=length(crs$input)
GiniIndex=sort(rn[,4],decreasing=T)[1:itm]
write.csv(data.frame(GiniIndex),"D:/VarImp.csv")

temp="";
for (i in 1:length(names(GiniIndex)))
  {
    #if(colnames(crs$dataset)==names(GiniIndex)[i])
    {
        temp[i]= names(GiniIndex)[i]
    }
  }
 # temp=sort(temp)
  col_for_prediction=temp

# Regression model 

# Build a Regression model.

crs$glm <- glm(Response ~ .,
    data=crs$dataset[crs$train, c(crs$input, crs$target)],
    family=binomial(link="logit"))

# Generate a textual view of the Linear model.

print(summary(crs$glm))
cat(sprintf("Log likelihood: %.3f (%d df)\n",
            logLik(crs$glm)[1],
            attr(logLik(crs$glm), "df")))
cat(sprintf("Null/Residual deviance difference: %.3f (%d df)\n",
            crs$glm$null.deviance-crs$glm$deviance,
            crs$glm$df.null-crs$glm$df.residual))
cat(sprintf("Chi-square p-value: %.8f\n",
            dchisq(crs$glm$null.deviance-crs$glm$deviance,
                   crs$glm$df.null-crs$glm$df.residual)))
cat(sprintf("Pseudo R-Square (optimistic): %.8f\n",
             cor(crs$glm$y, crs$glm$fitted.values)))
cat('\n==== ANOVA ====\n\n')
print(anova(crs$glm, test="Chisq"))
cat("\n")

# Time taken: 0.56 secs

#============================================================
# Rattle timestamp: 2015-08-03 08:32:41 x86_64-w64-mingw32 

# Evaluate model performance. 

# Generate an Error Matrix for the Linear model.

# Obtain the response from the Linear model.

crs$pr <- as.vector(ifelse(predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)]) > 0.5, "Y", "N"))

# Generate the confusion matrix showing counts.

table(crs$dataset[crs$test, c(crs$input, crs$target)]$Response, crs$pr,
        dnn=c("Actual", "Predicted"))

# Generate the confusion matrix showing proportions.

pcme <- function(actual, cl)
{
  x <- table(actual, cl)
  tbl <- cbind(round(x/length(actual), 2),
               Error=round(c(x[1,2]/sum(x[1,]),
                             x[2,1]/sum(x[2,])), 2))
  names(attr(tbl, "dimnames")) <- c("Actual", "Predicted")
  return(tbl)
};
pcme(crs$dataset[crs$test, c(crs$input, crs$target)]$Response, crs$pr)

# Calculate the overall error percentage.

overall <- function(x)
{
  if (nrow(x) == 2) 
    cat((x[1,2] + x[2,1]) / sum(x)) 
  else
    cat(1 - (x[1,rownames(x)]) / sum(x))
} 
overall(table(crs$pr, crs$dataset[crs$test, c(crs$input, crs$target)]$Response,  
        dnn=c("Predicted", "Actual")))

# Calculate the averaged class error percentage.

avgerr <- function(x) 
    cat(mean(c(x[1,2], x[2,1]) / apply(x, 1, sum))) 
avgerr(table(crs$pr, crs$dataset[crs$test, c(crs$input, crs$target)]$Response,  
        dnn=c("Predicted", "Actual")))

#============================================================
# Rattle timestamp: 2015-08-03 08:32:46 x86_64-w64-mingw32 

# Evaluate model performance. 

# Risk Chart: requires the ggplot2 package.

library(ggplot2)

# Generate a risk chart.

# Rattle provides evaluateRisk() and riskchart().

crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])
crs$eval <- evaluateRisk(crs$pr, crs$dataset[crs$test, c(crs$input, crs$target)]$Response)
#dp3<-print(riskchart(crs$pr, 
#                crs$dataset[crs$test, c(crs$input, crs$target)]$Response, 
#                title="Performance Chart Linear Master_File_New_02_New_Ver01.csv [test] ", show.lift=TRUE, show.precision=TRUE))


#============================================================
# Rattle timestamp: 2015-08-03 08:32:53 x86_64-w64-mingw32 

# Evaluate model performance. 

# Lift Chart: requires the ROCR package.

library(ROCR)

# Obtain predictions for the glm model on Master_File_New_02_New_Ver01.csv [test].

crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])

# Remove observations with missing target.

no.miss   <- na.omit(crs$dataset[crs$test, c(crs$input, crs$target)]$Response)
miss.list <- attr(no.miss, "na.action")
attributes(no.miss) <- NULL

if (length(miss.list))
{
  pred <- prediction(crs$pr[-miss.list], no.miss)
} else
{
  pred <- prediction(crs$pr, no.miss)
}

# Convert rate of positive predictions to percentage.

per <- performance(pred, "lift", "rpp")
per@x.values[[1]] <- per@x.values[[1]]*100

# Plot the lift chart.
#plot(per, col="#CC0000FF", lty=1, xlab="Caseload (%)", add=FALSE)

# Generate a Lift Chart for the glm model on Master_File_New_02_New_Ver01.csv [train].
# (Note: the prediction below actually reuses the test rows, not the training rows.)

crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])

# Also convert rate of positive predictions to percentage

per <- performance(prediction(crs$pr, crs$dataset[crs$test, c(crs$input, crs$target)]$Response),"lift", "rpp")
per@x.values[[1]] <- per@x.values[[1]]*100

# Now plot the lift.

# Bug in ROCR 1.0-3: plot does not obey the add command.
# Calling the function directly (.plot.performance) does work.

#.plot.performance(per, col="#00CCCCFF", lty=2, add=TRUE)

# Add a legend to the plot.

#legend("topright", c("Test","Train"), col=rainbow(2, 1, .8), lty=1:2, title="Linear", inset=c(0.05, 0.05))

# Add decorations to the plot.

#title(main="Lift Chart  Master_File_New_02_New_Ver01.csv ",
#    sub=paste("Rattle", format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#grid()

#============================================================
# Rattle timestamp: 2015-08-03 08:32:56 x86_64-w64-mingw32 

# Evaluate model performance. 

# ROC Curve: requires the ROCR package.

library(ROCR)

# ROC Curve: requires the ggplot2 package.

require(ggplot2, quietly=TRUE)

# Generate an ROC Curve for the glm model on Master_File_New_02_New_Ver01.csv [test].

crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])

# Remove observations with missing target.

no.miss   <- na.omit(crs$dataset[crs$test, c(crs$input, crs$target)]$Response)
miss.list <- attr(no.miss, "na.action")
attributes(no.miss) <- NULL

if (length(miss.list))
{
  pred <- prediction(crs$pr[-miss.list], no.miss)
} else
{
  pred <- prediction(crs$pr, no.miss)
}

pe <- performance(pred, "tpr", "fpr")
au <- performance(pred, "auc")@y.values[[1]]
pd <- data.frame(fpr=unlist(pe@x.values), tpr=unlist(pe@y.values))
p <- ggplot(pd, aes(x=fpr, y=tpr))
p <- p + geom_line(colour="red")
p <- p + xlab("False Positive Rate") + ylab("True Positive Rate")
p <- p + ggtitle("ROC Curve Linear Master_File_New_02_New_Ver01.csv [test] Response")
p <- p + theme(plot.title=element_text(size=10))
p <- p + geom_line(data=data.frame(), aes(x=c(0,1), y=c(0,1)), colour="grey")
p <- p + annotate("text", x=0.50, y=0.00, hjust=0, vjust=0, size=5,
                   label=paste("AUC =", round(au, 2)))
#print(p)

# Calculate the area under the curve for the plot.


# Remove observations with missing target.

no.miss   <- na.omit(crs$dataset[crs$test, c(crs$input, crs$target)]$Response)
miss.list <- attr(no.miss, "na.action")
attributes(no.miss) <- NULL

if (length(miss.list))
{
  pred <- prediction(crs$pr[-miss.list], no.miss)
} else
{
  pred <- prediction(crs$pr, no.miss)
}
performance(pred, "auc")

#============================================================
# Rattle timestamp: 2015-08-03 08:32:59 x86_64-w64-mingw32 

# Evaluate model performance. 

# Precision/Recall Plot: requires the ROCR package

library(ROCR)

# Generate a Precision/Recall Plot for the glm model on Master_File_New_02_New_Ver01.csv [test].

crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])

# Remove observations with missing target.

no.miss   <- na.omit(crs$dataset[crs$test, c(crs$input, crs$target)]$Response)
miss.list <- attr(no.miss, "na.action")
attributes(no.miss) <- NULL

if (length(miss.list))
{
  pred <- prediction(crs$pr[-miss.list], no.miss)
} else
{
  pred <- prediction(crs$pr, no.miss)
}
#plot(performance(pred, "prec", "rec"), col="#CC0000FF", lty=1, add=FALSE)
dp2<-performance(pred, "prec", "rec")

# Generate a Precision/Recall Plot for the glm model on Master_File_New_02_New_Ver01.csv [train].
# (Note: the prediction below actually reuses the test rows, not the training rows.)

crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])

# In ROCR (1.0-3) plot does not obey the add command.
# Calling the function directly (.plot.performance) does work.

#.plot.performance(performance(prediction(crs$pr, crs$dataset[crs$test, c(crs$input, crs$target)]$Response),"prec", "rec"), col="#00CCCCFF", lty=2, add=TRUE)


# Add a legend to the plot.

#legend("bottomleft", c("Test","Train"), col=rainbow(2, 1, .8), lty=1:2, title="glm", inset=c(0.05, 0.05))

# Add decorations to the plot.

#title(main="Precision/Recall Plot  Master_File_New_02_New_Ver01.csv ",
#    sub=paste("Rattle", format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#grid()

#============================================================
# Rattle timestamp: 2015-08-03 08:33:05 x86_64-w64-mingw32 

# Evaluate model performance. 

# Sensitivity/Specificity Plot: requires the ROCR package

library(ROCR)

# Generate Sensitivity/Specificity Plot for glm model on Master_File_New_02_New_Ver01.csv [test].

crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])

# Remove observations with missing target.

no.miss   <- na.omit(crs$dataset[crs$test, c(crs$input, crs$target)]$Response)
miss.list <- attr(no.miss, "na.action")
attributes(no.miss) <- NULL

if (length(miss.list))
{
  pred <- prediction(crs$pr[-miss.list], no.miss)
} else
{
  pred <- prediction(crs$pr, no.miss)
}
#plot(performance(pred, "sens", "spec"), col="#CC0000FF", lty=1, add=FALSE)
dp1<-performance(pred, "sens", "spec")

# Generate a Sensitivity/Specificity plot for the glm model on Master_File_New_02_New_Ver01.csv [train].
# (Note: the prediction below actually reuses the test rows, not the training rows.)

crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)])

# In ROCR (1.0-3) plot does not obey the add command.
# Calling the function directly (.plot.performance) does work.

#.plot.performance(performance(prediction(crs$pr, crs$dataset[crs$test, c(crs$input, crs$target)]$Response),"sens", "spec"), col="#00CCCCFF", lty=2, add=TRUE)


# Add a legend to the plot.

#legend("bottomleft", c("Test","Train"), col=rainbow(2, 1, .8), lty=1:2, title="glm", inset=c(0.05, 0.05))

# Add decorations to the plot.

#title(main="Sensitivity/Specificity (tpr/tnr)  Master_File_New_02_New_Ver01.csv ",
 #   sub=paste("Rattle", format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
#grid()

#============================================================
# Rattle timestamp: 2015-08-03 08:33:11 x86_64-w64-mingw32 

# Score a dataset. 

# Obtain probability scores for the Linear model on Master_File_New_02_New_Ver01.csv [test].

#crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input)])
crs$pr <- as.vector(ifelse(predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input, crs$target)]) > 0.5, "Y", "N"))
sdata3<-crs$pr

# Extract the relevant variables from the dataset.

sdata <- subset(crs$dataset[crs$test,], select=c("ID", "Response"))

# Output the combined data.

write.csv(cbind(sdata, crs$pr), file="D:/R_Outputs/Logistic_Custom_Scores.csv", row.names=FALSE)
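# NOTE: this file holds class predictions for the held-out test rows of the
# training file (java_file1), not for new data; new data is scored at the very
# end of the script using java_file2.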

#============================================================
# Rattle timestamp: 2015-08-03 08:33:22 x86_64-w64-mingw32 

# Score a dataset. 

#============================================================
# Rattle timestamp: 2015-08-03 08:33:36 x86_64-w64-mingw32 

# Score a dataset. 

# Obtain probability scores for the Linear model on Master_File_New_02_New_Ver01.csv [test].

crs$pr <- predict(crs$glm, type="response", newdata=crs$dataset[crs$test, c(crs$input)])


# Extract the relevant variables from the dataset.

sdata <- subset(crs$dataset[crs$test,], select=c("ID", "Response"))

# Output the combined data.

write.csv(cbind(sdata, crs$pr), file="D:/R_Outputs/Logistic_Custom_Prob.csv", row.names=FALSE)

Probability<-crs$pr
sdata1<-subset(crs$dataset[crs$test,],select=c("ID"))
sdata2<-subset(crs$dataset[crs$test,],select=c("Response"))
sdata1<-cbind.data.frame(sdata1,Probability,sdata2,sdata3)

#=============================================================
#Prediction on New Dataset

#data_for_prediction <- crs$dataset[-29]
data_for_prediction=read.csv(java_file2, na.strings=c(".", "NA", "", "?"), strip.white=TRUE, encoding="UTF-8")
result <- predict(crs$glm, type = "response", newdata=data_for_prediction)

# Probability of the negative class alongside the positive-class probability.
tempdata <- cbind.data.frame(tempdata=1-result, result)

write.csv(data.frame(tempdata),"D:/Predictions.csv")

#dp<-c(per,dp1,dp2,pd,dp3,tempdata,sdata1,data.frame(GiniIndex),col_for_prediction)
dp<-c(per,dp1,dp2,pd,tempdata,sdata1,data.frame(GiniIndex),col_for_prediction)
return(dp)

}
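To reproduce the symptom outside the UI, a minimal check (assuming `java_file2` and the `D:/Predictions.csv` path used above) is to read the prediction file back and confirm it holds probabilities rather than a copy of the input:

# Sanity check: does the prediction file contain model scores or the raw input?
# Assumes predictFraud() has already run, so D:/Predictions.csv exists.
new_data <- read.csv(java_file2, na.strings=c(".", "NA", "", "?"), strip.white=TRUE)
preds    <- read.csv("D:/Predictions.csv")

str(preds)            # expect a row-name column plus two numeric columns (tempdata, result)
range(preds$result)   # fitted probabilities should lie within [0, 1]

# If the "prediction" file has the same columns as the raw input, the scoring
# step is being bypassed somewhere upstream (e.g. the UI returning the wrong file).
identical(names(preds), names(new_data))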
  • In fact it does make a prediction, `result <- predict(crs$glm, type = "response", newdata=data_for_prediction)`, and then writes the result out as a CSV. – amrrs Oct 31 '17 at 07:15
  • Yes, the code you point out does seem to make a prediction. But the UI that returns this prediction file shows the same file as the input. I don't have access to the UI code at the moment, so I thought I would look at the R code first. – Santosh Rai Oct 31 '17 at 07:19
  • It's worth checking whether the file is there, if you have access to the machine. You can also see that the function returns `dp`, which must be what the UI receives; a quick way to inspect it is sketched below. – amrrs Oct 31 '17 at 07:24
  • Thanks for your feedback. I will check the UI code and see what is being fetched by the UI. – Santosh Rai Oct 31 '17 at 07:32
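Following the suggestion in the comments, one way to see exactly what the UI receives is to call the function directly from an R console and inspect the returned `dp` object. This is only a sketch: the paths and the way `inputVector` is derived are placeholders, since the real values normally come from the Java layer.

# Hypothetical standalone call; adjust the paths to the real files.
java_file1 <- "D:/Master_File_New_02_New_Ver01.csv"   # training data
java_file2 <- "D:/new_data_to_score.csv"              # data to be scored
inputs <- setdiff(colnames(read.csv(java_file1)), c("ID", "Response"))
dp <- predictFraud(inputs)
str(dp, max.level=1)   # lists every piece (performance objects, scores, names) the UI unpacks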

0 Answers