# Load the prepared data and split it by row position: rows 1-20000 are used
# for model building, rows 20001-39880 are held back as unseen data.
# NOTE(review): `dat_ready` is expected to be defined by this .RData file and
# to have at least 39880 rows - confirm.
load("dat_ready.RData")
dat_ready_train <- dat_ready[1:20000, ]
dat_ready_test <- dat_ready[20001:39880, ]

# Coefficient of determination, 1 - SSE / SST.
#   x: predicted values
#   y: observed values (SST is taken around mean(y))
rsq <- function(x, y) {
  sse <- sum((y - x)^2)
  sst <- sum((y - mean(y))^2)
  1 - sse / sst
}

# Rattle is Copyright (c) 2006-2015 Togaware Pty Ltd.

#============================================================
# Rattle timestamp: 2016-09-11 20:01:51 x86_64-w64-mingw32

# Rattle version 4.1.0 user 'root'

# This file is a Rattle log: every Rattle interaction is recorded as an R
# command. The log can be exported to a file (Export button or Tools menu)
# so that the work is repeatable. For example, after exporting to 'myrf01.R',
# typing source('myrf01.R') in the R Console repeats all actions.
# The exported file is generally meant to be edited to suit your needs; the
# log can also be edited in place before exporting. Saving and loading
# projects retains the log as well.

# Load the required libraries.
library(rattle)   # To access the weather dataset and utility commands.
library(magrittr) # For the %>% and %<>% operators.

# The log records the process of building a model, but with little effort it
# can be used to score a new dataset. The logical variable 'building' toggles
# between generating transformations (when building a model) and simply
# applying them (when scoring a dataset).
building <- TRUE
scoring <- !building

# A pre-defined value is used to reset the random seed so that results are
# repeatable.
# NOTE(review): `crv` and `crs` are not created in this file; presumably they
# are provided by rattle - confirm.
crv$seed <- 42

#============================================================
# Rattle timestamp: 2016-09-11 20:01:55 x86_64-w64-mingw32

# Load an R data frame: the modelling dataset is the first 20000 rows.
crs$dataset <- dat_ready_train

# Display a simple summary (structure) of the dataset.
str(crs$dataset)

#============================================================
# Rattle timestamp: 2016-09-11 20:02:20 x86_64-w64-mingw32

# Note the user selections.

# Build the training/validate/test datasets as row-index vectors:
# 70% train, 15% validate, and whatever is left over as test.
# The two sample() calls must stay in this order after set.seed() so the
# partitions are reproducible.
set.seed(crv$seed)
crs$nobs <- nrow(crs$dataset) # 20000 observations

crs$train <- sample(nrow(crs$dataset), 0.7 * crs$nobs) # 14000 observations
crs$sample <- crs$train

crs$validate <- sample(
  setdiff(seq_len(nrow(crs$dataset)), crs$train),
  0.15 * crs$nobs
) # 3000 observations

crs$test <- setdiff(
  setdiff(seq_len(nrow(crs$dataset)), crs$train),
  crs$validate
) # 3000 observations

# The following variable selections have been noted.

# Inputs are the columns lag_diff_1 ... lag_diff_100, all of them numeric.
crs$input <- paste0("lag_diff_", seq_len(100))

crs$numeric <- paste0("lag_diff_", seq_len(100))

crs$categoric <- NULL

crs$target <- "X0.000808582"
crs$risk <- NULL
crs$ident <- NULL
crs$ignore <- NULL
crs$weights <- NULL

#============================================================
# Rattle timestamp: 2016-09-11 20:02:32 x86_64-w64-mingw32

# Neural Network

# Build a neural network model using the nnet package.
library(nnet, quietly = TRUE)

# Build the NNet model.
#
# The network is trained in rounds of `nnet_maxit` optimiser iterations.
# Round 1 starts from nnet's own initial weights; every later round restarts
# from the weights of the previous fit (Wts = crs$nnet$wts). After each round
# R^2 is computed on four datasets and appended as a new column of `scores`:
#   row 1 = training sample, row 2 = validate, row 3 = test,
#   row 4 = dat_ready_test (rows never seen by Rattle's partitioning).

set.seed(199)

# Earlier single-call fit, left disabled:
# crs$nnet <- nnet(X0.000808582 ~ .,
#     data=crs$dataset[crs$sample,c(crs$input, crs$target)],
#     size=10, linout=TRUE, skip=TRUE, MaxNWts=10000, trace=FALSE, maxit=100)

i <- 1
scores <- matrix(NA, ncol = 0, nrow = 4)
nnet_maxit <- 10
nnet_size <- 1

repeat {
  if (i > 1) {
    # Warm start: continue from the weights reached in the previous round.
    crs$nnet <- nnet(X0.000808582 ~ .,
                     data = crs$dataset[crs$sample, c(crs$input, crs$target)],
                     size = nnet_size, linout = TRUE, skip = (nnet_size == 0),
                     MaxNWts = 1000000, trace = TRUE, maxit = nnet_maxit,
                     Wts = crs$nnet$wts)
  } else {
    # First round: no starting weights are supplied.
    crs$nnet <- nnet(X0.000808582 ~ .,
                     data = crs$dataset[crs$sample, c(crs$input, crs$target)],
                     size = nnet_size, linout = TRUE, skip = (nnet_size == 0),
                     MaxNWts = 1000000, trace = TRUE, maxit = nnet_maxit)
  }

  # R^2 on the three Rattle partitions and on the held-back rows.
  score1 <- rsq(
    predict(crs$nnet, newdata = crs$dataset[crs$sample, crs$input]),
    crs$dataset[crs$sample, crs$target]
  )
  score2 <- rsq(
    predict(crs$nnet, newdata = crs$dataset[crs$validate, crs$input]),
    crs$dataset[crs$validate, crs$target]
  )
  score3 <- rsq(
    predict(crs$nnet, newdata = crs$dataset[crs$test, crs$input]),
    crs$dataset[crs$test, crs$target]
  )
  score4 <- rsq(
    predict(crs$nnet, newdata = dat_ready_test[, crs$input]),
    dat_ready_test[, crs$target]
  )
  scores <- cbind(scores, c(score1, score2, score3, score4))

  # save.image(paste0("dat_ready_nnet_", i, ".RData"))

  # Redraw the learning curves; one point per completed round.
  plot(scores[1, ], ylim = range(scores), type = "l", col = "black",
       main = paste0("nnet R^2 at iteration ", i * nnet_maxit),
       lwd = 3, xlab = "iteration", ylab = "R^2")
  lines(scores[2, ], col = "red", lwd = 3)
  lines(scores[3, ], col = "green", lwd = 3)
  lines(scores[4, ], col = "blue", lwd = 3)
  grid()
  legend(x = "bottomright",
         legend = c("sample", "validate", "test", "new data"),
         fill = c("black", "red", "green", "blue"))

  # Stopping rule (from round 3 on): stop when the lowest of this round's
  # sample/validate/test R^2 is no higher than the lowest value in rows 1-3
  # of the last three columns. That window already contains the current
  # round, so this fires when the current round holds the window's minimum.
  if (i >= 3 &&
      min(score1, score2, score3) <=
        min(scores[1:3, (ncol(scores) - 2):ncol(scores)])) {
    break
  }

  i <- i + 1
}

# Print the results of the modelling.
# Text report of the fitted network: layout and weight count, input names,
# output name, residual sum of squares, then nnet's own summary.
cat(sprintf("A %s network with %d weights.\n",
            paste(crs$nnet$n, collapse = "-"),
            length(crs$nnet$wts)))
cat(sprintf("Inputs: %s.\n",
            paste(crs$nnet$coefnames, collapse = ", ")))
cat(sprintf("Output: %s.\n",
            names(attr(crs$nnet$terms, "dataClasses"))[1]))
cat(sprintf("Sum of Squares Residuals: %.4f.\n",
            sum(residuals(crs$nnet) ^ 2)))
cat("\n")
print(summary(crs$nnet))
cat("\n")

# Time taken: 3.36 secs

# Open a fresh graphics device so the learning-curve plot is not overwritten.
dev.new()

#============================================================
# Rattle timestamp: 2016-09-11 20:02:41 x86_64-w64-mingw32

# Evaluate model performance.

# NNET: Generate a Predicted v Observed plot for nnet model on dat_ready_train [**train**].

crs$pr <- predict(crs$nnet,
                  newdata = crs$dataset[crs$sample, c(crs$input, crs$target)])

# Obtain the observed output for the dataset (one-column data frame).
obs <- subset(crs$dataset[crs$sample, c(crs$input, crs$target)],
              select = crs$target)

# Handle in case categoric target treated as numeric: coerce to numeric and
# rebuild the data frame, keeping the original row names.
obs.rownames <- rownames(obs)
obs <- as.numeric(obs[[1]])
obs <- data.frame(X0.000808582 = obs)
rownames(obs) <- obs.rownames

# Combine the observed values with the predicted; incomplete rows are dropped.
fitpoints <- na.omit(cbind(obs, Predicted = crs$pr))

# Obtain the pseudo R2 - a (squared) correlation.
fitcorr <- format(cor(fitpoints[,1], fitpoints[,2])^2, digits = 4)

# Plot settings for the true points and best fit.
op <- par(c(lty = "solid", col = "blue"))

# Display the observed (X) versus predicted (Y) points.
plot(x = fitpoints[[1]], y = fitpoints[[2]], asp = 1,
     xlab = "X0.000808582", ylab = "Predicted")

# Generate a simple linear fit between predicted and observed.
prline <- lm(fitpoints[,2] ~ fitpoints[,1])

# Add the linear fit to the plot.
abline(prline)

# Add a diagonal representing perfect correlation.
par(c(lty = "dashed", col = "black"))
abline(0, 1)

# Include a pseudo R-square on the plot.
legend("bottomright", sprintf(" Pseudo R-square=%s ", fitcorr), bty = "n")

# Add a title and grid to the plot.
title(main = "Predicted vs. Observed Neural Net Model dat_ready_train [**train**]",
      sub = paste("Rattle", format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
grid()

dev.new()

#============================================================
# Rattle timestamp: 2016-09-11 20:02:44 x86_64-w64-mingw32

# Evaluate model performance.

# NNET: Generate a Predicted v Observed plot for nnet model on dat_ready_train [validate].

crs$pr <- predict(crs$nnet,
                  newdata = crs$dataset[crs$validate, c(crs$input, crs$target)])

# Obtain the observed output for the dataset (one-column data frame).
obs <- subset(crs$dataset[crs$validate, c(crs$input, crs$target)],
              select = crs$target)

# Handle in case categoric target treated as numeric: coerce to numeric and
# rebuild the data frame, keeping the original row names.
obs.rownames <- rownames(obs)
obs <- as.numeric(obs[[1]])
obs <- data.frame(X0.000808582 = obs)
rownames(obs) <- obs.rownames

# Combine the observed values with the predicted; incomplete rows are dropped.
fitpoints <- na.omit(cbind(obs, Predicted = crs$pr))

# Obtain the pseudo R2 - a (squared) correlation.
fitcorr <- format(cor(fitpoints[,1], fitpoints[,2])^2, digits = 4)

# Plot settings for the true points and best fit.
op <- par(c(lty = "solid", col = "blue"))

# Display the observed (X) versus predicted (Y) points.
plot(x = fitpoints[[1]], y = fitpoints[[2]], asp = 1,
     xlab = "X0.000808582", ylab = "Predicted")

# Generate a simple linear fit between predicted and observed.
prline <- lm(fitpoints[,2] ~ fitpoints[,1])

# Add the linear fit to the plot.
abline(prline)

# Add a diagonal representing perfect correlation.
par(c(lty = "dashed", col = "black"))
abline(0, 1)

# Include a pseudo R-square on the plot.
legend("bottomright", sprintf(" Pseudo R-square=%s ", fitcorr), bty = "n")

# Add a title and grid to the plot.
title(main = "Predicted vs. Observed Neural Net Model dat_ready_train [validate]",
      sub = paste("Rattle", format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
grid()

dev.new()

#============================================================
# Rattle timestamp: 2016-09-11 20:02:47 x86_64-w64-mingw32

# Evaluate model performance.
# NNET: Generate a Predicted v Observed plot for nnet model on dat_ready_train [test].

crs$pr <- predict(crs$nnet,
                  newdata = crs$dataset[crs$test, c(crs$input, crs$target)])

# Obtain the observed output for the dataset (one-column data frame).
obs <- subset(crs$dataset[crs$test, c(crs$input, crs$target)],
              select = crs$target)

# Handle in case categoric target treated as numeric: coerce to numeric and
# rebuild the data frame, keeping the original row names.
obs.rownames <- rownames(obs)
obs <- as.numeric(obs[[1]])
obs <- data.frame(X0.000808582 = obs)
rownames(obs) <- obs.rownames

# Combine the observed values with the predicted; incomplete rows are dropped.
fitpoints <- na.omit(cbind(obs, Predicted = crs$pr))

# Obtain the pseudo R2 - a (squared) correlation.
fitcorr <- format(cor(fitpoints[,1], fitpoints[,2])^2, digits = 4)

# Plot settings for the true points and best fit.
op <- par(c(lty = "solid", col = "blue"))

# Display the observed (X) versus predicted (Y) points.
plot(x = fitpoints[[1]], y = fitpoints[[2]], asp = 1,
     xlab = "X0.000808582", ylab = "Predicted")

# Generate a simple linear fit between predicted and observed.
prline <- lm(fitpoints[,2] ~ fitpoints[,1])

# Add the linear fit to the plot.
abline(prline)

# Add a diagonal representing perfect correlation.
par(c(lty = "dashed", col = "black"))
abline(0, 1)

# Include a pseudo R-square on the plot.
legend("bottomright", sprintf(" Pseudo R-square=%s ", fitcorr), bty = "n")

# Add a title and grid to the plot.
title(main = "Predicted vs. Observed Neural Net Model dat_ready_train [test]",
      sub = paste("Rattle", format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
grid()

dev.new()

#============================================================
# Rattle timestamp: 2016-09-11 20:02:51 x86_64-w64-mingw32

# Evaluate model performance.

# Assign the R dataset to be used as the test set: the rows held back from
# the modelling dataset.
crs$testset <- dat_ready_test

# NNET: Generate a Predicted v Observed plot for nnet model on dat_ready_test.

crs$pr <- predict(crs$nnet, newdata = crs$testset)

# Obtain the observed output for the dataset (one-column data frame).
obs <- subset(crs$testset, select = crs$target)

# Handle in case categoric target treated as numeric: coerce to numeric and
# rebuild the data frame, keeping the original row names.
obs.rownames <- rownames(obs)
obs <- as.numeric(obs[[1]])
obs <- data.frame(X0.000808582 = obs)
rownames(obs) <- obs.rownames

# Combine the observed values with the predicted; incomplete rows are dropped.
fitpoints <- na.omit(cbind(obs, Predicted = crs$pr))

# Obtain the pseudo R2 - a (squared) correlation.
fitcorr <- format(cor(fitpoints[,1], fitpoints[,2])^2, digits = 4)

# Plot settings for the true points and best fit.
op <- par(c(lty = "solid", col = "blue"))

# Display the observed (X) versus predicted (Y) points.
plot(x = fitpoints[[1]], y = fitpoints[[2]], asp = 1,
     xlab = "X0.000808582", ylab = "Predicted")

# Generate a simple linear fit between predicted and observed.
prline <- lm(fitpoints[,2] ~ fitpoints[,1])

# Add the linear fit to the plot.
abline(prline)

# Add a diagonal representing perfect correlation.
par(c(lty = "dashed", col = "black"))
abline(0, 1)

# Include a pseudo R-square on the plot.
legend("bottomright", sprintf(" Pseudo R-square=%s ", fitcorr), bty = "n")

# Add a title and grid to the plot.
title(main = "Predicted vs. Observed Neural Net Model dat_ready_test",
      sub = paste("Rattle", format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
grid()