# Load the prepared data (expected to define `dat_ready`) and split it by row
# position: the first 20000 rows are used for modelling, rows 20001-39880 are
# kept aside as new data for the final evaluation.
load("dat_ready.RData")

n_train_rows <- 20000
n_total_rows <- 39880

# Indexing a data frame past its last row returns NA-filled rows instead of
# failing, so make a short input fail loudly here rather than downstream.
stopifnot(nrow(dat_ready) >= n_total_rows)

dat_ready_train <- dat_ready[seq_len(n_train_rows), ]
dat_ready_test <- dat_ready[(n_train_rows + 1):n_total_rows, ]



# Coefficient of determination (R^2) of predictions against observations.
#   x: predicted values
#   y: observed values; their mean is the baseline used in the denominator
# Returns 1 - (residual sum of squares) / (total sum of squares of y).
rsq <- function(x, y) {
  ss_residual <- sum((y - x)^2)
  ss_total <- sum((y - mean(y))^2)
  1 - ss_residual / ss_total
}


# Rattle is Copyright (c) 2006-2015 Togaware Pty Ltd.

#============================================================
# Rattle timestamp: 2016-09-11 20:01:51 x86_64-w64-mingw32 

# Rattle version 4.1.0 user 'root'

# This log file captures all Rattle interactions as R commands. 

# Export this log to a file using the Export button or the Tools 
# menu to save a log of all your activity. This facilitates repeatability. For example, exporting 
# to a file called 'myrf01.R' will allow you to type in the R Console 
# the command source('myrf01.R') and so repeat all actions automatically. 
# Generally, you will want to edit the file to suit your needs. You can also directly 
# edit this current log in place to record additional information before exporting. 
 
# Saving and loading projects also retains this log.

# We begin by loading the required libraries.

library(rattle)   # To access the weather dataset and utility commands.
library(magrittr) # For the %>% and %<>% operators.

# This log generally records the process of building a model. However, with very 
# little effort the log can be used to score a new dataset. The logical variable 
# 'building' is used to toggle between generating transformations, as when building 
# a model, and simply using the transformations, as when scoring a dataset.

# Mode switch carried over from the Rattle log: `building` marks a
# model-building run, `scoring` is simply its negation.
building <- TRUE
scoring <- !building

# Seed used to make the partitioning below repeatable.
crv$seed <- 42

#============================================================
# Rattle timestamp: 2016-09-11 20:01:55 x86_64-w64-mingw32

# Register dat_ready_train as the working dataset and show its structure.
crs$dataset <- dat_ready_train
str(crs$dataset)

#============================================================
# Rattle timestamp: 2016-09-11 20:02:20 x86_64-w64-mingw32

# Split the row indices into training (70%), validation (15%) and test
# (whatever remains). `crs$sample` and `crs$train` hold the same indices.
set.seed(crv$seed)
crs$nobs <- nrow(crs$dataset) # 20000 observations

crs$train <- sample(crs$nobs, 0.7 * crs$nobs) # 14000 observations
crs$sample <- crs$train

# Rows not used for training; validation is drawn from these and the
# leftover rows form the test partition.
holdout_rows <- setdiff(seq_len(crs$nobs), crs$train)
crs$validate <- sample(holdout_rows, 0.15 * crs$nobs) # 3000 observations
crs$test <- setdiff(holdout_rows, crs$validate) # 3000 observations

# The following variable selections have been noted.

# Predictors: the 100 columns lag_diff_1 ... lag_diff_100, in that order.
# The names are generated rather than spelled out so that the input list and
# the numeric list below cannot drift apart.
crs$input <- paste0("lag_diff_", seq_len(100))

# Every input is listed as numeric; there are no categoric inputs.
crs$numeric <- crs$input

crs$categoric <- NULL

# Name of the response column.
crs$target  <- "X0.000808582"
crs$risk    <- NULL
crs$ident   <- NULL
crs$ignore  <- NULL
crs$weights <- NULL

#============================================================
# Rattle timestamp: 2016-09-11 20:02:32 x86_64-w64-mingw32 

# Neural Network 

# Build a neural network model using the nnet package.

library(nnet, quietly=TRUE)

# Build the NNet model.

set.seed(199)
# Single-shot fit from the original Rattle log, kept for reference:
# crs$nnet <- nnet(X0.000808582 ~ .,
    # data=crs$dataset[crs$sample,c(crs$input, crs$target)],
    # size=10, linout=TRUE, skip=TRUE, MaxNWts=10000, trace=FALSE, maxit=100)

# Incremental training: the network is fitted in rounds of `nnet_maxit`
# optimiser iterations. Round 1 starts from nnet's own initial weights; every
# later round restarts from the weights of the previous fit. After each round
# R^2 is computed on four data sets and appended as a new column of `scores`
# (rows: sample, validate, test, new data), and the score history is plotted.
i <- 1
scores <- matrix(NA, ncol=0, nrow=4)
nnet_maxit <- 10
nnet_size <- 1

# Loop-invariant data, subset once instead of on every round.
train_data <- crs$dataset[crs$sample, c(crs$input, crs$target)]
score_sets <- list(
  sample   = crs$dataset[crs$sample, ],
  validate = crs$dataset[crs$validate, ],
  test     = crs$dataset[crs$test, ],
  new_data = dat_ready_test
)

# Run one training round. Extra arguments (only `Wts` is used below) are
# forwarded to nnet(); leaving `Wts` out lets nnet pick its starting weights.
fit_round <- function(...) {
  nnet(X0.000808582 ~ .,
       data = train_data,
       size = nnet_size, linout = TRUE, skip = (nnet_size == 0),
       MaxNWts = 1000000, trace = TRUE, maxit = nnet_maxit, ...)
}

# R^2 of the current model on one data set (predictor and target columns are
# taken from crs$input / crs$target).
score_on <- function(data) {
  rsq(predict(crs$nnet, newdata = data[, crs$input]), data[, crs$target])
}

repeat {
  crs$nnet <- if (i == 1) fit_round() else fit_round(Wts = crs$nnet$wts)

  score1 <- score_on(score_sets$sample)
  score2 <- score_on(score_sets$validate)
  score3 <- score_on(score_sets$test)
  score4 <- score_on(score_sets$new_data)

  scores <- cbind(scores, c(score1, score2, score3, score4))
  # save.image(paste0("dat_ready_nnet_", i, ".RData"))

  # A non-finite score would make the plot below fail with an unhelpful
  # "need finite 'ylim' values" error; report the actual problem instead.
  if (!all(is.finite(scores[, ncol(scores)]))) {
    stop(sprintf("Non-finite R^2 in training round %d; cannot plot the scores or apply the stopping rule.", i),
         call. = FALSE)
  }

  plot(scores[1,], ylim=c(min(scores), max(scores)), type="l", col="black",
       main=paste0("nnet R^2 at iteration ", i*nnet_maxit), lwd=3,
       xlab="iteration", ylab="R^2")
  lines(scores[2,], col="red", lwd=3)
  lines(scores[3,], col="green", lwd=3)
  lines(scores[4,], col="blue", lwd=3)
  grid()
  legend(x="bottomright", legend=c("sample","validate","test","new data"),
         fill=c("black","red","green","blue"))

  # Stopping rule (checked from round 3 on): stop when the lowest of this
  # round's sample/validate/test scores is also the lowest value seen for
  # those three rows over the last three rounds. The window includes the
  # current round, so the test is true exactly when this round holds the
  # window minimum. The "new data" row is not part of the rule.
  # NOTE(review): the rule reads the `test` partition, so scores on that
  # partition are not independent of the stopping decision - confirm intended.
  if (i >= 3) {
    last_three <- scores[1:3, (ncol(scores) - 2):ncol(scores)]
    if (min(score1, score2, score3) <= min(last_three)) {
      break
    }
  }

  i <- i + 1
}

# Print the results of the modelling.

# Report the fitted network: layer sizes and weight count, input names,
# output name and the residual sum of squares, followed by nnet's own summary.
cat(
  sprintf("A %s network with %d weights.\n",
          paste(crs$nnet$n, collapse = "-"),
          length(crs$nnet$wts)),
  sprintf("Inputs: %s.\n",
          paste(crs$nnet$coefnames, collapse = ", ")),
  # The first entry of the terms' dataClasses is reported as the output.
  sprintf("Output: %s.\n",
          names(attr(crs$nnet$terms, "dataClasses"))[1]),
  sprintf("Sum of Squares Residuals: %.4f.\n",
          sum(residuals(crs$nnet)^2)),
  "\n",
  sep = ""
)
print(summary(crs$nnet))
cat("\n")

# Time taken: 3.36 secs

#============================================================
# Rattle timestamps: 2016-09-11 20:02:41 - 20:02:51 x86_64-w64-mingw32

# Evaluate model performance.

# NNET: Generate a Predicted v Observed plot for the nnet model on each of
# the train / validate / test partitions of dat_ready_train and on
# dat_ready_test. The Rattle log repeated the same section four times; the
# sections differed only in the data and the title label, so they are driven
# from one list here. The loop runs at top level, so crs$pr, obs, fitpoints,
# fitcorr, op and prline end up holding the values of the last (dat_ready_test)
# plot.

# Assign the R dataset to be used as the test set.

crs$testset <- dat_ready_test

eval_sets <- list(
  list(label = "dat_ready_train [**train**]",
       data  = crs$dataset[crs$sample, c(crs$input, crs$target)]),
  list(label = "dat_ready_train [validate]",
       data  = crs$dataset[crs$validate, c(crs$input, crs$target)]),
  list(label = "dat_ready_train [test]",
       data  = crs$dataset[crs$test, c(crs$input, crs$target)]),
  list(label = "dat_ready_test",
       data  = crs$testset)
)

for (eval_set in eval_sets) {

  # Each plot goes to its own graphics device.

  dev.new()

  crs$pr <- predict(crs$nnet, newdata=eval_set$data)

  # Obtain the observed output for the dataset.

  obs <- subset(eval_set$data, select=crs$target)

  # Handle in case categoric target treated as numeric: rebuild the observed
  # column as numeric, keeping the column name and the row names.

  obs.rownames <- rownames(obs)
  obs <- data.frame(as.numeric(obs[[1]]))
  names(obs) <- crs$target
  rownames(obs) <- obs.rownames

  # Combine the observed values with the predicted, dropping incomplete rows.

  fitpoints <- na.omit(cbind(obs, Predicted=crs$pr))

  # Obtain the pseudo R2 - a squared correlation.

  fitcorr <- format(cor(fitpoints[,1], fitpoints[,2])^2, digits=4)

  # Plot settings for the true points and best fit. `op` receives the previous
  # settings; they are not restored afterwards (same as the original log).

  op <- par(c(lty="solid", col="blue"))

  # Display the observed (X) versus predicted (Y) points.

  plot(fitpoints[[1]], fitpoints[[2]], asp=1, xlab=crs$target, ylab="Predicted")

  # Generate a simple linear fit between predicted and observed and add it
  # to the plot.

  prline <- lm(fitpoints[,2] ~ fitpoints[,1])
  abline(prline)

  # Add a diagonal representing perfect correlation.

  par(c(lty="dashed", col="black"))
  abline(0, 1)

  # Include the pseudo R-square on the plot.

  legend("bottomright",  sprintf(" Pseudo R-square=%s ", fitcorr),  bty="n")

  # Add a title and grid to the plot.

  title(main=paste0("Predicted vs. Observed\n Neural Net Model\n ", eval_set$label),
      sub=paste("Rattle", format(Sys.time(), "%Y-%b-%d %H:%M:%S"), Sys.info()["user"]))
  grid()
}