rm(list = ls()) library(vtreat) library(onion) library(rgl) data(bunny) importanceThreshold <- 0.95 bunny_dat <- as.data.frame(bunny) inputs <- append(as.numeric(bunny_dat$x), as.numeric(bunny_dat$y)) for (i in 1:10){ naming <- paste('input_noise_', i, sep = '') bunny_dat[, eval(naming)] <- inputs[sample(length(inputs), nrow(bunny_dat), replace = T)] } sampleA <- bunny_dat[sample(nrow(bunny_dat), nrow(bunny_dat) / 2, replace = F), c("x", "y", "input_noise_1", "input_noise_2", "input_noise_3", "input_noise_4", "input_noise_5", "input_noise_6", "input_noise_7", "input_noise_8", "input_noise_9", "input_noise_10", "z")] sampleB <- bunny_dat[!row.names(bunny_dat) %in% rownames(sampleA), c("x", "y", "input_noise_1", "input_noise_2", "input_noise_3", "input_noise_4", "input_noise_5", "input_noise_6", "input_noise_7", "input_noise_8", "input_noise_9", "input_noise_10", "z")] targetName <- tail(colnames(sampleA),1) predictorNames <- head(colnames(sampleA), ncol(sampleA)-1) #So a good upper bound on the pruning threshold might be 1/nvar, where nvar is the number of variables. #Set value "1" to keep all variables; or value close to 0 to keep only most significant ones pruneSig = 1 #1.0/(ncol(sampleA)-1) extractProjection <- function(ndim,princ) { return(princ$rotation[,1:ndim]) } rsq <- function(x,y) { 1 - sum((y-x)^2)/sum((y-mean(y))^2) } treatmentsN <- designTreatmentsN(dframe = sampleA, varlist = predictorNames, outcomename = targetName, verbose = FALSE) #scale=TRUE in this case is "move to outcome-scale" or "Y-scale", not the default scale dTrainNTreatedYScaled <- prepare(treatmentplan = treatmentsN, dframe = sampleA, pruneSig = pruneSig, scale = TRUE) vars <- setdiff(colnames(dTrainNTreatedYScaled),targetName) # prcomp defaults to scale. = FALSE, but we already scaled/centered in vtreat- which we don't want to lose. dmTrain <- as.matrix(dTrainNTreatedYScaled[,vars]) princ <- prcomp(x = dmTrain, center = FALSE, scale. = FALSE, tol=NULL) totalComponents <- ncol(princ$x) componentsToUse <- totalComponents for(i in 1:totalComponents){ componentsToUse <- i if(summary(princ)$importance[3,i] >= importanceThreshold){ break; } } if(componentsToUse<2){ componentsToUse = 2 } PCnameList <- colnames(princ$x)[1:componentsToUse] proj <- extractProjection(componentsToUse,princ) projectedTrain <- as.data.frame(dmTrain %*% proj, stringsAsFactors = FALSE) projectedTrain[, targetName] <- dTrainNTreatedYScaled[, targetName] model <- lm(paste(targetName, paste(PCnameList,collapse=' + '),sep=' ~ '),data=projectedTrain) projectedTrain$estimate <- predict(object = model,newdata=projectedTrain) trainrsq = rsq(projectedTrain$estimate,projectedTrain[, targetName]) cat("sampleA R^2:", trainrsq,"\n") dFronttestNTreatedYScaled <- prepare(treatmentplan = treatmentsN, dframe = sampleB[, predictorNames], pruneSig = pruneSig, scale = TRUE) dmFronttest <- as.matrix(dFronttestNTreatedYScaled[,vars]) projectedFronttest <- as.data.frame(dmFronttest %*% proj, stringsAsFactors = FALSE) #projectedFronttest[, targetName] <- dFronttestNTreatedYScaled[, targetName] projectedFronttest$estimate <- predict(object = model, newdata=projectedFronttest) frontestrsq = rsq(projectedFronttest$estimate,sampleB[, targetName]) cat("sampleB R^2 (predicted Z vs sampleB$z):", frontestrsq,"\n") open3d() points3d(sampleA[,c("x","y","z")], col=8, size=0.1) open3d() points3d(cbind(sampleB[,c("x","y")], projectedFronttest$estimate), col=8, size=0.1) #Plot predictor loadings library('ggplot2') library('tidyr') dotplot_identity = function(frame, xvar, yvar, colorvar=NULL) { if(is.null(colorvar)) { gplot = ggplot(frame, aes_string(x=xvar, y=yvar, ymax=yvar)) } else { gplot = ggplot(frame, aes_string(x=xvar, y=yvar, ymax=yvar, color=colorvar)) } gplot + geom_point() + geom_linerange(aes(ymin=0)) } projFull <- extractProjection(ncol(princ$x),princ) rotf = as.data.frame(projFull) rotf$varName = rownames(rotf) rotflong = gather(rotf, "PC", "loading", starts_with("PC")) rotflong$vartype = "signal" dotplot_identity(rotflong, "varName", "loading", "vartype") + facet_wrap(~PC,nrow=1) + coord_flip() + ggtitle("Y-Scaled Variable loadings, first five principal components") + scale_color_manual(values = c("noise" = "#d95f02", "signal" = "#1b9e77")) summary(model) cat("Component importance:\n") summary(princ)$importance[3,]