library(readr)
library(xgboost)
library(data.table)
library(Matrix)
library(caret)

# The competition data files are in the directory ../input
# Read competition data files:
train <- read_csv("../input/train.csv")
test  <- read_csv("../input/test.csv")

# Keep a copy of the Id variables for the test and train data
train_Id <- train$Id
test_Id  <- test$Id

# Response variable from the training data
train_y <- train$Hazard

# Predictor variables from the training data
train_x <- subset(train, select = -c(Id, Hazard))
train_x <- sparse.model.matrix(~ ., data = train_x)

# Predictor variables from the test data
# (note: building the model matrices separately assumes train and test share
#  the same factor levels; if they differ, the column sets will not line up)
test_x <- subset(test, select = -c(Id))
test_x <- sparse.model.matrix(~ ., data = test_x)

# Gini helper functions, used below in a custom xgboost evaluation metric
SumModelGini <- function(solution, submission) {
  df <- data.frame(solution = solution, submission = submission)
  df <- df[order(df$submission, decreasing = TRUE), ]
  df$random <- (1:nrow(df)) / nrow(df)
  totalPos <- sum(df$solution)
  # Cumulative number of positives found (used to compute the model Lorentz curve)
  df$cumPosFound <- cumsum(df$solution)
  # Cumulative proportion of positives found (the model Lorentz curve)
  df$Lorentz <- df$cumPosFound / totalPos
  # Lorentz minus the random baseline
  df$Gini <- df$Lorentz - df$random
  return(sum(df$Gini))
}

NormalizedGini <- function(solution, submission) {
  SumModelGini(solution, submission) / SumModelGini(solution, solution)
}

# Wrap the metric in a function that can be passed to xgb.train via feval
evalgini <- function(preds, dtrain) {
  labels <- getinfo(dtrain, "label")
  err <- NormalizedGini(as.numeric(labels), as.numeric(preds))
  return(list(metric = "Gini", value = err))
}

# Set xgboost parameters
# ("reg:linear" is the squared-error regression objective; newer xgboost
#  releases call it "reg:squarederror")
param <- list("objective"        = "reg:linear",
              "eta"              = 0.002,
              "min_child_weight" = 5,
              "subsample"        = 0.8,
              "colsample_bytree" = 0.8,
              "scale_pos_weight" = 1.0,
              "max_depth"        = 8)
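# --- Optional sanity check of the custom metric (not part of the original
#     script): a minimal toy example showing how NormalizedGini behaves.
#     The vectors below are made-up illustrative values, not competition data.
#     A perfect ordering scores 1; a partially shuffled ordering scores less
#     (here 0.9, since only the last two items are swapped).
toy_actual <- c(1, 2, 3, 4, 5)
NormalizedGini(toy_actual, toy_actual)          # perfect ranking -> 1
NormalizedGini(toy_actual, c(1, 2, 3, 5, 4))    # last two swapped -> 0.9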
# Hold out the first 5,000 rows of the training data for early stopping.
offset     <- 5000
num_rounds <- 3000
set.seed(2015)

# Set up the xgboost test, training, and validation datasets
# (note: row `offset` appears in both the training and validation sets;
#  a stricter split would use (offset + 1):nrow(train_x) for training)
xgtest  <- xgb.DMatrix(data = test_x)
xgtrain <- xgb.DMatrix(data  = train_x[offset:nrow(train_x), ],
                       label = train_y[offset:nrow(train_x)])
xgval   <- xgb.DMatrix(data  = train_x[1:offset, ],
                       label = train_y[1:offset])

# Set up a watchlist to monitor both training and validation error;
# the validation set must come first for early stopping to use it.
watchlist <- list(val = xgval, train = xgtrain)

# To train with a watchlist, use xgb.train, which exposes more advanced features.
# (print.every.n and early.stop.round follow the xgboost API of the time;
#  newer releases name these print_every_n and early_stopping_rounds)

#----------------------------------------
# Fit and evaluate using the custom Gini metric
# Best iteration: 2282  Best score (Gini): 0.3675499
# Public leaderboard Gini: 0.346841
#----------------------------------------
bst2a <- xgb.train(params           = param,
                   data             = xgtrain,
                   feval            = evalgini,
                   nround           = num_rounds,
                   print.every.n    = 50,
                   watchlist        = watchlist,
                   early.stop.round = 50,
                   maximize         = TRUE)
preds2a <- predict(bst2a, xgtest)

# Output submission
pred2a_df <- data.frame(Id = test_Id, Hazard = preds2a)
write.table(pred2a_df, file = "../output/xgboost_2a.csv",
            row.names = FALSE, col.names = TRUE, sep = ",", quote = FALSE)

#----------------------------------------
# Fit and evaluate using the default RMSE metric (no feval supplied)
# Best iteration: 2409  Best score (RMSE): 3.715605
# Public leaderboard Gini: 0.380114
#----------------------------------------
bst2b <- xgb.train(params           = param,
                   data             = xgtrain,
                   nround           = num_rounds,
                   print.every.n    = 50,
                   watchlist        = watchlist,
                   early.stop.round = 50,
                   maximize         = FALSE)
preds2b <- predict(bst2b, xgtest)

# Output submission
pred2b_df <- data.frame(Id = test_Id, Hazard = preds2b)
write.table(pred2b_df, file = "../output/xgboost_2b.csv",
            row.names = FALSE, col.names = TRUE, sep = ",", quote = FALSE)
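#----------------------------------------
# Optional post-write sanity check (not part of the original pipeline):
# read one submission back and confirm it has one row per test Id and the
# expected column names before uploading. The path matches the file written
# above; the check itself is only an illustrative sketch.
#----------------------------------------
check_sub <- read_csv("../output/xgboost_2b.csv")
stopifnot(nrow(check_sub) == length(test_Id),
          identical(names(check_sub), c("Id", "Hazard")))
summary(check_sub$Hazard)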