library(caret) # for dummyVars
library(RCurl) # download https data
library(Metrics) # calculate errors
library(xgboost) # model
###############################################################################
# Load data from UCI Machine Learning Repository (https://archive.ics.uci.edu/ml/datasets.html)
urlfile <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
x <- getURL(urlfile, ssl.verifypeer = FALSE)
adults <- read.csv(textConnection(x), header=F)
# adults <-read.csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=F)
names(adults)=c('age','workclass','fnlwgt','education','educationNum',
'maritalStatus','occupation','relationship','race',
'sex','capitalGain','capitalLoss','hoursWeek',
'nativeCountry','income')
# clean up data
adults$income <- ifelse(adults$income==' <=50K',0,1)
# binarize all factors
library(caret)
dmy <- dummyVars(" ~ .", data = adults)
adultsTrsf <- data.frame(predict(dmy, newdata = adults))
###############################################################################
# what we're trying to predict adults that make more than 50k
outcomeName <- c('income')
# list of features
predictors <- names(adultsTrsf)[!names(adultsTrsf) %in% outcomeName]
# play around with settings of xgboost - eXtreme Gradient Boosting (Tree) library
# https://github.com/tqchen/xgboost/wiki/Parameters
# max.depth - maximum depth of the tree
# nrounds - the max number of iterations
# take first 10% of the data only!
trainPortion <- floor(nrow(adultsTrsf)*0.1)
trainSet <- adultsTrsf[ 1:floor(trainPortion/2),]
testSet <- adultsTrsf[(floor(trainPortion/2)+1):trainPortion,]
smallestError <- 100
for (depth in seq(1,10,1)) {
for (rounds in seq(1,20,1)) {
# train
bst <- xgboost(data = as.matrix(trainSet[,predictors]),
label = trainSet[,outcomeName],
max.depth=depth, nround=rounds,
objective = "reg:linear", verbose=0)
gc()
# predict
predictions <- predict(bst, as.matrix(testSet[,predictors]), outputmargin=TRUE)
err <- rmse(as.numeric(testSet[,outcomeName]), as.numeric(predictions))
if (err < smallestError) {
smallestError = err
print(paste(depth,rounds,err))
}
}
}
cv <- 30
trainSet <- adultsTrsf[1:trainPortion,]
cvDivider <- floor(nrow(trainSet) / (cv+1))
smallestError <- 100
for (depth in seq(1,10,1)) {
for (rounds in seq(1,20,1)) {
totalError <- c()
indexCount <- 1
for (cv in seq(1:cv)) {
# assign chunk to data test
dataTestIndex <- c((cv * cvDivider):(cv * cvDivider + cvDivider))
dataTest <- trainSet[dataTestIndex,]
# everything else to train
dataTrain <- trainSet[-dataTestIndex,]
bst <- xgboost(data = as.matrix(dataTrain[,predictors]),
label = dataTrain[,outcomeName],
max.depth=depth, nround=rounds,
objective = "reg:linear", verbose=0)
gc()
predictions <- predict(bst, as.matrix(dataTest[,predictors]), outputmargin=TRUE)
err <- rmse(as.numeric(dataTest[,outcomeName]), as.numeric(predictions))
totalError <- c(totalError, err)
}
if (mean(totalError) < smallestError) {
smallestError = mean(totalError)
print(paste(depth,rounds,smallestError))
}
}
}
###########################################################################
# Test both models out on full data set
trainSet <- adultsTrsf[ 1:trainPortion,]
# assign everything else to test
testSet <- adultsTrsf[(trainPortion+1):nrow(adultsTrsf),]
bst <- xgboost(data = as.matrix(trainSet[,predictors]),
label = trainSet[,outcomeName],
max.depth=4, nround=19, objective = "reg:linear", verbose=0)
pred <- predict(bst, as.matrix(testSet[,predictors]), outputmargin=TRUE)
rmse(as.numeric(testSet[,outcomeName]), as.numeric(pred))
bst <- xgboost(data = as.matrix(trainSet[,predictors]),
label = trainSet[,outcomeName],
max.depth=3, nround=20, objective = "reg:linear", verbose=0)
pred <- predict(bst, as.matrix(testSet[,predictors]), outputmargin=TRUE)
rmse(as.numeric(testSet[,outcomeName]), as.numeric(pred))