#### XGBOOST ####

library(dplyr)
library(tidyr)
library(caret)
library(xgboost)
library(mlr)
library(grid)
library(gridExtra)

# load the data
df <- read.csv("data/ - .csv", stringsAsFactors = F)
df <- df %>% select(c(City, Time, days))
df <- df %>% separate(Time, c("hours", "minutes"), sep = ":")

# recode day names to day-of-week numbers
df$days <- as.factor(df$days)
levels(df$days) <- c("7", "2", "1", "5", "3", "6", "4")
df[, 2:4] <- apply(df[, 2:4], 2, function(x) as.numeric(x))

# keep only the 10 most frequent cities
top10 <- df %>% count(City) %>% arrange(desc(n))
top10 <- top10$City[1:10]
df <- df %>% filter(City %in% top10)
df <- na.omit(df)

# encode City as a zero-based numeric class and keep the mapping
df$class <- as.numeric(as.factor(df$City)) - 1
City.class <- df %>% select(City, class)
City.class <- unique(City.class)
df <- df[, -1]

# split into train and test sets (test ~ 1/3)
indexes <- createDataPartition(df$class, times = 1, p = 0.7, list = F)
train <- df[indexes, ]
test <- df[-indexes, ]

# training labels
y.train <- train$class

# training matrix (features only)
train.m <- data.matrix(train[, -4])
train.m <- xgb.DMatrix(train.m, label = y.train)

# baseline cross-validation
# Stopping. Best iteration:
# [15] train-merror:0.425361+0.010171  test-merror:0.504626+0.035449
model <- xgb.cv(data = train.m, nfold = 4, eta = 0.03, nrounds = 2000,
                num_class = 10, objective = "multi:softmax",
                early_stopping_rounds = 200)

# hyperparameter tuning with mlr
train$class <- as.factor(train$class)
traintask <- makeClassifTask(data = train, target = "class")
lrn <- makeLearner("classif.xgboost", predict.type = "response")
lrn$par.vals <- list(objective = "multi:softmax", eval_metric = "merror",
                     nrounds = 15, eta = 0.03)
params <- makeParamSet(
  makeDiscreteParam("booster", values = c("gbtree", "gblinear")),
  makeIntegerParam("max_depth", lower = 1, upper = 10),
  makeNumericParam("min_child_weight", lower = 1, upper = 10),
  makeNumericParam("subsample", lower = 0.5, upper = 1),
  makeNumericParam("colsample_bytree", lower = 0.5, upper = 1))
rdesc <- makeResampleDesc("CV", iters = 4)

# random search, 10 iterations
ctrl <- makeTuneControlRandom(maxit = 10)

# run the tuning
mytune <- tuneParams(learner = lrn, task = traintask, resampling = rdesc,
                     par.set = params, control = ctrl, show.info = T)
# [Tune-y] 10: mmce.test.mean=0.525; time: 0.0 min
# [Tune] Result: booster=gbtree; max_depth=10; min_child_weight=5;
# subsample=0.99; colsample_bytree=0.907 : mmce.test.mean=0.516

# xgboost parameters from the tuning result
param <- list("num_class" = 10,
              "objective" = "multi:softmax",
              "eval_metric" = "merror",
              "eta" = 0.03,
              "max_depth" = 10,
              "min_child_weight" = 5,
              "subsample" = 0.99,
              "colsample_bytree" = 0.907)

# cross-validate with the tuned parameters
model <- xgb.cv(data = train.m, params = param, nfold = 4, nrounds = 20000,
                early_stopping_rounds = 100)
# Stopping. Best iteration:
# [84] train-merror:0.462308+0.015107  test-merror:0.509050+0.028020

# final xgboost model
model <- xgboost(data = train.m, params = param, nrounds = 84,
                 scale_pos_weight = 5)

# test matrix and labels
y.test <- test$class
test <- data.matrix(test[, -4])

# feature importance
mat <- xgb.importance(feature_names = colnames(train.m), model = model)
xgb.plot.importance(importance_matrix = mat, main = "Feature importance:")

# predict on the test set
y.predict <- predict(model, test)

# map numeric classes back to city names
replace.class <- function(x) {
  for (i in unique(x)) {
    x[x == i] <- City.class$City[City.class$class == i]
  }
  return(x)
}

# confusion matrix on the test set
confusionMatrix(factor(replace.class(y.predict), levels = City.class$City),
                factor(replace.class(y.test), levels = City.class$City))

# example: predict the city for 10 random time points
df_test <- data.frame(hours = as.numeric(sample(x = c(0:23), size = 10, replace = T)),
                      minutes = as.numeric(sample(x = c(0:59), size = 10, replace = T)),
                      days = as.numeric(sample(x = c(1:7), size = 10, replace = T)))

# predicted city
df_test$City <- replace.class(predict(model, data.matrix(df_test)))

# arrange the result table and render it
df_test <- df_test[c("City", "days", "hours", "minutes")]
colnames(df_test) <- c("City", "Day", "Hours", "Minutes")
df_test <- df_test %>% arrange(City)
grid.text("xgboost predictions", x = 0.5, y = 0.93,
          just = c("centre", "bottom"), gp = gpar(fontsize = 16))
grid.table(df_test)
grid.text("Data: blablacar.ru", x = 0.02, y = 0.01,
          just = c("left", "bottom"), gp = gpar(fontsize = 11))
grid.text("silentio.su", x = 0.98, y = 0.01,
          just = c("right", "bottom"), gp = gpar(fontsize = 11))