If you use train_users_2.csv.zip, the one available on Kaggle, the problem is that you are not deleting `country_destination` from the `train` dataset, since it is in position 16, not 1.
# Position of the target column among the columns of `train`.
which(colnames(train) %in% "country_destination")
16
Column 1 is `id`, which is unique to each observation and must also be deleted.
# TRUE when column 1 of `train` holds a distinct value on every row.
# (The subscript was missing its closing bracket, which made the line
# unparseable.)
length(unique(train[, 1])) == nrow(train)
#output
TRUE
When I run your code with the following modification:
# Build the numeric feature matrix from `train`, leaving out column 1 and
# column 16.
full_variables <- data.matrix(train[, c(-1, -16)])
library(xgboost)
# Booster settings; the class count is the number of distinct values found in
# `train_label`.
params <- list(
  objective = "multi:softprob",
  num_class = length(unique(train_label)),
  eta = 0.3,
  max_depth = 6
)
# 5-fold cross-validation on `train_matrix`, keeping the out-of-fold
# predictions (`prediction = TRUE`).
#
# `early_stopping_rounds` is the argument xgb.cv() actually recognises; the
# earlier spelling `early_stop_round` is not one of its arguments, so early
# stopping was never switched on. The built-in multi-class metrics (error /
# log-loss) are "smaller is better", hence `maximize = FALSE`.
cv_model <- xgb.cv(
  params = params,
  data = train_matrix,
  nrounds = 50,
  nfold = 5,
  early_stopping_rounds = 1,
  verbose = TRUE,
  maximize = FALSE,
  prediction = TRUE
)
0.12 .
# Out-of-fold predictions from the CV run as a data frame.
out_of_fold_p <- data.frame(cv_model$pred)
# Column index of the largest value in each row (the last one wins on ties).
out_of_fold_p$max_prob <- max.col(out_of_fold_p, ties.method = "last")
# Labels are shifted by one so they can be compared with the 1-based column
# index stored in `max_prob`.
out_of_fold_p$label <- train_label + 1
head(out_of_fold_p[, 13:14], n = 20)
max_prob label
1 8 8
2 12 12
3 12 10
4 12 12
5 12 12
6 12 12
7 12 12
8 12 12
9 8 8
10 12 5
11 12 2
12 2 12
13 12 12
14 12 12
15 12 12
16 8 8
17 8 8
18 12 5
19 8 8
20 12 12
, , y x.
EDIT: , 100% 5 CV. 22 (, , ).
# Fit a single model on `train_matrix` with the same parameters and number of
# rounds as the CV run.
# `verbose` is given as the documented integer level instead of `T`.
# `maximize` is omitted: it only steers early stopping, which this call does
# not enable.
model <- xgboost(
  params = params,
  data = train_matrix,
  nrounds = 50,
  verbose = 1
)
100% - :
# Predict on `test_matrix`, then fold the prediction vector row-wise into a
# matrix with one column per distinct value of `train_label`.
pred <- predict(model, test_matrix)
pred <- matrix(pred, ncol = length(unique(train_label)), byrow = TRUE)
# Predicted class = column index of the largest value in each row; labels are
# shifted by one so both are on the same 1-based scale.
out_of_fold_p <- data.frame(pred)
out_of_fold_p$max_prob <- max.col(out_of_fold_p, ties.method = "last")
out_of_fold_p$label <- test_label + 1
# Number of rows where the predicted class differs from the label.
sum(out_of_fold_p$max_prob != out_of_fold_p$label)
, :
# Plot feature importance for the fitted model.
# The xgb.importance() arguments are passed by name so the call does not
# depend on the positional order of its parameters.
xgb.plot.importance(
  importance_matrix = xgb.importance(
    feature_names = colnames(train_matrix),
    model = model
  )
)

, xgb.cv :
# Rebuild the training matrix from only those columns of `train_data` whose
# names appear in the fitted model's importance table.
important_features <- xgboost::xgb.importance(
  feature_names = colnames(train_matrix),
  model = model
)$Feature
keep_column <- colnames(train_data) %in% important_features
# drop = FALSE keeps the two-dimensional shape even if one column is selected.
train_matrix <- xgb.DMatrix(
  data = train_data[, keep_column, drop = FALSE],
  label = train_label
)
# Fix the RNG seed so the fold assignment is reproducible, then run 5-fold
# cross-validation on `train_matrix`, keeping the out-of-fold predictions.
#
# `early_stopping_rounds` is the argument xgb.cv() actually recognises; the
# earlier spelling `early_stop_round` is not one of its arguments, so early
# stopping was never switched on. The built-in multi-class metrics (error /
# log-loss) are "smaller is better", hence `maximize = FALSE`.
set.seed(1)
cv_model <- xgb.cv(
  params = params,
  data = train_matrix,
  nrounds = 50,
  nfold = 5,
  early_stopping_rounds = 1,
  verbose = TRUE,
  maximize = FALSE,
  prediction = TRUE
)
100% -
:
# Count how many observations carry each value of `train_label`.
table(train_label)
train_label
0 1 2 3 4 5 6 7 8 9 10 11
3 10 12 13 36 16 19 856 7 73 3 451
, 1 :
# Keep only the columns of `train_data` named in the model's importance table
# and attach the class label as a factor.
important_features <- xgb.importance(
  feature_names = colnames(train_matrix),
  model = model
)$Feature
gg <- data.frame(
  train_data[, which(colnames(train_data) %in% important_features)],
  label = as.factor(train_label)
)
# Reshape the selected feature columns to long form (one row per
# observation/feature pair) and draw label counts faceted by feature name and
# feature value.
# as_tibble() replaces the deprecated as.tibble(); pivot_longer() replaces the
# superseded gather().
gg %>%
  as_tibble() %>%
  select(1:9, 11, 12, 15:21, 23) %>%
  pivot_longer(cols = 1:18, names_to = "key", values_to = "value") %>%
  ggplot() +
  geom_bar(aes(x = label)) +
  facet_grid(key ~ value) +
  theme(strip.text.y = element_text(angle = 90))

0/1 22 , , , 100% .
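Note that classes 0 and 10 have only 3 observations each, so with 5-fold CV some folds necessarily contain none of them. The class composition of each CV fold can be checked from the folds stored by xgb.cv: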
# For every fold stored in the CV result, tabulate the labels of the
# observations indexed by that fold.
lapply(cv_model$folds, function(fold_index) {
  table(train_label[fold_index])
})