Does the XGBoost model consistently get 100% accuracy?

I am working with the Airbnb data here on Kaggle, predicting which country users will book their first trip in, using an XGBoost model with almost 600 features in R. Running the algorithm for 50 rounds of 5-fold cross-validation, I got 100% accuracy every time. After fitting the model to the training data and predicting on the test set, I also got 100% accuracy. These results cannot be real. Something must be wrong in my code, but so far I have not been able to figure it out. I have included a section of my code below; it is based on this article. Following along with the article (using its data and copying its code), I get results similar to the article's. However, applying the same code to the Airbnb data, I consistently get 100% accuracy. I have no idea what is going on. Am I using the xgboost package incorrectly? Any help is appreciated.

# set up the data  
# train is the data frame of features with the target variable to predict
full_variables <- data.matrix(train[,-1]) # country_destination removed
full_label <- as.numeric(train$country_destination) - 1 

# training data 
train_index <- caret::createDataPartition(y = train$country_destination, p = 0.70, list = FALSE)
train_data <- full_variables[train_index, ]
train_label <- full_label[train_index[,1]]
train_matrix <- xgb.DMatrix(data = train_data, label = train_label)

# test data 
test_data <- full_variables[-train_index, ]
test_label <- full_label[-train_index[,1]]
test_matrix <- xgb.DMatrix(data = test_data, label = test_label)

# 5-fold CV
classes <- length(unique(full_label)) # number of destination classes
params <- list("objective" = "multi:softprob",
               "num_class" = classes,
               eta = 0.3,
               max_depth = 6)
cv_model <- xgb.cv(params = params,
               data = train_matrix,
               nrounds = 50,
               nfold = 5,
               early_stop_round = 1,
               verbose = F,
               maximize = T,
               prediction = T)

# out-of-fold predictions (dplyr is needed for %>% and mutate)
library(dplyr)
out_of_fold_p <- data.frame(cv_model$pred) %>%
  mutate(max_prob = max.col(., ties.method = "last"),
         label = train_label + 1)
head(out_of_fold_p)

# confusion matrix
confusionMatrix(factor(out_of_fold_p$label), 
                factor(out_of_fold_p$max_prob),
                mode = "everything")

An example of the data I used for this can be obtained by running this code:

library(RCurl)
x <- getURL("https://raw.githubusercontent.com/loshita/Senior_project/master/train.csv")
y <- read.csv(text = x)
1 answer

If you are using the train_users_2.csv.zip file available on Kaggle, the problem is that you are not removing country_destination from the train dataset, since it is in position 16, not 1:

which(colnames(train) == "country_destination")
#output
16

Column 1 is id, which is unique to each observation and should also be removed:

length(unique(train[,1])) == nrow(train)
#output
TRUE

When I run your code with the following modification:

full_variables <- data.matrix(train[,-c(1, 16)]) 
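
As a side note, a slightly more robust way to do this (a sketch, equivalent to the line above) is to drop both columns by name, so the code does not depend on their positions:

# Drop the non-feature columns by name so a change in column order
# cannot silently leak the target back into the feature matrix.
drop_cols <- c("id", "country_destination")
full_variables <- data.matrix(train[, !(colnames(train) %in% drop_cols)])
full_label <- as.numeric(train$country_destination) - 1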

library(xgboost)

params <- list("objective" = "multi:softprob",
               "num_class" = length(unique(train_label)),
               eta = 0.3, 
               max_depth = 6)
cv_model <- xgb.cv(params = params,
                   data = train_matrix,
                   nrounds = 50,
                   nfold = 5,
                   early_stop_round = 1,
                   verbose = T,
                   maximize = T,
                   prediction = T)

With these columns removed, the cross-validated error is around 0.12 rather than 0. Building the out-of-fold predictions again:

out_of_fold_p <- data.frame(cv_model$pred) %>%
  mutate(max_prob = max.col(., ties.method = "last"),
         label = train_label + 1)

head(out_of_fold_p[,13:14], 20)
#output
   max_prob label
1         8     8
2        12    12
3        12    10
4        12    12
5        12    12
6        12    12
7        12    12
8        12    12
9         8     8
10       12     5
11       12     2
12        2    12
13       12    12
14       12    12
15       12    12
16        8     8
17        8     8
18       12     5
19        8     8
20       12    12

As you can see by comparing max_prob (the predicted class) with label (the true class), the predictions no longer match perfectly.
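
A one-line check of the overall out-of-fold error rate (using the out_of_fold_p data frame built above):

# fraction of misclassified out-of-fold predictions; about the 0.12 error quoted above
mean(out_of_fold_p$max_prob != out_of_fold_p$label)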

EDIT: To answer the follow-up question in the comments about why fitting on the train set and predicting on the test set gives 100% accuracy while 5-fold CV does not: the fitted model relies on just 22 features. Fitting on the training data:

model <- xgboost(params = params,
                   data = train_matrix,
                   nrounds = 50,
                   verbose = T,
                   maximize = T)

gets 100% accuracy on the test set:

pred <- predict(model, test_matrix)
pred <- matrix(pred, ncol=length(unique(train_label)), byrow = TRUE)
out_of_fold_p <- data.frame(pred) %>% mutate(max_prob = max.col(., ties.method = "last"),label = test_label + 1)

sum(out_of_fold_p$max_prob != out_of_fold_p$label) #0 errors

Looking at which features the model actually uses:

xgb.plot.importance(importance_matrix = xgb.importance(colnames(train_matrix), model))

[feature importance plot showing the features used by the model]
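
The same information is available as a table if you prefer numbers to the plot (using the model fitted above):

# feature importance ranked by gain; $Feature is used below to subset the data
importance <- xgboost::xgb.importance(colnames(train_matrix), model)
head(importance)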

If you re-run xgb.cv using only these important features:

important_features <- xgboost::xgb.importance(colnames(train_matrix), model)$Feature
train_matrix <- xgb.DMatrix(data = train_data[, colnames(train_data) %in% important_features],
                            label = train_label)

set.seed(1)
cv_model <- xgb.cv(params = params,
                   data = train_matrix,
                   nrounds = 50,
                   nfold = 5,
                   early_stop_round = 1,
                   verbose = T,
                   maximize = T,
                   prediction = T)

you also get close to 100% accuracy.

The reason is the class distribution:

table(train_label)
train_label
  0   1   2   3   4   5   6   7   8   9  10  11 
  3  10  12  13  36  16  19 856   7  73   3 451 
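
As proportions (a quick check using the same labels), two classes dominate:

# classes 7 and 11 (destinations 8 and 12 after the +1 shift) make up
# roughly 87% of the training labels
round(prop.table(table(train_label)), 3)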

Another factor is that the minor classes can be separated almost perfectly by just one or two features, as you can see by plotting the label counts for each value of each important feature:

gg <- data.frame(train_data[, colnames(train_data) %in%
                              xgb.importance(colnames(train_matrix), model)$Feature],
                 label = as.factor(train_label))

library(tidyverse) # tibble, tidyr and ggplot2 are needed for the plot below

gg %>%
  as_tibble() %>%
  select(1:9, 11, 12, 15:21, 23) %>%
  gather(key, value, 1:18) %>%
  ggplot() +
  geom_bar(aes(x = label)) +
  facet_grid(key ~ value) +
  theme(strip.text.y = element_text(angle = 90))

[faceted bar plot: label counts for each 0/1 value of the selected features]

Based on the 0/1 values of these 22 features alone, the classes can be separated almost perfectly, so the near-100% accuracy is not surprising.
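
A rough way to verify this claim (a sketch, assuming the gg data frame built above): count how many distinct feature patterns map to more than one label.

# If nearly every 0/1 feature pattern corresponds to a single label,
# the features essentially determine the class.
library(dplyr)
gg %>%
  group_by(across(-label)) %>%
  summarise(n_labels = n_distinct(label), .groups = "drop") %>%
  count(n_labels)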

As for classes 0 and 10, which have only 3 observations each: in 5-fold CV they can end up nearly or entirely absent from a training fold, in which case the model cannot learn to predict them and the CV accuracy falls short of 100%. You can check the distribution of labels across the folds from the xgb.cv output:

lapply(cv_model$folds, function(x){
  table(train_label[x])})
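
If some folds indeed lack the rare classes, a possible remedy (not part of the original answer, just a sketch) is to supply stratified folds through xgb.cv's folds argument:

# caret::createFolds stratifies on the outcome and by default returns the
# held-out indices for each fold, which is what xgb.cv's `folds` expects.
set.seed(1)
strat_folds <- caret::createFolds(factor(train_label), k = 5)
cv_model_strat <- xgb.cv(params = params,
                         data = train_matrix,
                         nrounds = 50,
                         folds = strat_folds,
                         verbose = FALSE,
                         prediction = TRUE)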
