I use glmnet to fit some models and cross-validate over lambda. By default I use cv.glmnet (since it cross-validates lambda internally), but below I focus on the first step of that function, which is where the problem occurs.
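For concreteness, the direct call I would normally make is roughly the following sketch (using the objects defined below; the foldid and parallel arguments mirror my setup):

# the standard call this question works around (sketch)
cvfit = cv.glmnet(smat, ymat, family = 'binomial', alpha = 1,
                  foldid = DT$cv_grp, parallel = TRUE)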
First, the data setup. I have not managed to make a reproducible example and cannot share the raw data, but dim(smat) is roughly 4.7M rows by 50 columns, about half of which are dense. I tried to simplify reproducing the problem with completely random columns, but to no avail.
library(data.table)
DT = fread(...)
n_cv = 10L
# draw one uniform random number per ID, then split the IDs into
# n_cv roughly equal cross-validation groups by rank
IDs = DT[ , .(rand_id = runif(1L)), keyby = ID]
IDs[order(rand_id), cv_grp := .I %% n_cv + 1L]
# map the group assignment back onto the full table
DT[IDs, cv_grp := i.cv_grp, on = 'ID']
setkey(DT, cv_grp)
# record each row's position for subsetting later
DT[ , rowN := .I]
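A quick sanity check on the assignment (my addition, not part of the pipeline) confirms the folds come out roughly balanced:

# rows per cross-validation group; should be ~470K each
DT[ , .N, keyby = cv_grp]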
library(glmnet)
library(Matrix)
model = y ~ ...
# sparse design matrix plus a two-column 0/1 response matrix,
# as accepted by the binomial family
smat = sparse.model.matrix(model, data = DT)
ymat = diag(2L)[factor(DT$y), ]
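In case the ymat line is unfamiliar: diag(2L)[factor(DT$y), ] expands the two-level response into the two-column 0/1 matrix that glmnet's binomial family accepts. A toy illustration (my addition):

y_toy = c('a', 'b', 'b', 'a')
diag(2L)[factor(y_toy), ]
#      [,1] [,2]
# [1,]    1    0
# [2,]    0    1
# [3,]    0    1
# [4,]    1    0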
Below is an adapted version of what cv.glmnet does before handing off to cv.lognet:
train_models = lapply(seq_len(n_cv), function(i) {
  # all rows except fold i (negated join on the cv_grp key)
  train_idx = DT[!.(i), rowN]
  glmnet(smat[train_idx, , drop = FALSE], ymat[train_idx, ],
         alpha = 1, family = 'binomial')
})
And here is the equivalent, mimicking what happens with parallel = TRUE:
library(doMC)
registerDoMC(detectCores())
train_models_par = foreach(i = seq_len(n_cv), .packages = c("glmnet", "data.table")) %dopar% {
  train_idx = DT[!.(i), rowN]
  glmnet(smat[train_idx, , drop = FALSE], ymat[train_idx, ],
         alpha = 1, family = 'binomial')
}
Sequentially, everything works: each element of train_models is a proper glmnet fit (any(sapply(train_models, is.null)) returns FALSE). Not so in parallel:

sapply(train_models_par, is.null)
Some of the fits come back NULL (typically, though not always, the one for cv_grp = 2). As far as I know glmnet never returns NULL itself, so the object is being lost somewhere inside foreach; running with .verbose = TRUE turned up nothing useful. I also suspected the data.table subsetting inside the workers, but cv.glmnet itself (which shows the same problem) subsets with plain which = foldid == i, with no data.table involved.
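One diagnostic sketch (my addition; cv.glmnet does not do this): wrap the worker body in tryCatch so an R-level error is returned as a message rather than lost. If an element still comes back NULL after this, the fork itself presumably died without delivering a result, which multicore backends report as NULL:

train_models_dbg = foreach(i = seq_len(n_cv), .packages = c("glmnet", "data.table")) %dopar% {
  tryCatch({
    train_idx = DT[!.(i), rowN]
    glmnet(smat[train_idx, , drop = FALSE], ymat[train_idx, ],
           alpha = 1, family = 'binomial')
  }, error = function(e) conditionMessage(e))  # return the error text instead of NULL
}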
What is going on here? Why do some of the parallel fits come back NULL, and how can I debug it? Could this be a resource issue, e.g. the forked workers running out of memory?
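To put a number on the memory hypothesis (my own back-of-the-envelope assumption, not verified): each forked worker materializes its own copy of roughly 9/10 of smat via smat[train_idx, ], so the transient footprint is about detectCores() * 0.9 * object.size(smat) on top of the parent process:

# rough peak memory across all forked workers (assumption), in GiB
print(object.size(smat), units = 'auto')
detectCores() * 0.9 * as.numeric(object.size(smat)) / 2^30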
For reference, my session and hardware info:
sessionInfo()
system('free -m')
detectCores()
system('lscpu | grep "Model name"')