basic-app.Rmd
First, some setup, though the RNGkind
does not seem to resolve the reproducibility issue of mclapply
!
library(pois) library(Matrix) library(corrplot) #> corrplot 0.84 loaded library(parallel) RNGkind("L'Ecuyer-CMRG") set.seed(1000)
Let us now load some data. simcctox
is a simulated dataset avaialbe in the package. It is simulated from a POIS model fitted to a real toxicity data. We pick a ranfom subsample of size 150 from simcctox dataset as a toy example:
X = simcctox[sample(nrow(simcctox), 150), ] # X = order_cols_by_freq(X)[, 1:12] # order columns by frequency and pick the first 12
We check whether any column has 0 or 1 observations. glmnet
would complain in this case:
check_01_cols(X) # check whether any columns has 0 or 1 observations #> [1] FALSE
The 0-1 issue could still happen during CV-subsampling, even if the original data has passed the test. The code does not check for that right now (or try to resample to avoid it), so better check to make sure the column sums are large enough:
Matrix::colSums(X) #> Lycd Drmr Drrh Fatg Anlp Anrx Rctp Bckp Anem Wbcd Abdp Rmc- #> 81 57 53 44 34 29 26 23 20 19 15 15
We can now fit the POIS model with pair-complement MMD cross-validation:
out = fit_pois_glmnet_mmdcv(X, nlam = 5, nreps = 3) #> --- Starting CV --- #> Rep 1/3 ... #> POIS: glment, global, nocv ... 0.225 (s). #> lam = 1.00e-04 [Computing pair-comp. MMD (nBasis = 64, nPairs = 66) ... 0.3 (s). value = 0.147] #> lam = 4.73e-04 [Computing pair-comp. MMD (nBasis = 64, nPairs = 66) ... 0.2 (s). value = 0.170] #> lam = 2.24e-03 [Computing pair-comp. MMD (nBasis = 64, nPairs = 66) ... 0.3 (s). value = 0.158] #> lam = 1.06e-02 [Computing pair-comp. MMD (nBasis = 64, nPairs = 66) ... 0.3 (s). value = 0.127] #> lam = 5.01e-02 [Computing pair-comp. MMD (nBasis = 64, nPairs = 66) ... 0.3 (s). value = 0.164] #> Rep 2/3 ... #> POIS: glment, global, nocv ... 0.210 (s). #> lam = 1.00e-04 [Computing pair-comp. MMD (nBasis = 64, nPairs = 66) ... 0.3 (s). value = 0.175] #> lam = 4.73e-04 [Computing pair-comp. MMD (nBasis = 64, nPairs = 66) ... 0.2 (s). value = 0.097] #> lam = 2.24e-03 [Computing pair-comp. MMD (nBasis = 64, nPairs = 66) ... 0.2 (s). value = 0.026] #> lam = 1.06e-02 [Computing pair-comp. MMD (nBasis = 64, nPairs = 66) ... 0.2 (s). value = 0.030] #> lam = 5.01e-02 [Computing pair-comp. MMD (nBasis = 64, nPairs = 66) ... 0.2 (s). value = 0.010] #> Rep 3/3 ... #> POIS: glment, global, nocv ... 0.202 (s). #> lam = 1.00e-04 [Computing pair-comp. MMD (nBasis = 64, nPairs = 66) ... 0.3 (s). value = 0.039] #> lam = 4.73e-04 [Computing pair-comp. MMD (nBasis = 64, nPairs = 66) ... 0.2 (s). value = 0.008] #> lam = 2.24e-03 [Computing pair-comp. MMD (nBasis = 64, nPairs = 66) ... 0.2 (s). value = 0.032] #> lam = 1.06e-02 [Computing pair-comp. MMD (nBasis = 64, nPairs = 66) ... 0.2 (s). value = 0.006] #> lam = 5.01e-02 [Computing pair-comp. MMD (nBasis = 64, nPairs = 66) ... 0.2 (s). value = 0.039] #> --- End of CV --- Chose lam = 0.01 #> Fitting the final model ... #> POIS: glment, global, nocv ... 0.210 (s). opt_idx = out$opt_idx printf('Optimal model = #%d, with lambda = %3.2e', opt_idx, out$opt_lambda) #> Optimal model = #4, with lambda = 1.06e-02
Let us also plot the regularization curve (pair-complement MMDvs. \(\lambda\)):
par(mar = c(4,4,0,0)) plot(out$lambda, out$reg_curve, log = "x", type="l", xlab="lambda", ylab="pair-complement MMD")
Let us pick a less regularized model that what CV gives …
model = out$models[[opt_idx-1]] # We tend to over-regularize, pick a less regularized model print(model) #> Pois model with dimension = 12 and # of levels 4. #> Pervalance of -Inf in Theta = 47.92% #> Sparsity of Gamma = 11.11%
… and plot its \(\Gamma\) paramter, i.e., POIS’s interaction matrix:
par(mar = c(0,0,0,0)) corr_type = "upper" # "full" corrplot(model$Gamma, is.corr = F, type=corr_type, method = "square", tl.cex = 0.8, tl.col = "black", diag = F, col=colorRampPalette(c("blue","gray","red"))(50))