--- title: "reproducibility - application" output: rmarkdown::html_vignette editor_options: chunk_output_type: console vignette: > %\VignetteIndexEntry{application} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} --- This script requires that the working directory includes the folders "data", "results", and "manuscript", with the files "PPMI_Baseline_Data_02Jul2018.csv", "PPMI_Year_1-3_Data_02Jul2018.csv" and "PPMI_Data_Dictionary_for_Baseline_Dataset_Jul2018.csv" in the folder "data". We obtained our results using R 4.3.0 (2023-04-21) with cornet 0.0.8 (2023-06-01) on a local machine (aarch64-apple-darwin20, macOS Ventura 13.4). ```{r setup, include=FALSE} knitr::opts_chunk$set(echo=TRUE,eval=FALSE) #setwd("~/Desktop/cornet") #utils::install.packages(pkgs=c("devtools","missRanger","xtable","randomForest","MLmetrics")) #devtools::install_github("rauschenberger/cornet") ``` # Processing ```{r process} # features X <- read.csv("data/PPMI_Baseline_Data_02Jul2018.csv",row.names="PATNO",na.strings=c(".","")) X <- X[X$APPRDX==1,] # Parkinson's disease X[c("SITE","APPRDX","EVENT_ID","symptom5_comment")] <- NULL 100*mean(is.na(X)) # proportion missingness #x <- mice::complete(data=mice::mice(X,m=10,maxit=5,method="pmm",seed=1),action="all") # low-dimensional x <- lapply(seq_len(10),function(x) missRanger::missRanger(data=X,pmm.k=3, num.trees=100,verbose=0,seed=1)) # high-dimensional x <- lapply(x,function(x) model.matrix(~.-1,data=x)) # outcome Y <- read.csv("data/PPMI_Year_1-3_Data_02Jul2018.csv",na.strings=".") Y <- Y[Y$APPRDX==1 & Y$EVENT_ID %in% c("V04","V06","V08"),] Y <- Y[,c("EVENT_ID","PATNO","moca")] Y <- reshape(Y,idvar="PATNO",timevar="EVENT_ID",direction="wide") rownames(Y) <- Y$PATNO; Y$PATNO <- NULL # overlap names <- intersect(rownames(X),rownames(Y)) Y <- Y[names,]; x <- lapply(x,function(x) x[names,]) save(Y,x,file="data/processed_data.RData") writeLines(text=capture.output(utils::sessionInfo(),cat("\n"), sessioninfo::session_info()),con="data/info_data.txt") # dictionary rm(list=ls()) info <- read.csv("data/PPMI_Data_Dictionary_for_Baseline_Dataset_Jul2018.csv",nrows=238) info <- info[!info$Variable %in% c("","PATNO","SITE","APPRDX","EVENT_ID","symptom5_comment"),] info <- paste(paste0("\\item \\textit{",info$Variable,"}: ",info$Description),collapse=" ") info <- gsub(pattern=" \\(OFF\\)",replacement="",x=info) info <- gsub(pattern="_",replacement="\\_",x=info,fixed=TRUE) info <- gsub(pattern="&",replacement="\\\\&",x=info) cat(info) ``` # Analysis ```{r analyse} load("data/processed_data.RData",verbose=TRUE) colSums(!is.na(Y)) # sample size round(100*colMeans(Y<25.5,na.rm=TRUE),1) # proportion impairment # --- Cross-validating models. --- names <- paste0(rep(c("lasso","ridge"),each=3),seq_len(3)) # change to 3! loss <- fit <- p.log <- p.lin <- list() for(i in seq_along(x)){ loss[[i]] <- fit[[i]] <- p.log[[i]] <- p.lin[[i]] <- list() cat("i =",i,"\n") for(j in seq_along(names)){ cat("j =",j," ") alpha <- 1*(substr(names[j],start=1,stop=5)=="lasso") index <- as.numeric(substr(names[j],start=6,stop=6)) y <- Y[,index] cond <- !is.na(y) set.seed(i) loss[[i]][[names[j]]] <- cornet::cv.cornet(y=y[cond],cutoff=25.5, X=x[[i]][cond,],alpha=alpha,rf=(alpha==1),xgboost=(alpha==1)) set.seed(i) fit[[i]][[names[j]]] <- cornet::cornet(y=y[cond],cutoff=25.5, X=x[[i]][cond,],alpha=alpha) set.seed(i) temp <- replicate(n=50,expr=cornet:::.test(y=y[cond],cutoff=25.5,X=x[[i]][cond,],alpha=alpha)) p.log[[i]][[names[j]]] <- median(unlist(temp["log",])) p.lin[[i]][[names[j]]] <- median(unlist(temp["lin",])) } cat("\n") } save(loss,fit,p.log,p.lin,file="results/application.RData") writeLines(text=capture.output(utils::sessionInfo(),cat("\n"), sessioninfo::session_info()),con="results/info_app.txt") ``` # Results ```{r table_test} load("results/application.RData",verbose=TRUE) k <- "binomial" # compare: k <- "gaussian names <- c(paste0("lasso",1:3),paste0("ridge",1:3)) frame <- data.frame(row.names=names) for(i in names){ # deviance dev <- as.data.frame(t(sapply(loss,function(x) x[[i]]$deviance))) dec <- (dev$combined-dev[[k]])/dev[[k]] # change in percent frame[i,"dev.min"] <- round(100*min(dec),digits=1) frame[i,"dev.max"] <- round(100*max(dec),digits=1) # proportion with improvement frame[i,"dev.num"] <- sum(dev$combined < dev[[k]]) # significance based on multi-split if(k=="binomial"){ pvalue <- sapply(p.log,function(x) x[[i]]) } if(k=="gaussian"){ pvalue <- sapply(p.lin,function(x) x[[i]]) } frame[i,"pval.min"] <- round(min(pvalue),digits=3) frame[i,"pval.max"] <- round(max(pvalue),digits=3) # quantiles weight parameter q <- round(quantile(sapply(fit,function(x) x[[i]]$pi.min),probs=c(0,0.5,1)),digits=2) frame[i,"pi.min"] <- q[1] frame[i,"pi.med"] <- q[2] frame[i,"pi.max"] <- q[3] # quantiles scale parameter q <- round(quantile(sapply(fit,function(x) x[[i]]$sigma.min),probs=c(0,0.5,1)),digits=2) frame[i,"sd.min"] <- q[1] frame[i,"sd.med"] <- q[2] frame[i,"sd.max"] <- q[3] } # presentation frame <- format(frame) rownames(frame) <- paste0(substr(x=rownames(frame),start=1,stop=5)," ", substr(x=rownames(frame),start=6,stop=6)) colnames(frame) <- gsub(pattern="dev.",replacement="\\\\delta_{\\\\text{",x=colnames(frame)) colnames(frame) <- gsub(pattern="pval.",replacement="p_{\\\\text{",x=colnames(frame)) colnames(frame) <- gsub(pattern="pi.",replacement="\\\\pi_{\\\\text{",x=colnames(frame)) colnames(frame) <- gsub(pattern="sd.",replacement="\\\\sigma_{\\\\text{",x=colnames(frame)) colnames(frame) <- paste0("$",colnames(frame),"}}$") xtable <- xtable::xtable(frame,align="r|rrc|cc|ccc|ccc") xtable::print.xtable(xtable,include.rownames=TRUE,sanitize.text.function=function(x) x,comment=FALSE) ``` ```{r table_change} load("results/application.RData",verbose=TRUE) names <- c(paste0("lasso",1:3),paste0("ridge",1:3)) type <- c("deviance","class","mse","mae","auc","prauc") k <- "binomial" frame <- matrix(NA,nrow=length(names),ncol=length(type),dimnames=list(names,type)) for(i in names){ for(j in type){ value <- as.data.frame(t(sapply(loss,function(x) x[[i]][[j]]))) change <- 100*(value$combined-value[[k]])/value[[k]] frame[i,j] <- median(change) } } frame <- format(round(frame,digits=1)) frame <- gsub(pattern=" ",replacement="+",x=frame) colnames(frame) <- sapply(colnames(frame),function(x) switch(x,class="\\textsc{mcr}",mse="\\textsc{mse}",mae="\\textsc{mae}",auc="\\textsc{roc-auc}",prauc="\\textsc{pr-auc}",x)) colnames(frame) <- paste0("$\\Delta$",colnames(frame)) rownames(frame) <- paste0(substr(x=rownames(frame),start=1,stop=5)," ", substr(x=rownames(frame),start=6,stop=6)) xtable <- xtable::xtable(frame,align="r|cccccc") xtable::print.xtable(xtable,include.rownames=TRUE,sanitize.text.function=function(x) x,comment=FALSE) ``` ```{r table_other} load("results/application.RData",verbose=TRUE) table <- numeric() for(i in 1:3){ lasso <- apply(sapply(loss,function(x) x[[paste0("lasso",i)]]$deviance),1,median) names(lasso)[names(lasso)=="binomial"] <- "logistic lasso regression" names(lasso)[names(lasso)=="combined"] <- "combined lasso regression" ridge <- apply(sapply(loss,function(x) x[[paste0("ridge",i)]]$deviance),1,median) names(ridge)[names(ridge)=="binomial"] <- "logistic ridge regression" names(ridge)[names(ridge)=="combined"] <- "combined ridge regression" table <- cbind(table,c(lasso[c("logistic lasso regression","combined lasso regression")],ridge[c("logistic ridge regression","combined ridge regression")],lasso["rf"],lasso["xgboost"])) rownames(table)[rownames(table)=="rf"] <- "\\texttt{randomForest}" rownames(table)[rownames(table)=="xgboost"] <- "\\texttt{xgboost}" } colnames(table) <- paste("year",1:3) rownames(table) <- paste("\\footnotesize",rownames(table)) xtable <- xtable::xtable(table) xtable::print.xtable(xtable,comment=FALSE,hline.after=c(0,2,4),sanitize.text.function=identity) ``` # Figures ```{r figure_MAP} load("results/application.RData",verbose=TRUE) sum <- fit[[1]]$lasso1 sum$cvm <- Reduce("+",lapply(fit,function(x) x$lasso1$cvm)) sum$sigma.min <- sapply(fit,function(x) x$lasso1$sigma.min) sum$pi.min <- sapply(fit,function(x) x$lasso1$pi.min) grDevices::pdf("manuscript/figure_MAP.pdf",width=4,height=3) graphics::par(mar=c(4,4,0.5,0.5)) cornet:::plot.cornet(sum) grDevices::dev.off() ``` ```{r figure_TFN} rm(list=ls()) sigma <- c(1,2,3); cutoff <- 25.5 x <- seq(from=20,to=30,length.out=100) grDevices::pdf("manuscript/figure_TFN.pdf",width=4,height=3) graphics::par(mar=c(4,4,0.5,0.5)) graphics::plot.new() graphics::plot.window(xlim=range(x),ylim=c(0,1)) graphics::box() graphics::axis(side=1) graphics::axis(side=2) graphics::title(xlab=expression(hat(y)),ylab=expression(Phi(hat(y),mu,sigma^2)),line=2.5) graphics::abline(h=0.5,lty=2,col="grey") graphics::abline(v=cutoff,lty=2,col="grey") lty <- c(2,1,3); lwd <- c(1,1,2) lty <- c("dashed","solid","dotted") for(i in seq_along(sigma)){ p <- stats::pnorm(q=x,mean=cutoff,sd=sigma[i]) graphics::lines(x=x,y=p,lty=lty[i],lwd=lwd[i]) } graphics::text(x=cutoff,y=0.40,labels=bquote(mu==.(cutoff)),pos=4) legend <- sapply(sigma,function(x) as.expression(bquote(sigma == .(x)))) graphics::legend(x="topleft",legend=legend,lty=lty,bty="n",lwd=lwd) grDevices::dev.off() ```