This file contains the code for reproducing the results shown in the related manuscript (numbers, tables, figures). We obtained our results on 2023-09-24 using R version 4.3.0 (2023-04-21) running natively on a MacBook Pro (2020) with an Apple M1 chip, 16 GB memory, and the operating system macOS Ventura (version 13.5.2). The total computation time was around 11 hours.
Choose the directory containing the sub-directories “data”, “results” and “manuscript”.
The directory “data” may be empty for the first external
application (script loads data - which are also available here
- from the R package GRridge
)
but must contain the file “pone.0181468.s001.csv” for the second
external application (available from Erez et al. 2017
in the supporting file journal.pone.0181468.s001.csv)
and the files “vcf_with_pvalue.tab” and “LuxPark_pheno.txt” for the
internal application (available upon request to [email protected]).
For the internal application, this directory will also contain the
intermediate file “app_int_data.RData”.
The directory “results” will contain the files “sim_int.RData” and “sim_ext.RData” for the external and internal simulation, the file “app_grridge.RData” for the first external application, the file “app_fwelnet.RData” for the second external application, and the file “app_int.RData” for the internal application.
The directory “manuscript” will contain the files “table_int.tex” and “table_ext.tex” for the internal and external simulations, the file “figure_example.pdf” for the methods section, the file “figure_ext.pdf” for the external applications, and the file “figure_int.pdf” for the internal application.
rm(list=ls())
dir <- "~/Desktop/transreg" # physical machine
#dir <- "/home/armin.rauschenberger/transreg" # virtual machine
setwd(dir)
if(!all(c("data","results","manuscript") %in% dir())){
stop("Missing folders!")
}
Install missing R packages from CRAN and GitHub. Note that
ecpc
is and transreg
will also be available on
CRAN. For package versions, see session information at the end of this
document and in the text files associated with each R data file.
pkgs <- c("devtools","palasso","glmtrans","xtable","ecpc","xrnet")
utils::install.packages(setdiff(pkgs,rownames(installed.packages())))
remotes::install_github("markvdwiel/GRridge") # ref="ef706afe", version 1.7.5
remotes::install_github("kjytay/fwelnet") # ref="5515fd2e", version 0.1
remotes::install_github("LCSB-BDS/transreg") # ref="82757441", version 1.0.0
rm(pkgs)
The following chunk generates the figure for the methods section.
input: none (simulating data)
execution time: some seconds
output: manuscript/figure_example.pdf
#<<init>>
#grDevices::pdf(file=paste0(dir,"/manuscript/figure_example.pdf"),width=8,height=5,pointsize=15)
#grDevices::png(file=paste0(dir,"/manuscript/figure_example.png"),width=8,height=5,units="in",pointsize=15,res=1200)
grDevices::postscript(file=paste0(dir,"/manuscript/figure_example.eps"),width=8,height=5,pointsize=15,horizontal=FALSE,paper="special")
set.seed(1)
n <- 200; p <- 500
X <- matrix(stats::rnorm(n*p),nrow=n,ncol=p)
temp <- stats::rnorm(p)
range <- stats::qnorm(p=c(0.01,0.99))
temp[temp<range[1]] <- range[1]
temp[temp>range[2]] <- range[2]
beta <- list()
beta$ident <- temp
beta$sqrt <- sign(temp)*sqrt(abs(temp))
beta$quad <- sign(temp)*abs(temp)^2
beta$trunc <- ifelse(temp<=0,0,temp)
beta$step <- ifelse(temp<=1,0,1)
beta$combn <- ifelse(temp<0,sign(temp)*sqrt(abs(temp)),sign(temp)*abs(temp)^2)
graphics::par(mfrow=c(2,3),mar=c(3,3,0.5,0.5))
for(i in seq_along(beta)){
prior <- matrix(temp,ncol=1)
eta <- X %*% beta[[i]]
y <- stats::rnorm(n=n,mean=eta,sd=sd(eta))
a <- transreg:::.exp.multiple(y=y,X=X,prior=prior,family="gaussian",select=FALSE)
b <- transreg:::.iso.fast.single(y=y,X=X,prior=prior,family="gaussian")
graphics::plot.new()
graphics::plot.window(xlim=range(prior,-prior),ylim=range(a$beta,b$beta))
graphics::axis(side=1)
graphics::axis(side=2)
graphics::abline(h=0,lty=2,col="grey")
graphics::abline(v=0,lty=2,col="grey")
graphics::box()
graphics::title(xlab=expression(z),ylab=expression(gamma),line=2)
graphics::points(x=prior,y=a$beta,col="red",cex=0.7)
graphics::points(x=prior,y=b$beta,col="blue",cex=0.7)
graphics::lines(x=prior[order(prior)],y=beta[[i]][order(prior)],lwd=1.2)
graphics::legend(x="topleft",legend=paste0("(",i,")"),bty="n",x.intersp=0)
}
grDevices::dev.off()
The following chunk performs the external (glmtrans
) and
the internal simulation.
input: none (simulating data)
execution time: 2 + 1 = 3 hours
output: results/sim_ext.RData, results/sim_int.RData, results/info_sim.RData
#<<init>>
# - - - modify glmtrans::models function - - -
glmtrans.models <- glmtrans::models
string <- base::deparse(glmtrans.models)
# return target beta
string <- gsub(pattern="list\\(x \\= NULL, y \\= NULL\\)",
replacement="list(x = NULL, y = NULL, beta = wk)",
x=string)
# return source beta
string <- gsub(pattern="list\\(x \\= x, y \\= y\\)",
replacement="list(x = x, y = y, beta = wk)",
x=string)
glmtrans.models <- eval(parse(text=string))
rm(string)
# - - - - - - - - - - - - - - - - - - - - - -
for(mode in c("ext","int")){
# simulation setting
if(mode=="ext"){
frame <- expand.grid(seed=1:10,
Ka=as.integer(c(1,3,5)),
K=as.integer(5),
h=as.integer(c(5,250)),
alpha=as.integer(c(0,1)),
family=c("gaussian","binomial")
)
} else if(mode=="int"){
frame <- expand.grid(seed=1:10,
rho.x=c(0.95,0.99),
rho.beta=c(0.6,0.8,0.99),
alpha=as.integer(c(0,1)),
family=c("gaussian","binomial")
)
}
frame$family <- as.character(frame$family)
frame[,c("cor.x","cor.beta","mean","glmnet","glmtrans","transreg")] <- NA
n0 <- 100; n1 <- 10000
n.target <- n0+n1
foldid.ext <- rep(c(0,1),times=c(n0,n1))
for(iter in seq_len(nrow(frame))){
cat(paste0(mode,": ",iter,"/",nrow(frame),"\n"))
if(!is.na(frame$seed[iter])){set.seed(frame$seed[iter])}
# data simulation
if(mode=="ext"){
message("Using external simulation study!")
s <- ifelse(frame$alpha[iter]==0,50,15)
data <- glmtrans.models(family=frame$family[iter],type="all",
p=1000,n.target=n.target,s=s,
Ka=frame$Ka[iter],K=frame$K[iter],h=frame$h[iter])
target <- data$target
source <- data$source
beta <- cbind(sapply(data$source,function(x) x$beta),data$target$beta)
} else if(mode=="int"){
message("Using internal simulation study!")
prop <- ifelse(frame$alpha[iter]==0,0.3,0.05)
data <- transreg:::simulate(p=500,n.target=n.target,family=frame$family[iter],
prop=prop,rho.beta=frame$rho.beta[iter],w=0.8,
rho.x=frame$rho.x[iter],k=3,exp=c(1,2,0.5),
trans=c(FALSE,TRUE,TRUE))
target <- data$target
source <- data$source
beta <- data$beta
}
# correlation
temp <- abs(stats::cor(data$target$x,method="pearson"))
temp[lower.tri(temp,diag=TRUE)] <- NA
frame$cor.x[iter] <- mean(temp,na.rm=TRUE)
temp <- abs(stats::cor(beta,method="pearson"))
temp[lower.tri(temp,diag=TRUE)] <- NA
frame$cor.beta[iter] <- max(temp[,ncol(temp)],na.rm=TRUE)
# predictive performance
loss <- transreg:::compare(target=target,source=source,
family=frame$family[iter],alpha=frame$alpha[iter],
foldid.ext=foldid.ext,nfolds.ext=1,
scale=c("exp","iso"),
sign=FALSE,switch=FALSE,select=TRUE,alpha.prior=NULL,
seed=frame$seed[iter],xrnet=TRUE)
frame[iter,names(loss$deviance)] <- loss$deviance
}
save(frame,file=paste0(dir,"/results/sim_",mode,".RData"))
}
writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),
sessioninfo::session_info()),con=paste0(dir,"/results/info_sim.txt"))
The following chunk generates the tables for the external
(glmtrans
) and the internal simulation. Requires
execution of previous chunk.
input: results/sim_ext.RData, results/sim_int.RData
execution time: some seconds
output: manuscript/table_ext.tex, manuscript/table_int.tex
rm(list=ls())
dir <- "~/Desktop/transreg" # physical machine
#dir <- "/home/armin.rauschenberger/transreg" # virtual machine
setwd(dir)
if(!all(c("data","results","manuscript") %in% dir())){
stop("Missing folders!")
}
for(mode in c("ext","int")){
file <- paste0(dir,"/results/sim_",mode,".RData")
#if(mode=="int"){file <- "~/Desktop/transreg/results/sim_int_fast.RData"}
if(!file.exists(file)){warning("Missing file ",file,".");next}
load(file)
colnames(frame) <- gsub(pattern="transreg.",replacement="",x=colnames(frame))
info.var <- c("Ka","h","rho.x","rho.beta","alpha","family")
info <- frame[,colnames(frame) %in% info.var]
info.name <- gsub(pattern=" ",replacement="",x=apply(info,1,function(x) paste0(x,collapse=".")))
number <- as.numeric(factor(info.name,levels=unique(info.name)))
info <- unique(info)
info$cor.beta <- info$cor.x <- NA
loss.var <- c("glmnet","glmtrans","xrnet","exp.sta","exp.sim","iso.sta","iso.sim")
loss <- frame[,c(loss.var,"mean")]
# as percentage of empty model
loss <- 100*loss/loss$mean
# average over 10 different seeds
both <- mean <- sd <- p.better <- p.worse <- matrix(NA,nrow=max(number),ncol=length(loss.var),dimnames=list(1:max(number),loss.var))
for(i in 1:max(number)){
for(j in seq_along(loss.var)){
cond <- number==i
x <- loss$glmnet[cond]
y <- loss[[loss.var[j]]][cond]
p.better[i,j] <- stats::wilcox.test(x=x,y=y,paired=TRUE,alternative="greater",exact=FALSE)$p.value
p.worse[i,j] <- stats::wilcox.test(x=x,y=y,paired=TRUE,alternative="less",exact=FALSE)$p.value
mean[i,j] <- mean(y)
sd[i,j] <- sd(y)
info$cor.x[i] <- mean(frame$cor.x[cond])
info$cor.beta[i] <- mean(frame$cor.beta[cond])
}
}
front <- format(round(mean,digits=1),trim=TRUE)
front.nchar <- nchar(front)
back <- format(round(sd,digits=1),trim=TRUE)
back.nchar <- nchar(back)
both[] <- paste0(front,"$\\pm$",back)
grey <- mean>=mean[,"glmnet"]
both[grey] <- paste0("\\textcolor{gray}{",both[grey],"}")
min <- cbind(seq_len(max(number)),apply(mean,1,which.min))
both[min] <- paste0("\\underline{",both[min],"}")
star <- p.better<=0.05
both[star] <- paste0(both[star],"*")
#both[!star] <- paste0(both[!star],"\\phantom{*}")
dagger <- p.worse<=0.05
both[dagger] <- paste0(both[dagger],"$\\dagger$")
both[!star & !dagger] <- paste0(both[!star & !dagger],"\\phantom{*}")
both[front.nchar==3] <- paste0("$~$",both[front.nchar==3])
both[back.nchar==3] <- paste0(both[back.nchar==3],"$~$")
external <- "number of transferable source data sets ($K_a$), differences between source and target coefficients ($h$), dense setting with ridge regularisation ($s=50$, $\\alpha=0$) or sparse setting with lasso regularisation ($s=15$, $\\alpha=1$), family of distribution (`gaussian' or `binomial')."
internal <- "correlation parameter for features ($\\rho_x$), correlation parameter for coefficients ($\\rho_{\\beta}$), dense setting with ridge regularisation ($\\pi=30\\%$, $\\alpha=0$) or sparse setting with lasso regularisation ($\\pi=5\\%$, $\\alpha=1$), family of distribution (`gaussian' or `binomial')."
caption <- paste0("Predictive performance in ",mode,"ernal simulation. In each setting (row), we simulate $10$ data sets, calculate the performance metric (mean-squared error for numerical prediction, logistic deviance for binary classification) for the test sets, express these metrics as percentages of those from prediction by the mean, and show the mean and standard deviation of these percentages. Settings: ",ifelse(mode=="ext",external,ifelse(mode=="int",internal,NULL))," These parameters determine (i) the mean Pearson correlation among the features in the target data set ($\\bar{\\rho}_x$) and (ii) the maximum Pearson correlation between the coefficients in the target data set and the coefficients in the source data sets ($\\max(\\hat{\\rho}_{\\beta})$). Methods: regularised regression (\\texttt{glmnet}), competing transfer learning methods (\\texttt{glmtrans} , \\texttt{xrnet}), proposed transfer learning method (\\texttt{transreg}) with exponential/isotonic calibration and standard/simultaneous stacking. In each setting, the colour black (grey) highlights methods that are more (less) predictive than regularised regression without transfer learning (\\texttt{glmnet}), asterisks (daggers) indicate methods that are \\textit{significantly} more (less) predictive at the 5\\% level (one-sided Wilcoxon signed-rank test), and an underline highlights the most predictive method. \\label{sim_",mode,"}")
colnames(info) <- sapply(colnames(info),function(x) switch(x,"Ka"="$K_a$","K"="$K$","h"="$h$","alpha"="$\\alpha$","rho.x"="$\\rho_x$","rho.beta"="$\\rho_\\beta$","cor.x"="$\\bar{\\rho}_{x}$","cor.beta"="$\\max(\\hat{\\rho}_{\\beta})$","cor.t"="$\\bar{\\rho}_{a}$",x))
colnames(both) <- paste0("\\texttt{",colnames(both),"}")
align <- paste0("|r|",paste0(rep("c",times=ncol(info)),collapse=""),"|",paste0(rep("c",times=ncol(both)),collapse=""),"|")
add.to.row <- list()
add.to.row$pos <- list(-1)
add.to.row$command <- "\\multicolumn{9}{c}{~} & \\multicolumn{4}{|c|}{\\texttt{transreg}}\\\\"
xtable <- xtable::xtable(x=cbind(info,both),align=align,caption=caption)
xtable::print.xtable(x=xtable,
include.rownames=FALSE,
floating=TRUE,
sanitize.text.function=identity,
comment=FALSE,
caption.placement="top",
floating.environment="table*",
#size="\\small \\setlength{\\tabcolsep}{3pt}",
file=paste0(dir,"/manuscript/table_",mode,".tex"),
add.to.row=add.to.row,
hline.after=c(-1,0,nrow(xtable)))
}
The following chunk performs the first external application
(GRridge
).
input: none (script loads data - which are also available here
- from the R package GRridge
)
execution time: 2.5 hours
output: results/app_grridge.RData, results/info_app_grridge.txt
#<<init>>
data(dataVerlaat,package="GRridge")
target <- list()
target$y <- as.numeric(as.factor(respVerlaat))-1
target$x <- t(datcenVerlaat)
z <- -log10(pvalFarkas) # ecpc and fwelnet
prior <- sign(diffmeanFarkas)*(-log10(pvalFarkas)) # transreg
loss <- list()
for(i in 1:10){
cat("---",i,"---\n")
loss[[i]] <- transreg:::compare(target=target,prior=prior,z=as.matrix(z,ncol=1),
family="binomial",alpha=0,scale=c("exp","iso"),sign=FALSE,switch=FALSE,select=FALSE,type.measure=c("deviance","class","auc"),seed=i,xrnet=TRUE)
}
save(loss,file=paste0(dir,"/results/app_grridge.RData"))
writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),
sessioninfo::session_info()),con=paste0(dir,"/results/info_app_grridge.txt"))
load(paste0(dir,"/results/app_grridge.RData"),verbose=TRUE)
table <- as.data.frame(t(sapply(loss,function(x) x$deviance)))
table <- (table-table$glmnet)/table$glmnet
table <- table[,c("glmnet","transreg.exp.sta","transreg.exp.sim","transreg.iso.sta","transreg.iso.sim","fwelnet","ecpc","xrnet")]
sapply(table[,-1],function(x) sum(x<table$glmnet))
round(100*colMeans(table[,-1]),digits=2)
The following chunk performs the second external application
(fwelnet
).
input: data/pone.0181468.s001.csv (available from Erez et al. 2017 in the supporting file journal.pone.0181468.s001.csv)
execution time: 0.5 hours
output: results/app_fwelnet.RData, results/info_app_fwelnet.txt
#<<init>>
table <- utils::read.csv("data/pone.0181468.s001.csv",header=TRUE,skip=3)
extract <- function(cond,y,X,id){
if(length(unique(c(length(cond),length(y),nrow(X),length(id))))!=1){stop("Invalid input.")}
n <- table(id,cond)[,"TRUE"]
y <- y[cond]
X <- X[cond,]
id <- id[cond]
weights <- rep(1/n,times=n)
ids <- unique(id)
ys <- sapply(ids,function(x) unique(y[id==x]))
foldid <- rep(NA,length=length(ids))
foldid[ys==0] <- sample(rep(1:10,length.out=sum(ys==0)))
foldid[ys==1] <- sample(rep(1:10,length.out=sum(ys==1)))
foldid <- rep(foldid,times=n[n!=0])
if(length(unique(c(length(y),nrow(X),length(weights),length(foldid))))!=1){
stop("Invalid output.")
}
return(list(y=y,x=X,weights=weights,foldid=foldid))
}
loss <- ridge <- lasso <- list()
for(i in 1:10){
cat("---",i,"---\n")
set.seed(i)
y <- table$LatePE
X <- as.matrix(table[,grepl(pattern="SL",x=colnames(table))])
X <- scale(X)
min <- sapply(unique(table$ID),function(i) min(table$GA[table$ID==i]))
max <- sapply(unique(table$ID),function(i) max(table$GA[table$ID==i]))
limit <- 20
group <- stats::rbinom(n=max(table$ID),size=1,prob=0.5)
source.id <- which(group==0 | min > limit)
target.id <- which(group==1 & min <= limit)
if(any(!table$ID %in% c(source.id,target.id))){stop()}
if(any(!c(source.id,target.id) %in% table$ID)){stop()}
if(any(duplicated(c(source.id,target.id)))){stop()}
source <- list()
source[[1]] <- extract(cond=(table$ID %in% source.id) & (table$GA<=limit),y=y,X=X,id=table$ID)
source[[2]] <- extract(cond=(table$ID %in% source.id),y=y,X=X,id=table$ID)
prior <- z <- matrix(NA,nrow=ncol(X),ncol=length(source))
for(j in seq_along(source)){
base <- glmnet::cv.glmnet(y=source[[j]]$y,x=source[[j]]$x,
family="binomial",alpha=0,
weights=source[[j]]$weights,
foldid=source[[j]]$foldid)
prior[,j] <- coef(base,s="lambda.min")[-1]
z[,j] <- abs(coef(base,s="lambda.min")[-1])
}
target <- list()
target$y <- sapply(target.id,function(i) unique(y[table$ID==i]))
target$x <- t(sapply(target.id,function(i) X[table$ID==i & table$GA==min(table$GA[table$ID==i]),]))
loss[[i]] <- transreg:::compare(target=target,prior=prior,family="binomial",alpha=0,scale=c("exp","iso"),sign=FALSE,switch=FALSE,select=FALSE,z=z,type.measure=c("deviance","class","auc"),seed=i,xrnet=TRUE)
}
save(loss,file=paste0(dir,"/results/app_fwelnet.RData"))
writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),
sessioninfo::session_info()),con=paste0(dir,"/results/info_app_fwelnet.txt"))
load(paste0(dir,"/results/app_fwelnet.RData"))
table <- as.data.frame(t(sapply(loss,function(x) x$deviance)))
table <- (table-table$glmnet)/table$glmnet
table <- table[,c("glmnet","transreg.exp.sta","transreg.exp.sim","transreg.iso.sta","transreg.iso.sim","fwelnet","ecpc","xrnet")]
sapply(table[,-1],function(x) sum(x<table$glmnet))
round(100*colMeans(table[,-1]),digits=2)
The following chunk generates the figure for both external
applications (GRridge
and fwelnet
).
Requires execution of previous two chunks.
input: results/app_grridge.RData, results/app_fwelnet.RData
execution time: some seconds
output: manuscript/figure_ext.pdf
#grDevices::pdf(file=paste0(dir,"/manuscript/figure_ext.pdf"),width=8,height=6,pointsize=15)
#grDevices::png(file=paste0(dir,"/manuscript/figure_ext.png"),width=8,height=6,units="in",pointsize=15,res=1200)
grDevices::postscript(file=paste0(dir,"/manuscript/figure_ext.eps"),width=8,height=6,pointsize=15,horizontal=FALSE,paper="special")
graphics::par(mfrow=c(2,1),mar=c(2.5,2.0,0.5,0.5))
for(k in c("grridge","fwelnet")){
file <- paste0(dir,"/results/app_",k,".RData")
if(!file.exists(file)){plot.new();next}
load(file)
loss <- as.data.frame(t(sapply(loss,function(x) x$deviance)))
colnames(loss) <- gsub(pattern="transreg.",replacement="",x=colnames(loss))
loss <- 100*(loss-loss$glmnet)/loss$glmnet
temp <- c("exp.sta","exp.sim","iso.sta","iso.sim")
name <- c("fwelnet","ecpc","xrnet",temp)
graphics::plot.new()
graphics::plot.window(xlim=c(0.5,length(name)+0.5),ylim=range(loss,na.rm=TRUE))
graphics::abline(h=0,lty=2,col="grey")
#graphics::axis(side=1,at=seq_along(name),labels=name,cex.axis=0.7) # original
cond <- grepl(pattern="\\.",x=name)
graphics::axis(side=1,at=seq_along(name),labels=rep("",times=length(name)))
graphics::mtext(side=1,at=seq_along(name)[!cond],text=name[!cond],cex=0.7,line=1)
graphics::mtext(side=1,at=seq_along(name)[cond],text=name[cond],cex=0.7,line=0.25)
graphics::mtext(side=1,at=mean(seq_along(name)[cond]),text="transreg",cex=0.7,line=1)
if(grepl(pattern="grridge",x=k)){at <- seq(from=-10,to=10,by=5)}
if(grepl(pattern="fwelnet",x=k)){at <- seq(from=-20,to=20,by=10)}
labels <- ifelse(at==0,"0%",ifelse(at<0,paste0(at,"%"),paste0("+",at,"%")))
graphics::axis(side=2,cex.axis=0.7,at=at,labels=labels)
#graphics::title(ylab="change in metric",line=2.5,cex.lab=0.7)
graphics::box()
for(i in seq_along(name)){
palasso:::.boxplot(loss[,name[i]],at=i,invert=FALSE)
graphics::points(x=i,y=mean(loss[,name[i]]),pch=22,col="white",bg="black",cex=0.7)
}
}
grDevices::dev.off()
The following chunk performs the internal application.
input: data/vcf_with_pvalue.tab, data/LuxPark_pheno.txt (available upon request to [email protected])
execution time: 5.5 hours (parallel computing with 8 cores)
output: data/app_int_data.RData, results/app_int.RData, results/info_app_int.txt
#<<init>>
geno <- read.table(paste0(dir,"/data/vcf_with_pvalue.tab"),header=TRUE)
switch <- ifelse(geno$REF==geno$A1_gwas & geno$ALT==geno$A2_gwas,1,
ifelse(geno$REF==geno$A2_gwas & geno$ALT==geno$A1_gwas,-1,0))
#prior <- -geno$beta*switch # original effect sizes
prior <- log10(geno$p_value)*sign(geno$beta)*switch # pseudo effect sizes
pvalue <- geno$p_value
# Note: Why are pseudo-effect sizes more suitable as prior effects as compared to original effect sizes?
# graphics::plot(x=geno$beta,y=-log10(geno$p_value),xlim=c(-1,1),col=ifelse(stats::p.adjust(geno$p_value)<=0.05,"red","black"))
X <- geno[,grepl(pattern="ND",colnames(geno))]
X[X=="0/0"] <- 0
X[X=="0/1"] <- 1
X[X=="1/1"] <- 1
X <- sapply(X,as.numeric)
X <- t(X)
pheno <- read.delim(paste0(dir,"/data/LuxPark_pheno.txt"),sep=" ",header=FALSE)
y <- ifelse(pheno$V2==1,0,ifelse(pheno$V2==2,1,NA)); names(y) <- pheno$V1
names <- intersect(names(y[!is.na(y)]),rownames(X))
X <- X[names,]; y <- y[names]
# Note: Are prior effects positively correlated with correlation between outcome and features?
# cor <- as.numeric(stats::cor(y,X,method="spearman"))
# graphics::plot(x=prior,y=cor,col=ifelse(stats::p.adjust(geno$p_value)<=0.05,"red","black"))
# graphics::abline(h=0,lty=2,col="grey")
# descriptive statistics
sum(p.adjust(pvalue,method="BH")<=0.05)
sum(p.adjust(pvalue,method="holm")<=0.05)
mean(pvalue<=0.05)
dim(X)
table(y)
# memory reduction
cond <- pvalue <= 0.05
X <- X[,cond]
prior <- prior[cond]
pvalue <- pvalue[cond]
switch <- switch[cond]
save(y,X,prior,pvalue,switch,file=paste0(dir,"/data/app_int_data.RData"))
load(paste0(dir,"/data/app_int_data.RData"))
power <- seq(from=-2,to=-10,by=-1)
cutoff <- 5*10^power
frame <- expand.grid(cutoff=cutoff,alpha=0:1,seed=1:10,count=NA)
#- - - sequential (start) - - -
#loss <- list()
#for(i in seq_len(nrow(frame))){
# cat("--- i =",i,"---","\n")
# set.seed(frame$seed[i])
# foldid <- transreg:::.folds(y=y,nfolds.ext=10,nfolds.int=10)
# cond <- switch!=0 & pvalue < frame$cutoff[i]
# loss[[i]] <- transreg:::compare(target=list(y=y,x=X[,cond]),prior=prior[cond],family="binomial",alpha=frame$alpha[i],scale=c("exp","iso"),sign=FALSE,switch=FALSE,select=FALSE,foldid.ext=foldid$foldid.ext,foldid.int=foldid$foldid.int,type.measure=c("deviance","class","auc"),seed=frame$seed[i])
# frame$count[i] <- sum(cond)
#}
# - - - sequential (end) - - -
#- - - parallel (start) - - -
frame <- expand.grid(cutoff=cutoff,alpha=0:1,seed=1:10,count=NA)
cluster <- snow::makeCluster(8)
evaluate <- function(frame,i,switch,pvalue,y,X,prior){
set.seed(frame$seed[i])
foldid <- transreg:::.folds(y=y,nfolds.ext=10,nfolds.int=10)
cond <- switch!=0 & pvalue < frame$cutoff[i]
transreg:::compare(target=list(y=y,x=X[,cond]),prior=prior[cond],family="binomial",alpha=frame$alpha[i],scale=c("exp","iso"),sign=FALSE,switch=FALSE,select=FALSE,foldid.ext=foldid$foldid.ext,foldid.int=foldid$foldid.int,type.measure=c("deviance","class","auc"),seed=frame$seed[i])
}
snow::clusterExport(cl=cluster,list=c("evaluate","frame","switch","pvalue","y","X","prior"))
loss <- snow::parSapply(cl=cluster,X=seq_len(nrow(frame)),FUN=function(i) evaluate(frame=frame,i=i,switch=switch,pvalue=pvalue,y=y,X=X,prior=prior),simplify=FALSE)
#- - - parallel (end) - - -
save(frame,loss,file=paste0(dir,"/results/app_int.RData"))
writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),
sessioninfo::session_info()),con=paste0(dir,"/results/info_app_int.txt"))
The following chunk generates the figures for the internal application. Requires execution of previous chunk.
input: results/app_int.RData
execution time: some seconds
output: manuscript/figure_int.pdf, manuscript/figure_temp.pdf
#<<init>>
plotter <- function(table,cutoff,number,horizontal=FALSE){
graphics::par(mfrow=c(2,2),mar=c(3,1.8,1.0,0.9))
for(scale in c("exp","iso")){
for(alpha in c("0","1")){
graphics::plot.new()
graphics::plot.window(xlim=range(log(cutoff)),ylim=range(table))
graphics::box()
graphics::title(main=paste(ifelse(alpha==0,"ridge",ifelse(alpha==1,"lasso",NA)),"-",scale),cex.main=1,line=0.2)
on <- rep(c(TRUE,FALSE),length.out=length(cutoff))
graphics::axis(side=1,at=log(cutoff),labels=rep("",times=length(on)),cex.axis=0.7)
graphics::axis(side=1,at=log(cutoff)[on],labels=paste0(cutoff[on],"\n","(",number[on],")"),cex.axis=0.7)
graphics::axis(side=2,cex.axis=0.7)
if(horizontal){
graphics::abline(h=0.5,col="grey",lty=2)
#graphics::abline(h=unique(table[["mean"]][,alpha]),col="grey",lty=2)
}
for(i in 1:3){
for(method in c("glmnet",paste0("transreg.",scale,c(".sta",".sim")),"naive")){
lty <- switch(method,"mean"=1,"glmnet"=1,"transreg.exp.sta"=2,"transreg.exp.sim"=2,"transreg.iso.sta"=2,"transreg.iso.sim"=2,"naive"=3)
col <- switch(method,"mean"="grey","glmnet"="black","transreg.exp.sta"=rgb(0.2,0.2,1),"transreg.iso.sta"=rgb(0.2,0.2,1),"transreg.exp.sim"=rgb(0,0,0.6),"transreg.iso.sim"=rgb(0,0,0.6),"naive"="red")
y <- table[[method]][,alpha]
x <- log(as.numeric(names(y)))
if(i==1){graphics::lines(x=x,y=y,col=col,lty=lty)}
if(i==2){graphics::points(x=x,y=y,col="white",pch=16)}
if(i==3){graphics::points(x=x,y=y,col=col)}
}
}
}
}
}
load(paste0(dir,"/data/app_int_data.RData"))
load(paste0(dir,"/results/app_int.RData"))
frame <- frame[seq_along(loss),colnames(frame)!="seed"]
cutoff <- unique(frame$cutoff)
number <- unique(sapply(loss,function(x) x$p))
auc <- as.data.frame(t(sapply(loss,function(x) x$auc)))
table <- lapply(auc,function(x) tapply(X=x,INDEX=list(frame$cutoff,frame$alpha),FUN=function(x) mean(x)))
#grDevices::pdf(file=paste0(dir,"/manuscript/figure_int.pdf"),width=8,height=6,pointsize=15)
#grDevices::png(file=paste0(dir,"/manuscript/figure_int.png"),width=8,height=6,units="in",pointsize=15,res=1200)
grDevices::postscript(file=paste0(dir,"/manuscript/figure_int.eps"),width=8,height=6,pointsize=15,horizontal=FALSE,paper="special")
plotter(table,cutoff,number,horizontal=TRUE)
grDevices::dev.off()
This chunk obtains the upper 95% confidence interval of a random classifier with 766 controls and 790 cases (result: 0.524).
input: -
execution time: some seconds
output: not saved
#--- empirical computation of confidence interval ---
set.seed(1)
auc <- list()
n <- c("small","large")
for(i in seq_along(n)){
auc[[i]] <- numeric()
for(j in 1:10000){
if(n[i]=="small"){
y <- rep(c(0,1),times=c(50,50))
}
if(n[i]=="large"){
y <- rep(c(0,1),times=c(766,790))
}
x <- stats::runif(n=length(y),min=0,max=1)
auc[[i]][j] <- pROC::auc(response=y,predictor=x,direction="<",levels=c(0,1))
}
}
q <- sapply(auc,function(x) quantile(x,probs=0.95))
graphics::par(mar=c(3.5,3.5,1,1))
graphics::plot.new()
graphics::plot.window(xlim=c(0.5,length(n)+0.5),ylim=range(auc))
graphics::box()
graphics::axis(side=1,at=seq_along(n),labels=n)
graphics::axis(side=2)
for(i in seq_along(n)){
graphics::boxplot(x=auc[[i]],at=i,add=TRUE)
}
graphics::abline(h=0.5)
graphics::title(xlab="sample size",ylab="AUC",line=2.5)
graphics::segments(x0=seq_along(n)-0.2,x1=seq_along(n)+0.2,y0=q,col="red",lwd=2)
graphics::text(x=seq_along(n)-0.2,y=q,labels=round(q,digits=3),pos=2,cex=0.5,col="red")
q
#--- analytical calculation of confidence interval ---
# This code is based on the website "Real Statistics using Excel" from Charles Zaiontz, https://real-statistics.com/descriptive-statistics/roc-curve-classification-table/auc-confidence-interval/).
var_AUC <- function(x,n1,n2) {
q1 = x/(2-x)
q2 = 2*x^2/(1+x)
var = (x*(1-x) +(n1-1)*(q1-x^2) +(n2-1)*(q2-x^2))/(n1*n2)
}
round(0.5 + stats::qnorm(p=0.95)*sqrt(var_AUC(0.5,n1=50,n2=50)),digits=3)
round(0.5 + stats::qnorm(p=0.95)*sqrt(var_AUC(0.5,n1=766,n2=790)),digits=3)
Additional code for further research (not included in the manuscript): [see source].
#--- This code chunk is not included in the manuscript! ---
# The following chunk performs the additional simulation study with either linearly or non-linearly related prior effects for the comparison of transreg and xrnet.
set.seed(1)
temp <- matrix(data=NA,nrow=10,ncol=2,dimnames=list(NULL,c("transreg","xrnet")))
mse <- list(linear=temp,nonlinear=temp)
for(i in c("linear","nonlinear")){
for(j in 1:10){
# simulate data
n0 <- 100; n1 <- 10000; n <- n0 + n1; p <- 2000
X <- matrix(stats::rnorm(n*p),nrow=n,ncol=p)
beta <- stats::rnorm(n=p)*stats::rbinom(n=p,size=1,prob=0.05)
y <- stats::rnorm(n=n,mean=X %*% beta,sd=1) #sd(X %*% beta)
# relation between prior effects and true effects
temp <- beta + stats::rnorm(n=p)*stats::rbinom(n=p,size=1,prob=0.01)
# temp <- beta + stats::rnorm(p,sd=0.1) # for comparison
if(i=="linear"){
prior <- temp
} else if(i=="nonlinear"){
prior <- sign(temp)*abs(temp)^2
}
# hold-out
y_hat <- list()
foldid <- rep(c(0,1),times=c(n0,n1))
# transfer learning with transreg
model <- transreg::transreg(y=y[foldid==0],X=X[foldid==0,],prior=prior)
y_hat$transreg <- predict(model,newx=X[foldid==1,])
# transfer learning with xrnet
model <- xrnet::tune_xrnet(x=X[foldid==0,],y=y[foldid==0],external=as.matrix(prior,ncol=1))
y_hat$xrnet <- stats::predict(model,newdata=X[foldid==1,])
# mean squared error (MSE)
mse[[i]][j,] <- sapply(y_hat,function(x) mean((x-y[foldid==1])^2))
}
}
# linear scenario
sum(mse$linear[,"xrnet"] < mse$linear[,"transreg"])
stats::wilcox.test(x=mse$linear[,"xrnet"],y=mse$linear[,"transreg"],paired=TRUE)$p.value
# non-linear scenario
sum(mse$nonlinear[,"transreg"] < mse$nonlinear[,"xrnet"])
stats::wilcox.test(x=mse$nonlinear[,"transreg"],y=mse$nonlinear[,"xrnet"],paired=TRUE)$p.value
Reformat list of consortium members for acknowledgements: [see source].
Print session information.