Data
We considered the data accompanying the paper by Sakellaropoulos et al., made available at this link:
- batch-normalized GDSC/bortezomib dataset and clinical response data
Sample information for the clinical data was provided as supplementary material.
Setup
Firstly, we load renoir and the other needed packages:
library(renoir)
#> Warning: package 'Matrix' was built under R version 4.2.3
library(plotly)
library(htmltools)
Unsupervised Screening
Given the high-dimensional data, we want to apply an initial pre-processing step that subsets the feature space. A list of supported methods is available through the ?list_supported_unsupervised_screening_methods function call.
#list methods
u.screening.methods = list_supported_unsupervised_screening_methods()
#print in table
knitr::kable(x = u.screening.methods)
id | description |
---|---|
na | filter by NA presence in the data (remove if NAs > threshold) |
intensity | filter by features intensity (keep if value > median across observations) |
variability | filter by features variability (select top variable features) |
We decided to reduce the dimensionality by selecting the top variable features.
#method id
u.screening.method.id = "variability"
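Conceptually, the variability filter ranks the features by a variability measure and keeps the top fraction. A minimal sketch of the idea in plain R (illustrative only, not renoir's implementation; the helper name keep_top_variable is made up for this example):
#rank features (columns) by a variability measure and keep the top fraction
keep_top_variable = function(x, prop = 0.25, fun = stats::mad){
v = apply(x, 2, fun) #variability per feature
x[, v >= stats::quantile(v, probs = 1 - prop), drop = FALSE]
}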
Learning Methods
Now we retrieve the ids of all the supported learners. A list of supported methods is available through the ?list_supported_learning_methods function call.
#list methods
learning.methods = list_supported_learning_methods()
#print in table
knitr::kable(x = learning.methods)
id | method | default_hyperparameters |
---|---|---|
lasso | generalized linear model via L1 penalized maximum likelihood (lasso penalty) | lambda |
ridge | generalized linear model via L2 penalized maximum likelihood (ridge penalty) | lambda |
elasticnet | generalized linear model via L1/L2 penalized maximum likelihood (elasticnet penalty) | lambda, …. |
relaxed_lasso | generalized linear model via L1 penalized maximum likelihood (relaxed lasso penalty) | lambda, …. |
relaxed_ridge | generalized linear model via L2 penalized maximum likelihood (relaxed ridge penalty) | lambda, …. |
relaxed_elasticnet | generalized linear model via L1/L2 penalized maximum likelihood (relaxed elasticnet penalty) | lambda, …. |
randomForest | random forest | ntree |
gbm | generalized boosted model | eta, ntree |
linear_SVM | linear support vector machine | cost |
polynomial_SVM | polynomial support vector machine | cost, ga…. |
radial_SVM | radial support vector machine | cost, gamma |
sigmoid_SVM | sigmoid support vector machine | cost, gamma |
linear_NuSVM | linear nu-type support vector machine | nu |
polynomial_NuSVM | polynomial nu-type support vector machine | nu, gamm…. |
radial_NuSVM | radial nu-type support vector machine | nu, gamma |
sigmoid_NuSVM | sigmoid nu-type support vector machine | nu, gamma |
gknn | generalized k-nearest neighbours model | k |
nsc | nearest shrunken centroid model | threshold |
We can extract the ids by selecting the id column. For this use case, we decided to focus on a subset of the methods.
#learning method ids: lasso, ridge, elasticnet, randomForest
learning.methods.ids = learning.methods$id[c(1,2,3,7)]
From the table, we can also see the default hyperparameters of each method. Here we define our own values and a function that dispatches them depending on the id in input:
#get hyperparameters
get_hp = function(id, y){
#Generalised Linear Models with Penalisation
lambda = 10^seq(3, -2, length=100)
# alpha = seq(0.1, 0.9, length = 9)
# alpha = seq(0.1, 0.9, length = 5)
alpha = seq(0.05, 0.95, by = 0.05)^2
gamma = c(0, 0.25, 0.5, 0.75, 1)
#Random Forest
ntree = c(100, 500, 1000)
#Generalised Boosted Regression Modelling
eta = c(0.3, 0.1, 0.01, 0.001)
#Support Vector Machines
cost = 2^seq(from = -5, to = 15, length.out = 5)
svm.gamma = 2^seq(from = -15, to = 3, length.out = 4)
degree = seq(from = 1, to = 3, length.out = 3)
#Note that for classification nu must be
#nu * length(y)/2 <= min(table(y)). So instead of
#fixing it as
# nu = seq(from = 0.1, to = 0.6, length.out = 5)
#we do
nu.to = floor((min(table(y)) * 2/length(y)) * 10) / 10
nu = seq(from = 0.1, to = nu.to, length.out = 5)
#kNN
k = seq(from = 1, to = 9, length.out = 5)
#Nearest Shrunken Centroid
threshold = seq(0, 2, length.out = 30)
#hyperparameters
out = switch(
id,
'lasso' = list(lambda = lambda),
'ridge' = list(lambda = lambda),
'elasticnet' = list(lambda = lambda, alpha = alpha),
'relaxed_lasso' = list(lambda = lambda, gamma = gamma),
'relaxed_ridge' = list(lambda = lambda, gamma = gamma),
'relaxed_elasticnet' = list(lambda = lambda, gamma = gamma, alpha = alpha),
'randomForest' = list(ntree = ntree),
'gbm' = list(eta = eta, ntree = ntree),
'linear_SVM' = list(cost = cost),
'polynomial_SVM' = list(cost = cost, gamma = svm.gamma, degree = degree),
'radial_SVM' = list(cost = cost, gamma = svm.gamma),
'sigmoid_SVM' = list(cost = cost, gamma = svm.gamma),
'linear_NuSVM' = list(nu = nu),
'polynomial_NuSVM' = list(nu = nu, gamma = svm.gamma, degree = degree),
'radial_NuSVM' = list(nu = nu, gamma = svm.gamma),
'sigmoid_NuSVM' = list(nu = nu, gamma = svm.gamma),
'gknn' = list(k = k),
'nsc' = list(threshold = threshold)
)
return(out)
}
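As a quick sanity check (not part of the original analysis), we can inspect the grid returned for one method once the response y has been loaded in the Analysis section below:
#example: the elastic net grid contains 100 lambda values and 19 alpha values
str(get_hp(id = "elasticnet", y = y))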
Performance metrics
A list of supported scoring metrics is available through the ?list_supported_performance_metrics function call.
#list metrics
performance.metrics = list_supported_performance_metrics(resp.type = "gaussian")
#print in table
knitr::kable(x = performance.metrics)
id | name | problem |
---|---|---|
mae | Mean Absolute Error | regression |
mape | Mean Absolute Percentage Error | regression |
mse | Mean-squared Error | regression |
rmse | Root-mean-square Error | regression |
msle | Mean-squared Logarithmic Error | regression |
r2 | R2 | regression |
For this benchmark we select the mean absolute error and the mean squared error: the mse is used for tuning, and both metrics are used for evaluation.
#metric for tuning
performance.metric.id.tuning = "mse"
#metrics for evaluation
performance.metric.ids.evaluation = c("mae", "mse")
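For reference, the two selected metrics correspond to the usual definitions sketched below in plain R (renoir computes them internally; this is only to make the choice explicit):
#reference definitions (illustrative), for an observed response y and predictions y.hat
mae = function(y, y.hat) mean(abs(y - y.hat))
mse = function(y, y.hat) mean((y - y.hat)^2)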
Sampling Methods
A list of supported sampling methods is available through the ?list_supported_sampling_methods function call.
#list methods
sampling.methods = list_supported_sampling_methods()
#print in table
knitr::kable(x = sampling.methods)
id | name | supported |
---|---|---|
random | random sampling without replacement | stratification, balance |
bootstrap | random sampling with replacement | stratification, balance |
cv | cross-validation | stratification |
We selected the common scenario of k-fold cross-validation for the tuning of hyperparameters, and repeated random sampling for the evaluation of the methodology.
#sampling for tuning
sampling.method.id.tuning = "cv"
#sampling for evaluation
sampling.method.id.evaluation = "random"
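As a rough illustration of the two schemes (this is not how renoir implements its Sampler), k-fold cross-validation partitions the observations into k non-overlapping folds, while repeated random sampling draws independent train/test splits without replacement:
#illustrative sketch only
n = 20
#5-fold CV: assign each observation to one of 5 folds
folds = sample(rep(seq_len(5), length.out = n))
#one repetition of random sampling without replacement (e.g. 80% used for training)
train.id = sample(x = n, size = round(0.8 * n))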
We can see that we are in a high-dimensional setting, as the number of variables is much larger than the number of observations. To reduce the noise associated with this setting (and the computational time), we can apply an initial unsupervised screening step. Moreover, we can also take advantage of a supervised screening step during training.
Analysis
We can now run our analyses.
Bortezomib
Load data
#path
fp = file.path("..", "..", "data-raw", "use_case", "cell_lines", "data")
#load data
x = readRDS(file = file.path(fp, "bortezomib_cells.rds"))
#process
y = stats::setNames(object = x$Bortezomib, nm = rownames(x))
x = as.matrix(x[,-1])
#remove data without response
y = y[!is.na(y)]
x = x[names(y),]
#inspect predictors matrix
cat(paste("Number of features:", ncol(x)), sep = "\n")
cat(paste("Number of observations:", nrow(x)), sep = "\n")
We need to set the response type.
#set response type
resp.type = "gaussian"
Setup
Unsupervised Screener
Firstly, we want to setup a filter to reduce the dimensionality of our problem.
top.genes = c(0.2, 0.8, 0.9, 1)[2] #candidate proportions of top variable features; we use 0.8
#filter
uscreener = Filter(
#id of the method
id = u.screening.method.id,
#fix some arguments
parameters = list(
threshold = top.genes, #proportion threshold used by the variability filter
method = "mad" #use the median absolute deviation (MAD) to measure variability
),
#logger
logger = Logger(verbose = T, level = "INFO")
)
We could provide the Filter object to the ?renoir function call, or we can perform the unsupervised screening separately by using the ?filter function.
#unsupervised screening
filtered = renoir::filter(
filter = uscreener,
x = x
)
#get x
x.sub = filtered@filtered
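A quick check (not in the original code) of the effect of the filter:
#compare dimensions before and after the variability filter
dim(x) #full feature space
dim(x.sub) #retained features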
Tuner
Now we can create a tuner performing a grid.search via 5-fold cross-validation.
#tuner
tuner = Tuner(
id = "grid.search",
#Sampling strategy
sampler = Sampler(
method = sampling.method.id.tuning,
k = 5L,
n = integer()
# ,strata = y
),
#Supervised screener (not used in this analysis; see screening = NULL in the renoir call below)
# screener = ScreenerList(screener),
#Use parallel
looper = Looper(cores = 1L),
#Logger
logger = Logger(verbose = T, level = "INFO")
# logger = Logger(verbose = T, level = "DEBUG")
)
Learner
We can now create the related ?Learner objects.
#container
learners = list()
#loop
for(learning.method.id in learning.methods.ids){
#manual setup
learners[[learning.method.id]] = Learner(
tuner = tuner,
trainer = Trainer(id = learning.method.id),
forecaster = Forecaster(id = learning.method.id),
scorer = ScorerList(Scorer(id = performance.metric.id.tuning)),
selector = Selector(id = learning.method.id),
recorder = Recorder(id = learning.method.id, logger = Logger(level = "ALL", verbose = T)),
marker = Marker(id = learning.method.id, logger = Logger(level = "ALL", verbose = T)),
logger = Logger(level = "ALL")
)
}
Evaluator
Finally, we need to set up the ?Evaluator.
#Evaluator
evaluator = Evaluator(
#Sampling strategy: random sampling without replacement
sampler = Sampler(
method = "random",
k = 10L,
# strata = y,
N = as.integer(length(y))
),
#Performance metric
scorer = ScorerList(
Scorer(id = performance.metric.ids.evaluation[1]),
Scorer(id = performance.metric.ids.evaluation[2])
# Scorer(id = performance.metric.ids.evaluation[3])
)
)
Analysis with Pre-processing
Let’s create a directory to store the results.
#define path
outdir = file.path("E:", "Personal", "My Software", "R packages", "renoir", "data-raw", "use_case", "cell_lines", "analysis", "bortezomib", paste0("top",top.genes*100))
outdir = file.path("..", "..", "data-raw", "use_case", "cell_lines", "analysis", "bortezomib", paste0("top",top.genes*100))
#create if not existing
if(!dir.exists(outdir)){dir.create(path = outdir, showWarnings = F, recursive = T)}
Now we can run the analysis.
Before running it, we want to set a seed for the random number generator (RNG). Different R sessions have different seeds, created by default from the current time and process ID, and consequently produce different simulation results. By fixing the seed we ensure that the results can be reproduced. We can specify a seed by calling ?set.seed.
In the code below, we set a seed before running the analysis for each considered learning method.
#container list
resl = list()
#loop
# learning.method.id = learning.methods.ids[2]
for(learning.method.id in learning.methods.ids){
#Each analysis can take hours, so we save data
#for future faster load
#path to file
fp.obj = file.path(outdir, paste0(learning.method.id,".rds"))
fp.sum = file.path(outdir, paste0("st_",learning.method.id,".rds"))
#check if exists
if(file.exists(fp.sum)){
#load
cat(paste("Reading", learning.method.id, "..."), sep = "")
resl[[learning.method.id]] = readRDS(file = fp.sum)
cat("DONE", sep = "\n")
} else {
cat(paste("Learning method:", learning.method.id), sep = "\n")
#Set a seed for RNG
set.seed(
#A seed
seed = 5381L, #a randomly chosen integer value
#The kind of RNG to use
kind = "Mersenne-Twister", #we make explicit the current R default value
#The kind of Normal generation
normal.kind = "Inversion" #we make explicit the current R default value
)
resl[[learning.method.id]] = renoir(
#Unsupervised screening
# filter = filter,
#Training set size
npoints = 3,
# ngrid,
nmin = round(nrow(x)/2),
#Loop
looper = Looper(),
#Store
filename = "renoir",
outdir = file.path(outdir, learning.method.id),
restore = TRUE,
#Learn
learner = learners[[learning.method.id]],
#Evaluate
evaluator = evaluator,
#Log
logger = Logger(
path = file.path(outdir, learning.method.id, "log.txt"),
level = "ALL",
verbose = T),
#Data for training
hyperparameters = get_hp(id = learning.method.id, y = y),
# x = x,
x = x.sub,
y = y,
weights = NULL,
offset = NULL,
resp.type = resp.type,
#space
rm.call = T,
rm.fit = T,
#Group results
grouping = TRUE,
#No screening
# screening = c(500, 1000, 2358),
screening = NULL,
#Remove call from trainer to reduce space
keep.call = F
)
#save
saveRDS(object = resl[[learning.method.id]], file = fp.obj)
#create summary table
resl[[learning.method.id]] = renoir:::summary_table.RenoirList(resl[[learning.method.id]], key = c("id", "config"))
#save summary table
saveRDS(object = resl[[learning.method.id]], file = fp.sum)
cat("\n\n", sep = "\n")
}
}
#create summary table
resl = do.call(what = rbind, args = c(resl, make.row.names = F, stringsAsFactors = F))
Performance
Let’s now plot the performance metrics for the opt and 1se configurations, considering the test set.
Mean-squared error
We consider the mean-squared error (mse).
Test
This is the mse score for the opt configuration when considering the test set.
#plot
renoir:::plot.RenoirSummaryTable(
x = resl[resl$config == "opt",,drop=F],
measure = "mse",
set = "test",
interactive = T,
add.boxplot = F,
add.scores = F,
add.best = F,
key = c("id", "config")
)
This is the mse score for the 1se configuration when considering the test set.
#plot
renoir:::plot.RenoirSummaryTable(
x = resl[resl$config == "1se",,drop=F], #select 1se config
measure = "mse",
set = "test",
interactive = T,
add.boxplot = F,
add.scores = F,
add.best = F,
key = c("id", "config")
)
Plot for paper
#set colors
col.lasso = "rgba(255, 127, 14, 1)"
col.ridge = "rgba(140, 86, 75, 1)"
col.elnet = "rgba( 23, 190, 207, 1)"
col.rndfr = "rgba(214, 39, 40, 1)"
col.list = list(lasso = col.lasso, ridge = col.ridge, elasticnet = col.elnet, randomForest = col.rndfr)
#list to store data
outl = list()
#lasso
learning.method.id = 'lasso'
for(set in c("train", "test", "full")){
title = switch(
set,
"train" = "Train set",
"test" = "Test set",
"full" = "Full set"
)
#create and store
outl[[set]][[learning.method.id]] = renoir:::plot.RenoirSummaryTable(
x = resl[resl$config == "opt" & resl$learning == learning.method.id,,drop=F],
measure = "mse",
name.ms = paste0("mean score\n", learning.method.id),
set = "train",
interactive = T,
add.boxplot = T,
add.scores = T,
add.best = T,
key = c("id", "config"),
colour = col.list[[learning.method.id]]
) %>% layout(yaxis = list(title = title))
}
for(set in c("train", "test", "full")){
title = switch(
set,
"train" = "Train set",
"test" = "Test set",
"full" = "Full set"
)
for(learning.method.id in learning.methods.ids[2:4]){
#create
#store
outl[[set]][[learning.method.id]] = renoir:::plot.RenoirSummaryTable(
x = resl[resl$config == "opt" & resl$learning == learning.method.id,,drop=F],
measure = "mse",
name.ms = paste0("mean score\n", learning.method.id),
set = set,
interactive = T,
add.boxplot = T,
add.scores = T,
add.best = T,
key = c("id", "config"),
colour = col.list[[learning.method.id]]
) # %>% plotly::layout(yaxis = list(title = paste0('Mean-squared Error\n', title)))
}
}
fig <- plotly::subplot(
unlist(outl, recursive = FALSE),
nrows = 3,
titleY = TRUE,
titleX = FALSE,
shareY = TRUE,
shareX = TRUE
)
#layout
finalfig = fig %>% plotly::layout(margin = list(l = 75, r = 25, b = 50, t = 50))
finalfig = finalfig %>% plotly::layout(
annotations = list(
list(
x = 0,
y = 0.5,
text = "Mean-squared Error",
xshift = -75,
font = list(color = "black",size = 14),
textangle = 270,
showarrow = F,
xref='paper',
yref='paper',
size=48),
list(
x = 0.5,
y = 0,
text = "Trainig-set size",
yshift = -50,
font = list(color = "black",size = 14),
showarrow = F,
xref='paper',
yref='paper',
size=48)
)
)
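If a standalone copy of the interactive figure is needed, one option (assuming the htmlwidgets package is installed; the file name below is only an example) is:
#save the interactive figure as a standalone HTML file
htmlwidgets::saveWidget(widget = finalfig, file = "figure_mse.html")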
Mean-absolute error
We consider the mean absolute error (mae).
Test
This is the mae score for the opt configuration when considering the test set.
#plot
renoir:::plot.RenoirSummaryTable(
x = resl[resl$config == "opt",,drop=F],
measure = "mae",
set = "test",
interactive = T,
add.boxplot = F,
add.scores = F,
add.best = F,
key = c("id", "config")
)
This is the mae score for the 1se configuration when considering the test set.
#plot
renoir:::plot.RenoirSummaryTable(
x = resl[resl$config == "1se",,drop=F], #select 1se config
measure = "mae",
set = "test",
interactive = T,
add.boxplot = F,
add.scores = F,
add.best = F,
key = c("id", "config")
)