install.packages("BiocManager")
BiocManager::install('mixOmics')
library(FactoMineR)
library(mixOmics)


# 1) ACP (PCA) --------------------------

# 1a) Package FactoMineR ------------------------

# iris dataset-----
head(iris)
table(iris$Species)

mypca <- PCA(iris,scale.unit=TRUE,quali.sup=5,graph=FALSE)
eigenvalues <- mypca$eig
barplot(eigenvalues[, 2], 
        names.arg=1:nrow(eigenvalues), 
        main = "Variances",
        xlab = "Principal Components",
        ylab = "Percentage of variances",
        col ="steelblue")
# Add connected line segments to the plot
lines(x = 1:nrow(eigenvalues), eigenvalues[, 2], 
      type="b", pch=19, col = "red")	

plot(mypca,choix="var")
plot(mypca,choix="ind",habillage=5)

# dataset srbc ------

data(srbct)
dataVar <- srbct$gene # gene expression data
dim(dataVar) 
clineClass<-srbct$class # tumor types

dataPCA<-cbind(as.data.frame(clineClass),dataVar) 

myPCA=PCA(dataPCA, scale.unit=TRUE, ncp=4, graph=T,quali.sup=1)
# scree plot
barplot(myPCA$eig[,2], main="Histogram of eigen values", names.arg=rownames(myPCA$eig), xlab="Axes", ylab="Pourcentage d’inertie")
# variables plot
plot.PCA(myPCA, axes=c(1, 2), choix="var")
# individuals plot
plot.PCA(myPCA, axes=c(1, 2), choix="ind",habillage=1)

# 1b) Package MixOmics ------------------------
mixo.pca <- pca(dataVar, ncomp = 10, center = TRUE, scale = TRUE)

plot(mixo.pca)

plotVar(mixo.pca, comp = c(1, 2), var.names = TRUE, 
        title = 'PCA comp 1 - 2',cex=2.5,cutoff = 0.7)

plotIndiv(mixo.pca, comp = c(1, 2), ind.names = TRUE,
          group = dataPCA[,1],  
          legend = TRUE)
 

## 2) sPCA ----------------------------
## 2a) optimal nb of variables calculations ---------------
# several nb are tested
test.keepX <- c(seq(5, 25, 5)) # 
tune.spca.res <- tune.spca(dataVar, ncomp = 3, # nb of compo to study
                           nrepeat = 5, # nb of cross validations
                           folds = 10, #nb of folds
                           test.keepX = test.keepX)
plot(tune.spca.res) # cross validation output
# otpimal nb of components
tune.spca.res$choice.keepX 

## 2b) sPCA ------------------------
final.spca <- spca(dataVar, ncomp = 3, # 3 compoentns kept
                 keepX = tune.spca.res$choice.keepX) # we keep the optimal nb of variable per component
# graph of variables => more informative variables
plotVar(final.spca, comp = c(1, 2), var.names = TRUE,  # plot variables against the sPCA components
        title = 'sPCA comp 1 - 2')
# graphe of individuals
plotIndiv(final.spca, comp = c(1, 2), ind.names = TRUE, # plot final sPCA samples for first two components
          group = clineClass,  # use the class to colour each sample
          legend = TRUE, title = 'sPCA comp 1 - 2')


# 3) PLS-DA -------------------
srbct.plsda <- plsda(dataPCA[,2:ncol(dataPCA)], dataPCA[,1], ncomp = 10) 
#with a sparse version 
#srbct.splsda <- splsda(dataPCA[,2:ncol(dataPCA)], dataPCA[,1], ncomp = 10) 
# individuals graph
plotIndiv(srbct.plsda , comp = 1:2, 
          group = dataPCA[,1], ind.names = FALSE,  # ind colored per group          ellipse = TRUE, # 95% intervalle de confiance affiché
          legend = TRUE, title = '(a) PLSDA with confidence ellipses')
# individual graphs
background = background.predict(srbct.plsda, comp.predicted=2, dist = "max.dist")
plotIndiv(srbct.plsda, comp = 1:2,
          group = clineClass, ind.names = FALSE, # colour points by class
          background = background, # include prediction background for each class
          legend = TRUE, title = " (b) PLSDA with prediction background")

plotVar(srbct.plsda, comp = c(1, 2), var.names = TRUE)
plotVar(srbct.plsda, comp = c(1, 2), var.names = TRUE, cutoff=0.7)


# 4) CCA --------------------
# 4a) when P + Q < N ----------------------
data(nutrimouse)
X <- nutrimouse$lipid[, 1:10]
Y <- nutrimouse$gene[, 1:10] 
result.cca.nutrimouse <- rcc(Y, X) # run the CCA method to ckeck !!!
plotIndiv(result.cca.nutrimouse) # plot projection into canonical variate subspace
plotVar(result.cca.nutrimouse) # plot original variables' correlation with canonical variates


# 4b)  P + Q > N ----------------------
X <- nutrimouse$lipid # extract all lipid concentration variables
Y <- nutrimouse$gene # extract all gene expression variables
# with a regularization
tune.rcc.res<-tune.rcc (Y,X)# : function to calculate optimal lambda values
result.cca.nutrimouse <- rcc(Y, X, method = "ridge", lambda1 = 0.025075, lambda2 = 0.025075) # using the ridge method
# other regularization
result.rcca.nutrimouse2 <- rcc(Y, X, method = 'shrinkage')  # using the shrinkage method
plotIndiv(result.cca.nutrimouse) 
plotVar(result.cca.nutrimouse,cutoff=0.5,cex=c(2,2)) 
plotVar(result.rcca.nutrimouse2) 


# 5) DIABLO --------------------
# 5a) data ------------------
data(breast.TCGA) # load the data
# list of the 3 datasets
data = list(miRNA = breast.TCGA$data.train$mirna, 
            mRNA = breast.TCGA$data.train$mrna,
            proteomics = breast.TCGA$data.train$protein)
lapply(data, dim) # vdimensions
Y = breast.TCGA$data.train$subtype # response variable 
summary(Y)

# 5b) PLS per pairs of datasets  ------------------

#tune.spls(data[["miRNA"]],Y)
list.keepX = c(25, 25) 
list.keepY = c(25, 25)
pls1 <- spls(data[["miRNA"]], data[["mRNA"]], 
             keepX = list.keepX, keepY = list.keepY) 
pls2 <- spls(data[["miRNA"]], data[["proteomics"]], 
             keepX = list.keepX, keepY = list.keepY)
pls3 <- spls(data[["mRNA"]], data[["proteomics"]], 
             keepX = list.keepX, keepY = list.keepY)

# graphics PLS 
plotVar(pls1, cutoff = 0.5, title = "(a) miRNA vs mRNA", 
        legend = c("miRNA", "mRNA"), 
        var.names = FALSE, style = 'graphics', 
        pch = c(16, 17), cex = c(2,2), 
        col = c('darkorchid', 'lightgreen'))
plotVar(pls2, cutoff = 0.5, title = "(b) miRNA vs proteomics", 
        legend = c("miRNA", "proteomics"), 
        var.names = FALSE, style = 'graphics', 
        pch = c(16, 17), cex = c(2,2), 
        col = c('darkorchid', 'lightgreen'))
plotVar(pls3, cutoff = 0.5, title = "(c) mRNA vs proteomics", 
        legend = c("mRNA", "proteomics"), 
        var.names = FALSE, style = 'graphics', 
        pch = c(16, 17), cex = c(2,2), 
        col = c('darkorchid', 'lightgreen'))
cor(pls1$variates$X, pls1$variates$Y) 
cor(pls2$variates$X, pls2$variates$Y) 
cor(pls3$variates$X, pls3$variates$Y) 

# 5b) DIABLO initial model------------------
# Here, we want to maximize the prediction of the tumor class,
# so although the datasets are highly correlated, we choose 0.1.
design = matrix(0.1, ncol = length(data), nrow = length(data), 
                dimnames = list(names(data), names(data)))
diag(design) = 0 # diagonale à 0
design
basic.diablo.model = block.splsda(X = data, Y = Y, ncomp = 5, design = design) 
# graph of inidividuals
plotIndiv(basic.plsda.model, legend = TRUE, 
          # graph of inidividuals
          plotIndiv(basic.plsda.model, legend = TRUE, 
                    
# 5c) Choice of the optimal number of components ---------------------
# cross validation
perf.diablo = perf(basic.diablo.model, validation = 'Mfold', 
                   folds = 10, nrepeat = 10) 
plot(perf.diablo) # prefomance evalution graphical output
# identification of the optimal ncomp value 
# for a given distance method and a given error rate calculation.
ncomp = perf.diablo$choice.ncomp$WeightedVote["Overall.BER", "centroids.dist"] 
# show the optimal choice for ncomp for each dist metric
perf.diablo$choice.ncomp$WeightedVote

# 5d) choice of the optimal number of variables ----------------
# grille de nombres de variables à tester
test.keepX = list (mRNA = c(5:9, seq(10, 18, 2), seq(20,30,5)), 
                   miRNA = c(5:9, seq(10, 18, 2), seq(20,30,5)),
                   proteomics = c(5:9, seq(10, 18, 2), seq(20,30,5)))

# selection of the optimal number of variables
tune.TCGA = tune.block.splsda(X = data, Y = Y, ncomp = ncomp, 
                              test.keepX = test.keepX, design = design,
                              validation = 'Mfold', folds = 10, nrepeat = 1,
                              dist = "centroids.dist")

list.keepX = tune.TCGA$choice.keepX # set the optimal values of features to retain
list.keepX


# 5e) Final model -------------------

final.diablo.model = block.splsda(X = data, Y = Y, ncomp = ncomp, 
                                  keepX = list.keepX, design = design)
final.diablo.model$design # design matrix for the final model

# the features selected to form the first component
selectVar(final.diablo.model, block = 'mRNA', comp = 1)$mRNA$name 

                
# 5f) Graphical output ---------------------

plotDiablo(final.diablo.model, ncomp = 1)
plotIndiv(final.diablo.model, ind.names = FALSE, legend = TRUE, 
          title = 'DIABLO Sample Plots')
plotArrow(final.diablo.model, ind.names = FALSE, legend = TRUE, 
          title = 'DIABLO')
plotVar(final.diablo.model, var.names = FALSE, 
        style = 'graphics', legend = TRUE,
        pch = c(16, 17, 15), cex = c(2,2,2), 
        col = c('darkorchid', 'brown1', 'lightgreen'))

  
# 6) MINT ---------------------------
data(stemcells) # stem cells data

X <- stemcells$gene # gene expression 
Y <- stemcells$celltype # stem cells types
study <- stemcells$study # studies of the samples
dim(X)

conttable<-table(Y,study)
chisq.test(conttable) # types of celle depend on the study

stem.plsda <- plsda(X, Y, ncomp = 10) 

plotIndiv(stem.plsda , comp = 1:2, 
          group = Y, ind.names = FALSE,  # colour points per group
          ellipse = TRUE, # include 95% confidence ellipse for each class
          legend = TRUE, title = '(a) PLSDA with confidence ellipses')

basic.plsda.model <- mint.plsda(X, Y, study = study, ncomp = 2) 
# graph of inidividuals
plotIndiv(basic.plsda.model, legend = TRUE, 
          title = ' ', subtitle = ' ', ellipse = TRUE)