install.packages("BiocManager") BiocManager::install('mixOmics') library(FactoMineR) library(mixOmics) # 1) ACP (PCA) -------------------------- # 1a) Package FactoMineR ------------------------ # iris dataset----- head(iris) table(iris$Species) mypca <- PCA(iris,scale.unit=TRUE,quali.sup=5,graph=FALSE) eigenvalues <- mypca$eig barplot(eigenvalues[, 2], names.arg=1:nrow(eigenvalues), main = "Variances", xlab = "Principal Components", ylab = "Percentage of variances", col ="steelblue") # Add connected line segments to the plot lines(x = 1:nrow(eigenvalues), eigenvalues[, 2], type="b", pch=19, col = "red") plot(mypca,choix="var") plot(mypca,choix="ind",habillage=5) # dataset srbc ------ data(srbct) dataVar <- srbct$gene # gene expression data dim(dataVar) clineClass<-srbct$class # tumor types dataPCA<-cbind(as.data.frame(clineClass),dataVar) myPCA=PCA(dataPCA, scale.unit=TRUE, ncp=4, graph=T,quali.sup=1) # scree plot barplot(myPCA$eig[,2], main="Histogram of eigen values", names.arg=rownames(myPCA$eig), xlab="Axes", ylab="Pourcentage d’inertie") # variables plot plot.PCA(myPCA, axes=c(1, 2), choix="var") # individuals plot plot.PCA(myPCA, axes=c(1, 2), choix="ind",habillage=1) # 1b) Package MixOmics ------------------------ mixo.pca <- pca(dataVar, ncomp = 10, center = TRUE, scale = TRUE) plot(mixo.pca) plotVar(mixo.pca, comp = c(1, 2), var.names = TRUE, title = 'PCA comp 1 - 2',cex=2.5,cutoff = 0.7) plotIndiv(mixo.pca, comp = c(1, 2), ind.names = TRUE, group = dataPCA[,1], legend = TRUE) ## 2) sPCA ---------------------------- ## 2a) optimal nb of variables calculations --------------- # several nb are tested test.keepX <- c(seq(5, 25, 5)) # tune.spca.res <- tune.spca(dataVar, ncomp = 3, # nb of compo to study nrepeat = 5, # nb of cross validations folds = 10, #nb of folds test.keepX = test.keepX) plot(tune.spca.res) # cross validation output # otpimal nb of components tune.spca.res$choice.keepX ## 2b) sPCA ------------------------ final.spca <- spca(dataVar, ncomp = 3, # 3 compoentns kept keepX = tune.spca.res$choice.keepX) # we keep the optimal nb of variable per component # graph of variables => more informative variables plotVar(final.spca, comp = c(1, 2), var.names = TRUE, # plot variables against the sPCA components title = 'sPCA comp 1 - 2') # graphe of individuals plotIndiv(final.spca, comp = c(1, 2), ind.names = TRUE, # plot final sPCA samples for first two components group = clineClass, # use the class to colour each sample legend = TRUE, title = 'sPCA comp 1 - 2') # 3) PLS-DA ------------------- srbct.plsda <- plsda(dataPCA[,2:ncol(dataPCA)], dataPCA[,1], ncomp = 10) #with a sparse version #srbct.splsda <- splsda(dataPCA[,2:ncol(dataPCA)], dataPCA[,1], ncomp = 10) # individuals graph plotIndiv(srbct.plsda , comp = 1:2, group = dataPCA[,1], ind.names = FALSE, # ind colored per group ellipse = TRUE, # 95% intervalle de confiance affiché legend = TRUE, title = '(a) PLSDA with confidence ellipses') # individual graphs background = background.predict(srbct.plsda, comp.predicted=2, dist = "max.dist") plotIndiv(srbct.plsda, comp = 1:2, group = clineClass, ind.names = FALSE, # colour points by class background = background, # include prediction background for each class legend = TRUE, title = " (b) PLSDA with prediction background") plotVar(srbct.plsda, comp = c(1, 2), var.names = TRUE) plotVar(srbct.plsda, comp = c(1, 2), var.names = TRUE, cutoff=0.7) # 4) CCA -------------------- # 4a) when P + Q < N ---------------------- data(nutrimouse) X <- nutrimouse$lipid[, 1:10] Y <- nutrimouse$gene[, 1:10] result.cca.nutrimouse <- rcc(Y, X) # run the CCA method to ckeck !!! plotIndiv(result.cca.nutrimouse) # plot projection into canonical variate subspace plotVar(result.cca.nutrimouse) # plot original variables' correlation with canonical variates # 4b) P + Q > N ---------------------- X <- nutrimouse$lipid # extract all lipid concentration variables Y <- nutrimouse$gene # extract all gene expression variables # with a regularization tune.rcc.res<-tune.rcc (Y,X)# : function to calculate optimal lambda values result.cca.nutrimouse <- rcc(Y, X, method = "ridge", lambda1 = 0.025075, lambda2 = 0.025075) # using the ridge method # other regularization result.rcca.nutrimouse2 <- rcc(Y, X, method = 'shrinkage') # using the shrinkage method plotIndiv(result.cca.nutrimouse) plotVar(result.cca.nutrimouse,cutoff=0.5,cex=c(2,2)) plotVar(result.rcca.nutrimouse2) # 5) DIABLO -------------------- # 5a) data ------------------ data(breast.TCGA) # load the data # list of the 3 datasets data = list(miRNA = breast.TCGA$data.train$mirna, mRNA = breast.TCGA$data.train$mrna, proteomics = breast.TCGA$data.train$protein) lapply(data, dim) # vdimensions Y = breast.TCGA$data.train$subtype # response variable summary(Y) # 5b) PLS per pairs of datasets ------------------ #tune.spls(data[["miRNA"]],Y) list.keepX = c(25, 25) list.keepY = c(25, 25) pls1 <- spls(data[["miRNA"]], data[["mRNA"]], keepX = list.keepX, keepY = list.keepY) pls2 <- spls(data[["miRNA"]], data[["proteomics"]], keepX = list.keepX, keepY = list.keepY) pls3 <- spls(data[["mRNA"]], data[["proteomics"]], keepX = list.keepX, keepY = list.keepY) # graphics PLS plotVar(pls1, cutoff = 0.5, title = "(a) miRNA vs mRNA", legend = c("miRNA", "mRNA"), var.names = FALSE, style = 'graphics', pch = c(16, 17), cex = c(2,2), col = c('darkorchid', 'lightgreen')) plotVar(pls2, cutoff = 0.5, title = "(b) miRNA vs proteomics", legend = c("miRNA", "proteomics"), var.names = FALSE, style = 'graphics', pch = c(16, 17), cex = c(2,2), col = c('darkorchid', 'lightgreen')) plotVar(pls3, cutoff = 0.5, title = "(c) mRNA vs proteomics", legend = c("mRNA", "proteomics"), var.names = FALSE, style = 'graphics', pch = c(16, 17), cex = c(2,2), col = c('darkorchid', 'lightgreen')) cor(pls1$variates$X, pls1$variates$Y) cor(pls2$variates$X, pls2$variates$Y) cor(pls3$variates$X, pls3$variates$Y) # 5b) DIABLO initial model------------------ # Here, we want to maximize the prediction of the tumor class, # so although the datasets are highly correlated, we choose 0.1. design = matrix(0.1, ncol = length(data), nrow = length(data), dimnames = list(names(data), names(data))) diag(design) = 0 # diagonale à 0 design basic.diablo.model = block.splsda(X = data, Y = Y, ncomp = 5, design = design) # graph of inidividuals plotIndiv(basic.plsda.model, legend = TRUE, # graph of inidividuals plotIndiv(basic.plsda.model, legend = TRUE, # 5c) Choice of the optimal number of components --------------------- # cross validation perf.diablo = perf(basic.diablo.model, validation = 'Mfold', folds = 10, nrepeat = 10) plot(perf.diablo) # prefomance evalution graphical output # identification of the optimal ncomp value # for a given distance method and a given error rate calculation. ncomp = perf.diablo$choice.ncomp$WeightedVote["Overall.BER", "centroids.dist"] # show the optimal choice for ncomp for each dist metric perf.diablo$choice.ncomp$WeightedVote # 5d) choice of the optimal number of variables ---------------- # grille de nombres de variables à tester test.keepX = list (mRNA = c(5:9, seq(10, 18, 2), seq(20,30,5)), miRNA = c(5:9, seq(10, 18, 2), seq(20,30,5)), proteomics = c(5:9, seq(10, 18, 2), seq(20,30,5))) # selection of the optimal number of variables tune.TCGA = tune.block.splsda(X = data, Y = Y, ncomp = ncomp, test.keepX = test.keepX, design = design, validation = 'Mfold', folds = 10, nrepeat = 1, dist = "centroids.dist") list.keepX = tune.TCGA$choice.keepX # set the optimal values of features to retain list.keepX # 5e) Final model ------------------- final.diablo.model = block.splsda(X = data, Y = Y, ncomp = ncomp, keepX = list.keepX, design = design) final.diablo.model$design # design matrix for the final model # the features selected to form the first component selectVar(final.diablo.model, block = 'mRNA', comp = 1)$mRNA$name # 5f) Graphical output --------------------- plotDiablo(final.diablo.model, ncomp = 1) plotIndiv(final.diablo.model, ind.names = FALSE, legend = TRUE, title = 'DIABLO Sample Plots') plotArrow(final.diablo.model, ind.names = FALSE, legend = TRUE, title = 'DIABLO') plotVar(final.diablo.model, var.names = FALSE, style = 'graphics', legend = TRUE, pch = c(16, 17, 15), cex = c(2,2,2), col = c('darkorchid', 'brown1', 'lightgreen')) # 6) MINT --------------------------- data(stemcells) # stem cells data X <- stemcells$gene # gene expression Y <- stemcells$celltype # stem cells types study <- stemcells$study # studies of the samples dim(X) conttable<-table(Y,study) chisq.test(conttable) # types of celle depend on the study stem.plsda <- plsda(X, Y, ncomp = 10) plotIndiv(stem.plsda , comp = 1:2, group = Y, ind.names = FALSE, # colour points per group ellipse = TRUE, # include 95% confidence ellipse for each class legend = TRUE, title = '(a) PLSDA with confidence ellipses') basic.plsda.model <- mint.plsda(X, Y, study = study, ncomp = 2) # graph of inidividuals plotIndiv(basic.plsda.model, legend = TRUE, title = ' ', subtitle = ' ', ellipse = TRUE)