WGCNA是一个R包,对一个完全不会R的人来说,确实费了一番功夫,不过也将我对R的学习提前提上日程。
分析步骤:
1.WGCNA安装
source("http://bioconductor.org/biocLite.R") biocLite(c("AnnotationDbi", "impute", "GO.db", "preprocessCore")) install.packages("WGCNA")
2.输入数据准备
原始数据https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE48213,共56个样本;对数据的合并可参考http://www.biotrainee.com:8080/thread-603-1-1.html,输入文件格式如下:
setwd('WGCNA/') fpkm=read.table('test.txt',sep = '\t',stringsAsFactors = F) head(fpkm) dim(fpkm) ##共56个细胞系,36953个基因的表达矩阵 names(fpkm)##细胞系名称
3.数据预处理
library(reshape2) library('WGCNA') RNAseq_voom <- fpkm WGCNA_matrix = t(RNAseq_voom[order(apply(RNAseq_voom,1,mad), decreasing = T)[1:3000],]) ##top 3000 mad genes,t参数将矩阵变为符合WGCNA要求的形式:行名为gene,列名为样品 datExpr <- WGCNA_matrix save(datExpr, file = "AS-green-FPKM-01-dataInput.RData")
4.确定最佳beta值
powers = c(c(1:10), seq(from = 12, to=20, by=2)) # Call the network topology analysis function sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5) # Plot the results: ##sizeGrWindow(9, 5) par(mfrow = c(1,2)); cex1 = 0.9; # Scale-free topology fit index as a function of the soft-thresholding power plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n", main = paste("Scale independence")); text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], labels=powers,cex=cex1,col="red"); # this line corresponds to using an R^2 cut-off of h abline(h=0.90,col="red") # Mean connectivity as a function of the soft-thresholding power plot(sft$fitIndices[,1], sft$fitIndices[,5], xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n", main = paste("Mean connectivity")) text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
最佳的beta值就是sft$powerEstimate,此例为6
5.一步法构建共表达矩阵
net = blockwiseModules(datExpr, power = 6, maxBlockSize = 6000, TOMType = "unsigned", minModuleSize = 30, reassignThreshold = 0, mergeCutHeight = 0.25, numericLabels = TRUE, pamRespectsDendro = FALSE, saveTOMs = TRUE, saveTOMFileBase = "AS-green-FPKM-TOM", verbose = 3) table(net$colors) #绘图结果展示 mergedColors = labels2colors(net$colors) # Plot the dendrogram and the module colors underneath plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]], "Module colors", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05) #结果保存 moduleLabels = net$colors moduleColors = labels2colors(net$colors) table(moduleColors) MEs = net$MEs; geneTree = net$dendrograms[[1]]; save(MEs, moduleLabels, moduleColors, geneTree, file = "AS-green-FPKM-02-networkConstruction-auto.RData")
#对样本做个系统聚类树 #明确基因数和样本数 nGenes = ncol(datExpr) nSamples = nrow(datExpr) datExpr_tree<-hclust(dist(datExpr), method = "average") par(mar = c(0,5,2,0)) plot(datExpr_tree, main = "Sample clustering", sub="", xlab="", cex.lab = 2, cex.axis = 1, cex.main = 1,cex.lab=1)
6.导出网络到Cytoscape
# Recalculate topological overlap if needed TOM = TOMsimilarityFromExpr(datExpr, power = 6); # Read in the annotation file # annot = read.csv(file = "GeneAnnotation.csv"); # Select modules需要修改,选择需要导出的模块颜色 modules = c("turquoise", "blue"); # Select module probes选择模块探测 probes = colnames(datExpr) inModule = is.finite(match(moduleColors, modules)); modProbes = probes[inModule]; #modGenes = annot$gene_symbol[match(modProbes, annot$substanceBXH)]; # Select the corresponding Topological Overlap modTOM = TOM[inModule, inModule]; dimnames(modTOM) = list(modProbes, modProbes) # Export the network into edge and node list files Cytoscape can read cyt = exportNetworkToCytoscape(modTOM, edgeFile = paste("AS-green-FPKM-One-step-CytoscapeInput-edges-", paste(modules, collapse="-"), ".txt", sep=""), nodeFile = paste("AS-green-FPKM-One-step-CytoscapeInput-nodes-", paste(modules, collapse="-"), ".txt", sep=""), weighted = TRUE, threshold = 0.02, nodeNames = modProbes, #altNodeNames = modGenes, nodeAttr = moduleColors[inModule]);
7.分析网络可视化
用heatmap可视化权重网络,heatmap每一行或列对应一个基因,颜色越深表示有较高的邻近
options(stringsAsFactors = FALSE); lnames = load(file = "AS-green-FPKM-01-dataInput.RData"); lnames lnames = load(file = "AS-green-FPKM-02-networkConstruction-auto.RData"); lnames nGenes = ncol(datExpr) nSamples = nrow(datExpr) #可视化全部基因网络 # Calculate topological overlap anew: this could be done more efficiently by saving the TOM # calculated during module detection, but let us do it again here. dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 6); # Transform dissTOM with a power to make moderately strong connections more visible in the heatmap plotTOM = dissTOM^7; # Set diagonal to NA for a nicer plot diag(plotTOM) = NA; # Call the plot function #sizeGrWindow(9,9) TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes") #随便选取1000个基因来可视化 nSelect = 1000 # For reproducibility, we set the random seed set.seed(10); select = sample(nGenes, size = nSelect); selectTOM = dissTOM[select, select]; # There's no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster. selectTree = hclust(as.dist(selectTOM), method = "average") selectColors = moduleColors[select]; # Open a graphical window #sizeGrWindow(9,9) # Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing # the color palette; setting the diagonal to NA also improves the clarity of the plot plotDiss = selectTOM^7; diag(plotDiss) = NA; TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")
8.多步法构建网络
#2.选择合适的阀值,同上 #3. 网络构建:(1) Co-expression similarity and adjacency softPower = 6; adjacency = adjacency(datExpr, power = softPower); #(2) 邻近矩阵到拓扑矩阵的转换,Turn adjacency into topological overlap TOM = TOMsimilarity(adjacency); dissTOM = 1-TOM # (3) 聚类拓扑矩阵 #Call the hierarchical clustering function geneTree = hclust(as.dist(dissTOM), method = "average"); # Plot the resulting clustering tree (dendrogram) #sizeGrWindow(12,9) plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity", labels = FALSE, hang = 0.04); #(4) 聚类分支的休整dynamicTreeCut # We like large modules, so we set the minimum module size relatively high: minModuleSize = 30; # Module identification using dynamic tree cut: dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM, deepSplit = 2, pamRespectsDendro = FALSE, minClusterSize = minModuleSize); table(dynamicMods) #4. 绘画结果展示 # Convert numeric lables into colors dynamicColors = labels2colors(dynamicMods) table(dynamicColors) # Plot the dendrogram and colors underneath #sizeGrWindow(8,6) plotDendroAndColors(geneTree, dynamicColors, "Dynamic Tree Cut", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05, main = "Gene dendrogram and module colors") #5. 聚类结果相似模块的融合,Merging of modules whose expression profiles are very similar #在聚类树中每一leaf是一个短线,代表一个基因, #不同分之间靠的越近表示有高的共表达基因,将共表达极其相似的modules进行融合 # Calculate eigengenes MEList = moduleEigengenes(datExpr, colors = dynamicColors) MEs = MEList$eigengenes # Calculate dissimilarity of module eigengenes MEDiss = 1-cor(MEs); # Cluster module eigengenes METree = hclust(as.dist(MEDiss), method = "average"); # Plot the result #sizeGrWindow(7, 6) plot(METree, main = "Clustering of module eigengenes", xlab = "", sub = "") #选择有75%相关性的进行融合 MEDissThres = 0.25 # Plot the cut line into the dendrogram abline(h=MEDissThres, col = "red") # Call an automatic merging function merge = mergeCloseModules(datExpr, dynamicColors, cutHeight = MEDissThres, verbose = 3) # The merged module colors mergedColors = merge$colors; # Eigengenes of the new merged modules: mergedMEs = merge$newMEs; #绘制融合前(Dynamic Tree Cut)和融合后(Merged dynamic)的聚类图 #sizeGrWindow(12, 9) #pdf(file = "Plots/geneDendro-3.pdf", wi = 9, he = 6) plotDendroAndColors(geneTree, cbind(dynamicColors, mergedColors), c("Dynamic Tree Cut", "Merged dynamic"), dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05) #dev.off() # 只是绘制融合后聚类图 plotDendroAndColors(geneTree,mergedColors,"Merged dynamic", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05) #5.结果保存 # Rename to moduleColors moduleColors = mergedColors # Construct numerical labels corresponding to the colors colorOrder = c("grey", standardColors(50)); moduleLabels = match(moduleColors, colorOrder)-1; MEs = mergedMEs; # Save module colors and labels for use in subsequent parts save(MEs, moduleLabels, moduleColors, geneTree, file = "AS-green-FPKM-02-networkConstruction-stepByStep.RData") #6. 导出网络到Cytoscape # Recalculate topological overlap if needed TOM = TOMsimilarityFromExpr(datExpr, power = 6); # Read in the annotation file # annot = read.csv(file = "GeneAnnotation.csv"); # Select modules需要修改 modules = c("brown", "red"); # Select module probes probes = colnames(datExpr) inModule = is.finite(match(moduleColors, modules)); modProbes = probes[inModule]; #modGenes = annot$gene_symbol[match(modProbes, annot$substanceBXH)]; # Select the corresponding Topological Overlap modTOM = TOM[inModule, inModule]; dimnames(modTOM) = list(modProbes, modProbes) # Export the network into edge and node list files Cytoscape can read cyt = exportNetworkToCytoscape(modTOM, edgeFile = paste("AS-green-FPKM-Step-by-step-CytoscapeInput-edges-", paste(modules, collapse="-"), ".txt", sep=""), nodeFile = paste("AS-green-FPKM-Step-by-step-CytoscapeInput-nodes-", paste(modules, collapse="-"), ".txt", sep=""), weighted = TRUE, threshold = 0.02, nodeNames = modProbes, #altNodeNames = modGenes, nodeAttr = moduleColors[inModule]); #===================================================================================== # 分析网络可视化,用heatmap可视化权重网络,heatmap每一行或列对应一个基因,颜色越深表示有较高的邻近 #===================================================================================== options(stringsAsFactors = FALSE); lnames = load(file = "AS-green-FPKM-01-dataInput.RData"); lnames lnames = load(file = "AS-green-FPKM-02-networkConstruction-stepByStep.RData"); lnames nGenes = ncol(datExpr) nSamples = nrow(datExpr) #1. 可视化全部基因网络 # Calculate topological overlap anew: this could be done more efficiently by saving the TOM # calculated during module detection, but let us do it again here. dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 6); # Transform dissTOM with a power to make moderately strong connections more visible in the heatmap plotTOM = dissTOM^7; # Set diagonal to NA for a nicer plot diag(plotTOM) = NA; # Call the plot function #sizeGrWindow(9,9) TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes") #随便选取1000个基因来可视化 nSelect = 1000 # For reproducibility, we set the random seed set.seed(10); select = sample(nGenes, size = nSelect); selectTOM = dissTOM[select, select]; # There's no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster. selectTree = hclust(as.dist(selectTOM), method = "average") selectColors = moduleColors[select]; # Open a graphical window #sizeGrWindow(9,9) # Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing # the color palette; setting the diagonal to NA also improves the clarity of the plot plotDiss = selectTOM^7; diag(plotDiss) = NA; TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")
根据基因间表达量进行模块聚类(前边得到)和所得到的各模块间的相关性
MEs = moduleEigengenes(datExpr, moduleColors)$eigengenes MET = orderMEs(MEs) sizeGrWindow(7, 6) plotEigengeneNetworks(MET, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2), plotDendrograms = FALSE, xLabelsAngle = 90)
脚本来自:
http://tiramisutes.github.io/2016/09/14/WGCNA.html
https://bioconductor.org/packages/devel/bioc/vignettes/CVE/inst/doc/WGCNA_from_TCGA_RNAseq.html
Leave a Reply to T Cancel reply