ggplot2绘制富集分析柱状图

library("ggplot2")
pathway=read.table("COS3_kegg_total.xls",header=T,sep="\t")
pathbar = ggplot(pathway,aes(x=Pathway,y=-1*log10(Pvalue)))
pathbar + geom_bar(stat="identity")
pathbar + geom_bar(stat="identity") + coord_flip()
pathbar+geom_bar(stat="identity",color="red",fill="blue")+coord_flip()
pathbar+geom_bar(stat="identity",aes(fill=-1*log10(Pvalue)))+coord_flip()
porder=factor(as.integer(rownames(pathway)),labels=pathway$Pathway)
pathbar+geom_bar(stat="identity",aes(x=porder,fill=-1*log10(Pvalue)))+coord_flip()
pathbar+geom_bar(stat="identity",aes(x=rev(porder),fill=-1*log10(Pvalue)))+coord_flip()
porder=factor(as.integer(rownames(pathway)),labels=rev(pathway$Pathway))
pathbar+geom_bar(stat="identity",aes(x=rev(porder),fill=-1*log10(Pvalue)))+coord_flip()
pqbar=pathbar+geom_bar(stat="identity",aes(x=rev(porder),fill=-1*log10(Pvalue)))+coord_flip()+theme(legend.position="none")+labs(title="Top20 of pathway enrichment",y=expression(-log[10](Pvalue)))
pqbar
pqbar+geom_hline(yintercept=2,color=c("red"),linetype=4)

pie图绘制

x <- c(344,1619,1002,74,882,117,1292,58,68,158,73,303,139,579,92,93,2110,997)
label <-c("Black","Blue","Brown","Cyan","Green","Greenyellow","Grey","Grey60","Lightcyan","Magenta","Midnightblue","Pink","Purple","Red","Salmon","Tan","Turquoise","Yellow")
pie(x, labels = label,radius =1.8,col =c("black","blue","brown","cyan","green","greenyellow","grey","grey60","lightcyan","magenta","midnightblue","pink","purple","red","salmon","tan","turquoise","yellow"),clockwise = TRUE)
percent<-round(100*x/sum(x),2)
percent <-paste(percent, "%", sep = "")
pie(x, labels = percent,radius =1.8,col =c("black","blue","brown","cyan","green","greenyellow","grey","grey60","lightcyan","magenta","midnightblue","pink","purple","red","salmon","tan","turquoise","yellow"),clockwise = TRUE)
pie(x, labels = percent,main="饼图",radius =2.0,col =c("black","blue","brown","cyan","green","greenyellow","grey","grey60","lightcyan","magenta","midnightblue","pink","purple","red","salmon","tan","turquoise","yellow"),clockwise = TRUE)
pie(x, labels = percent,main="饼图",radius =2.0,col =c("black","blue","brown","cyan","green","greenyellow","grey","grey60","lightcyan","magenta","midnightblue","pink","purple","red","salmon","tan","turquoise","yellow"),clockwise = TRUE)
legend(locator(1),label, cex=1.2, fill=c("black","blue","brown","cyan","green","greenyellow","grey","grey60","lightcyan","magenta","midnightblue","pink","purple","red","salmon","tan","turquoise","yellow"))

awk命令将fastq文件转换成fasta

awk '{if(NR%4==1||NR%4==2){print $0}}'  test.fq | sed 's/^@/>/g' > myfile.fasta

 有时候,比如说我们需要做一个clustalw,我们需要将多行的fasta文件(multiple line fasta)格式文件,转换成单行的fasta文件(single line fasta)序列文件,怎么办呢。 巧用awk + printf一行搞定。
awk '/^>/ { print n $0;}  !/^>/ {printf "%s", $0, n="\n"}  END {print ""}'  test.fa

 

WGCNA(my project)

setwd("D:/Ma_transcription/WGCNA/")
database <- read.table(file = "COS_deseq_counts_normalized.txt", sep = "\t", header = T, row.names = 1, stringsAsFactors = F)
library(reshape2)
library('WGCNA')
enableWGCNAThreads()#打开多线程
WGCNA_matrix = t(database[order(apply(database,1,mad), decreasing = T)[1:10000],])
subname=sapply(colnames(database),function(x) strsplit(x,"_")[[1]][1])
datTraits = data.frame(gsm=names(database),
subtype=subname)
rownames(datTraits)=datTraits[,1]
head(datTraits)
# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=20, by=2))
datExpr <- WGCNA_matrix
# Call the network topology analysis function
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
# Plot the results:
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="green")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
sft$powerEstimate
net = blockwiseModules(datExpr, power = sft$powerEstimate,
                       maxBlockSize = 5000,TOMType = "unsigned", 
                       minModuleSize = 30,reassignThreshold = 0, mergeCutHeight = 0.25,
                       numericLabels = TRUE, pamRespectsDendro = FALSE,
                       saveTOMs = TRUE,
                       saveTOMFileBase = "AS-green-FPKM-TOM",
                       verbose = 3)
table(net$colors)

# Convert labels to colors for plotting
mergedColors = labels2colors(net$colors)
table(mergedColors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],
                    "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)
## assign all of the gene to their corresponding module 
## hclust for the genes.
#明确样本数和基因数
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
#首先针对样本做个系统聚类树
datExpr_tree<-hclust(dist(datExpr), method = "average")
par(mar = c(0,5,2,0))
plot(datExpr_tree, main = "Sample clustering", sub="", xlab="", cex.lab = 2, 
     cex.axis = 1, cex.main = 1,cex.lab=1)
## 如果这个时候样本是有性状,或者临床表型的,可以加进去看看是否聚类合理
#针对前面构造的样品矩阵添加对应颜色
sample_colors <- numbers2colors(as.numeric(factor(datTraits$subtype)), 
                                colors = c("grey","blue","red","green"),signed = FALSE)
## 这个给样品添加对应颜色的代码需要自行修改以适应自己的数据分析项目。
#  sample_colors <- numbers2colors( datTraits ,signed = FALSE)
## 如果样品有多种分类情况,而且 datTraits 里面都是分类信息,那么可以直接用上面代码,
##当然,这样给的颜色不明显,意义不大。
#构造10个样品的系统聚类树及性状热图
par(mar = c(1,4,3,1),cex=0.8)
plotDendroAndColors(datExpr_tree, sample_colors,
                    groupLabels = colnames(sample),
                    cex.dendroLabels = 0.8,
                    marAll = c(1, 4, 3, 1),
                    cex.rowText = 0.01,
design=model.matrix(~0+ datTraits$subtype)
colnames(design)=levels(datTraits$subtype)
moduleColors <- labels2colors(net$colors)
# Recalculate MEs with color labels
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
MEs = orderMEs(MEs0); ##不同颜色的模块的ME值矩阵(样本vs模块)
moduleTraitCor = cor(MEs, design , use = "p");
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)

sizeGrWindow(10,6)
# Will display correlations and their p-values
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
                   signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3));
# Display the correlation values within a heatmap plot
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = names(design),
               yLabels = names(MEs),
               ySymbols = names(MEs),
               colorLabels = FALSE,
               colors = greenWhiteRed(50),
               textMatrix = textMatrix,
               setStdMargins = FALSE,
               cex.text = 0.5,
               zlim = c(-1,1),
               main = paste("Module-trait relationships"))                 main = "Sample dendrogram and trait heatmap")
head(design)
# names (colors) of the modules
modNames = substring(names(MEs), 3)
geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));
## 算出每个模块跟基因的皮尔森相关系数矩阵
## MEs是每个模块在每个样本里面的值
## datExpr是每个基因在每个样本的表达量
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));
names(geneModuleMembership) = paste("MM", modNames, sep="");
names(MMPvalue) = paste("p.MM", modNames, sep="");
## 只有连续型性状才能只有计算
## 这里把是否属于 CB 表型这个变量用0,1进行数值化。
CB = as.data.frame(design[,2]);
names(CB) = "CB"
geneTraitSignificance = as.data.frame(cor(datExpr, CB, use = "p"));
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));
names(geneTraitSignificance) = paste("GS.", names(CB), sep="");
names(GSPvalue) = paste("p.GS.", names(CB), sep="")
module = "turquoise"
column = match(module, modNames);
moduleGenes = moduleColors==module;
sizeGrWindow(7, 7);
par(mfrow = c(1,1));
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
                   abs(geneTraitSignificance[moduleGenes, 1]),
                   xlab = paste("Module Membership in", module, "module"),
                   ylab = "Gene significance for CB",
                   main = paste("Module membership vs. gene significance\n"),
                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)

#首先针对所有基因画热图
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
geneTree = net$dendrograms[[1]]; 
#生成全基因不相似TOM矩阵
dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 6); 
plotTOM = dissTOM^7; 
diag(plotTOM) = NA; 
#TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")

#然后随机选取部分基因作图
nSelect = 400
# For reproducibility, we set the random seed
set.seed(10);
select = sample(nGenes, size = nSelect);
selectTOM = dissTOM[select, select];
# There’s no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster.
selectTree = hclust(as.dist(selectTOM), method = "average")
selectColors = moduleColors[select];
# Open a graphical window
sizeGrWindow(9,9)
# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing
# the color palette; setting the diagonal to NA also improves the clarity of the plot
plotDiss = selectTOM^7;
diag(plotDiss) = NA;
TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")

#最后画模块和性状的关系
# Recalculate module eigengenes
MEs = moduleEigengenes(datExpr, moduleColors)$eigengenes
## 只有连续型性状才能只有计算
## 这里把是否属于 Luminal 表型这个变量用0,1进行数值化。
CB = as.data.frame(design[,2]);
names(CB) = "CB"
# Add the weight to existing module eigengenes
MET = orderMEs(cbind(MEs, CB))
# Plot the relationships among the eigengenes and the trait
sizeGrWindow(5,7.5);
par(cex = 0.9)
plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8,                         xLabelsAngle= 90)
# Plot the dendrogram
sizeGrWindow(6,6);
par(cex = 1.0)
## 模块的聚类图
plotEigengeneNetworks(MET, "Eigengene dendrogram", marDendro = c(0,4,2,0),
                      plotHeatmaps = FALSE)
# Plot the heatmap matrix (note: this plot will overwrite the dendrogram plot)
par(cex = 1.0)
## 性状与模块热图
plotEigengeneNetworks(MET, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2),
                      plotDendrograms = FALSE, xLabelsAngle = 90)
# Recalculate topological overlap
TOM = TOMsimilarityFromExpr(datExpr, power = 6); 
# Select module
module = "blue";
# Select module probes
probes = colnames(datExpr) ## 我们例子里面的probe就是基因名
inModule = (moduleColors==module);
modProbes = probes[inModule]; 
## 也是提取指定模块的基因名
# Select the corresponding Topological Overlap
modTOM = TOM[inModule, inModule];
dimnames(modTOM) = list(modProbes, modProbes)
nTop = 200;
IMConn = softConnectivity(datExpr[, modProbes]);
top = (rank(-IMConn) <= nTop)
vis = exportNetworkToCytoscape(modTOM[top, top], edgeFile = paste("CytoscapeInput-edges-top200", paste(module, collapse="-"), ".txt", sep=""), nodeFile = paste("CytoscapeInput-nodes-top200", paste(module, collapse="-"), ".txt", sep=""), weighted = TRUE, threshold = 0);

参考:lncRNA实战项目-第六步-WGCNA相关性分析

 

MA图绘制

# 导入ggplot2包
library(ggplot2)

# 设置好工作目录(到数据所在目录)
# 读取输入数据  "R0-vs-R3.isoforms.filter.tsv"
data = read.table("R0-vs-R3.isoforms.filter.tsv",header=T,row.names=1)

# 计算M值和A值,并将M作为y轴,A作为x轴
aes = aes(x=(log2(R0_fpkm)+log2(R3_fpkm))/2,y=log2(R0_fpkm)-log2(R3_fpkm))

# 绘制MAplot
ggplot(data=data,aes) + geom_point(aes(color=significant))

# 添加辅助线
ggplot(data=data,aes) + geom_hline(yintersect=0,linetype=4,color="blue") + 
geom_point(aes(color=significant))

# 改变点的大小
maplot = ggplot(data=data,aes) + geom_hline(yintersect=0,linetype=4,color="blue") + 
geom_point(aes(color=significant),size=1)

# 设置自定义染色
maplot + scale_color_manual(values=c("green","black","red"))

# 设置标题
maplot + scale_color_manual(values=c("green","black","red")) + 
labs(title="MAplot of R0-vs-R3",x="A",y="M")

# 保存MAplot
ggsave("R0-vs-R3.MAplot.png",width=8,height=6)

DESeq2使用流程

library("DESeq2")
database <- read.table(file = "COS_genes.count_table.xls", sep = "\t", header = T, row.names = 1)
countData <- database[,1:15]
condition <- factor(c(rep("COS0",3),rep("COS1",3),rep("COS3",3),rep("COS6",3),rep("COS12",3)), levels = c("COS0", "COS1","COS3","COS6","COS12"))
countData <- round(as.matrix(countData))
coldata <- data.frame(row.names = colnames(countData), condition)
dds <- DESeqDataSetFromMatrix(countData, DataFrame(condition), design= ~ condition )
dds <- dds[ rowSums(counts(dds)) > 1, ]
dds <- DESeq(dds)
res0vs1 <- results(dds, contrast = c("condition","COS1","COS0"))
table(res0vs1$pvalue <0.001)
res0vs1 <- res0vs1[order(res0vs1$pvalue),]
write.table(as.data.frame(res0vs1), file="res0vs1.xls", sep="\t", quote = F)

PCA图绘制

输入表达矩阵数据文件

library(ggplot2)
library(gmodels)
inname = "COS_deseq_counts_normalized.txt"
outname = "COS_PCA.png"
group <- factor(c(rep("COS0",3),rep("COS1",3),rep("COS3",3),rep("COS6",3),rep("COS12",3)), levels = c("COS0", "COS1","COS3","COS6","COS12"))
## step 1: 数据的读取和处理
# read the expr data
expr <- read.table(inname, header=T, row.names=1)
# transpose the data
data <- t(expr)
## step2:PCA分析
# do PCA 
data.pca <- fast.prcomp(data)
## step3: PCA结果解析
# fetch the proportion of PC1 and PC2
# 一般情况下PC1 + PC2 > 70% 二维PCA散点图才有效
a <- summary(data.pca)
tmp <- a[4]$importance
pro1 <- as.numeric(sprintf("%.3f",tmp[2,1]))*100
pro2 <- as.numeric(sprintf("%.3f",tmp[2,2]))*100

# 将成分矩阵转换为数据框
pc = as.data.frame(a$x)

# 给pc的数据框添加名称列和分组列(用来画图)
pc$group = group
pc$names = rownames(pc)

## step 4: 绘图
# draw PCA plot figure
xlab=paste("PC1(",pro1,"%)",sep="") 
ylab=paste("PC2(",pro2,"%)",sep="")
pca=ggplot(pc,aes(PC1,PC2)) + 
geom_point(size=3,aes(shape=group,color=group)) + 
geom_text(aes(label=names),size=4)+labs(x=xlab,y=ylab,title="PCA") + 
geom_hline(yintercept=0,linetype=4,color="grey") + 
geom_vline(xintercept=0,linetype=4,color="grey") + 
theme_bw()

# 保存结果
ggsave(outname,pca,width=10,height=8)

绘制盒形图

输入表达矩阵数据文件

library(ggplot2)
library(reshape2)
exprSet <- read.table(file = "COS_deseq_counts_normalized.txt", sep = "\t", header = T)
group_list <- factor(c(rep("COS0",3),rep("COS1",3),rep("COS3",3),rep("COS6",3),rep("COS12",3)), levels = c("COS0", "COS1","COS3","COS6","COS12"))
exprSet_L=melt(exprSet)
colnames(exprSet_L)=c('Gene_ID','sample','value')
exprSet_L$group=rep(group_list,each=nrow(exprSet))
#以sample作为x轴,group作为填充颜色
ggplot(exprSet,aes(x=sample,y=value,fill=group)) + geom_boxplot()
#以group作为x轴,sample作为填充颜色
#ggplot(exprSet,aes(x=group,y=value,fill=sample)) + geom_boxplot()
# 修改sample名称
exprSet_L$sample=paste(exprSet_L$group,exprSet_L$sample,sep="-")
ggplot(exprSet_L,aes(x=sample,y=value,fill=group)) + geom_boxplot()
# 离群点的大小
cvbox=ggplot(exprSet_L,aes(x=sample,y=value,fill=group))+geom_boxplot(outlier.size=0)
#outlier.size=-1 ,点将彻底消失
# 修改x轴字符方向
cvbox+theme(axis.text.x=element_text(angle=90,vjust=0.5,hjust=1))
# 修改图例位置和主题
cvbox+theme_bw()+theme(axis.text.x=element_text(angle=90,vjust=0.5,hjust=1),legend.position="top")
# geom_jitter():为图形设置抖动,防止相同点重叠
cvbox+theme_bw()+theme(axis.text.x=element_text(angle=90,vjust=0.5,hjust=1),legend.position="top") + geom_jitter()

叔本华:作为意志和表象的世界(一)

第一篇  世界作为表象初论

 服从充分根据律的表象

经验和科学的客体

跳出童年时代吧,朋友,觉醒呵!
——J·J·卢梭

“世界是我的表象”:这是一个真理,是对于任何一个生活着和认识着的生物都有效的真理;不过只有人能够将它纳入反省的,抽象的意识罢了。并且,要是人真的这样做了,那么,在他那儿就出现了哲学的思考。于是,他就会清楚而确切地明白,他不认识什么太阳,什么地球,而永远只是眼睛,是眼睛看见太阳;永远只是手,是手感触着地球就会明白围绕着他的这世界只是作为表象而存在着的,也就是说这世界的存在完全只是就它对一个其他事物的,一个进行“表象者”的关系来说的。这个进行“表象者”就是人自己。如果有一真理可以先验地说将出来,那就是这一真理了,因为这真理就是一切可能的、可想得到的经验所同具的那一形式的陈述。它比一切,比时间、空间、因果性等更为普遍,因为所有这些都要以这一真理为前提。我们既已把这些形式都认作根据律的一些特殊构成形态,如果其中每一形式只是对一特殊类型的表象有效,那么,与此相反,客体和主体的分立则是所有那些类型的共同形式。客体主体分立是这样一个形式:任何一个表象,不论是哪一种,抽象的或直观的,纯粹的或经验的,都只有在这一共同形式下,根本才有可能,才可想象。因此,再没有一个比这更确切,更不依赖其他真理,更不需要一个证明的真理了;即是说:对于“认识”而存在着的一切,也就是全世界,都只是同主体相关联着的客体,直观者的直观;一句话,都只是表象。当然,这里所说的对于现在,也对于任何过去,任何将来,对于最远的和近的都有效;因为这里所说的对于时间和空间本身就有效;而又只有在时间、空间中,所有这些[过去、现在、未来、远和近]才能区别出来。一切一切,凡已属于和能属于这世界的一切,都无可避免地带有以主体为条件[的性质],并且也仅仅只是为主体而存在。世界即是表象。

这个真理决不新颖。它已包含在笛卡儿所从出发的怀疑论观点中。不过贝克莱是断然把它说出来的第一人;尽管他那哲学的其余部分站不住脚,在这一点上,他却为哲学作出了不朽的贡献。康德首先一个缺点就是对这一命题的忽略,这在本书附录中将有详尽的交代。与此相反,吠檀多哲学被认为是毗耶舍的作品,这里所谈的基本原理在那里就已作为根本命题出现了,因此印度智者们很早就认识这一真理了。威廉·琼斯在他最近《论亚洲哲学》(《亚洲研究》,第四卷第164页)一文中为此作了证,他说:“吠檀多学派的基本教义不在于否认物质的存在,不在否认它的坚实性、不可入性、广延的形状(否认这些,将意味着疯狂),而是在于纠正世俗对于物质的观念,在于主张物质没有独立于心的知觉以外的本质,主张存在和可知觉性是可以互相换用的术语。”这些话已充分地表出了经验的实在性和先验的观念性两者的共存。

在这第一篇里,我们只从上述的这一方面,即仅仅是作为表象的一面来考察这世界。至于这一考察,虽无损于其为真理,究竟是片面的,从而也是由于某种任意的抽象作用引出来的,它宣告了每一个人内心的矛盾,他带着这一矛盾去假定这世界只是他的表象,另一方面他又再也不能摆脱这一假定。不过这一考察的片面性就会从下一篇得到补充,由另一真理得到补充。这一真理,可不如我们这里所从出发的那一个,是那么直接明确的,而是只有通过更深入的探讨,更艰难的抽象和“别异综同”的功夫才能达到的。它必然是很严肃的,对于每一个人纵不是可怕的,也必然是要加以郑重考虑的。这另一真理就是每人,他自己也能说并且必须说的:“世界是我的意志。”

在作这个补充之前,也就是在这第一篇里,我们必须坚定不移地考察世界的这一面,即我们所从出发的一面,“可知性”的一面:因此,也必须毫无抵触心情地将当前现成的客体,甚至自己的身体(我们就要进一步谈到这点)都仅仅作为表象看,并且也仅仅称之为表象。我们希望往后每一个人都会确切明白我们在这样做的时候,只仅仅是撇开了意志;而意志就是单独构成世界另外那一面的东西;因为这世界的一面自始至终是表象,正如另一面自始至终是意志。至于说有一种实在,并不是这两者中的任何一个方面,而是一个自在的客体(康德的“自在之物”可惜也不知不觉的蜕化为这样的客体),那是梦呓中的怪物;而承认这种怪物就会是哲学里引人误入迷途的鬼火。

转载:叔本华哲学智慧

GSEA命令行

前边已经说到MSigDB数据库的entrez ID和symble ID都是人源的,其他物种要想做GSEA的话,就必须要ID转换,做成属于自己研究物种的数据库,这一步是关键。想要根据简单的Nr注释或其他注释来转换是行不通的,比如我研究的物种为硬骨鱼类,Nr注释结果分值最高的前20个一般也都是鱼类,就算注释到了人源的基因,也可能存在版本不对的问题。因此我们要知道MSigDB数据库基因ID的版本,然后下载对应的全基因组蛋白序列,然后比对卡阈值,做成自己的数据库。

GSEA的运行可以界面操作和命令行操作,界面操作教程很多就不多做赘述,它的不足之处除了操作繁琐之外,里面几个较大的库需要的内存较高,像我的4G渣渣电脑,好几个都运行失败,不得不转到服务器上运行;而且界面也不能批量运行。

java -cp gsea-3.0.jar -Xmx10000m xtools.gsea.Gsea -res ma/normalized_counts_all.gct -cls ma/samples.cls#COS1_versus_COS0 -gmx ma/ma.h.all.v6.1.symbols.gmt -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute gene_set -rnd_type no_balance -scoring_scheme weighted -rpt_label COS1vsCOS0_h -metric Ratio_of_Classes -sort real -order descending -create_gcts false -create_svgs false -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 20 -rnd_seed timestamp -save_rnd_lists false -set_max 2000 -set_min 5 -zip_report false -out multiple -gui false

-Xmx10000m:设置最大内存10000兆,之前界面操作失败就因为这个参数
-res:表达量文件
-cls:样本信息
-gmx:数据库文件
-nperm:Number of permutations
-rpt_label:输出文件的前缀
-metric:排序的方法,如果有重复,可以考虑使用T-test;无重复,可以考虑使用Ratio of calsses(差异倍数)或Diff of classes(差异绝对值)
-plot_top_x:默认20,代表富集分析排名最高的20个通路
-set_max:通路基因的最大数量,默认500,但由于某些通路基因数大于500,建议提高阈值
-set_min:通路基因的最小数量,默认15
-out:输出的文件夹
其他参数均为默认

版权声明:本文为博主原创文章,未经博主允许不得转载。