DESeq2是一个用于分析基因表达差异的R包,具体操作要在R语言中运行
1.R语言安装DESeq2
>source("https://bioconductor.org/biocLite.R") >biocLite("DESeq2")
2.载入基因表达量文件,添加列名
> setwd("C:\\Users\\18019\\Desktop\\counts") > options(stringsAsFactors=FALSE) > control1<-read.table("SRR957677_counts.txt",sep = "\t",col.names = c("gene_id","control1")) > head(control1) gene_id control11 ENSG00000000003.14_2 15762 ENSG00000000005.5_2 03 ENSG00000000419.12_2 7564 ENSG00000000457.13_3 3015 ENSG00000000460.16_5 7646 ENSG00000000938.12_2 0> control2<-read.table("SRR957678_counts.txt",sep = "\t",col.names = c("gene_id","control2")) > treat1<-read.table("SRR957679_counts.txt",sep = "\t",col.names = c("gene_id","treat1")) >treat2<-read.table("SRR957680_counts.txt",sep = "\t",col.names = c("gene_id","treat2"))
3.数据整合
> raw_count <- merge(merge(control1, control2, by="gene_id"), merge(treat1, treat2, by="gene_id")) > head(raw_count) gene_id control1 control2 treat1 treat21 __alignment_not_unique 7440131 2973831 7861484 86768842 __ambiguous 976485 412543 1014239 11790513 __no_feature 1860117 768637 1289737 18120564 __not_aligned 1198545 572588 1256232 13480685 __too_low_aQual 0 0 0 06 ENSG00000000003.14_2 1576 713 1589 1969#删除前五行>raw_count_filt <- raw_count[-1:-5,]#因为我们无法在EBI数据库上直接搜索找到ENSMUSG00000024045.5这样的基因,只能是ENSMUSG00000024045的整数,没有小数点,所以需要进一步替换为整数的形式。#将_后面的数字替换为空赋值给a>a<- gsub("\\_\\d*", "", raw_count_filt$gene_id)#将.后面的数字替换为空赋值给ENSEMBL>ENSEMBL <- gsub("\\.\\d*", "", a)# 将ENSEMBL重新添加到raw_count_filt1矩阵>row.names(raw_count_filt) <- ENSEMBL > raw_count_filt <- cbind(ENSEMBL,raw_count_filt) > colnames(raw_count_filt)[1] <- c("ensembl_gene_id") >head(raraw_count_filt ) ensembl_gene_id gene_id control1 control2 treat1 treat2 ENSG00000000003 ENSG00000000003 ENSG00000000003.14_2 1576 713 1589 1969ENSG00000000005 ENSG00000000005 ENSG00000000005.5_2 0 0 0 1ENSG00000000419 ENSG00000000419 ENSG00000000419.12_2 756 384 806 984ENSG00000000457 ENSG00000000457 ENSG00000000457.13_3 301 151 217 324ENSG00000000460 ENSG00000000460 ENSG00000000460.16_5 764 312 564 784ENSG00000000938 ENSG00000000938 ENSG00000000938.12_2 0 0 0 0
4.对基因进行注释-获取gene_symbol
用bioMart对ensembl_id转换成gene_symbol
> library("biomaRt") > library("curl") > mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl")) > my_ensembl_gene_id <- row.names(raw_count_filt) > options(timeout = 4000000) > hg_symbols<- getBM(attributes=c('ensembl_gene_id','hgnc_symbol',"chromosome_name", "start_position","end_position", "band"), filters= 'ensembl_gene_id', values = my_ensembl_gene_id, mart = mart)#将合并后的表达数据框raw_count_filt和注释得到的hg_symbols整合为一readcount <- merge(raw_count_filt, hg_symbols, by="ensembl_gene_id") > head(readcount) ensembl_gene_id gene_id control1 control2 treat1 treat2 hgnc_symbol chromosome_name start_position1 ENSG00000000003 ENSG00000000003.14_2 1576 713 1589 1969 TSPAN6 X 1006271092 ENSG00000000005 ENSG00000000005.5_2 0 0 0 1 TNMD X 1005848023 ENSG00000000419 ENSG00000000419.12_2 756 384 806 984 DPM1 20 509348674 ENSG00000000457 ENSG00000000457.13_3 301 151 217 324 SCYL3 1 1698496315 ENSG00000000460 ENSG00000000460.16_5 764 312 564 784 C1orf112 1 1696620076 ENSG00000000938 ENSG00000000938.12_2 0 0 0 0 FGR 1 27612064 end_position band1 100639991 q22.12 100599885 q22.13 50958555 q13.134 169894267 q24.25 169854080 q24.26 27635277 p35.3#输出count表达矩阵> write.csv(readcount, file='readcount_all.csv') > readcount<-raw_count_filt[ ,-1:-2] > write.csv(readcount, file='readcount.csv') > head(readcount) control1 control2 treat1 treat2 ENSG00000000003 1576 713 1589 1969ENSG00000000005 0 0 0 1ENSG00000000419 756 384 806 984ENSG00000000457 301 151 217 324ENSG00000000460 764 312 564 784ENSG00000000938 0 0 0 0
5.DEseq2筛选差异表达基因并注释(bioMart)
#载入数据(countData和colData)> mycounts<-readcount > head(mycounts) control1 control2 treat1 treat2 ENSG00000000003 1576 713 1589 1969 ENSG00000000005 0 0 0 1 ENSG00000000419 756 384 806 984 ENSG00000000457 301 151 217 324 ENSG00000000460 764 312 564 784 ENSG00000000938 0 0 0 0 > condition <- factor(c(rep("control",2),rep("treat",2)), levels = c("control","treat")) > condition [1] control control treat treat Levels: control treat > colData <- data.frame(row.names=colnames(mycounts), condition) > colData condition control1 control control2 control treat1 treat treat2 treat
构建dds对象,开始DESeq流程
>library("DESeq2") > dds <- DESeqDataSetFromMatrix(mycounts, colData, design= ~ condition) > dds <- DESeq(dds) estimating size factors estimating dispersions gene-wise dispersion estimates mean-dispersion relationship final dispersion estimates fitting model and testing > ddsclass: DESeqDataSet dim: 60880 4 metadata(1): version assays(4): counts mu H cooks rownames(60880): ENSG00000000003 ENSG00000000005 ... ENSG00000285993 ENSG00000285994 rowData names(22): baseMean baseVar ... deviance maxCooks colnames(4): control1 control2 treat1 treat2 colData names(2): condition sizeFactor#查看总体结果> res = results(dds, contrast=c("condition", "control", "treat")) > res = res[order(res$pvalue),] > head(res) log2 fold change (MLE): condition control vs treat Wald test p-value: condition control vs treat DataFrame with 6 rows and 6 columns baseMean log2FoldChange lfcSE stat pvalue <numeric> <numeric> <numeric> <numeric> <numeric> ENSG00000178691 1025.66218695436 2.83012875791025 0.225513526042636 12.5497073615672 3.98981786210676e-36ENSG00000135535 2415.77359618136 1.22406336488047 0.183431131037356 6.67314952460929 2.5037106203736e-11ENSG00000164172 531.425786834548 1.30449018960413 0.207785830749451 6.27805170785243 3.42841906697453e-10ENSG00000172239 483.998634607265 1.31701332235233 0.215453141699223 6.11275988813803 9.79226522597759e-10ENSG00000237296 53.0114998109978 2.70139282483841 0.480033904207378 5.62750422660019 1.82835684560772e-08 ENSG00000196504 3592.67315807893 1.09372324353448 0.200308218929736 5.46020153031335 4.75594407815571e-08 padj <numeric> ENSG00000178691 3.90682965057494e-32ENSG00000135535 1.22581671973492e-07ENSG00000164172 1.11903598346049e-06ENSG00000172239 2.39714652731931e-06ENSG00000237296 NA ENSG00000196504 9.31404088266014e-05> summary(res) out of 33100 with nonzero total read count adjusted p-value < 0.1LFC > 0 (up) : 78, 0.24% LFC < 0 (down) : 15, 0.045% outliers [1] : 0, 0% low counts [2] : 23308, 70% (mean count < 135) [1] see 'cooksCutoff' argument of ?results [2] see 'independentFiltering' argument of ?results#这里可以看到有78个基因上调,15个基因下调#将分析结果输出> write.csv(res,file="All_results.csv")
提取差异表达基因
这里我用的方法是倍差法
获取padj(p值经过多重校验校正后的值)小于0.05,表达倍数取以2为对数后大于1或者小于-1的差异表达基因
> diff_gene_deseq2 <-subset(res, padj < 0.05 & abs(log2FoldChange) > 1) > dim(diff_gene_deseq2) [1] 21 6 > head(diff_gene_deseq2) log2 fold change (MLE): condition control vs treat Wald test p-value: condition control vs treat DataFrame with 6 rows and 6 columns baseMean log2FoldChange lfcSE stat pvalue <numeric> <numeric> <numeric> <numeric> <numeric> ENSG00000178691 1025.66218695436 2.83012875791025 0.225513526042636 12.5497073615672 3.98981786210676e-36 ENSG00000135535 2415.77359618136 1.22406336488047 0.183431131037356 6.67314952460929 2.5037106203736e-11 ENSG00000164172 531.425786834548 1.30449018960413 0.207785830749451 6.27805170785243 3.42841906697453e-10 ENSG00000172239 483.998634607265 1.31701332235233 0.215453141699223 6.11275988813803 9.79226522597759e-10 ENSG00000196504 3592.67315807893 1.09372324353448 0.200308218929736 5.46020153031335 4.75594407815571e-08 ENSG00000163848 633.066990185649 1.15489622775117 0.219655131372136 5.25777030810433 1.45812478575117e-07 padj <numeric> ENSG00000178691 3.90682965057494e-32 ENSG00000135535 1.22581671973492e-07 ENSG00000164172 1.11903598346049e-06 ENSG00000172239 2.39714652731931e-06 ENSG00000196504 9.31404088266014e-05 ENSG00000163848 0.000230253090268928#输出差异基因> write.csv(diff_gene_deseq2,file= "DEG_treat_vs_control.csv")#用bioMart对差异表达基因进行注释> library("biomaRt") > library("curl") > hg_symbols<- getBM(attributes=c('ensembl_gene_id','external_gene_name',"description"),filters = 'ensembl_gene_id', values = my_ensembl_gene_id, mart = mart) > head(hg_symbols) ensembl_gene_id external_gene_name 1 ENSG00000011405 PIK3C2A 2 ENSG00000100731 PCNX1 3 ENSG00000128512 DOCK4 4 ENSG00000135535 CD164 5 ENSG00000140526 ABHD2 6 ENSG00000144228 SPOPL description 1 phosphatidylinositol-4-phosphate 3-kinase catalytic subunit type 2 alpha [Source:HGNC Symbol;Acc:HGNC:8971] 2 pecanex 1 [Source:HGNC Symbol;Acc:HGNC:19740] 3 dedicator of cytokinesis 4 [Source:HGNC Symbol;Acc:HGNC:19192] 4 CD164 molecule [Source:HGNC Symbol;Acc:HGNC:1632] 5 abhydrolase domain containing 2 [Source:HGNC Symbol;Acc:HGNC:18717] 6 speckle type BTB/POZ protein like [Source:HGNC Symbol;Acc:HGNC:27934]#合并数据:res结果hg_symbols合并成一个文件> ensembl_gene_id<-rownames(diff_gene_deseq2) > diff_gene_deseq2<-cbind(ensembl_gene_id,diff_gene_deseq2) > colnames(diff_gene_deseq2)[1]<-c("ensembl_gene_id") > diff_name<-merge(diff_gene_deseq2,hg_symbols,by="ensembl_gene_id") > head(diff_name) DataFrame with 6 rows and 9 columns ensembl_gene_id baseMean log2FoldChange lfcSE stat pvalue <character> <numeric> <numeric> <numeric> <numeric> <numeric> 1 ENSG00000011405 1600.01408863821 1.07722909393382 0.24714564887963 4.35868120202462 1.30848557424083e-05 2 ENSG00000100731 1162.93822827396 1.0006257630015 0.214393389946423 4.66724166846545 3.05270197242525e-06 3 ENSG00000128512 368.442571635954 1.19657846347522 0.262780839813213 4.5535224878867 5.27550292947225e-06 4 ENSG00000135535 2415.77359618136 1.22406336488047 0.183431131037356 6.67314952460929 2.5037106203736e-11 5 ENSG00000140526 796.447227235737 1.05296203760187 0.23492350092969 4.48214858639031 7.38952622958053e-06 6 ENSG00000144228 293.746859588111 1.10903132755747 0.283181091639851 3.91633255290906 8.9906210067011e-05 padj external_gene_name <numeric> <character> 1 0.00533862114290257 PIK3C2A 2 0.002491004809499 PCNX1 3 0.00319558231320097 DOCK4 4 1.22581671973492e-07 CD164 5 0.00364483675888146 ABHD2 6 0.0214722343652725 SPOPL description <character> 1 phosphatidylinositol-4-phosphate 3-kinase catalytic subunit type 2 alpha [Source:HGNC Symbol;Acc:HGNC:8971] 2 pecanex 1 [Source:HGNC Symbol;Acc:HGNC:19740] 3 dedicator of cytokinesis 4 [Source:HGNC Symbol;Acc:HGNC:19192] 4 CD164 molecule [Source:HGNC Symbol;Acc:HGNC:1632] 5 abhydrolase domain containing 2 [Source:HGNC Symbol;Acc:HGNC:18717] 6 speckle type BTB/POZ protein like [Source:HGNC Symbol;Acc:HGNC:27934]#输出含注释的差异基因文件write.csv(diff_name,file= "diff_gene.csv")
作者:孤独巡礼_435a
链接:https://www.jianshu.com/p/4d0812195b65
点击查看更多内容
1人点赞
评论
共同学习,写下你的评论
评论加载中...
作者其他优质文章
正在加载中
感谢您的支持,我会继续努力的~
扫码打赏,你说多少就多少
赞赏金额会直接到老师账户
支付方式
打开微信扫一扫,即可进行扫码打赏哦