#### Enrichment analysis course, SIB, June 26th 2020 # Enrichment analysis, SIB course, June 26th 2020 # load the packages needed for the R exercise library(clusterProfiler) library(pathview) # Some reminders about the usage of R: # to look for help on a function and which arguments to change use ? ?t.test # to search for a keyword in all or in a particular package, use ?? ??adjust ??stats::adjust # Some reminders about the usage of R: # to look for help on a function and which arguments to change use ? ?t.test # to search for a keyword in all or in a particular package, use ?? ??adjust ??stats::adjust # to access a row in a data frame: iris[3, ] # to search for a word: which(iris$Species=="setosa") # to count events: table(iris$Species) # or summary(iris$Species) #### ---- Exercise 1 # Differential gene expression between two different # immune cell types, natural killer (NK) cells and CD4 T helper cells (Th)). # The data was generated by RNA sequencing and analyzed using limma. # Import file NK_vs_Th_diff_gene_exp.csv that contains gene ensembl ID, # gene symbols, log2 fold change, t-statistic and raw p-value # 1. Is CPS1 significantly differentially expressed at alpha=0.05? # 2. How many genes are up-regulated and how many genes are down-regulated # after BH p-value adjustment? # 3. Is CPS1 still differentially expressed after p-value adjustment at alpha=0.05? # Steps: # - import csv file NK_vs_Th_diff_gene_exercise_1.csv # - look for CPS1 gene in table # - create new column with adjusted p-values # - again look for CPS1 in table # Import table: NK_vs_Th<-read.csv("NK_vs_Th_diff_gene_exercise_1.csv", header = T) # Look at table structure: head(NK_vs_Th) # Search for CPS1 in symbol column: NK_vs_Th[which(NK_vs_Th$symbol=="CPS1"),] # Create new column with adjusted p-value: NK_vs_Th$p.adj<-p.adjust(NK_vs_Th$P.Value, method = "BH") # Summary of DE genes: # Number of down-regulated genes: summary(NK_vs_Th$p.adj<0.05&NK_vs_Th$logFC<0) # Mode FALSE TRUE # logical 18590 1895 # Number of up-regulated genes: summary(NK_vs_Th$p.adj<0.05&NK_vs_Th$logFC>0) # Mode FALSE TRUE # logical 18528 1957 # Again search for CPS1 in symbol column: NK_vs_Th[which(NK_vs_Th$symbol=="CPS1"),] #### ---- Exercise 2 # - Is the adaptive immune response gene set significantly enriched in genes up-regulated in NK vs Th? # - How many GO gene sets are significant after GSEA (use minGSSize=30, pvalueCutoff=0.1) ? # - Is the adaptive immune response gene set significant? Up-reg. or down-reg.? # - Are the majority of gene sets rather up-regulated or down-regulated? # Steps: # - Import the list of genes belonging to the adaptive immune # response: adaptive_immune_response_geneset_term2gene.csv" # - Run Fisher test of up-regulated genes # - Create named vector of t-statistics and run gsea of built-in GO gene sets: # set argument minGSSize=30. # NOTE: if the gsea fails, use readRDS() to import the pre-computed results # contained in file gseGO_Nk_vs_Th_results.rds # - Look for adatpive immune response gene set in results table # - Count the number of negative NES values and the number of positive NES values # Import adaptive immune response: adimresp_term2gene<-read.csv("adaptive_immune_response_geneset_term2gene.csv", header = T) nrow(adimresp_term2gene) # 663 # Check which genes in the gene set are also present in the RNA seq data length(which(NK_vs_Th$symbol %in% adimresp_term2gene$V2)) # 485 # Determine numbers of up-regulated genes that are within or outside of gene set: NK_up<-subset(NK_vs_Th, NK_vs_Th$p.adj<=0.05&NK_vs_Th$logFC>0) NK_up<-as.character(NK_up$symbol) summary(NK_up %in% adimresp_term2gene$V2) # Mode FALSE TRUE # logical 1882 75 NK_not_DE<-subset(NK_vs_Th, NK_vs_Th$p.adj>0.05) NK_not_DE<-as.character(NK_not_DE$symbol) summary(NK_not_DE %in% adimresp_term2gene$V2) # Mode FALSE TRUE # logical 16363 270 # create contingency table and run fisher test: cont.table<-matrix(c(75, 270, 1882, 16363), ncol=2,byrow = F) colnames(cont.table)<-c("up", "not_up") rownames(cont.table)<-c("in_set", "not_in_set") cont.table # up not_up # in_set 75 1882 # not_in_set 270 16363 fisher.test(cont.table) # Fisher's Exact Test for Count Data # data: cont.table # p-value = 8.041e-10 # alternative hypothesis: true odds ratio is not equal to 1 # 95 percent confidence interval: # 1.835981 3.144708 # sample estimates: # odds ratio # 2.414864 # GSEA: prepare gene symbol-named vector of sorted t-statistics # gsea using built-in GO genesets idType(OrgDb = "org.Hs.eg.db") # to determine allowed gene id type gl<-NK_vs_Th$t names(gl)<-make.names(NK_vs_Th$symbol, unique = T) gl<-gl[order(gl, decreasing = T)] # GO_NK_Th<-gseGO(gl, ont="BP", nPerm=1000, # OrgDb = org.Hs.eg.db, # keyType = "SYMBOL", # minGSSize=30) # if this fails, import the results: GO_NK_Th<-readRDS("gseGO_Nk_vs_Th_results.rds") # To have a glance at the results table View(GO_NK_Th@result) # Is the adaptive immune response gene set significant? GO_NK_Th@result[GO_NK_Th@result$Description=="adaptive immune response",] # Count the number of up- and down-regulated gene sets: summary(GO_NK_Th@result$p.adjust<0.05&GO_NK_Th@result$NES<0) # Mode FALSE TRUE # logical 204 70 summary(GO_NK_Th@result$p.adjust<0.05&GO_NK_Th@result$NES>0) # Mode FALSE TRUE # logical 204 70 #### ---- Exercise 3 # - First convert the gene symbols to EntrezID to perform a GSEA of KEGG # pathways (with argument minGSSize=30). # - Are the majority of gene sets rather up-regulated or down-regulated? # - Is there a KEGG immune-related gene set coming up? Is there a KEGG Natural # killer gene set coming up? # - If you want to see which genes are included in one of the built-in KEGG pathways, # where could you find this information? # - Import the hallmark gene sets and run a GSEA. How many significant gene sets are there? # Steps: # - Use this function to see the types of ID that can be converted: keytypes(org.Hs.eg.db) # Use bitr to convert Ensembl IDs to ENTREZID and create a named vector of t-statistics # to provide as gene list to the gseKEGG function, use minGSSize = 30 # NOTE: if the gsea fails, use readRDS to import the pre-computed results # contained in the file gseKEGG_Nk_vs_Th_results.rds # - count the number of positive NES values and of negative NES values # - To search for a partial word in a table, eg. "immune", use grep() # - use str() to view the structure and content of a gsearesult object # - use read.gmt to import hallmark genesets in file h.all.v7.1.symbols.gmt # use function GSEA() providing a named vector of t-statistics # NOTE: if the gsea fails, use readRDS to import the pre-computed results # contained in the file hGSEA_Nk_vs_Th_results.rds # count the number of adj. p-values below 0.05 keytypes(org.Hs.eg.db) # convert from= "ENSEMBL" to "SYMBOL" and "ENTREZID" gene_convert <- bitr(as.character(NK_vs_Th$ensembl_gene_id), fromType="ENSEMBL", toType=c("SYMBOL", "ENTREZID"), OrgDb="org.Hs.eg.db") dim(gene_convert) # [1] 15991 3 # GSEA of KEGG: create a vector of genes that are coded with the EntrezID: # use the sorted gene list gl previously created: gl_kegg<-cbind(SYMBOL=names(gl), t=gl) # merge with converted gene symbols to combine both: gl_kegg<-merge(gl_kegg, gene_convert) head(gl_kegg) # SYMBOL t ENSEMBL ENTREZID # 1 A1BG 1.129187394 ENSG00000121410 1 # 2 A2M -0.382294217 ENSG00000175899 2 # 3 A2MP1 -1.610033787 ENSG00000256069 3 # 4 A4GALT 0.808365644 ENSG00000128274 53947 # 5 AAAS 0.749990903 ENSG00000094914 8086 # 6 AACS 2.172253591 ENSG00000081760 65985 gl_kegg_list<-as.numeric(as.character(gl_kegg$t)) names(gl_kegg_list)<-as.character(gl_kegg$ENTREZID) gl_kegg_list<-sort(gl_kegg_list, decreasing = T) # run GSEA of KEGG: KEGG_NK_Th<-gseKEGG(gl_kegg_list, organism = "hsa", "ncbi-geneid", minGSSize = 30) # if fails, import results: # KEGG_NK_Th<-readRDS("gseKEGG_Nk_vs_Th_results.rds") View(KEGG_NK_Th@result) # Direction of the gene sets: summary(KEGG_NK_Th@result$NES>0) # Mode FALSE TRUE # logical 8 17 KEGG_NK_Th@result[grep("immune",KEGG_NK_Th@result$Description), ] # integer(0) KEGG_NK_Th@result[grep("Natural killer",KEGG_NK_Th@result$Description), ] # ID Description setSize enrichmentScore NES # hsa04650 hsa04650 Natural killer cell mediated cytotoxicity 96 0.6073955 2.424754 # pvalue p.adjust qvalues rank leading_edge # hsa04650 1e-10 1.23e-08 9.842105e-09 1889 tags=51%, list=13%, signal=45% # How many built-in KEGG pathways? length(KEGG_NK_Th@geneSets) # 337 KEGG_NK_Th@geneSets$hsa04650 # coded as Entrez Gene ID # Import hallmark, convert to term2gene and run GSEA: term2gene_h<-read.gmt("h.all.v7.1.symbols.gmt") head(term2gene_h) # Run GSEA with the function that allows to use custom gene sets, # provide the named vector of t statistics h_NK_vs_Th<-GSEA(gl, TERM2GENE = term2gene_h) # Number of significant gene sets: length(which(h_NK_vs_Th@result$p.adjust<=0.05)) # 4 # A quick dotplot of the hallmark results: dotplot(h_NK_vs_Th) #### ---- Exercise 4 # Create figures for the GSEA results: # - barplot of –log10(adj. p-value) of top 10 GO p-values # - barcode plot of one of the significant hallmark gene sets (choose one) # - pathview map for KEGG Natural Killer mediated cytotoxicity (optional: with none-significant genes in grey) # Steps: # - use function barplot() and provide -log10(adj. p-value) of GO GSEA results and name the bars # - use function gseaplot() and provide ID of gene set # - create a named vector of fold change values (optional: setting the logFC of non-significant # genes to 0), search the KEGG ID of the pathway (a hsa0000 type ID) to provide to pathview() function # Barplot of top 10 GO gene sets: par(mar=c(5,16,3,3)) barplot(-log10(GO_NK_Th@result$p.adjust)[1:10], horiz = T, names=GO_NK_Th@result$Description[1:10], las=2, xlab="-log10(adj.p-value)", cex.names = 0.7) # barcode plot with enrichplot: gseaplot(h_NK_vs_Th, geneSetID = "HALLMARK_MTORC1_SIGNALING", title="HALLMARK_MTORC1_SIGNALING") # pathview with non-significant genes in grey: # set log fold change of non-significant genes to 0: NK_vs_Th$logFC_0<-ifelse(NK_vs_Th$p.adj>0.05, 0, NK_vs_Th$logFC) # create named vector of fold change values: genePW<-NK_vs_Th$logFC_0 names(genePW)<-NK_vs_Th$symbol # create pathview map of Natural killer cell mediated cytotoxicity = hsa04650 pathview(gene.data = genePW, pathway.id = "hsa04650", species = "hsa", gene.idtype = "SYMBOL") #### ---- Extra exercise for credits # - Perform GSEA of the NK vs Th data using the Reactome gene sets downloaded on the # MSigDB website. # - How many gene sets are significantly enriched? Generate an ordered barplot of # the NES of all genesets, and generate a barcode plot for the gene set with the lowest NES # Steps # - import the reactome gene sets in the file c2.cp.reactome.v7.1.symbols.gmt using read.gmt() # Use GSEA providing a sorted named vector of t-statistics and the reactome gene sets, # use minGSSize=30 # NOTE: if the GSEA fails, use readRDS to import the pre-computed results contained in the # file "reactomeGSEA_NK_vs_Th_results.rds" # - count the number of significant adjusted p-values # - use barplot() and gseaplot() for the visualization of the results