####  Enrichment analysis course, SIB, June 26th 2020


# Enrichment analysis, SIB course, June 26th 2020

# load the packages needed for the R exercise
library(clusterProfiler)
library(pathview)

# Some reminders about the usage of R:
# to look for help on a function and which arguments to change use ?
?t.test
# to search for a keyword in all or in a particular package, use ??
??adjust
??stats::adjust

# Some reminders about the usage of R:
# to look for help on a function and which arguments to change use ?
?t.test
# to search for a keyword in all or in a particular package, use ??
??adjust
??stats::adjust
# to access a row in a data frame:
iris[3, ]
# to search for a word:
which(iris$Species=="setosa")
# to count events:
table(iris$Species) # or
summary(iris$Species)

#### ---- Exercise 1

# Differential gene expression between two different 
# immune cell types, natural killer (NK) cells and CD4 T helper cells (Th)).
# The data was generated by RNA sequencing and analyzed using limma.
# Import file NK_vs_Th_diff_gene_exp.csv that contains gene ensembl ID,
# gene symbols, log2 fold change, t-statistic and raw p-value

# 1. Is CPS1 significantly differentially expressed at alpha=0.05?
# 2. How many genes are up-regulated and how many genes are down-regulated
# after BH p-value adjustment?
# 3. Is CPS1 still differentially expressed after p-value adjustment at alpha=0.05?

# Steps: 
# - import csv file NK_vs_Th_diff_gene_exercise_1.csv
# - look for CPS1 gene in table
# - create new column with adjusted p-values
# - again look for CPS1 in table

# Import table:
NK_vs_Th<-read.csv("NK_vs_Th_diff_gene_exercise_1.csv",
                   header = T)
# Look at table structure:
head(NK_vs_Th)
# Search for CPS1 in symbol column:
NK_vs_Th[which(NK_vs_Th$symbol=="CPS1"),]

# Create new column with adjusted p-value:
NK_vs_Th$p.adj<-p.adjust(NK_vs_Th$P.Value, method = "BH")

# Summary of DE genes:
# Number of down-regulated genes:
summary(NK_vs_Th$p.adj<0.05&NK_vs_Th$logFC<0)
#    Mode   FALSE    TRUE 
# logical   18590    1895 
# Number of up-regulated genes:
summary(NK_vs_Th$p.adj<0.05&NK_vs_Th$logFC>0)
# Mode      FALSE    TRUE 
# logical   18528    1957 

# Again search for CPS1 in symbol column:
NK_vs_Th[which(NK_vs_Th$symbol=="CPS1"),]


#### ---- Exercise 2

# - Is the adaptive immune response gene set significantly enriched in genes up-regulated in NK vs Th?
# - How many GO gene sets are significant after GSEA (use minGSSize=30, pvalueCutoff=0.1) ?
# - Is the adaptive immune response gene set significant? Up-reg. or down-reg.?
# - Are the majority of gene sets rather up-regulated or down-regulated? 

# Steps:
# - Import the list of genes belonging to the adaptive immune 
# response: adaptive_immune_response_geneset_term2gene.csv"
# - Run Fisher test of up-regulated genes
# - Create named vector of t-statistics and run gsea of built-in GO gene sets: 
# set argument minGSSize=30.
# NOTE: if the gsea fails, use readRDS() to import the pre-computed results 
# contained in file gseGO_Nk_vs_Th_results.rds
# - Look for adatpive immune response gene set in results table
# - Count the number of negative NES values and the number of positive NES values

# Import adaptive immune response:
adimresp_term2gene<-read.csv("adaptive_immune_response_geneset_term2gene.csv",
                             header = T)
nrow(adimresp_term2gene) # 663

# Check which genes in the gene set are also present in the RNA seq data 
length(which(NK_vs_Th$symbol %in% adimresp_term2gene$V2)) # 485

# Determine numbers of up-regulated genes that are within or outside of gene set:
NK_up<-subset(NK_vs_Th, 
              NK_vs_Th$p.adj<=0.05&NK_vs_Th$logFC>0)
NK_up<-as.character(NK_up$symbol)
summary(NK_up %in% adimresp_term2gene$V2)
#    Mode   FALSE    TRUE 
# logical    1882      75
NK_not_DE<-subset(NK_vs_Th, NK_vs_Th$p.adj>0.05)
NK_not_DE<-as.character(NK_not_DE$symbol)
summary(NK_not_DE %in% adimresp_term2gene$V2)
#    Mode   FALSE    TRUE 
# logical   16363     270

# create contingency table and run fisher test:
cont.table<-matrix(c(75, 270, 1882, 16363), ncol=2,byrow = F)
colnames(cont.table)<-c("up", "not_up")
rownames(cont.table)<-c("in_set", "not_in_set")
cont.table
#             up not_up
# in_set      75   1882
# not_in_set 270  16363

fisher.test(cont.table)
# Fisher's Exact Test for Count Data
# data:  cont.table
# p-value = 8.041e-10
# alternative hypothesis: true odds ratio is not equal to 1
# 95 percent confidence interval:
#  1.835981 3.144708
# sample estimates:
# odds ratio 
#   2.414864


# GSEA: prepare gene symbol-named vector of sorted t-statistics
# gsea using built-in GO genesets
idType(OrgDb = "org.Hs.eg.db") # to determine allowed gene id type

gl<-NK_vs_Th$t
names(gl)<-make.names(NK_vs_Th$symbol, unique = T)
gl<-gl[order(gl, decreasing = T)]

# GO_NK_Th<-gseGO(gl, ont="BP", nPerm=1000,
#                 OrgDb = org.Hs.eg.db,
#                  keyType = "SYMBOL",
#                  minGSSize=30)
# if this fails, import the results:
GO_NK_Th<-readRDS("gseGO_Nk_vs_Th_results.rds")

# To have a glance at the results table
View(GO_NK_Th@result)

# Is the adaptive immune response gene set significant?
GO_NK_Th@result[GO_NK_Th@result$Description=="adaptive immune response",]

# Count the number of up- and down-regulated gene sets:
summary(GO_NK_Th@result$p.adjust<0.05&GO_NK_Th@result$NES<0)
#    Mode   FALSE    TRUE 
# logical     204      70 
summary(GO_NK_Th@result$p.adjust<0.05&GO_NK_Th@result$NES>0)
#     Mode   FALSE    TRUE 
# logical      204      70 


#### ---- Exercise 3

# - First convert the gene symbols to EntrezID to perform a GSEA of KEGG 
# pathways (with argument minGSSize=30). 
# - Are the majority of gene sets rather up-regulated or down-regulated?
# - Is there a KEGG immune-related gene set coming up? Is there a KEGG Natural 
# killer gene set coming up?
# - If you want to see which genes are included in one of the built-in KEGG pathways, 
# where could you find this information?
# - Import the hallmark gene sets and run a GSEA. How many significant gene sets are there?

# Steps:
# - Use this function to see the types of ID that can be converted: keytypes(org.Hs.eg.db)
# Use bitr to convert Ensembl IDs to ENTREZID and create a named vector of t-statistics
# to provide as gene list to the gseKEGG function, use minGSSize = 30
# NOTE: if the gsea fails, use readRDS to import the pre-computed results 
# contained in the file gseKEGG_Nk_vs_Th_results.rds
# - count the number of positive NES values and of negative NES values
# - To search for a partial word in a table, eg. "immune", use grep()
# - use str() to view the structure and content of a gsearesult object
# - use read.gmt to import hallmark genesets in file h.all.v7.1.symbols.gmt
# use function GSEA() providing a named vector of t-statistics
# NOTE: if the gsea fails, use readRDS to import the pre-computed results
# contained in the file hGSEA_Nk_vs_Th_results.rds
# count the number of adj. p-values below 0.05

keytypes(org.Hs.eg.db)
# convert from= "ENSEMBL" to "SYMBOL" and "ENTREZID"
gene_convert <- bitr(as.character(NK_vs_Th$ensembl_gene_id), 
                     fromType="ENSEMBL", 
                     toType=c("SYMBOL", "ENTREZID"), OrgDb="org.Hs.eg.db")
dim(gene_convert) # [1] 15991     3

# GSEA of KEGG: create a vector of genes that are coded with the EntrezID:
# use the sorted gene list gl previously created:
gl_kegg<-cbind(SYMBOL=names(gl), 
               t=gl)
# merge with converted gene symbols to combine both:
gl_kegg<-merge(gl_kegg, gene_convert)
head(gl_kegg)
#   SYMBOL            t         ENSEMBL ENTREZID
# 1   A1BG  1.129187394 ENSG00000121410        1
# 2    A2M -0.382294217 ENSG00000175899        2
# 3  A2MP1 -1.610033787 ENSG00000256069        3
# 4 A4GALT  0.808365644 ENSG00000128274    53947
# 5   AAAS  0.749990903 ENSG00000094914     8086
# 6   AACS  2.172253591 ENSG00000081760    65985
gl_kegg_list<-as.numeric(as.character(gl_kegg$t))
names(gl_kegg_list)<-as.character(gl_kegg$ENTREZID)
gl_kegg_list<-sort(gl_kegg_list, decreasing = T)

# run GSEA of KEGG:
KEGG_NK_Th<-gseKEGG(gl_kegg_list, organism = "hsa", "ncbi-geneid",
                    minGSSize = 30)
# if fails, import results:
# KEGG_NK_Th<-readRDS("gseKEGG_Nk_vs_Th_results.rds")
View(KEGG_NK_Th@result)

# Direction of the gene sets:
summary(KEGG_NK_Th@result$NES>0)
#     Mode   FALSE    TRUE 
# logical      8       17 

KEGG_NK_Th@result[grep("immune",KEGG_NK_Th@result$Description), ]
# integer(0)
KEGG_NK_Th@result[grep("Natural killer",KEGG_NK_Th@result$Description), ]
#                ID                               Description setSize enrichmentScore      NES
# hsa04650 hsa04650 Natural killer cell mediated cytotoxicity      96       0.6073955 2.424754
# pvalue p.adjust      qvalues rank                   leading_edge
# hsa04650  1e-10 1.23e-08 9.842105e-09 1889 tags=51%, list=13%, signal=45%

# How many built-in KEGG pathways?
length(KEGG_NK_Th@geneSets) # 337
KEGG_NK_Th@geneSets$hsa04650 # coded as Entrez Gene ID

# Import hallmark, convert to term2gene and run GSEA:
term2gene_h<-read.gmt("h.all.v7.1.symbols.gmt")
head(term2gene_h)

# Run GSEA with the function that allows to use custom gene sets, 
# provide the named vector of t statistics
h_NK_vs_Th<-GSEA(gl, TERM2GENE = term2gene_h)

# Number of significant gene sets:
length(which(h_NK_vs_Th@result$p.adjust<=0.05)) # 4

# A quick dotplot of the hallmark results:
dotplot(h_NK_vs_Th)

#### ---- Exercise 4

# Create figures for the GSEA results:
# - barplot of –log10(adj. p-value) of top 10 GO p-values
# - barcode plot of one of the significant hallmark gene sets (choose one)
# - pathview map for KEGG Natural Killer mediated cytotoxicity (optional: with none-significant genes in grey)

# Steps:
# - use function barplot() and provide -log10(adj. p-value) of GO GSEA results and name the bars
# - use function gseaplot() and provide ID of gene set
# - create a named vector of fold change values (optional: setting the logFC of non-significant
# genes to 0), search the KEGG ID of the pathway (a hsa0000 type ID) to provide to pathview() function

# Barplot of top 10 GO gene sets:
par(mar=c(5,16,3,3))
barplot(-log10(GO_NK_Th@result$p.adjust)[1:10],
        horiz = T, names=GO_NK_Th@result$Description[1:10],
        las=2, xlab="-log10(adj.p-value)",
        cex.names = 0.7) 

# barcode plot with enrichplot:
gseaplot(h_NK_vs_Th, geneSetID = "HALLMARK_MTORC1_SIGNALING",
         title="HALLMARK_MTORC1_SIGNALING")


# pathview with non-significant genes in grey:
# set log fold change of non-significant genes to 0:
NK_vs_Th$logFC_0<-ifelse(NK_vs_Th$p.adj>0.05, 0, NK_vs_Th$logFC)

# create named vector of fold change values:
genePW<-NK_vs_Th$logFC_0
names(genePW)<-NK_vs_Th$symbol

# create pathview map of Natural killer cell mediated cytotoxicity = hsa04650
pathview(gene.data  = genePW,
         pathway.id = "hsa04650",
         species    = "hsa",
         gene.idtype = "SYMBOL")

#### ---- Extra exercise for credits
# - Perform GSEA of the NK vs Th data using the Reactome gene sets downloaded on the 
# MSigDB website.
# - How many gene sets are significantly enriched? Generate an ordered barplot of 
# the NES of all genesets, and generate a barcode plot for the gene set with the lowest NES

# Steps
# - import the reactome gene sets in the file c2.cp.reactome.v7.1.symbols.gmt using read.gmt()
# Use GSEA providing a sorted named vector of t-statistics and the reactome gene sets, 
# use minGSSize=30
# NOTE: if the GSEA fails, use readRDS to import the pre-computed results contained in the 
# file "reactomeGSEA_NK_vs_Th_results.rds"
# - count the number of significant adjusted p-values
# - use barplot() and gseaplot() for the visualization of the results