Skip to content

Commit

Permalink
Switched goseq to clusterprofiler, Updated plots
Browse files Browse the repository at this point in the history
  • Loading branch information
royfrancis committed Jan 5, 2025
1 parent d4612ab commit c6afb58
Show file tree
Hide file tree
Showing 20 changed files with 731 additions and 199 deletions.
4 changes: 0 additions & 4 deletions assets/css/styles.scss
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,6 @@ $footer-fg: DimGray;
max-height: 20px;
}

table, .Reactable {
font-size: 0.8em;
}

.contents-table table {
line-height: 1.2;
}
Expand Down
97 changes: 97 additions & 0 deletions scripts/rnaseq/dardel/dge.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env Rscript

## R script to run differential gene expression

# set custom lib path
.libPaths("/sw/courses/ngsintro/rnaseq/dardel/r")

library(edgeR)
library(DESeq2)

path_input <- "."
path_output <- "."

# metadata
table_meta <- data.frame(
accession = c("SRR3222409", "SRR3222410", "SRR3222411", "SRR3222412", "SRR3222413", "SRR3222414"), condition = c(rep(c("KO", "Wt"), each = 3)),
replicate = rep(1:3, 2)
)
table_meta$condition <- factor(as.character(table_meta$condition), levels = c("Wt", "KO"))
# table_meta$condition <- relevel(table_meta$condition,"Wt")
rownames(table_meta) <- table_meta$accession

# reading in data
message("Reading count data ...")
data_counts <- read.delim(file.path(path_input, "counts_full.txt"), skip = 1)
table_counts <- as.matrix(data_counts[, -c(1:6)])
row.names(table_counts) <- data_counts$Geneid

# fix column labels
colnames(table_counts) <- sub(".*?(SRR[0-9]+).*", "\\1", colnames(table_counts))

# match order of counts and metadata
mth <- match(colnames(table_counts), rownames(table_meta))
table_counts <- table_counts[, mth]
# check if metadata and counts match
all.equal(rownames(table_meta), colnames(table_counts))

# remove genes with low counts
# keep genes that have minimum 1 CPM across 3 samples (since group has three replicates)
message(paste0("Number of genes before filtering: ", nrow(table_counts)))
keepgenes <- rowSums(edgeR::cpm(table_counts) > 1) >= 3
table_counts <- table_counts[keepgenes, ]
message(paste0("Number of genes after filtering: ", nrow(table_counts)))

message("Preview of count table:")
print(head(table_counts))
message("\nView of metadata table:")
print(table_meta)

## deseq2
message("Running DESeq2 ...")

# run deseq2
d <- DESeqDataSetFromMatrix(countData = table_counts, colData = table_meta, design = ~condition)
d <- DESeq2::estimateSizeFactors(d, type = "ratio")
d <- DESeq2::estimateDispersions(d)

dg <- nbinomWaldTest(d)
print(resultsNames(dg))
res <- results(dg, contrast = c("condition", "KO", "Wt"), alpha = 0.05)
message("Summary of DEGs:")
summary(res)

# lfc shrink to correct fold changes
res1 <- lfcShrink(dg, contrast = c("condition", "KO", "Wt"), res = res, type = "normal")

# convert table to data.frame
table_res <- as.data.frame(res1)
table_res$ensembl_gene_id <- rownames(table_res)

# merge results with annotations
table_genes <- read.delim("../reference/mm-biomart99-genes.txt", header = T, stringsAsFactors = F)
table_genes <- table_genes[!duplicated(table_genes$ensembl_gene_id), ]
table_genes <- table_genes[!duplicated(table_genes$external_gene_name), ]
table_res <- merge(table_res, table_genes, by = "ensembl_gene_id")
table_res <- table_res[!is.na(table_res$gene), ]

# alternative solution to convert ensembl gene id to gene symbol
# library(clusterProfiler)
# library(org.Mm.eg.db)
# eg <- bitr(table_res$ensembl_gene_id, fromType = "ENSEMBL", toType = "SYMBOL", OrgDb = "org.Mm.eg.db")
# colnames(eg) <- c("ensembl_gene_id", "gene")
# table_res <- merge(table_res, eg, by = "ensembl_gene_id")

message("Preview of DEG table:")
print(head(table_res))

message("Exporting results ...")
write.table(table_res, file.path(path_output, "dge_results_full.txt"), sep = "\t", quote = F, row.names = F)
saveRDS(table_res, "dge_results_full.rds")

# export vst counts
cv <- as.data.frame(assay(DESeq2::varianceStabilizingTransformation(d, blind = T)), check.names = F)
write.table(cv, file.path(path_output, "counts_vst_full.txt"), sep = "\t", dec = ".", quote = FALSE)
saveRDS(cv, "counts_vst_full.rds")

cat("Completed ...\n")
61 changes: 61 additions & 0 deletions scripts/rnaseq/dardel/gene.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/usr/bin/env Rscript

# set custom library path
.libPaths("/sw/courses/ngsintro/rnaseq/dardel/r")

# path_input <- "/crex/course_data/ngsintro/rnaseq/main_full/3_mapping/"
path_input <- "."
path_output <- "."

# For running as command line script and parsing options from command line
args <- commandArgs(trailingOnly = TRUE)
# test if correct number of arguments are given as input: if not, return an error
if (length(args) < 3) {
stop("Please supply chromosome number as well as start and stop position for the plot\n Example: Rscript gene.r chr19 6062821 6067842",
call. = FALSE
)
}

library(Gviz)
library(EnsDb.Mmusculus.v79)
library(GenomicRanges)

edb <- EnsDb.Mmusculus.v79
gen <- "mm10"
chr <- args[1] # parse the first argument after the r-script to chr
start <- as.integer(args[2]) # parse the second argument after the r-script to start
stop <- as.integer(args[3]) # parse the second argument after the r-script to stop

message("Reading BAM files ...")

# Collect bam filenames from bam folder
bams <- list.files(path_input, pattern = "*.bam$", full.names = TRUE, recursive = TRUE)

# Allow for using different chromosome names than ucsc
options(ucscChromosomeNames = FALSE)

# create tracks
itrack <- IdeogramTrack(genome = gen, chromosome = chr)
gtrack <- GenomeAxisTrack()
grtrack <- getGeneRegionTrackForGviz(edb, chromosome = chr, start = start, end = stop)
genome(grtrack) <- gen
atrack <- lapply(bams, function(bam) {
AlignmentsTrack(bam = bam, isPaired = TRUE, start = start, end = end, genome = gen, chromosome = chr, name = basename(bam))
})

# merge all tracks
toplot <- append(list(itrack, gtrack, GeneRegionTrack(grtrack)), atrack)

width <- 14
height <- 14

# plot
message("Exporting coverage plot ...")
png(file.path(path_output, paste0("coverage-", chr, "-", start, "-", stop, ".png")), width = width, height = height, units = "cm", res = 300)
plotTracks(toplot, type = c("coverage"), from = start, to = stop)
dev.off()

message("Exporting sashimi plot ...")
png(file.path(path_output, paste0("sashimi-", chr, "-", start, "-", stop, ".png")), width = width, height = height, units = "cm", res = 300)
plotTracks(toplot, type = c("sashimi"), from = start, to = stop)
dev.off()
73 changes: 73 additions & 0 deletions scripts/rnaseq/dardel/gse-clusterprofiler.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/usr/local/bin/Rscript

## R script to run gene set enrichment (over-enrichment analysis) on DE genes

# set custom library path
.libPaths("/sw/courses/ngsintro/rnaseq/dardel/r")

library(clusterProfiler)
library(org.Mm.eg.db)

path_input <- "."
path_output <- "."

message("Reading data ...")

table_res <- read.table(file.path(path_input, "dge_results_full.txt"), sep = "\t", header = TRUE, stringsAsFactors = FALSE, quote = "")
table_res <- table_res[!is.na(table_res$padj), ]
table_res <- table_res[!is.na(table_res$entrezgene_id), ]
table_res <- table_res[!duplicated(table_res$entrezgene_id), ]

universe <- unique(table_res$entrezgene_id)
table_res <- table_res[table_res$padj < 0.05, ]
degs <- unique(table_res$entrezgene_id)

message("Running GSE on CC ...")
gse_cc <- enrichGO(
gene = as.character(degs),
universe = as.character(universe),
OrgDb = org.Mm.eg.db,
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05,
readable = TRUE
)

message("Saving CC results ...")
saveRDS(gse_cc, "gse-clusterprofiler-cc.rds")
write.table(gse_cc, file.path(path_output, "gse-cc.txt"), col.names = TRUE, row.names = FALSE, sep = "\t", quote = FALSE)

message("Running GSE on MF ...")
gse_mf <- enrichGO(
gene = as.character(degs),
universe = as.character(universe),
OrgDb = org.Mm.eg.db,
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05,
readable = TRUE
)

message("Saving MF results ...")
saveRDS(gse_mf, "gse-clusterprofiler-mf.rds")
write.table(gse_mf, file.path(path_output, "gse-mf.txt"), col.names = TRUE, row.names = FALSE, sep = "\t", quote = FALSE)

message("Running GSE on BP ...")
gse_bp <- enrichGO(
gene = as.character(degs),
universe = as.character(universe),
OrgDb = org.Mm.eg.db,
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.01,
qvalueCutoff = 0.05,
readable = TRUE
)

message("Saving BP results ...")
saveRDS(gse_bp, "gse-clusterprofiler-bp.rds")
write.table(gse_bp, file.path(path_output, "gse-bp.txt"), col.names = TRUE, row.names = FALSE, sep = "\t", quote = FALSE)

message("Completed.")
91 changes: 91 additions & 0 deletions scripts/rnaseq/dardel/gse-goseq.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/usr/local/bin/Rscript

## R script to run gene set enrichment (over-enrichment analysis) on DE genes

# set custom lib path
.libPaths("/sw/courses/ngsintro/rnaseq/dardel/r")

library(goseq)
library(GO.db)
library(reactome.db)
library(org.Mm.eg.db)

## prepare directory where results will be saved
export_dir <- "gse-goseq"

message("Reading data ...")

table_res <- read.table("dge_results_full.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE, quote = "")
table_res <- table_res[!is.na(table_res$padj), ]

# merge results with annotations
table_genes <- read.delim("mm-biomart99-genes.txt", header = T, stringsAsFactors = F)
table_genes <- table_genes[!duplicated(table_genes$ensembl_gene_id), ]
table_genes$entrezgene_id <- as.character(table_genes$entrezgene_id)
table_dge <- merge(table_res, table_genes, by = "ensembl_gene_id")

## select only genes with entrez gene id
## GO term and reactome pathway annotations are mapped to entrez gene ids
table_dge_e <- table_dge[!is.na(table_dge$entrezgene_id), ]

# remove lines with non-unique entrez gene id
table_dge_e <- table_dge_e[!duplicated(table_dge_e$entrezgene_id), ]

## list of all genes included in the analysis
allgenes <- table_dge_e$entrezgene_id

# select the downregulated genes at FDR<0.05
dn.genes <- table_dge_e$entrezgene_id[(table_dge_e$padj < 0.05 & table_dge_e$log2FoldChange < 0)]
gene.vector.de.dn <- as.integer(allgenes %in% dn.genes)
names(gene.vector.de.dn) <- allgenes

up.genes <- table_dge_e$entrezgene_id[(table_dge_e$padj < 0.05 & table_dge_e$log2FoldChange > 0)]
gene.vector.de.up <- as.integer(allgenes %in% up.genes)
names(gene.vector.de.up) <- allgenes

message("Preparing ontology data ...")

# prepare go db
table_go <- as.data.frame(org.Mm.egGO2ALLEGS)
colnames(table_go) <- c("gene_id", "category", "evidence", "ontology")

# read reactome data
table_reactome <- as.data.frame(reactomeEXTID2PATHID)
colnames(table_reactome) <- c("category", "gene_id")
table_reactome_desc <- as.data.frame(reactomePATHID2NAME)
colnames(table_reactome_desc) <- c("category", "path_name")
# remove the duplicates
table_reactome_desc <- table_reactome_desc[!duplicated(table_reactome_desc$category), ]

message("Running functional analyses ...")

## calculate the gene length bias (using nullp, a function from goseq package)
pwf.de.up <- nullp(gene.vector.de.up, "mm9", "knownGene", plot.fit = F)
pwf.de.dn <- nullp(gene.vector.de.dn, "mm9", "knownGene", plot.fit = F)

# goseq on go
go.up <- goseq(pwf.de.up, gene2cat = table_go)
go.up.adj <- go.up[p.adjust(go.up$over_represented_pvalue, method = "BH") < 0.1, ]

go.dn <- goseq(pwf.de.dn, gene2cat = table_go)
go.dn.adj <- go.dn[p.adjust(go.dn$over_represented_pvalue, method = "BH") < 0.1, ]

# goseq on reactome
react.up <- goseq(pwf.de.up, gene2cat = table_reactome)
react.up <- merge(react.up, table_reactome_desc, by = "category")
react.up.adj <- react.up[p.adjust(react.up$over_represented_pvalue, method = "BH") < 0.1, ]

react.dn <- goseq(pwf.de.dn, gene2cat = table_reactome)
react.dn <- merge(react.dn, table_reactome_desc, by = "category")
react.dn.adj <- react.dn[p.adjust(react.dn$over_represented_pvalue, method = "BH") < 0.1, ]

message("Exporting data ...")
dir.create(export_dir)

# save results as tables
write.table(go.up.adj, file.path(export_dir, "go_up.txt"), col.names = TRUE, row.names = FALSE, sep = "\t", quote = FALSE)
write.table(go.dn.adj, file.path(export_dir, "go_down.txt"), col.names = TRUE, row.names = FALSE, sep = "\t", quote = FALSE)
write.table(react.up.adj, file.path(export_dir, "reactome_up.txt"), col.names = TRUE, row.names = FALSE, sep = "\t", quote = FALSE)
write.table(react.dn.adj, file.path(export_dir, "reactome_down.txt"), col.names = TRUE, row.names = FALSE, sep = "\t", quote = FALSE)

message("Completed.")
Loading

0 comments on commit c6afb58

Please sign in to comment.