diff --git a/DESCRIPTION b/DESCRIPTION index 4450754..585cfd2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,19 +1,17 @@ Type: Package Package: rats -Version: 0.5.0-0 -Date: 2017-06-01 -Title: Relative Abundance of Transcripts (BETA) +Version: 0.6.0-0 +Date: 2017-09-18 +Title: Relative Abundance of Transcripts Encoding: UTF-8 Authors: c(person("Kimon Froussios", role=c("aut"), email="k.froussios@dundee.ac.uk"), - person("Kira Mour??o", role=c("aut"), email="k.mourao@dundee.ac.uk"), + person("Kira Mourão", role=c("aut"), email="k.mourao@dundee.ac.uk"), person("Nick J. Schurch", role=c("cre"), email="n.schurch@dundee.ac.uk")) -Author: Kimon Froussios [aut], Kira Mour??o [aut], Nick Schurch [cre] +Author: Kimon Froussios [aut], Kira Mourão [aut], Nick Schurch [cre] Maintainer: Kimon Froussios Description: Given a set of transript abundances or a Sleuth output object, and a transcript-to-gene index of identifiers, this package identifies genes where differential transcript usage occurs. License: MIT + file LICENSE -URL: https://github.com/bartongroupb/RATS -BugReports: https://github.com/bartongroup/RATS/issues VignetteBuilder: knitr Depends: matrixStats, diff --git a/NAMESPACE b/NAMESPACE index 9c7dcb2..ea5be83 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,8 +2,6 @@ export(annot2ids) export(call_DTU) -export(combine_sim_dtu) -export(countrange_sim) export(denest_sleuth_boots) export(dtu_plurality_summary) export(dtu_summary) @@ -15,10 +13,10 @@ export(get_dtu_ids) export(get_plurality_ids) export(get_switch_ids) export(group_samples) +export(maxabs) export(plot_gene) export(plot_overview) export(plot_shiny_volcano) -export(plot_sim) export(sim_boot_data) export(sim_count_data) export(sim_sleuth_data) diff --git a/R/data_simulators.R b/R/data_simulators.R index b257d19..b3b5040 100644 --- a/R/data_simulators.R +++ b/R/data_simulators.R @@ -19,17 +19,18 @@ #' but this allows testing that the code can handle it. 
#' #' @import data.table -#' @export #' -sim_sleuth_data <- function(varname="condition", COUNTS_COL="est_counts", TARGET_COL="target_id" , +#' @export +sim_sleuth_data <- function(varname="condition", COUNTS_COL="est_counts", TARGET_COL="target_id", PARENT_COL="parent_id", BS_TARGET_COL="target_id", cnames=c("A","B"), errannot_inconsistent=FALSE, cv_dt=FALSE) { # !!! Some of the tests of the package are tightly connected to the specifics of the object returned by this function. # !!! Entry additions might be tolerated. Changes or removals of entries or structure will certainly cause failures. - tx <- data.frame("target_id"= c("NIB.1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.1", "1B1C.2", "CC_a", "CC_b", "1NN", "2NN", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "LC1", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "parent_id"= c("NIB", "1A1N", "1D1C", "1D1C", "1B1C", "1B1C", "CC", "CC", "NN", "NN", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "LC", "LC", "ALLA", "ALLB", "ALLB")) + tx <- data.frame(target_id= c("NIB.1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.1", "1B1C.2", "CC_a", "CC_b", "1NN", "2NN", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "LC1", "LC2", "ALLA1", "ALLB1", "ALLB2"), + parent_id= c("NIB", "1A1N", "1D1C", "1D1C", "1B1C", "1B1C", "CC", "CC", "NN", "NN", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "LC", "LC", "ALLA", "ALLB", "ALLB"), + stringsAsFactors=FALSE) names(tx) <- c(TARGET_COL, PARENT_COL) sl <- list() @@ -47,43 +48,51 @@ sim_sleuth_data <- function(varname="condition", COUNTS_COL="est_counts", TARGET sl$kal[[1]] <- list() sl$kal[[1]]["bootstrap"] <- list() sl$kal[[1]]$bootstrap[[1]] <- data.frame("target"= c("LC1", "NIA1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(3, 333, 666, 10, 20, 0, 76, 52, 20, 50, 103, 321, 0, 100, 90, 0, 10, 30, 4, 
50, 0, 0), stringsAsFactors=FALSE) + "my_counts"= c(3, 333, 666, 10, 20, 0, 76, 52, 20, 50, 103, 321, 0, 100, 90, 0, 10, 30, 4, 50, 0, 0), + stringsAsFactors=FALSE) sl$kal[[1]]$bootstrap[[2]] <- data.frame("target"= c("LC1", "NIA1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(2, 310, 680, 11, 21, 0, 80, 55, 22, 52, 165, 320, 0, 130, 80, 0, 11, 29, 5, 40, 0, 0), stringsAsFactors=FALSE) + "my_counts"= c(2, 310, 680, 11, 21, 0, 80, 55, 22, 52, 165, 320, 0, 130, 80, 0, 11, 29, 5, 40, 0, 0), + stringsAsFactors=FALSE) sl$kal[[1]]$bootstrap[[3]] <- data.frame("target"= c("LC1", "NIA1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(0, 340, 610, 7, 18, 0, 72, 50, 21, 49, 150, 325, 0, 120, 70, 0, 9, 28, 4, 60, 0, 0), stringsAsFactors=FALSE) + "my_counts"= c(0, 340, 610, 7, 18, 0, 72, 50, 21, 49, 150, 325, 0, 120, 70, 0, 9, 28, 4, 60, 0, 0), + stringsAsFactors=FALSE) sl$kal[[2]] <- list() sl$kal[[2]]["bootstrap"] <- list() sl$kal[[2]]$bootstrap[[1]] <- data.frame("target"= c("NIA1", "LC1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(333, 6, 666, 12, 23, 0, 25, 150, 15, 80, 325, 105, 40, 0, 200, 0, 15, 45, 12, 0, 80, 200), stringsAsFactors=FALSE) + "my_counts"= c(333, 6, 666, 12, 23, 0, 25, 150, 15, 80, 325, 105, 40, 0, 200, 0, 15, 45, 12, 0, 80, 200), + stringsAsFactors=FALSE) sl$kal[[2]]$bootstrap[[2]] <- data.frame("target"= c("NIA1", "LC1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", 
"2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(323, 1, 606, 15, 22, 0, 23, 190, 20, 90, 270, 115, 30, 0, 150, 0, 20, 60, 15, 0, 120, 250), stringsAsFactors=FALSE) + "my_counts"= c(323, 1, 606, 15, 22, 0, 23, 190, 20, 90, 270, 115, 30, 0, 150, 0, 20, 60, 15, 0, 120, 250), + stringsAsFactors=FALSE) sl$kal[[3]] <- list() sl$kal[[3]]$bootstrap[[1]] <- data.frame("target"= c("NIA1", "1A1N-2", "LC1", "NIA2", "1D1C:one", "1D1C:two", "1B1C.2", "MIX6.c1", "1A1N-1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.d", "MIX6.nc", "CC_a", "CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(333, 20, 3, 666, 0, 76, 52, 100, 10, 360, 0, 100, 0, 180, 25, 60, 7, 27, 13, 35, 0, 0), stringsAsFactors=FALSE) + "my_counts"= c(333, 20, 3, 666, 0, 76, 52, 100, 10, 360, 0, 100, 0, 180, 25, 60, 7, 27, 13, 35, 0, 0), + stringsAsFactors=FALSE) sl$kal[[3]]$bootstrap[[2]] <- data.frame("target"= c("MIX6.c4", "NIA1", "LC1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1B1C.2", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.nc", "1D1C:two", "MIX6.d", "CC_b", "CC_a", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(80, 330, 4, 560, 11, 21, 0, 55, 90, 380, 0, 240, 80, 0, 55, 23, 10, 31, 2, 55, 0, 0), stringsAsFactors=FALSE) + "my_counts"= c(80, 330, 4, 560, 11, 21, 0, 55, 90, 380, 0, 240, 80, 0, 55, 23, 10, 31, 2, 55, 0, 0), + stringsAsFactors=FALSE) sl$kal[[4]] <- list() sl$kal[[4]]["bootstrap"] <- list() sl$kal[[4]]$bootstrap[[1]] <- data.frame("target"= c("NIA2", "1A1N-1", "NIA1", "1A1N-2", "1D1C:one", "1D1C:two", "LC1", "1B1C.2", "MIX6.c2", "MIX6.c3", "MIX6.c1", "MIX6.c4", "MIX6.nc", "MIX6.d", "CC_b", "CC_a", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(666, 12, 333, 23, 0, 25, 0, 150, 155, 40, 300, 0, 33, 0, 93, 22, 22, 61, 11, 0, 110, 210), stringsAsFactors=FALSE) + "my_counts"= c(666, 12, 333, 23, 0, 25, 0, 150, 155, 40, 300, 0, 33, 0, 93, 22, 22, 61, 11, 0, 110, 210), + stringsAsFactors=FALSE) sl$kal[[4]]$bootstrap[[2]] <- 
data.frame("target"= c("NIA1", "MIX6.d", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "LC1", "1D1C:two", "MIX6.c1", "1B1C.2", "MIX6.c3", "MIX6.c2", "MIX6.c4", "MIX6.nc", "CC_a", "CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(323, 0, 656, 15, 22, 0, 2, 23, 280, 160, 35, 120, 0, 95, 18, 119, 19, 58, 7, 0, 150, 220), stringsAsFactors=FALSE) + "my_counts"= c(323, 0, 656, 15, 22, 0, 2, 23, 280, 160, 35, 120, 0, 95, 18, 119, 19, 58, 7, 0, 150, 220), + stringsAsFactors=FALSE) sl$kal[[4]]$bootstrap[[3]] <- data.frame("target"= c("NIA1", "NIA2", "1A1N-1", "1A1N-2", "MIX6.nc", "1B1C.2", "LC1", "MIX6.c1", "MIX6.c2", "MIX6.c3", "1D1C:one", "1D1C:two", "MIX6.c4", "MIX6.d", "CC_a", "CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(343, 676, 13, 21, 55, 145, 3, 270, 133, 30, 0, 20, 0, 0, 23, 80, 17, 50, 14, 0, 130, 200), stringsAsFactors=FALSE) + "my_counts"= c(343, 676, 13, 21, 55, 145, 3, 270, 133, 30, 0, 20, 0, 0, 23, 80, 17, 50, 14, 0, 130, 200), + stringsAsFactors=FALSE) names(sl$kal[[1]]$bootstrap[[1]]) <- c(BS_TARGET_COL, COUNTS_COL) names(sl$kal[[1]]$bootstrap[[2]]) <- c(BS_TARGET_COL, COUNTS_COL) names(sl$kal[[1]]$bootstrap[[3]]) <- c(BS_TARGET_COL, COUNTS_COL) names(sl$kal[[2]]$bootstrap[[1]]) <- c(BS_TARGET_COL, COUNTS_COL) names(sl$kal[[2]]$bootstrap[[2]]) <- c(BS_TARGET_COL, COUNTS_COL) -# names(sl$kal[[2]]$bootstrap[[3]]) <- c(BS_TARGET_COL, COUNTS_COL) names(sl$kal[[3]]$bootstrap[[1]]) <- c(BS_TARGET_COL, COUNTS_COL) names(sl$kal[[3]]$bootstrap[[2]]) <- c(BS_TARGET_COL, COUNTS_COL) -# names(sl$kal[[3]]$bootstrap[[3]]) <- c(BS_TARGET_COL, COUNTS_COL) names(sl$kal[[4]]$bootstrap[[1]]) <- c(BS_TARGET_COL, COUNTS_COL) names(sl$kal[[4]]$bootstrap[[2]]) <- c(BS_TARGET_COL, COUNTS_COL) names(sl$kal[[4]]$bootstrap[[3]]) <- c(BS_TARGET_COL, COUNTS_COL) @@ -98,40 +107,50 @@ sim_sleuth_data <- function(varname="condition", COUNTS_COL="est_counts", TARGET 
#============================================================================== #' Generate an artificial dataset of bootstrapped abundance estimates, for code-testing or examples. #' -#' Based on sim_sleuth_data(). -#' +#' @param varname Name for the variables by which to compare. +#' @param PARENT_COL Name for annotation column containing the respective gene identification. +#' @param TARGET_COL Name for annotation column containing transcript identification. #' @param cnames A vector of (two) name values for the comparison variable. -#' @return A list with 3 elements. First a data.frame with the corresponding annotation. Then, 2 lists of data.tables. One list per condition. Each data table represents a sample and contains the estimates from the bootstrap iterations. +#' @param errannot_inconsistent Logical. Introduces an inconsistency in the transcript IDs, for testing of sanity checks. (FALSE) +#' @return A list with 3 elements. First a data.frame with the corresponding annotation. +#' Then, 2 lists of data.tables. One list per condition. Each data table represents a sample and contains the estimates from the bootstrap iterations. #' #' @export #' -sim_boot_data <- function(cnames=c("A", "B")) { - sim <- sim_sleuth_data(cnames=cnames) +sim_boot_data <- function(varname="condition", cnames=c("A", "B"), errannot_inconsistent=FALSE, PARENT_COL="parent_id", TARGET_COL="target_id") { + sim <- sim_sleuth_data(cnames=cnames, varname=varname, errannot_inconsistent=errannot_inconsistent, PARENT_COL=PARENT_COL, TARGET_COL=TARGET_COL) # Emulate non-sleuth bootstrap data. 
- data_A <- denest_sleuth_boots(sim$slo, sim$annot, c(1,3), "est_counts", "target_id") - data_B <- denest_sleuth_boots(sim$slo, sim$annot, c(2,4), "est_counts", "target_id") + data_A <- denest_sleuth_boots(sim$slo, sim$annot, c(1,3), COUNTS_COL="est_counts", BS_TARGET_COL="target_id", PARENT_COL=PARENT_COL, TARGET_COL=TARGET_COL) + data_B <- denest_sleuth_boots(sim$slo, sim$annot, c(2,4), COUNTS_COL="est_counts", BS_TARGET_COL="target_id", PARENT_COL=PARENT_COL, TARGET_COL=TARGET_COL) + + if (errannot_inconsistent) + data_A[[2]][14, 1 := "Unexpected-name"] + return(list('annot'= sim$annot, 'boots_A'= data_A, 'boots_B'= data_B)) } #============================================================================== -#' Generate an artificial dataset of bootstrapped abundance estimates, for code-testing or examples. -#' -#' Based on sim_sleuth_data(). +#' Generate an artificial dataset of abundance estimates, for code-testing or examples. #' +#' @param varname Name for the variables by which to compare. +#' @param PARENT_COL Name for annotation column containing the respective gene identification. +#' @param TARGET_COL Name for annotation column containing transcript identification. #' @param cnames A vector of (two) name values for the comparison variable. +#' @param errannot_inconsistent Logical. Introduces an inconsistency in the transcript IDs, for testing of sanity checks. (FALSE) #' @return A list with 3 elements. First, a data.frame with the corresponding annotation table. #' Then, 2 data.tables, one per condition. Each data table contains the abundance estimates for all the samples for the respective condition. 
#' #' @export #' -sim_count_data <- function(cnames=c("A","B")) { - sim <- sim_sleuth_data(cnames=cnames) +sim_count_data <- function(varname="condition", cnames=c("A", "B"), errannot_inconsistent=FALSE, PARENT_COL="parent_id", TARGET_COL="target_id") { + sim <- sim_sleuth_data(cnames=cnames, varname=varname, errannot_inconsistent=errannot_inconsistent, PARENT_COL=PARENT_COL, TARGET_COL=TARGET_COL) # Emulate non-sleuth bootstrap data. - data_A <- denest_sleuth_boots(sim$slo, sim$annot, c(1,3), "est_counts", "target_id") - data_B <- denest_sleuth_boots(sim$slo, sim$annot, c(2,4), "est_counts", "target_id") + data_A <- denest_sleuth_boots(sim$slo, sim$annot, c(1,3), COUNTS_COL="est_counts", BS_TARGET_COL="target_id", PARENT_COL=PARENT_COL, TARGET_COL=TARGET_COL) + data_B <- denest_sleuth_boots(sim$slo, sim$annot, c(2,4), COUNTS_COL="est_counts", BS_TARGET_COL="target_id", PARENT_COL=PARENT_COL, TARGET_COL=TARGET_COL) # Emulate non-bootstrap data. counts_A <- data_A[[1]] counts_B <- data_B[[1]] + return(list(sim$annot, 'counts_A'= counts_A, 'counts_B'= counts_B)) } @@ -151,7 +170,7 @@ sim_count_data <- function(cnames=c("A","B")) { #' \code{anno} a dataframe matching \code{target_id} to \code{parent_id}. #' \code{sim} a dataframe with the simulation parameters per transcript. #' -#' @export +# @export countrange_sim <- function(proportions= seq(0, 1, 0.01), magnitudes= c(1,5,10,30,100,500,1000)) { # Combine proportions and magnitudes. 
@@ -198,7 +217,7 @@ countrange_sim <- function(proportions= seq(0, 1, 0.01), #' @return dataframe #' #' @import data.table -##' @export +# @export combine_sim_dtu <- function(sim, dtuo) { with(dtuo, { results <- data.table(Genes[, list(parent_id, pval_AB, pval_BA, dtu_AB, dtu_BA, dtu, pval_prop_min, dtu_prop)]) @@ -219,7 +238,7 @@ combine_sim_dtu <- function(sim, dtuo) { #' #' @import ggplot2 #' @import data.table -##' @export +# @export plot_sim <- function(data, type = "AvBvM") { with(data, { if(type =="AvBvM"){ diff --git a/R/func.R b/R/func.R index b15c37b..671d748 100644 --- a/R/func.R +++ b/R/func.R @@ -1,18 +1,12 @@ #================================================================================ #' Check input parameters. #' -#' @param slo Sleuth object #' @param annot Annotation dataframe. -#' @param name_A Condition name. -#' @param name_B Condition name. -#' @param varname Name of condition variable. -#' @param COUNTS_COL Name of counts column in bootstrap. #' @param correction P-value correction method. #' @param p_thresh Significance level. #' @param TARGET_COL Name of transcript id column in annotation. #' @param PARENT_COL Name of gene id column in annotation. -#' @param BS_TARGET_COL Name of transcript id column in bootstrap. -#' @param count_thresh Minimum transcript abundance per sample. +#' @param abund_thresh Minimum transcript abundance per sample. #' @param testmode Which test to run. #' @param qboot Whether to bootstrap against quantifications. #' @param qbootnum Number of bootstrap iterations. @@ -24,7 +18,6 @@ #' @param boot_data_B A list of dataframes, one per sample, each with all the bootstrapped estimetes for the sample. #' @param rboot Whether to bootstrap against samples. #' @param rrep_thresh Confidence threshold. -#' @param rrep_as_crit Whether to use rrep as a DTU criterion. #' @param threads Number of threads. #' @param scaling Abundance scaling factor. 
#' @@ -38,20 +31,21 @@ #' #' @import data.table #' -parameters_are_good <- function(slo, annot, name_A, name_B, varname, COUNTS_COL, - correction, p_thresh, TARGET_COL, PARENT_COL, BS_TARGET_COL, - count_thresh, testmode, qboot, qbootnum, dprop_thresh, - count_data_A, count_data_B, boot_data_A, boot_data_B, qrep_thresh, - threads, rboot, rrep_thresh, rrep_as_crit, scaling) { +parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A, boot_data_B, + TARGET_COL, PARENT_COL, + correction, testmode, scaling, threads, + p_thresh, abund_thresh, dprop_thresh, + qboot, qbootnum, qrep_thresh, rboot, rrep_thresh) { warnmsg <- list() # Input format. if(any(is.null(annot), - all( any(is.null(slo), is.null(name_A), is.null(name_B), is.null(varname), is.null(BS_TARGET_COL), is.null(COUNTS_COL)), - any(is.null(count_data_A), is.null(count_data_B)), + all( any(is.null(count_data_A), is.null(count_data_B)), any(is.null(boot_data_A), is.null(boot_data_B)) ) )) return(list("error"=TRUE, "message"="Insufficient parameters!")) - + if(any(is.null(count_data_A) != is.null(count_data_B), is.null(boot_data_A) != is.null(boot_data_B))) + return(list("error"=TRUE, "message"="You must specify (the same type of) data for both conditions!")) + # Annotation. if (!is.data.frame(annot)) return(list("error"=TRUE, "message"="The provided annot is not a data.frame!")) @@ -69,7 +63,7 @@ parameters_are_good <- function(slo, annot, name_A, name_B, varname, COUNTS_COL, return(list("error"=TRUE, "message"="Invalid p-value threshold!")) if ((!is.numeric(dprop_thresh)) || dprop_thresh < 0 || dprop_thresh > 1) return(list("error"=TRUE, "message"="Invalid proportion difference threshold! Must be between 0 and 1.")) - if ((!is.numeric(count_thresh)) || count_thresh < 0) + if ((!is.numeric(abund_thresh)) || abund_thresh < 0) return(list("error"=TRUE, "message"="Invalid abundance threshold! 
Must be between 0 and 1.")) if ((!is.numeric(qbootnum)) || qbootnum < 0) return(list("error"=TRUE, "message"="Invalid number of bootstraps! Must be a positive integer number.")) @@ -85,39 +79,19 @@ parameters_are_good <- function(slo, annot, name_A, name_B, varname, COUNTS_COL, return(list("error"=TRUE, "message"="Invalid number of threads! Must be positive integer.")) if (threads > parallel::detectCores(logical= TRUE)) return(list("error"=TRUE, "message"="Number of threads exceeds system's reported capacity.")) - if (!is.logical(rrep_as_crit)) - return(list("error"=TRUE, "message"="Unrecognized value for rrep_as_crit! Must be TRUE/FALSE.")) if ((!is.numeric(scaling)) || scaling == 0) return(list("error"=TRUE, "message"="Invalid scaling factor! Must be non-zero number.")) - # Sleuth - if (!is.null(slo)) { - if (any(! c("kal","sample_to_covariates") %in% names(slo), ! "bootstrap" %in% names(slo$kal[[1]]) )) - return(list("error"=TRUE, "message"="The specified sleuth object is not valid!")) - if (!COUNTS_COL %in% names(slo$kal[[1]]$bootstrap[[1]])) - return(list("error"=TRUE, "message"="The specified counts field name does not exist!")) - if (!varname %in% names(slo$sample_to_covariates)) - return(list("error"=TRUE, "message"="The specified covariate name does not exist!")) - if (any(!c(name_A, name_B) %in% slo$sample_to_covariates[[varname]] )) - return(list("error"=TRUE, "message"="One or both of the specified conditions do not exist!")) - if (!BS_TARGET_COL %in% names(slo$kal[[1]]$bootstrap[[1]])) - return(list("error"=TRUE, "message"="The specified target IDs field name does not exist in the bootstraps!")) - } - # Booted counts. - if (!is.null(boot_data_A)) { # If slo available ignore boot_data. 
- if(is.null(slo)) { + if (!is.null(boot_data_A)) { if (any(!is.list(boot_data_A), !is.list(boot_data_A), !is.data.table(boot_data_A[[1]]), !is.data.table(boot_data_B[[1]]) )) return(list("error"=TRUE, "message"="The bootstrap data are not lists of data.tables!")) - } else { - warnmsg["slo&boots"] <- "Received multiple input formats! Only the sleuth object will be used." - } } # Counts. - if (!is.null(count_data_A)) { # If slo or boot_data available, ignore count_data. - if(is.null(slo) && any(is.null(boot_data_A), is.null(boot_data_B))) { - if (any(!is.data.table(count_data_A), !is.data.table(count_data_B))) + if (!is.null(count_data_A)) { # If boot_data available, ignore count_data. + if(is.null(boot_data_A)) { + if (!is.data.table(count_data_A) | !is.data.table(count_data_B)) return(list("error"=TRUE, "message"="The counts data are not data.tables!")) if (!all( count_data_A[[1]][order(count_data_A[[1]])] == count_data_B[[1]][order(count_data_B[[1]])] )) return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs between conditions!")) @@ -130,16 +104,13 @@ parameters_are_good <- function(slo, annot, name_A, name_B, varname, COUNTS_COL, minboots <- NA_integer_ samples_by_condition <- NULL numsamples <- NA_integer_ - if (!is.null(slo)) { - samples_by_condition <- group_samples(slo$sample_to_covariates)[[varname]] - numsamples <- length(samples_by_condition[[1]]) + length(samples_by_condition[[2]]) - } else if (!is.null(boot_data_A) && !is.null(boot_data_B)) { + if (!is.null(boot_data_A)) { numsamples <- length(boot_data_A) + length(boot_data_B) } maxmatrix <- 2^31 - 1 if (qboot) { # Consistency, - if (!is.null(boot_data_A) && !is.null(boot_data_B)) { + if (!is.null(boot_data_A)) { # Direct data. 
tx <- boot_data_A[[1]][[1]][order(boot_data_A[[1]][[1]])] for (k in 2:length(boot_data_A)){ @@ -150,34 +121,27 @@ parameters_are_good <- function(slo, annot, name_A, name_B, varname, COUNTS_COL, if (!all( tx == boot_data_B[[k]][[1]][order(boot_data_B[[k]][[1]])] )) return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs across samples!")) } - } else if (!is.null(slo)) { - # Sleuth. - tx <- slo$kal[[1]]$bootstrap[[1]][[BS_TARGET_COL]][ order(slo$kal[[1]]$bootstrap[[1]][[BS_TARGET_COL]]) ] - for (k in 2:length(slo$kal)) { - if (!all( tx == slo$kal[[k]]$bootstrap[[1]][[BS_TARGET_COL]][ order(slo$kal[[k]]$bootstrap[[1]][[BS_TARGET_COL]]) ] )) - return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs across samples!")) - } + + # Number of iterations. + minboots <- infer_bootnum(boot_data_A, boot_data_B) + if (!is.na(minboots)) { + if (minboots <= 1) + return(list("error"=TRUE, "message"="It appears some of your samples have no bootstraps!")) + if (minboots < 100) { + warnmsg["toofewboots"] <- "Your quantifications have few bootstrap iterations, which reduces reproducibility of the calls." + } + if (qbootnum < 100 && qbootnum != 0) { + warnmsg["toolowbootnum"] <- "The requested qbootnum is low, which reduces reproducibility of the calls." + } + bootcombos <- minboots^numsamples # Conservative estimate. + if (qbootnum >= bootcombos/100) + warnmsg["toomanyboots"] <- "The requested number of quantification bootstraps is very high, relatively to the supplied data. Over 1% chance of duplicate iterations." + if (qbootnum >= maxmatrix/dim(annot)[1]) + return(list("error"=TRUE,"message"="The requested number of quantification bootstraps would exceed the maximum capacity of an R matrix.")) + } # else it is probably count data and qboot will be auto-set to FALSE } else { warnmsg["noboots"] <- "qboot is TRUE but no bootstrapped estimates were provided! Continuing without bootstrapping." } - - # Number of iterations. 
- minboots <- infer_bootnum(slo, boot_data_A, boot_data_B) - if (!is.na(minboots)) { - if (minboots <= 1) - return(list("error"=TRUE, "message"="It appears some of your samples have no bootstraps!")) - if (minboots < 100) { - warnmsg["toofewboots"] <- "Your quantifications have few bootstrap iterations, which reduces reproducibility of the calls." - } - if (qbootnum < 100 && qbootnum != 0) { - warnmsg["toolowbootnum"] <- "The requested qbootnum is low, which reduces reproducibility of the calls." - } - bootcombos <- minboots^numsamples # Conservative estimate. - if (qbootnum >= bootcombos/100) - warnmsg["toomanyboots"] <- "The requested number of quantification bootstraps is very high, relatively to the supplied data. Over 1% chance of duplicate iterations." - if (qbootnum >= maxmatrix/dim(annot)[1]) - return(list("error"=TRUE,"message"="The requested number of quantification bootstraps would exceed the maximum capacity of an R matrix.")) - } # else it is probably count data and qboot will be auto-set to FALSE } if (rboot & any( length(samples_by_condition[[1]]) * length(samples_by_condition[[2]]) > maxmatrix/dim(annot)[1], length(boot_data_A) * length(boot_data_B) > maxmatrix/dim(annot)[1] ) ) @@ -190,28 +154,18 @@ parameters_are_good <- function(slo, annot, name_A, name_B, varname, COUNTS_COL, #================================================================================ #' Rule-of-thumb number of iterations. #' -#' @param slo Sleuth object. #' @param boot_data_A List of tables of bootstrapped counts. #' @param boot_data_B List of tables of bootstrapped counts. #' @return The least number of iterations seen in the input. #' -infer_bootnum <- function(slo, boot_data_A, boot_data_B){ +infer_bootnum <- function(boot_data_A, boot_data_B){ minboot <- NA_integer_ - # Bootstrapped counts. 
- if (!is.null(boot_data_A) && !is.null(boot_data_B)) { - minboot <- length(boot_data_A[[1]]) - 1 # ID column - for (k in 2:length(boot_data_A)){ - minboot <- min(minboot, length(boot_data_A[[k]]) - 1) - } - for (k in 1:length(boot_data_B)){ - minboot <- min(minboot, length(boot_data_B[[k]]) - 1) - } - # Sleuth. - } else if (!is.null(slo)) { - minboot <- length(slo$kal[[1]]$bootstrap) - for (k in 2:length(slo$kal)) { - minboot <- min(minboot, length(slo$kal[[k]]$bootstrap)) - } + minboot <- length(boot_data_A[[1]]) - 1 # ID column + for (k in 2:length(boot_data_A)){ + minboot <- min(minboot, length(boot_data_A[[k]]) - 1) + } + for (k in 1:length(boot_data_B)){ + minboot <- min(minboot, length(boot_data_B[[k]]) - 1) } return(minboot) } @@ -220,8 +174,10 @@ infer_bootnum <- function(slo, boot_data_A, boot_data_B){ #================================================================================ #' Group samples by covariate value. #' -#' Sleuth records covariate values per sample. However, RATs needs the reverse: the -#' samples that correspond to a given covariate value. +#' *Legacy function* No longer in use, but kept for possible general utility. +#' +#' Converts a table where each row is a sample and each column a variable into a +#' lookup structure that matches each value of each variable to a vector of corresponding samples. #' #' @param covariates A dataframe with different factor variables. Like the \code{sample_to_covariates} table of a \code{sleuth} object. Each row is a sample, each column is a covariate, each cell is a covariate value for the sample. #' @return list of lists (per covariate) of vectors (per factor level). @@ -243,7 +199,12 @@ group_samples <- function(covariates) { #================================================================================ #' Extract bootstrap counts into a less nested structure. -#' +#' +#' *Legacy function* +#' +#' It extracts the bootstrap data from the older-style \code{sleuth} object. 
+#' As of sleuth version 0.29, the bootstrap data is no longer kept in the object. +#' #' @param slo A sleuth object. #' @param annot A data.frame matching transcript identfier to gene identifiers. #' @param samples A numeric vector of samples to extract counts for. @@ -266,10 +227,8 @@ group_samples <- function(covariates) { denest_sleuth_boots <- function(slo, annot, samples, COUNTS_COL="tpm", BS_TARGET_COL="target_id", TARGET_COL="target_id", PARENT_COL="parent_id", threads= 1) { # Ensure data.table complies. - # if (packageVersion("data.table") >= "1.9.8") - setDTthreads(threads) + setDTthreads(threads) - # Could just always use tidy_annot(), but that duplicates the annotation without reason. # Compared to the size of other structures involved in calling DTU, this should make negligible difference. tx <- tidy_annot(annot, TARGET_COL, PARENT_COL)$target_id @@ -312,7 +271,7 @@ alloc_out <- function(annot, full){ "num_genes"=NA_integer_, "num_transc"=NA_integer_, "tests"=NA_character_, "p_thresh"=NA_real_, "abund_thresh"=NA_real_, "dprop_thresh"=NA_real_, "abund_scaling"=NA_real_, "quant_reprod_thresh"=NA_real_, "quant_boot"=NA, "quant_bootnum"=NA_integer_, - "rep_reprod_thresh"=NA_real_, "rep_boot"=NA, "rep_bootnum"=NA_integer_, "rep_reprod_as_crit"=NA) + "rep_reprod_thresh"=NA_real_, "rep_boot"=NA, "rep_bootnum"=NA_integer_) Genes <- data.table("parent_id"=as.vector(unique(annot$parent_id)), "elig"=NA, "sig"=NA, "elig_fx"=NA, "quant_reprod"=NA, "rep_reprod"=NA, "DTU"=NA, "transc_DTU"=NA, "known_transc"=NA_integer_, "detect_transc"=NA_integer_, "elig_transc"=NA_integer_, "maxDprop"=NA_real_, @@ -376,8 +335,7 @@ alloc_out <- function(annot, full){ #' calculate_DTU <- function(counts_A, counts_B, tx_filter, test_transc, test_genes, full, count_thresh, p_thresh, dprop_thresh, correction, threads= 1) { # Ensure data.table complies. 
- # if (packageVersion("data.table") >= "1.9.8") - setDTthreads(threads) + setDTthreads(threads) #---------- PRE-ALLOCATE @@ -457,13 +415,17 @@ calculate_DTU <- function(counts_A, counts_B, tx_filter, test_transc, test_genes #================================================================================ #' Log-likelihood test of goodness of fit. #' +#' For a set of observations against a set of probabilities. +#' +#' General utility function. +#' #' @param obsx a numeric vector of finite positive counts, with at least one non-zero value. #' @param px a vector of probabilities of the same length as xobs. ( sum(px) <= 1 ) #' #' The order of values in the two vectors should be the same. #' If any value of xp is zero and the corresponding xobs is not, g.test.1 will always reject the hypothesis. #' No corrections are applied. -#' No input checks are applied, as RATs needs to run this millions of times. +#' No input checks are applied. #' #' @import stats #' @export @@ -481,6 +443,8 @@ g.test.1 <- function(obsx, px) { #' Log-likelihood test of independence. #' #' For two sets of observations. +#' +#' General utility function. #' #' @param obsx a numeric vector of positive counts, with at least one non-zero value. #' @param obsy a numeric vector of positive counts of same length as obsx, with at least one non-zero value. @@ -542,12 +506,16 @@ tidy_annot <- function(annot, TARGET_COL="target_id", PARENT_COL="parent_id") { #================================================================================ #' Get largest value by absolute comparison. #' -#' Get back the original signed value. In case of equal absolutes, the positive -#' value will be returned. +#' General utility function. +#' +#' Returns the original signed value with the largest absolute value in the vector. +#' In case of equal absolutes between a positive and negative value, the positive +#' value will always be the one that is returned. #' #' @param v A numeric vector. #' @return A numeric value. 
-#' +#' +#' @export maxabs <- function(v) { if (all(is.na(v))) return(NA_real_) diff --git a/R/rats.R b/R/rats.R index e5a94b7..1a9002f 100644 --- a/R/rats.R +++ b/R/rats.R @@ -1,38 +1,34 @@ #================================================================================ #' Calculate differential transcript usage. #' -#' There are three options for input: +#' There are two modes for input: #' \itemize{ -#' \item{A sleuth object. This requires the following parameters: \code{slo}, \code{name_A}, \code{name_B} and optionally \code{varname}, \code{COUNT_COL}, \code{BS_TARGET_COL}.} -#' \item{Bootstrapped count estimates. This requires the following parameters: \code{boot_data_A} and \code{boot_data_B}. \code{name_A} and \code{name_B} can optionally be used to name the conditions.} -#' \item{Count estimates. This requires the following parameters: \code{count_data_A} and \code{count_data_B}. \code{name_A} and \code{name_B} can optionally be used to name the conditions.} +# \item{A sleuth object. This requires the following parameters: \code{slo}, \code{name_A}, \code{name_B} and optionally \code{varname}, \code{COUNT_COL}, \code{BS_TARGET_COL}.} +#' \item{Bootstrapped count estimates. This requires the following parameters: \code{boot_data_A} and \code{boot_data_B}.} +#' \item{Count estimates. This requires the following parameters: \code{count_data_A} and \code{count_data_B}.} #' } #' #' @param annot A data.table matching transcript identifiers to gene identifiers. Any additional columns are allowed but ignored. -#' @param TARGET_COL The name of the column for the transcript identifiers in \code{annot}. (Default \code{"target_id"}) -#' @param PARENT_COL The name of the column for the gene identifiers in \code{annot}. (Default \code{"parent_id"}) -#' @param slo A Sleuth object. -#' @param name_A The name for one condition, as it appears in the \code{sample_to_covariates} table within the Sleuth object. 
(Default "Condition-A") -#' @param name_B The name for the other condition, as it appears in the \code{sample_to_covariates} table within the sleuth object. (Default "Condition-B") -#' @param varname The name of the covariate to which the two conditions belong, as it appears in the \code{sample_to_covariates} table within the sleuth object. (Default \code{"condition"}). -#' @param COUNTS_COL For Sleuth objects only. The name of the counts column to use for the DTU calculation (est_counts or tpm). (Default \code{"est_counts"}) -#' @param BS_TARGET_COL For Sleuth objects only. The name of the transcript identifiers column in the bootstrap tables. (Default \code{"target_id"}) #' @param count_data_A A data.table of estimated counts for condition A. One column per sample/replicate, one row per transcript. The first column should contain the transcript identifiers. #' @param count_data_B A data.table of estimated counts for condition B. One column per sample/replicate, one row per transcript. The first column should contain the transcript identifiers. #' @param boot_data_A A list of data.tables, one per sample/replicate of condition A. One bootstrap iteration's estimates per column, one transcript per row. The first column should contain the transcript identifiers. #' @param boot_data_B A list of data.tables, one per sample/replicate of condition B. One bootstrap iteration's estimates per column, one transcript per row. The first column should contain the transcript identifiers. +#' @param TARGET_COL The name of the column for the transcript identifiers in \code{annot}. (Default \code{"target_id"}) +#' @param PARENT_COL The name of the column for the gene identifiers in \code{annot}. (Default \code{"parent_id"}) +#' @param name_A The name for one condition. (Default "Condition-A") +#' @param name_B The name for the other condition. (Default "Condition-B") +#' @param varname The name of the covariate to which the two conditions belong. (Default \code{"condition"}). 
#' @param p_thresh The p-value threshold. (Default 0.05) #' @param abund_thresh Noise threshold. Minimum mean abundance for transcripts to be eligible for testing. (Default 5) #' @param dprop_thresh Effect size threshold. Minimum change in proportion of a transcript for it to be considered meaningful. (Default 0.20) -#' @param correction The p-value correction to apply, as defined in \code{stats::p.adjust.methods}. (Default \code{"BH"}) -#' @param scaling A scaling factor to be applied to the abundances, *prior* to any thresholding and testing. Useful for scaling TPM abundances to the actual number of transcripts in the sequencing sample, if you use TPMs and if you can infer that information from your sample preparation. Improper use of the scaling factor will artificially inflate/deflate the significances obtained. (Default 1) +#' @param correction The p-value correction to apply, as defined in \code{\link[stats]{p.adjust.methods}}. (Default \code{"BH"}) +#' @param scaling A scaling factor to be applied to the abundances, *prior* to any thresholding and testing. Useful for scaling TPM (transcripts per 1 million reads) abundances to the actual library size. WARNING: Improper use of the scaling factor will artificially inflate/deflate the significances obtained. (Default 1) #' @param testmode One of \itemize{\item{"genes"}, \item{"transc"}, \item{"both" (default)}}. #' @param qboot Bootstrap the DTU robustness against bootstrapped quantifications data. (Default \code{TRUE}) Ignored if input is \code{count_data}. #' @param qbootnum Number of iterations for \code{qboot}. (Default 0) If 0, RATs will try to infer a value from the data. #' @param qrep_thresh Reproducibility threshold for quantification bootsrapping. (Default 0.95) -#' @param rboot Bootstrap the DTU robustness against the replicates. Does ALL 1 vs 1 combinations. (Default \code{TRUE}) +#' @param rboot Bootstrap the DTU robustness against the replicates. Does *ALL* 1 vs 1 combinations. 
(Default \code{TRUE}) #' @param rrep_thresh Reproducibility threshold for replicate bootsrapping. (Default 0.85) With few replicates per condition, the reproducibility takes heavily quantized values. For 3x3, there are 9 possible 1v1 comparisons, and a consistency of 8/9 = 0.88. -#' @param rrep_as_crit Whether DTU calls should include replicate reproducibility as a criterion. (Default TRUE) #' @param description Free-text description of the run. You can use this to add metadata to the results object. #' @param verbose Display progress updates and warnings. (Default \code{TRUE}) #' @param threads Number of threads to use. (Default 1) Multi-threading will be ignored on non-POSIX systems. @@ -45,10 +41,10 @@ #' @import matrixStats #' @export call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_id", - slo= NULL, name_A= "Condition-A", name_B= "Condition-B", varname= "condition", COUNTS_COL= "est_counts", BS_TARGET_COL= "target_id", count_data_A = NULL, count_data_B = NULL, boot_data_A = NULL, boot_data_B = NULL, + name_A= "Condition-A", name_B= "Condition-B", varname= "condition", p_thresh= 0.05, abund_thresh= 5, dprop_thresh= 0.2, correction= "BH", scaling= 1, - testmode= "both", qboot= TRUE, qbootnum= 0L, qrep_thresh= 0.95, rboot=TRUE, rrep_thresh= 0.85, rrep_as_crit= TRUE, + testmode= "both", qboot= TRUE, qbootnum= 0L, qrep_thresh= 0.95, rboot=TRUE, rrep_thresh= 0.85, description= NA_character_, verbose= TRUE, threads= 1L, dbg= 0) { #---------- PREP @@ -59,10 +55,11 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i message("Checking parameters...") } # Input checks. 
- paramcheck <- parameters_are_good(slo, annot, name_A, name_B, varname, COUNTS_COL, - correction, p_thresh, TARGET_COL, PARENT_COL, BS_TARGET_COL, abund_thresh, testmode, - qboot, qbootnum, dprop_thresh, count_data_A, count_data_B, boot_data_A, boot_data_B, - qrep_thresh, threads, rboot, rrep_thresh, rrep_as_crit, scaling) + paramcheck <- parameters_are_good(annot, count_data_A, count_data_B, boot_data_A, boot_data_B, + TARGET_COL, PARENT_COL, + correction, testmode, scaling, threads, + p_thresh, abund_thresh, dprop_thresh, + qboot, qbootnum, qrep_thresh, rboot, rrep_thresh) if (paramcheck$error) stop(paramcheck$message) if (verbose) @@ -74,8 +71,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i threads <- as.integer(threads) # Can't be decimal. # Ensure data.table complies. - # if (packageVersion("data.table") >= "1.9.8") - setDTthreads(threads) + setDTthreads(threads) if (qbootnum == 0 && qboot) # Use smart default. qbootnum = paramcheck$maxboots @@ -84,14 +80,12 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i test_genes <- any(testmode == c("genes", "both")) # Determine data extraction steps. - steps <- 1 # Assume estimated counts. Simplest case. Bypasses both extraction and averaging. + steps <- 1 # Assume estimated counts. Simplest case. if (!is.null(boot_data_A)) { - steps <- 2 # Bootstrapped estimates. Bypasses extraction, only needs averaging. - } else if (!is.null(slo)) { - steps <- 3 # Sleuth object. Most steps. Requires both extraction and averaging. + steps <- 2 # Bootstrapped estimates. } - if (steps == 1 || is.na(qbootnum)) - qboot <- FALSE # No quantification bootstraps data was provided. + if (steps == 1 || is.na(qbootnum) || qbootnum==0) + qboot <- FALSE # No quantification bootstraps data was provided or no iterations required. 
if (dbg == "prep") return(steps) @@ -115,14 +109,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i if (verbose) message("Preparing data...") - if (steps == 3) { # From Sleuth - # Reverse look-up from replicates to covariates. - samples_by_condition <- group_samples(slo$sample_to_covariates)[[varname]] - # Re-order rows and collate booted counts in a dataframe per sample. Put dataframes in a list per condition. - # Target_id is included but NOT used as key so as to ensure that the order keeps matching tx_filter. - boot_data_A <- denest_sleuth_boots(slo, tx_filter, samples_by_condition[[name_A]], COUNTS_COL, BS_TARGET_COL, "target_id", "parent_id", threads ) - boot_data_B <- denest_sleuth_boots(slo, tx_filter, samples_by_condition[[name_B]], COUNTS_COL, BS_TARGET_COL, "target_id", "parent_id", threads ) - } else if (steps == 2) { # From generic bootstrapped data + if (steps == 2) { # From generic bootstrapped data # Just re-order rows. boot_data_A <- lapply(boot_data_A, function(x) { x[match(tx_filter$target_id, x[[1]]), ] }) boot_data_B <- lapply(boot_data_B, function(x) { x[match(tx_filter$target_id, x[[1]]), ] }) @@ -131,7 +118,8 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i if (dbg == "bootin") return(list("bootA"=count_data_A, "bootB"=count_data_B)) - if (steps > 1) { # Either Sleuth or generic bootstraps. + # From generic bootstrapped data. + if (steps > 1) { # Remove ID columns so I don't have to always subset for math operations. for (smpl in c(boot_data_A, boot_data_B)) { smpl[, 1 := NULL] @@ -150,7 +138,8 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i mc.cores = threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE)) count_data_B <- as.data.table(mclapply(boot_data_B, function(b) { return(rowMeans(b)) }, mc.cores = threads, mc.preschedule = TRUE, mc.allow.recursive = FALSE)) - } else { # From generic unbootstrapped data. 
+ # From generic unbootstrapped data. + } else { # Just re-order rows and crop out the transcript IDs. nn <- names(count_data_A) count_data_A <- count_data_A[match(tx_filter$target_id, count_data_A[[1]]), nn[seq.int(2, length(nn))], with=FALSE] @@ -209,9 +198,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i resobj$Parameters["tests"] <- testmode resobj$Parameters["rep_boot"] <- rboot resobj$Parameters["quant_boot"] <- qboot - if (steps==3) { - resobj$Parameters["data_type"] <- "sleuth" - } else if (steps==2) { + if (steps==2) { resobj$Parameters["data_type"] <- "bootstrapped abundance estimates" } else if (steps==1) { resobj$Parameters["data_type"] <- "plain abundance estimates" @@ -235,8 +222,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i numpairs <- length(pairs) resobj$Parameters["rep_bootnum"] <- numpairs resobj$Parameters["rep_reprod_thresh"] <- rrep_thresh - resobj$Parameters["rep_reprod_as_crit"] <- rrep_as_crit - + if (verbose) myprogress <- utils::txtProgressBar(min = 0, max = numpairs, initial = 0, char = "=", width = NA, style = 3, file = "") @@ -399,7 +385,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i Transcripts[(elig), DTU := (DTU & quant_reprod)] Genes[(elig), DTU := (DTU & quant_reprod)] } - if (rrep_as_crit & rboot) { + if (rboot) { Transcripts[(elig), DTU := (DTU & rep_reprod)] Genes[(elig), DTU := (DTU & rep_reprod)] } @@ -439,11 +425,10 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i if(verbose) { message("All done!") - print("") - print("Summary of DTU results:") + print(noquote("Summary of DTU results:")) dtusum <- dtu_summary(resobj) print(dtusum) - print("Isoform-switching subset of DTU:") + print(noquote("Isoform-switching subset of DTU:")) switchsum <- dtu_switch_summary(resobj) print(switchsum) } diff --git a/R/results.R b/R/results.R index fb9c71c..f4dbbff 100644 --- a/R/results.R +++ 
b/R/results.R @@ -1,16 +1,3 @@ -#================================================================================ -#' Summary of DTU calling. -#' -#' @param dtuo A DTU object. -#' @return A named numerical vector giving a tally of the results. -#' -#'@export -dtu_summary <- function(dtuo) { - tally <- sapply(get_dtu_ids(dtuo), length) - return( data.frame("category"=names(tally), "tally"=tally, row.names=NULL, stringsAsFactors=FALSE) ) -} - - #================================================================================ #' List of DTU ids. #' @@ -29,31 +16,46 @@ get_dtu_ids <- function(dtuo) { myt[, adp := abs(Dprop)] setorder(myt, -adp, na.last=TRUE) }) - + myp <- copy(dtuo$Genes[, c("DTU", "transc_DTU", "parent_id", "maxDprop")]) with(myp, { # Sort genes to match. myp[, adp := abs(maxDprop)] setorder(myp, -adp, na.last=TRUE) - + # Extract. return(list("DTU genes (gene test)" = as.vector( myp[(DTU), parent_id] ), - "non-DTU genes (gene test)" = as.vector( myp[DTU==FALSE, parent_id] ), - "NA genes (gene test)" = as.vector( myp[is.na(DTU), parent_id] ), - "DTU genes (transc. test)" = as.vector( myp[(transc_DTU), parent_id] ), - "non-DTU genes (transc. test)" = as.vector( myp[transc_DTU==FALSE, parent_id] ), - "NA genes (transc. test)" = as.vector( myp[is.na(transc_DTU), parent_id] ), - "DTU genes (both tests)" = as.vector( myp[(DTU & transc_DTU), parent_id] ), - "non-DTU genes (both tests)" = as.vector( myp[DTU==FALSE & transc_DTU==FALSE, parent_id] ), - "NA genes (both tests)" = as.vector( myp[is.na(DTU) & is.na(transc_DTU), parent_id] ), - "DTU transcripts" = as.vector(myt[(DTU), target_id]), - "non-DTU transcripts" = as.vector(myt[DTU==FALSE, target_id]), - "NA transcripts" = as.vector(myt[is.na(DTU), target_id]) + "non-DTU genes (gene test)" = as.vector( myp[DTU==FALSE, parent_id] ), + "NA genes (gene test)" = as.vector( myp[is.na(DTU), parent_id] ), + "DTU genes (transc. test)" = as.vector( myp[(transc_DTU), parent_id] ), + "non-DTU genes (transc. 
test)" = as.vector( myp[transc_DTU==FALSE, parent_id] ), + "NA genes (transc. test)" = as.vector( myp[is.na(transc_DTU), parent_id] ), + "DTU genes (both tests)" = as.vector( myp[(DTU & transc_DTU), parent_id] ), + "non-DTU genes (both tests)" = as.vector( myp[DTU==FALSE & transc_DTU==FALSE, parent_id] ), + "NA genes (both tests)" = as.vector( myp[is.na(DTU) & is.na(transc_DTU), parent_id] ), + "DTU transcripts" = as.vector(myt[(DTU), target_id]), + "non-DTU transcripts" = as.vector(myt[DTU==FALSE, target_id]), + "NA transcripts" = as.vector(myt[is.na(DTU), target_id]) )) }) } +#================================================================================ +#' Summary of DTU calling. +#' +#' A tally of DTU genes and transcripts. +#' +#' @param dtuo A DTU object. +#' @return A data.frame with category names in the first column and values in the second column. +#' +#'@export +dtu_summary <- function(dtuo) { + tally <- sapply(get_dtu_ids(dtuo), length) + return( data.frame("category"=names(tally), "tally"=tally, row.names=NULL, stringsAsFactors=FALSE) ) +} + + #================================================================================ #' List of genes that switch isoform ranks. #' @@ -70,8 +72,8 @@ get_switch_ids <- function(dtuo) { myt <- copy(dtuo$Transcripts[dtuo$Genes[(dtuo$Genes$DTU), c("parent_id")], c("parent_id", "target_id", "propA", "propB")]) result <- list() if (nrow(myt)==0) { - result[["Primary switch (gene test)"]] <- NA_character_ - result[["Non-primary switch (gene test)"]] <- NA_character_ + result[["Primary switch (gene test)"]] <- character(0) + result[["Non-primary switch (gene test)"]] <- character(0) } else { with(myt, { myt[, rankA := frank(propA), by=parent_id] @@ -88,8 +90,8 @@ get_switch_ids <- function(dtuo) { # Get all the transcripts from genes with at least one DTU transcript. 
myt <- copy(dtuo$Transcripts[dtuo$Genes[(dtuo$Genes$transc_DTU), c("parent_id")], c("parent_id", "target_id", "propA", "propB")]) if (nrow(myt)==0) { - result[["Primary switch (transc. test)"]] <- NA_character_ - result[["Non-primary switch (transc. test)"]] <- NA_character_ + result[["Primary switch (transc. test)"]] <- character(0) + result[["Non-primary switch (transc. test)"]] <- character(0) } else { with(myt, { myt[, rankA := frank(propA), by=parent_id] @@ -106,8 +108,8 @@ get_switch_ids <- function(dtuo) { # Get all the transcripts from DTU genes with at least one DTU isoform. myt <- copy(dtuo$Transcripts[dtuo$Genes[(dtuo$Genes$DTU & dtuo$Genes$transc_DTU), c("parent_id")], c("parent_id", "target_id", "propA", "propB")]) if (nrow(myt)==0) { - result[["Primary switch (both tests)"]] <- NA_character_ - result[["Non-primary switch (both tests)"]] <- NA_character_ + result[["Primary switch (both tests)"]] <- character(0) + result[["Non-primary switch (both tests)"]] <- character(0) } else { with(myt, { myt[, rankA := frank(propA), by=parent_id] @@ -128,8 +130,10 @@ get_switch_ids <- function(dtuo) { #================================================================================ #' Summary of isoform switching events. #' +#' A tally of genes showing isoform switching. +#' #' @param dtuo A DTU object. -#' @return A named numerical vector giving a tally of the results. +#' @return A data.frame with category names in the first column and values in the second column. #' #'@export dtu_switch_summary <- function(dtuo) { @@ -139,10 +143,10 @@ dtu_switch_summary <- function(dtuo) { #================================================================================ -#' Summary of DTU plurality. +#' List the gene IDs by number of isoforms showing significant change. #' #' Get the IDs of DTU genes organised by the number of isoforms affected. This -#' is possible only based on the transcript-level results. +#' is possible only if transcript-level results are enabled. 
#' #' @param dtuo A DTU object. #' @return A named numerical vector giving a tally of the results. @@ -163,9 +167,11 @@ get_plurality_ids <- function(dtuo){ #================================================================================ #' Summary of DTU plurality. +#' +#' A tally of genes based on how many isoforms show significant change. #' #' @param dtuo A DTU object. -#' @return A named numerical vector giving a tally of the results. +#' @return A data.frame with category names in the first column and values in the second column. #' #'@export dtu_plurality_summary <- function(dtuo) { diff --git a/README.md b/README.md index c3c8bc1..b0e162f 100644 --- a/README.md +++ b/README.md @@ -16,28 +16,22 @@ Anyone working in transcriptomics, analysing gene expression and transcript abun This is called **Differential Transcript Usage (DTU)**. 2. **RATs is workflow-agnostic**. Quantification quality details are left to the quantification tools; RATs uses only the -transcript abundances. This makes it *suitable for use with alignment-free quantification tools* like [Kallisto](http://pachterlab.github.io/kallisto/) -or [Salmon](https://github.com/COMBINE-lab/salmon). It is also compatible with DTE output from [Sleuth](http://pachterlab.github.io/sleuth). +transcript abundances, which you can obtain using any tool you like. This makes it *suitable for use with alignment-free quantification tools* +like [Kallisto](http://pachterlab.github.io/kallisto/) or [Salmon](https://github.com/COMBINE-lab/salmon). 3. RATs is able to take advantage of the bootstrapped quantifications provided by the alignment-free tools. These bootstrapped -data are used by `RATs to assess how much the technical variability of the heuristic quantifications affects differential transcript usage +data are used by RATs to assess how much the technical variability of the heuristic quantifications affects differential transcript usage and thus provide a measure of confidence in the DTU calls. 
#### What it needs -1. This is an R source package, and will run on any platform with a reasonably up-to-date R environment. +1. This is an R source package, and will run on any platform with a reasonably up-to-date R environment. A few third-party R packages are also required (see below). -2. As input, RATs requires transcript abundance estimates with or without bootstrapping. For convenience, these can also be extracted directly -from the output of [Sleuth](http://pachterlab.github.io/sleuth/). +2. As input, RATs requires transcript abundance estimates with or without bootstrapping. The format either way is tables with the samples as columns and the transcripts as rows. An extra column holds the transcript IDs. Some functionality to create these from Salmon or Kallisto quantification files is provided by RATs. -3. RATs also requires a look-up table matching transcript identifiers to respective gene identifiers. This can be obtained through various means, -one of them being extracting this info from a GTF file. - -4. RATs makes use of the [data.table](https://cran.r-project.org/web/packages/data.table/index.html) and -[matrixStats](https://cran.r-project.org/web/packages/matrixStats/index.html) packages, as well as -[ggplot2](https://cran.r-project.org/web/packages/ggplot2/index.html) and [shiny](https://cran.r-project.org/web/packages/shiny/shiny.pdf) for visualisations. All these are -available from CRAN. +3. RATs also requires a look-up table matching the transcript identifiers to the respective gene identifiers. This can be obtained through various means, +one of them being extracting this info from a GTF file using functionality provided by RATs. 
*** @@ -66,17 +60,17 @@ install.packages(c("data.table", "matrixStats"), dependencies=TRUE) install.packages("ggplot2", dependencies=TRUE) ``` -* Packages needed only for importing abundances from Salmon/Kallisto output (optional) - +* Packages needed only for importing abundances from Salmon/Kallisto output (recommended) - ``` install.packages("devtools", dependencies=TRUE) source("http://bioconductor.org/biocLite.R") -# Wasabi converter from Salmon/Sailfish to Kallisto. +# Format converter from Salmon/Sailfish to Kallisto. biocLite("COMBINE-lab/wasabi") -# Kallisto parser. +# Compressed format parser. biocLite("rhdf5") ``` @@ -105,21 +99,24 @@ For testing purposes (bug resolutions, new features), you can install the ongoin `devtools::install_github("bartongroup/rats", ref="development")` -Developmental versions are works in progress and will not be archived in snapshots. They are also likely to contain new bugs and -may at times not work correctly or at all. For reproducible/publishable analyses, **always use a release version**, NOT a developmental version. +From `v0.6.0` onwards, release versions of RATs continue to have a 3-part release number, whereas developmental versions now have a 4-part version number. Despite having version numbers, developmental versions are not archived and are **not suitable** for reproducible/critical/publishable analyses. They may also temporarily not work correctly or at all. For critical analyses you should use the latest release version. Eventually, we aim to make the package also available through Bioconductor. 
+ ### Differential Transcript Usage -A typical command to call DTU given a Sleuth object looks like this: +A typical command to call DTU on bootstrapped quantifications looks like this: + +`results <- call_DTU(annot = my_identifiers_table, boot_data_A=my_list_of_tables_A, boot_data_B=my_list_of_tables_B)` -`results <- call_DTU(annot = my_identifiers_table, slo = my_sleuth_object, name_A = "Condition-1", name_B = "Condition-2")` +and for plain quantifications like this: -RATs also accepts data input in **generic formats**. Please consult the vignettes for syntax details, format specifications and additional settings. +`results <- call_DTU(annot = my_identifiers_table, count_data_A=my_table_A, count_data_B=my_table_B)` -The output is a list containing (among other items) two tables that list the final results as well as the intermediate calculations and decisions. -Details on the output structure and visualisation options are provided in the vignettes. +The output is a list containing (among other items) two tables, one with gene-level details and one with transcript-level details. In these you will +find the raw values for all calculations, the pass/fail for each threshold and the final classification for each gene or transcript respectively. +Details on the output structure and the result visualisation options are provided in the vignettes. *** @@ -128,7 +125,7 @@ Details on the output structure and visualisation options are provided in the vi The `rats` R package was developed within [The Barton Group](http://www.compbio.dundee.ac.uk) at [The University of Dundee](http://www.dundee.ac.uk) by Dr. Kimon Froussios, Dr. Kira Mourão and Dr. Nick Schurch. -To **report problems** or ask for **assistance**, please raise a new issue [on the project's support forum](https://github.com/bartongroup/Rats/issues). +To report **problems** or ask for **assistance**, please raise a new issue [on the project's support forum](https://github.com/bartongroup/Rats/issues). 
Providing a *reproducible working example* that demonstrates your issue is strongly encouraged. Also, be sure to **read the vignette(s)**, and browse/search the support forum before posting a new issue, in case your question is already answered there. diff --git a/inst/doc/input.R b/inst/doc/input.R index f1afd42..cb21491 100644 --- a/inst/doc/input.R +++ b/inst/doc/input.R @@ -8,7 +8,7 @@ library(rats) # library(rats) ## ---- echo=FALSE--------------------------------------------------------- -# Show the first rows of the table corresponding to one sample, from simulated data. +# Show the first rows of the table corresponding to one sample, from emulated data. head(sim_boot_data()[[2]][[1]]) ## ---- echo=FALSE--------------------------------------------------------- @@ -22,151 +22,117 @@ head(sim_count_data()[[1]]) ## ---- eval=FALSE--------------------------------------------------------- # # Simulate some data. # simdat <- sim_count_data() -# -# # For convenience let's assign the contents of the list to separate variables. +# # For convenience let's assign the contents of the list to separate variables # mycond_A <- simdat[[2]] # Simulated abundances for one condition. # mycond_B <- simdat[[3]] # Simulated abundances for other condition. # myannot <- simdat[[1]] # Transcript and gene IDs for the above data. ## ---- eval=FALSE--------------------------------------------------------- # # Find DTU between the simulated datasets. -# mydtu <- call_DTU(annot= myannot, count_data_A= mycond_A, count_data_B= mycond_B, -# verbose= FALSE, -# name_A= "healthy", name_B= "patients", varname= "My phenotype", -# description="Comparison of two simulated counts datasets for the tutorial. 
Simulated using built-in functionality of RATs.") +# mydtu <- call_DTU(annot= myannot, count_data_A= mycond_A, +# count_data_B= mycond_B, verbose= FALSE, +# name_A= "healthy", name_B= "patients", +# varname= "My phenotype", +# description="Comparison of two simulated counts datasets +# for the tutorial. Simulated using built-in functionality +# of RATs.") ## ------------------------------------------------------------------------ # Simulate some data. (Notice it is a different function than before.) simdat <- sim_boot_data() -# For convenience let's assign the contents of the list to separate variables. -mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition. -mycond_B <- simdat[[3]] # Simulated bootstrapped data for other condition. -myannot <- simdat[[1]] # Transcript and gene IDs for the above data. - -## ---- eval=FALSE--------------------------------------------------------- -# # Find DTU between conditions "controls" and "patients" in the simulated data. -# mydtu <- call_DTU(annot= myannot, boot_data_A= mycond_A, boot_data_B= mycond_B, -# verbose= FALSE, -# name_A= "wildtype", name_B= "some mutant", varname = "My phenotype", -# description="Comparison of two simulated datasets of bootstrapped counts for the tutorial. Simulated using built-in functionality of RATs.") - -## ------------------------------------------------------------------------ -# Simulate some data. -simdat <- sim_sleuth_data(cnames = c("controls", "patients")) -# controls and patients are arbitrary names to use as conditions. - -# For convenience let's assign the contents of the list to separate variables. -myslo <- simdat$slo # Simulated minimal sleuth object. -myannot <- simdat$annot # Transcript and gene Identifiers for the above data. +# For convenience let's assign the contents of the list to separate variables +mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition. +mycond_B <- simdat[[3]] # Simulated bootstrapped data for other condition. 
+myannot <- simdat[[1]] # Transcript and gene IDs for the above data. ## ---- eval=FALSE--------------------------------------------------------- -# # Find DTU between conditions "controls" and "patients" in the simulated data. -# mydtu <- call_DTU(annot= myannot, slo= myslo, name_A= "controls", name_B= "patients", -# varname= "condition", verbose= FALSE, -# description="Comparison of two conditions using a simulated sleuth object for the purposes of the tutorial. Simulated using built-in functionality of RATs.") +# # Find DTU between conditions. +# mydtu <- call_DTU(annot= myannot, boot_data_A= mycond_A, +# boot_data_B= mycond_B, verbose= FALSE, +# name_A= "wildtype", name_B= "some mutant", +# varname = "My phenotype", description="Comparison of +# two simulated datasets of bootstrapped counts for the +# tutorial. Simulated using built-in functionality +# of RATs.") ## ---- eval=FALSE--------------------------------------------------------- # # 1. Collect your outputs into vectors. The end of each path should be a # # directory with a unique name/identifier for one sample. -# samples_A <- c("your/path/SAMPLE1", "your/path/SAMPLE4","your/path/SAMPLE5", ...) -# samples_B <- c("your/path/SAMPLE2", "your/path/SAMPLE3","your/path/SAMPLE7", ...) -# # OR if your outputs are all in the same path, this syntax might be more convenient: -# samples_A <- file.path("your/path/", c("SAMPLE1", "SAMPLE4","SAMPLE5", ...)) -# samples_B <- file.path("your/path/", c("SAMPLE2", "SAMPLE3","SAMPLE7", ...)) +# samples_A <- c("your/path/SAMPLE1", "your/path/SAMPLE4","your/path/SAMPLE5") +# samples_B <- c("your/path/SAMPLE2", "your/path/SAMPLE3","your/path/SAMPLE7", +# "your/path/SAMPLE10") # # # 2. Convert, import, and extract. -# # The annotation is needed to enforce consistent transcripts order throughout RATs. -# mydata <- fish4rodents(A_paths= samples_A, B_paths= samples_B, annot= myannot) -# -# # 3. 
Run RATs with the generic bootstrapped format: +# mydata <- fish4rodents(A_paths= samples_A, B_paths= samples_B, +# annot= myannot, scaleto=100000000) + +## ---- eval=FALSE--------------------------------------------------------- +# # 3. Run RATs with the bootstrapped table data format: # mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A, -# boot_data_B= mydata$boot_data_B, verbose= FALSE, -# name_A= "controls", name_B= "patients", -# varname = "condition", description="Comparison of two sets of bootstrapped counts imported from the quantification output.") +# boot_data_B= mydata$boot_data_B, verbose= FALSE) ## ---- eval=FALSE--------------------------------------------------------- # # Calling DTU with custom thresholds. -# mydtu <- call_DTU(annot= myannot, slo= myslo, name_A= "controls", name_B= "patients", +# mydtu <- call_DTU(annot= myannot, +# boot_data_A= mycond_A, boot_data_B= mycond_B, # p_thresh= 0.01, abund_thresh= 10, dprop_thres = 0.25) ## ---- eval=FALSE--------------------------------------------------------- # # Bootstrap (default). Do 100 iterations. -# mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", -# name_B = "patients", qboot = TRUE, qbootnum = 100, qrep_thresh= 0.95) +# mydtu <- call_DTU(annot = myannot, +# boot_data_A= mycond_A, boot_data_B= mycond_B, +# qboot = TRUE, qbootnum = 100, qrep_thresh= 0.95) # # # Skip bootstraps. -# mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", -# name_B = "patients", qboot = FALSE) +# mydtu <- call_DTU(annot = myannot, +# boot_data_A= mycond_A, boot_data_B= mycond_B, +# qboot = FALSE) ## ---- eval=FALSE--------------------------------------------------------- # # Bootstrap (default). -# # Note that for few replicates, the reproducibility values are highly discrete: -# # The number of iterations for 3 samples per condition is 3*3=9. -# # So the minimum possible error rate is 1/9=0.111. -# # The corresponding threshold is 1-0.111=0.888. 
-# mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", -# name_B = "patients", rboot = TRUE, qrep_thresh= 0.85) +# mydtu <- call_DTU(annot = myannot, +# boot_data_A= mycond_A, boot_data_B= mycond_B, +# rboot = TRUE, qrep_thresh= 0.85) # # # Skip bootstraps. -# mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", -# name_B = "patients", rboot = FALSE) +# mydtu <- call_DTU(annot = myannot, +# boot_data_A= mycond_A, boot_data_B= mycond_B, +# rboot = FALSE) ## ----eval=FALSE---------------------------------------------------------- # # Using 8 threads/cores for parallel computing. -# mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", -# name_B = "patients", threads = 8) +# mydtu <- call_DTU(annot = myannot, +# boot_data_A= mycond_A, boot_data_B= mycond_B, +# threads = 8) ## ---- eval=FALSE--------------------------------------------------------- # # Transcripts only. -# mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", -# name_B = "patients", testmode="transc") +# mydtu <- call_DTU(annot = myannot, +# boot_data_A= mycond_A, boot_data_B= mycond_B, +# testmode="transc") # # Genes only. -# mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", -# name_B = "patients", testmode="genes") +# mydtu <- call_DTU(annot = myannot, +# boot_data_A= mycond_A, boot_data_B= mycond_B, +# testmode="genes") ## ---- eval=FALSE--------------------------------------------------------- # # Bonferroni correction. -# mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", -# name_B = "patients", correction = "bonferroni") - -## ------------------------------------------------------------------------ -# Lets emulate some input with custom field names. -sim <- sim_sleuth_data(varname = "mouse", cnames = c("Splinter", "Mickey"), - COUNTS_COL = "the-counts", TARGET_COL = "transcript", - PARENT_COL = "gene", BS_TARGET_COL = "trscr") -myslo <- sim$slo -myannot <- sim$annot - -# Sleuth covariates table. 
-print( sim$slo$sample_to_covariates ) -# Sleuth bootstrapped quantifications. -print( head(sim$slo$kal[[1]]$bootstrap[[1]]) ) -# Annotation. -print( head(sim$annot) ) - -## ---- eval=FALSE--------------------------------------------------------- -# # Call DTU on data with custom field names. -# mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "Splinter", name_B = "Mickey", -# varname="mouse", COUNTS_COL="the-counts", BS_TARGET_COL="trscr", -# verbose = FALSE) -# -# #! In our example data here, the annotation fields have been modified as well, -# #! so this command will NOT run as it is shown. You will also need the parameters -# #! shown in teh section below. +# mydtu <- call_DTU(annot = myannot, +# boot_data_A= mycond_A, boot_data_B= mycond_B, +# correction = "bonferroni") ## ---- eval=FALSE--------------------------------------------------------- # # Call DTU using annotation with custom field names. -# mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "Splinter", name_B = "Mickey", -# varname="mouse", TARGET_COL="transcript", PARENT_COL="gene", -# verbose = FALSE) -# -# #! In our example data here, the Sleuth fields have been modified as well, -# #! so this command will NOT run as it is shown. You will also need the parameters -# #! shown in the section above. +# mydtu <- call_DTU(annot = myannot, +# boot_data_A= mycond_A, boot_data_B= mycond_B, +# TARGET_COL="transcript", PARENT_COL="gene") ## ----eval=FALSE---------------------------------------------------------- -# # If your sample preps contain ~70 million transcripts and you are using TPM abundances -# mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", -# name_B = "patients", COUNTS_COL = "tpm", scaling = 70) +# # If your sample preps contain ~70 million transcripts +# # and you are using TPM abundances, the scaling factor +# # is 70M / 1M = 70. 
+# mydtu <- call_DTU(annot = myannot, +# boot_data_A= mycond_A, boot_data_B= mycond_B, +# scaling = 70) diff --git a/inst/doc/input.Rmd b/inst/doc/input.Rmd index 7adca52..bc59076 100644 --- a/inst/doc/input.Rmd +++ b/inst/doc/input.Rmd @@ -1,7 +1,7 @@ --- title: 'RATs: Input and Settings' author: "Kimon Froussios" -date: "31 MAY 2017" +date: "19 SEP 2017" output: html_document: keep_md: yes @@ -39,42 +39,39 @@ library(rats) RATs can work with several input types: -1. A Sleuth object. -2. Salmon/Kallisto quantifications. -3. Bootstrapped abundance estimates. -4. Abundance estimates. +1. [Salmon](https://github.com/COMBINE-lab/salmon) and [Kallisto](http://pachterlab.github.io/kallisto/) quantifications. +2. Bootstrapped abundance estimates in lists of R `data.table`s. +3. Abundance estimates in R `data.table`s. --> [Sleuth](http://pachterlab.github.io/sleuth/) objects contain their input data, so RATs can extract the bootstrapped abundance estimates directly from there, -for your convenience if Sleuth is in your workflow already. +As of `v0.6.0`, input from a `sleuth` object is no longer supported by RATs. This is due to changes in Sleuth `v0.29` that made the +bootstrap data no longer available. Instead, RATs already supports input directly from the Kallisto/Salmon quantifications. --> [Salmon](https://combine-lab.github.io/salmon/) quantifications require the [wasabi](https://github.com/COMBINE-lab/wasabi) converter. -[Kallisto](https://pachterlab.github.io/kallisto/) quantifications are accessed directly. +For option 1, the function `fish4rodents()` will load the data into tables suitable for option 2. Details in the respective +section below. --> Bootstrapped abundance estimates from other sources can be input directly as `list`s of `data.table`s. Two lists are needed, one per condition. 
-Each datatable should contain the transcript identifiers in the first column, followed by columns containing the estimates from the bootstrap iterations: +For options 2 and 3, the format of the tables is as in the example below. The first column contains the transcript identifiers. +Subsequent columns contain the abundances. ```{r, echo=FALSE} -# Show the first rows of the table corresponding to one sample, from simulated data. +# Show the first rows of the table corresponding to one sample, from emulated data. head(sim_boot_data()[[2]][[1]]) ``` --> Abundance estimates, without bootstrapping information, can be input simply as two `data.table`s, one per condition. The first column should -contain the transcript identifiers, followed by columns listing the abundance per sample. The format of each table is identical to the one shown above. - +* In the case of option 3, each column represents a sample. Therefore, each condition is represented by a single table containing +the relevant samples. +* In the case of option 2, each column represents a bootstrap iteration and each table represents a sample. Therefore, each +condition is represented by a list of tables. ### Read counts, TPMs, etc -RATs will happily crunch any type of numeric value, but the type of abundances you provide will have an impact on the DTU outcome. - -Read counts (such as the `est_counts`) work well with RATs and achieve decent False Discovery Rate. However, they are influenced by -the length of the transcripts. If the isoforms of a gene differ considerably in length, a well-expressed long isoform can drown a valid -change in the relative abundances of shorter isoforms. +RATs will happily crunch any type of numeric value, but the type of abundances you provide will have an impact on the DTU outcome. The statistical test +employed (G test) is meant to work with actual counts, not arbitrarily scaled values like TPMs (transcripts per million reads) or other per-million-reads representations. 
+On the other hand, read counts work well with RATs and achieve a good False Discovery Rate, but they are influenced by +the length of the transcripts and a well-expressed long isoform can drown out a valid change in the relative abundances of short isoforms. -TPM abundances account for transcript length and are a truer representation of relative isoform abundance. However, they are scaled -arbitrarily to 1 million transcripts, which is an unrealistically small sample size and this causes loss of statistical power. Therefore, -RATs provides the option to scale abundance values such as TPMs, by a constant factor. Use of this option should not be done -light-heartedly, as it will artificially inflate or deflate the significances obtained. Ideally, you should infer an appropriate -scaling factor based on your sample preparation details (for example, based on RNA concentration). +To get the best of both worlds, we recommend obtaining TPM abundances, so as to account for transcript lengths, and then scaling these values to your actual library size +to cancel out the per-million part and regain realistic counts. RATs provides functionality to enable you to do this, the first of which is the `scaleto` parameter in +`fish4rodents()`, shown above. ## Annotation @@ -87,8 +84,8 @@ By default the column labels are `target_id` for the transcript IDs and `parent_ head(sim_count_data()[[1]]) ``` -A function is provided to create this table, given a GTF file. -(**Note:** GFF3 is **not** supported, as the specification is too relaxed.) +Such a table can be compiled with a variety of methods and from a variety of resources. RATs provides functionality +to create this table from a GTF file. (**Note:** GFF3 is not supported for this) ```{r, eval=FALSE} # Extract transcript ID to gene ID index from a GTF annotation. 
@@ -101,31 +98,30 @@ myannot <- annot2ids("my_annotation_file.gtf") # Calling DTU -To bypass the complexity of running third-party tools in this tutorial, we will instead use **emulated data**. RATs comes -with data emulators, intended to be used for testing the code. However, they are also convenient to use for showcasing -how RATs works. If you happen to have real data available, you can use that instead if you wish. +To bypass the complexity of involving third-party tools in this tutorial, we will instead use **emulated data**. +RATs comes with data emulators, intended to be used for testing the code. However, they are also convenient to +use for showcasing how RATs works. -By default, RATs reports on its progress and produces a summary report. -The progress messages and summary can be suppressed by adding the `verbose = FALSE` parameter to the call. -To prevent cluttering this tutorial with verbose output, we will use this option in all the examples. +By default, RATs reports on its progress and produces a summary report. These can be suppressed using the +`verbose = FALSE` parameter to the call. To prevent cluttering this tutorial with verbose output, we will use this +option in all the examples. -If you leave `verbose = TRUE` when trying out the examples below using the emulated data, you will get **warnings** +If you choose to allow verbose output when trying out the examples below using the emulated data, you will get some **warnings** about the number of bootstraps. The warning is triggered because the emulated dataset used in the examples immitates only the structure of -real data, not the actual volume of it, and as such contains too few bootstrap iterations. +real data, not the actual volume of it, and as such it contains unrealistically few bootstrap iterations. Simply ignore these warnings +for these examples. 
-### ...from abundance estimates, without bootstraps +### DTU from abundance estimates, without bootstraps -This is the simplest usage case, provided only for completeness. We recommend using bootstrapped data whenever possible, -for reasons that will be discussed in the relevant section below. +This is the simplest usage case, and your likely input type if you use quantification methods other than Kallisto and Salmon. First, let's emulate some data to work with: ```{r, eval=FALSE} # Simulate some data. simdat <- sim_count_data() - -# For convenience let's assign the contents of the list to separate variables. +# For convenience let's assign the contents of the list to separate variables mycond_A <- simdat[[2]] # Simulated abundances for one condition. mycond_B <- simdat[[3]] # Simulated abundances for other condition. myannot <- simdat[[1]] # Transcript and gene IDs for the above data. @@ -135,10 +131,13 @@ Now we can call DTU: ```{r, eval=FALSE} # Find DTU between the simulated datasets. -mydtu <- call_DTU(annot= myannot, count_data_A= mycond_A, count_data_B= mycond_B, - verbose= FALSE, - name_A= "healthy", name_B= "patients", varname= "My phenotype", - description="Comparison of two simulated counts datasets for the tutorial. Simulated using built-in functionality of RATs.") +mydtu <- call_DTU(annot= myannot, count_data_A= mycond_A, + count_data_B= mycond_B, verbose= FALSE, + name_A= "healthy", name_B= "patients", + varname= "My phenotype", + description="Comparison of two simulated counts datasets + for the tutorial. Simulated using built-in functionality + of RATs.") ``` #### Mandatory arguments: @@ -146,14 +145,18 @@ mydtu <- call_DTU(annot= myannot, count_data_A= mycond_A, count_data_B= mycond_B 1. `annot` - An annotation data frame, as described in the *Input formats* section. 2. `count_data_A` and `count_data_B` - are each a `data.table` of transcript abundances as described in the Input section.
-#### Optional arguments: +#### Optional arguments (will be recorded in the output object): * `name_A`, `name_B` - A name for each conditon. (Default `NA`) * `varname` - The name of the variable/condition. (Default `NA`) -* `description` - Free-text description. You can note experiment details, annotation source, anything you might find useful later. (Default `NA`) +* `description` - Free-text description. (Default `NA`) + +#### Advanced options + +Refer to the respective sections later in this vignette. -### ...from bootstrapped abundance estimates +### DTU from bootstrapped abundance estimates First, let's emulate some data, as we did before. @@ -161,20 +164,23 @@ First, let's emulate some data, as we did before. # Simulate some data. (Notice it is a different function than before.) simdat <- sim_boot_data() -# For convenience let's assign the contents of the list to separate variables. -mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition. -mycond_B <- simdat[[3]] # Simulated bootstrapped data for other condition. -myannot <- simdat[[1]] # Transcript and gene IDs for the above data. +# For convenience let's assign the contents of the list to separate variables +mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition. +mycond_B <- simdat[[3]] # Simulated bootstrapped data for other condition. +myannot <- simdat[[1]] # Transcript and gene IDs for the above data. ``` Now we can call DTU: ```{r, eval=FALSE} -# Find DTU between conditions "controls" and "patients" in the simulated data. -mydtu <- call_DTU(annot= myannot, boot_data_A= mycond_A, boot_data_B= mycond_B, - verbose= FALSE, - name_A= "wildtype", name_B= "some mutant", varname = "My phenotype", - description="Comparison of two simulated datasets of bootstrapped counts for the tutorial. Simulated using built-in functionality of RATs.") +# Find DTU between conditions. 
+mydtu <- call_DTU(annot= myannot, boot_data_A= mycond_A, + boot_data_B= mycond_B, verbose= FALSE, + name_A= "wildtype", name_B= "some mutant", + varname = "My phenotype", description="Comparison of + two simulated datasets of bootstrapped counts for the + tutorial. Simulated using built-in functionality + of RATs.") ``` #### Mandatory arguments: @@ -182,143 +188,62 @@ mydtu <- call_DTU(annot= myannot, boot_data_A= mycond_A, boot_data_B= mycond_B, 1. `annot` - An annotation data frame, as described in the *Input formats* section. 2. `boot_data_A` and `boot_data_B` - are each a list of `data.table` objects, as described in the Input section. -#### Optional arguments: +#### Optional arguments (will be recorded in the output object): * `name_A`, `name_B` - A name for each conditon. (Default `NA`) * `varname` - The name of the variable/condition. (Default `NA`) -* `description` - Free-text description. You can note experiment details, annotation source, anything you might find useful later. (Default `NA`) - - -### ...with a sleuth object - -First, let's emulate a Sleuth object, using the bundled tools. The real Sleuth objects are very large and very complex nested lists. -The emulated one contains only the minimum essential elements relevant to calling DTU with RATs. - -```{r} -# Simulate some data. -simdat <- sim_sleuth_data(cnames = c("controls", "patients")) -# controls and patients are arbitrary names to use as conditions. - -# For convenience let's assign the contents of the list to separate variables. -myslo <- simdat$slo # Simulated minimal sleuth object. -myannot <- simdat$annot # Transcript and gene Identifiers for the above data. -``` - -Now call DTU. - -```{r, eval=FALSE} -# Find DTU between conditions "controls" and "patients" in the simulated data. 
-mydtu <- call_DTU(annot= myannot, slo= myslo, name_A= "controls", name_B= "patients", - varname= "condition", verbose= FALSE, - description="Comparison of two conditions using a simulated sleuth object for the purposes of the tutorial. Simulated using built-in functionality of RATs.") -``` - -#### Mandatory arguents: - -1. `annot` - an annotation data frame, as described in the *Input formats* section. -2. `slo` - a Sleuth object. -3. `name_A` and `name_B` - the two groups of samples to compare. These are restricted to values available in `myslo$sample_to_covariates`. The values must both exist in the column specified by `varname`. -4. `varname` - the variable for which samples are compared. must be an existing column header in `myslo$sample_to_covariates`. - -#### Optional arguments: - -* `description` - Free-text description. You can note experiment details, annotation source, anything you might find useful later. (Default `NA`) - -Please note that, unlike the other two usage cases, `name_A`, `name_B` and `varname` are **not optional**, as they specify how data is extracted from the Sleuth object. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +* `description` - Free-text description. (Default `NA`) - +#### Advanced options - - +Refer to the respective sections later in this vignette. - - - - - - - - - -### ...with Salmon/Kallisto output +### DTU with Salmon/Kallisto output RATs offers a method to import bootstrapped abundances directly from [Salmon](https://combine-lab.github.io/salmon/) output (requires [wasabi](https://github.com/COMBINE-lab/wasabi)) or [Kallisto](https://pachterlab.github.io/kallisto/) -output. Abundances are corrected for transcript length and scaled to 1M transcripts (TPM). +output. The raw abundances are normalised to TPM (default). ```{r, eval=FALSE} # 1. Collect your outputs into vectors. The end of each path should be a # directory with a unique name/identifier for one sample. 
-samples_A <- c("your/path/SAMPLE1", "your/path/SAMPLE4","your/path/SAMPLE5", ...) -samples_B <- c("your/path/SAMPLE2", "your/path/SAMPLE3","your/path/SAMPLE7", ...) -# OR if your outputs are all in the same path, this syntax might be more convenient: -samples_A <- file.path("your/path/", c("SAMPLE1", "SAMPLE4","SAMPLE5", ...)) -samples_B <- file.path("your/path/", c("SAMPLE2", "SAMPLE3","SAMPLE7", ...)) +samples_A <- c("your/path/SAMPLE1", "your/path/SAMPLE4","your/path/SAMPLE5") +samples_B <- c("your/path/SAMPLE2", "your/path/SAMPLE3","your/path/SAMPLE7", + "your/path/SAMPLE10") # 2. Convert, import, and extract. -# The annotation is needed to enforce consistent transcripts order throughout RATs. -mydata <- fish4rodents(A_paths= samples_A, B_paths= samples_B, annot= myannot) - -# 3. Run RATs with the generic bootstrapped format: -mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A, - boot_data_B= mydata$boot_data_B, verbose= FALSE, - name_A= "controls", name_B= "patients", - varname = "condition", description="Comparison of two sets of bootstrapped counts imported from the quantification output.") +mydata <- fish4rodents(A_paths= samples_A, B_paths= samples_B, + annot= myannot, scaleto=100000000) ``` +The return value of `fish4rodents` is a list with two items: `$boot_data_A` and `$boot_data_B`. These two items are +formatted for RATs' generic bootstrapped input. + #### Mandatory arguments (`fish4rodents`): 1. `A_paths` and `B_paths` - Two vectors containing the paths to the quantification output directories, one vector for each condition. The last segments of each path should be a directory with a unique identifying name for a single sample. 2. `annot` - The annotation table that you will use for calling DTU (and that was used for the quantification). +This is needed to enforce consistent order of the transcripts in the tables, enabling efficient processing. 
-The return value of `fish4rodents` is a list with two items: `$boot_data_A` and `$boot_data_B`. These two items are -formatted for RATs' generic bootstrapped input. +#### Optional arguments + +* `scaleto` allows you to control the normalisation factor. (Default 1000000 gives TPM values). + +And finally run DTU in the way already shown: + +```{r, eval=FALSE} +# 3. Run RATs with the bootstrapped table data format: +mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A, + boot_data_B= mydata$boot_data_B, verbose= FALSE) +``` *** -# Additional Parameters and Settings +# Advanced Parameters and Settings In summary, `rats` works as follows: @@ -332,15 +257,16 @@ The following three main thresholds are used in RATs: ```{r, eval=FALSE} # Calling DTU with custom thresholds. -mydtu <- call_DTU(annot= myannot, slo= myslo, name_A= "controls", name_B= "patients", +mydtu <- call_DTU(annot= myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, p_thresh= 0.01, abund_thresh= 10, dprop_thres = 0.25) ``` 1. `p_thresh` - Statistical significance level. P-values below this will be considered significant. Lower threshold values are stricter. (Default 0.05) -2. `abund_thresh` - Noise threshold. Transcripts with abundance below that value in both conditions are ignored. Higher threshold values are stricter. (Default 5, assumes use of estimated counts similar to those reported by [Salmon](https://github.com/COMBINE-lab/salmon)) +2. `abund_thresh` - Noise threshold. Transcripts with abundance below that value in both conditions are ignored. Higher threshold values are stricter. (Default 5, assumes abundances scaled to library size) 3. `dprop_thresh` - Effect size threshold. Transcripts whose proportion changes between conditions by less than the threshold are considered non-DTU, regardless of their statistical significance. Higher threshold values are stricter. 
(Default 0.20) -The default values for these thresholds have been chosen such that they achieve a *median* FPR <5% for a high quality dataset from *Arabidopsis thaliana*, even with only 3 replicates per condition. Your mileage may vary and you should give some consideration to selecting appropriate values. +The default values for these thresholds have been chosen such that they achieve a *median* FDR <5% for a high quality dataset from *Arabidopsis thaliana*, even with only 3 replicates per condition. Your mileage may vary and you should give some consideration to selecting appropriate values. Depending on the settings, *additional thresholds* are available and will be discussed in their respective sections below. @@ -348,8 +274,8 @@ Depending on the settings, *additional thresholds* are available and will be dis RATs offers two types of bootstrapping: -1. Bootstrapping of `p-value` and `Dprop` against the technical fluctuations of the quantifications. This requires bootstrapped quantifications as input. -2. Bootstrapping of `p-value` and `Dprop` against the abundance fluctuations among samples. +1. Bootstrapping of significance and effect size against the technical fluctuations of the quantifications. This requires bootstrapped quantifications as input. +2. Bootstrapping of significance and effect size against the abundance fluctuations among samples. Enabling these two procedures assesses the robustness of the DTU calls. Calls that are easily overturned depending on which subset of the data is used may be spurious but may also be valid. Deeper sequencing and/or higher replication may resolve these ambiguities. @@ -357,16 +283,18 @@ may be spurious but may also be valid. Deeper sequencing and/or higher replicati In both cases, what is measured is the fraction of iterations in which the significance and effect size meet their respective thresholds. 
These fractions can be used as indicators of confidence in the calls, especially when a large number of iterations is performed. -If bootstrapping is switched off, the respective output fields will not be included in the output. +If bootstrapping is switched off, the respective fields will not be included in the output. ### Quantification bootstraping -Three parameters control bootstrapping of DTU calls on the abundance estimates: +Three parameters control bootstrapping of DTU calls against the quantification uncertainty: 1. `qboot` - Whether to bootstrap the quantifications or not. (Default TRUE) -2. `qbootnum` - How many bootstrap iterations to do. Preferably at least 100. If 0, RATs will try to infer a value from the data. (Default 0) -3. `qrep_thresh` - Reproducibility threshold. What fraction of the iterations has to agree on a result to consider it confident. Higher threshold values are stricter. (Default 0.95) +2. `qbootnum` - How many bootstrap iterations to do. Preferably at least 100. (Default 0) If 0, +RATs will try to infer a value from the data (currently equal to the number of bootstraps in the data). +3. `qrep_thresh` - Reproducibility threshold. What fraction of the iterations has to agree on a result to consider it confident. (Default 0.95) +If you want to calculate the reproducibility but don't want it to weigh in on the DTU classifications, set the threshold to 0. In this process, one quantification iteration will be randomly selected from each sample and DTU will be called on it. This will be repeated `qbootnum` times. Because the number of replicates remains the same, the statistical power is not compromised. Therefore, the reproducibility will be **used as a criterion** in calling DTU, along with @@ -381,12 +309,14 @@ Warnings will be generated if `qbootnum` is too low or too high, but in most cas ```{r, eval=FALSE} # Bootstrap (default). Do 100 iterations. 
-mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", - name_B = "patients", qboot = TRUE, qbootnum = 100, qrep_thresh= 0.95) +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + qboot = TRUE, qbootnum = 100, qrep_thresh= 0.95) # Skip bootstraps. -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", - name_B = "patients", qboot = FALSE) +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + qboot = FALSE) ``` @@ -395,61 +325,66 @@ mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", Three parameters control bootstrapping of DTU calls agaisnt the samples: 1. `rboot` - Whether to bootstrap the replicates or not. (Default TRUE) -2. `rrep_thresh` - Reproducibility threshold. What fraction of the iterations has to agree on a result to consider it confident. Higher threshold values are stricter. (Default 0.85) -3. `rrep_as_crit` - Whether the replicate-wise reproducibility should be used as a criterion in calling DTU. (Default TRUE) +2. `rrep_thresh` - Reproducibility threshold. What fraction of the iterations has to agree on a result to consider it confident. (Default 0.85) +If you want to calculate the reproducibility but don't want it to weigh in on the DTU classifications, set the threshold to 0. Unlike bootstrapping the quantifications, bootstrapping the replicates is currently **not stochastic**. RATs will do ALL the 1 vs 1 combinations of samples, one from each condition. This behaviour may be changed if in the future there is demand for high replication that reaches the capacity limits of R matrices. - To maintain comparable statistical significance to the actual experimental design, because only one replicate is used each time, the abundances are scaled up by the number of replicates in the condition. This is equivalent to the idealised scenario of all the replicates being extremely consistent and similar to the chosen one. 
```{r, eval=FALSE} # Bootstrap (default). -# Note that for few replicates, the reproducibility values are highly discrete: -# The number of iterations for 3 samples per condition is 3*3=9. -# So the minimum possible error rate is 1/9=0.111. -# The corresponding threshold is 1-0.111=0.888. -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", - name_B = "patients", rboot = TRUE, qrep_thresh= 0.85) +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + rboot = TRUE, rrep_thresh= 0.85) # Skip bootstraps. -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", - name_B = "patients", rboot = FALSE) +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + rboot = FALSE) ``` +**Note** that for few replicates, the reproducibility values are highly discrete: +The number of iterations for 3 samples per condition is `3 * 3 = 9`. +So the minimum possible error rate is `1 / 9 = 0.111111...`. +The corresponding threshold is `1-0.1111111... = 0.8888888...`, hence the default threshold of 0.85. + ## Multi-threading -RATs completion time depends on the number of expressed annotated transcripts. Single-threaded, RATs can take up to a few minutes per iteration for large annotations. +RATs completion time depends on the number of annotated and expressed transcripts. Single-threaded, RATs can take up to a few minutes per iteration for large annotations. Fortunately, the task is highly parallelisable: ```{r eval=FALSE} # Using 8 threads/cores for parallel computing. -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", - name_B = "patients", threads = 8) +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + threads = 8) ``` 1. `threads` - The number of threads to use. (Default 1) -Due to core R implementation limitations, multi-threading works only in POSIX-compliant systems. In other systems, the option will be ignored and R will run serially.
-Most Linux distros, Unix versions, and recent OSX versions are supposedly compatible, and there are environments that can be installed on Windows that might circumvent the issue. +Due to core R implementation limitations, the type of multi-threading used in RATs works only in POSIX-compliant systems. +Refer to the `parallel` package for details. ## Test selection -RATs runs both gene-level calls and transcript-level calls. They are complementary and we recommend using them -together, but the option to skip either is provided for special use cases. The fields of the skipped test will be filled with `NA`. +RATs runs both gene-level DTU calls and transcript-level DTU calls. They are independent from one another and we consider them complementary and recommend using them together, but the option to skip either is provided for special use cases. +The output fields of the skipped test will be filled with `NA`. ```{r, eval=FALSE} # Transcripts only. -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", - name_B = "patients", testmode="transc") +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + testmode="transc") # Genes only. -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", - name_B = "patients", testmode="genes") +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + testmode="genes") ``` -1. `testmode` - Which test(s) to run {"transc", "genes", "both""}. (Default "both") +1. `testmode` - Which test(s) to run {"transc", "genes", "both"}. (Default "both") ## Correction for multiple testing @@ -459,8 +394,9 @@ options is listed in R's `p.adjust.methods`. ```{r, eval=FALSE} # Bonferroni correction. -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", - name_B = "patients", correction = "bonferroni") +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + correction = "bonferroni") ``` 1. 
`correction` - Type of multiple testing correction. (Default "BH") @@ -470,47 +406,6 @@ mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", RATs needs to pull information from different fields of the data and annotation and it does so based on the names of the columns. You can override the default names of these fields. - -```{r} -# Lets emulate some input with custom field names. -sim <- sim_sleuth_data(varname = "mouse", cnames = c("Splinter", "Mickey"), - COUNTS_COL = "the-counts", TARGET_COL = "transcript", - PARENT_COL = "gene", BS_TARGET_COL = "trscr") -myslo <- sim$slo -myannot <- sim$annot - -# Sleuth covariates table. -print( sim$slo$sample_to_covariates ) -# Sleuth bootstrapped quantifications. -print( head(sim$slo$kal[[1]]$bootstrap[[1]]) ) -# Annotation. -print( head(sim$annot) ) -``` - - -### Sleuth field names - -Although we expect Sleuth field names to be constant with the exception of covariate names, RATs allows overriding the -expected names. One reason to do this would be to switch between using `tpm` and `est_count` abundances. - -```{r, eval=FALSE} -# Call DTU on data with custom field names. -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "Splinter", name_B = "Mickey", - varname="mouse", COUNTS_COL="the-counts", BS_TARGET_COL="trscr", - verbose = FALSE) - -#! In our example data here, the annotation fields have been modified as well, -#! so this command will NOT run as it is shown. You will also need the parameters -#! shown in teh section below. -``` - -1. `varname` - The field name in `myslo$sample_to_covariates` where the desired condition names are listed. (Default "condition") -2. `COUNTS_COL` - The name of the field holding the abundances in the bootstrap tables of Sleuth. (Default "tpm") -3. `BS_TARGET_COL` - The name of the field holding the transcript identifiers in the bootstrap tables of Sleuth. 
(Default "target_id") - -The `COUNTS_COL` and `BS_TARGET_COL` parameters are also available for `denest_sleuth_boots()`. - - ### Annotation field names Although it is easy to rename the columns of a table to comply with the expected names, this may sometimes be undesireable, so RATs @@ -518,13 +413,9 @@ allows you to change the expected names instead. ```{r, eval=FALSE} # Call DTU using annotation with custom field names. -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "Splinter", name_B = "Mickey", - varname="mouse", TARGET_COL="transcript", PARENT_COL="gene", - verbose = FALSE) - -#! In our example data here, the Sleuth fields have been modified as well, -#! so this command will NOT run as it is shown. You will also need the parameters -#! shown in the section above. +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + TARGET_COL="transcript", PARENT_COL="gene") ``` 1. `TARGET_COL` - The name of the field holding the transcript identifiers in the annotation data frame. (Default "target_id") @@ -536,29 +427,29 @@ The `TARGET_COL` and `PARENT_COL` parameters are also available for `denest_sleu ## Abundance scaling As mentioned previously, various normalised abundance units (like TPM) are scaled to an arbitrary and usually smaller than actual -sample size. This artificially deflates the significances calculated by RATs and reduces the statistical power. If you can infer -a more likely number of transcripts in your sequenced samples (based on your sample preparation details), you can specify this -scaling factor to "correct" the abundance values. +sample size. This artificially deflates the significances calculated by RATs and reduces the statistical power. The `scaling` parameter allows you to scale normalised abundances up to the actual library sizes. 
```{r eval=FALSE} -# If your sample preps contain ~70 million transcripts and you are using TPM abundances -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", - name_B = "patients", COUNTS_COL = "tpm", scaling = 70) +# If your sample preps contain ~70 million transcripts +# and you are using TPM abundances, the scaling factor +# is 70M / 1M = 70. +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + scaling = 70) ``` - *** # Annotation discrepancies -Different annotation versions often preserve transcript IDs, despite altering the details of the transcript models, -such as UTRs. It is important to use the same annotation throughout the workflow, especially for the quantifications. -Otherwise, the data is not comparable. +Different annotation versions often preserve transcript IDs, despite altering the details of the transcript model. +It is important to use the same annotation throughout the workflow, otherwise the abundances will not be comparable +in a meaningful way. -All internal operations and the output of RATs will be based on the annotation provided: +All internal operations and the output of RATs are based on the annotation provided: * Any transcripts present in the data but missing from the annotation will be ignored completely and will not show up in the output, as there is no reliable way to match them to gene IDs. diff --git a/inst/doc/input.html b/inst/doc/input.html index ae717d9..48d78f6 100644 --- a/inst/doc/input.html +++ b/inst/doc/input.html @@ -11,7 +11,7 @@ - + RATs: Input and Settings @@ -217,7 +217,7 @@

RATs: Input and Settings

Kimon Froussios

-

31 MAY 2017

+

19 SEP 2017

@@ -231,14 +231,13 @@

Input formats

Data

RATs can work with several input types:

    -
  1. A Sleuth object.
  2. -
  3. Salmon/Kallisto quantifications.
  4. -
  5. Bootstrapped abundance estimates.
  6. -
  7. Abundance estimates.
  8. +
  9. Salmon and Kallisto quantifications.
  10. +
  11. Bootstrapped abundance estimates in lists of R data.tables.
  12. +
  13. Abundance estimates in R data.tables.
-

-> Sleuth objects contain their input data, so RATs can extract the bootstrapped abundance estimates directly from there, for your convenience if Sleuth is in your workflow already.

-

-> Salmon quantifications require the wasabi converter. Kallisto quantifications are accessed directly.

-

-> Bootstrapped abundance estimates from other sources can be input directly as lists of data.tables. Two lists are needed, one per condition. Each datatable should contain the transcript identifiers in the first column, followed by columns containing the estimates from the bootstrap iterations:

+

As of v0.6.0, input from a sleuth object is no longer supported by RATs. This is due to changes in Sleuth v0.29 that made the bootstrap data no longer available. Instead, RATs already supports input directly from the Kallisto/Salmon quantifications.

+

For option 1, the function fish4rodents() will load the data into tables suitable for option 2. Details in the respective section below.

+

For options 2 and 3, the format of the tables is as in the example below. The first column contains the transcript identifiers. Subsequent columns contain the abundances.

##    target_id V1 V2 V3
 ## 1:    1A1N-2 20 21 18
 ## 2:    1B1C.1  0  0  0
@@ -246,12 +245,14 @@ 

Data

## 4: 1D1C:one 0 0 0 ## 5: 1D1C:two 76 80 72 ## 6: ALLA1 50 40 60
-

-> Abundance estimates, without bootstrapping information, can be input simply as two data.tables, one per condition. The first column should contain the transcript identifiers, followed by columns listing the abundance per sample. The format of each table is identical to the one shown above.

+

Read counts, TPMs, etc

-

RATs will happily crunch any type of numeric value, but the type of abundances you provide will have an impact on the DTU outcome.

-

Read counts (such as the est_counts) work well with RATs and achieve decent False Discovery Rate. However, they are influenced by the length of the transcripts. If the isoforms of a gene differ considerably in length, a well-expressed long isoform can drown a valid change in the relative abundances of shorter isoforms.

-

TPM abundances account for transcript length and are a truer representation of relative isoform abundance. However, they are scaled arbitrarily to 1 million transcripts, which is an unrealistically small sample size and this causes loss of statistical power. Therefore, RATs provides the option to scale abundance values such as TPMs, by a constant factor. Use of this option should not be done light-heartedly, as it will artificially inflate or deflate the significances obtained. Ideally, you should infer an appropriate scaling factor based on your sample preparation details (for example, based on RNA concentration).

+

RATs will happily crunch any type of numeric value, but the type of abundances you provide will have an impact on the DTU outcome. The statistical test employed (G test) is meant to work with actual counts, not arbitrarily scaled values like TPMs (transcripts per million) or other per-million-reads representations. On the other hand, read counts work well with RATs and achieve a good False Discovery Rate, but they are influenced by the length of the transcripts and a well-expressed long isoform can drown out a valid change in the relative abundances of short isoforms.

+

To get the best of both worlds, we recommend obtaining TPM abundances, so as to account for transcript lengths, and then scaling these values to your actual library size to cancel out the per-million part and regain realistic counts. RATs provides functionality to enable you to do this, the first of which is the scaleto parameter in fish4rodents(), described below.

@@ -264,7 +265,7 @@

Annotation

## 4 1D1C:two 1D1C ## 5 1B1C.1 1B1C ## 6 1B1C.2 1B1C -

A function is provided to create this table, given a GTF file. (Note: GFF3 is not supported, as the specification is too relaxed.)

+

Such a table can be compiled with a variety of methods and from a variety of resources. RATs provides functionality to create this table from a GTF file. (Note: GFF3 is not supported for this)

# Extract transcript ID to gene ID index from a GTF annotation.
 myannot <- annot2ids("my_annotation_file.gtf")

@@ -272,26 +273,28 @@

Annotation

Calling DTU

-

To bypass the complexity of running third-party tools in this tutorial, we will instead use emulated data. RATs comes with data emulators, intended to be used for testing the code. However, they are also convenient to use for showcasing how RATs works. If you happen to have real data available, you can use that instead if you wish.

-

By default, RATs reports on its progress and produces a summary report. The progress messages and summary can be suppressed by adding the verbose = FALSE parameter to the call. To prevent cluttering this tutorial with verbose output, we will use this option in all the examples.

-

If you leave verbose = TRUE when trying out the examples below using the emulated data, you will get warnings about the number of bootstraps. The warning is triggered because the emulated dataset used in the examples immitates only the structure of real data, not the actual volume of it, and as such contains too few bootstrap iterations.

-
-

…from abundance estimates, without bootstraps

-

This is the simplest usage case, provided only for completeness. We recommend using bootstrapped data whenever possible, for reasons that will be discussed in the relevant section below.

+

To bypass the complexity of involving third-party tools in this tutorial, we will instead use emulated data. RATs comes with data emulators, intended to be used for testing the code. However, they are also convenient to use for showcasing how RATs works.

+

By default, RATs reports on its progress and produces a summary report. These can be suppressed using the verbose = FALSE parameter to the call. To prevent cluttering this tutorial with verbose output, we will use this option in all the examples.

+

If you choose to allow verbose output when trying out the examples below using the emulated data, you will get some warnings about the number of bootstraps. These warnings are triggered because the emulated dataset used in the examples imitates only the structure of real data, not the actual volume of it, and as such it contains unrealistically few bootstrap iterations. Simply ignore these warnings for these examples.

+
+

DTU from abundance estimates, without bootstraps

+

This is the simplest usage case, and your likely input type if you use quantification methods other than Kallisto and Salmon.

First, let’s emulate some data to work with:

# Simulate some data.
 simdat <- sim_count_data()
-
-# For convenience let's assign the contents of the list to separate variables.
+# For convenience let's assign the contents of the list to separate variables
 mycond_A <- simdat[[2]]       # Simulated abundances for one condition.
 mycond_B <- simdat[[3]]       # Simulated abundances for other condition.
 myannot <- simdat[[1]]        # Transcript and gene IDs for the above data.

Now we can call DTU:

# Find DTU between the simulated datasets.
-mydtu <- call_DTU(annot= myannot, count_data_A= mycond_A, count_data_B= mycond_B, 
-                  verbose= FALSE,
-                  name_A= "healthy", name_B= "patients", varname= "My phenotype",
-                  description="Comparison of two simulated counts datasets for the tutorial. Simulated using built-in functionality of RATs.")
+mydtu <- call_DTU(annot= myannot, count_data_A= mycond_A, + count_data_B= mycond_B, verbose= FALSE, + name_A= "healthy", name_B= "patients", + varname= "My phenotype", + description="Comparison of two simulated counts datasets + for the tutorial. Simulated using built-in functionality + of RATs.")

Mandatory arguments:

    @@ -299,31 +302,38 @@

    Mandatory arguments:

  1. count_data_A and count_data_B - are each a data.table of transcript abundances as described in the Input section.
-
-

Optional arguments:

+
+

Optional arguments (will be recorded in the output object):

  • name_A, name_B - A name for each condition. (Default NA)
  • varname - The name of the variable/condition. (Default NA)
  • -
  • description - Free-text description. You can note experiment details, annotation source, anything you might find useful later. (Default NA)
  • +
  • description - Free-text description. (Default NA)
+
+

Advanced options

+

Refer to the respective sections later in this vignette.

+
-
-

…from bootstrapped abundance estimates

+
+

DTU from bootstrapped abundance estimates

First, let’s emulate some data, as we did before.

# Simulate some data. (Notice it is a different function than before.)
 simdat <- sim_boot_data()
 
-# For convenience let's assign the contents of the list to separate variables.
-mycond_A <- simdat[[2]]       # Simulated bootstrapped data for one condition.
-mycond_B <- simdat[[3]]       # Simulated bootstrapped data for other condition.
-myannot <- simdat[[1]]        # Transcript and gene IDs for the above data.
+# For convenience let's assign the contents of the list to separate variables +mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition. +mycond_B <- simdat[[3]] # Simulated bootstrapped data for other condition. +myannot <- simdat[[1]] # Transcript and gene IDs for the above data.

Now we can call DTU:

-
# Find DTU between conditions "controls" and "patients" in the simulated data.
-mydtu <- call_DTU(annot= myannot, boot_data_A= mycond_A, boot_data_B= mycond_B, 
-                  verbose= FALSE, 
-                  name_A= "wildtype", name_B= "some mutant", varname = "My phenotype",
-                  description="Comparison of two simulated datasets of bootstrapped counts for the tutorial. Simulated using built-in functionality of RATs.")
+
# Find DTU between conditions.
+mydtu <- call_DTU(annot= myannot, boot_data_A= mycond_A, 
+                  boot_data_B= mycond_B, verbose= FALSE, 
+                  name_A= "wildtype", name_B= "some mutant", 
+                  varname = "My phenotype", description="Comparison of 
+                  two simulated datasets of bootstrapped counts for the 
+                  tutorial. Simulated using built-in functionality 
+                  of RATs.")

Mandatory arguments:

    @@ -331,115 +341,54 @@

    Mandatory arguments:

  1. boot_data_A and boot_data_B - are each a list of data.table objects, as described in the Input section.
-
-

Optional arguments:

+
+

Optional arguments (will be recorded in the output object):

  • name_A, name_B - A name for each condition. (Default NA)
  • varname - The name of the variable/condition. (Default NA)
  • -
  • description - Free-text description. You can note experiment details, annotation source, anything you might find useful later. (Default NA)
  • +
  • description - Free-text description. (Default NA)
+
+

Advanced options

+

Refer to the respective sections later in this vignette.

-
-

…with a sleuth object

-

First, let’s emulate a Sleuth object, using the bundled tools. The real Sleuth objects are very large and very complex nested lists. The emulated one contains only the minimum essential elements relevant to calling DTU with RATs.

-
# Simulate some data.
-simdat <- sim_sleuth_data(cnames = c("controls", "patients")) 
-# controls and patients are arbitrary names to use as conditions.
-
-# For convenience let's assign the contents of the list to separate variables.
-myslo <- simdat$slo       # Simulated minimal sleuth object.
-myannot <- simdat$annot   # Transcript and gene Identifiers for the above data.
-

Now call DTU.

-
# Find DTU between conditions "controls" and "patients" in the simulated data.
-mydtu <- call_DTU(annot= myannot, slo= myslo, name_A= "controls", name_B= "patients", 
-                  varname= "condition", verbose= FALSE,
-                  description="Comparison of two conditions using a simulated sleuth object for the purposes of the tutorial. Simulated using built-in functionality of RATs.")
-
-

Mandatory arguents:

-
    -
  1. annot - an annotation data frame, as described in the Input formats section.
  2. -
  3. slo - a Sleuth object.
  4. -
  5. name_A and name_B - the two groups of samples to compare. These are restricted to values available in myslo$sample_to_covariates. The values must both exist in the column specified by varname.
  6. -
  7. varname - the variable for which samples are compared. must be an existing column header in myslo$sample_to_covariates.
  8. -
-
-

Optional arguments:

-
    -
  • description - Free-text description. You can note experiment details, annotation source, anything you might find useful later. (Default NA)
  • -
-

Please note that, unlike the other two usage cases, name_A, name_B and varname are not optional, as they specify how data is extracted from the Sleuth object.

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
-
-

…with Salmon/Kallisto output

-

RATs offers a method to import bootstrapped abundances directly from Salmon output (requires wasabi) or Kallisto output. Abundances are corrected for transcript length and scaled to 1M transcripts (TPM).

+
+

DTU with Salmon/Kallisto output

+

RATs offers a method to import bootstrapped abundances directly from Salmon output (requires wasabi) or Kallisto output. The raw abundances are normalised to TPM (default).

# 1. Collect your outputs into vectors. The end of each path should be a 
 # directory with a unique name/identifier for one sample.
-samples_A <- c("your/path/SAMPLE1", "your/path/SAMPLE4","your/path/SAMPLE5", ...)
-samples_B <- c("your/path/SAMPLE2", "your/path/SAMPLE3","your/path/SAMPLE7", ...)
-# OR if your outputs are all in the same path, this syntax might be more convenient:
-samples_A <- file.path("your/path/", c("SAMPLE1", "SAMPLE4","SAMPLE5", ...))
-samples_B <- file.path("your/path/", c("SAMPLE2", "SAMPLE3","SAMPLE7", ...))
+samples_A <- c("your/path/SAMPLE1", "your/path/SAMPLE4","your/path/SAMPLE5")
+samples_B <- c("your/path/SAMPLE2", "your/path/SAMPLE3","your/path/SAMPLE7", 
+               "your/path/SAMPLE10")
 
 # 2. Convert, import, and extract. 
-# The annotation is needed to enforce consistent transcripts order throughout RATs.
-mydata <- fish4rodents(A_paths= samples_A, B_paths= samples_B, annot= myannot)
-
-# 3. Run RATs with the generic bootstrapped format:
-mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A, 
-                  boot_data_B= mydata$boot_data_B, verbose= FALSE, 
-                  name_A= "controls", name_B= "patients", 
-                  varname = "condition", description="Comparison of two sets of bootstrapped counts imported from the quantification output.")
+mydata <- fish4rodents(A_paths= samples_A, B_paths= samples_B, + annot= myannot, scaleto=100000000) +

The return value of fish4rodents is a list with two items: $boot_data_A and $boot_data_B. These two items are formatted for RATs’ generic bootstrapped input.

Mandatory arguments (fish4rodents):

  1. A_paths and B_paths - Two vectors containing the paths to the quantification output directories, one vector for each condition. The last segments of each path should be a directory with a unique identifying name for a single sample.
  2. -
  3. annot - The annotation table that you will use for calling DTU (and that was used for the quantification).
  4. +
  5. annot - The annotation table that you will use for calling DTU (and that was used for the quantification). This is needed to enforce consistent order of the transcripts in the tables, enabling efficient processing.
-

The return value of fish4rodents is a list with two items: $boot_data_A and $boot_data_B. These two items are formatted for RATs’ generic bootstrapped input.

+
+
+

Optional arguments

+
    +
  • scaleto allows you to control the normalisation factor. (Default 1000000 gives TPM values).
  • +
+

And finally run DTU in the way already shown:

+
# 3. Run RATs with the bootstrapped table data format:
+mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A, 
+                  boot_data_B= mydata$boot_data_B, verbose= FALSE)

-
-

Additional Parameters and Settings

+
+

Advanced Parameters and Settings

In summary, RATs works as follows:

RATs design @@ -449,97 +398,101 @@

Additional Parameters and Settings

Thresholds

The following three main thresholds are used in RATs:

# Calling DTU with custom thresholds.
-mydtu <- call_DTU(annot= myannot, slo= myslo, name_A= "controls", name_B= "patients", 
+mydtu <- call_DTU(annot= myannot, 
+                  boot_data_A= mycond_A, boot_data_B= mycond_B,
                   p_thresh= 0.01, abund_thresh= 10, dprop_thresh = 0.25)
  1. p_thresh - Statistical significance level. P-values below this will be considered significant. Lower threshold values are stricter. (Default 0.05)
  2. -
  3. abund_thresh - Noise threshold. Transcripts with abundance below that value in both conditions are ignored. Higher threshold values are stricter. (Default 5, assumes use of estimated counts similar to those reported by Salmon)
  4. +
  5. abund_thresh - Noise threshold. Transcripts with abundance below that value in both conditions are ignored. Higher threshold values are stricter. (Default 5, assumes abundances scaled to library size)
  6. dprop_thresh - Effect size threshold. Transcripts whose proportion changes between conditions by less than the threshold are considered non-DTU, regardless of their statistical significance. Higher threshold values are stricter. (Default 0.20)
-

The default values for these thresholds have been chosen such that they achieve a median FPR <5% for a high quality dataset from Arabidopsis thaliana, even with only 3 replicates per condition. Your mileage may vary and you should give some consideration to selecting appropriate values. Depending on the settings, additional thresholds are available and will be discussed in their respective sections below.

+

The default values for these thresholds have been chosen such that they achieve a median FDR <5% for a high quality dataset from Arabidopsis thaliana, even with only 3 replicates per condition. Your mileage may vary and you should give some consideration to selecting appropriate values. Depending on the settings, additional thresholds are available and will be discussed in their respective sections below.

Bootstrapping

RATs offers two types of bootstrapping:

    -
  1. Bootstrapping of p-value and Dprop against the technical fluctuations of the quantifications. This requires bootstrapped quantifications as input.
  2. -
  3. Bootstrapping of p-value and Dprop against the abundance fluctuations among samples.
  4. +
  5. Bootstrapping of significance and effect size against the technical fluctuations of the quantifications. This requires bootstrapped quantifications as input.
  6. +
  7. Bootstrapping of significance and effect size against the abundance fluctuations among samples.

Enabling these two procedures assesses the robustness of the DTU calls. Calls that are easily overturned depending on which subset of the data is used may be spurious but may also be valid. Deeper sequencing and/or higher replication may resolve these ambiguities.

In both cases, what is measured is the fraction of iterations in which the significance and effect size meet their respective thresholds. These fractions can be used as indicators of confidence in the calls, especially when a large number of iterations is performed.

-

If bootstrapping is switched off, the respective output fields will not be included in the output.

+

If bootstrapping is switched off, the respective fields will not be included in the output.

Quantification bootstrapping

-

Three parameters control bootstrapping of DTU calls on the abundance estimates:

+

Three parameters control bootstrapping of DTU calls against the quantification uncertainty:

  1. qboot - Whether to bootstrap the quantifications or not. (Default TRUE)
  2. -
  3. qbootnum - How many bootstrap iterations to do. Preferably at least 100. If 0, RATs will try to infer a value from the data. (Default 0)
  4. -
  5. qrep_thresh - Reproducibility threshold. What fraction of the iterations has to agree on a result to consider it confident. Higher threshold values are stricter. (Default 0.95)
  6. +
  7. qbootnum - How many bootstrap iterations to do. Preferably at least 100. (Default 0) If 0, RATs will try to infer a value from the data (currently equal to the number of bootstraps in the data).
  8. +
  9. qrep_thresh - Reproducibility threshold. What fraction of the iterations has to agree on a result to consider it confident. (Default 0.95) If you want to calculate the reproducibility but don’t want it to weigh in on the DTU classifications, set the threshold to 0.

In this process, one quantification iteration will be randomly selected from each sample and DTU will be called on it. This will be repeated qbootnum times. Because the number of replicates remains the same, the statistical power is not compromised. Therefore, the reproducibility will be used as a criterion in calling DTU, along with statistical significance and effect size. The process is stochastic; the quantifications are randomly sampled, so runs on the same data may yield slight differences in DTU. Using higher qbootnum improves reproducibility between runs on the same dataset.

Low reproducibility indicates that the quantification tool found it hard to distinguish these transcripts. This can be caused by high similarity of the isoforms, genes with a large number of isoforms, and/or poor read coverage in the regions differentiating the isoforms from one another. In these cases, skepticism is required about the quantifications and any potential differential expression regarding these transcripts.

Warnings will be generated if qbootnum is too low or too high, but in most cases RATs will continue with the analysis. The warnings will not be shown if verbose = FALSE.

# Bootstrap (default). Do 100 iterations.
-mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", 
-                  name_B = "patients", qboot = TRUE, qbootnum = 100, qrep_thresh= 0.95)
+mydtu <- call_DTU(annot = myannot, 
+                  boot_data_A= mycond_A, boot_data_B= mycond_B,
+                  qboot = TRUE, qbootnum = 100, qrep_thresh= 0.95)
 
 # Skip bootstraps.
-mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", 
-                  name_B = "patients", qboot = FALSE)
+mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + qboot = FALSE)

Replicate bootstrapping

Three parameters control bootstrapping of DTU calls against the samples:

  1. rboot - Whether to bootstrap the replicates or not. (Default TRUE)
  2. -
  3. rrep_thresh - Reproducibility threshold. What fraction of the iterations has to agree on a result to consider it confident. Higher threshold values are stricter. (Default 0.85)
  4. -
  5. rrep_as_crit - Whether the replicate-wise reproducibility should be used as a criterion in calling DTU. (Default TRUE)
  6. +
  7. rrep_thresh - Reproducibility threshold. What fraction of the iterations has to agree on a result to consider it confident. (Default 0.85) If you want to calculate the reproducibility but don’t want it to weigh in on the DTU classifications, set the threshold to 0.
-

Unlike bootstrapping the quantifications, bootstrapping the replicates is currently not stochastic. RATs will do ALL the 1 vs 1 combinations of samples, one from each condition. This behaviour may be changed if in the future there is demand for high replication that reaches the capacity limits of R matrices.

-

To maintain comparable statistical significance to the actual experimental design, because only one replicate is used each time, the abundances are scaled up by the number of replicates in the condition. This is equivalent to the idealised scenario of all the replicates being extremely consistent and similar to the chosen one.

+

Unlike bootstrapping the quantifications, bootstrapping the replicates is currently not stochastic. RATs will do ALL the 1 vs 1 combinations of samples, one from each condition. This behaviour may be changed if in the future there is demand for high replication that reaches the capacity limits of R matrices. To maintain comparable statistical significance to the actual experimental design, because only one replicate is used each time, the abundances are scaled up by the number of replicates in the condition. This is equivalent to the idealised scenario of all the replicates being extremely consistent and similar to the chosen one.

# Bootstrap (default).
-# Note that for few replicates, the reproducibility values are highly discrete:
-# The number of iterations for 3 samples per condition is 3*3=9. 
-# So the minimum possible error rate is 1/9=0.111. 
-# The corresponding threshold is 1-0.111=0.888.
-mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", 
-                  name_B = "patients", rboot = TRUE, qrep_thresh= 0.85)
+mydtu <- call_DTU(annot = myannot, 
+                  boot_data_A= mycond_A, boot_data_B= mycond_B,
+                  rboot = TRUE, rrep_thresh= 0.85)
 
 # Skip bootstraps.
-mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", 
-                  name_B = "patients", rboot = FALSE)
+mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + rboot = FALSE) +

Note that for few replicates, the reproducibility values are highly discrete: The number of iterations for 3 samples per condition is 3 * 3 = 9. So the minimum possible error rate is 1 / 9 = 0.111111.... The corresponding threshold is 1-0.1111111... = 0.8888888..., hence the default threshold of 0.85.

Multi-threading

-

RATs completion time depends on the number of expressed annotated transcripts. Single-threaded, RATs can take up to a few minutes per iteration for large annotations. Fortunately, the task is highly parallelisable:

+

RATs completion time depends on the number of annotated and expressed transcripts. Single-threaded, RATs can take up to a few minutes per iteration for large annotations. Fortunately, the task is highly parallelisable:

# Using 8 threads/cores for parallel computing.
-mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", 
-                  name_B = "patients", threads = 8)
+mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + threads = 8)
  1. threads - The number of threads to use. (Default 1)
-

Due to core R implementation limitations, multi-threading works only in POSIX-compliant systems. In other systems, the option will be ignored and R will run serially. Most Linux distros, Unix versions, and recent OSX versions are supposedly compatible, and there are environments that can be installed on Windows that might circumvent the issue.

+

Due to core R implementation limitations, the type of multi-threading used in RATs works only in POSIX-compliant systems. Refer to the parallel package for details.

Test selection

-

RATs runs both gene-level calls and transcript-level calls. They are complementary and we recommend using them together, but the option to skip either is provided for special use cases. The fields of the skipped test will be filled with NA.

+

RATs runs both gene-level DTU calls and transcript-level DTU calls. They are independent from one another and we consider them complementary and recommend using them together, but the option to skip either is provided for special use cases. The output fields of the skipped test will be filled with NA.

# Transcripts only.
-mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", 
-                  name_B = "patients", testmode="transc")
+mydtu <- call_DTU(annot = myannot, 
+                  boot_data_A= mycond_A, boot_data_B= mycond_B,
+                  testmode="transc")
 # Genes only.
-mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", 
-                  name_B = "patients", testmode="genes")
+mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + testmode="genes")
    -
  1. testmode - Which test(s) to run {“transc”, “genes”, “both”“}. (Default”both“)
  2. +
  3. testmode - Which test(s) to run {“transc”, “genes”, “both”}. (Default “both”)

Correction for multiple testing

Testing multiple null hypotheses increases the chance of one being falsely rejected. To keep the overall false rate at the desired level, the raw p-values must be adjusted. The default adjustment method is BH (Benjamini-Hochberg). A full list of options is listed in R’s p.adjust.methods.

# Bonferroni correction.
-mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", 
-                  name_B = "patients", correction = "bonferroni")
+mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + correction = "bonferroni")
  1. correction - Type of multiple testing correction. (Default “BH”)
@@ -547,67 +500,13 @@

Correction for multiple testing

Input field names

RATs needs to pull information from different fields of the data and annotation and it does so based on the names of the columns. You can override the default names of these fields.

-
# Lets emulate some input with custom field names. 
-sim <- sim_sleuth_data(varname = "mouse", cnames = c("Splinter", "Mickey"), 
-                       COUNTS_COL = "the-counts", TARGET_COL = "transcript", 
-                       PARENT_COL = "gene", BS_TARGET_COL = "trscr")
-myslo <- sim$slo
-myannot <- sim$annot
-
-# Sleuth covariates table.
-print( sim$slo$sample_to_covariates )
-
##      mouse batch
-## 1 Splinter    ba
-## 2   Mickey    ba
-## 3 Splinter    bb
-## 4   Mickey    bb
-
# Sleuth bootstrapped quantifications.
-print( head(sim$slo$kal[[1]]$bootstrap[[1]]) )
-
##      trscr the-counts
-## 1      LC1          3
-## 2     NIA1        333
-## 3     NIA2        666
-## 4   1A1N-1         10
-## 5   1A1N-2         20
-## 6 1D1C:one          0
-
# Annotation.
-print( head(sim$annot) )
-
##   transcript gene
-## 1      NIB.1  NIB
-## 2     1A1N-2 1A1N
-## 3   1D1C:one 1D1C
-## 4   1D1C:two 1D1C
-## 5     1B1C.1 1B1C
-## 6     1B1C.2 1B1C
-
-

Sleuth field names

-

Although we expect Sleuth field names to be constant with the exception of covariate names, RATs allows overriding the expected names. One reason to do this would be to switch between using tpm and est_count abundances.

-
# Call DTU on data with custom field names.
-mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "Splinter", name_B = "Mickey", 
-                  varname="mouse", COUNTS_COL="the-counts", BS_TARGET_COL="trscr", 
-                  verbose = FALSE)
-
-#! In our example data here, the annotation fields have been modified as well, 
-#! so this command will NOT run as it is shown. You will also need the parameters 
-#! shown in teh section below.
-
    -
  1. varname - The field name in myslo$sample_to_covariates where the desired condition names are listed. (Default “condition”)
  2. -
  3. COUNTS_COL - The name of the field holding the abundances in the bootstrap tables of Sleuth. (Default “tpm”)
  4. -
  5. BS_TARGET_COL - The name of the field holding the transcript identifiers in the bootstrap tables of Sleuth. (Default “target_id”)
  6. -
-

The COUNTS_COL and BS_TARGET_COL parameters are also available for denest_sleuth_boots().

-

Annotation field names

Although it is easy to rename the columns of a table to comply with the expected names, this may sometimes be undesirable, so RATs allows you to change the expected names instead.

# Call DTU using annotation with custom field names.
-mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "Splinter", name_B = "Mickey", 
-                  varname="mouse", TARGET_COL="transcript", PARENT_COL="gene", 
-                  verbose = FALSE)
-
-#! In our example data here, the Sleuth fields have been modified as well, 
-#! so this command will NOT run as it is shown. You will also need the parameters 
-#! shown in the section above.
+mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + TARGET_COL="transcript", PARENT_COL="gene")
  1. TARGET_COL - The name of the field holding the transcript identifiers in the annotation data frame. (Default “target_id”)
  2. PARENT_COL - The name of the field holding the respective gene identifiers in the annotation data frame. (Default “parent_id”)
  3. @@ -617,17 +516,20 @@

    Annotation field names

Abundance scaling

-

As mentioned previously, various normalised abundance units (like TPM) are scaled to an arbitrary and usually smaller than actual sample size. This artificially deflates the significances calculated by RATs and reduces the statistical power. If you can infer a more likely number of transcripts in your sequenced samples (based on your sample preparation details), you can specify this scaling factor to “correct” the abundance values.

-
# If your sample preps contain ~70 million transcripts and you are using TPM abundances
-mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", 
-                  name_B = "patients", COUNTS_COL = "tpm", scaling = 70)
+

As mentioned previously, various normalised abundance units (like TPM) are scaled to an arbitrary sample size that is usually smaller than the actual one. This artificially deflates the significances calculated by RATs and reduces the statistical power. The scaling parameter allows you to scale normalised abundances up to the actual library sizes.

+
# If your sample preps contain ~70 million transcripts 
+# and you are using TPM abundances, the scaling factor 
+# is 70M / 1M = 70.
+mydtu <- call_DTU(annot = myannot, 
+                  boot_data_A= mycond_A, boot_data_B= mycond_B,
+                  scaling = 70)

Annotation discrepancies

-

Different annotation versions often preserve transcript IDs, despite altering the details of the transcript models, such as UTRs. It is important to use the same annotation throughout the workflow, especially for the quantifications. Otherwise, the data is not comparable.

-

All internal operations and the output of RATs will be based on the annotation provided:

+

Different annotation versions often preserve transcript IDs, despite altering the details of the transcript model. It is important to use the same annotation throughout the workflow, otherwise the abundances will not be comparable in a meaningful way.

+

All internal operations and the output of RATs are based on the annotation provided:

  • Any transcripts present in the data but missing from the annotation will be ignored completely and will not show up in the output, as there is no reliable way to match them to gene IDs.
  • Any transcript/gene present in the annotation but missing from the data will be included in the output as zero expression.
  • diff --git a/inst/doc/output.R b/inst/doc/output.R index a8fffb1..9685453 100644 --- a/inst/doc/output.R +++ b/inst/doc/output.R @@ -1,24 +1,21 @@ ## ----setup, include=FALSE------------------------------------------------ knitr::opts_chunk$set(echo = TRUE) -## ---- include=FALSE------------------------------------------------------ +## ------------------------------------------------------------------------ library(rats) -## ---- eval=FALSE--------------------------------------------------------- -# library(rats) - ## ------------------------------------------------------------------------ # Simulate some data. -simdat <- sim_sleuth_data(cnames = c("controls", "patients")) -# For convenience let's assign the contents of the list to separate variables. -myslo <- simdat$slo -myannot <- simdat$annot +simdat <- sim_boot_data() +# For convenience let's assign the contents of the list to separate variables +mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition. +mycond_B <- simdat[[3]] # Simulated bootstrapped data for other condition. +myannot <- simdat[[1]] # Transcript and gene IDs for the above data. # Call DTU -mydtu <- call_DTU(annot= myannot, slo= myslo, name_A= "controls", - name_B= "patients", varname= "condition", verbose= FALSE, - dprop_thresh=0.1, qboot=TRUE, rboot=TRUE, - description="Comparison of two conditions using a simulated sleuth object for the purposes of the tutorial. Simulated using built-in functionality of RATs.") +mydtu <- call_DTU(annot= myannot, verbose= FALSE, + boot_data_A= mycond_A, boot_data_B= mycond_B, + dprop_thresh=0.1, qboot=TRUE, rboot=FALSE) ## ------------------------------------------------------------------------ # A really simple tally of the outcome. @@ -61,7 +58,7 @@ print( names(mydtu$Genes) ) print( names(mydtu$Transcripts) ) ## ------------------------------------------------------------------------ -# Elements of ReplicateData +# Elements of Abundances. 
print( names(mydtu$Abundances) ) ## ------------------------------------------------------------------------ diff --git a/inst/doc/output.Rmd b/inst/doc/output.Rmd index 32399d6..f83b5ba 100644 --- a/inst/doc/output.Rmd +++ b/inst/doc/output.Rmd @@ -1,7 +1,7 @@ --- title: 'RATs: Raw Output' author: "Kimon Froussios" -date: "01 JUN 2017" +date: "19 SEP 2017" output: html_document: keep_md: yes @@ -22,11 +22,7 @@ knitr::opts_chunk$set(echo = TRUE) *** -```{r, include=FALSE} -library(rats) -``` - -```{r, eval=FALSE} +```{r} library(rats) ``` @@ -35,20 +31,18 @@ Set up an example. ```{r} # Simulate some data. -simdat <- sim_sleuth_data(cnames = c("controls", "patients")) -# For convenience let's assign the contents of the list to separate variables. -myslo <- simdat$slo -myannot <- simdat$annot +simdat <- sim_boot_data() +# For convenience let's assign the contents of the list to separate variables +mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition. +mycond_B <- simdat[[3]] # Simulated bootstrapped data for other condition. +myannot <- simdat[[1]] # Transcript and gene IDs for the above data. # Call DTU -mydtu <- call_DTU(annot= myannot, slo= myslo, name_A= "controls", - name_B= "patients", varname= "condition", verbose= FALSE, - dprop_thresh=0.1, qboot=TRUE, rboot=TRUE, - description="Comparison of two conditions using a simulated sleuth object for the purposes of the tutorial. Simulated using built-in functionality of RATs.") +mydtu <- call_DTU(annot= myannot, verbose= FALSE, + boot_data_A= mycond_A, boot_data_B= mycond_B, + dprop_thresh=0.1, qboot=TRUE, rboot=FALSE) ``` -We lowered `dprop_thresh` and bypassed the bootstrapping to increase the number of DTU positive -classifications, to better illustrate the summary fucntions below for the given artificial data. 
*** @@ -56,8 +50,9 @@ classifications, to better illustrate the summary fucntions below for the given # Quick results For your convenience, RATs provides a few functions to give you a quick summary of your results. -However, we do recommend you become familiar with the actual results structure and content, so that you -can judge the quality of the DTU calls and trace the reasons behind the classification of each gene or transcript. +However, we do recommend you become familiar with the actual results structure and content, as +this will allow you to better understand the reasons behind your results and enable you to ask +more complex/detailed questions than those covered by the functions provided. ## Summary of DTU @@ -65,19 +60,17 @@ The `dtu_summary()` function lists the total number of genes and transcripts for * DTU: There is significant change in isoform ratios. * non-DTU: No significant change. -* NA: Not applicable. Genes/transcripts with abundance below the noise threshold, or where the gene has only one transcript. +* NA: Not applicable. Genes/transcripts with abundance below the noise threshold, or where the gene has only one known transcript. ```{r} # A really simple tally of the outcome. print( dtu_summary(mydtu) ) ``` -RATs uses *two* separate methods of identifying DTU: The gene-level test works on the isoforms of each gene as a set, -whereas the transcript-level test works on each isoform individually -and then aggregates the isoform results to form a gene-level result. Therefore, as of v0.4.2, `dtu_summury()` +RATs tests for DTU at both the gene level and the transcript level. Therefore, as of v0.4.2, `dtu_summury()` will display the DTU genes according to either method as well as the intersection of the two. -The `get_dtu_ids()` function lists the actual identifiers per category, instead of the numbers in each category. +The `get_dtu_ids()` function lists the coresponding identifiers per category. 
As of `v0.4.2` it uses the same category names as `dtu_summury()` for consistency and clarity. ```{r} @@ -105,7 +98,7 @@ print( ids ) ``` Again, three versions of the results are shown, covering the respective DTU methods in RATs. -`MIX6` is demonstrating a switch in both primary and tertiary isoforms. + ## Summary of DTU plurality @@ -120,7 +113,7 @@ ids <- get_plurality_ids(mydtu) print( ids ) ``` -The categories are named by the number of isoforms affected. There is one gene (`MIX6`) that has `3` isoforms affected. +These give you the number and IDs of genes in which 2, 3, etc isoforms show DTU. *** @@ -143,28 +136,27 @@ print( names(mydtu) ) print( names(mydtu$Parameters) ) ``` -1. `description` - (str) Free-text description of the run. It is useful to record data sources, annotation source and version, experimental parameters... -2. `time` - (str) Date and time for the run. This does not represent the exact time the run started. -3. `rats_version` - (str) The version of RATs. -4. `R_version` - (str) The version of R (including OS architecture). -5. `var_name` - (str) The value passed to the `varname` parameter. -6. `cond_A` & `cond_B` - (str) The values passed to the `name_A` and `name_B` parameters. -7. `data_type` - (str) The format of the input data. -8. `num_replic_A` & `num_replic_B` - (int) The number of samples in each condition. -9. `num_genes` - (int) The number of genes in the provided annotation. -10. `num_transc` - (int) The number of transcripts in the provided annotation. -11. `tests` - (str) The value passed to the `testmode` parameter. -12. `p_thresh` - (num) The value passed to the `p_thresh` parameter. -13. `abund_thresh` - (num) The value passed to the `abund_thresh` parameter. -14. `dprop_thresh` - (num) The value passed to the `dprop_thresh` parameter. -15. `abund_sclaing` - (num) The value passed to the `scaling` parameter. -16. `quant_reprod_thresh` - (num) The value passed to the `qrep_thresh` parameter. -17. 
`quant_boot` - (bool) The value passed to the `qboots` parameter. -18. `quant_bootnum` - (int) The value passed to the `qbootnum` parameter. -19. `rep_reprod_thresh` - (num) The value passed to the `rrep_thresh` parameter. -20. `rep_boots` - (bool) The value passed to the `rboot` parameter. -21. `rep_bootnum` - (int) The number of replicate bootstrapping iterations (usually M*N, where M and N are the number of samples in the two conditions). -22. `rep_reprod_as_crit` - (bool) The value passed to the `rrep_as_crit` parameter. +* `description` - (str) Free-text description of the run. It is useful to record data sources, annotation source and version, experimental parameters... +* `time` - (str) Date and time for the run. This does not represent the exact time the run started. +* `rats_version` - (str) The version of RATs. +* `R_version` - (str) The version of R (including OS architecture). +* `var_name` - (str) The value passed to the `varname` parameter. +* `cond_A` & `cond_B` - (str) The values passed to the `name_A` and `name_B` parameters. +* `data_type` - (str) The format of the input data. +* `num_replic_A` & `num_replic_B` - (int) The number of samples in each condition. +* `num_genes` - (int) The number of genes in the provided annotation. +* `num_transc` - (int) The number of transcripts in the provided annotation. +* `tests` - (str) The value passed to the `testmode` parameter. +* `p_thresh` - (num) The value passed to the `p_thresh` parameter. +* `abund_thresh` - (num) The value passed to the `abund_thresh` parameter. +* `dprop_thresh` - (num) The value passed to the `dprop_thresh` parameter. +* `abund_scaling` - (num) The value passed to the `scaling` parameter. +* `quant_reprod_thresh` - (num) The value passed to the `qrep_thresh` parameter. +* `quant_boot` - (bool) The value passed to the `qboot` parameter. +* `quant_bootnum` - (int) The value passed to the `qbootnum` parameter. 
+* `rep_reprod_thresh` - (num) The value passed to the `rrep_thresh` parameter. +* `rep_boots` - (bool) The value passed to the `rboot` parameter. +* `rep_bootnum` - (int) The number of replicate bootstrapping iterations (usually M*N, where M and N are the number of samples in the two conditions). **Note:** If bootstraps are disabled, the bootstrap-related fields may have `NA` values despite any values that were passed to their respective parameters. @@ -185,33 +177,33 @@ print( names(mydtu$Genes) ) The first few columns show the result of each decision step. The remaining columns list the values based on which the decisions were made. Pseudo-code formulas are shown to help understand how the different fields interact when making decisions. -1. `parent_id` - (str) Gene identifier. -2. `elig` - (bool) Eligible for testing. Whether the gene met the pre-filtering criteria. `= (elig_transc >= 2)`. -3. `sig` - (bool) Statistically significant. `= (pvalAB_corr < Parameters$p_thresh) & (pvalBA_corr < Parameters$p_thresh)`. -4. `elig_fx` - (bool) Eligible effect size. Whether at least one of the isoforms meets the effect size criterion. `= any(Transcript[parent_id, elig_fx])`. -5. `quant_reprod` - (bool) Quantification reproducible. +* `parent_id` - (str) Gene identifier. +* `elig` - (bool) Eligible for testing. Whether the gene met the pre-filtering criteria. `= (elig_transc >= 2)`. +* `sig` - (bool) Statistically significant. `= (pvalAB_corr < Parameters$p_thresh) & (pvalBA_corr < Parameters$p_thresh)`. +* `elig_fx` - (bool) Eligible effect size. Whether at least one of the isoforms meets the effect size criterion. `= any(Transcript[parent_id, elig_fx])`. +* `quant_reprod` - (bool) Quantification reproducible. For positive DTU, `= (quant_dtu_freq >= Parameters$quant_reprod_thresh)`, for non-DTU `= (quant_dtu_freq <= 1 - Parameters$quant_reprod_thresh)`. -6. `rep_reprod` - (bool) Replication reproducible. +* `rep_reprod` - (bool) Replication reproducible. 
For positive DTU, `= (rep_dtu_freq >= Parameters$rep_reprod_thresh)`, for non-DTU `= (rep_dtu_freq <= 1 - Parameters$rep_reprod_thresh)`. -7. `DTU` - (bool) Differential Transcript Usage. `= (sig & elig_fx & quant_reprod)`, or if `rrep_as_crit==TRUE` then `= (sig & elig_fx & quant_reprod & rep_reprod)`. -8. `transc_DTU` - (bool) Aggregated from `Transcripts$DTU`: At least one isoform was individually reported as DTU. `= any(Transcript[parent_id, DTU])`. -9. `known_transc` - (int) Number of known trascripts for this gene according to the given annotation. -10. `detect_trancs` - (int) Number of detected (expressed) transcripts in the given dataset. -11. `elig_transc` - (int) Number of eligible transcripts, aggregated from `Transcripts$elig`. -12. `pval` - (num) G test of independence p-value for the isoform ratios. -13. `pval_corr` - (num) Multiple testing corrected p-value. `pval_corr= p.adjust(pval, Parameters$correction)`. -14. `quant_p_mean` - (num) Mean p-value across quantification bootstraps. -15. `quant_p_stdev` - (num) Standard deviation of p-values across quantification bootstraps. -16. `quant_p_min` - (num) Minimum observed p-value across quantification bootstraps. -17. `quant_p_max` - (num) Maximum observed p-value across quantification bootstraps. -18. `quant_na_freq` - (num) Fraction of quantification iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). -19. `quant_dtu_freq` - (num) Fraction of replicate iterations that support a positive `DTU` classification. -20. `rep_p_mean` - (num) Mean p-value across replication bootstraps. -21. `rep_p_stdev` - (num) Standard deviation of p-values across replication bootstraps. -22. `rep_p_min` - (num) Minimum observed p-value across replication bootstraps. -23. `rep_p_max` - (num) Maximum observed p-value across replication bootstraps. -24. `rep_na_freq` - (num) Fraction of replication iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). -25. 
`rep_dtu_freq` - (num) Fraction of replication iterations that support a positive `DTU` classification. +* `DTU` - (bool) Differential Transcript Usage. `= (sig & elig_fx & quant_reprod)`, or if `rrep_as_crit==TRUE` then `= (sig & elig_fx & quant_reprod & rep_reprod)`. +* `transc_DTU` - (bool) Aggregated from `Transcripts$DTU`: At least one isoform was individually reported as DTU. `= any(Transcript[parent_id, DTU])`. +* `known_transc` - (int) Number of known trascripts for this gene according to the given annotation. +* `detect_trancs` - (int) Number of detected (expressed) transcripts in the given dataset. +* `elig_transc` - (int) Number of eligible transcripts, aggregated from `Transcripts$elig`. +* `pval` - (num) G test of independence p-value for the isoform ratios. +* `pval_corr` - (num) Multiple testing corrected p-value. `pval_corr= p.adjust(pval, Parameters$correction)`. +* `quant_p_mean` - (num) Mean p-value across quantification bootstraps. +* `quant_p_stdev` - (num) Standard deviation of p-values across quantification bootstraps. +* `quant_p_min` - (num) Minimum observed p-value across quantification bootstraps. +* `quant_p_max` - (num) Maximum observed p-value across quantification bootstraps. +* `quant_na_freq` - (num) Fraction of quantification iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). +* `quant_dtu_freq` - (num) Fraction of replicate iterations that support a positive `DTU` classification. +* `rep_p_mean` - (num) Mean p-value across replication bootstraps. +* `rep_p_stdev` - (num) Standard deviation of p-values across replication bootstraps. +* `rep_p_min` - (num) Minimum observed p-value across replication bootstraps. +* `rep_p_max` - (num) Maximum observed p-value across replication bootstraps. +* `rep_na_freq` - (num) Fraction of replication iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). 
+* `rep_dtu_freq` - (num) Fraction of replication iterations that support a positive `DTU` classification. **Note:** The fields reporting on the bootstraps will not be shown when bootstrapping is disabled. @@ -228,39 +220,39 @@ The maximum likelihood-based G-test of independence is used to compare each give print( names(mydtu$Transcripts) ) ``` -1. `target_id` - (str) Transcript identifier. -2. `parent_id` - (str) Gene identifier. -3. `elig_xp` - (bool) Eligible expression level. `= (meanA >= Parameters$abund_thresh | meanB >= Parameters$abund_thresh)`. -4. `elig` - (bool) Eligible for testing (meets the noise threshold and at least one other isoform is expressed). `= (elig_xp & totalA != 0 & totalB != 0 & (sumA != totalA | sumB != totalB))`. -5. `sig` - (bool) Statistically significant. `= (pval_corr < Parameters$p_thresh)`. -6. `elig_fx` - (bool) Eligible effect size. Proxy for biological significance. `= (Dprop > Parameters$dprop_thresh)`. +* `target_id` - (str) Transcript identifier. +* `parent_id` - (str) Gene identifier. +* `elig_xp` - (bool) Eligible expression level. `= (meanA >= Parameters$abund_thresh | meanB >= Parameters$abund_thresh)`. +* `elig` - (bool) Eligible for testing (meets the noise threshold and at least one other isoform is expressed). `= (elig_xp & totalA != 0 & totalB != 0 & (sumA != totalA | sumB != totalB))`. +* `sig` - (bool) Statistically significant. `= (pval_corr < Parameters$p_thresh)`. +* `elig_fx` - (bool) Eligible effect size. Proxy for biological significance. `= (Dprop > Parameters$dprop_thresh)`. 7. `quant_reprod` - (bool) Quantification reproducible. For positive DTU, `= (quant_dtu_freq >= Parameters$quant_reprod_thresh)`, for non-DTU `= (quant_dtu_freq <= 1 - Parameters$quant_reprod_thresh)`. -8. `rep_reprod` - (bool) Replication reproducible. +* `rep_reprod` - (bool) Replication reproducible. 
For positive DTU, `= (rep_dtu_freq >= Parameters$rep_reprod_thresh)`, for non-DTU `= (rep_dtu_freq <= 1 - Parameters$rep_reprod_thresh)`. -9. `DTU` - (bool) The Transcript is Differentially Used. `= (sig & elig_fx & quant_reprod)`, or if `rrep_as_crit==TRUE` then `= (sig & elig_fx & quant_reprod & rep_reprod)`. -10. `gene_DTU` - (bool) Expanded from `Genes$DTU`. Indicates that the gene as a whole shows significant change in isoform ratios. `= (Genes[parent_id, DTU])`. -11. `meanA` and `meanB` - (num) The mean abundance across replicates, for each condition. -12. `stdevA` and `stdevB` - (num) The standard deviation of the abundance across the replicates. -13. `sumA` and `sumB` - (num) The sum of the abundances across replicates. This is the value used for the tests, so that replication level informs the significance. -14. `log2FC` - (num) lgo2 of fold-change of transcript abundance: `= (sumB / sumA)`. sumA and sumB are NOT normalised for sequencing depth, so when log2FC is plotted the distribution may not be centred on 0. If it is not centred on 0, do NOT use this log2FC for differential transcript expression analysis! -15. `totalA` and `totalB` - (num) The total abundance for the gene: `totalA= sum(transcripts[parent_id, sumA])`. -16. `propA` and `propB` - (num) The proportion of the gene expression owed to this transcript. `propA= sumA / totalA`. -17. `Dprop` - (num) The difference in the proportion of the transcript between the two conditions (effect size). `= (probB - propA)`. -18. `pval` - (num) The proportion equality test P-value for the transcript. -19. `pval_corr` - (num) Multiple testing corrected p-value. `= p.adjust(pval, Parameters$correction)`. -20. `quant_p_mean` - (num) Mean p-value across quantification bootstraps. -21. `quant_p_stdev` - (num) Standard deviation of p-values across quantification bootstraps. -22. `quant_p_min` - (num) Minimum observed p-value across quantification bootstraps. -23. 
`quant_p_max` - (num) Maximum observed p-value across quantification bootstraps. -24. `quant_na_freq` - (num) Fraction of quantification iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). -25. `quant_dtu_freq` - (num) Fraction of quantification iterations that support a positive `DTU` classification. -26. `rep_p_mean` - (num) Mean p-value across replication bootstraps. -27. `rep_p_stdev` - (num) Standard deviation of p-values across replication bootstraps. -28. `rep_p_min` - (num) Minimum observed p-value across replication bootstraps. -29. `rep_p_max` - (num) Maximum observed p-value across replication bootstraps. -30. `rep_na_freq` - (num) Fraction of replication iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). -31. `rep_dtu_freq` - (num) Fraction of replication iterations that support a positive `DTU` classification. +* `DTU` - (bool) The Transcript is Differentially Used. `= (sig & elig_fx & quant_reprod)`, or if `rrep_as_crit==TRUE` then `= (sig & elig_fx & quant_reprod & rep_reprod)`. +* `gene_DTU` - (bool) Expanded from `Genes$DTU`. Indicates that the gene as a whole shows significant change in isoform ratios. `= (Genes[parent_id, DTU])`. +* `meanA` and `meanB` - (num) The mean abundance across replicates, for each condition. +* `stdevA` and `stdevB` - (num) The standard deviation of the abundance across the replicates. +* `sumA` and `sumB` - (num) The sum of the abundances across replicates. This is the value used for the tests, so that replication level informs the significance. +* `log2FC` - (num) log2 of fold-change of transcript abundance: `= (sumB / sumA)`. sumA and sumB are NOT normalised for sequencing depth, so when log2FC is plotted the distribution may not be centred on 0. If it is not centred on 0, do NOT use this log2FC for differential transcript expression analysis! 
+* `totalA` and `totalB` - (num) The total abundance for the gene: `totalA= sum(transcripts[parent_id, sumA])`. +* `propA` and `propB` - (num) The proportion of the gene expression owed to this transcript. `propA= sumA / totalA`. +* `Dprop` - (num) The difference in the proportion of the transcript between the two conditions (effect size). `= (propB - propA)`. +* `pval` - (num) The proportion equality test P-value for the transcript. +* `pval_corr` - (num) Multiple testing corrected p-value. `= p.adjust(pval, Parameters$correction)`. +* `quant_p_mean` - (num) Mean p-value across quantification bootstraps. +* `quant_p_stdev` - (num) Standard deviation of p-values across quantification bootstraps. +* `quant_p_min` - (num) Minimum observed p-value across quantification bootstraps. +* `quant_p_max` - (num) Maximum observed p-value across quantification bootstraps. +* `quant_na_freq` - (num) Fraction of quantification iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). +* `quant_dtu_freq` - (num) Fraction of quantification iterations that support a positive `DTU` classification. +* `rep_p_mean` - (num) Mean p-value across replication bootstraps. +* `rep_p_stdev` - (num) Standard deviation of p-values across replication bootstraps. +* `rep_p_min` - (num) Minimum observed p-value across replication bootstraps. +* `rep_p_max` - (num) Maximum observed p-value across replication bootstraps. +* `rep_na_freq` - (num) Fraction of replication iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). +* `rep_dtu_freq` - (num) Fraction of replication iterations that support a positive `DTU` classification. **Note:** The fields reporting on the bootstraps will not be shown when bootstrapping is disabled. @@ -271,9 +263,8 @@ For positive DTU, `= (rep_dtu_freq >= Parameters$rep_reprod_thresh)`, for non-DT so if bootstrapped data was used, these values are the means across iterations. 
If plain abundances were provided as input, then `Abundances` essentially contains the input data. These abundances are included in the output because they are required for some of RATs' plotting options. - ```{r} -# Elements of ReplicateData +# Elements of Abundances. print( names(mydtu$Abundances) ) ``` diff --git a/inst/doc/output.html b/inst/doc/output.html index 0eefc88..60f8c5a 100644 --- a/inst/doc/output.html +++ b/inst/doc/output.html @@ -11,7 +11,7 @@ - + RATs: Raw Output @@ -217,7 +217,7 @@

    RATs: Raw Output

    Kimon Froussios

    -

    01 JUN 2017

    +

    19 SEP 2017

@@ -226,69 +226,68 @@

01 JUN 2017

library(rats)

Set up an example.

# Simulate some data.
-simdat <- sim_sleuth_data(cnames = c("controls", "patients")) 
-# For convenience let's assign the contents of the list to separate variables.
-myslo <- simdat$slo
-myannot <- simdat$annot
+simdat <- sim_boot_data() 
+# For convenience let's assign the contents of the list to separate variables
+mycond_A <- simdat[[2]]   # Simulated bootstrapped data for one condition.
+mycond_B <- simdat[[3]]   # Simulated bootstrapped data for other condition.
+myannot <- simdat[[1]]    # Transcript and gene IDs for the above data.
 
 # Call DTU
-mydtu <- call_DTU(annot= myannot, slo= myslo, name_A= "controls", 
-                  name_B= "patients", varname= "condition", verbose= FALSE, 
-                  dprop_thresh=0.1, qboot=TRUE, rboot=TRUE,
-                  description="Comparison of two conditions using a simulated sleuth object for the purposes of the tutorial. Simulated using built-in functionality of RATs.")
-

We lowered dprop_thresh and bypassed the bootstrapping to increase the number of DTU positive classifications, to better illustrate the summary fucntions below for the given artificial data.

+mydtu <- call_DTU(annot= myannot, verbose= FALSE, + boot_data_A= mycond_A, boot_data_B= mycond_B, + dprop_thresh=0.1, qboot=TRUE, rboot=FALSE)

Quick results

-

For your convenience, RATs provides a few functions to give you a quick summary of your results. However, we do recommend you become familiar with the actual results structure and content, so that you can judge the quality of the DTU calls and trace the reasons behind the classification of each gene or transcript.

+

For your convenience, RATs provides a few functions to give you a quick summary of your results. However, we do recommend you become familiar with the actual results structure and content, as this will allow you to better understand the reasons behind your results and enable you to ask more complex/detailed questions than those covered by the functions provided.

Summary of DTU

The dtu_summary() function lists the total number of genes and transcripts for each of 3 categories:

  • DTU: There is significant change in isoform ratios.
  • non-DTU: No significant change.
  • -
  • NA: Not applicable. Genes/transcripts with abundance below the noise threshold, or where the gene has only one transcript.
  • +
  • NA: Not applicable. Genes/transcripts with abundance below the noise threshold, or where the gene has only one known transcript.
# A really simple tally of the outcome.
 print( dtu_summary(mydtu) )
##                        category tally
-## 1         DTU genes (gene test)     0
-## 2     non-DTU genes (gene test)     3
+## 1         DTU genes (gene test)     1
+## 2     non-DTU genes (gene test)     2
 ## 3          NA genes (gene test)     7
-## 4      DTU genes (transc. test)     0
+## 4      DTU genes (transc. test)     1
 ## 5  non-DTU genes (transc. test)     2
-## 6       NA genes (transc. test)     8
-## 7        DTU genes (both tests)     0
+## 6       NA genes (transc. test)     7
+## 7        DTU genes (both tests)     1
 ## 8    non-DTU genes (both tests)     2
 ## 9         NA genes (both tests)     7
-## 10              DTU transcripts     0
-## 11          non-DTU transcripts    10
+## 10              DTU transcripts     3
+## 11          non-DTU transcripts     7
 ## 12               NA transcripts    11
-

RATs uses two separate methods of identifying DTU: The gene-level test works on the isoforms of each gene as a set, whereas the transcript-level test works on each isoform individually and then aggregates the isoform results to form a gene-level result. Therefore, as of v0.4.2, dtu_summury() will display the DTU genes according to either method as well as the intersection of the two.

-

The get_dtu_ids() function lists the actual identifiers per category, instead of the numbers in each category. As of v0.4.2 it uses the same category names as dtu_summury() for consistency and clarity.

+

RATs tests for DTU at both the gene level and the transcript level. Therefore, as of v0.4.2, dtu_summary() will display the DTU genes according to either method as well as the intersection of the two.

+

The get_dtu_ids() function lists the corresponding identifiers per category. As of v0.4.2 it uses the same category names as dtu_summary() for consistency and clarity.

# Gene and transcript IDs corresponding to the tally above.
 ids <- get_dtu_ids(mydtu)
 print( ids )
## $`DTU genes (gene test)`
-## character(0)
+## [1] "MIX6"
 ## 
 ## $`non-DTU genes (gene test)`
-## [1] "MIX6" "CC"   "NN"  
+## [1] "CC" "NN"
 ## 
 ## $`NA genes (gene test)`
 ## [1] "LC"   "1A1N" "1B1C" "1D1C" "ALLA" "ALLB" "NIB" 
 ## 
 ## $`DTU genes (transc. test)`
-## character(0)
+## [1] "MIX6"
 ## 
 ## $`non-DTU genes (transc. test)`
 ## [1] "CC" "NN"
 ## 
 ## $`NA genes (transc. test)`
-## [1] "MIX6" "LC"   "1A1N" "1B1C" "1D1C" "ALLA" "ALLB" "NIB" 
+## [1] "LC"   "1A1N" "1B1C" "1D1C" "ALLA" "ALLB" "NIB" 
 ## 
 ## $`DTU genes (both tests)`
-## character(0)
+## [1] "MIX6"
 ## 
 ## $`non-DTU genes (both tests)`
 ## [1] "CC" "NN"
@@ -297,11 +296,10 @@ 

Summary of DTU

## [1] "LC" "1A1N" "1B1C" "1D1C" "ALLA" "ALLB" "NIB" ## ## $`DTU transcripts` -## character(0) +## [1] "MIX6.c1" "MIX6.c2" "MIX6.c4" ## ## $`non-DTU transcripts` -## [1] "MIX6.c1" "MIX6.c2" "MIX6.c4" "LC2" "CC_a" "CC_b" "MIX6.c3" -## [8] "2NN" "1NN" "MIX6.nc" +## [1] "LC2" "CC_a" "CC_b" "MIX6.c3" "2NN" "1NN" "MIX6.nc" ## ## $`NA transcripts` ## [1] "LC1" "1A1N-2" "1B1C.1" "1B1C.2" "1D1C:one" "1D1C:two" @@ -324,36 +322,37 @@

Summary of isoform switching

ids <- get_switch_ids(mydtu) print( ids )
## $`Primary switch (gene test)`
-## [1] NA
+## [1] "MIX6"
 ## 
 ## $`Non-primary switch (gene test)`
-## [1] NA
+## [1] "MIX6"
 ## 
 ## $`Primary switch (transc. test)`
-## [1] NA
+## [1] "MIX6"
 ## 
 ## $`Non-primary switch (transc. test)`
-## [1] NA
+## [1] "MIX6"
 ## 
 ## $`Primary switch (both tests)`
-## [1] NA
+## [1] "MIX6"
 ## 
 ## $`Non-primary switch (both tests)`
-## [1] NA
-

Again, three versions of the results are shown, covering the respective DTU methods in RATs. MIX6 is demonstrating a switch in both primary and tertiary isoforms.

+## [1] "MIX6" +

Again, three versions of the results are shown, covering the respective DTU methods in RATs.

Summary of DTU plurality

In case you want to know how many isoforms are affected per gene.

# A tally of genes switching isoform ranks.
 print( dtu_plurality_summary(mydtu) )
-
## [1] isof_affected
-## <0 rows> (or 0-length row.names)
+
##   isof_affected num_of_genes
+## 1             3            1
# The gene IDs displaying isoform switching.
 ids <- get_plurality_ids(mydtu)
 print( ids )
-
## named list()
-

The categories are named by the number of isoforms affected. There is one gene (MIX6) that has 3 isoforms affected.

+
## $`3`
+## [1] "MIX6"
+

These give you the number and IDs of genes in which 2, 3, etc. isoforms show DTU.


@@ -374,8 +373,8 @@

Parameters

## [13] "tests" "p_thresh" "abund_thresh" ## [16] "dprop_thresh" "abund_scaling" "quant_reprod_thresh" ## [19] "quant_boot" "quant_bootnum" "rep_reprod_thresh" -## [22] "rep_boot" "rep_bootnum" "rep_reprod_as_crit" -
    +## [22] "rep_boot" "rep_bootnum" +
    • description - (str) Free-text description of the run. It is useful to record data sources, annotation source and version, experimental parameters…
    • time - (str) Date and time for the run. This does not represent the exact time the run started.
    • rats_version - (str) The version of RATs.
    • @@ -397,8 +396,7 @@

      Parameters

    • rep_reprod_thresh - (num) The value passed to the rrep_thresh parameter.
    • rep_boots - (bool) The value passed to the rboot parameter.
    • rep_bootnum - (int) The number of replicate bootstrapping iterations (usually M*N, where M and N are the number of samples in the two conditions).
    • -
    • rep_reprod_as_crit - (bool) The value passed to the rrep_as_crit parameter.
    • -
+

Note: If bootstraps are disabled, the bootstrap-related fields may have NA values despite any values that were passed to their respective parameters.

@@ -408,14 +406,12 @@

Genes

# Genes table's fields.
 print( names(mydtu$Genes) )
##  [1] "parent_id"      "elig"           "sig"            "elig_fx"       
-##  [5] "quant_reprod"   "rep_reprod"     "DTU"            "transc_DTU"    
-##  [9] "known_transc"   "detect_transc"  "elig_transc"    "maxDprop"      
-## [13] "pval"           "pval_corr"      "quant_p_mean"   "quant_p_stdev" 
-## [17] "quant_p_min"    "quant_p_max"    "quant_na_freq"  "quant_dtu_freq"
-## [21] "rep_p_mean"     "rep_p_stdev"    "rep_p_min"      "rep_p_max"     
-## [25] "rep_na_freq"    "rep_dtu_freq"
+## [5] "quant_reprod" "DTU" "transc_DTU" "known_transc" +## [9] "detect_transc" "elig_transc" "maxDprop" "pval" +## [13] "pval_corr" "quant_p_mean" "quant_p_stdev" "quant_p_min" +## [17] "quant_p_max" "quant_na_freq" "quant_dtu_freq"

The first few columns show the result of each decision step. The remaining columns list the values based on which the decisions were made. Pseudo-code formulas are shown to help understand how the different fields interact when making decisions.

-
    +
    • parent_id - (str) Gene identifier.
    • elig - (bool) Eligible for testing. Whether the gene met the pre-filtering criteria. = (elig_transc >= 2).
    • sig - (bool) Statistically significant. = (pvalAB_corr < Parameters$p_thresh) & (pvalBA_corr < Parameters$p_thresh).
    • @@ -441,7 +437,7 @@

      Genes

    • rep_p_max - (num) Maximum observed p-value across replication bootstraps.
    • rep_na_freq - (num) Fraction of replication iterations in which DTU could not be determined (not meeting pre-filtering criteria).
    • rep_dtu_freq - (num) Fraction of replication iterations that support a positive DTU classification.
    • -
+

Note: The fields reporting on the bootstraps will not be shown when bootstrapping is disabled.

@@ -451,25 +447,27 @@

Transcripts

# Transcripts table's fields.
 print( names(mydtu$Transcripts) )
##  [1] "target_id"      "parent_id"      "elig_xp"        "elig"          
-##  [5] "sig"            "elig_fx"        "quant_reprod"   "rep_reprod"    
-##  [9] "DTU"            "gene_DTU"       "meanA"          "meanB"         
-## [13] "stdevA"         "stdevB"         "sumA"           "sumB"          
-## [17] "log2FC"         "totalA"         "totalB"         "propA"         
-## [21] "propB"          "Dprop"          "pval"           "pval_corr"     
-## [25] "quant_p_mean"   "quant_p_stdev"  "quant_p_min"    "quant_p_max"   
-## [29] "quant_na_freq"  "quant_dtu_freq" "rep_p_mean"     "rep_p_stdev"   
-## [33] "rep_p_min"      "rep_p_max"      "rep_na_freq"    "rep_dtu_freq"
-
    +## [5] "sig" "elig_fx" "quant_reprod" "DTU" +## [9] "gene_DTU" "meanA" "meanB" "stdevA" +## [13] "stdevB" "sumA" "sumB" "log2FC" +## [17] "totalA" "totalB" "propA" "propB" +## [21] "Dprop" "pval" "pval_corr" "quant_p_mean" +## [25] "quant_p_stdev" "quant_p_min" "quant_p_max" "quant_na_freq" +## [29] "quant_dtu_freq" +
    • target_id - (str) Transcript identifier.
    • parent_id - (str) Gene identifier.
    • elig_xp - (bool) Eligible expression level. = (meanA >= Parameters$abund_thresh | meanB >= Parameters$abund_thresh).
    • elig - (bool) Eligible for testing (meets the noise threshold and at least one other isoform is expressed). = (elig_xp & totalA != 0 & totalB != 0 & (sumA != totalA | sumB != totalB)).
    • sig - (bool) Statistically significant. = (pval_corr < Parameters$p_thresh).
    • elig_fx - (bool) Eligible effect size. Proxy for biological significance. = (Dprop > Parameters$dprop_thresh).
    • +
    +
    1. quant_reprod - (bool) Quantification reproducible. For positive DTU, = (quant_dtu_freq >= Parameters$quant_reprod_thresh), for non-DTU = (quant_dtu_freq <= 1 - Parameters$quant_reprod_thresh).
    2. +
    +
    • rep_reprod - (bool) Replication reproducible. For positive DTU, = (rep_dtu_freq >= Parameters$rep_reprod_thresh), for non-DTU = (rep_dtu_freq <= 1 - Parameters$rep_reprod_thresh).
    • -
    • DTU - (bool) The Transcript is Differentially Used. = (sig & elig_fx & quant_reprod), or if rrep_as_crit==TRUE then = (sig & elig_fx & quant_reprod & rep_reprod).
    • -
    • gene_DTU - (bool) Expanded from Genes$DTU. Indicates that the gene as a whole shows significant change in isoform ratios. = (Genes[parent_id, DTU]).
    • +
    • DTU - (bool) The Transcript is Differentially Used. = (sig & elig_fx & quant_reprod), or if rrep_as_crit==TRUE then = (sig & elig_fx & quant_reprod & rep_reprod).
    • gene_DTU - (bool) Expanded from Genes$DTU. Indicates that the gene as a whole shows significant change in isoform ratios. = (Genes[parent_id, DTU]).
    • meanA and meanB - (num) The mean abundance across replicates, for each condition.
    • stdevA and stdevB - (num) The standard deviation of the abundance across the replicates.
    • sumA and sumB - (num) The sum of the abundances across replicates. This is the value used for the tests, so that replication level informs the significance.
    • @@ -491,13 +489,13 @@

      Transcripts

    • rep_p_max - (num) Maximum observed p-value across replication bootstraps.
    • rep_na_freq - (num) Fraction of replication iterations in which DTU could not be determined (not meeting pre-filtering criteria).
    • rep_dtu_freq - (num) Fraction of replication iterations that support a positive DTU classification.
    • -
+

Note: The fields reporting on the bootstraps will not be shown when bootstrapping is disabled.

Abundances

Abundances is a list of two data.tables, one for each condition. Each transcript is represented by a single abundance value per replicate, so if bootstrapped data was used, these values are the means across iterations. If plain abundances were provided as input, then Abundances essentially contains the input data. These abundances are included in the output because they are required for some of RATs’ plotting options.

-
# Elements of ReplicateData
+
# Elements of Abundances.
 print( names(mydtu$Abundances) )
## [1] "condA" "condB"
    diff --git a/inst/doc/plots.R b/inst/doc/plots.R index 21f592d..e028214 100644 --- a/inst/doc/plots.R +++ b/inst/doc/plots.R @@ -9,15 +9,16 @@ library(rats) ## ------------------------------------------------------------------------ # Simulate some data. -simdat <- sim_sleuth_data(cnames = c("controls", "patients")) -# For convenience let's assign the contents of the list to separate variables. -myslo <- simdat$slo -myannot <- simdat$annot +simdat <- sim_boot_data() +# For convenience let's assign the contents of the list to separate variables +mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition. +mycond_B <- simdat[[3]] # Simulated bootstrapped data for other condition. +myannot <- simdat[[1]] # Transcript and gene IDs for the above data. # Call DTU -mydtu <- call_DTU(annot= myannot, slo= myslo, name_A= "controls", name_B= "patients", - varname= "condition", verbose= FALSE, - description="Comparison of two conditions using a simulated sleuth object for the purposes of the tutorial. Simulated using built-in functionality of RATs.") +mydtu <- call_DTU(annot= myannot, verbose= FALSE, + boot_data_A= mycond_A, boot_data_B= mycond_B, + dprop_thresh=0.1, qboot=TRUE, rboot=FALSE) ## ------------------------------------------------------------------------ # Grouping by condition (DEFAULT): diff --git a/inst/doc/plots.Rmd b/inst/doc/plots.Rmd index 8b8a83d..aa29347 100644 --- a/inst/doc/plots.Rmd +++ b/inst/doc/plots.Rmd @@ -1,7 +1,7 @@ --- title: 'RATs: Plots' author: "Kimon Froussios" -date: "01 JUN 2017" +date: "19 SEP 2017" output: html_document: keep_md: yes @@ -35,15 +35,16 @@ Set up an example. ```{r} # Simulate some data. -simdat <- sim_sleuth_data(cnames = c("controls", "patients")) -# For convenience let's assign the contents of the list to separate variables. 
-myslo <- simdat$slo -myannot <- simdat$annot +simdat <- sim_boot_data() +# For convenience let's assign the contents of the list to separate variables +mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition. +mycond_B <- simdat[[3]] # Simulated bootstrapped data for other condition. +myannot <- simdat[[1]] # Transcript and gene IDs for the above data. # Call DTU -mydtu <- call_DTU(annot= myannot, slo= myslo, name_A= "controls", name_B= "patients", - varname= "condition", verbose= FALSE, - description="Comparison of two conditions using a simulated sleuth object for the purposes of the tutorial. Simulated using built-in functionality of RATs.") +mydtu <- call_DTU(annot= myannot, verbose= FALSE, + boot_data_A= mycond_A, boot_data_B= mycond_B, + dprop_thresh=0.1, qboot=TRUE, rboot=FALSE) ``` @@ -78,19 +79,19 @@ plot_gene(mydtu, "MIX6", style="bycondition") The top two facets show the absolute abundances (counts) of the isoforms as supplied in the input, the bottom two show the correspondng relative abundances (proportions). These are split by condition: The left two facets refer to one condition, the right two ones to the other condition. The boxplots describe the abundance measurements of each isoform across replicates. +The actual measurements are overlayed as points and connected by lines so as to group the set of isoform abundances measured in each replicate. +This highlights the level of consistency between the replicates. The points and lines are placed slightly off-centre to prevent replicates +masking one another when the measurements are very similar. -The actual measurements are overlayed as points and connected by lines so as to group the set of isoform abundances measured in each replicate. This highlights the level of -consistency between the replicates. The points and lines are placed slightly off-centre to prevent replicates masking one another when -the measurements are very similar. 
The colours of the replicates are recycled between the two conditions, to reduce the required palette and -boost contrast. So in the example, there are 4 samples: controls-1, controls-2, patients-1 and patients-2. +The colours of the replicates are recycled between the two conditions, to reducethe required palette and boost contrast. +So in the example, there are 4 samples: controls-1, controls-2, patients-1 and patients-2. In the past, the fill colour encoded the DTU outcome, but to keep things more consistent with the other styles the DTU is now encoded as point shape while the fill colour echoes the condition. In this example, isoforms `.c1` and `.c2` change significantly, isoforms `.c3`, `.c4` and `.nc` do not change significantly -and isoform `.d` could not be tested. It is not always possible to see the fill colour, depending on the spead of the measurements, so -refering back to the RATs output tables may be necessary. +and isoform `.d` could not be tested. -This structure is great for seeing changes in the isoform distribution profile of a gene. A more minimalist version of the plot, without the boxpltos, +This structure is great for seeing changes in the isoform distribution profile of a gene. 
A more minimalist version of this, without the boxpltos, can also be obtained: ```{r, eval=FALSE} @@ -98,7 +99,7 @@ can also be obtained: plot_gene(mydtu, "MIX6", style="lines") ``` -When there are many lines or many isoforms, it may be preferable to group abundances by isoform, making individual comparisons easier: +When there are many replicates or many isoforms, it may be preferable to group abundances by isoform, making individual comparisons easier: ```{r} # Grouping by isoform: @@ -143,7 +144,7 @@ This is what these look like on a larger dataset: The gene version of the effect size distribution is often bi-modal with the positive peak being larger than the negative, because when a gene has isoforms with the same absolute effect size (for example genes with two isoforms) -the positive value is selected. +the positive value is deterministically selected. Although fold-changes of transcript expression are not tied to DTU, if your input counts are suitably normalised between conditions, you can plot the traditional FC volcano and the relationship between FC and proportion change. @@ -162,7 +163,7 @@ plot_overview(mydtu, type="fcVSdprop") ## Interactive plots -If you prefer picking points from a plot rather than sorting through tables, the volcano plot is also available through +If you prefer picking points from a plot rather than sorting through tables, the gene-level volcano plot is also available through an interactive `shiny` app, that pulls up the relevant `Gene` and `Transcript` results and the isoform abundance plot for any volcano point you select. @@ -210,30 +211,19 @@ plot_gene(mydtu, "MIX6", style="bycondition", fillby="condition", replcolvec=c("green", "darkgreen")) ``` -1. `isofcolvec` - Colour vector for isoform highlighting. Used to build a colorRampPalette. -2. `dtucolvec` - Colour vector for DTU highlighting. -3. `condcolvec` - Colour vector for condition highlighting. -4. `replcolvec` - Colour vector for replicate highlighting. 
Used to build a colorRampPalette. -5. `nonecol` - Colour to use when no colour coding is wanted. +* `isofcolvec` - Colour vector for isoform highlighting. Used to build a colorRampPalette. +* `dtucolvec` - Colour vector for DTU highlighting. +* `condcolvec` - Colour vector for condition highlighting. +* `replcolvec` - Colour vector for replicate highlighting. Used to build a colorRampPalette. +* `nonecol` - Colour to use when no colour coding is wanted. ## General customisations for all plots -You can save any of the gene and overview plots as a `ggplot2` object and use [ggplot2](http://ggplot2.org) manipulations on it, such as changing the axis scales. -Other `ggplot2` customisations include the axis tick marks, axis values, labels, titles, colours... Consult the [ggplot2](http://ggplot2.org) +You can save any of the gene and overview plots as a `ggplot2` object and use ggplot2 manipulations on it, such as +the axis tick marks, axis values, labels, titles, colours... Consult the [ggplot2](http://ggplot2.org) documentation for more help on these. - - - - - - - - - - - *** diff --git a/inst/doc/plots.html b/inst/doc/plots.html index 2ce6c20..e49f71f 100644 --- a/inst/doc/plots.html +++ b/inst/doc/plots.html @@ -11,7 +11,7 @@ - + RATs: Plots @@ -217,7 +217,7 @@

    RATs: Plots

    Kimon Froussios

    -

    01 JUN 2017

    +

    19 SEP 2017

@@ -226,15 +226,16 @@

01 JUN 2017

library(rats)

Set up an example.

# Simulate some data.
-simdat <- sim_sleuth_data(cnames = c("controls", "patients")) 
-# For convenience let's assign the contents of the list to separate variables.
-myslo <- simdat$slo
-myannot <- simdat$annot
+simdat <- sim_boot_data() 
+# For convenience let's assign the contents of the list to separate variables
+mycond_A <- simdat[[2]]   # Simulated bootstrapped data for one condition.
+mycond_B <- simdat[[3]]   # Simulated bootstrapped data for other condition.
+myannot <- simdat[[1]]    # Transcript and gene IDs for the above data.
 
 # Call DTU
-mydtu <- call_DTU(annot= myannot, slo= myslo, name_A= "controls", name_B= "patients", 
-                  varname= "condition", verbose= FALSE,
-                  description="Comparison of two conditions using a simulated sleuth object for the purposes of the tutorial. Simulated using built-in functionality of RATs.")
+mydtu <- call_DTU(annot= myannot, verbose= FALSE, + boot_data_A= mycond_A, boot_data_B= mycond_B, + dprop_thresh=0.1, qboot=TRUE, rboot=FALSE)

Visualisation of results

@@ -246,17 +247,17 @@

Isoform abundances for a given gene

There are several styles for this plot, depending on your preferences. The default plot format includes the most information:

# Grouping by condition (DEFAULT):
 plot_gene(mydtu, "MIX6", style="bycondition")
-

-

The top two facets show the absolute abundances (counts) of the isoforms as supplied in the input, the bottom two show the correspondng relative abundances (proportions). These are split by condition: The left two facets refer to one condition, the right two ones to the other condition. The boxplots describe the abundance measurements of each isoform across replicates.

-

The actual measurements are overlayed as points and connected by lines so as to group the set of isoform abundances measured in each replicate. This highlights the level of consistency between the replicates. The points and lines are placed slightly off-centre to prevent replicates masking one another when the measurements are very similar. The colours of the replicates are recycled between the two conditions, to reduce the required palette and boost contrast. So in the example, there are 4 samples: controls-1, controls-2, patients-1 and patients-2.

-

In the past, the fill colour encoded the DTU outcome, but to keep things more consistent with the other styles the DTU is now encoded as point shape while the fill colour echoes the condition. In this example, isoforms .c1 and .c2 change significantly, isoforms .c3, .c4 and .nc do not change significantly and isoform .d could not be tested. It is not always possible to see the fill colour, depending on the spead of the measurements, so refering back to the RATs output tables may be necessary.

-

This structure is great for seeing changes in the isoform distribution profile of a gene. A more minimalist version of the plot, without the boxpltos, can also be obtained:

+

+

The top two facets show the absolute abundances (counts) of the isoforms as supplied in the input, the bottom two show the corresponding relative abundances (proportions). These are split by condition: The left two facets refer to one condition, the right two ones to the other condition. The boxplots describe the abundance measurements of each isoform across replicates. The actual measurements are overlaid as points and connected by lines so as to group the set of isoform abundances measured in each replicate. This highlights the level of consistency between the replicates. The points and lines are placed slightly off-centre to prevent replicates masking one another when the measurements are very similar.

+

The colours of the replicates are recycled between the two conditions, to reduce the required palette and boost contrast. So in the example, there are 4 samples: controls-1, controls-2, patients-1 and patients-2.

+

In the past, the fill colour encoded the DTU outcome, but to keep things more consistent with the other styles the DTU is now encoded as point shape while the fill colour echoes the condition. In this example, isoforms .c1 and .c2 change significantly, isoforms .c3, .c4 and .nc do not change significantly and isoform .d could not be tested.

+

This structure is great for seeing changes in the isoform distribution profile of a gene. A more minimalist version of this, without the boxplots, can also be obtained:

# Grouping by condition (minimalist):
 plot_gene(mydtu, "MIX6", style="lines")
-

When there are many lines or many isoforms, it may be preferable to group abundances by isoform, making individual comparisons easier:

+

When there are many replicates or many isoforms, it may be preferable to group abundances by isoform, making individual comparisons easier:

# Grouping by isoform:
 plot_gene(mydtu, "MIX6", style="byisoform")
-

+

Overview plots

@@ -275,7 +276,7 @@

Overview plots

# Distribution of largest isoform proportion change per gene. plot_overview(mydtu, type="maxdprop")

This is what these look like on a larger dataset: Effect size distribution Largest isoform effect size per gene

-

The gene version of the effect size distribution is often bi-modal with the positive peak being larger than the negative, because when a gene has isoforms with the same absolute effect size (for example genes with two isoforms) the positive value is selected.

+

The gene version of the effect size distribution is often bi-modal with the positive peak being larger than the negative, because when a gene has isoforms with the same absolute effect size (for example genes with two isoforms) the positive value is deterministically selected.

Although fold-changes of transcript expression are not tied to DTU, if your input counts are suitably normalised between conditions, you can plot the traditional FC volcano and the relationship between FC and proportion change.

# Proportion change VS transcript-level significance. Each point is a transcript
 plot_overview(mydtu, type="fcvolcano")
@@ -286,7 +287,7 @@ 

Overview plots

Interactive plots

-

If you prefer picking points from a plot rather than sorting through tables, the volcano plot is also available through an interactive shiny app, that pulls up the relevant Gene and Transcript results and the isoform abundance plot for any volcano point you select.

+

If you prefer picking points from a plot rather than sorting through tables, the gene-level volcano plot is also available through an interactive shiny app, that pulls up the relevant Gene and Transcript results and the isoform abundance plot for any volcano point you select.

# Start the interactive volcano plot.
 plot_shiny_volcano(mydtu)
    @@ -305,10 +306,10 @@

    Change information layers

    The fillby, colourby and shapeby parameters can be respectively used to control which information layers are encoded as fill, line/point colour, and point shape. Possible values are c("isoform", "condition", "DTU", "none", "replicate"). Be aware that some combinations of plot style and information layers are not possible. If safe to do so these will be silently ignored, otherwise an error message will appear.

    # For a less busy look, any of the information layers can be disabled.
     plot_gene(mydtu, "MIX6", style="byisoform", colourby="none", shapeby="none")
    -

    +

    The following command will approximate the older version of the gene plot (where DTU determined the fill colour):

    plot_gene(mydtu, "MIX6", fillby="DTU", shapeby="none")
    -

    +

Change colour code

@@ -317,27 +318,19 @@

Change colour code

plot_gene(mydtu, "MIX6", style="bycondition", fillby="condition", condcolvec=c("magenta", "cyan"), replcolvec=c("green", "darkgreen"))
-

-
    +

    +
    • isofcolvec - Colour vector for isoform highlighting. Used to build a colorRampPalette.
    • dtucolvec - Colour vector for DTU highlighting.
    • condcolvec - Colour vector for condition highlighting.
    • replcolvec - Colour vector for replicate highlighting. Used to build a colorRampPalette.
    • nonecol - Colour to use when no colour coding is wanted.
    • -
+

General customisations for all plots

-

You can save any of the gene and overview plots as a ggplot2 object and use ggplot2 manipulations on it, such as changing the axis scales. Other ggplot2 customisations include the axis tick marks, axis values, labels, titles, colours… Consult the ggplot2 documentation for more help on these.

- - - - - - - - +

You can save any of the gene and overview plots as a ggplot2 object and use ggplot2 manipulations on it, such as changing the axis tick marks, axis values, labels, titles, colours… Consult the ggplot2 documentation for more help on these.


diff --git a/man/call_DTU.Rd b/man/call_DTU.Rd index e4a8d6f..4211ca3 100644 --- a/man/call_DTU.Rd +++ b/man/call_DTU.Rd @@ -5,14 +5,13 @@ \title{Calculate differential transcript usage.} \usage{ call_DTU(annot = NULL, TARGET_COL = "target_id", PARENT_COL = "parent_id", - slo = NULL, name_A = "Condition-A", name_B = "Condition-B", - varname = "condition", COUNTS_COL = "est_counts", - BS_TARGET_COL = "target_id", count_data_A = NULL, count_data_B = NULL, - boot_data_A = NULL, boot_data_B = NULL, p_thresh = 0.05, - abund_thresh = 5, dprop_thresh = 0.2, correction = "BH", scaling = 1, - testmode = "both", qboot = TRUE, qbootnum = 0L, qrep_thresh = 0.95, - rboot = TRUE, rrep_thresh = 0.85, rrep_as_crit = TRUE, - description = NA_character_, verbose = TRUE, threads = 1L, dbg = 0) + count_data_A = NULL, count_data_B = NULL, boot_data_A = NULL, + boot_data_B = NULL, name_A = "Condition-A", name_B = "Condition-B", + varname = "condition", p_thresh = 0.05, abund_thresh = 5, + dprop_thresh = 0.2, correction = "BH", scaling = 1, testmode = "both", + qboot = TRUE, qbootnum = 0L, qrep_thresh = 0.95, rboot = TRUE, + rrep_thresh = 0.85, description = NA_character_, verbose = TRUE, + threads = 1L, dbg = 0) } \arguments{ \item{annot}{A data.table matching transcript identifiers to gene identifiers. Any additional columns are allowed but ignored.} @@ -21,18 +20,6 @@ call_DTU(annot = NULL, TARGET_COL = "target_id", PARENT_COL = "parent_id", \item{PARENT_COL}{The name of the column for the gene identifiers in \code{annot}. (Default \code{"parent_id"})} -\item{slo}{A Sleuth object.} - -\item{name_A}{The name for one condition, as it appears in the \code{sample_to_covariates} table within the Sleuth object. (Default "Condition-A")} - -\item{name_B}{The name for the other condition, as it appears in the \code{sample_to_covariates} table within the sleuth object. 
(Default "Condition-B")} - -\item{varname}{The name of the covariate to which the two conditions belong, as it appears in the \code{sample_to_covariates} table within the sleuth object. (Default \code{"condition"}).} - -\item{COUNTS_COL}{For Sleuth objects only. The name of the counts column to use for the DTU calculation (est_counts or tpm). (Default \code{"est_counts"})} - -\item{BS_TARGET_COL}{For Sleuth objects only. The name of the transcript identifiers column in the bootstrap tables. (Default \code{"target_id"})} - \item{count_data_A}{A data.table of estimated counts for condition A. One column per sample/replicate, one row per transcript. The first column should contain the transcript identifiers.} \item{count_data_B}{A data.table of estimated counts for condition B. One column per sample/replicate, one row per transcript. The first column should contain the transcript identifiers.} @@ -41,15 +28,21 @@ call_DTU(annot = NULL, TARGET_COL = "target_id", PARENT_COL = "parent_id", \item{boot_data_B}{A list of data.tables, one per sample/replicate of condition B. One bootstrap iteration's estimates per column, one transcript per row. The first column should contain the transcript identifiers.} +\item{name_A}{The name for one condition. (Default "Condition-A")} + +\item{name_B}{The name for the other condition. (Default "Condition-B")} + +\item{varname}{The name of the covariate to which the two conditions belong. (Default \code{"condition"}).} + \item{p_thresh}{The p-value threshold. (Default 0.05)} \item{abund_thresh}{Noise threshold. Minimum mean abundance for transcripts to be eligible for testing. (Default 5)} \item{dprop_thresh}{Effect size threshold. Minimum change in proportion of a transcript for it to be considered meaningful. (Default 0.20)} -\item{correction}{The p-value correction to apply, as defined in \code{stats::p.adjust.methods}. 
(Default \code{"BH"})} +\item{correction}{The p-value correction to apply, as defined in \code{\link[stats]{p.adjust.methods}}. (Default \code{"BH"})} -\item{scaling}{A scaling factor to be applied to the abundances, *prior* to any thresholding and testing. Useful for scaling TPM abundances to the actual number of transcripts in the sequencing sample, if you use TPMs and if you can infer that information from your sample preparation. Improper use of the scaling factor will artificially inflate/deflate the significances obtained. (Default 1)} +\item{scaling}{A scaling factor to be applied to the abundances, *prior* to any thresholding and testing. Useful for scaling TPM (transcripts per 1 million reads) abundances to the actual library size. WARNING: Improper use of the scaling factor will artificially inflate/deflate the significances obtained. (Default 1)} \item{testmode}{One of \itemize{\item{"genes"}, \item{"transc"}, \item{"both" (default)}}.} @@ -59,12 +52,10 @@ call_DTU(annot = NULL, TARGET_COL = "target_id", PARENT_COL = "parent_id", \item{qrep_thresh}{Reproducibility threshold for quantification bootsrapping. (Default 0.95)} -\item{rboot}{Bootstrap the DTU robustness against the replicates. Does ALL 1 vs 1 combinations. (Default \code{TRUE})} +\item{rboot}{Bootstrap the DTU robustness against the replicates. Does *ALL* 1 vs 1 combinations. (Default \code{TRUE})} \item{rrep_thresh}{Reproducibility threshold for replicate bootsrapping. (Default 0.85) With few replicates per condition, the reproducibility takes heavily quantized values. For 3x3, there are 9 possible 1v1 comparisons, and a consistency of 8/9 = 0.88.} -\item{rrep_as_crit}{Whether DTU calls should include replicate reproducibility as a criterion. (Default TRUE)} - \item{description}{Free-text description of the run. You can use this to add metadata to the results object.} \item{verbose}{Display progress updates and warnings. 
(Default \code{TRUE})} @@ -77,10 +68,9 @@ call_DTU(annot = NULL, TARGET_COL = "target_id", PARENT_COL = "parent_id", List of mixed types. Contains a list of runtime settings, a table of gene-level results, a table of transcript-level results, and a list of two tables with the transcript abundaces. } \description{ -There are three options for input: +There are two modes for input: \itemize{ - \item{A sleuth object. This requires the following parameters: \code{slo}, \code{name_A}, \code{name_B} and optionally \code{varname}, \code{COUNT_COL}, \code{BS_TARGET_COL}.} - \item{Bootstrapped count estimates. This requires the following parameters: \code{boot_data_A} and \code{boot_data_B}. \code{name_A} and \code{name_B} can optionally be used to name the conditions.} - \item{Count estimates. This requires the following parameters: \code{count_data_A} and \code{count_data_B}. \code{name_A} and \code{name_B} can optionally be used to name the conditions.} + \item{Bootstrapped count estimates. This requires the following parameters: \code{boot_data_A} and \code{boot_data_B}.} + \item{Count estimates. This requires the following parameters: \code{count_data_A} and \code{count_data_B}.} } } diff --git a/man/denest_sleuth_boots.Rd b/man/denest_sleuth_boots.Rd index 4bc5651..4c77a1d 100644 --- a/man/denest_sleuth_boots.Rd +++ b/man/denest_sleuth_boots.Rd @@ -34,5 +34,9 @@ Transcripts in \code{slo} that are missing from \code{annot} will be skipped com Transcripts in \code{annot} that are missing from \code{slo} are automatically padded with NA, which we re-assign as 0. } \description{ -Extract bootstrap counts into a less nested structure. +*Legacy function* +} +\details{ +It extracts the bootstrap data from the older-style \code{sleuth} object. +As of sleuth version 0.29, the bootstrap data is no longer kept in the object. 
} diff --git a/man/dtu_plurality_summary.Rd b/man/dtu_plurality_summary.Rd index 258cfac..d2c0c6a 100644 --- a/man/dtu_plurality_summary.Rd +++ b/man/dtu_plurality_summary.Rd @@ -10,8 +10,8 @@ dtu_plurality_summary(dtuo) \item{dtuo}{A DTU object.} } \value{ -A named numerical vector giving a tally of the results. +A data.frame with catefory names in the first column and values in the second column. } \description{ -Summary of DTU plurality. +A tally of genes based on how many isoforms show significant change. } diff --git a/man/dtu_summary.Rd b/man/dtu_summary.Rd index a8a7873..e087986 100644 --- a/man/dtu_summary.Rd +++ b/man/dtu_summary.Rd @@ -10,8 +10,8 @@ dtu_summary(dtuo) \item{dtuo}{A DTU object.} } \value{ -A named numerical vector giving a tally of the results. +A data.frame with catefory names in the first column and values in the second column. } \description{ -Summary of DTU calling. +A tally of DTU genes and transcripts. } diff --git a/man/dtu_switch_summary.Rd b/man/dtu_switch_summary.Rd index 611cf50..54f832e 100644 --- a/man/dtu_switch_summary.Rd +++ b/man/dtu_switch_summary.Rd @@ -10,8 +10,8 @@ dtu_switch_summary(dtuo) \item{dtuo}{A DTU object.} } \value{ -A named numerical vector giving a tally of the results. +A data.frame with catefory names in the first column and values in the second column. } \description{ -Summary of isoform switching events. +A tally of genes showing isoform switching. } diff --git a/man/g.test.1.Rd b/man/g.test.1.Rd index 235814d..fef12fc 100644 --- a/man/g.test.1.Rd +++ b/man/g.test.1.Rd @@ -14,8 +14,11 @@ g.test.1(obsx, px) The order of values in the two vectors should be the same. If any value of xp is zero and the corresponding xobs is not, g.test.1 will always reject the hypothesis. No corrections are applied. -No input checks are applied, as RATs needs to run this millions of times.} +No input checks are applied.} } \description{ -Log-likelihood test of goodness of fit. 
+For a set of observations against a set of probabilities. +} +\details{ +General utility function. } diff --git a/man/g.test.2.Rd b/man/g.test.2.Rd index 0d37f8c..7bd76a1 100644 --- a/man/g.test.2.Rd +++ b/man/g.test.2.Rd @@ -18,3 +18,6 @@ No input checks are applied, as RATs needs to run this millions of times.} \description{ For two sets of observations. } +\details{ +General utility function. +} diff --git a/man/get_plurality_ids.Rd b/man/get_plurality_ids.Rd index cb343f0..f4cee2b 100644 --- a/man/get_plurality_ids.Rd +++ b/man/get_plurality_ids.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/results.R \name{get_plurality_ids} \alias{get_plurality_ids} -\title{Summary of DTU plurality.} +\title{List the gene IDs by number of isoforms showing significant change.} \usage{ get_plurality_ids(dtuo) } @@ -14,5 +14,5 @@ A named numerical vector giving a tally of the results. } \description{ Get the IDs of DTU genes organised by the number of isoforms affected. This -is possible only based on the transcript-level results. +is possible only if transcript-level results are enabled. } diff --git a/man/group_samples.Rd b/man/group_samples.Rd index b7d20e4..eca674e 100644 --- a/man/group_samples.Rd +++ b/man/group_samples.Rd @@ -13,6 +13,9 @@ group_samples(covariates) list of lists (per covariate) of vectors (per factor level). } \description{ -Sleuth records covariate values per sample. However, RATs needs the reverse: the -samples that correspond to a given covariate value. +*Legacy function* No longer in use, but kept for possible general utility. +} +\details{ +Converts a table where each row is a sample and each column a variable into a +lookup structure that matches each value of each variable to a vector of corresponding samples. 
} diff --git a/man/infer_bootnum.Rd b/man/infer_bootnum.Rd index a8932e1..3b4518f 100644 --- a/man/infer_bootnum.Rd +++ b/man/infer_bootnum.Rd @@ -4,11 +4,9 @@ \alias{infer_bootnum} \title{Rule-of-thumb number of iterations.} \usage{ -infer_bootnum(slo, boot_data_A, boot_data_B) +infer_bootnum(boot_data_A, boot_data_B) } \arguments{ -\item{slo}{Sleuth object.} - \item{boot_data_A}{List of tables of bootstrapped counts.} \item{boot_data_B}{List of tables of bootstrapped counts.} diff --git a/man/maxabs.Rd b/man/maxabs.Rd index dcefd58..6faf69e 100644 --- a/man/maxabs.Rd +++ b/man/maxabs.Rd @@ -13,6 +13,10 @@ maxabs(v) A numeric value. } \description{ -Get back the original signed value. In case of equal absolutes, the positive -value will be returned. +General utility function. +} +\details{ +Returns the original signed value with the largest absolute value in the vector. +In case of equal absolutes between a positive and negative value, the positive +value will always be the one that is returned. 
} diff --git a/man/parameters_are_good.Rd b/man/parameters_are_good.Rd index a20fe7a..d2e4274 100644 --- a/man/parameters_are_good.Rd +++ b/man/parameters_are_good.Rd @@ -4,63 +4,48 @@ \alias{parameters_are_good} \title{Check input parameters.} \usage{ -parameters_are_good(slo, annot, name_A, name_B, varname, COUNTS_COL, correction, - p_thresh, TARGET_COL, PARENT_COL, BS_TARGET_COL, count_thresh, testmode, - qboot, qbootnum, dprop_thresh, count_data_A, count_data_B, boot_data_A, - boot_data_B, qrep_thresh, threads, rboot, rrep_thresh, rrep_as_crit, scaling) +parameters_are_good(annot, count_data_A, count_data_B, boot_data_A, boot_data_B, + TARGET_COL, PARENT_COL, correction, testmode, scaling, threads, p_thresh, + abund_thresh, dprop_thresh, qboot, qbootnum, qrep_thresh, rboot, rrep_thresh) } \arguments{ -\item{slo}{Sleuth object} - \item{annot}{Annotation dataframe.} -\item{name_A}{Condition name.} - -\item{name_B}{Condition name.} - -\item{varname}{Name of condition variable.} +\item{count_data_A}{A dataframe of estimated counts.} -\item{COUNTS_COL}{Name of counts column in bootstrap.} +\item{count_data_B}{A dataframe of estimated counts.} -\item{correction}{P-value correction method.} +\item{boot_data_A}{A list of dataframes, one per sample, each with all the bootstrapped estimates for the sample.} -\item{p_thresh}{Significance level.} +\item{boot_data_B}{A list of dataframes, one per sample, each with all the bootstrapped estimates for the sample.} \item{TARGET_COL}{Name of transcript id column in annotation.} \item{PARENT_COL}{Name of gene id column in annotation.} -\item{BS_TARGET_COL}{Name of transcript id column in bootstrap.} - -\item{count_thresh}{Minimum transcript abundance per sample.} +\item{correction}{P-value correction method.} \item{testmode}{Which test to run.} -\item{qboot}{Whether to bootstrap against quantifications.} +\item{scaling}{Abundance scaling factor.} -\item{qbootnum}{Number of bootstrap iterations.} +\item{threads}{Number of 
threads.} -\item{dprop_thresh}{Minimum change in proportion.} +\item{p_thresh}{Significance level.} -\item{count_data_A}{A dataframe of estimated counts.} +\item{abund_thresh}{Minimum transcript abundance per sample.} -\item{count_data_B}{A dataframe of estimated counts.} +\item{dprop_thresh}{Minimum change in proportion.} -\item{boot_data_A}{A list of dataframes, one per sample, each with all the bootstrapped estimetes for the sample.} +\item{qboot}{Whether to bootstrap against quantifications.} -\item{boot_data_B}{A list of dataframes, one per sample, each with all the bootstrapped estimetes for the sample.} +\item{qbootnum}{Number of bootstrap iterations.} \item{qrep_thresh}{Confidence threshold.} -\item{threads}{Number of threads.} - \item{rboot}{Whether to bootstrap against samples.} \item{rrep_thresh}{Confidence threshold.} - -\item{rrep_as_crit}{Whether to use rrep as a DTU criterion.} - -\item{scaling}{Abundance scaling factor.} } \value{ List: \itemize{ diff --git a/man/sim_boot_data.Rd b/man/sim_boot_data.Rd index bb65086..d18fb17 100644 --- a/man/sim_boot_data.Rd +++ b/man/sim_boot_data.Rd @@ -4,14 +4,25 @@ \alias{sim_boot_data} \title{Generate an artificial dataset of bootstrapped abundance estimates, for code-testing or examples.} \usage{ -sim_boot_data(cnames = c("A", "B")) +sim_boot_data(varname = "condition", cnames = c("A", "B"), + errannot_inconsistent = FALSE, PARENT_COL = "parent_id", + TARGET_COL = "target_id") } \arguments{ +\item{varname}{Name for the variables by which to compare.} + \item{cnames}{A vector of (two) name values for the comparison variable.} + +\item{errannot_inconsistent}{Logical. Introduces an inconsistency in the transcript IDs, for testing of sanity checks. (FALSE)} + +\item{PARENT_COL}{Name for the bootstraps column containing counts.} + +\item{TARGET_COL}{Name for annotation column containing transcript identification.} } \value{ -A list with 3 elements. First a data.frame with the corresponding annotation. 
Then, 2 lists of data.tables. One list per condition. Each data table represents a sample and contains the estimates from the bootstrap iterations. +A list with 3 elements. First a data.frame with the corresponding annotation. +Then, 2 lists of data.tables. One list per condition. Each data table represents a sample and contains the estimates from the bootstrap iterations. } \description{ -Based on sim_sleuth_data(). +Generate an artificial dataset of bootstrapped abundance estimates, for code-testing or examples. } diff --git a/man/sim_count_data.Rd b/man/sim_count_data.Rd index e98db51..91d1435 100644 --- a/man/sim_count_data.Rd +++ b/man/sim_count_data.Rd @@ -2,17 +2,27 @@ % Please edit documentation in R/data_simulators.R \name{sim_count_data} \alias{sim_count_data} -\title{Generate an artificial dataset of bootstrapped abundance estimates, for code-testing or examples.} +\title{Generate an artificial dataset of abundance estimates, for code-testing or examples.} \usage{ -sim_count_data(cnames = c("A", "B")) +sim_count_data(varname = "condition", cnames = c("A", "B"), + errannot_inconsistent = FALSE, PARENT_COL = "parent_id", + TARGET_COL = "target_id") } \arguments{ +\item{varname}{Name for the variables by which to compare.} + \item{cnames}{A vector of (two) name values for the comparison variable.} + +\item{errannot_inconsistent}{Logical. Introduces an inconsistency in the transcript IDs, for testing of sanity checks. (FALSE)} + +\item{PARENT_COL}{Name for the bootstraps column containing counts.} + +\item{TARGET_COL}{Name for annotation column containing transcript identification.} } \value{ A list with 3 elements. First, a data.frame with the corresponding annotation table. Then, 2 data.tables, one per condition. Each data table contains the abundance estimates for all the samples for the respective condition. } \description{ -Based on sim_sleuth_data(). +Generate an artificial dataset of abundance estimates, for code-testing or examples. 
} diff --git a/tests/testthat/test_1_data-munging.R b/tests/testthat/test_1_data-munging.R index 68a5c6d..a9a2380 100644 --- a/tests/testthat/test_1_data-munging.R +++ b/tests/testthat/test_1_data-munging.R @@ -82,8 +82,8 @@ test_that("Filters work correctly", { # !!! This test is tightly dependent on the data used for the test and the default parameter values, in order to # !!! ensure correct response to specific scenarios. - sim <- sim_sleuth_data(cnames=c("ONE","TWO")) - mydtu <- call_DTU(annot= sim$annot, slo= sim$slo, name_A= "ONE", name_B= "TWO", abund_thresh=10, dprop_thresh=0.1, verbose = FALSE, rboot=FALSE, qboot = FALSE) + sim <- sim_boot_data(cnames=c("ONE","TWO")) + mydtu <- call_DTU(annot= sim$annot, boot_data_A=sim$boots_A, boot_data_B=sim$boots_B, name_A= "ONE", name_B= "TWO", abund_thresh=10, dprop_thresh=0.1, verbose = FALSE, rboot=FALSE, qboot = FALSE) expect_equivalent(as.list(mydtu$Genes["1A1N", list(known_transc, detect_transc, elig_transc, elig, elig_fx)]), list(1, 1, 0, FALSE, FALSE)) @@ -153,10 +153,7 @@ test_that("Filters work correctly", { #============================================================================== test_that("The number of iterations is detected correctly", { - sim <- sim_sleuth_data() - expect_equal(infer_bootnum(sim$slo, NULL, NULL), 2) - sim <- sim_boot_data() - expect_equal(infer_bootnum(NULL,sim$boots_A, sim$boots_B), 2) + expect_equal(infer_bootnum(sim$boots_A, sim$boots_B), 2) }) diff --git a/tests/testthat/test_1_internal-structures.R b/tests/testthat/test_1_internal-structures.R index 7f819a6..115f991 100644 --- a/tests/testthat/test_1_internal-structures.R +++ b/tests/testthat/test_1_internal-structures.R @@ -4,7 +4,7 @@ context("DTU Internal structures") #============================================================================== test_that("The reporting structures are created correctly", { - sim <- sim_sleuth_data() + sim <- sim_boot_data() full <- alloc_out(sim$annot, "full") short <- 
alloc_out(sim$annot, "short") @@ -17,13 +17,13 @@ test_that("The reporting structures are created correctly", { expect_type(full$Parameters, "list") expect_true(typeof(full$Parameters) == typeof(short$Parameters)) - expect_length(full$Parameters, 24) + expect_length(full$Parameters, 23) expect_length(short$Parameters, 2) expect_named(full$Parameters, c("description", "time", "rats_version", "R_version", "var_name", "cond_A", "cond_B", "data_type", "num_replic_A", "num_replic_B", "num_genes", "num_transc", "tests", "p_thresh", "abund_thresh", "dprop_thresh", "abund_scaling", "quant_reprod_thresh", "quant_boot", "quant_bootnum", - "rep_reprod_thresh", "rep_boot", "rep_bootnum", "rep_reprod_as_crit")) + "rep_reprod_thresh", "rep_boot", "rep_bootnum")) expect_named(short$Parameters, c("num_replic_A", "num_replic_B")) expect_true(is.data.frame(full$Genes)) diff --git a/tests/testthat/test_2_input-checks.R b/tests/testthat/test_2_input-checks.R index 73342e2..5a83a2c 100644 --- a/tests/testthat/test_2_input-checks.R +++ b/tests/testthat/test_2_input-checks.R @@ -8,59 +8,50 @@ test_that("The input checks work", { name_B <- "two" wrong_name <- "RUBBISH_COLUMN_NAME" - # Emulate sleuth and annotation. - sim1 <- sim_sleuth_data(cnames=c(name_A, name_B)) - sim2 <- sim_sleuth_data(varname= "waffles", COUNTS_COL= "counts", TARGET_COL= "target" , PARENT_COL= "parent", - BS_TARGET_COL= "id", cnames= c("AAAA","BBBB")) - # Emulate non-sleuth bootstrap data. - data_A <- denest_sleuth_boots(sim1$slo, sim1$annot, c(1,3), "est_counts", "target_id") - data_B <- denest_sleuth_boots(sim1$slo, sim1$annot, c(2,4), "est_counts", "target_id") + # Emulate bootstrap data. + sim1 <- sim_boot_data() + sim2 <- sim_boot_data(varname="waffles", cnames=c("AAAA", "BBBB"), errannot_inconsistent=FALSE, PARENT_COL="parent", TARGET_COL="target") # Emulate non-bootstrap data. 
- counts_A <- data_A[[1]] - counts_B <- data_B[[1]] + counts_A <- sim1$boots_A[[1]] + counts_B <- sim1$boots_B[[1]] # No false alarms with valid calls. - expect_silent(call_DTU(annot= sim2$annot, slo= sim2$slo, name_A = "AAAA", name_B = "BBBB", varname= "waffles", p_thresh= 0.01, abund_thresh= 10, + expect_silent(call_DTU(annot= sim2$annot, boot_data_A= sim2$boots_A, boot_data_B= sim2$boots_B, name_A = "AAAA", name_B = "BBBB", varname= "waffles", p_thresh= 0.01, abund_thresh= 10, rrep_thresh = 0.6, qrep_thresh = 0.8, testmode= "transc", correction= "bonferroni", verbose= FALSE, rboot= FALSE, qboot=TRUE, - qbootnum= 100, COUNTS_COL= "counts", TARGET_COL= "target", - PARENT_COL= "parent", BS_TARGET_COL= "id", threads= 2, rrep_as_crit=TRUE, dbg= "prep")) - expect_silent(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, verbose = FALSE, dbg= "prep")) - expect_silent(call_DTU(annot= sim1$annot, boot_data_A= data_A, boot_data_B= data_B, verbose = FALSE, dbg= "prep")) - expect_silent(call_DTU(annot= sim1$annot, count_data_A= counts_A, count_data_B= counts_B, rboot=FALSE, qboot= FALSE, verbose = FALSE, dbg= "prep")) - expect_silent(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, verbose = FALSE, - boot_data_A= data_A, boot_data_B= data_B, count_data_A= counts_A, count_data_B= counts_B, dbg= "prep")) + qbootnum= 100, TARGET_COL= "target", PARENT_COL= "parent", threads= 2, dbg= "prep")) + expect_silent(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, verbose = FALSE, dbg= "prep")) + expect_silent(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, rboot=FALSE, qboot= FALSE, verbose = FALSE, dbg= "prep")) # Mixed input formats. 
- expect_warning(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, verbose = TRUE, - boot_data_A= data_A, boot_data_B= data_B, qbootnum= 100, dbg= "prep"), - "Only the sleuth object will be used", fixed= TRUE) - expect_warning(call_DTU(annot= sim1$annot, verbose = TRUE, boot_data_A= data_A, boot_data_B= data_B, + expect_silent(call_DTU(annot= sim1$annot, name_A= name_A, name_B= name_B, verbose = FALSE, + boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, count_data_A= counts_A, count_data_B= counts_B, dbg= "prep")) + expect_warning(call_DTU(annot= sim1$annot, verbose = TRUE, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, count_data_A= counts_A, count_data_B= counts_B, qbootnum= 100, dbg= "prep"), "Only the bootstrapped data will be used", fixed= TRUE) # Annotation is not a dataframe. - expect_error(call_DTU(annot= list("not", "a", "dataframe"), slo= sim1$slo, name_A= name_A, name_B= name_B, verbose = FALSE, dbg= "prep"), "annot is not a data.frame") + expect_error(call_DTU(annot= list("not", "a", "dataframe"), boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, verbose = FALSE, dbg= "prep"), "annot is not a data.frame") # Annotation field names. 
- expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, TARGET_COL= wrong_name, verbose = FALSE, dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, TARGET_COL= wrong_name, verbose = FALSE, dbg= "prep"), "target and/or parent IDs field names do not exist in annot", fixed= TRUE) - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, PARENT_COL= wrong_name, verbose = FALSE, dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, PARENT_COL= wrong_name, verbose = FALSE, dbg= "prep"), "target and/or parent IDs field names do not exist in annot", fixed= TRUE) # Inconsistent annotation. - sim3 <- sim_sleuth_data(errannot_inconsistent= TRUE, cnames= c(name_A, name_B)) - expect_error(call_DTU(annot= sim3$annot, slo= sim3$slo, name_A= name_A, name_B= name_B, verbose = FALSE, dbg= "prep"), + sim3 <- sim_boot_data(errannot_inconsistent= TRUE, cnames= c(name_A, name_B)) + expect_error(call_DTU(annot= sim3$annot, boot_data_A= sim3$boots_A, boot_data_B= sim3$boots_B, name_A= name_A, name_B= name_B, verbose = FALSE, dbg= "prep"), "Inconsistent set of transcript IDs", fixed= TRUE) # Non unique IDs. a <- copy(sim1$annot) a[1, "target_id"] <- a[2, "target_id"] - expect_error(call_DTU(annot= a, slo= sim1$slo, name_A= name_A, name_B= name_B, verbose = FALSE, dbg= "prep"), "transcript identifiers are not unique") + expect_error(call_DTU(annot= a, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, verbose = FALSE, dbg= "prep"), "transcript identifiers are not unique") # Boot data is not a list of datatables. 
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= c("not", "a", "list"), boot_data_B= data_B, verbose = FALSE, dbg= "prep"), "bootstrap data are not lists") - expect_error(call_DTU(annot= sim1$annot, boot_data_A= data_A, boot_data_B= c("not", "a", "list"), verbose = FALSE, dbg= "prep"), "bootstrap data are not lists") - expect_error(call_DTU(annot= sim1$annot, boot_data_A= list( list("not"), list("a"), list("table")), boot_data_B= data_B, verbose = FALSE, dbg= "prep"), "bootstrap data are not lists") - expect_error(call_DTU(annot= sim1$annot, boot_data_A= data_A, boot_data_B= list( list("not"), list("a"), list("table")), verbose = FALSE, dbg= "prep"), "bootstrap data are not lists") - expect_error(call_DTU(annot= sim1$annot, boot_data_A= list(data.frame(a="not", b="a", c="table")), boot_data_B= data_B, verbose = FALSE, dbg= "prep"), "not lists of data.tables") - expect_error(call_DTU(annot= sim1$annot, boot_data_A= data_A, boot_data_B= list(data.frame(a="not", b="a", c="table")), verbose = FALSE, dbg= "prep"), "not lists of data.tables") + expect_error(call_DTU(annot= sim1$annot, boot_data_A= c("not", "a", "list"), boot_data_B= sim1$boots_B, verbose = FALSE, dbg= "prep"), "bootstrap data are not lists") + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= c("not", "a", "list"), verbose = FALSE, dbg= "prep"), "bootstrap data are not lists") + expect_error(call_DTU(annot= sim1$annot, boot_data_A= list( list("not"), list("a"), list("table")), boot_data_B= sim1$boots_B, verbose = FALSE, dbg= "prep"), "bootstrap data are not lists") + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= list( list("not"), list("a"), list("table")), verbose = FALSE, dbg= "prep"), "bootstrap data are not lists") + expect_error(call_DTU(annot= sim1$annot, boot_data_A= list(data.frame(a="not", b="a", c="table")), boot_data_B= sim1$boots_B, verbose = FALSE, dbg= "prep"), "not lists of data.tables") + expect_error(call_DTU(annot= 
sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= list(data.frame(a="not", b="a", c="table")), verbose = FALSE, dbg= "prep"), "not lists of data.tables") # Counts data is not datatables. expect_error(call_DTU(annot= sim1$annot, count_data_A= c("not", "a", "list"), count_data_B= counts_B, verbose = FALSE, dbg= "prep"), "counts data are not data.tables") @@ -68,94 +59,75 @@ test_that("The input checks work", { expect_error(call_DTU(annot= sim1$annot, count_data_A= data.frame(a="not", b="a", c="table"), count_data_B= counts_B, verbose = FALSE, dbg= "prep"), "counts data are not data.tables") expect_error(call_DTU(annot= sim1$annot, count_data_A= counts_A, count_data_B= data.frame(a="not", b="a", c="table"), verbose = FALSE, dbg= "prep"), "counts data are not data.tables") - # Bootstrap field names. - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, BS_TARGET_COL= wrong_name, verbose = FALSE, dbg= "prep"), - "target IDs field name does not exist in the bootstraps", fixed= TRUE) - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, COUNTS_COL= wrong_name, verbose = FALSE, dbg= "prep"), - "counts field name does not exist", fixed= TRUE) # Number of bootstraps. 
- expect_warning(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, verbose = TRUE, dbg= "prep"), + expect_warning(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, verbose = TRUE, dbg= "prep"), "few bootstrap iterations", fixed= TRUE) - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, qbootnum = -5, verbose = FALSE, dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, qbootnum = -5, verbose = FALSE, dbg= "prep"), "Invalid number of bootstraps", fixed= TRUE) - expect_warning(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, qbootnum = 5, verbose = TRUE, dbg= "prep"), + expect_warning(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, qbootnum = 5, verbose = TRUE, dbg= "prep"), "qbootnum is low", fixed= TRUE) - expect_warning(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, qbootnum = 5000, verbose = TRUE, dbg= "prep"), + expect_warning(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, qbootnum = 5000, verbose = TRUE, dbg= "prep"), "number of quantification bootstraps is very high", fixed= TRUE) - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, qbootnum = 5000000000000, verbose = FALSE, dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, qbootnum = 5000000000000, verbose = FALSE, dbg= "prep"), "number of quantification bootstraps would exceed the maximum capacity", fixed= TRUE) # Bootstraps without boot data. 
expect_warning(call_DTU(annot= sim1$annot, count_data_A= counts_A, count_data_B= counts_B, qboot=TRUE, qbootnum=2, verbose= TRUE, dbg= "prep"), - "no bootstrapped estimates", fixed= TRUE) + "qboot is TRUE but no bootstrapped estimates", fixed= TRUE) # Correction method. - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, correction= wrong_name, verbose= FALSE, dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, correction= wrong_name, verbose= FALSE, dbg= "prep"), "Invalid p-value correction method name", fixed= TRUE) - # Covariate name. - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, varname= wrong_name, verbose= FALSE, dbg= "prep"), - "covariate name does not exist", fixed= TRUE) - - # Condition names. - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= wrong_name, name_B= name_B, verbose= FALSE, dbg= "prep"), - "conditions do not exist", fixed= TRUE) - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= wrong_name, verbose= FALSE, dbg= "prep"), - "conditions do not exist", fixed= TRUE) - # Verbose is bool. - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, verbose="yes", dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, verbose="yes", dbg= "prep"), "not interpretable as logical", fixed= TRUE) # Tests. 
- expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, testmode="GCSE", verbose = FALSE, dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, testmode="GCSE", verbose = FALSE, dbg= "prep"), "Unrecognized value for testmode", fixed= TRUE) - expect_silent(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, testmode="genes", verbose = FALSE, rboot=FALSE, qboot = FALSE, dbg= "prep")) - expect_silent(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, testmode="transc", verbose = FALSE, rboot=FALSE, qboot = FALSE, dbg= "prep")) + expect_silent(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, testmode="genes", verbose = FALSE, rboot=FALSE, qboot = FALSE, dbg= "prep")) + expect_silent(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, testmode="transc", verbose = FALSE, rboot=FALSE, qboot = FALSE, dbg= "prep")) - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, qboot="GCSE", verbose = FALSE, dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, qboot="GCSE", verbose = FALSE, dbg= "prep"), "Unrecognized value for qboot", fixed= TRUE) - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, rboot="GCSE", verbose = FALSE, dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, rboot="GCSE", verbose = FALSE, dbg= "prep"), "Unrecognized value for rboot", fixed= TRUE) # Probability threshold. 
- expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, p_thresh = 666, verbose = FALSE, dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, p_thresh = 666, verbose = FALSE, dbg= "prep"), "Invalid p-value threshold", fixed= TRUE) - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, p_thresh = -0.05, verbose = FALSE, dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, p_thresh = -0.05, verbose = FALSE, dbg= "prep"), "Invalid p-value threshold", fixed= TRUE) # Read counts threshold. - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, abund_thresh = -5, verbose = FALSE, dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, abund_thresh = -5, verbose = FALSE, dbg= "prep"), "Invalid abundance threshold", fixed= TRUE) # Proportion change threshold. - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, dprop_thresh = -2, verbose = FALSE, dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, dprop_thresh = -2, verbose = FALSE, dbg= "prep"), "Invalid proportion difference threshold", fixed= TRUE) - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, dprop_thresh = 2, verbose = FALSE, dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, dprop_thresh = 2, verbose = FALSE, dbg= "prep"), "Invalid proportion difference threshold", fixed= TRUE) # Confidence threshold. 
- expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, rrep_thresh = -2, verbose = FALSE, dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, rrep_thresh = -2, verbose = FALSE, dbg= "prep"), "Invalid reproducibility threshold", fixed= TRUE) - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, rrep_thresh = 2, verbose = FALSE, dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, rrep_thresh = 2, verbose = FALSE, dbg= "prep"), "Invalid reproducibility threshold", fixed= TRUE) - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, qrep_thresh = -2, verbose = FALSE, dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, qrep_thresh = -2, verbose = FALSE, dbg= "prep"), "Invalid reproducibility threshold", fixed= TRUE) - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, qrep_thresh = 2, verbose = FALSE, dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, qrep_thresh = 2, verbose = FALSE, dbg= "prep"), "Invalid reproducibility threshold", fixed= TRUE) - # Conservative flag. - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, rrep_as_crit = "nope", verbose = FALSE, dbg= "prep"), - "Unrecognized value", fixed= TRUE) - # Number of threads. 
- expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, threads = 0, verbose= FALSE, dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, threads = 0, verbose= FALSE, dbg= "prep"), "Invalid number of threads", fixed= TRUE) - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, threads = -4, verbose= FALSE, dbg= "prep"), + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, threads = -4, verbose= FALSE, dbg= "prep"), "Invalid number of threads", fixed= TRUE) - expect_silent(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, threads = parallel::detectCores(logical=FALSE), verbose= FALSE, dbg= "prep")) - expect_silent(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, threads = parallel::detectCores(logical=TRUE), verbose= FALSE, dbg= "prep")) - expect_error(call_DTU(annot= sim1$annot, slo= sim1$slo, name_A= name_A, name_B= name_B, threads = parallel::detectCores(logical=TRUE) + 1, verbose= FALSE, dbg= "prep"), + expect_silent(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, threads = parallel::detectCores(logical=FALSE), verbose= FALSE, dbg= "prep")) + expect_silent(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, threads = parallel::detectCores(logical=TRUE), verbose= FALSE, dbg= "prep")) + expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, threads = parallel::detectCores(logical=TRUE) + 1, verbose= FALSE, dbg= "prep"), "threads exceed", fixed= TRUE) }) diff --git a/tests/testthat/test_2_output.R b/tests/testthat/test_2_output.R index b7f4786..f6a6e38 100644 --- 
a/tests/testthat/test_2_output.R +++ b/tests/testthat/test_2_output.R @@ -4,20 +4,20 @@ context("DTU Output") #============================================================================== test_that("The output structure is correct", { - sim <- sim_sleuth_data(cnames=c("ONE","TWO")) - mydtu <- call_DTU(annot= sim$annot, slo= sim$slo, name_A= "ONE", name_B= "TWO", qbootnum=2, qboot=TRUE, rboot=TRUE, verbose = FALSE) + sim <- sim_boot_data(cnames=c("ONE","TWO")) + mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qbootnum=2, qboot=TRUE, rboot=TRUE, verbose = FALSE) expect_type(mydtu, "list") expect_equal(length(mydtu), 4) expect_named(mydtu, c("Parameters", "Genes", "Transcripts", "Abundances")) expect_type(mydtu$Parameters, "list") - expect_length(mydtu$Parameters, 24) + expect_length(mydtu$Parameters, 23) expect_named(mydtu$Parameters, c("description", "time", "rats_version", "R_version", "var_name", "cond_A", "cond_B", "data_type", "num_replic_A", "num_replic_B", "num_genes", "num_transc", "tests", "p_thresh", "abund_thresh", "dprop_thresh", "abund_scaling", "quant_reprod_thresh", "quant_boot", "quant_bootnum", - "rep_reprod_thresh", "rep_boot", "rep_bootnum", "rep_reprod_as_crit")) + "rep_reprod_thresh", "rep_boot", "rep_bootnum")) expect_true(is.data.frame(mydtu$Genes)) expect_equal(dim(mydtu$Genes)[2], 26) @@ -93,22 +93,22 @@ test_that("The output structure is correct", { expect_true(is.data.frame(mydtu$Abundances[[1]])) expect_true(is.data.frame(mydtu$Abundances[[2]])) - mydtu <- call_DTU(annot= sim$annot, slo= sim$slo, name_A= "ONE", name_B= "TWO", verbose = FALSE, qboot = FALSE) + mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", verbose = FALSE, qboot = FALSE) expect_false(any(c("quant_dtu_freq", "quant_p_mean", "quant_p_stdev", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_reprod") %in% names(mydtu$Genes))) 
expect_false(any(c("quant_dtu_freq", "quant_p_mean", "quant_p_stdev", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_reprod") %in% names(mydtu$Transcripts))) - mydtu <- call_DTU(annot= sim$annot, slo= sim$slo, name_A= "ONE", name_B= "TWO", verbose = FALSE, rboot = FALSE) + mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", verbose = FALSE, rboot = FALSE) expect_false(any(c("rep_dtu_freq", "rep_p_mean", "rep_p_stdev", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_reprod") %in% names(mydtu$Genes))) expect_false(any(c("rep_dtu_freq", "rep_p_mean", "rep_p_stdev", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_reprod") %in% names(mydtu$Transcripts))) - mydtu <- call_DTU(annot= sim$annot, slo= sim$slo, name_A= "ONE", name_B= "TWO", testmode="transc", qbootnum=2, rboot=TRUE, qboot=TRUE, verbose = FALSE) + mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", testmode="transc", qbootnum=2, rboot=TRUE, qboot=TRUE, verbose = FALSE) expect_false(any(c("quant_dtu_freq", "quant_p_mean", "quant_p_stdev", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_reprod", "rep_dtu_freq", "rep_p_mean", "rep_p_stdev", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_reprod") %in% names(mydtu$Genes))) expect_true(all(c("quant_dtu_freq", "quant_p_mean", "quant_p_stdev", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_reprod", "rep_dtu_freq", "rep_p_mean", "rep_p_stdev", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_reprod") %in% names(mydtu$Transcripts))) - mydtu <- call_DTU(annot= sim$annot, slo= sim$slo, name_A= "ONE", name_B= "TWO", testmode="genes", qbootnum=2, rboot=TRUE, qboot=TRUE, verbose = FALSE) + mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", testmode="genes", qbootnum=2, rboot=TRUE, qboot=TRUE, verbose = FALSE) expect_false(any(c("quant_dtu_freq", "quant_p_mean", 
"quant_p_stdev", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_reprod", "rep_dtu_freq", "rep_p_mean", "rep_p_stdev", "rep_p_min", "rep_p_max", "rep_na_freq", "rep_reprod") %in% names(mydtu$Transcripts))) expect_true(all(c("quant_dtu_freq", "quant_p_mean", "quant_p_stdev", "quant_p_min", "quant_p_max", "quant_na_freq", "quant_reprod", @@ -126,9 +126,9 @@ test_that("The output content is complete", { counts_A <- data_A[[1]] counts_B <- data_B[[2]] - mydtu <- list(call_DTU(annot= sim$annot, slo= sim$slo, name_A= "ONE", name_B= "TWO", rboot=TRUE, qboot=TRUE, qbootnum=2, verbose = FALSE, description="test"), - call_DTU(annot= sim$annot, boot_data_A = data_A, boot_data_B = data_B, rboot=TRUE, qboot=TRUE, qbootnum=2, verbose = FALSE, description="test"), - call_DTU(annot= sim$annot, count_data_A = counts_A, count_data_B = counts_B, rboot=FALSE, qboot=FALSE, verbose = FALSE, description="test")) + mydtu <- list(call_DTU(annot= sim$annot, count_data_A = counts_A, count_data_B = counts_B, rboot=FALSE, qboot=FALSE, verbose = FALSE, description="test"), + call_DTU(annot= sim$annot, boot_data_A = data_A, boot_data_B = data_B, rboot=TRUE, qboot=TRUE, qbootnum=2, verbose = FALSE, description="test") + ) for (x in length(mydtu)) { # Parameters. 
@@ -150,9 +150,8 @@ test_that("The output content is complete", { expect_false(is.na(mydtu[[x]]$Parameters$"time")) expect_false(is.na(mydtu[[x]]$Parameters$"rep_boot")) expect_false(is.na(mydtu[[x]]$Parameters$"quant_boot")) - if (x<3) { + if (x>1) { expect_false(is.na(mydtu[[x]]$Parameters$"rep_bootnum")) - expect_false(is.na(mydtu[[x]]$Parameters$"rep_reprod_as_crit")) expect_false(is.na(mydtu[[x]]$Parameters$"quant_bootnum")) expect_false(is.na(mydtu[[x]]$Parameters$"rep_reprod_thresh")) expect_false(is.na(mydtu[[x]]$Parameters$"quant_reprod_thresh")) @@ -168,7 +167,7 @@ test_that("The output content is complete", { expect_false(all(is.na(mydtu[[x]]$Genes[["sig"]]))) expect_false(all(is.na(mydtu[[x]]$Genes[["DTU"]]))) expect_false(all(is.na(mydtu[[x]]$Genes[["transc_DTU"]]))) - if (x <3) { + if (x>1) { expect_false(all(is.na(mydtu[[x]]$Genes[["quant_dtu_freq"]]))) expect_false(all(is.na(mydtu[[x]]$Genes[["quant_p_mean"]]))) expect_false(all(is.na(mydtu[[x]]$Genes[["quant_p_stdev"]]))) @@ -204,7 +203,7 @@ test_that("The output content is complete", { expect_false(all(is.na(mydtu[[x]]$Transcripts[["pval"]]))) expect_false(all(is.na(mydtu[[x]]$Transcripts[["pval_corr"]]))) expect_false(all(is.na(mydtu[[x]]$Transcripts[["sig"]]))) - if (x <3) { + if (x>1) { expect_false(all(is.na(mydtu[[x]]$Transcripts[["quant_dtu_freq"]]))) expect_false(all(is.na(mydtu[[x]]$Transcripts[["quant_p_mean"]]))) expect_false(all(is.na(mydtu[[x]]$Transcripts[["quant_p_stdev"]]))) @@ -221,8 +220,8 @@ test_that("The output content is complete", { expect_false(all(is.na(mydtu[[x]]$Transcripts[["rep_reprod"]]))) } # Paranoid check that all the annotation entries are present in the output. 
- expect_equal(dim(mydtu[[x]]$Genes)[1], length(levels(sim$annot$parent_id))) - expect_equal(dim(mydtu[[x]]$Transcripts)[1], length(levels(sim$annot$target_id))) + expect_equal(dim(mydtu[[x]]$Genes)[1], length(unique(sim$annot$parent_id))) + expect_equal(dim(mydtu[[x]]$Transcripts)[1], length(sim$annot$target_id)) } }) @@ -236,14 +235,11 @@ test_that("The result is consistent across input data formats", { counts_A <- data_A[[1]] counts_B <- data_B[[2]] - mydtu <- list(call_DTU(annot= sim$annot, slo= sim$slo, name_A= "ONE", name_B= "TWO", rboot=FALSE, qboot=FALSE, verbose = FALSE), - call_DTU(annot= sim$annot, boot_data_A = data_A, boot_data_B = data_B, rboot=FALSE, qboot=FALSE, verbose = FALSE), + mydtu <- list(call_DTU(annot= sim$annot, boot_data_A = data_A, boot_data_B = data_B, rboot=FALSE, qboot=FALSE, verbose = FALSE), call_DTU(annot= sim$annot, count_data_A = counts_A, count_data_B = counts_B, rboot=FALSE, qboot=FALSE, verbose = FALSE)) - expect_equal(mydtu[[1]][[2]], mydtu[[2]][[2]]) - expect_equal(mydtu[[1]][[2]][, seq(1,7), with=FALSE], mydtu[[3]][[2]][, seq(1,7), with=FALSE]) - expect_equal(mydtu[[1]][[3]], mydtu[[2]][[3]]) - expect_equal(mydtu[[1]][[3]][, seq(1,4), with=FALSE], mydtu[[3]][[3]][, seq(1,4), with=FALSE]) + expect_equal(mydtu[[1]][[2]][, seq(1,7), with=FALSE], mydtu[[2]][[2]][, seq(1,7), with=FALSE]) + expect_equal(mydtu[[1]][[3]][, seq(1,4), with=FALSE], mydtu[[2]][[3]][, seq(1,4), with=FALSE]) }) diff --git a/tests/testthat/test_2_reports.R b/tests/testthat/test_2_reports.R index fa677ae..b390ed8 100644 --- a/tests/testthat/test_2_reports.R +++ b/tests/testthat/test_2_reports.R @@ -4,8 +4,8 @@ context("DTU Reports") #============================================================================== test_that("The summaries work", { - sim <- sim_sleuth_data(cnames=c("ONE","TWO")) - mydtu <- call_DTU(annot= sim$annot, slo= sim$slo, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE) + sim <- 
sim_boot_data(cnames=c("ONE","TWO")) + mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE) expect_silent(ids <- get_dtu_ids(mydtu)) expect_type(ids, "list") @@ -39,7 +39,7 @@ test_that("The summaries work", { expect_equal(tally[[2]], as.vector(sapply(ids, length))) expect_false(any(is.na(tally))) - mydtu <- call_DTU(annot= sim$annot, slo= sim$slo, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE, dprop_thresh=0.1) + mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE, dprop_thresh=0.1) expect_silent(tally <- dtu_summary(mydtu)) expect_silent(ids <- get_dtu_ids(mydtu)) expect_equal(tally[[2]], c(2, 1, 7, 2, 1, 7, 2, 1, 7, 5, 5, 11)) @@ -60,8 +60,8 @@ test_that("The summaries work", { #============================================================================== test_that("The isoform switching summaries work", { - sim <- sim_sleuth_data(cnames=c("ONE","TWO")) - mydtu <- call_DTU(annot= sim$annot, slo= sim$slo, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE, dprop_thresh=0.1) + sim <- sim_boot_data(cnames=c("ONE","TWO")) + mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE, dprop_thresh=0.1) expect_silent(ids <- get_switch_ids(mydtu)) expect_type(ids, "list") @@ -86,8 +86,8 @@ test_that("The isoform switching summaries work", { #============================================================================== test_that("The plurality summaries work", { - sim <- sim_sleuth_data(cnames=c("ONE","TWO")) - mydtu <- call_DTU(annot= sim$annot, slo= sim$slo, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE, dprop_thresh=0.1) + sim <- sim_boot_data(cnames=c("ONE","TWO")) + mydtu <- 
call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE, dprop_thresh=0.1) expect_silent(ids <- get_plurality_ids(mydtu)) expect_type(ids, "list") @@ -104,7 +104,7 @@ test_that("The plurality summaries work", { expect_equal(tally[[2]], c(1, 1)) expect_equal(tally[[2]], as.vector(sapply(ids, length))) - mydtu <- call_DTU(annot= sim$annot, slo= sim$slo, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE, dprop_thresh=0.3) + mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE, dprop_thresh=0.3) expect_silent(ids <- get_plurality_ids(mydtu)) expect_silent(tally <- dtu_plurality_summary(mydtu)) expect_named(ids, c("1")) @@ -115,8 +115,8 @@ test_that("The plurality summaries work", { #============================================================================== test_that("The gene plotting commands work", { - sim <- sim_sleuth_data(cnames=c("ONE","TWO")) - mydtu <- call_DTU(annot= sim$annot, slo= sim$slo, name_A= "ONE", name_B= "TWO", qbootnum=2, verbose = FALSE) + sim <- sim_boot_data(cnames=c("ONE","TWO")) + mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qbootnum=2, verbose = FALSE) expect_silent(plot_gene(dtuo=mydtu, pid="MIX6")) expect_silent(plot_gene(dtuo=mydtu, pid="MIX6", style="bycondition")) @@ -154,8 +154,8 @@ test_that("The gene plotting commands work", { #============================================================================== test_that("The overview plotting commands work", { - sim <- sim_sleuth_data(cnames=c("ONE","TWO")) - mydtu <- call_DTU(annot= sim$annot, slo= sim$slo, name_A= "ONE", name_B= "TWO", qbootnum=2, verbose = FALSE) + sim <- sim_boot_data(cnames=c("ONE","TWO")) + mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= 
sim$boots_B, name_A= "ONE", name_B= "TWO", qbootnum=2, verbose = FALSE) expect_silent(plot_overview(mydtu)) expect_silent(plot_overview(dtuo=mydtu, type="tvolcano")) diff --git a/vignettes/input.Rmd b/vignettes/input.Rmd index 7adca52..bc59076 100644 --- a/vignettes/input.Rmd +++ b/vignettes/input.Rmd @@ -1,7 +1,7 @@ --- title: 'RATs: Input and Settings' author: "Kimon Froussios" -date: "31 MAY 2017" +date: "19 SEP 2017" output: html_document: keep_md: yes @@ -39,42 +39,39 @@ library(rats) RATs can work with several input types: -1. A Sleuth object. -2. Salmon/Kallisto quantifications. -3. Bootstrapped abundance estimates. -4. Abundance estimates. +1. [Salmon](https://github.com/COMBINE-lab/salmon) and [Kallisto](http://pachterlab.github.io/kallisto/) quantifications. +2. Bootstrapped abundance estimates in lists of R `data.table`s. +3. Abundance estimates in R `data.table`s. --> [Sleuth](http://pachterlab.github.io/sleuth/) objects contain their input data, so RATs can extract the bootstrapped abundance estimates directly from there, -for your convenience if Sleuth is in your workflow already. +As of `v0.6.0`, input from a `sleuth` object is no longer supported by RATs. This is due to changes in Sleuth `v0.29` that made the +bootstrap data no longer available. Instead, RATs already supports input directly from the Kallisto/Salmon quantifications. --> [Salmon](https://combine-lab.github.io/salmon/) quantifications require the [wasabi](https://github.com/COMBINE-lab/wasabi) converter. -[Kallisto](https://pachterlab.github.io/kallisto/) quantifications are accessed directly. +For option 1, the function `fish4rodents()` will load the data into tables suitable for option 2. Details in the respective +section below. --> Bootstrapped abundance estimates from other sources can be input directly as `list`s of `data.table`s. Two lists are needed, one per condition. 
-Each datatable should contain the transcript identifiers in the first column, followed by columns containing the estimates from the bootstrap iterations: +For options 2 and 3, the format of the tables is as in the example below. The first column contains the transcript identifiers. +Subsequent columns contain the abundances. ```{r, echo=FALSE} -# Show the first rows of the table corresponding to one sample, from simulated data. +# Show the first rows of the table corresponding to one sample, from emulated data. head(sim_boot_data()[[2]][[1]]) ``` --> Abundance estimates, without bootstrapping information, can be input simply as two `data.table`s, one per condition. The first column should -contain the transcript identifiers, followed by columns listing the abundance per sample. The format of each table is identical to the one shown above. - +* In the case of option 3, each column represents a sample. Therefore, each condition is represented by a single table containing +the relevant samples. +* In the case of option 2, each column represents a bootstrap iteration and each table represents a sample. Therefore, each +condition is represented by a list of tables. ### Read counts, TPMs, etc -RATs will happily crunch any type of numeric value, but the type of abundances you provide will have an impact on the DTU outcome. - -Read counts (such as the `est_counts`) work well with RATs and achieve decent False Discovery Rate. However, they are influenced by -the length of the transcripts. If the isoforms of a gene differ considerably in length, a well-expressed long isoform can drown a valid -change in the relative abundances of shorter isoforms. +RATs will happily crunch any type of numeric value, but the type of abundances you provide will have an impact on the DTU outcome. The statistical test +employed (G test) is meant to work with actual counts, not arbitrarily scaled values like TPMs (transcripts per million reads) or other per-million-reads representations. 
+On the other hand, read counts work well with RATs and achieve a good False Discovery Rate, but they are influenced by +the length of the transcripts and a well-expressed long isoform can drown out a valid change in the relative abundances of short isoforms. -TPM abundances account for transcript length and are a truer representation of relative isoform abundance. However, they are scaled -arbitrarily to 1 million transcripts, which is an unrealistically small sample size and this causes loss of statistical power. Therefore, -RATs provides the option to scale abundance values such as TPMs, by a constant factor. Use of this option should not be done -light-heartedly, as it will artificially inflate or deflate the significances obtained. Ideally, you should infer an appropriate -scaling factor based on your sample preparation details (for example, based on RNA concentration). +To get the best of both worlds, we recommend obtaining TPM abundances, so as to account for transcript lengths, and then scaling these values to your actual library size +to cancel out the per-million part and regain realistic counts. RATs provides functionality to enable you to do this, the first of which is the `scaleto` parameter in +`fish4rodents()`, shown above. ## Annotation @@ -87,8 +84,8 @@ By default the column labels are `target_id` for the transcript IDs and `parent_ head(sim_count_data()[[1]]) ``` -A function is provided to create this table, given a GTF file. -(**Note:** GFF3 is **not** supported, as the specification is too relaxed.) +Such a table can be compiled with a variety of methods and from a variety of resources. RATs provides functionality +to create this table from a GTF file. (**Note:** GFF3 is not supported for this.) ```{r, eval=FALSE} # Extract transcript ID to gene ID index from a GTF annotation. 
@@ -101,31 +98,30 @@ myannot <- annot2ids("my_annotation_file.gtf") # Calling DTU -To bypass the complexity of running third-party tools in this tutorial, we will instead use **emulated data**. RATs comes -with data emulators, intended to be used for testing the code. However, they are also convenient to use for showcasing -how RATs works. If you happen to have real data available, you can use that instead if you wish. +To bypass the complexity of involving third-party tools in this tutorial, we will instead use **emulated data**. +RATs comes with data emulators, intended to be used for testing the code. However, they are also convenient to +use for showcasing how RATs works. -By default, RATs reports on its progress and produces a summary report. -The progress messages and summary can be suppressed by adding the `verbose = FALSE` parameter to the call. -To prevent cluttering this tutorial with verbose output, we will use this option in all the examples. +By default, RATs reports on its progress and produces a summary report. These can be suppressed using the +`verbose = FALSE` parameter to the call. To prevent cluttering this tutorial with verbose output, we will use this +option in all the examples. -If you leave `verbose = TRUE` when trying out the examples below using the emulated data, you will get **warnings** +If you choose to allow verbose output when trying out the examples below using the emulated data, you will get some **warnings** about the number of bootstraps. The warning is triggered because the emulated dataset used in the examples immitates only the structure of -real data, not the actual volume of it, and as such contains too few bootstrap iterations. +real data, not the actual volume of it, and as such it contains unrealistically few bootstrap iterations. Simply ignore these warnings +for these examples. 
-### ...from abundance estimates, without bootstraps +### DTU from abundance estimates, without bootstraps -This is the simplest usage case, provided only for completeness. We recommend using bootstrapped data whenever possible, -for reasons that will be discussed in the relevant section below. +This is the simplest usage case, and your likely input type if you use quantification methods other than Kallisto and Salmon. First, let's emulate some data to work with: ```{r, eval=FALSE} # Simulate some data. simdat <- sim_count_data() - -# For convenience let's assign the contents of the list to separate variables. +# For convenience let's assign the contents of the list to separate variables mycond_A <- simdat[[2]] # Simulated abundances for one condition. mycond_B <- simdat[[3]] # Simulated abundances for other condition. myannot <- simdat[[1]] # Transcript and gene IDs for the above data. @@ -135,10 +131,13 @@ Now we can call DTU: ```{r, eval=FALSE} # Find DTU between the simulated datasets. -mydtu <- call_DTU(annot= myannot, count_data_A= mycond_A, count_data_B= mycond_B, - verbose= FALSE, - name_A= "healthy", name_B= "patients", varname= "My phenotype", - description="Comparison of two simulated counts datasets for the tutorial. Simulated using built-in functionality of RATs.") +mydtu <- call_DTU(annot= myannot, count_data_A= mycond_A, + count_data_B= mycond_B, verbose= FALSE, + name_A= "healthy", name_B= "patients", + varname= "My phenotype", + description="Comparison of two simulated counts datasets + for the tutorial. Simulated using built-in functionality + of RATs.") ``` #### Mandatory arguments: @@ -146,14 +145,18 @@ mydtu <- call_DTU(annot= myannot, count_data_A= mycond_A, count_data_B= mycond_B 1. `annot` - An annotation data frame, as described in the *Input formats* section. 2. `count_data_A` and `count_data_B` - are each a `data.table` of transcript abundances as described in the Input section.
-#### Optional arguments: +#### Optional arguments (will be recorded in the output object): * `name_A`, `name_B` - A name for each conditon. (Default `NA`) * `varname` - The name of the variable/condition. (Default `NA`) -* `description` - Free-text description. You can note experiment details, annotation source, anything you might find useful later. (Default `NA`) +* `description` - Free-text description. (Default `NA`) + +#### Advanced options + +Refer to the respective sections later in this vignette. -### ...from bootstrapped abundance estimates +### DTU from bootstrapped abundance estimates First, let's emulate some data, as we did before. @@ -161,20 +164,23 @@ First, let's emulate some data, as we did before. # Simulate some data. (Notice it is a different function than before.) simdat <- sim_boot_data() -# For convenience let's assign the contents of the list to separate variables. -mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition. -mycond_B <- simdat[[3]] # Simulated bootstrapped data for other condition. -myannot <- simdat[[1]] # Transcript and gene IDs for the above data. +# For convenience let's assign the contents of the list to separate variables +mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition. +mycond_B <- simdat[[3]] # Simulated bootstrapped data for other condition. +myannot <- simdat[[1]] # Transcript and gene IDs for the above data. ``` Now we can call DTU: ```{r, eval=FALSE} -# Find DTU between conditions "controls" and "patients" in the simulated data. -mydtu <- call_DTU(annot= myannot, boot_data_A= mycond_A, boot_data_B= mycond_B, - verbose= FALSE, - name_A= "wildtype", name_B= "some mutant", varname = "My phenotype", - description="Comparison of two simulated datasets of bootstrapped counts for the tutorial. Simulated using built-in functionality of RATs.") +# Find DTU between conditions. 
+mydtu <- call_DTU(annot= myannot, boot_data_A= mycond_A, + boot_data_B= mycond_B, verbose= FALSE, + name_A= "wildtype", name_B= "some mutant", + varname = "My phenotype", description="Comparison of + two simulated datasets of bootstrapped counts for the + tutorial. Simulated using built-in functionality + of RATs.") ``` #### Mandatory arguments: @@ -182,143 +188,62 @@ mydtu <- call_DTU(annot= myannot, boot_data_A= mycond_A, boot_data_B= mycond_B, 1. `annot` - An annotation data frame, as described in the *Input formats* section. 2. `boot_data_A` and `boot_data_B` - are each a list of `data.table` objects, as described in the Input section. -#### Optional arguments: +#### Optional arguments (will be recorded in the output object): * `name_A`, `name_B` - A name for each conditon. (Default `NA`) * `varname` - The name of the variable/condition. (Default `NA`) -* `description` - Free-text description. You can note experiment details, annotation source, anything you might find useful later. (Default `NA`) - - -### ...with a sleuth object - -First, let's emulate a Sleuth object, using the bundled tools. The real Sleuth objects are very large and very complex nested lists. -The emulated one contains only the minimum essential elements relevant to calling DTU with RATs. - -```{r} -# Simulate some data. -simdat <- sim_sleuth_data(cnames = c("controls", "patients")) -# controls and patients are arbitrary names to use as conditions. - -# For convenience let's assign the contents of the list to separate variables. -myslo <- simdat$slo # Simulated minimal sleuth object. -myannot <- simdat$annot # Transcript and gene Identifiers for the above data. -``` - -Now call DTU. - -```{r, eval=FALSE} -# Find DTU between conditions "controls" and "patients" in the simulated data. 
-mydtu <- call_DTU(annot= myannot, slo= myslo, name_A= "controls", name_B= "patients", - varname= "condition", verbose= FALSE, - description="Comparison of two conditions using a simulated sleuth object for the purposes of the tutorial. Simulated using built-in functionality of RATs.") -``` - -#### Mandatory arguents: - -1. `annot` - an annotation data frame, as described in the *Input formats* section. -2. `slo` - a Sleuth object. -3. `name_A` and `name_B` - the two groups of samples to compare. These are restricted to values available in `myslo$sample_to_covariates`. The values must both exist in the column specified by `varname`. -4. `varname` - the variable for which samples are compared. must be an existing column header in `myslo$sample_to_covariates`. - -#### Optional arguments: - -* `description` - Free-text description. You can note experiment details, annotation source, anything you might find useful later. (Default `NA`) - -Please note that, unlike the other two usage cases, `name_A`, `name_B` and `varname` are **not optional**, as they specify how data is extracted from the Sleuth object. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +* `description` - Free-text description. (Default `NA`) - +#### Advanced options - - +Refer to the respective sections later in this vignette. - - - - - - - - - -### ...with Salmon/Kallisto output +### DTU with Salmon/Kallisto output RATs offers a method to import bootstrapped abundances directly from [Salmon](https://combine-lab.github.io/salmon/) output (requires [wasabi](https://github.com/COMBINE-lab/wasabi)) or [Kallisto](https://pachterlab.github.io/kallisto/) -output. Abundances are corrected for transcript length and scaled to 1M transcripts (TPM). +output. The raw abundances are normalised to TPM (default). ```{r, eval=FALSE} # 1. Collect your outputs into vectors. The end of each path should be a # directory with a unique name/identifier for one sample. 
-samples_A <- c("your/path/SAMPLE1", "your/path/SAMPLE4","your/path/SAMPLE5", ...) -samples_B <- c("your/path/SAMPLE2", "your/path/SAMPLE3","your/path/SAMPLE7", ...) -# OR if your outputs are all in the same path, this syntax might be more convenient: -samples_A <- file.path("your/path/", c("SAMPLE1", "SAMPLE4","SAMPLE5", ...)) -samples_B <- file.path("your/path/", c("SAMPLE2", "SAMPLE3","SAMPLE7", ...)) +samples_A <- c("your/path/SAMPLE1", "your/path/SAMPLE4","your/path/SAMPLE5") +samples_B <- c("your/path/SAMPLE2", "your/path/SAMPLE3","your/path/SAMPLE7", + "your/path/SAMPLE10") # 2. Convert, import, and extract. -# The annotation is needed to enforce consistent transcripts order throughout RATs. -mydata <- fish4rodents(A_paths= samples_A, B_paths= samples_B, annot= myannot) - -# 3. Run RATs with the generic bootstrapped format: -mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A, - boot_data_B= mydata$boot_data_B, verbose= FALSE, - name_A= "controls", name_B= "patients", - varname = "condition", description="Comparison of two sets of bootstrapped counts imported from the quantification output.") +mydata <- fish4rodents(A_paths= samples_A, B_paths= samples_B, + annot= myannot, scaleto=100000000) ``` +The return value of `fish4rodents` is a list with two items: `$boot_data_A` and `$boot_data_B`. These two items are +formatted for RATs' generic bootstrapped input. + #### Mandatory arguments (`fish4rodents`): 1. `A_paths` and `B_paths` - Two vectors containing the paths to the quantification output directories, one vector for each condition. The last segments of each path should be a directory with a unique identifying name for a single sample. 2. `annot` - The annotation table that you will use for calling DTU (and that was used for the quantification). +This is needed to enforce consistent order of the transcripts in the tables, enabling efficient processing. 
-The return value of `fish4rodents` is a list with two items: `$boot_data_A` and `$boot_data_B`. These two items are -formatted for RATs' generic bootstrapped input. +#### Optional arguments + +* `scaleto` allows you to control the normalisation factor. (Default 1000000 gives TPM values). + +And finally run DTU in the way already shown: + +```{r, eval=FALSE} +# 3. Run RATs with the bootstrapped table data format: +mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A, + boot_data_B= mydata$boot_data_B, verbose= FALSE) +``` *** -# Additional Parameters and Settings +# Advanced Parameters and Settings In summary, `rats` works as follows: @@ -332,15 +257,16 @@ The following three main thresholds are used in RATs: ```{r, eval=FALSE} # Calling DTU with custom thresholds. -mydtu <- call_DTU(annot= myannot, slo= myslo, name_A= "controls", name_B= "patients", +mydtu <- call_DTU(annot= myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, p_thresh= 0.01, abund_thresh= 10, dprop_thres = 0.25) ``` 1. `p_thresh` - Statistical significance level. P-values below this will be considered significant. Lower threshold values are stricter. (Default 0.05) -2. `abund_thresh` - Noise threshold. Transcripts with abundance below that value in both conditions are ignored. Higher threshold values are stricter. (Default 5, assumes use of estimated counts similar to those reported by [Salmon](https://github.com/COMBINE-lab/salmon)) +2. `abund_thresh` - Noise threshold. Transcripts with abundance below that value in both conditions are ignored. Higher threshold values are stricter. (Default 5, assumes abundances scaled to library size) 3. `dprop_thresh` - Effect size threshold. Transcripts whose proportion changes between conditions by less than the threshold are considered non-DTU, regardless of their statistical significance. Higher threshold values are stricter. 
(Default 0.20) -The default values for these thresholds have been chosen such that they achieve a *median* FPR <5% for a high quality dataset from *Arabidopsis thaliana*, even with only 3 replicates per condition. Your mileage may vary and you should give some consideration to selecting appropriate values. +The default values for these thresholds have been chosen such that they achieve a *median* FDR <5% for a high quality dataset from *Arabidopsis thaliana*, even with only 3 replicates per condition. Your mileage may vary and you should give some consideration to selecting appropriate values. Depending on the settings, *additional thresholds* are available and will be discussed in their respective sections below. @@ -348,8 +274,8 @@ Depending on the settings, *additional thresholds* are available and will be dis RATs offers two types of bootstrapping: -1. Bootstrapping of `p-value` and `Dprop` against the technical fluctuations of the quantifications. This requires bootstrapped quantifications as input. -2. Bootstrapping of `p-value` and `Dprop` against the abundance fluctuations among samples. +1. Bootstrapping of significance and effect size against the technical fluctuations of the quantifications. This requires bootstrapped quantifications as input. +2. Bootstrapping of significance and effect size against the abundance fluctuations among samples. Enabling these two procedures assesses the robustness of the DTU calls. Calls that are easily overturned depending on which subset of the data is used may be spurious but may also be valid. Deeper sequencing and/or higher replication may resolve these ambiguities. @@ -357,16 +283,18 @@ may be spurious but may also be valid. Deeper sequencing and/or higher replicati In both cases, what is measured is the fraction of iterations in which the significance and effect size meet their respective thresholds. 
These fractions can be used as indicators of confidence in the calls, especially when a large number of iterations is performed. -If bootstrapping is switched off, the respective output fields will not be included in the output. +If bootstrapping is switched off, the respective fields will not be included in the output. ### Quantification bootstraping -Three parameters control bootstrapping of DTU calls on the abundance estimates: +Three parameters control bootstrapping of DTU calls against the quantification uncertainty: 1. `qboot` - Whether to bootstrap the quantifications or not. (Default TRUE) -2. `qbootnum` - How many bootstrap iterations to do. Preferably at least 100. If 0, RATs will try to infer a value from the data. (Default 0) -3. `qrep_thresh` - Reproducibility threshold. What fraction of the iterations has to agree on a result to consider it confident. Higher threshold values are stricter. (Default 0.95) +2. `qbootnum` - How many bootstrap iterations to do. Preferably at least 100. (Default 0) If 0, +RATs will try to infer a value from the data (currently equal to the number of bootstraps in the data). +3. `qrep_thresh` - Reproducibility threshold. What fraction of the iterations has to agree on a result to consider it confident. (Default 0.95) +If you want to calculate the reproducibility but don't want it to weigh in on the DTU classifications, set the threshold to 0. In this process, one quantification iteration will be randomly selected from each sample and DTU will be called on it. This will be repeated `qbootnum` times. Because the number of replicates remains the same, the statistical power is not compromised. Therefore, the reproducibility will be **used as a criterion** in calling DTU, along with @@ -381,12 +309,14 @@ Warnings will be generated if `qbootnum` is too low or too high, but in most cas ```{r, eval=FALSE} # Bootstrap (default). Do 100 iterations. 
-mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", - name_B = "patients", qboot = TRUE, qbootnum = 100, qrep_thresh= 0.95) +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + qboot = TRUE, qbootnum = 100, qrep_thresh= 0.95) # Skip bootstraps. -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", - name_B = "patients", qboot = FALSE) +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + qboot = FALSE) ``` @@ -395,61 +325,66 @@ mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", Three parameters control bootstrapping of DTU calls agaisnt the samples: 1. `rboot` - Whether to bootstrap the replicates or not. (Default TRUE) -2. `rrep_thresh` - Reproducibility threshold. What fraction of the iterations has to agree on a result to consider it confident. Higher threshold values are stricter. (Default 0.85) -3. `rrep_as_crit` - Whether the replicate-wise reproducibility should be used as a criterion in calling DTU. (Default TRUE) +2. `rrep_thresh` - Reproducibility threshold. What fraction of the iterations has to agree on a result to consider it confident. (Default 0.85) +If you want to calculate the reproducibility but don't want it to weigh in on the DTU classifications, set the threshold to 0. Unlike bootstrapping the quantifications, bootstrapping the replicates is currently **not stochastic**. RATs will do ALL the 1 vs 1 combinations of samples, one from each condition. This behaviour may be changed if in the future there is demand for high replication that reaches the capacity limits of R matrices. - To maintain comparable statistical significance to the actual experimental design, because only one replicate is used each time, the abundances are scaled up by the number of replicates in the condition. This is equivalent to the idealised scenario of all the replicates being extremely consistent and similar to the chosen one. 
```{r, eval=FALSE} # Bootstrap (default). -# Note that for few replicates, the reproducibility values are highly discrete: -# The number of iterations for 3 samples per condition is 3*3=9. -# So the minimum possible error rate is 1/9=0.111. -# The corresponding threshold is 1-0.111=0.888. -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", - name_B = "patients", rboot = TRUE, qrep_thresh= 0.85) +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + rboot = TRUE, qrep_thresh= 0.85) # Skip bootstraps. -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", - name_B = "patients", rboot = FALSE) +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + rboot = FALSE) ``` +**Note** that for few replicates, the reproducibility values are highly discrete: +The number of iterations for 3 samples per condition is `3 * 3 = 9`. +So the minimum possible error rate is `1 / 9 = 0.111111...`. +The corresponding threshold is `1-0.1111111... = 0.8888888...`, hence the default threshold of 0.85. + ## Multi-threading -RATs completion time depends on the number of expressed annotated transcripts. Single-threaded, RATs can take up to a few minutes per iteration for large annotations. +RATs completion time depends on the number of annotated and expressed transcripts. Single-threaded, RATs can take up to a few minutes per iteration for large annotations. Fortunately, the task is highly parallelisable: ```{r eval=FALSE} # Using 8 threads/cores for parallel computing. -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", - name_B = "patients", threads = 8) +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + threads = 8) ``` 1. `threads` - The number of threads to use. (Default 1) -Due to core R implementation limitations, multi-threading works only in POSIX-compliant systems. In other systems, the option will be ignored and R will run serially. 
-Most Linux distros, Unix versions, and recent OSX versions are supposedly compatible, and there are environments that can be installed on Windows that might circumvent the issue. +Due to core R implementation limitations, the type of multi-threading used in RATs works only in POSIX-compliant systems. +Refer to the `parallel` package for details. ## Test selection -RATs runs both gene-level calls and transcript-level calls. They are complementary and we recommend using them -together, but the option to skip either is provided for special use cases. The fields of the skipped test will be filled with `NA`. +RATs runs both gene-level DTU calls and transcript-level DTU calls. They are independent from one another and we consider them complementary and recommend using them together, but the option to skip either is provided for special use cases. +The output fields of the skipped test will be filled with `NA`. ```{r, eval=FALSE} # Transcripts only. -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", - name_B = "patients", testmode="transc") +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + testmode="transc") # Genes only. -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", - name_B = "patients", testmode="genes") +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + testmode="genes") ``` -1. `testmode` - Which test(s) to run {"transc", "genes", "both""}. (Default "both") +1. `testmode` - Which test(s) to run {"transc", "genes", "both"}. (Default "both") ## Correction for multiple testing @@ -459,8 +394,9 @@ options is listed in R's `p.adjust.methods`. ```{r, eval=FALSE} # Bonferroni correction. -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", - name_B = "patients", correction = "bonferroni") +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + correction = "bonferroni") ``` 1. 
`correction` - Type of multiple testing correction. (Default "BH") @@ -470,47 +406,6 @@ mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", RATs needs to pull information from different fields of the data and annotation and it does so based on the names of the columns. You can override the default names of these fields. - -```{r} -# Lets emulate some input with custom field names. -sim <- sim_sleuth_data(varname = "mouse", cnames = c("Splinter", "Mickey"), - COUNTS_COL = "the-counts", TARGET_COL = "transcript", - PARENT_COL = "gene", BS_TARGET_COL = "trscr") -myslo <- sim$slo -myannot <- sim$annot - -# Sleuth covariates table. -print( sim$slo$sample_to_covariates ) -# Sleuth bootstrapped quantifications. -print( head(sim$slo$kal[[1]]$bootstrap[[1]]) ) -# Annotation. -print( head(sim$annot) ) -``` - - -### Sleuth field names - -Although we expect Sleuth field names to be constant with the exception of covariate names, RATs allows overriding the -expected names. One reason to do this would be to switch between using `tpm` and `est_count` abundances. - -```{r, eval=FALSE} -# Call DTU on data with custom field names. -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "Splinter", name_B = "Mickey", - varname="mouse", COUNTS_COL="the-counts", BS_TARGET_COL="trscr", - verbose = FALSE) - -#! In our example data here, the annotation fields have been modified as well, -#! so this command will NOT run as it is shown. You will also need the parameters -#! shown in teh section below. -``` - -1. `varname` - The field name in `myslo$sample_to_covariates` where the desired condition names are listed. (Default "condition") -2. `COUNTS_COL` - The name of the field holding the abundances in the bootstrap tables of Sleuth. (Default "tpm") -3. `BS_TARGET_COL` - The name of the field holding the transcript identifiers in the bootstrap tables of Sleuth. 
(Default "target_id") - -The `COUNTS_COL` and `BS_TARGET_COL` parameters are also available for `denest_sleuth_boots()`. - - ### Annotation field names Although it is easy to rename the columns of a table to comply with the expected names, this may sometimes be undesireable, so RATs @@ -518,13 +413,9 @@ allows you to change the expected names instead. ```{r, eval=FALSE} # Call DTU using annotation with custom field names. -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "Splinter", name_B = "Mickey", - varname="mouse", TARGET_COL="transcript", PARENT_COL="gene", - verbose = FALSE) - -#! In our example data here, the Sleuth fields have been modified as well, -#! so this command will NOT run as it is shown. You will also need the parameters -#! shown in the section above. +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + TARGET_COL="transcript", PARENT_COL="gene") ``` 1. `TARGET_COL` - The name of the field holding the transcript identifiers in the annotation data frame. (Default "target_id") @@ -536,29 +427,29 @@ The `TARGET_COL` and `PARENT_COL` parameters are also available for `denest_sleu ## Abundance scaling As mentioned previously, various normalised abundance units (like TPM) are scaled to an arbitrary and usually smaller than actual -sample size. This artificially deflates the significances calculated by RATs and reduces the statistical power. If you can infer -a more likely number of transcripts in your sequenced samples (based on your sample preparation details), you can specify this -scaling factor to "correct" the abundance values. +sample size. This artificially deflates the significances calculated by RATs and reduces the statistical power. The `scaling` parameter allows you to scale normalised abundances up to the actual library sizes. 
```{r eval=FALSE} -# If your sample preps contain ~70 million transcripts and you are using TPM abundances -mydtu <- call_DTU(annot = myannot, slo = myslo, name_A = "controls", - name_B = "patients", COUNTS_COL = "tpm", scaling = 70) +# If your sample preps contain ~70 million transcripts +# and you are using TPM abundances, the scaling factor +# is 70M / 1M = 70. +mydtu <- call_DTU(annot = myannot, + boot_data_A= mycond_A, boot_data_B= mycond_B, + scaling = 70) ``` - *** # Annotation discrepancies -Different annotation versions often preserve transcript IDs, despite altering the details of the transcript models, -such as UTRs. It is important to use the same annotation throughout the workflow, especially for the quantifications. -Otherwise, the data is not comparable. +Different annotation versions often preserve transcript IDs, despite altering the details of the transcript model. +It is important to use the same annotation throughout the workflow, otherwise the abundances will not be comparable +in a meaningful way. -All internal operations and the output of RATs will be based on the annotation provided: +All internal operations and the output of RATs are based on the annotation provided: * Any transcripts present in the data but missing from the annotation will be ignored completely and will not show up in the output, as there is no reliable way to match them to gene IDs.
diff --git a/vignettes/intro.Rmd b/vignettes/intro.Rmd deleted file mode 100644 index 8a5ac5a..0000000 --- a/vignettes/intro.Rmd +++ /dev/null @@ -1,187 +0,0 @@ ---- -title: 'RATs: Quick Start' -author: "Kimon Froussios" -date: "15 MAY 2017" -output: - html_document: - keep_md: yes - theme: readable - toc: yes - toc_float: yes - pdf_document: - toc: yes -vignette: > - \usepackage[utf8]{inputenc} - %\VignetteIndexEntry{RATs 1: Introduction & Quick Start} - %\VignetteEngine{knitr::rmarkdown} ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - - -*** - -# Introduction - -The `rats` R package aims to identify genes that show a shift in the relative abundance of their transcript isoforms -between two conditions (*Differential Transcript Usage* -- **DTU**). This is supplementary -to identifying *Differential Transcript Expression* (**DTE**), which simply measures whether transcripts change in -abundance, but does not compare them to their sibling isoforms within each gene. Situations that show DTU are a -subset of those that show DTE. The figure below shows the relationship between DTU and DTE, as well as between them and -*differential gene expression* (**DGE**): - -![DGE vs. DTE vs. DTU](./figs/dge-dte-dtu.jpg) - - -**RATs is workflow-agnostic**. Quantification quality details are left to the quantification tools; RATs uses only the -transcript abundances. This makes it *suitable for use with alignment-free quantification tools* like [Kallisto](http://pachterlab.github.io/kallisto/) -or [Salmon](https://github.com/COMBINE-lab/salmon). It is also compatible with DTE output from [Sleuth](http://pachterlab.github.io/sleuth). - -Additionally, RATs is able to take advantage of the bootstrapped quantifications provided by the alignment-free tools. These bootstrapped -data are used by RATs to assess how much the technical variability of the heuristic quantifications affects differential transcript usage. 
-This provides a measure of confidence in the DTU calls. - - -*** - - -# Install RATs - -## Install dependencies first - -* Packages needed for computation (mandatory) - Available on CRAN: - -```{r, eval= FALSE} -install.packages(c("data.table", "matrixStats"), dependencies=TRUE) -``` - -* Package needed only for plotting (recommended) - Available on CRAN: - -```{r, eval= FALSE} -install.packages("ggplot2", dependencies=TRUE) -``` - -* Packages needed only for importing directly from Salmon/Kallisto output (optional) - Available through Bioconductor: - -```{r, eval= FALSE} -# Devtools (available on both CRAN and Bioconductor), needed by wasabi apparently? -install.packages("devtools", dependencies=TRUE) - -source("http://bioconductor.org/biocLite.R") -# Wasabi converter from Salmon/Sailfish to Kallisto. -biocLite("COMBINE-lab/wasabi") -#Kallisto parser -biocLite("rhdf5") -``` - -* Package needed only for interactive visualisation features (optional) - Available on CRAN: -```{r, eval= FALSE} -install.packages("shiny", dependencies=TRUE) -``` - - -## Install the rats package - -#### from Github - -RATs is available as R source package from the project's [releases section](https://github.com/bartongroup/Rats/releases) on Github. -Download the latest release and then install it using: - -```{r, eval= FALSE} -install.packages("", repos = NULL, type="source") -``` - -Or install directly with the help of the [devtools](https://www.rstudio.com/products/rpackages/devtools/) package: - -```{r, eval= FALSE} -devtools::install_github("bartongroup/rats", ref="master") -``` - -Release snapshots are labelled with even-ending version numbers (x.x.x0/2/4/6/8). - -#### through Bioconductor - -Not available yet. 
- -#### developmental version - -The most current developmental version of `rats` can be installed directly from the working copy on Github, using `devtools`: - -```{r, eval= FALSE} -devtools::install_github("bartongroup/rats", ref="development") -``` - -As this version represents active development, it can change at any moment. Features may be added, removed or changed without notice, and it -may even temporarily not work at all. **Do not rely on this version for reproducible analysis of important data!** -A main reason to use the developmental version would be to test the resolution of bugs, or to give us feedback on the changes as they happen. -Developmental versions are labelled with odd-ending version numbers (x.x.x1/3/5/7/9). - - -*** - - -# Use RATs - -RATs uses, as input, tables of transcript abundances with or without quantification bootstraps. For convenience, RATs can also -extract the transcript abundances from a Sleuth object. The latter option is shown below. For more details and settings, consult -the *Input & Settings* vignette. - -```{r, eval= FALSE} -# 1. Load into R session. -library(rats) - -# 2. Specify transcript grouping: -my_identifiers_table <- annot2ids("my_annotation.gtf") - -# 3a. Call DTU on a sleuth object, using default settings: -mydtu <- call_DTU(annot= my_identifiers_table, slo= my_sleuth_object, - name_A= "My_condition", name_B= "My_other_condition") -# 3b. Call DTU on generic bootstrapped abundance estimates: -mydtu <- call_DTU(annot= my_identifiers_table, boot_data_A= my_list_data_tables_A, - boot_data_B= my_list_data_tables_A) -# 3c. Call DTU on generic abundance estimates: -mydtu <- call_DTU(annot= my_identifiers_table, count_data_A= my_data_table_A, - count_data_B= my_data_table_B, qboot= FALSE) - -# 4. Plot significance VS effect size: -plot_overview(mydtu) - -# 5a. Get all gene and transcript identifiers per category -# (significant DTU, no DTU, Not Applicable): -myids <- get_dtu_ids(mydtu) - -# 5b. 
Get all gene and transcript identifiers implicated in isoform switching: -myids <- get_switch_ids(mydtu) - -# 6. Plot isoform changes for a given gene. -plot_gene(mydtu, "my_awesome_gene_ID") -``` - - - -### Defaults - -RATs provides default values for all its settings, however optimal threshold values will depend on the quantity, -quality and biology of your data, so some trial and error may be required on your part. The current default values -were optimised to control False Discovery Rate at the 5% level on *Arabidopsis thaliana* data. -The available thresholds and settings are discussed in the *Input & Settings* vignette. - - -*** - - -# Contact information - -The `rats` R package was developed within [The Barton Group](http://www.compbio.dundee.ac.uk) at [The University of Dundee](http://www.dundee.ac.uk) -by Dr. Kimon Froussios, Dr. Kira Mourão and Dr. Nick Schurch. - -To **report problems** or **ask for assistance**, please raise a new issue [on the project's support forum](https://github.com/bartongroup/Rats/issues). -Providing a *reproducible working example* that demonstrates your issue is strongly encouraged to help us understand the problem. Also, be sure -to **read the vignette(s)**, and browse/search the support forum before posting a new issue, in case your question is already answered there. - -Enjoy! - -![](./figs/rats_logo.png) - diff --git a/vignettes/output.Rmd b/vignettes/output.Rmd index 32399d6..23e58cf 100644 --- a/vignettes/output.Rmd +++ b/vignettes/output.Rmd @@ -1,7 +1,7 @@ --- title: 'RATs: Raw Output' author: "Kimon Froussios" -date: "01 JUN 2017" +date: "19 SEP 2017" output: html_document: keep_md: yes @@ -22,11 +22,7 @@ knitr::opts_chunk$set(echo = TRUE) *** -```{r, include=FALSE} -library(rats) -``` - -```{r, eval=FALSE} +```{r} library(rats) ``` @@ -35,20 +31,18 @@ Set up an example. ```{r} # Simulate some data. 
-simdat <- sim_sleuth_data(cnames = c("controls", "patients")) -# For convenience let's assign the contents of the list to separate variables. -myslo <- simdat$slo -myannot <- simdat$annot +simdat <- sim_boot_data() +# For convenience let's assign the contents of the list to separate variables +mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition. +mycond_B <- simdat[[3]] # Simulated bootstrapped data for other condition. +myannot <- simdat[[1]] # Transcript and gene IDs for the above data. # Call DTU -mydtu <- call_DTU(annot= myannot, slo= myslo, name_A= "controls", - name_B= "patients", varname= "condition", verbose= FALSE, - dprop_thresh=0.1, qboot=TRUE, rboot=TRUE, - description="Comparison of two conditions using a simulated sleuth object for the purposes of the tutorial. Simulated using built-in functionality of RATs.") +mydtu <- call_DTU(annot= myannot, verbose= FALSE, + boot_data_A= mycond_A, boot_data_B= mycond_B, + dprop_thresh=0.1, qboot=TRUE, rboot=FALSE) ``` -We lowered `dprop_thresh` and bypassed the bootstrapping to increase the number of DTU positive -classifications, to better illustrate the summary fucntions below for the given artificial data. *** @@ -56,8 +50,9 @@ classifications, to better illustrate the summary fucntions below for the given # Quick results For your convenience, RATs provides a few functions to give you a quick summary of your results. -However, we do recommend you become familiar with the actual results structure and content, so that you -can judge the quality of the DTU calls and trace the reasons behind the classification of each gene or transcript. +However, we do recommend you become familiar with the actual results structure and content, as +this will allow you to better understand the reasons behind your results and enable you to ask +more complex/detailed questions than those covered by the functions provided. 
## Summary of DTU @@ -65,19 +60,17 @@ The `dtu_summary()` function lists the total number of genes and transcripts for * DTU: There is significant change in isoform ratios. * non-DTU: No significant change. -* NA: Not applicable. Genes/transcripts with abundance below the noise threshold, or where the gene has only one transcript. +* NA: Not applicable. Genes/transcripts with abundance below the noise threshold, or where the gene has only one known transcript. ```{r} # A really simple tally of the outcome. print( dtu_summary(mydtu) ) ``` -RATs uses *two* separate methods of identifying DTU: The gene-level test works on the isoforms of each gene as a set, -whereas the transcript-level test works on each isoform individually -and then aggregates the isoform results to form a gene-level result. Therefore, as of v0.4.2, `dtu_summury()` +RATs tests for DTU at both the gene level and the transcript level. Therefore, as of v0.4.2, `dtu_summary()` will display the DTU genes according to either method as well as the intersection of the two. -The `get_dtu_ids()` function lists the actual identifiers per category, instead of the numbers in each category. +The `get_dtu_ids()` function lists the corresponding identifiers per category. As of `v0.4.2` it uses the same category names as `dtu_summury()` for consistency and clarity. ```{r} @@ -105,7 +98,14 @@ print( ids ) ``` Again, three versions of the results are shown, covering the respective DTU methods in RATs. -`MIX6` is demonstrating a switch in both primary and tertiary isoforms. + +The gene "MIX6" is showing both primary and tertiary isoform switching. Looking at the abundances (more on this in the plots vignette), +indeed isoforms c1 and c2 swap ranks, as do isoforms c3 and c4: + +```{r} +plot_gene(mydtu, "MIX6") +``` + ## Summary of DTU plurality @@ -120,7 +120,10 @@ ids <- get_plurality_ids(mydtu) print( ids ) ``` -The categories are named by the number of isoforms affected.
There is one gene (`MIX6`) that has `3` isoforms affected. +These give you the number and IDs of genes in which 2, 3, etc isoforms show DTU. + +For example, one gene ("MIX6") is showing significant change in 3 of its isoforms. + *** @@ -143,28 +146,27 @@ print( names(mydtu) ) print( names(mydtu$Parameters) ) ``` -1. `description` - (str) Free-text description of the run. It is useful to record data sources, annotation source and version, experimental parameters... -2. `time` - (str) Date and time for the run. This does not represent the exact time the run started. -3. `rats_version` - (str) The version of RATs. -4. `R_version` - (str) The version of R (including OS architecture). -5. `var_name` - (str) The value passed to the `varname` parameter. -6. `cond_A` & `cond_B` - (str) The values passed to the `name_A` and `name_B` parameters. -7. `data_type` - (str) The format of the input data. -8. `num_replic_A` & `num_replic_B` - (int) The number of samples in each condition. -9. `num_genes` - (int) The number of genes in the provided annotation. -10. `num_transc` - (int) The number of transcripts in the provided annotation. -11. `tests` - (str) The value passed to the `testmode` parameter. -12. `p_thresh` - (num) The value passed to the `p_thresh` parameter. -13. `abund_thresh` - (num) The value passed to the `abund_thresh` parameter. -14. `dprop_thresh` - (num) The value passed to the `dprop_thresh` parameter. -15. `abund_sclaing` - (num) The value passed to the `scaling` parameter. -16. `quant_reprod_thresh` - (num) The value passed to the `qrep_thresh` parameter. -17. `quant_boot` - (bool) The value passed to the `qboots` parameter. -18. `quant_bootnum` - (int) The value passed to the `qbootnum` parameter. -19. `rep_reprod_thresh` - (num) The value passed to the `rrep_thresh` parameter. -20. `rep_boots` - (bool) The value passed to the `rboot` parameter. -21. 
`rep_bootnum` - (int) The number of replicate bootstrapping iterations (usually M*N, where M and N are the number of samples in the two conditions). -22. `rep_reprod_as_crit` - (bool) The value passed to the `rrep_as_crit` parameter. +* `description` - (str) Free-text description of the run. It is useful to record data sources, annotation source and version, experimental parameters... +* `time` - (str) Date and time for the run. This does not represent the exact time the run started. +* `rats_version` - (str) The version of RATs. +* `R_version` - (str) The version of R (including OS architecture). +* `var_name` - (str) The value passed to the `varname` parameter. +* `cond_A` & `cond_B` - (str) The values passed to the `name_A` and `name_B` parameters. +* `data_type` - (str) The format of the input data. +* `num_replic_A` & `num_replic_B` - (int) The number of samples in each condition. +* `num_genes` - (int) The number of genes in the provided annotation. +* `num_transc` - (int) The number of transcripts in the provided annotation. +* `tests` - (str) The value passed to the `testmode` parameter. +* `p_thresh` - (num) The value passed to the `p_thresh` parameter. +* `abund_thresh` - (num) The value passed to the `abund_thresh` parameter. +* `dprop_thresh` - (num) The value passed to the `dprop_thresh` parameter. +* `abund_scaling` - (num) The value passed to the `scaling` parameter. +* `quant_reprod_thresh` - (num) The value passed to the `qrep_thresh` parameter. +* `quant_boot` - (bool) The value passed to the `qboot` parameter. +* `quant_bootnum` - (int) The value passed to the `qbootnum` parameter. +* `rep_reprod_thresh` - (num) The value passed to the `rrep_thresh` parameter. +* `rep_boots` - (bool) The value passed to the `rboot` parameter. +* `rep_bootnum` - (int) The number of replicate bootstrapping iterations (usually M*N, where M and N are the number of samples in the two conditions).
**Note:** If bootstraps are disabled, the bootstrap-related fields may have `NA` values despite any values that were passed to their respective parameters. @@ -185,33 +187,33 @@ print( names(mydtu$Genes) ) The first few columns show the result of each decision step. The remaining columns list the values based on which the decisions were made. Pseudo-code formulas are shown to help understand how the different fields interact when making decisions. -1. `parent_id` - (str) Gene identifier. -2. `elig` - (bool) Eligible for testing. Whether the gene met the pre-filtering criteria. `= (elig_transc >= 2)`. -3. `sig` - (bool) Statistically significant. `= (pvalAB_corr < Parameters$p_thresh) & (pvalBA_corr < Parameters$p_thresh)`. -4. `elig_fx` - (bool) Eligible effect size. Whether at least one of the isoforms meets the effect size criterion. `= any(Transcript[parent_id, elig_fx])`. -5. `quant_reprod` - (bool) Quantification reproducible. +* `parent_id` - (str) Gene identifier. +* `elig` - (bool) Eligible for testing. Whether the gene met the pre-filtering criteria. `= (elig_transc >= 2)`. +* `sig` - (bool) Statistically significant. `= (pvalAB_corr < Parameters$p_thresh) & (pvalBA_corr < Parameters$p_thresh)`. +* `elig_fx` - (bool) Eligible effect size. Whether at least one of the isoforms meets the effect size criterion. `= any(Transcript[parent_id, elig_fx])`. +* `quant_reprod` - (bool) Quantification reproducible. For positive DTU, `= (quant_dtu_freq >= Parameters$quant_reprod_thresh)`, for non-DTU `= (quant_dtu_freq <= 1 - Parameters$quant_reprod_thresh)`. -6. `rep_reprod` - (bool) Replication reproducible. +* `rep_reprod` - (bool) Replication reproducible. For positive DTU, `= (rep_dtu_freq >= Parameters$rep_reprod_thresh)`, for non-DTU `= (rep_dtu_freq <= 1 - Parameters$rep_reprod_thresh)`. -7. `DTU` - (bool) Differential Transcript Usage. `= (sig & elig_fx & quant_reprod)`, or if `rrep_as_crit==TRUE` then `= (sig & elig_fx & quant_reprod & rep_reprod)`. -8. 
`transc_DTU` - (bool) Aggregated from `Transcripts$DTU`: At least one isoform was individually reported as DTU. `= any(Transcript[parent_id, DTU])`. -9. `known_transc` - (int) Number of known trascripts for this gene according to the given annotation. -10. `detect_trancs` - (int) Number of detected (expressed) transcripts in the given dataset. -11. `elig_transc` - (int) Number of eligible transcripts, aggregated from `Transcripts$elig`. -12. `pval` - (num) G test of independence p-value for the isoform ratios. -13. `pval_corr` - (num) Multiple testing corrected p-value. `pval_corr= p.adjust(pval, Parameters$correction)`. -14. `quant_p_mean` - (num) Mean p-value across quantification bootstraps. -15. `quant_p_stdev` - (num) Standard deviation of p-values across quantification bootstraps. -16. `quant_p_min` - (num) Minimum observed p-value across quantification bootstraps. -17. `quant_p_max` - (num) Maximum observed p-value across quantification bootstraps. -18. `quant_na_freq` - (num) Fraction of quantification iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). -19. `quant_dtu_freq` - (num) Fraction of replicate iterations that support a positive `DTU` classification. -20. `rep_p_mean` - (num) Mean p-value across replication bootstraps. -21. `rep_p_stdev` - (num) Standard deviation of p-values across replication bootstraps. -22. `rep_p_min` - (num) Minimum observed p-value across replication bootstraps. -23. `rep_p_max` - (num) Maximum observed p-value across replication bootstraps. -24. `rep_na_freq` - (num) Fraction of replication iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). -25. `rep_dtu_freq` - (num) Fraction of replication iterations that support a positive `DTU` classification. +* `DTU` - (bool) Differential Transcript Usage. `= (sig & elig_fx & quant_reprod)`, or if `rrep_as_crit==TRUE` then `= (sig & elig_fx & quant_reprod & rep_reprod)`. 
+* `transc_DTU` - (bool) Aggregated from `Transcripts$DTU`: At least one isoform was individually reported as DTU. `= any(Transcript[parent_id, DTU])`. +* `known_transc` - (int) Number of known transcripts for this gene according to the given annotation. +* `detect_transc` - (int) Number of detected (expressed) transcripts in the given dataset. +* `elig_transc` - (int) Number of eligible transcripts, aggregated from `Transcripts$elig`. +* `pval` - (num) G test of independence p-value for the isoform ratios. +* `pval_corr` - (num) Multiple testing corrected p-value. `pval_corr= p.adjust(pval, Parameters$correction)`. +* `quant_p_mean` - (num) Mean p-value across quantification bootstraps. +* `quant_p_stdev` - (num) Standard deviation of p-values across quantification bootstraps. +* `quant_p_min` - (num) Minimum observed p-value across quantification bootstraps. +* `quant_p_max` - (num) Maximum observed p-value across quantification bootstraps. +* `quant_na_freq` - (num) Fraction of quantification iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). +* `quant_dtu_freq` - (num) Fraction of quantification iterations that support a positive `DTU` classification. +* `rep_p_mean` - (num) Mean p-value across replication bootstraps. +* `rep_p_stdev` - (num) Standard deviation of p-values across replication bootstraps. +* `rep_p_min` - (num) Minimum observed p-value across replication bootstraps. +* `rep_p_max` - (num) Maximum observed p-value across replication bootstraps. +* `rep_na_freq` - (num) Fraction of replication iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). +* `rep_dtu_freq` - (num) Fraction of replication iterations that support a positive `DTU` classification. **Note:** The fields reporting on the bootstraps will not be shown when bootstrapping is disabled.
@@ -228,39 +230,39 @@ The maximum likelihood-based G-test of independence is used to compare each give print( names(mydtu$Transcripts) ) ``` -1. `target_id` - (str) Transcript identifier. -2. `parent_id` - (str) Gene identifier. -3. `elig_xp` - (bool) Eligible expression level. `= (meanA >= Parameters$abund_thresh | meanB >= Parameters$abund_thresh)`. -4. `elig` - (bool) Eligible for testing (meets the noise threshold and at least one other isoform is expressed). `= (elig_xp & totalA != 0 & totalB != 0 & (sumA != totalA | sumB != totalB))`. -5. `sig` - (bool) Statistically significant. `= (pval_corr < Parameters$p_thresh)`. -6. `elig_fx` - (bool) Eligible effect size. Proxy for biological significance. `= (Dprop > Parameters$dprop_thresh)`. +* `target_id` - (str) Transcript identifier. +* `parent_id` - (str) Gene identifier. +* `elig_xp` - (bool) Eligible expression level. `= (meanA >= Parameters$abund_thresh | meanB >= Parameters$abund_thresh)`. +* `elig` - (bool) Eligible for testing (meets the noise threshold and at least one other isoform is expressed). `= (elig_xp & totalA != 0 & totalB != 0 & (sumA != totalA | sumB != totalB))`. +* `sig` - (bool) Statistically significant. `= (pval_corr < Parameters$p_thresh)`. +* `elig_fx` - (bool) Eligible effect size. Proxy for biological significance. `= (Dprop > Parameters$dprop_thresh)`. 7. `quant_reprod` - (bool) Quantification reproducible. For positive DTU, `= (quant_dtu_freq >= Parameters$quant_reprod_thresh)`, for non-DTU `= (quant_dtu_freq <= 1 - Parameters$quant_reprod_thresh)`. -8. `rep_reprod` - (bool) Replication reproducible. +* `rep_reprod` - (bool) Replication reproducible. For positive DTU, `= (rep_dtu_freq >= Parameters$rep_reprod_thresh)`, for non-DTU `= (rep_dtu_freq <= 1 - Parameters$rep_reprod_thresh)`. -9. `DTU` - (bool) The Transcript is Differentially Used. `= (sig & elig_fx & quant_reprod)`, or if `rrep_as_crit==TRUE` then `= (sig & elig_fx & quant_reprod & rep_reprod)`. -10. 
`gene_DTU` - (bool) Expanded from `Genes$DTU`. Indicates that the gene as a whole shows significant change in isoform ratios. `= (Genes[parent_id, DTU])`. -11. `meanA` and `meanB` - (num) The mean abundance across replicates, for each condition. -12. `stdevA` and `stdevB` - (num) The standard deviation of the abundance across the replicates. -13. `sumA` and `sumB` - (num) The sum of the abundances across replicates. This is the value used for the tests, so that replication level informs the significance. -14. `log2FC` - (num) lgo2 of fold-change of transcript abundance: `= (sumB / sumA)`. sumA and sumB are NOT normalised for sequencing depth, so when log2FC is plotted the distribution may not be centred on 0. If it is not centred on 0, do NOT use this log2FC for differential transcript expression analysis! -15. `totalA` and `totalB` - (num) The total abundance for the gene: `totalA= sum(transcripts[parent_id, sumA])`. -16. `propA` and `propB` - (num) The proportion of the gene expression owed to this transcript. `propA= sumA / totalA`. -17. `Dprop` - (num) The difference in the proportion of the transcript between the two conditions (effect size). `= (probB - propA)`. -18. `pval` - (num) The proportion equality test P-value for the transcript. -19. `pval_corr` - (num) Multiple testing corrected p-value. `= p.adjust(pval, Parameters$correction)`. -20. `quant_p_mean` - (num) Mean p-value across quantification bootstraps. -21. `quant_p_stdev` - (num) Standard deviation of p-values across quantification bootstraps. -22. `quant_p_min` - (num) Minimum observed p-value across quantification bootstraps. -23. `quant_p_max` - (num) Maximum observed p-value across quantification bootstraps. -24. `quant_na_freq` - (num) Fraction of quantification iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). -25. `quant_dtu_freq` - (num) Fraction of quantification iterations that support a positive `DTU` classification. -26. 
`rep_p_mean` - (num) Mean p-value across replication bootstraps. -27. `rep_p_stdev` - (num) Standard deviation of p-values across replication bootstraps. -28. `rep_p_min` - (num) Minimum observed p-value across replication bootstraps. -29. `rep_p_max` - (num) Maximum observed p-value across replication bootstraps. -30. `rep_na_freq` - (num) Fraction of replication iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). -31. `rep_dtu_freq` - (num) Fraction of replication iterations that support a positive `DTU` classification. +* `DTU` - (bool) The Transcript is Differentially Used. `= (sig & elig_fx & quant_reprod)`, or if `rrep_as_crit==TRUE` then `= (sig & elig_fx & quant_reprod & rep_reprod)`. +* `gene_DTU` - (bool) Expanded from `Genes$DTU`. Indicates that the gene as a whole shows significant change in isoform ratios. `= (Genes[parent_id, DTU])`. +* `meanA` and `meanB` - (num) The mean abundance across replicates, for each condition. +* `stdevA` and `stdevB` - (num) The standard deviation of the abundance across the replicates. +* `sumA` and `sumB` - (num) The sum of the abundances across replicates. This is the value used for the tests, so that replication level informs the significance. +* `log2FC` - (num) log2 of fold-change of transcript abundance: `= log2(sumB / sumA)`. sumA and sumB are NOT normalised for sequencing depth, so when log2FC is plotted the distribution may not be centred on 0. If it is not centred on 0, do NOT use this log2FC for differential transcript expression analysis! +* `totalA` and `totalB` - (num) The total abundance for the gene: `totalA= sum(transcripts[parent_id, sumA])`. +* `propA` and `propB` - (num) The proportion of the gene expression owed to this transcript. `propA= sumA / totalA`. +* `Dprop` - (num) The difference in the proportion of the transcript between the two conditions (effect size). `= (propB - propA)`. +* `pval` - (num) The proportion equality test P-value for the transcript.
+* `pval_corr` - (num) Multiple testing corrected p-value. `= p.adjust(pval, Parameters$correction)`. +* `quant_p_mean` - (num) Mean p-value across quantification bootstraps. +* `quant_p_stdev` - (num) Standard deviation of p-values across quantification bootstraps. +* `quant_p_min` - (num) Minimum observed p-value across quantification bootstraps. +* `quant_p_max` - (num) Maximum observed p-value across quantification bootstraps. +* `quant_na_freq` - (num) Fraction of quantification iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). +* `quant_dtu_freq` - (num) Fraction of quantification iterations that support a positive `DTU` classification. +* `rep_p_mean` - (num) Mean p-value across replication bootstraps. +* `rep_p_stdev` - (num) Standard deviation of p-values across replication bootstraps. +* `rep_p_min` - (num) Minimum observed p-value across replication bootstraps. +* `rep_p_max` - (num) Maximum observed p-value across replication bootstraps. +* `rep_na_freq` - (num) Fraction of replication iterations in which `DTU` could not be determined (not meeting pre-filtering criteria). +* `rep_dtu_freq` - (num) Fraction of replication iterations that support a positive `DTU` classification. **Note:** The fields reporting on the bootstraps will not be shown when bootstrapping is disabled. @@ -271,9 +273,8 @@ For positive DTU, `= (rep_dtu_freq >= Parameters$rep_reprod_thresh)`, for non-DT so if bootstrapped data was used, these values are the means across iterations. If plain abundances were provided as input, then `Abundances` essentially contains the input data. These abundances are included in the output because they are required for some of RATs' plotting options. - ```{r} -# Elements of ReplicateData +# Elements of Abundances. 
print( names(mydtu$Abundances) ) ``` diff --git a/vignettes/plots.Rmd b/vignettes/plots.Rmd index 8b8a83d..aa29347 100644 --- a/vignettes/plots.Rmd +++ b/vignettes/plots.Rmd @@ -1,7 +1,7 @@ --- title: 'RATs: Plots' author: "Kimon Froussios" -date: "01 JUN 2017" +date: "19 SEP 2017" output: html_document: keep_md: yes @@ -35,15 +35,16 @@ Set up an example. ```{r} # Simulate some data. -simdat <- sim_sleuth_data(cnames = c("controls", "patients")) -# For convenience let's assign the contents of the list to separate variables. -myslo <- simdat$slo -myannot <- simdat$annot +simdat <- sim_boot_data() +# For convenience let's assign the contents of the list to separate variables +mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition. +mycond_B <- simdat[[3]] # Simulated bootstrapped data for other condition. +myannot <- simdat[[1]] # Transcript and gene IDs for the above data. # Call DTU -mydtu <- call_DTU(annot= myannot, slo= myslo, name_A= "controls", name_B= "patients", - varname= "condition", verbose= FALSE, - description="Comparison of two conditions using a simulated sleuth object for the purposes of the tutorial. Simulated using built-in functionality of RATs.") +mydtu <- call_DTU(annot= myannot, verbose= FALSE, + boot_data_A= mycond_A, boot_data_B= mycond_B, + dprop_thresh=0.1, qboot=TRUE, rboot=FALSE) ``` @@ -78,19 +79,19 @@ plot_gene(mydtu, "MIX6", style="bycondition") The top two facets show the absolute abundances (counts) of the isoforms as supplied in the input, the bottom two show the correspondng relative abundances (proportions). These are split by condition: The left two facets refer to one condition, the right two ones to the other condition. The boxplots describe the abundance measurements of each isoform across replicates. +The actual measurements are overlayed as points and connected by lines so as to group the set of isoform abundances measured in each replicate. +This highlights the level of consistency between the replicates. 
The points and lines are placed slightly off-centre to prevent replicates +masking one another when the measurements are very similar. -The actual measurements are overlayed as points and connected by lines so as to group the set of isoform abundances measured in each replicate. This highlights the level of -consistency between the replicates. The points and lines are placed slightly off-centre to prevent replicates masking one another when -the measurements are very similar. The colours of the replicates are recycled between the two conditions, to reduce the required palette and -boost contrast. So in the example, there are 4 samples: controls-1, controls-2, patients-1 and patients-2. +The colours of the replicates are recycled between the two conditions, to reduce the required palette and boost contrast. +So in the example, there are 4 samples: controls-1, controls-2, patients-1 and patients-2. In the past, the fill colour encoded the DTU outcome, but to keep things more consistent with the other styles the DTU is now encoded as point shape while the fill colour echoes the condition. In this example, isoforms `.c1` and `.c2` change significantly, isoforms `.c3`, `.c4` and `.nc` do not change significantly -and isoform `.d` could not be tested. It is not always possible to see the fill colour, depending on the spead of the measurements, so -refering back to the RATs output tables may be necessary. +and isoform `.d` could not be tested. -This structure is great for seeing changes in the isoform distribution profile of a gene. A more minimalist version of the plot, without the boxpltos, +This structure is great for seeing changes in the isoform distribution profile of a gene.
A more minimalist version of this, without the boxplots, can also be obtained: ```{r, eval=FALSE} @@ -98,7 +99,7 @@ can also be obtained: plot_gene(mydtu, "MIX6", style="lines") ``` -When there are many lines or many isoforms, it may be preferable to group abundances by isoform, making individual comparisons easier: +When there are many replicates or many isoforms, it may be preferable to group abundances by isoform, making individual comparisons easier: ```{r} # Grouping by isoform: @@ -143,7 +144,7 @@ This is what these look like on a larger dataset: The gene version of the effect size distribution is often bi-modal with the positive peak being larger than the negative, because when a gene has isoforms with the same absolute effect size (for example genes with two isoforms) -the positive value is selected. +the positive value is deterministically selected. Although fold-changes of transcript expression are not tied to DTU, if your input counts are suitably normalised between conditions, you can plot the traditional FC volcano and the relationship between FC and proportion change. @@ -162,7 +163,7 @@ plot_overview(mydtu, type="fcVSdprop") ## Interactive plots -If you prefer picking points from a plot rather than sorting through tables, the volcano plot is also available through +If you prefer picking points from a plot rather than sorting through tables, the gene-level volcano plot is also available through an interactive `shiny` app, that pulls up the relevant `Gene` and `Transcript` results and the isoform abundance plot for any volcano point you select. @@ -210,30 +211,19 @@ plot_gene(mydtu, "MIX6", style="bycondition", fillby="condition", replcolvec=c("green", "darkgreen")) ``` -1. `isofcolvec` - Colour vector for isoform highlighting. Used to build a colorRampPalette. -2. `dtucolvec` - Colour vector for DTU highlighting. -3. `condcolvec` - Colour vector for condition highlighting. -4. `replcolvec` - Colour vector for replicate highlighting.
Used to build a colorRampPalette. -5. `nonecol` - Colour to use when no colour coding is wanted. +* `isofcolvec` - Colour vector for isoform highlighting. Used to build a colorRampPalette. +* `dtucolvec` - Colour vector for DTU highlighting. +* `condcolvec` - Colour vector for condition highlighting. +* `replcolvec` - Colour vector for replicate highlighting. Used to build a colorRampPalette. +* `nonecol` - Colour to use when no colour coding is wanted. ## General customisations for all plots -You can save any of the gene and overview plots as a `ggplot2` object and use [ggplot2](http://ggplot2.org) manipulations on it, such as changing the axis scales. -Other `ggplot2` customisations include the axis tick marks, axis values, labels, titles, colours... Consult the [ggplot2](http://ggplot2.org) +You can save any of the gene and overview plots as a `ggplot2` object and use ggplot2 manipulations on it, such as +the axis tick marks, axis values, labels, titles, colours... Consult the [ggplot2](http://ggplot2.org) documentation for more help on these. - - - - - - - - - - - ***