diff --git a/DESCRIPTION b/DESCRIPTION index 6b8130e..779f6d0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: rats -Version: 0.6.2 -Date: 2018-02-16 +Version: 0.6.3 +Date: 2018-03-13 Title: Relative Abundance of Transcripts Encoding: UTF-8 Authors: c(person("Kimon Froussios", role=c("aut"), email="k.froussios@dundee.ac.uk"), diff --git a/NAMESPACE b/NAMESPACE index f87589a..a460217 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,7 +20,6 @@ export(plot_overview) export(plot_shiny_volcano) export(sim_boot_data) export(sim_count_data) -export(sim_sleuth_data) export(tidy_annot) import(data.table) import(ggplot2) diff --git a/R/data_simulators.R b/R/data_simulators.R index 798ea73..7e7334c 100644 --- a/R/data_simulators.R +++ b/R/data_simulators.R @@ -1,115 +1,10 @@ -#============================================================================== -#' Generate an artificial minimal sleuth-like structure, for code-testing or examples. -#' -#' The default values here should match the default values expected by calculate_DTU(). -#' -#' @param varname Name for the variables by which to compare. -#' @param COUNTS_COL Name for the bootdtraps column containing counts. -#' @param TARGET_COL Name for annotation column containing transcript identification. -#' @param PARENT_COL Name for annotation column containing respective gene identification. -#' @param BS_TARGET_COL Name for bootstraps column containing transcript identification. -#' @param cnames A vector of (two) name values for the comparison variable. -#' @param errannot_inconsistent Logical. Introduces an inconsistency in the transcript IDs, for testing of sanity checks. (FALSE) -#' @param cv_dt Logical. Whether covariates table should be a data.table (FALSE). -#' @return A list with \code{slo} a minimal sleuth-like object, \code{annot} a corresponding annotation data.frame, -#' \code{isx} a vector of trancripts common between generated annotation and bootstraps (useful for code testing). 
-#' -#' The simulated data will have non-uniform number of bootstraps per sample and non-uniform order of -#' transcript id's among samples and bootstraps. These conditions are unlikely to occur in real data, -#' but this allows testing that the code can handle it. -#' -#' @import data.table -#' -#' @export -sim_sleuth_data <- function(varname="condition", COUNTS_COL="est_counts", TARGET_COL="target_id", - PARENT_COL="parent_id", BS_TARGET_COL="target_id", cnames=c("A","B"), - errannot_inconsistent=FALSE, cv_dt=FALSE) -{ - # !!! Some of the tests of the package are tightly connected to the specifics of the object returned by this function. - # !!! Entry additions might be tolerated. Changes or removals of entries or structure will certainly cause failures. - - tx <- data.frame(target_id= c("NIB.1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.1", "1B1C.2", "CC_a", "CC_b", "1NN", "2NN", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "LC1", "LC2", "ALLA1", "ALLB1", "ALLB2"), - parent_id= c("NIB", "1A1N", "1D1C", "1D1C", "1B1C", "1B1C", "CC", "CC", "NN", "NN", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "LC", "LC", "ALLA", "ALLB", "ALLB"), - stringsAsFactors=FALSE) - names(tx) <- c(TARGET_COL, PARENT_COL) - - sl <- list() - sl[["sample_to_covariates"]] <- NaN - if (cv_dt) { - sl[["sample_to_covariates"]] <- data.table("foo"=c(cnames[1], cnames[2], cnames[1], cnames[2]), - "bar"=c("ba", "ba", "bb", "bb")) - } else { - sl[["sample_to_covariates"]] <- data.frame("foo"=c(cnames[1], cnames[2], cnames[1], cnames[2]), - "bar"=c("ba", "ba", "bb", "bb")) - } - names(sl[["sample_to_covariates"]]) <- c(varname, "batch") - - sl[["kal"]] <- list() - sl$kal[[1]] <- list() - sl$kal[[1]]["bootstrap"] <- list() - sl$kal[[1]]$bootstrap[[1]] <- data.frame("target"= c("LC1", "NIA1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), 
- "my_counts"= c(3, 333, 666, 10, 20, 0, 76, 52, 20, 50, 103, 321, 0, 100, 90, 0, 10, 30, 4, 50, 0, 0), - stringsAsFactors=FALSE) - sl$kal[[1]]$bootstrap[[2]] <- data.frame("target"= c("LC1", "NIA1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(2, 310, 680, 11, 21, 0, 80, 55, 22, 52, 165, 320, 0, 130, 80, 0, 11, 29, 5, 40, 0, 0), - stringsAsFactors=FALSE) - sl$kal[[1]]$bootstrap[[3]] <- data.frame("target"= c("LC1", "NIA1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(0, 340, 610, 7, 18, 0, 72, 50, 21, 49, 150, 325, 0, 120, 70, 0, 9, 28, 4, 60, 0, 0), - stringsAsFactors=FALSE) - - sl$kal[[2]] <- list() - sl$kal[[2]]["bootstrap"] <- list() - sl$kal[[2]]$bootstrap[[1]] <- data.frame("target"= c("NIA1", "LC1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(333, 6, 666, 12, 23, 0, 25, 150, 15, 80, 325, 105, 40, 0, 200, 0, 15, 45, 12, 0, 80, 200), - stringsAsFactors=FALSE) - sl$kal[[2]]$bootstrap[[2]] <- data.frame("target"= c("NIA1", "LC1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(323, 1, 606, 15, 22, 0, 23, 190, 20, 90, 270, 115, 30, 0, 150, 0, 20, 60, 15, 0, 120, 250), - stringsAsFactors=FALSE) - - sl$kal[[3]] <- list() - sl$kal[[3]]$bootstrap[[1]] <- data.frame("target"= c("NIA1", "1A1N-2", "LC1", "NIA2", "1D1C:one", "1D1C:two", "1B1C.2", "MIX6.c1", "1A1N-1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.d", "MIX6.nc", "CC_a", 
"CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(333, 20, 3, 666, 0, 76, 52, 100, 10, 360, 0, 100, 0, 180, 25, 60, 7, 27, 13, 35, 0, 0), - stringsAsFactors=FALSE) - sl$kal[[3]]$bootstrap[[2]] <- data.frame("target"= c("MIX6.c4", "NIA1", "LC1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1B1C.2", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.nc", "1D1C:two", "MIX6.d", "CC_b", "CC_a", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(80, 330, 4, 560, 11, 21, 0, 55, 90, 380, 0, 240, 80, 0, 55, 23, 10, 31, 2, 55, 0, 0), - stringsAsFactors=FALSE) - - sl$kal[[4]] <- list() - sl$kal[[4]]["bootstrap"] <- list() - sl$kal[[4]]$bootstrap[[1]] <- data.frame("target"= c("NIA2", "1A1N-1", "NIA1", "1A1N-2", "1D1C:one", "1D1C:two", "LC1", "1B1C.2", "MIX6.c2", "MIX6.c3", "MIX6.c1", "MIX6.c4", "MIX6.nc", "MIX6.d", "CC_b", "CC_a", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(666, 12, 333, 23, 0, 25, 0, 150, 155, 40, 300, 0, 33, 0, 93, 22, 22, 61, 11, 0, 110, 210), - stringsAsFactors=FALSE) - sl$kal[[4]]$bootstrap[[2]] <- data.frame("target"= c("NIA1", "MIX6.d", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "LC1", "1D1C:two", "MIX6.c1", "1B1C.2", "MIX6.c3", "MIX6.c2", "MIX6.c4", "MIX6.nc", "CC_a", "CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(323, 0, 656, 15, 22, 0, 2, 23, 280, 160, 35, 120, 0, 95, 18, 119, 19, 58, 7, 0, 150, 220), - stringsAsFactors=FALSE) - sl$kal[[4]]$bootstrap[[3]] <- data.frame("target"= c("NIA1", "NIA2", "1A1N-1", "1A1N-2", "MIX6.nc", "1B1C.2", "LC1", "MIX6.c1", "MIX6.c2", "MIX6.c3", "1D1C:one", "1D1C:two", "MIX6.c4", "MIX6.d", "CC_a", "CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(343, 676, 13, 21, 55, 145, 3, 270, 133, 30, 0, 20, 0, 0, 23, 80, 17, 50, 14, 0, 130, 200), - stringsAsFactors=FALSE) - - names(sl$kal[[1]]$bootstrap[[1]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[1]]$bootstrap[[2]]) <- c(BS_TARGET_COL, COUNTS_COL) - 
names(sl$kal[[1]]$bootstrap[[3]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[2]]$bootstrap[[1]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[2]]$bootstrap[[2]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[3]]$bootstrap[[1]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[3]]$bootstrap[[2]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[4]]$bootstrap[[1]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[4]]$bootstrap[[2]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[4]]$bootstrap[[3]]) <- c(BS_TARGET_COL, COUNTS_COL) - - if (errannot_inconsistent) { - sl$kal[[3]]$bootstrap[[1]][10, BS_TARGET_COL] <- "Unexpected-name" - } - - return(list("annot" = tx, "slo" = sl, "isx" = intersect(tx[[TARGET_COL]], sl$kal[[1]]$bootstrap[[1]][[BS_TARGET_COL]]) )) -} - #============================================================================== #' Generate an artificial dataset of bootstrapped abundance estimates, for code-testing or examples. #' #' @param TARGET_COL Name for annotation column containing transcript identification. #' @param PARENT_COL Name for the bootdtraps column containing counts. #' @param errannot_inconsistent Logical. Introduces an inconsistency in the transcript IDs, for testing of sanity checks. (FALSE) +#' @param clean Logical. Remove the intentional inconsistency of IDs between annotation and abundances. #' @return A list with 3 elements. First a data.frame with the corresponding annotation. #' Then, 2 lists of data.tables. One list per condition. Each data table represents a sample and contains the estimates from the bootstrap iterations. 
#' @@ -117,38 +12,63 @@ sim_sleuth_data <- function(varname="condition", COUNTS_COL="est_counts", TARGET #' #' @export #' -sim_boot_data <- function(errannot_inconsistent=FALSE, PARENT_COL="parent_id", TARGET_COL="target_id") { - tx <- data.frame(target_id= c("NIB.1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.1", "1B1C.2", "CC_a", "CC_b", "1NN", "2NN", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "LC1", "LC2", "ALLA1", "ALLB1", "ALLB2"), - parent_id= c("NIB", "1A1N", "1D1C", "1D1C", "1B1C", "1B1C", "CC", "CC", "NN", "NN", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "LC", "LC", "ALLA", "ALLB", "ALLB"), +sim_boot_data <- function(errannot_inconsistent=FALSE, PARENT_COL="parent_id", TARGET_COL="target_id", clean=FALSE) { + tx <- data.frame(target_id= c("1A1B.a", "1A1B.b", "NIB.1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.1", "1B1C.2", "CC_a", "CC_b", "1NN", "2NN", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "LC1", "LC2", "ALLA1", "ALLB1", "ALLB2"), + parent_id= c("1A1B", "1A1B", "NIB", "1A1N", "1D1C", "1D1C", "1B1C", "1B1C", "CC", "CC", "NN", "NN", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "LC", "LC", "ALLA", "ALLB", "ALLB"), stringsAsFactors=FALSE) names(tx) <- c(TARGET_COL, PARENT_COL) a <- list() - a[[1]] <- data.table("target"= c("LC1", "NIA1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "V1"= c( 3, 333, 666, 10, 20, 0, 76, 52, 20, 50, 103, 321, 0, 100, 90, 0, 10, 30, 4, 50, 0, 0), - "V2"= c( 2, 310, 680, 11, 21, 0, 80, 55, 22, 52, 165, 320, 0, 130, 80, 0, 11, 29, 5, 40, 0, 0), - "V3"= c( 0, 340, 610, 7, 18, 0, 72, 50, 21, 49, 150, 325, 0, 120, 70, 0, 9, 28, 4, 60, 0, 0), - stringsAsFactors=FALSE) - - a[[2]] <- data.table("target"= c("NIA1", "1A1N-2", "LC1", "NIA2", "1D1C:one", "1D1C:two", "1B1C.2", "MIX6.c1", "1A1N-1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.d", 
"MIX6.nc", "CC_a", "CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "V1"= c( 333, 20, 3, 666, 0, 76, 52, 100, 10, 360, 0, 100, 0, 180, 25, 60, 7, 27, 13, 35, 0, 0), - "V2"= c( 330, 11, 4, 560, 0, 80, 55, 90, 11, 380, 0, 80, 0, 240, 23, 55, 10, 31, 2, 55, 0, 0), - stringsAsFactors=FALSE) - b <- list() - b[[1]] <- data.table("target"= c("NIA1", "LC1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "V1"= c( 333, 6, 666, 12, 23, 0, 25, 150, 15, 80, 325, 105, 40, 0, 200, 0, 15, 45, 12, 0, 80, 200), - "V2"= c( 323, 1, 606, 15, 22, 0, 23, 190, 20, 90, 270, 115, 30, 0, 150, 0, 20, 60, 15, 0, 120, 250), - stringsAsFactors=FALSE) - b[[2]] <- data.table("target"= c("NIA2", "1A1N-1", "NIA1", "1A1N-2", "1D1C:one", "1D1C:two", "LC1", "1B1C.2", "MIX6.c2", "MIX6.c3", "MIX6.c1", "MIX6.c4", "MIX6.nc", "MIX6.d", "CC_b", "CC_a", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "V1"= c( 666, 12, 333, 23, 0, 25, 0, 150, 155, 40, 300, 0, 33, 0, 93, 22, 22, 61, 11, 0, 110, 210), - "V2"= c( 656, 15, 323, 22, 0, 23, 2, 160, 120, 35, 280, 0, 95, 0, 119, 18, 19, 58, 7, 0, 150, 220), - "V3"= c( 676, 13, 343, 21, 0, 20, 3, 145, 133, 30, 270, 0, 55, 0, 80, 23, 17, 50, 14, 0, 130, 200), - stringsAsFactors=FALSE) + if (clean) { + a[[1]] <- data.table("target"= c("1A1B.a", "1A1B.b", "LC1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2", "1B1C.1", "NIB.1"), + "V1"= c( 69, 0, 3, 20, 0, 76, 52, 20, 50, 103, 321, 0, 100, 90, 0, 10, 30, 4, 50, 0, 0, 0, 0), + "V2"= c( 96, 0, 2, 21, 0, 80, 55, 22, 52, 165, 320, 0, 130, 80, 0, 11, 29, 5, 40, 0, 0, 0, 0), + "V3"= c( 88, 0, 0, 18, 0, 72, 50, 21, 49, 150, 325, 0, 120, 70, 0, 9, 28, 4, 60, 0, 0, 0, 0), + stringsAsFactors=FALSE) + + a[[2]] <- data.table("target"= c("1A1B.a", 
"1A1B.b", "1A1N-2", "LC1", "1D1C:one", "1D1C:two", "1B1C.2", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.d", "MIX6.nc", "CC_a", "CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2", "1B1C.1", "NIB.1"), + "V1"= c( 121, 0, 20, 3, 0, 76, 52, 100, 360, 0, 100, 0, 180, 25, 60, 7, 27, 13, 35, 0, 0, 0, 0), + "V2"= c( 144, 0, 11, 4, 0, 80, 55, 90, 380, 0, 80, 0, 240, 23, 55, 10, 31, 2, 55, 0, 0, 0, 0), + stringsAsFactors=FALSE) + + b[[1]] <- data.table("target"= c("1A1B.a", "1A1B.b", "LC1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2", "1B1C.1", "NIB.1"), + "V1"= c( 0, 91, 6, 23, 0, 25, 150, 15, 80, 325, 105, 40, 0, 200, 0, 15, 45, 12, 0, 80, 200, 0, 0), + "V2"= c( 0, 100, 1, 22, 0, 23, 190, 20, 90, 270, 115, 30, 0, 150, 0, 20, 60, 15, 0, 120, 250, 0, 0), + stringsAsFactors=FALSE) + + b[[2]] <- data.table("target"= c("1A1N-2", "1D1C:one", "1D1C:two", "LC1", "1B1C.2", "MIX6.c2", "MIX6.c3", "MIX6.c1", "MIX6.c4", "MIX6.nc", "MIX6.d", "CC_b", "CC_a", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2", "1A1B.b", "1A1B.a", "1B1C.1", "NIB.1"), + "V1"= c( 23, 0, 25, 0, 150, 155, 40, 300, 0, 33, 0, 93, 22, 22, 61, 11, 0, 110, 210, 76, 0, 0, 0), + "V2"= c( 22, 0, 23, 2, 160, 120, 35, 280, 0, 95, 0, 119, 18, 19, 58, 7, 0, 150, 220, 69, 0, 0, 0), + "V3"= c( 21, 0, 20, 3, 145, 133, 30, 270, 0, 55, 0, 80, 23, 17, 50, 14, 0, 130, 200, 36, 0, 0, 0), + stringsAsFactors=FALSE) + } else { + a[[1]] <- data.table("target"= c("1A1B.a", "1A1B.b", "LC1", "NIA1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), + "V1"= c( 69, 0, 3, 333, 666, 10, 20, 0, 76, 52, 20, 50, 103, 321, 0, 100, 90, 0, 10, 30, 4, 50, 0, 0), + "V2"= c( 96, 0, 2, 310, 680, 11, 21, 0, 80, 55, 22, 52, 165, 320, 0, 130, 80, 0, 11, 29, 5, 40, 0, 0), + "V3"= c( 
88, 0, 0, 340, 610, 7, 18, 0, 72, 50, 21, 49, 150, 325, 0, 120, 70, 0, 9, 28, 4, 60, 0, 0), + stringsAsFactors=FALSE) + + a[[2]] <- data.table("target"= c("NIA1", "1A1B.a", "1A1B.b", "1A1N-2", "LC1", "NIA2", "1D1C:one", "1D1C:two", "1B1C.2", "MIX6.c1", "1A1N-1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.d", "MIX6.nc", "CC_a", "CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), + "V1"= c( 333, 121, 0, 20, 3, 666, 0, 76, 52, 100, 10, 360, 0, 100, 0, 180, 25, 60, 7, 27, 13, 35, 0, 0), + "V2"= c( 330, 144, 0, 11, 4, 560, 0, 80, 55, 90, 11, 380, 0, 80, 0, 240, 23, 55, 10, 31, 2, 55, 0, 0), + stringsAsFactors=FALSE) + + b[[1]] <- data.table("target"= c("1A1B.a", "1A1B.b", "NIA1", "LC1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), + "V1"= c( 0, 91, 333, 6, 666, 12, 23, 0, 25, 150, 15, 80, 325, 105, 40, 0, 200, 0, 15, 45, 12, 0, 80, 200), + "V2"= c( 0, 100, 323, 1, 606, 15, 22, 0, 23, 190, 20, 90, 270, 115, 30, 0, 150, 0, 20, 60, 15, 0, 120, 250), + stringsAsFactors=FALSE) + + b[[2]] <- data.table("target"= c("NIA2", "1A1N-1", "NIA1", "1A1N-2", "1D1C:one", "1D1C:two", "LC1", "1B1C.2", "MIX6.c2", "MIX6.c3", "MIX6.c1", "MIX6.c4", "MIX6.nc", "MIX6.d", "CC_b", "CC_a", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2", "1A1B.b", "1A1B.a"), + "V1"= c( 666, 12, 333, 23, 0, 25, 0, 150, 155, 40, 300, 0, 33, 0, 93, 22, 22, 61, 11, 0, 110, 210, 76, 0), + "V2"= c( 656, 15, 323, 22, 0, 23, 2, 160, 120, 35, 280, 0, 95, 0, 119, 18, 19, 58, 7, 0, 150, 220, 69, 0), + "V3"= c( 676, 13, 343, 21, 0, 20, 3, 145, 133, 30, 270, 0, 55, 0, 80, 23, 17, 50, 14, 0, 130, 200, 36, 0), + stringsAsFactors=FALSE) + } if (errannot_inconsistent) - a[[2]][14, 1] <- "Unexpected-name" + b[[1]][14, 1] <- "Unexpected-name" return(list('annot'= tx, 'boots_A'= a, 'boots_B'= b)) } @@ -159,134 +79,14 @@ sim_boot_data <- function(errannot_inconsistent=FALSE, 
PARENT_COL="parent_id", T #' @param PARENT_COL Name for the bootdtraps column containing counts. #' @param TARGET_COL Name for annotation column containing transcript identification. #' @param errannot_inconsistent Logical. Introduces an inconsistency in the transcript IDs, for testing of sanity checks. (FALSE) +#' @param clean Logical. Remove the intentional inconsistency of IDs between annotation and abundances. #' @return A list with 3 elements. First, a data.frame with the corresponding annotation table. #' Then, 2 data.tables, one per condition. Each data table contains the abundance estimates for all the samples for the respective condition. #' #' @export #' -sim_count_data <- function(errannot_inconsistent=FALSE, PARENT_COL="parent_id", TARGET_COL="target_id") { - sim <- sim_boot_data(errannot_inconsistent=errannot_inconsistent, PARENT_COL=PARENT_COL, TARGET_COL=TARGET_COL) +sim_count_data <- function(errannot_inconsistent=FALSE, PARENT_COL="parent_id", TARGET_COL="target_id", clean=FALSE) { + sim <- sim_boot_data(errannot_inconsistent=errannot_inconsistent, PARENT_COL=PARENT_COL, TARGET_COL=TARGET_COL, clean=clean) return(list('annot'=sim$annot, 'counts_A'= sim$boots_A[[1]], 'counts_B'= sim$boots_B[[1]])) } - - -#=============================================================================== -#' Generate artificial read-count data for simulated 2-transcript genes, -#' systematically covering a broad range of proportions and magnitudes. -#' -#' For each level of transcript proportions in condition A, all proportion levels are -#' generated for condition B as possible new states. Therefore the number of transcripts -#' generated is 2 * N^2 * M, where 2 is the number of gene isoforms per paramater combination, -#' N is the number of proportion levels and M the number of magnitude levels. -#' -#' @param proportions Range and step of proportions for 1st transcript. Sibling transcript complemented from these. 
-#' @param magnitudes vector of read-count sizes to use. -#' @return list containing: \code{data} a minimal sleuth-like object. -#' \code{anno} a dataframe matching \code{target_id} to \code{parent_id}. -#' \code{sim} a dataframe with the simulation parameters per transcript. -#' -# @export -countrange_sim <- function(proportions= seq(0, 1, 0.01), - magnitudes= c(1,5,10,30,100,500,1000)) { - # Combine proportions and magnitudes. - # Start by grid-ing the values for easier visualization and naming, instead of computing the outer products directly. - grd <- expand.grid(c("t1", "t2"), proportions, proportions, magnitudes) - names(grd) <- c("sibl", "propA", "propB", "mag") - # Name the "transcripts". - opts <- options() - options(scipen=0, digits=3) - grd["parent_id"] <- paste(grd$mag, "_a", grd$propA,"_b", grd$propB, sep="") - grd["target_id"] <- paste(grd$parent_id, grd$sibl, sep="_") - options(scipen=opts$scipen, digits=opts$digits) - # Complementary proportions - grd[grd$sibl == "t2", c("propA", "propB")] <- c(1-grd$propA[grd$sibl == "t2"], 1-grd$propB[grd$sibl == "t2"]) - - # Make a sleuth-like object. - sl <- list() - sl["kal"] <- list() - sl[["sample_to_covariates"]] <- data.frame("sample"=c("A-1", "B-1"), "condition"=c("A", "B")) - # condition A - sl$kal[[1]] <- list() - sl$kal[[1]]["bootstrap"] <- list() - sl$kal[[1]]$bootstrap[[1]] <- data.frame("target_id"=grd$target_id, "est_counts"=grd$propA * grd$mag) - # condition B - sl$kal[[2]] <- list() - sl$kal[[2]]["bootstrap"] <- list() - sl$kal[[2]]$bootstrap[[1]] <- data.frame("target_id"=grd$target_id, "est_counts"=grd$propB * grd$mag) - - # Create annotation: - anno <- data.frame("target_id"=grd$target_id, "parent_id"=grd$parent_id) - - # Store simulation for ease. - results <- list("data"=sl, "anno"=anno, "sim"=grd) - - return(results) -} - - -#=============================================================================== -#' Combine simulation details with DTU calls, in preparation for plotting. 
-#' -#' @param sim An object generated with countrange_sim(), containing the simulated data details with which the \code{dtu} object was calculated. -#' @param dtuo The object with the DTU results corresponding to the \code{sim} object's data. -#' @return dataframe -#' -#' @import data.table -# @export -combine_sim_dtu <- function(sim, dtuo) { - with(dtuo, { - results <- data.table(Genes[, list(parent_id, pval_AB, pval_BA, dtu_AB, dtu_BA, dtu, pval_prop_min, dtu_prop)]) - setkey(results, parent_id) - tmp <- data.table(sim$sim[sim$sim["sibl"]=="t2", c("parent_id", "propA", "mag", "propB")]) - setkey(tmp, parent_id) - results <- merge(results, tmp, by="parent_id") - return(results) - }) -} - - -#=============================================================================== -#' Plot relationship of parameters. -#' -#' @param data the output from combine_sim_dtu() -#' @param type (str) "AvBvM", "B/AvM", "AvBvMprop", "B/AvMprop", "B-AvM" -#' -#' @import ggplot2 -#' @import data.table -# @export -plot_sim <- function(data, type = "AvBvM") { - with(data, { - if(type =="AvBvM"){ - return( - ggplot(data[order(dtu, propA, propB), ], aes(x=propA, y=propB, color=dtu)) + - labs(x="Prop t1 in A", y = "Prop t1 in B") + - scale_color_manual(values=c("lightblue","red")) + - geom_point() + - facet_grid(. ~ mag) - ) - } else if(type == "B/AvM") { - return( - ggplot(data[order(dtu, mag), ], aes(x=propB/propA, y=mag, color=dtu)) + - labs(x="Prop t1 in B / Prop t1 in A", y = "Gene magnitude") + - scale_color_manual(values=c("lightblue","red")) + - geom_point() - ) - } else if(type =="AvBvMprop"){ - return( - ggplot(data[order(dtu_prop, propA, propB), ], aes(x=propA, y=propB, color=dtu_prop)) + - labs(x="Prop t1 in A", y = "Prop t1 in B") + - scale_color_manual(values=c("lightblue","red")) + - geom_point() + - facet_grid(. 
~ mag) - ) - } else if(type == "B/AvMprop") { - return( - ggplot(data[order(dtu_prop, mag), ], aes(x=propB/propA, y=mag, color=dtu_prop)) + - labs(x="Prop t1 in B / Prop t1 in A", y = "Gene magnitude") + - scale_color_manual(values=c("lightblue","red")) + - geom_point() - ) - } - }) -} diff --git a/R/func.R b/R/func.R index 3ee9031..830fe67 100644 --- a/R/func.R +++ b/R/func.R @@ -21,6 +21,7 @@ #' @param threads Number of threads. #' @param seed Seed for random engine. #' @param scaling Abundance scaling factor. +#' @param reckless whether to ignore detected annotation discrepancies #' #' @return List: \itemize{ #' \item{"error"}{logical} @@ -36,7 +37,7 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A, TARGET_COL, PARENT_COL, correction, testmode, scaling, threads, seed, p_thresh, abund_thresh, dprop_thresh, - qboot, qbootnum, qrep_thresh, rboot, rrep_thresh) { + qboot, qbootnum, qrep_thresh, rboot, rrep_thresh, reckless) { warnmsg <- list() # Input format. @@ -48,14 +49,16 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A, return(list("error"=TRUE, "message"="You must specify (the same type of) data for both conditions!")) # Annotation. + if (!is.logical(reckless) || is.na(reckless)) + return(list("error"=TRUE, "message"="Invalid value to reckless!")) if (!is.data.frame(annot)) return(list("error"=TRUE, "message"="The provided annot is not a data.frame!")) if (any(!c(TARGET_COL, PARENT_COL) %in% names(annot))) return(list("error"=TRUE, "message"="The specified target and/or parent IDs field names do not exist in annot!")) - if (length(annot$target_id) != length(unique(annot$target_id))) + if (length(annot[[TARGET_COL]]) != length(unique(annot[[TARGET_COL]]))) return(list("error"=TRUE, "message"="Some transcript identifiers are not unique!")) - - # Parameters. + + # Simple parameters. if (!correction %in% p.adjust.methods) return(list("error"=TRUE, "message"="Invalid p-value correction method name. 
Refer to stats::p.adjust.methods .")) if (!testmode %in% c("genes", "transc", "both")) @@ -65,7 +68,7 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A, if ((!is.numeric(dprop_thresh)) || dprop_thresh < 0 || dprop_thresh > 1) return(list("error"=TRUE, "message"="Invalid proportion difference threshold! Must be between 0 and 1.")) if ((!is.numeric(abund_thresh)) || abund_thresh < 0) - return(list("error"=TRUE, "message"="Invalid abundance threshold! Must be between 0 and 1.")) + return(list("error"=TRUE, "message"="Invalid abundance threshold! Must be a count >= 0.")) if ((!is.numeric(qbootnum)) || qbootnum < 0) return(list("error"=TRUE, "message"="Invalid number of bootstraps! Must be a positive integer number.")) if (!is.logical(qboot)) @@ -80,6 +83,8 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A, return(list("error"=TRUE, "message"="Invalid number of threads! Must be positive integer.")) if (threads > parallel::detectCores(logical= TRUE)) return(list("error"=TRUE, "message"="Number of threads exceeds system's reported capacity.")) + + # Scaling nsmpl <- NULL if (!is.null(count_data_A)){ nsmpl <- dim(count_data_A)[2] + dim(count_data_B)[2] @@ -103,8 +108,20 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A, if(is.null(boot_data_A)) { if (!is.data.table(count_data_A) | !is.data.table(count_data_B)) return(list("error"=TRUE, "message"="The counts data are not data.tables!")) - if (!all( count_data_A[[1]][order(count_data_A[[1]])] == count_data_B[[1]][order(count_data_B[[1]])] )) - return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs between conditions!")) + if ( !all(count_data_A[[1]] %in% annot[[TARGET_COL]]) && !all(annot[[TARGET_COL]] %in% count_data_A[[1]]) ) { + if(reckless){ + warnmsg["annotation-mismatch"] <- "The transcript IDs in the quantifications and the annotation do not match completely! Did you use different annotations? 
Reckless mode enabled, continuing at your own risk..." + } else { + return(list("error"=TRUE, "message"="The transcript IDs in the quantifications and the annotation do not match completely! Did you use different annotations?")) + } + } + if (!all( count_data_A[[1]][order(count_data_A[[1]])] == count_data_B[[1]][order(count_data_B[[1]])] )) { + if (reckless) { + warnmsg["countIDs"] <- "Transcript IDs do not match completely between conditions! Did you use different annotations? Reckless mode enabled, continuing at your own risk..." + } else { + return(list("error"=TRUE, "message"="Transcript IDs do not match completely between conditions! Did you use different annotations?")) + } + } } else { warnmsg["cnts&boots"] <- "Received multiple input formats! Only the bootstrapped data will be used." } @@ -119,17 +136,33 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A, } maxmatrix <- 2^31 - 1 if (qboot) { - # Consistency, if (!is.null(boot_data_A)) { - # Direct data. + # Compared to annotation. + if ( !all(boot_data_A[[1]][[1]] %in% annot[[TARGET_COL]]) && !all(annot[[TARGET_COL]] %in% boot_data_A[[1]][[1]]) ) { + if(reckless){ + warnmsg["annotation-mismatch"] <- "The transcript IDs in the quantifications and the annotation do not match completely! Did you use different annotations? Reckless mode enabled, continuing at your own risk..." + } else { + return(list("error"=TRUE, "message"="The transcript IDs in the quantifications and the annotation do not match completely! 
Did you use different annotations?")) + } + } + # Among samples tx <- boot_data_A[[1]][[1]][order(boot_data_A[[1]][[1]])] for (k in 2:length(boot_data_A)){ - if (!all( tx == boot_data_A[[k]][[1]][order(boot_data_A[[k]][[1]])] )) - return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs across samples!")) + if (!all( tx == boot_data_A[[k]][[1]][order(boot_data_A[[k]][[1]])] )) { + if (reckless) { + warnmsg[paste0("bootIDs-A", k)] <- paste0("Inconsistent set of transcript IDs across samples (A", k, ")! Did you use different annotations? Reckless mode enabled, continuing at your own risk...") + } else { + return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs across samples! Did you use different annotations?")) + } + } } for (k in 1:length(boot_data_B)){ if (!all( tx == boot_data_B[[k]][[1]][order(boot_data_B[[k]][[1]])] )) - return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs across samples!")) + if (reckless) { + warnmsg[paste0("bootIDs-B", k)] <- paste0("Inconsistent set of transcript IDs across samples (B", k, ")! Did you use different annotations? Reckless mode enabled, continuing at your own risk...") + } else { + return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs across samples! Did you use different annotations?")) + } } # Number of iterations. @@ -156,7 +189,7 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A, if (rboot & any( length(samples_by_condition[[1]]) * length(samples_by_condition[[2]]) > maxmatrix/dim(annot)[1], length(boot_data_A) * length(boot_data_B) > maxmatrix/dim(annot)[1] ) ) warnmsg["toomanyreplicates"] <- "The number of replicates is too high. Exhaustive 1vs1 would exceed maximum capacity of an R matrix." 
- + return(list("error"=FALSE, "message"="All good!", "maxboots"=minboots, "warn"=(length(warnmsg) > 0), "warnings"= warnmsg)) } @@ -205,7 +238,7 @@ alloc_out <- function(annot, full, n=1){ "abund_scaling"=numeric(length=n), "quant_boot"=NA,"quant_reprod_thresh"=NA_real_, "quant_bootnum"=NA_integer_, "rep_boot"=NA, "rep_reprod_thresh"=NA_real_, "rep_bootnum"=NA_integer_, - "seed"=NA_integer_) + "seed"=NA_integer_, "reckless"=NA) Genes <- data.table("parent_id"=as.vector(unique(annot$parent_id)), "elig"=NA, "sig"=NA, "elig_fx"=NA, "quant_reprod"=NA, "rep_reprod"=NA, "DTU"=NA, "transc_DTU"=NA, "known_transc"=NA_integer_, "detect_transc"=NA_integer_, "elig_transc"=NA_integer_, "maxDprop"=NA_real_, @@ -256,7 +289,7 @@ alloc_out <- function(annot, full, n=1){ #' @param test_transc Whether to do transcript-level test. #' @param test_genes Whether to do gene-level test. #' @param full Either "full" (for complete output structure) or "short" (for bootstrapping). -#' @param count_thresh Minimum number of counts per replicate. +#' @param count_thresh Minimum average count across replicates. #' @param p_thresh The p-value threshold. #' @param dprop_thresh Minimum difference in proportions. #' @param correction Multiple testing correction type. @@ -303,8 +336,11 @@ calculate_DTU <- function(counts_A, counts_B, tx_filter, test_transc, test_genes ctA <- count_thresh * resobj$Parameters[["num_replic_A"]] # Adjust count threshold for number of replicates. ctB <- count_thresh * resobj$Parameters[["num_replic_B"]] Transcripts[, elig_xp := (sumA >= ctA | sumB >= ctB)] - Transcripts[, elig := (elig_xp & totalA != 0 & totalB != 0 & (sumA != totalA | sumB != totalB))] # If the entire gene is shut off in one condition, changes in proportion cannot be defined. - # If sum and total are equal in both conditions the gene has only one expressed isoform, or one isoform altogether. 
+ Transcripts[, elig := (elig_xp & # isoform expressed above the noise threshold in EITHER condition + totalA >= ctA & totalB >= ctB & # at least one eligibly expressed isoform exists in EACH condition (prevent gene-on/off from creating wild proportions from low counts) + totalA != 0 & totalB != 0 & # gene expressed in BOTH conditions (failsafe, in case user sets noise threshold to 0) + (propA != 1 | propB != 1) )] # not the only isoform expressed. + # If sum and total are equal in both conditions, it has no detected siblings and thus cannot change in proportion. Genes[, elig_transc := Transcripts[, as.integer(sum(elig, na.rm=TRUE)), by=parent_id][, V1] ] Genes[, elig := elig_transc >= 2] @@ -494,3 +530,4 @@ group_samples <- function(covariates) { } +#EOF diff --git a/R/rats.R b/R/rats.R index 3e5d2bc..bce8e9e 100644 --- a/R/rats.R +++ b/R/rats.R @@ -19,8 +19,8 @@ #' @param name_B The name for the other condition. (Default "Condition-B") #' @param varname The name of the covariate to which the two conditions belong. (Default \code{"condition"}). #' @param p_thresh The p-value threshold. (Default 0.05) -#' @param abund_thresh Noise threshold. Minimum mean abundance for transcripts to be eligible for testing. (Default 5) #' @param dprop_thresh Effect size threshold. Minimum change in proportion of a transcript for it to be considered meaningful. (Default 0.20) +#' @param abund_thresh Noise threshold. Minimum mean (across replicates) abundance for transcripts (and genes) to be eligible for testing. (Default 5) #' @param correction The p-value correction to apply, as defined in \code{\link[stats]{p.adjust.methods}}. (Default \code{"BH"}) #' @param scaling A scaling factor or vector of scaling factors, to be applied to the abundances *prior* to any thresholding and testing. Useful for scaling TPMs (transcripts per 1 million reads) to the actual library sizes of the samples. 
If a vector is supplied, the order should correspond to the samples in group A followed by the samples in group B. WARNING: Improper use of the scaling factor will artificially inflate/deflate the significances obtained. #' @param testmode One of \itemize{\item{"genes"}, \item{"transc"}, \item{"both" (default)}}. @@ -33,7 +33,8 @@ #' @param verbose Display progress updates and warnings. (Default \code{TRUE}) #' @param threads Number of threads to use. (Default 1) Multi-threading will be ignored on non-POSIX systems. #' @param seed A numeric integer used to initialise the random number engine. Use this only if reproducible bootstrap selections are required. (Default NA) -#' @param dbg Debugging mode. Interrupt execution at the specified flag-point. Used to speed up code-tests by avoiding irrelevant downstream processing. (Default 0: do not interrupt) +#' @param reckless RATs normally aborts if any inconsistency is detected among the transcript IDs found in the annotation and the quantifications. Enabling reckless mode will downgrade this error to a warning and allow RATs to continue the run. Not recommended unless you know why the inconsistency exists and how it will affect the results. (Default FALSE) +#' @param dbg Debugging mode only. Interrupt execution at the specified flag-point. Used to speed up code-tests by avoiding irrelevant downstream processing. (Default 0: do not interrupt) #' @return List of mixed types. Contains a list of runtime settings, a table of gene-level results, a table of transcript-level results, and a list of two tables with the transcript abundaces. 
#' #' @import utils @@ -46,7 +47,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i name_A= "Condition-A", name_B= "Condition-B", varname= "condition", p_thresh= 0.05, abund_thresh= 5, dprop_thresh= 0.2, correction= "BH", scaling= 1, testmode= "both", qboot= TRUE, qbootnum= 0L, qrep_thresh= 0.95, rboot=TRUE, rrep_thresh= 0.85, - description= NA_character_, verbose= TRUE, threads= 1L, seed=NA_integer_, dbg= "0") + description= NA_character_, verbose= TRUE, threads= 1L, seed=NA_integer_, reckless=FALSE, dbg= "0") { #---------- PREP @@ -60,7 +61,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i TARGET_COL, PARENT_COL, correction, testmode, scaling, threads, seed, p_thresh, abund_thresh, dprop_thresh, - qboot, qbootnum, qrep_thresh, rboot, rrep_thresh) + qboot, qbootnum, qrep_thresh, rboot, rrep_thresh, reckless) if (paramcheck$error) stop(paramcheck$message) if (verbose) @@ -243,6 +244,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i resobj$Parameters[["quant_reprod_thresh"]] <- qrep_thresh resobj$Parameters[["quant_bootnum"]] <- qbootnum resobj$Parameters[["rep_reprod_thresh"]] <- rrep_thresh + resobj$Parameters[["reckless"]] <- reckless if (dbg == "info") return(resobj) @@ -469,6 +471,19 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i Transcripts[is.na(rep_reprod), rep_reprod := c(FALSE)] Transcripts[is.na(DTU), DTU := c(FALSE)] Transcripts[is.na(gene_DTU), gene_DTU := c(FALSE)] + # Eradicate NAs from count columns, as they mess up plotting. These would be caused by inconsistencies between annotation and quantifications, and would only exist in "reckless" mode. 
+ for (i in 1:Parameters$num_replic_A){ + idx <- is.na(Abundances[[1]][[paste0('V',i)]]) + Abundances[[1]][idx, paste0('V',i) := 0] + } + for (i in 1:Parameters$num_replic_B){ + idx <- is.na(Abundances[[2]][[paste0('V',i)]]) + Abundances[[2]][idx, paste0('V',i) := 0] + } + Transcripts[is.na(meanA), meanA := c(0)] + Transcripts[is.na(meanB), meanB := c(0)] + Transcripts[is.na(propA), propA := c(0)] + Transcripts[is.na(propB), propB := c(0)] # Drop the bootstrap columns, if unused. if (!qboot || !test_transc) @@ -495,4 +510,4 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i return(resobj) } - +#EOF diff --git a/inst/doc/input.R b/inst/doc/input.R index 1bd5e38..8296540 100644 --- a/inst/doc/input.R +++ b/inst/doc/input.R @@ -21,7 +21,7 @@ head(sim_count_data()[[1]]) ## ---- eval=FALSE--------------------------------------------------------- # # Simulate some data. -# simdat <- sim_count_data() +# simdat <- sim_count_data(clean=TRUE) # # For convenience let's assign the contents of the list to separate variables # mycond_A <- simdat[[2]] # Simulated abundances for one condition. # mycond_B <- simdat[[3]] # Simulated abundances for other condition. @@ -39,7 +39,7 @@ head(sim_count_data()[[1]]) ## ------------------------------------------------------------------------ # Simulate some data. (Notice it is a different function than before.) -simdat <- sim_boot_data() +simdat <- sim_boot_data(clean=TRUE) # For convenience let's assign the contents of the list to separate variables mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition. @@ -83,7 +83,7 @@ myannot <- simdat[[1]] # Transcript and gene IDs for the above data. # # Calling DTU with custom thresholds. 
# mydtu <- call_DTU(annot= myannot, # boot_data_A= mycond_A, boot_data_B= mycond_B, -# p_thresh= 0.01, abund_thresh= 10, dprop_thres = 0.25) +# p_thresh= 0.01, dprop_thres = 0.15, abund_thresh= 10) ## ---- eval=FALSE--------------------------------------------------------- # # Bootstrap (default). Do 100 iterations. @@ -169,3 +169,8 @@ myannot <- simdat[[1]] # Transcript and gene IDs for the above data. # boot_data_B= mydata$boot_data_B, # scaling=c(25, 26.7, 23, 50.0, 45, 48.46, 52.36)) +## ---- eval=FALSE--------------------------------------------------------- +# mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A, +# boot_data_B= mydata$boot_data_B, +# reckless=TRUE, verbose=TRUE) + diff --git a/inst/doc/input.Rmd b/inst/doc/input.Rmd index 3a383a5..be59177 100644 --- a/inst/doc/input.Rmd +++ b/inst/doc/input.Rmd @@ -1,7 +1,7 @@ --- title: 'RATs: Input and Settings' author: "Kimon Froussios" -date: "19 SEP 2017" +date: "08 MAR 2018" output: html_document: keep_md: yes @@ -103,8 +103,8 @@ RATs comes with data emulators, intended to be used for testing the code. Howeve use for showcasing how RATs works. By default, RATs reports on its progress and produces a summary report. These can be suppressed using the -`verbose = FALSE` parameter to the call. To prevent cluttering this tutorial with verbose output, we will use this -option in all the examples. +`verbose = FALSE` parameter. This also suppresses some warnings generated by RATs when checking the sanity of its input, +so we do not recommend it for general use. However, to prevent cluttering this tutorial with verbose output, we will use this option in our examples. If you choose to allow verbose output when trying out the examples below using the emulated data, you will get some **warnings** about the number of bootstraps. 
The warning is triggered because the emulated dataset used in the examples immitates only the structure of @@ -120,7 +120,7 @@ First, let's emulate some data to work with: ```{r, eval=FALSE} # Simulate some data. -simdat <- sim_count_data() +simdat <- sim_count_data(clean=TRUE) # For convenience let's assign the contents of the list to separate variables mycond_A <- simdat[[2]] # Simulated abundances for one condition. mycond_B <- simdat[[3]] # Simulated abundances for other condition. @@ -162,7 +162,7 @@ First, let's emulate some data, as we did before. ```{r} # Simulate some data. (Notice it is a different function than before.) -simdat <- sim_boot_data() +simdat <- sim_boot_data(clean=TRUE) # For convenience let's assign the contents of the list to separate variables mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition. @@ -273,14 +273,16 @@ The following three main thresholds are used in RATs: # Calling DTU with custom thresholds. mydtu <- call_DTU(annot= myannot, boot_data_A= mycond_A, boot_data_B= mycond_B, - p_thresh= 0.01, abund_thresh= 10, dprop_thres = 0.25) + p_thresh= 0.01, dprop_thres = 0.15, abund_thresh= 10) ``` -1. `p_thresh` - Statistical significance level. P-values below this will be considered significant. Lower threshold values are stricter. (Default 0.05) -2. `abund_thresh` - Noise threshold. Transcripts with abundance below that value in both conditions are ignored. Higher threshold values are stricter. (Default 5, assumes abundances scaled to library size) -3. `dprop_thresh` - Effect size threshold. Transcripts whose proportion changes between conditions by less than the threshold are considered non-DTU, regardless of their statistical significance. Higher threshold values are stricter. (Default 0.20) +1. `p_thresh` - Statistical significance level. P-values below this will be considered significant. Lower threshold values are stricter, and help reduce low-count low-confidence calls. (Default 0.05, very permissive) +2. 
`dprop_thresh` - Effect size threshold. Transcripts whose proportion changes between conditions by less than this threshold are considered uninteresting, regardless of their statistical significance. (Default 0.20, quite strict) +3. `abund_thresh` - Noise threshold. Minimum mean (across replicates) abundance for a transcript to be considered expressed. Transcripts with mean abundances below this will be ignored. Mean total gene count must also meet this value in both conditions, a.k.a. at least one expressed isoform must exist in each condition. (Default 5, very permissive) + +The default values for these thresholds have been chosen such that they achieve a *median* FDR <5% for a high quality dataset from *Arabidopsis thaliana*, even with only 3 replicates per condition. +Your mileage may vary and you should give some consideration to selecting appropriate values. -The default values for these thresholds have been chosen such that they achieve a *median* FDR <5% for a high quality dataset from *Arabidopsis thaliana*, even with only 3 replicates per condition. Your mileage may vary and you should give some consideration to selecting appropriate values. Depending on the settings, *additional thresholds* are available and will be discussed in their respective sections below. @@ -378,7 +380,7 @@ mydtu <- call_DTU(annot = myannot, 1. `threads` - The number of threads to use. (Default 1) -Due to core R implementation limitations, the type of multi-threading used in RATs works only in POSIX-compliant systems. +Due to core R implementation limitations, the type of multi-threading used in RATs works only in POSIX-compliant systems (Linux, Mac, not Windows). Refer to the `parallel` package for details. @@ -447,14 +449,14 @@ such as those employed by RATs, and reduces the statistical power of the method. 
To counter this, RATs provides the option to scale abundaces either equally by a single factor (such as average library size among samples) or by a vector of factors (one per sample). The former maintains any pre-existing library-size normalisation among samples. This is necessary for fold-change based methods, but RATs does not require it. Instead, using the respective actual library sizes of the samples allows the -higher-throughput samples to have a bigger influence than the lower-throughput samples. This is particularly relevant if your samples have dissimilar +higher-throughput samples to have a bigger influence than the lower-throughput samples. This is particularly relevant if your samples have very dissimilar library sizes. For flexibility with different types of input, these scaling options can be applied in either of two stages: The data import step by `fish4rodents()`, or the actual testing step by `call_DTU()`. In the example examined previously, `fish4rodents()` was instructed to create TPM abundances, -by normalising to 10000000 reads. Such values are useful with certain other tools that a user may also intend to use. +by normalising to `1000000` reads. Such values are useful with certain other tools that a user may also intend to use. Subsequently, these TPMs were re-scaled to meet the library size of each sample, thus providing RATs with count-like abundance values that retain -the TPM's normalisation by isoform length. However, it is not necessary to scale in two separate steps. +the normalisation by isoform length. However, it is not necessary to scale in two separate steps. Both `fish4rodents()` and `call_DTU()` support scaling by a single value or a vector of values. 
If you don't need the TPMs, you can scale directly to the desired library size(s), as in the examples below: @@ -495,26 +497,33 @@ mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A, ``` You can mix and match scaling options as per your needs, so take care to ensure that the scaling you apply is appropriate. -It is important to note, that if you simply run both methods with their respective defaults, you'll effectively run RATs +*It is important to note*, that if you simply run both methods with their respective defaults, you'll effectively run RATs on TPM values, which is extremely underpowered and not recommended. Please, provide appropriate scaling factors for your data. -*** - - -# Annotation discrepancies +## Annotation discrepancies Different annotation versions often preserve transcript IDs, despite altering the details of the transcript model. +They also tend to include more or fewer transcripts, affecting the result of quantification. It is important to use the same annotation throughout the workflow, otherwise the abundances will not be comparable in a meaningful way. -All internal operations and the output of RATs are based on the annotation provided: +RATs will abort the run if the set of feature IDs in the provided annotation does not match fully the set of IDs in the quantifications. +If this happens, ensure you are using the exact same annotation throughout your workflow. + +For special use cases, RATs provides the option to ignore the discrepancy and pretend everything is OK. Do this at your own risk. + +```{r, eval=FALSE} +mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A, + boot_data_B= mydata$boot_data_B, + reckless=TRUE, verbose=TRUE) +``` + -* Any transcripts present in the data but missing from the annotation will be ignored completely and -will not show up in the output, as there is no reliable way to match them to gene IDs. 
-* Any transcript/gene present in the annotation but missing from the data will be included in the output as zero expression. -* If the samples appear to use different annotations from one another, RATs will abort. +In reckless mode, all internal operations and the output of RATs are based on the annotation provided: +* Any transcript IDs present in the data but missing from the annotation will be ignored and will not show up in the output at all, as they cannot be matched to the gene IDs. +* Any transcript ID present in the annotation but missing from the data will be included in the output as zero expression. They can be identified later as `NA` values in `$Transcripts[, .(stdevA, stdevB)]` as these fields are not used downstream by RATs. `NA` values in other numeric fields are explicitly overwritten with `0` to allow downstream interoperability. *** diff --git a/inst/doc/input.html b/inst/doc/input.html index 8078e6c..65d0336 100644 --- a/inst/doc/input.html +++ b/inst/doc/input.html @@ -11,7 +11,7 @@ - +
As of v0.6.0
, input from a sleuth
object is no longer supported by RATs. This is due to changes in Sleuth v0.29
that made the bootstrap data no longer available. Instead, RATs already supports input directly from the Kallisto/Salmon quantifications.
For option 1, the function fish4rodents()
will load the data into tables suitable for option 2. Details in the respective section below.
For options 2 and 3, the format of the tables is as in the example below. The first column contains the transcript identifiers. Subsequent columns contain the abundances.
-## target V1 V2 V3
-## 1: LC1 3 2 0
-## 2: NIA1 333 310 340
-## 3: NIA2 666 680 610
-## 4: 1A1N-1 10 11 7
-## 5: 1A1N-2 20 21 18
-## 6: 1D1C:one 0 0 0
+## target V1 V2 V3
+## 1: 1A1B.a 69 96 88
+## 2: 1A1B.b 0 0 0
+## 3: LC1 3 2 0
+## 4: NIA1 333 310 340
+## 5: NIA2 666 680 610
+## 6: 1A1N-1 10 11 7
Regardless of data format, RATs also needs an annotation data.frame
or data.table
that matches transcript identifiers to gene identifiers. By default the column labels are target_id
for the transcript IDs and parent_id
for the gene IDs. These label values can be overridden (see Additional Settings later in this vignette). The annotation table looks like this:
## target_id parent_id
-## 1 NIB.1 NIB
-## 2 1A1N-2 1A1N
-## 3 1D1C:one 1D1C
-## 4 1D1C:two 1D1C
-## 5 1B1C.1 1B1C
-## 6 1B1C.2 1B1C
+## 1 1A1B.a 1A1B
+## 2 1A1B.b 1A1B
+## 3 NIB.1 NIB
+## 4 1A1N-2 1A1N
+## 5 1D1C:one 1D1C
+## 6 1D1C:two 1D1C
Such a table can be compiled with a variety of methods and from a variety of resources. RATs provides functionality to create this table from a GTF file. (Note: GFF3 is not supported for this)
# Extract transcript ID to gene ID index from a GTF annotation.
myannot <- annot2ids("my_annotation_file.gtf")
@@ -274,14 +276,14 @@ To bypass the complexity of involving third-party tools in this tutorial, we will instead use emulated data. RATs comes with data emulators, intended to be used for testing the code. However, they are also convenient to use for showcasing how RATs works.
-By default, RATs reports on its progress and produces a summary report. These can be suppressed using the verbose = FALSE
parameter to the call. To prevent cluttering this tutorial with verbose output, we will use this option in all the examples.
By default, RATs reports on its progress and produces a summary report. These can be suppressed using the verbose = FALSE
parameter. This also suppresses some warnings generated by RATs when checking the sanity of its input, so we do not recommend it for general use. However, to prevent cluttering this tutorial with verbose output, we will use this option in our examples.
If you choose to allow verbose output when trying out the examples below using the emulated data, you will get some warnings about the number of bootstraps. The warning is triggered because the emulated dataset used in the examples imitates only the structure of real data, not the actual volume of it, and as such it contains unrealistically few bootstrap iterations. Simply ignore these warnings for these examples.
This is the simplest usage case, and your likely input type if you use quantification methods other than Kallisto and Salmon.
First, let’s emulate some data to work with:
# Simulate some data.
-simdat <- sim_count_data()
+simdat <- sim_count_data(clean=TRUE)
# For convenience let's assign the contents of the list to separate variables
mycond_A <- simdat[[2]] # Simulated abundances for one condition.
mycond_B <- simdat[[3]] # Simulated abundances for other condition.
@@ -319,7 +321,7 @@ Advanced options
DTU from bootstrapped abundance estimates
First, let’s emulate some data, as we did before.
# Simulate some data. (Notice it is a different function than before.)
-simdat <- sim_boot_data()
+simdat <- sim_boot_data(clean=TRUE)
# For convenience let's assign the contents of the list to separate variables
mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition.
@@ -408,13 +410,14 @@ Thresholds
# Calling DTU with custom thresholds.
mydtu <- call_DTU(annot= myannot,
boot_data_A= mycond_A, boot_data_B= mycond_B,
- p_thresh= 0.01, abund_thresh= 10, dprop_thres = 0.25)
+ p_thresh= 0.01, dprop_thres = 0.15, abund_thresh= 10)
-p_thresh
- Statistical significance level. P-values below this will be considered significant. Lower threshold values are stricter. (Default 0.05)
-abund_thresh
- Noise threshold. Transcripts with abundance below that value in both conditions are ignored. Higher threshold values are stricter. (Default 5, assumes abundances scaled to library size)
-dprop_thresh
- Effect size threshold. Transcripts whose proportion changes between conditions by less than the threshold are considered non-DTU, regardless of their statistical significance. Higher threshold values are stricter. (Default 0.20)
+p_thresh
- Statistical significance level. P-values below this will be considered significant. Lower threshold values are stricter, and help reduce low-count low-confidence calls. (Default 0.05, very permissive)
+dprop_thresh
- Effect size threshold. Transcripts whose proportion changes between conditions by less than this threshold are considered uninteresting, regardless of their statistical significance. (Default 0.20, quite strict)
+abund_thresh
- Noise threshold. Minimum mean (across replicates) abundance for a transcript to be considered expressed. Transcripts with mean abundances below this will be ignored. Mean total gene count must also meet this value in both conditions, a.k.a. at least one expressed isoform must exist in each condition. (Default 5, very permissive)
-The default values for these thresholds have been chosen such that they achieve a median FDR <5% for a high quality dataset from Arabidopsis thaliana, even with only 3 replicates per condition. Your mileage may vary and you should give some consideration to selecting appropriate values. Depending on the settings, additional thresholds are available and will be discussed in their respective sections below.
+The default values for these thresholds have been chosen such that they achieve a median FDR <5% for a high quality dataset from Arabidopsis thaliana, even with only 3 replicates per condition. Your mileage may vary and you should give some consideration to selecting appropriate values.
+Depending on the settings, additional thresholds are available and will be discussed in their respective sections below.
threads
- The number of threads to use. (Default 1)Due to core R implementation limitations, the type of multi-threading used in RATs works only in POSIX-compliant systems. Refer to the parallel
package for details.
Due to core R implementation limitations, the type of multi-threading used in RATs works only in POSIX-compliant systems (Linux, Mac, not Windows). Refer to the parallel
package for details.
As mentioned previously, various commonly-used normalised abundance units (like TPM) are scaled to an arbitrary and usually smaller-than-actual sample size (often 1 million reads or fragments). This artificially deflates the significances from count-based tests, such as those employed by RATs, and reduces the statistical power of the method.
-To counter this, RATs provides the option to scale abundaces either equally by a single factor (such as average library size among samples) or by a vector of factors (one per sample). The former maintains any pre-existing library-size normalisation among samples. This is necessary for fold-change based methods, but RATs does not require it. Instead, using the respective actual library sizes of the samples allows the higher-throughput samples to have a bigger influence than the lower-throughput samples. This is particularly relevant if your samples have dissimilar library sizes.
-For flexibility with different types of input, these scaling options can be applied in either of two stages: The data import step by fish4rodents()
, or the actual testing step by call_DTU()
. In the example examined previously, fish4rodents()
was instructed to create TPM abundances, by normalising to 10000000 reads. Such values are useful with certain other tools that a user may also intend to use. Subsequently, these TPMs were re-scaled to meet the library size of each sample, thus providing RATs with count-like abundance values that retain the TPM’s normalisation by isoform length. However, it is not necessary to scale in two separate steps.
To counter this, RATs provides the option to scale abundances either equally by a single factor (such as average library size among samples) or by a vector of factors (one per sample). The former maintains any pre-existing library-size normalisation among samples. This is necessary for fold-change based methods, but RATs does not require it. Instead, using the respective actual library sizes of the samples allows the higher-throughput samples to have a bigger influence than the lower-throughput samples. This is particularly relevant if your samples have very dissimilar library sizes.
+For flexibility with different types of input, these scaling options can be applied in either of two stages: The data import step by fish4rodents()
, or the actual testing step by call_DTU()
. In the example examined previously, fish4rodents()
was instructed to create TPM abundances, by normalising to 1000000
reads. Such values are useful with certain other tools that a user may also intend to use. Subsequently, these TPMs were re-scaled to meet the library size of each sample, thus providing RATs with count-like abundance values that retain the normalisation by isoform length. However, it is not necessary to scale in two separate steps.
Both fish4rodents()
and call_DTU()
support scaling by a single value or a vector of values. If you don’t need the TPMs, you can scale directly to the desired library size(s), as in the examples below:
# 1:
# Scale directly to library sizes at the import step.
@@ -560,21 +563,24 @@ Abundance scaling
mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A,
boot_data_B= mydata$boot_data_B,
scaling=c(25, 26.7, 23, 50.0, 45, 48.46, 52.36))
-You can mix and match scaling options as per your needs, so take care to ensure that the scaling you apply is appropriate. It is important to note, that if you simply run both methods with their respective defaults, you’ll effectively run RATs on TPM values, which is extremely underpowered and not recommended. Please, provide appropriate scaling factors for your data.
-Different annotation versions often preserve transcript IDs, despite altering the details of the transcript model. It is important to use the same annotation throughout the workflow, otherwise the abundances will not be comparable in a meaningful way.
-All internal operations and the output of RATs are based on the annotation provided:
+You can mix and match scaling options as per your needs, so take care to ensure that the scaling you apply is appropriate. It is important to note, that if you simply run both methods with their respective defaults, you’ll effectively run RATs on TPM values, which is extremely underpowered and not recommended. Please, provide appropriate scaling factors for your data.
+Different annotation versions often preserve transcript IDs, despite altering the details of the transcript model. They also tend to include more or fewer transcripts, affecting the result of quantification. It is important to use the same annotation throughout the workflow, otherwise the abundances will not be comparable in a meaningful way.
+RATs will abort the run if the set of feature IDs in the provided annotation does not match fully the set of IDs in the quantifications. If this happens, ensure you are using the exact same annotation throughout your workflow.
+For special use cases, RATs provides the option to ignore the discrepancy and pretend everything is OK. Do this at your own risk.
+mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A,
+ boot_data_B= mydata$boot_data_B,
+ reckless=TRUE, verbose=TRUE)
+In reckless mode, all internal operations and the output of RATs are based on the annotation provided:
NA
values in $Transcripts[, .(stdevA, stdevB)]
as these fields are not used downstream by RATs. NA
values in other numeric fields are explicitly overwritten with 0
to allow downstream interoperability.The rats
R package was developed within The Barton Group at The University of Dundee by Dr. Kimon Froussios, Dr. Kira Mourão and Dr. Nick Schurch.
library(rats)
Set up an example.
# Simulate some data.
-simdat <- sim_boot_data()
+simdat <- sim_boot_data(clean=TRUE)
# For convenience let's assign the contents of the list to separate variables
mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition.
mycond_B <- simdat[[3]] # Simulated bootstrapped data for other condition.
@@ -251,16 +253,16 @@ Summary of DTU
# A really simple tally of the outcome.
print( dtu_summary(mydtu) )
## category tally
-## 1 DTU genes (gene test) 1
+## 1 DTU genes (gene test) 2
## 2 non-DTU genes (gene test) 2
## 3 ineligible genes (gene test) 7
-## 4 DTU genes (transc. test) 1
+## 4 DTU genes (transc. test) 2
## 5 non-DTU genes (transc. test) 3
## 6 ineligible genes (transc. test) 6
-## 7 DTU genes (both tests) 1
+## 7 DTU genes (both tests) 2
## 8 non-DTU genes (both tests) 2
## 9 ineligible genes (both tests) 7
-## 10 DTU transcripts 3
+## 10 DTU transcripts 5
## 11 non-DTU transcripts 7
## 12 ineligible transcripts 11
RATs tests for DTU at both the gene level and the transcript level. Therefore, as of v0.4.2, dtu_summary()
will display the DTU genes according to either method as well as the intersection of the two.
@@ -269,7 +271,7 @@ Summary of DTU
ids <- get_dtu_ids(mydtu)
print( ids )
## $`DTU genes (gene test)`
-## [1] "MIX6"
+## [1] "1A1B" "MIX6"
##
## $`non-DTU genes (gene test)`
## [1] "CC" "NN"
@@ -278,7 +280,7 @@ Summary of DTU
## [1] "LC" "1A1N" "1B1C" "1D1C" "ALLA" "ALLB" "NIB"
##
## $`DTU genes (transc. test)`
-## [1] "MIX6"
+## [1] "1A1B" "MIX6"
##
## $`non-DTU genes (transc. test)`
## [1] "LC" "CC" "NN"
@@ -287,7 +289,7 @@ Summary of DTU
## [1] "1A1N" "1B1C" "1D1C" "ALLA" "ALLB" "NIB"
##
## $`DTU genes (both tests)`
-## [1] "MIX6"
+## [1] "1A1B" "MIX6"
##
## $`non-DTU genes (both tests)`
## [1] "CC" "NN"
@@ -296,7 +298,7 @@ Summary of DTU
## [1] "LC" "1A1N" "1B1C" "1D1C" "ALLA" "ALLB" "NIB"
##
## $`DTU transcripts`
-## [1] "MIX6.c1" "MIX6.c2" "MIX6.c4"
+## [1] "1A1B.a" "1A1B.b" "MIX6.c1" "MIX6.c2" "MIX6.c4"
##
## $`non-DTU transcripts`
## [1] "LC2" "CC_a" "CC_b" "MIX6.c3" "2NN" "1NN" "MIX6.nc"
@@ -312,29 +314,29 @@ Summary of isoform switching
# A tally of genes switching isoform ranks.
print( dtu_switch_summary(mydtu) )
## category genes
-## 1 Primary switch (gene test) 1
+## 1 Primary switch (gene test) 2
## 2 Non-primary switch (gene test) 1
-## 3 Primary switch (transc. test) 1
+## 3 Primary switch (transc. test) 2
## 4 Non-primary switch (transc. test) 1
-## 5 Primary switch (both tests) 1
+## 5 Primary switch (both tests) 2
## 6 Non-primary switch (both tests) 1
# The gene IDs displaying isoform switching.
ids <- get_switch_ids(mydtu)
print( ids )
## $`Primary switch (gene test)`
-## [1] "MIX6"
+## [1] "1A1B" "MIX6"
##
## $`Non-primary switch (gene test)`
## [1] "MIX6"
##
## $`Primary switch (transc. test)`
-## [1] "MIX6"
+## [1] "1A1B" "MIX6"
##
## $`Non-primary switch (transc. test)`
## [1] "MIX6"
##
## $`Primary switch (both tests)`
-## [1] "MIX6"
+## [1] "1A1B" "MIX6"
##
## $`Non-primary switch (both tests)`
## [1] "MIX6"
@@ -349,11 +351,15 @@ Summary of DTU plurality
# A tally of genes by the number of their isoforms that show DTU.
print( dtu_plurality_summary(mydtu) )
## isof_affected num_of_genes
-## 1 3 1
+## 1 2 1
+## 2 3 1
# The gene IDs, grouped by the number of isoforms that show DTU.
ids <- get_plurality_ids(mydtu)
print( ids )
-## $`3`
+## $`2`
+## [1] "1A1B"
+##
+## $`3`
## [1] "MIX6"
These give you the number and IDs of genes in which 2, 3, etc isoforms show DTU.
For example, one gene (“MIX6”) is showing significant change in 3 of its isoforms.
@@ -378,7 +384,7 @@ Parameters
## [16] "dprop_thresh" "correction" "abund_scaling"
## [19] "quant_boot" "quant_reprod_thresh" "quant_bootnum"
## [22] "rep_boot" "rep_reprod_thresh" "rep_bootnum"
-## [25] "seed"
+## [25] "seed" "reckless"
description
- (str) Free-text description of the run. It is useful to record data sources, annotation source and version, experimental parameters…time
- (str) Date and time for the run. This does not represent the exact time the run started.rep_reprod_thresh
- (num) The value passed to the rrep_thresh
parameter, if rboot==TRUE
..rep_boot
- (bool) The value passed to the rboot
parameter.rep_bootnum
- (int) The number of replicate bootstrapping iterations (currently M*N, where M and N are the numbers of samples in the two conditions), if rboot==TRUE
.seed
- (int) Custom seed for the random generator.seed
- (int) Custom seed for the random generator. NA
if none was specified.reckless
- (bool) The value passed to the reckless
parameter.Note: If bootstraps are disabled, the bootstrap-related fields may have NA
values despite any values that were passed to their respective parameters.
Note: If bootstraps are disabled, the bootstrap-related fields may be NA
, regardless of supplied values, to reflect the fact that they were not used.
# Abundance table for first condition.
print( head(mydtu$Abundances[[1]]) )
-## V1 V2 target_id parent_id
-## 1: 19.66667 15.5 1A1N-2 1A1N
-## 2: NA NA 1B1C.1 1B1C
-## 3: 52.33333 53.5 1B1C.2 1B1C
-## 4: 0.00000 0.0 1D1C:one 1D1C
-## 5: 76.00000 78.0 1D1C:two 1D1C
-## 6: 50.00000 45.0 ALLA1 ALLA
+## V1 V2 target_id parent_id
+## 1: 84.33333 132.5 1A1B.a 1A1B
+## 2: 0.00000 0.0 1A1B.b 1A1B
+## 3: 19.66667 15.5 1A1N-2 1A1N
+## 4: 0.00000 0.0 1B1C.1 1B1C
+## 5: 52.33333 53.5 1B1C.2 1B1C
+## 6: 0.00000 0.0 1D1C:one 1D1C
library(rats)
Set up an example.
# Simulate some data.
-simdat <- sim_boot_data()
+simdat <- sim_boot_data(clean=TRUE)
# For convenience let's assign the contents of the list to separate variables
mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition.
mycond_B <- simdat[[3]] # Simulated bootstrapped data for other condition.
@@ -257,7 +259,7 @@ Isoform abundances for a given gene
When there are many replicates or many isoforms, it may be preferable to group abundances by isoform, making individual comparisons easier:
# Grouping by isoform:
plot_gene(mydtu, "MIX6", style="byisoform")
-
+
Overview plots
@@ -290,7 +292,7 @@ Diagnostic plots
Currently there is only one plot type in this category.
# Matrix of pairwise Pearson's correlations among samples.
plot_diagnostics(mydtu, type='cormat') # Default type.
-
+
What you want to see here, is a nice separation between the samples of different conditions. Anomalies in this plot may indicate batch effects or mislabelled samples.
@@ -314,7 +316,7 @@ Change information layers
The fillby
, colourby
and shapeby
parameters can be respectively used to control which information layers are encoded as fill, line/point colour, and point shape. Possible values are c("isoform", "condition", "DTU", "none", "replicate")
. Be aware that some combinations of plot style and information layers are not possible. If safe to do so these will be silently ignored, otherwise an error message will appear.
# For a less busy look, any of the information layers can be disabled.
plot_gene(mydtu, "MIX6", style="byisoform", colourby="none", shapeby="none")
-
+
The following command will approximate the older version of the gene plot (where DTU determined the fill colour):
plot_gene(mydtu, "MIX6", fillby="DTU", shapeby="none")
diff --git a/man/calculate_DTU.Rd b/man/calculate_DTU.Rd
index dfcf270..d974a32 100644
--- a/man/calculate_DTU.Rd
+++ b/man/calculate_DTU.Rd
@@ -20,7 +20,7 @@ calculate_DTU(counts_A, counts_B, tx_filter, test_transc, test_genes, full,
\item{full}{Either "full" (for complete output structure) or "short" (for bootstrapping).}
-\item{count_thresh}{Minimum number of counts per replicate.}
+\item{count_thresh}{Minimum average count across replicates.}
\item{p_thresh}{The p-value threshold.}
diff --git a/man/call_DTU.Rd b/man/call_DTU.Rd
index 7634778..4df817a 100644
--- a/man/call_DTU.Rd
+++ b/man/call_DTU.Rd
@@ -11,7 +11,7 @@ call_DTU(annot = NULL, TARGET_COL = "target_id", PARENT_COL = "parent_id",
dprop_thresh = 0.2, correction = "BH", scaling = 1, testmode = "both",
qboot = TRUE, qbootnum = 0L, qrep_thresh = 0.95, rboot = TRUE,
rrep_thresh = 0.85, description = NA_character_, verbose = TRUE,
- threads = 1L, seed = NA_integer_, dbg = "0")
+ threads = 1L, seed = NA_integer_, reckless = FALSE, dbg = "0")
}
\arguments{
\item{annot}{A data.table matching transcript identifiers to gene identifiers. Any additional columns are allowed but ignored.}
@@ -36,7 +36,7 @@ call_DTU(annot = NULL, TARGET_COL = "target_id", PARENT_COL = "parent_id",
\item{p_thresh}{The p-value threshold. (Default 0.05)}
-\item{abund_thresh}{Noise threshold. Minimum mean abundance for transcripts to be eligible for testing. (Default 5)}
+\item{abund_thresh}{Noise threshold. Minimum mean (across replicates) abundance for transcripts (and genes) to be eligible for testing. (Default 5)}
\item{dprop_thresh}{Effect size threshold. Minimum change in proportion of a transcript for it to be considered meaningful. (Default 0.20)}
@@ -64,7 +64,9 @@ call_DTU(annot = NULL, TARGET_COL = "target_id", PARENT_COL = "parent_id",
\item{seed}{A numeric integer used to initialise the random number engine. Use this only if reproducible bootstrap selections are required. (Default NA)}
-\item{dbg}{Debugging mode. Interrupt execution at the specified flag-point. Used to speed up code-tests by avoiding irrelevant downstream processing. (Default 0: do not interrupt)}
+\item{reckless}{RATs normally aborts if any inconsistency is detected among the transcript IDs found in the annotation and the quantifications. Enabling reckless mode will downgrade this error to a warning and allow RATs to continue the run. Not recommended unless you know why the inconsistency exists and how it will affect the results. (Default FALSE)}
+
+\item{dbg}{Debugging mode only. Interrupt execution at the specified flag-point. Used to speed up code-tests by avoiding irrelevant downstream processing. (Default 0: do not interrupt)}
}
\value{
List of mixed types. Contains a list of runtime settings, a table of gene-level results, a table of transcript-level results, and a list of two tables with the transcript abundances.
diff --git a/man/combine_sim_dtu.Rd b/man/combine_sim_dtu.Rd
deleted file mode 100644
index 7ac8e7f..0000000
--- a/man/combine_sim_dtu.Rd
+++ /dev/null
@@ -1,19 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/data_simulators.R
-\name{combine_sim_dtu}
-\alias{combine_sim_dtu}
-\title{Combine simulation details with DTU calls, in preparation for plotting.}
-\usage{
-combine_sim_dtu(sim, dtuo)
-}
-\arguments{
-\item{sim}{An object generated with countrange_sim(), containing the simulated data details with which the \code{dtu} object was calculated.}
-
-\item{dtuo}{The object with the DTU results corresponding to the \code{sim} object's data.}
-}
-\value{
-dataframe
-}
-\description{
-Combine simulation details with DTU calls, in preparation for plotting.
-}
diff --git a/man/countrange_sim.Rd b/man/countrange_sim.Rd
deleted file mode 100644
index 2ad76a3..0000000
--- a/man/countrange_sim.Rd
+++ /dev/null
@@ -1,26 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/data_simulators.R
-\name{countrange_sim}
-\alias{countrange_sim}
-\title{Generate artificial read-count data for simulated 2-transcript genes,
-systematically covering a broad range of proportions and magnitudes.}
-\usage{
-countrange_sim(proportions = seq(0, 1, 0.01), magnitudes = c(1, 5, 10, 30,
- 100, 500, 1000))
-}
-\arguments{
-\item{proportions}{Range and step of proportions for 1st transcript. Sibling transcript complemented from these.}
-
-\item{magnitudes}{vector of read-count sizes to use.}
-}
-\value{
-list containing: \code{data} a minimal sleuth-like object.
- \code{anno} a dataframe matching \code{target_id} to \code{parent_id}.
- \code{sim} a dataframe with the simulation parameters per transcript.
-}
-\description{
-For each level of transcript proportions in condition A, all proportion levels are
-generated for condition B as possible new states. Therefore the number of transcripts
-generated is 2 * N^2 * M, where 2 is the number of gene isoforms per paramater combination,
-N is the number of proportion levels and M the number of magnitude levels.
-}
diff --git a/man/parameters_are_good.Rd b/man/parameters_are_good.Rd
index 7f2d27b..1ec7bc7 100644
--- a/man/parameters_are_good.Rd
+++ b/man/parameters_are_good.Rd
@@ -7,7 +7,7 @@
parameters_are_good(annot, count_data_A, count_data_B, boot_data_A, boot_data_B,
TARGET_COL, PARENT_COL, correction, testmode, scaling, threads, seed,
p_thresh, abund_thresh, dprop_thresh, qboot, qbootnum, qrep_thresh, rboot,
- rrep_thresh)
+ rrep_thresh, reckless)
}
\arguments{
\item{annot}{Annotation dataframe.}
@@ -49,6 +49,8 @@ parameters_are_good(annot, count_data_A, count_data_B, boot_data_A, boot_data_B,
\item{rboot}{Whether to bootstrap against samples.}
\item{rrep_thresh}{Confidence threshold.}
+
+\item{reckless}{Whether to ignore detected annotation discrepancies.}
}
\value{
List: \itemize{
diff --git a/man/plot_sim.Rd b/man/plot_sim.Rd
deleted file mode 100644
index 7c5c160..0000000
--- a/man/plot_sim.Rd
+++ /dev/null
@@ -1,16 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/data_simulators.R
-\name{plot_sim}
-\alias{plot_sim}
-\title{Plot relationship of parameters.}
-\usage{
-plot_sim(data, type = "AvBvM")
-}
-\arguments{
-\item{data}{the output from combine_sim_dtu()}
-
-\item{type}{(str) "AvBvM", "B/AvM", "AvBvMprop", "B/AvMprop", "B-AvM"}
-}
-\description{
-Plot relationship of parameters.
-}
diff --git a/man/sim_boot_data.Rd b/man/sim_boot_data.Rd
index b78c3fc..c17f343 100644
--- a/man/sim_boot_data.Rd
+++ b/man/sim_boot_data.Rd
@@ -5,7 +5,7 @@
\title{Generate an artificial dataset of bootstrapped abundance estimates, for code-testing or examples.}
\usage{
sim_boot_data(errannot_inconsistent = FALSE, PARENT_COL = "parent_id",
- TARGET_COL = "target_id")
+ TARGET_COL = "target_id", clean = FALSE)
}
\arguments{
\item{errannot_inconsistent}{Logical. Introduces an inconsistency in the transcript IDs, for testing of sanity checks. (FALSE)}
@@ -13,6 +13,8 @@ sim_boot_data(errannot_inconsistent = FALSE, PARENT_COL = "parent_id",
\item{PARENT_COL}{Name for annotation column containing respective gene identification.}
\item{TARGET_COL}{Name for annotation column containing transcript identification.}
+
+\item{clean}{Logical. Remove the intentional inconsistency of IDs between annotation and abundances.}
}
\value{
A list with 3 elements. First a data.frame with the corresponding annotation.
diff --git a/man/sim_count_data.Rd b/man/sim_count_data.Rd
index 094c672..22b7e8f 100644
--- a/man/sim_count_data.Rd
+++ b/man/sim_count_data.Rd
@@ -5,7 +5,7 @@
\title{Generate an artificial dataset of abundance estimates, for code-testing or examples.}
\usage{
sim_count_data(errannot_inconsistent = FALSE, PARENT_COL = "parent_id",
- TARGET_COL = "target_id")
+ TARGET_COL = "target_id", clean = FALSE)
}
\arguments{
\item{errannot_inconsistent}{Logical. Introduces an inconsistency in the transcript IDs, for testing of sanity checks. (FALSE)}
@@ -13,6 +13,8 @@ sim_count_data(errannot_inconsistent = FALSE, PARENT_COL = "parent_id",
\item{PARENT_COL}{Name for annotation column containing respective gene identification.}
\item{TARGET_COL}{Name for annotation column containing transcript identification.}
+
+\item{clean}{Logical. Remove the intentional inconsistency of IDs between annotation and abundances.}
}
\value{
A list with 3 elements. First, a data.frame with the corresponding annotation table.
diff --git a/man/sim_sleuth_data.Rd b/man/sim_sleuth_data.Rd
deleted file mode 100644
index b408b07..0000000
--- a/man/sim_sleuth_data.Rd
+++ /dev/null
@@ -1,39 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/data_simulators.R
-\name{sim_sleuth_data}
-\alias{sim_sleuth_data}
-\title{Generate an artificial minimal sleuth-like structure, for code-testing or examples.}
-\usage{
-sim_sleuth_data(varname = "condition", COUNTS_COL = "est_counts",
- TARGET_COL = "target_id", PARENT_COL = "parent_id",
- BS_TARGET_COL = "target_id", cnames = c("A", "B"),
- errannot_inconsistent = FALSE, cv_dt = FALSE)
-}
-\arguments{
-\item{varname}{Name for the variables by which to compare.}
-
-\item{COUNTS_COL}{Name for the bootdtraps column containing counts.}
-
-\item{TARGET_COL}{Name for annotation column containing transcript identification.}
-
-\item{PARENT_COL}{Name for annotation column containing respective gene identification.}
-
-\item{BS_TARGET_COL}{Name for bootstraps column containing transcript identification.}
-
-\item{cnames}{A vector of (two) name values for the comparison variable.}
-
-\item{errannot_inconsistent}{Logical. Introduces an inconsistency in the transcript IDs, for testing of sanity checks. (FALSE)}
-
-\item{cv_dt}{Logical. Whether covariates table should be a data.table (FALSE).}
-}
-\value{
-A list with \code{slo} a minimal sleuth-like object, \code{annot} a corresponding annotation data.frame,
- \code{isx} a vector of trancripts common between generated annotation and bootstraps (useful for code testing).
-
-The simulated data will have non-uniform number of bootstraps per sample and non-uniform order of
-transcript id's among samples and bootstraps. These conditions are unlikely to occur in real data,
-but this allows testing that the code can handle it.
-}
-\description{
-The default values here should match the default values expected by calculate_DTU().
-}
diff --git a/tests/testthat/test_1_internal-structures.R b/tests/testthat/test_1_internal-structures.R
index da012bc..4d2e067 100644
--- a/tests/testthat/test_1_internal-structures.R
+++ b/tests/testthat/test_1_internal-structures.R
@@ -17,13 +17,13 @@ test_that("The reporting structures are created correctly", {
expect_type(full$Parameters, "list")
expect_true(typeof(full$Parameters) == typeof(short$Parameters))
- expect_length(full$Parameters, 25)
+ expect_length(full$Parameters, 26)
expect_length(short$Parameters, 2)
expect_named(full$Parameters, c("description", "time", "rats_version", "R_version",
"var_name", "cond_A", "cond_B", "data_type", "num_replic_A", "num_replic_B", "num_genes", "num_transc",
"tests", "p_thresh", "abund_thresh", "dprop_thresh", "correction", "abund_scaling",
"quant_boot", "quant_reprod_thresh", "quant_bootnum",
- "rep_boot", "rep_reprod_thresh", "rep_bootnum", "seed"))
+ "rep_boot", "rep_reprod_thresh", "rep_bootnum", "seed", "reckless"))
expect_named(short$Parameters, c("num_replic_A", "num_replic_B"))
expect_true(is.data.frame(full$Genes))
diff --git a/tests/testthat/test_2_data-munging.R b/tests/testthat/test_2_data-munging.R
index f9940d5..f95001a 100644
--- a/tests/testthat/test_2_data-munging.R
+++ b/tests/testthat/test_2_data-munging.R
@@ -2,80 +2,6 @@
#==============================================================================
context("DTU Internal data munging")
-#==============================================================================
-test_that("Samples are grouped correctly", {
- sim <- sim_sleuth_data(cv_dt=FALSE)
- expect_silent(r <- group_samples(sim$slo$sample_to_covariates))
- # number of covariates
- expect_equal(length(r), length(sim$slo$sample_to_covariates))
- # names of covariates
- expect_named(r, names(sim$slo$sample_to_covariates))
- # number of values of each covariate
- expect_equal(length(r[[1]]), length(levels(as.factor(sim$slo$sample_to_covariates[[1]]))))
- expect_equal(length(r[[2]]), length(levels(as.factor(sim$slo$sample_to_covariates[[2]]))))
- # total number of samples
- expect_equal(sum(sapply(r[[1]],length)), length(sim$slo$sample_to_covariates[[1]]))
- expect_equal(sum(sapply(r[[2]],length)), length(sim$slo$sample_to_covariates[[2]]))
-
- # Repeat tests with data.table covariate
- sim <- sim_sleuth_data(cv_dt=TRUE)
- expect_silent(r <- group_samples(sim$slo$sample_to_covariates))
- # number of covariates
- expect_equal(length(r), length(sim$slo$sample_to_covariates))
- # names of covariates
- expect_named(r, names(sim$slo$sample_to_covariates))
- # number of values of each covariate
- expect_equal(length(r[[1]]), length(levels(as.factor(sim$slo$sample_to_covariates[[1]]))))
- expect_equal(length(r[[2]]), length(levels(as.factor(sim$slo$sample_to_covariates[[2]]))))
- # total number of samples
- expect_equal(sum(sapply(r[[1]],length)), length(sim$slo$sample_to_covariates[[1]]))
- expect_equal(sum(sapply(r[[2]],length)), length(sim$slo$sample_to_covariates[[2]]))
-
-})
-
-
-#==============================================================================
-test_that("Bootstrapped counts are extracted correctly", {
- samples <- c(1,3)
- bst <- "id"
- cnt <- "counts"
- sim <- sim_sleuth_data(COUNTS_COL = cnt, BS_TARGET_COL = bst)
- lr <- denest_sleuth_boots(sim$slo, sim$annot, samples, cnt, bst)
-
- for (i in 1:length(lr)) {
- # The transcripts supposed to be there are there.
- expect_true(all(sim$isx %in% as.character(lr[[i]]$target_id)))
- # No NA.
- expect_false(any(is.na(lr[[i]])))
-
- # Number of bootstraps per sample.
- expect_equal(length(lr[[i]]) - 1, length(sim$slo$kal[[samples[i]]]$bootstrap)) # one column in lr[[i]] is IDs.
-
- # All target counts pulled from the correct bootstraps and the correct transcripts.
- fltr2 <- match(sim$isx, lr[[i]][["target_id"]]) # Where in the extracted counts are the expected IDs.
- counts_ok <- sapply(2:(length(lr[[i]])), function(j) {
- fltr1 <- match(sim$isx, sim$slo$kal[[samples[i]]]$bootstrap[[j-1]][[bst]]) # Where in the boot are the expected IDs.
- all(sim$slo$kal[[samples[i]]]$bootstrap[[j-1]][[cnt]][fltr1] == lr[[i]][[j]][fltr2]) # Both vectors' elements are ordered by the same IDs.
- })
- expect_true(all(counts_ok))
-
- # IDs in annotation, but not in bootstraps, should be 0.
- missing_from_boots_ok <- sapply(2:(length(lr[[i]])), function(j) {
- nib <- setdiff(sim$annot$target_id, sim$isx)
- nib <- match(nib, lr[[i]][["target_id"]])
- all(lr[[i]][[j]][nib] == 0)
- })
- expect_true(all(missing_from_boots_ok))
-
- # IDs in bootstrap, but not in annotation, should be absent.
- missing_from_annot_ok <- sapply(1:(length(lr[[i]]) - 1), function(j) {
- nia <- setdiff(sim$slo$kal[[i]]$bootstrap[[j]], sim$isx)
- any(nia %in% lr[[i]][["target_id"]])
- })
- expect_false(any(missing_from_annot_ok))
- }
-})
-
#==============================================================================
test_that("The number of iterations is detected correctly", {
diff --git a/tests/testthat/test_3_input-checks.R b/tests/testthat/test_3_input-checks.R
index 5343bc6..6d434ab 100644
--- a/tests/testthat/test_3_input-checks.R
+++ b/tests/testthat/test_3_input-checks.R
@@ -3,146 +3,202 @@
context("DTU Input checks")
#==============================================================================
-test_that("The input checks work", {
+test_that("No false alarms", {
name_A <- "one"
name_B <- "two"
- wrong_name <- "RUBBISH_COLUMN_NAME"
- # Emulate bootstrap data.
- sim1 <- sim_boot_data()
- sim2 <- sim_boot_data(errannot_inconsistent=FALSE, PARENT_COL="parent", TARGET_COL="target")
- # Emulate non-bootstrap data.
- counts_A <- sim1$boots_A[[1]]
- counts_B <- sim1$boots_B[[1]]
-
- # No false alarms with valid calls.
- expect_silent(call_DTU(annot= sim2$annot, boot_data_A= sim2$boots_A, boot_data_B= sim2$boots_B, name_A = "AAAA", name_B = "BBBB", varname= "waffles", p_thresh= 0.01, abund_thresh= 10,
+ sim <- sim_boot_data(clean=TRUE, errannot_inconsistent=FALSE, TARGET_COL= "target", PARENT_COL= "parent")
+ expect_silent(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A = "AAAA", name_B = "BBBB", varname= "waffles", p_thresh= 0.01, abund_thresh= 10,
rrep_thresh = 0.6, qrep_thresh = 0.8, testmode= "transc", correction= "bonferroni", verbose= FALSE, rboot= FALSE, qboot=TRUE,
qbootnum= 100, TARGET_COL= "target", PARENT_COL= "parent", threads= 2, dbg= "prep", scaling=c(10, 11, 20, 21)))
- expect_silent(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, verbose = FALSE, dbg= "prep", scaling=30))
- expect_silent(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, rboot=FALSE, qboot= FALSE, verbose = FALSE, dbg= "prep"))
-
- # Mixed input formats.
- expect_silent(call_DTU(annot= sim1$annot, name_A= name_A, name_B= name_B, verbose = FALSE,
- boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, count_data_A= counts_A, count_data_B= counts_B, dbg= "prep"))
- expect_warning(call_DTU(annot= sim1$annot, verbose = TRUE, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B,
- count_data_A= counts_A, count_data_B= counts_B, qbootnum= 100, dbg= "prep"),
- "Only the bootstrapped data will be used", fixed= TRUE)
+
+ sim <- sim_boot_data(clean=TRUE, errannot_inconsistent=FALSE)
+ expect_silent(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, dbg= "prep", scaling=30))
+ expect_silent(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, rboot=FALSE, qboot= FALSE, verbose = FALSE, dbg= "prep"))
+ expect_silent(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, dbg= "prep"))
+ expect_silent(call_DTU(annot= sim$annot, count_data_A= sim$boots_A[[1]], count_data_B= sim$boots_B[[1]], verbose = FALSE, dbg= "prep"))
+})
+
+#==============================================================================
+test_that("Feature ID inconsistencies abort or warn", {
+ sim <- sim_boot_data(clean=TRUE, errannot_inconsistent=TRUE)
+
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, dbg= "prep"),
+ "Inconsistent set of transcript IDs across samples", fixed= TRUE)
+ expect_error(call_DTU(annot= sim$annot, count_data_A= sim$boots_A[[1]], count_data_B= sim$boots_B[[1]], verbose = FALSE, dbg= "prep"),
+ "Transcript IDs do not match completely between conditions", fixed= TRUE)
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_B, boot_data_B= sim$boots_A, verbose = FALSE, dbg= "prep"),
+ "The transcript IDs in the quantifications and the annotation do not match", fixed= TRUE)
+ expect_error(call_DTU(annot= sim$annot, count_data_A= sim$boots_B[[1]], count_data_B= sim$boots_A[[1]], verbose = FALSE, dbg= "prep"),
+ "The transcript IDs in the quantifications and the annotation do not match", fixed= TRUE)
+
+ expect_warning(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = TRUE, dbg= "prep", reckless=TRUE),
+ "Inconsistent set of transcript IDs across samples", fixed= TRUE)
+ expect_warning(call_DTU(annot= sim$annot, count_data_A= sim$boots_A[[1]], count_data_B= sim$boots_B[[1]], verbose = TRUE, dbg= "prep", reckless=TRUE),
+ "Transcript IDs do not match completely between conditions", fixed= TRUE)
+ expect_warning(call_DTU(annot= sim$annot, boot_data_A= sim$boots_B, boot_data_B= sim$boots_A, verbose = TRUE, dbg= "prep", reckless=TRUE),
+ "The transcript IDs in the quantifications and the annotation do not match", fixed= TRUE)
+ expect_warning(call_DTU(annot= sim$annot, count_data_A= sim$boots_B[[1]], count_data_B= sim$boots_A[[1]], verbose = TRUE, dbg= "prep", reckless=TRUE),
+ "The transcript IDs in the quantifications and the annotation do not match", fixed= TRUE)
+
+ expect_silent(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, dbg= "prep", reckless=TRUE))
+ expect_silent(call_DTU(annot= sim$annot, count_data_A= sim$boots_A[[1]], count_data_B= sim$boots_B[[1]], verbose = FALSE, dbg= "prep", reckless=TRUE))
+ expect_silent(call_DTU(annot= sim$annot, boot_data_A= sim$boots_B, boot_data_B= sim$boots_A, verbose = FALSE, dbg= "prep", reckless=TRUE))
+ expect_silent(call_DTU(annot= sim$annot, count_data_A= sim$boots_B[[1]], count_data_B= sim$boots_A[[1]], verbose = FALSE, dbg= "prep", reckless=TRUE))
+})
+
+#==============================================================================
+test_that("Annotation format is checked", {
+ sim <- sim_boot_data(clean=TRUE)
# Annotation is not a dataframe.
- expect_error(call_DTU(annot= list("not", "a", "dataframe"), boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, verbose = FALSE, dbg= "prep"), "annot is not a data.frame")
+ expect_error(call_DTU(annot= list("not", "a", "dataframe"), boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, verbose = FALSE, dbg= "prep"),
+ "annot is not a data.frame")
# Annotation field names.
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, TARGET_COL= wrong_name, verbose = FALSE, dbg= "prep"),
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, TARGET_COL= "wrong_name", verbose = FALSE, dbg= "prep"),
"target and/or parent IDs field names do not exist in annot", fixed= TRUE)
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, PARENT_COL= wrong_name, verbose = FALSE, dbg= "prep"),
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, PARENT_COL= "wrong_name", verbose = FALSE, dbg= "prep"),
"target and/or parent IDs field names do not exist in annot", fixed= TRUE)
- # Inconsistent annotation.
- sim3 <- sim_boot_data(errannot_inconsistent= TRUE)
- expect_error(call_DTU(annot= sim3$annot, boot_data_A= sim3$boots_A, boot_data_B= sim3$boots_B, name_A= name_A, name_B= name_B, verbose = FALSE, dbg= "prep"),
- "Inconsistent set of transcript IDs", fixed= TRUE)
- # Non unique IDs.
- a <- copy(sim1$annot)
+
+ # Non unique transcript IDs.
+ a <- copy(sim$annot)
a[1, "target_id"] <- a[2, "target_id"]
- expect_error(call_DTU(annot= a, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, verbose = FALSE, dbg= "prep"), "transcript identifiers are not unique")
+ expect_error(call_DTU(annot= a, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, verbose = FALSE, dbg= "prep"),
+ "transcript identifiers are not unique")
+})
+
+#==============================================================================
+test_that("Quantification formats are checked", {
+ sim <- sim_boot_data(clean=TRUE)
+
+ # Redundant quantification formats.
+ expect_silent(call_DTU(annot= sim$annot, verbose = FALSE, dbg= "prep", boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, count_data_A= sim$boots_A[[1]], count_data_B= sim$boots_B[[1]]))
+ expect_warning(call_DTU(annot= sim$annot, verbose = TRUE, dbg= "prep", boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, count_data_A= sim$boots_A[[1]], count_data_B= sim$boots_B[[1]]),
+ "Only the bootstrapped data will be used", fixed= TRUE)
# Boot data is not a list of datatables.
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= c("not", "a", "list"), boot_data_B= sim1$boots_B, verbose = FALSE, dbg= "prep"), "bootstrap data are not lists")
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= c("not", "a", "list"), verbose = FALSE, dbg= "prep"), "bootstrap data are not lists")
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= list( list("not"), list("a"), list("table")), boot_data_B= sim1$boots_B, verbose = FALSE, dbg= "prep"), "bootstrap data are not lists")
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= list( list("not"), list("a"), list("table")), verbose = FALSE, dbg= "prep"), "bootstrap data are not lists")
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= list(data.frame(a="not", b="a", c="table")), boot_data_B= sim1$boots_B, verbose = FALSE, dbg= "prep"), "not lists of data.tables")
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= list(data.frame(a="not", b="a", c="table")), verbose = FALSE, dbg= "prep"), "not lists of data.tables")
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= c("not", "a", "list"), boot_data_B= sim$boots_B, verbose = FALSE, dbg= "prep"), "bootstrap data are not lists")
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= c("not", "a", "list"), verbose = FALSE, dbg= "prep"), "bootstrap data are not lists")
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= list( list("not"), list("a"), list("table")), boot_data_B= sim$boots_B, verbose = FALSE, dbg= "prep"), "bootstrap data are not lists")
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= list( list("not"), list("a"), list("table")), verbose = FALSE, dbg= "prep"), "bootstrap data are not lists")
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= list(data.frame(a="not", b="a", c="table")), boot_data_B= sim$boots_B, verbose = FALSE, dbg= "prep"), "not lists of data.tables")
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= list(data.frame(a="not", b="a", c="table")), verbose = FALSE, dbg= "prep"), "not lists of data.tables")
# Counts data is not datatables.
- expect_error(call_DTU(annot= sim1$annot, count_data_A= c("not", "a", "list"), count_data_B= counts_B, verbose = FALSE, dbg= "prep"), "counts data are not data.tables")
- expect_error(call_DTU(annot= sim1$annot, count_data_A= counts_A, count_data_B= c("not", "a", "list"), verbose = FALSE, dbg= "prep"), "counts data are not data.tables")
- expect_error(call_DTU(annot= sim1$annot, count_data_A= data.frame(a="not", b="a", c="table"), count_data_B= counts_B, verbose = FALSE, dbg= "prep"), "counts data are not data.tables")
- expect_error(call_DTU(annot= sim1$annot, count_data_A= counts_A, count_data_B= data.frame(a="not", b="a", c="table"), verbose = FALSE, dbg= "prep"), "counts data are not data.tables")
+ expect_error(call_DTU(annot= sim$annot, count_data_A= c("not", "a", "list"), count_data_B= sim$boots_B[[1]], verbose = FALSE, dbg= "prep"), "counts data are not data.tables")
+ expect_error(call_DTU(annot= sim$annot, count_data_A= sim$boots_A[[1]], count_data_B= c("not", "a", "list"), verbose = FALSE, dbg= "prep"), "counts data are not data.tables")
+ expect_error(call_DTU(annot= sim$annot, count_data_A= data.frame(a="not", b="a", c="table"), count_data_B= sim$boots_B[[1]], verbose = FALSE, dbg= "prep"), "counts data are not data.tables")
+ expect_error(call_DTU(annot= sim$annot, count_data_A= sim$boots_A[[1]], count_data_B= data.frame(a="not", b="a", c="table"), verbose = FALSE, dbg= "prep"), "counts data are not data.tables")
+})
+
+#==============================================================================
+test_that("Bootstrap parameters are checked", {
+ sim <- sim_boot_data(clean=TRUE)
# Number of bootstraps.
- expect_warning(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, verbose = TRUE, dbg= "prep"),
+ expect_warning(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, verbose = TRUE, dbg= "prep"),
"few bootstrap iterations", fixed= TRUE)
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, qbootnum = -5, verbose = FALSE, dbg= "prep"),
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, qbootnum = -5, verbose = FALSE, dbg= "prep"),
"Invalid number of bootstraps", fixed= TRUE)
- expect_warning(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, qbootnum = 5, verbose = TRUE, dbg= "prep"),
+ expect_warning(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, qbootnum = 5, verbose = TRUE, dbg= "prep"),
"qbootnum is low", fixed= TRUE)
- expect_warning(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, qbootnum = 5000, verbose = TRUE, dbg= "prep"),
+ expect_warning(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, qbootnum = 5000, verbose = TRUE, dbg= "prep"),
"number of quantification bootstraps is very high", fixed= TRUE)
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, qbootnum = 5000000000000, verbose = FALSE, dbg= "prep"),
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, qbootnum = 5000000000000, verbose = FALSE, dbg= "prep"),
"number of quantification bootstraps would exceed the maximum capacity", fixed= TRUE)
# Bootstraps without boot data.
- expect_warning(call_DTU(annot= sim1$annot, count_data_A= counts_A, count_data_B= counts_B, qboot=TRUE, qbootnum=2, verbose= TRUE, dbg= "prep"),
+ expect_warning(call_DTU(annot= sim$annot, count_data_A= sim$boots_A[[1]], count_data_B= sim$boots_B[[1]], qboot=TRUE, qbootnum=2, verbose= TRUE, dbg= "prep"),
"qboot is TRUE but no bootstrapped estimates", fixed= TRUE)
+ # Reproducibility (confidence) thresholds: out-of-range rrep_thresh / qrep_thresh values must be rejected.
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, rrep_thresh = -2, verbose = FALSE, dbg= "prep"),
+ "Invalid reproducibility threshold", fixed= TRUE)
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, rrep_thresh = 2, verbose = FALSE, dbg= "prep"),
+ "Invalid reproducibility threshold", fixed= TRUE)
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, qrep_thresh = -2, verbose = FALSE, dbg= "prep"),
+ "Invalid reproducibility threshold", fixed= TRUE)
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, qrep_thresh = 2, verbose = FALSE, dbg= "prep"),
+ "Invalid reproducibility threshold", fixed= TRUE)
+})
+
+#==============================================================================
+test_that("Testing parameters are checked", {
+ sim <- sim_boot_data(clean=TRUE)
+
# Correction method.
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, correction= wrong_name, verbose= FALSE, dbg= "prep"),
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, correction= "wrong_name", verbose= FALSE, dbg= "prep"),
"Invalid p-value correction method name", fixed= TRUE)
- # Verbose is bool.
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, verbose="yes", dbg= "prep"),
- "not interpretable as logical", fixed= TRUE)
-
# Tests.
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, testmode="GCSE", verbose = FALSE, dbg= "prep"),
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, testmode="GCSE", verbose = FALSE, dbg= "prep"),
"Unrecognized value for testmode", fixed= TRUE)
- expect_silent(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, testmode="genes", verbose = FALSE, rboot=FALSE, qboot = FALSE, dbg= "prep"))
- expect_silent(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, testmode="transc", verbose = FALSE, rboot=FALSE, qboot = FALSE, dbg= "prep"))
+ expect_silent(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, testmode="genes", verbose = FALSE, rboot=FALSE, qboot = FALSE, dbg= "prep"))
+ expect_silent(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, testmode="transc", verbose = FALSE, rboot=FALSE, qboot = FALSE, dbg= "prep"))
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, qboot="GCSE", verbose = FALSE, dbg= "prep"),
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, qboot="GCSE", verbose = FALSE, dbg= "prep"),
"Unrecognized value for qboot", fixed= TRUE)
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, rboot="GCSE", verbose = FALSE, dbg= "prep"),
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, rboot="GCSE", verbose = FALSE, dbg= "prep"),
"Unrecognized value for rboot", fixed= TRUE)
+})
- # Probability threshold.
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, p_thresh = 666, verbose = FALSE, dbg= "prep"),
+#==============================================================================
+test_that("Thresholds are checked", {
+ sim <- sim_boot_data(clean=TRUE)
+
+ # Probability threshold.
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, p_thresh = 666, verbose = FALSE, dbg= "prep"),
"Invalid p-value threshold", fixed= TRUE)
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, p_thresh = -0.05, verbose = FALSE, dbg= "prep"),
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, p_thresh = -0.05, verbose = FALSE, dbg= "prep"),
"Invalid p-value threshold", fixed= TRUE)
# Read counts threshold.
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, abund_thresh = -5, verbose = FALSE, dbg= "prep"),
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, abund_thresh = -5, verbose = FALSE, dbg= "prep"),
"Invalid abundance threshold", fixed= TRUE)
# Proportion change threshold.
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, dprop_thresh = -2, verbose = FALSE, dbg= "prep"),
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, dprop_thresh = -2, verbose = FALSE, dbg= "prep"),
"Invalid proportion difference threshold", fixed= TRUE)
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, dprop_thresh = 2, verbose = FALSE, dbg= "prep"),
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, dprop_thresh = 2, verbose = FALSE, dbg= "prep"),
"Invalid proportion difference threshold", fixed= TRUE)
+})
- # Confidence threshold.
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, rrep_thresh = -2, verbose = FALSE, dbg= "prep"),
- "Invalid reproducibility threshold", fixed= TRUE)
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, rrep_thresh = 2, verbose = FALSE, dbg= "prep"),
- "Invalid reproducibility threshold", fixed= TRUE)
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, qrep_thresh = -2, verbose = FALSE, dbg= "prep"),
- "Invalid reproducibility threshold", fixed= TRUE)
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, qrep_thresh = 2, verbose = FALSE, dbg= "prep"),
- "Invalid reproducibility threshold", fixed= TRUE)
-
- # Number of threads.
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, threads = 0, verbose= FALSE, dbg= "prep"),
- "Invalid number of threads", fixed= TRUE)
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, threads = -4, verbose= FALSE, dbg= "prep"),
- "Invalid number of threads", fixed= TRUE)
- expect_silent(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, threads = parallel::detectCores(logical=FALSE), verbose= FALSE, dbg= "prep"))
- expect_silent(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, threads = parallel::detectCores(logical=TRUE), verbose= FALSE, dbg= "prep"))
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, name_A= name_A, name_B= name_B, threads = parallel::detectCores(logical=TRUE) + 1, verbose= FALSE, dbg= "prep"),
- "threads exceed", fixed= TRUE)
-
+#==============================================================================
+test_that("Scaling parameters are checked", {
+ sim <- sim_boot_data(clean=TRUE)
+
# Scaling of abundances.
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, verbose = FALSE, dbg= "prep", scaling=0),
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, dbg= "prep", scaling=0),
"Invalid scaling factor", fixed= TRUE)
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, verbose = FALSE, dbg= "prep", scaling=-33),
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, dbg= "prep", scaling=-33),
"Invalid scaling factor", fixed= TRUE)
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, verbose = FALSE, dbg= "prep", scaling=c(2,3,4)),
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, dbg= "prep", scaling=c(2,3,4)),
"Invalid scaling factor", fixed= TRUE)
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, verbose = FALSE, dbg= "prep", scaling=c(1,2,3,4,5)),
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, dbg= "prep", scaling=c(1,2,3,4,5)),
"Invalid scaling factor", fixed= TRUE)
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, verbose = FALSE, dbg= "prep", scaling=c(1,0,2,3,4)),
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, dbg= "prep", scaling=c(1,0,2,3,4)),
"Invalid scaling factor", fixed= TRUE)
- expect_error(call_DTU(annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, verbose = FALSE, dbg= "prep", scaling=c(1,2,-3,4,5)),
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, dbg= "prep", scaling=c(1,2,-3,4,5)),
"Invalid scaling factor", fixed= TRUE)
})
+#==============================================================================
+test_that("General behaviour parameters are checked", {
+ sim <- sim_boot_data(clean=TRUE)
+
+ # Number of threads.
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, threads = 0, verbose= FALSE, dbg= "prep"),
+ "Invalid number of threads", fixed= TRUE)
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, threads = -4, verbose= FALSE, dbg= "prep"),
+ "Invalid number of threads", fixed= TRUE)
+ expect_silent(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, threads = parallel::detectCores(logical=FALSE), verbose= FALSE, dbg= "prep"))
+ expect_silent(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, threads = parallel::detectCores(logical=TRUE), verbose= FALSE, dbg= "prep"))
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, threads = parallel::detectCores(logical=TRUE) + 1, verbose= FALSE, dbg= "prep"),
+ "threads exceed", fixed= TRUE)
+
+ # verbose must be interpretable as a logical value.
+ expect_error(call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= name_A, name_B= name_B, verbose="yes", dbg= "prep"),
+ "not interpretable as logical", fixed= TRUE)
+})
+#EOF
diff --git a/tests/testthat/test_4_steps.R b/tests/testthat/test_4_steps.R
index c4e18e8..7a012b8 100644
--- a/tests/testthat/test_4_steps.R
+++ b/tests/testthat/test_4_steps.R
@@ -4,8 +4,8 @@ context("RATs main")
#==============================================================================
test_that("The paramater interpretations are correct", {
- sim1 <- sim_boot_data()
- sim2 <- sim_count_data()
+ sim1 <- sim_boot_data(clean=TRUE)
+ sim2 <- sim_count_data(clean=TRUE)
# Bootstraps or counts?
expect_equal(2, call_DTU(dbg='prep', annot= sim1$annot, boot_data_A= sim1$boots_A, boot_data_B= sim1$boots_B, verbose = FALSE)[[1]])
@@ -33,7 +33,7 @@ test_that("The index is received", {
sim <- sim_boot_data()
# tidy_annot() should be tested separately.
expect_equal(tidy_annot(sim$annot),
- call_DTU(dbg='indx', annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE))
+ call_DTU(dbg='indx', annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, reckless=TRUE))
})
@@ -43,9 +43,9 @@ test_that("Scaling is applied", {
# Single factor.
S <- 10
- bdt <- call_DTU(dbg='bootin', annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, scaling=S)
- cdt <- call_DTU(dbg='countin', annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, scaling=S)
- sdt <- call_DTU(dbg='scale', annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, scaling=S)
+ bdt <- call_DTU(dbg='bootin', annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, scaling=S, reckless=TRUE)
+ cdt <- call_DTU(dbg='countin', annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, scaling=S, reckless=TRUE)
+ sdt <- call_DTU(dbg='scale', annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, scaling=S, reckless=TRUE)
expect_equal(S * cdt[[1]], sdt[[1]])
expect_equal(S * cdt[[2]], sdt[[2]])
for (b in 1:length(bdt$bootA)) {
@@ -57,9 +57,9 @@ test_that("Scaling is applied", {
# Vector.
S <- c(10, 20, 30, 40)
- bdt <- call_DTU(dbg='bootin', annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, scaling=S)
- cdt <- call_DTU(dbg='countin', annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, scaling=S)
- sdt <- call_DTU(dbg='scale', annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, scaling=S)
+ bdt <- call_DTU(dbg='bootin', annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, scaling=S, reckless=TRUE)
+ cdt <- call_DTU(dbg='countin', annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, scaling=S, reckless=TRUE)
+ sdt <- call_DTU(dbg='scale', annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, verbose = FALSE, scaling=S, reckless=TRUE)
expect_equal(S[1] * cdt[[1]]$V1, sdt[[1]]$V1)
expect_equal(S[2] * cdt[[1]]$V2, sdt[[1]]$V2)
@@ -78,7 +78,7 @@ test_that("Parameters are recorded", {
description='Test.',name_A='cA', name_B='cB', varname='testing',
p_thresh=0.001, abund_thresh=1, dprop_thresh=0.15, correction='bonferroni',
qboot=TRUE, qbootnum=99, qrep_thresh=0.8, rboot=TRUE, rrep_thresh=0.6,
- testmode="genes", seed=666, verbose = FALSE)$Parameters
+ testmode="genes", seed=666, verbose = FALSE, reckless=TRUE)$Parameters
expect_equal(param$description, 'Test.')
expect_true(!is.na(param$time))
@@ -87,7 +87,7 @@ test_that("Parameters are recorded", {
expect_equal(c(param$var_name, param$cond_A, param$cond_B), c('testing', 'cA', 'cB'))
expect_equal(c(param$data_type, param$tests), c("bootstrapped abundance estimates", 'genes'))
expect_equal(c(param$num_replic_A, param$num_replic_B), c(2, 2))
- expect_equal(c(param$num_genes, param$num_transc), c(10, 21))
+ expect_equal(c(param$num_genes, param$num_transc), c(11, 23))
expect_equal(list(param$p_thresh, param$abund_thresh, param$dprop_thresh, param$correction, param$abund_scaling), list(0.001, 1, 0.15, 'bonferroni', c(1,2,3,4)))
expect_equal(list(param$quant_boot, param$quant_bootnum, param$quant_reprod_thresh), list(TRUE, 99, 0.8))
expect_equal(list(param$rep_boot, param$rep_bootnum, param$rep_reprod_thresh), list(TRUE, NA_integer_, 0.6))
diff --git a/tests/testthat/test_5_output.R b/tests/testthat/test_5_output.R
index c812b5b..bdc7810 100644
--- a/tests/testthat/test_5_output.R
+++ b/tests/testthat/test_5_output.R
@@ -4,7 +4,7 @@ context("DTU Output")
#==============================================================================
test_that("The output structure is correct", {
- sim <- sim_boot_data()
+ sim <- sim_boot_data(clean=TRUE)
mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qbootnum=2, qboot=TRUE, rboot=TRUE, verbose = FALSE)
expect_type(mydtu, "list")
@@ -12,12 +12,12 @@ test_that("The output structure is correct", {
expect_named(mydtu, c("Parameters", "Genes", "Transcripts", "Abundances"))
expect_type(mydtu$Parameters, "list")
- expect_length(mydtu$Parameters, 25)
+ expect_length(mydtu$Parameters, 26)
expect_named(mydtu$Parameters, c("description", "time", "rats_version", "R_version",
"var_name", "cond_A", "cond_B", "data_type", "num_replic_A", "num_replic_B", "num_genes", "num_transc",
"tests", "p_thresh", "abund_thresh", "dprop_thresh", "correction", "abund_scaling",
"quant_boot", "quant_reprod_thresh", "quant_bootnum",
- "rep_boot", "rep_reprod_thresh", "rep_bootnum", "seed"))
+ "rep_boot", "rep_reprod_thresh", "rep_bootnum", "seed", "reckless"))
expect_true(is.data.frame(mydtu$Genes))
expect_equal(dim(mydtu$Genes)[2], 26)
@@ -118,16 +118,16 @@ test_that("The output structure is correct", {
#==============================================================================
test_that("The output content is complete", {
- sim <- sim_sleuth_data(cnames=c("ONE","TWO"))
+ sim <- sim_boot_data()
# Emulate non-sleuth bootstrap data.
- data_A <- denest_sleuth_boots(sim$slo, sim$annot, c(1,3), "est_counts", "target_id")
- data_B <- denest_sleuth_boots(sim$slo, sim$annot, c(2,4), "est_counts", "target_id")
+ data_A <- sim$boots_A
+ data_B <- sim$boots_B
# Emulate non-bootstrap data.
- counts_A <- data_A[[1]]
- counts_B <- data_B[[2]]
+ counts_A <- data_A[[2]]
+ counts_B <- data_B[[1]]
- mydtu <- list(call_DTU(annot= sim$annot, count_data_A = counts_A, count_data_B = counts_B, rboot=FALSE, qboot=FALSE, verbose = FALSE, description="test"),
- call_DTU(annot= sim$annot, boot_data_A = data_A, boot_data_B = data_B, rboot=TRUE, qboot=TRUE, qbootnum=2, verbose = FALSE, description="test")
+ mydtu <- list(call_DTU(annot= sim$annot, count_data_A = counts_A, count_data_B = counts_B, rboot=FALSE, qboot=FALSE, verbose = FALSE, description="test", reckless=TRUE),
+ call_DTU(annot= sim$annot, boot_data_A = data_A, boot_data_B = data_B, rboot=TRUE, qboot=TRUE, qbootnum=2, verbose = FALSE, description="test", reckless=TRUE)
)
for (x in length(mydtu)) {
@@ -150,6 +150,8 @@ test_that("The output content is complete", {
expect_false(is.na(mydtu[[x]]$Parameters$"time"))
expect_false(is.na(mydtu[[x]]$Parameters$"rep_boot"))
expect_false(is.na(mydtu[[x]]$Parameters$"quant_boot"))
+ # "seed" is intentionally not asserted here (expect_false(is.na(mydtu[[x]]$Parameters$"seed"))): it may be NA if no specific seed is given.
+ expect_false(is.na(mydtu[[x]]$Parameters$"reckless"))
if (x>1) {
expect_false(is.na(mydtu[[x]]$Parameters$"rep_bootnum"))
expect_false(is.na(mydtu[[x]]$Parameters$"quant_bootnum"))
@@ -227,16 +229,15 @@ test_that("The output content is complete", {
#==============================================================================
test_that("The result is consistent across input data formats", {
- sim <- sim_sleuth_data(cnames=c("ONE","TWO"))
- # Emulate non-sleuth bootstrap data.
- data_A <- denest_sleuth_boots(sim$slo, sim$annot, c(1,3), "est_counts", "target_id")
- data_B <- denest_sleuth_boots(sim$slo, sim$annot, c(2,4), "est_counts", "target_id")
+ sim <- sim_boot_data()
+ data_A <- sim$boots_A
+ data_B <- sim$boots_B
# Emulate non-bootstrap data.
- counts_A <- data_A[[1]]
- counts_B <- data_B[[2]]
+ counts_A <- data_A[[2]]
+ counts_B <- data_B[[1]]
- mydtu <- list(call_DTU(annot= sim$annot, boot_data_A = data_A, boot_data_B = data_B, rboot=FALSE, qboot=FALSE, verbose = FALSE),
- call_DTU(annot= sim$annot, count_data_A = counts_A, count_data_B = counts_B, rboot=FALSE, qboot=FALSE, verbose = FALSE))
+ mydtu <- list(call_DTU(annot= sim$annot, boot_data_A = data_A, boot_data_B = data_B, rboot=FALSE, qboot=FALSE, verbose = FALSE, reckless=TRUE),
+ call_DTU(annot= sim$annot, count_data_A = counts_A, count_data_B = counts_B, rboot=FALSE, qboot=FALSE, verbose = FALSE, reckless=TRUE))
expect_equal(mydtu[[1]][[2]][, seq(1,7), with=FALSE], mydtu[[2]][[2]][, seq(1,7), with=FALSE])
expect_equal(mydtu[[1]][[3]][, seq(1,4), with=FALSE], mydtu[[2]][[3]][, seq(1,4), with=FALSE])
@@ -245,12 +246,14 @@ test_that("The result is consistent across input data formats", {
#==============================================================================
test_that("Filters work correctly", {
- # !!! This test is tightly dependent on the data used for the test and the default parameter values, in order to
+ # !!! This test is tightly dependent on the data used for the test and the parameter values, in order to
# !!! ensure correct response to specific scenarios.
sim <- sim_boot_data()
- mydtu <- call_DTU(annot= sim$annot, boot_data_A=sim$boots_A, boot_data_B=sim$boots_B, name_A= "ONE", name_B= "TWO", abund_thresh=10, dprop_thresh=0.1, verbose = FALSE, rboot=FALSE, qboot = FALSE, seed=666)
+ mydtu <- call_DTU(annot= sim$annot, boot_data_A=sim$boots_A, boot_data_B=sim$boots_B, name_A= "ONE", name_B= "TWO", abund_thresh=8, dprop_thresh=0.1, verbose = FALSE, rboot=FALSE, qboot = FALSE, seed=666, reckless=TRUE)
+ expect_equivalent(as.list(mydtu$Genes["1A1B", list(known_transc, detect_transc, elig_transc, elig, elig_fx)]),
+ list(2, 2, 2, TRUE, TRUE))
expect_equivalent(as.list(mydtu$Genes["1A1N", list(known_transc, detect_transc, elig_transc, elig, elig_fx)]),
list(1, 1, 0, FALSE, FALSE))
expect_equivalent(as.list(mydtu$Genes["1B1C", list(known_transc, detect_transc, elig_transc, elig, elig_fx)]),
@@ -273,6 +276,10 @@ test_that("Filters work correctly", {
list(2, 2, 2, TRUE, FALSE))
setkey(mydtu$Transcripts, target_id)
+ expect_equivalent(as.list(mydtu$Transcripts["1A1B.a", list(elig_xp, elig, elig_fx)]),
+ list(TRUE, TRUE, TRUE))
+ expect_equivalent(as.list(mydtu$Transcripts["1A1B.b", list(elig_xp, elig, elig_fx)]),
+ list(TRUE, TRUE, TRUE))
expect_equivalent(as.list(mydtu$Transcripts["1A1N-2", list(elig_xp, elig, elig_fx)]),
list(TRUE, FALSE, FALSE))
expect_equivalent(as.list(mydtu$Transcripts["1B1C.1", list(elig_xp, elig, elig_fx)]),
diff --git a/tests/testthat/test_6_reports.R b/tests/testthat/test_6_reports.R
index a2e6958..0e1fce5 100644
--- a/tests/testthat/test_6_reports.R
+++ b/tests/testthat/test_6_reports.R
@@ -5,7 +5,7 @@ context("DTU Reports")
#==============================================================================
test_that("The summaries work", {
sim <- sim_boot_data()
- mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE)
+ mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE, reckless=TRUE)
expect_silent(ids <- get_dtu_ids(mydtu))
expect_type(ids, "list")
@@ -16,16 +16,16 @@ test_that("The summaries work", {
for (v in ids) {
expect_false(any(is.na(v)))
}
- expect_equal(ids[[1]], c("MIX6"))
+ expect_equal(ids[[1]], c("1A1B", "MIX6"))
expect_equal(ids[[2]], c("CC", "NN"))
expect_equal(ids[[3]], c("LC", "1A1N", "1B1C", "1D1C", "ALLA", "ALLB", "NIB"))
- expect_equal(ids[[4]], c("MIX6"))
+ expect_equal(ids[[4]], c("1A1B", "MIX6"))
expect_equal(ids[[5]], c("LC", "CC", "NN"))
expect_equal(ids[[6]], c("1A1N", "1B1C", "1D1C", "ALLA", "ALLB", "NIB"))
- expect_equal(ids[[7]], c("MIX6"))
+ expect_equal(ids[[7]], c("1A1B", "MIX6"))
expect_equal(ids[[8]], c("CC", "NN"))
expect_equal(ids[[9]], c("LC", "1A1N", "1B1C", "1D1C", "ALLA", "ALLB", "NIB"))
- expect_equal(ids[[10]], c("MIX6.c1", "MIX6.c2"))
+ expect_equal(ids[[10]], c("1A1B.a", "1A1B.b", "MIX6.c1", "MIX6.c2"))
expect_equal(ids[[11]], c("MIX6.c4", "LC2", "CC_a", "CC_b", "MIX6.c3", "2NN", "1NN", "MIX6.nc"))
expect_equal(ids[[12]], c("LC1", "1A1N-2", "1B1C.1", "1B1C.2", "1D1C:one", "1D1C:two", "MIX6.d", "ALLA1", "ALLB1", "ALLB2", "NIB.1"))
@@ -36,35 +36,35 @@ test_that("The summaries work", {
"DTU genes (both tests)", "non-DTU genes (both tests)", "ineligible genes (both tests)",
"DTU transcripts", "non-DTU transcripts", "ineligible transcripts"))
expect_equal(tally[[2]], as.vector(sapply(ids, length)))
- expect_equal(tally[[2]], c(1, 2, 7, 1, 3, 6, 1, 2, 7, 2, 8, 11))
+ expect_equal(tally[[2]], c(2, 2, 7, 2, 3, 6, 2, 2, 7, 4, 8, 11))
expect_false(any(is.na(tally)))
# Lower effect size threshold to verify that DTU changes accordingly.
- mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE, dprop_thresh=0.1)
+ mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE, dprop_thresh=0.1, reckless=TRUE)
expect_silent(ids <- get_dtu_ids(mydtu))
- expect_equal(ids[[1]], c("MIX6", "CC"))
+ expect_equal(ids[[1]], c("1A1B", "MIX6", "CC"))
expect_equal(ids[[2]], c("NN"))
expect_equal(ids[[3]], c("LC", "1A1N", "1B1C", "1D1C", "ALLA", "ALLB", "NIB"))
- expect_equal(ids[[4]], c("MIX6", "CC"))
+ expect_equal(ids[[4]], c("1A1B", "MIX6", "CC"))
expect_equal(ids[[5]], c("LC", "NN"))
expect_equal(ids[[6]], c("1A1N", "1B1C", "1D1C", "ALLA", "ALLB", "NIB"))
- expect_equal(ids[[7]], c("MIX6", "CC"))
+ expect_equal(ids[[7]], c("1A1B", "MIX6", "CC"))
expect_equal(ids[[8]], c("NN"))
expect_equal(ids[[9]], c("LC", "1A1N", "1B1C", "1D1C", "ALLA", "ALLB", "NIB"))
- expect_equal(ids[[10]], c("MIX6.c1", "MIX6.c2", "MIX6.c4", "CC_a", "CC_b"))
+ expect_equal(ids[[10]], c("1A1B.a", "1A1B.b", "MIX6.c1", "MIX6.c2", "MIX6.c4", "CC_a", "CC_b"))
expect_equal(ids[[11]], c("LC2", "MIX6.c3", "2NN", "1NN", "MIX6.nc"))
expect_equal(ids[[12]], c("LC1", "1A1N-2", "1B1C.1", "1B1C.2", "1D1C:one", "1D1C:two", "MIX6.d", "ALLA1", "ALLB1", "ALLB2", "NIB.1"))
expect_silent(tally <- dtu_summary(mydtu))
expect_equal(tally[[2]], as.vector(sapply(ids, length)))
- expect_equal(tally[[2]], c(2, 1, 7, 2, 2, 6, 2, 1, 7, 5, 5, 11))
+ expect_equal(tally[[2]], c(3, 1, 7, 3, 2, 6, 3, 1, 7, 7, 5, 11))
})
#==============================================================================
test_that("The isoform switching summaries work", {
sim <- sim_boot_data()
- mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE, dprop_thresh=0.1)
+ mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE, dprop_thresh=0.1, reckless=TRUE)
expect_silent(ids <- get_switch_ids(mydtu))
expect_type(ids, "list")
@@ -74,7 +74,7 @@ test_that("The isoform switching summaries work", {
for (v in ids) {
expect_false(any(is.na(v)))
}
- expect_equal(as.vector(unlist(ids)), c("MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6"))
+ expect_equal(as.vector(unlist(ids)), c("1A1B", "MIX6", "MIX6", "1A1B", "MIX6", "MIX6", "1A1B", "MIX6", "MIX6"))
expect_silent(tally <- dtu_switch_summary(mydtu))
expect_true(is.data.frame(tally))
@@ -82,7 +82,7 @@ test_that("The isoform switching summaries work", {
"Primary switch (transc. test)", "Non-primary switch (transc. test)",
"Primary switch (both tests)", "Non-primary switch (both tests)"))
expect_false(any(is.na(tally)))
- expect_equal(tally[[2]], c(1, 1, 1, 1, 1, 1))
+ expect_equal(tally[[2]], c(2, 1, 2, 1, 2, 1))
expect_equal(tally[[2]], as.vector(sapply(ids, length)))
})
@@ -90,7 +90,7 @@ test_that("The isoform switching summaries work", {
#==============================================================================
test_that("The plurality summaries work", {
sim <- sim_boot_data()
- mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE, dprop_thresh=0.1)
+ mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE, dprop_thresh=0.1, reckless=TRUE)
expect_silent(ids <- get_plurality_ids(mydtu))
expect_type(ids, "list")
@@ -98,28 +98,28 @@ test_that("The plurality summaries work", {
for (v in ids) {
expect_false(any(is.na(v)))
}
- expect_equal(as.vector(unlist(ids)), c("CC", "MIX6"))
+ expect_equal(as.vector(unlist(ids)), c("1A1B", "CC", "MIX6"))
expect_silent(tally <- dtu_plurality_summary(mydtu))
expect_true(is.data.frame(tally))
expect_equal(tally[[1]], c("2", "3"))
expect_false(any(is.na(tally)))
- expect_equal(tally[[2]], c(1, 1))
+ expect_equal(tally[[2]], c(2, 1))
expect_equal(tally[[2]], as.vector(sapply(ids, length)))
- mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE, dprop_thresh=0.3)
+ mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qboot=FALSE, rboot=FALSE, verbose = FALSE, dprop_thresh=0.3, reckless=TRUE)
expect_silent(ids <- get_plurality_ids(mydtu))
expect_silent(tally <- dtu_plurality_summary(mydtu))
- expect_named(ids, c("1"))
- expect_equal(as.vector(unlist(ids)), c("MIX6"))
- expect_equal(tally[[2]], c(1))
+ expect_named(ids, c("2", "1"))
+ expect_equal(as.vector(unlist(ids)), c("1A1B", "MIX6"))
+ expect_equal(tally[[2]], c(1, 1))
})
#==============================================================================
test_that("The gene plotting commands work", {
sim <- sim_boot_data()
- mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qbootnum=2, verbose = FALSE)
+ mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qbootnum=2, verbose = FALSE, reckless=TRUE)
expect_silent(plot_gene(dtuo=mydtu, pid="MIX6"))
expect_silent(plot_gene(dtuo=mydtu, pid="MIX6", style="bycondition"))
@@ -155,10 +155,28 @@ test_that("The gene plotting commands work", {
expect_silent(plot_gene(dtuo=mydtu, pid="MIX6", style="byisoform", shapeby="none"))
})
+#==============================================================================
+test_that("The all gene scenarios can be plotted", {
+ sim <- sim_boot_data()
+ mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, qbootnum=2, verbose = FALSE, reckless=TRUE)
+
+ expect_silent(plot_gene(dtuo=mydtu, pid="1A1B"))
+ expect_silent(plot_gene(dtuo=mydtu, pid="1A1N")) #
+ expect_silent(plot_gene(dtuo=mydtu, pid="1B1C")) ##
+ expect_silent(plot_gene(dtuo=mydtu, pid="1D1C"))
+ expect_silent(plot_gene(dtuo=mydtu, pid="ALLA"))
+ expect_silent(plot_gene(dtuo=mydtu, pid="ALLB"))
+ expect_silent(plot_gene(dtuo=mydtu, pid="CC"))
+ expect_silent(plot_gene(dtuo=mydtu, pid="LC"))
+ expect_silent(plot_gene(dtuo=mydtu, pid="MIX6"))
+ expect_silent(plot_gene(dtuo=mydtu, pid="NIB")) ##
+ expect_silent(plot_gene(dtuo=mydtu, pid="NN"))
+})
+
#==============================================================================
test_that("The overview plotting commands work", {
sim <- sim_boot_data()
- mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qbootnum=2, verbose = FALSE)
+ mydtu <- call_DTU(annot= sim$annot, boot_data_A= sim$boots_A, boot_data_B= sim$boots_B, name_A= "ONE", name_B= "TWO", qbootnum=2, verbose = FALSE, reckless=TRUE)
expect_silent(plot_overview(mydtu))
expect_silent(plot_overview(dtuo=mydtu, type="tvolcano"))
diff --git a/vignettes/input.Rmd b/vignettes/input.Rmd
index 3a383a5..be59177 100644
--- a/vignettes/input.Rmd
+++ b/vignettes/input.Rmd
@@ -1,7 +1,7 @@
---
title: 'RATs: Input and Settings'
author: "Kimon Froussios"
-date: "19 SEP 2017"
+date: "08 MAR 2018"
output:
html_document:
keep_md: yes
@@ -103,8 +103,8 @@ RATs comes with data emulators, intended to be used for testing the code. Howeve
use for showcasing how RATs works.
By default, RATs reports on its progress and produces a summary report. These can be suppressed using the
-`verbose = FALSE` parameter to the call. To prevent cluttering this tutorial with verbose output, we will use this
-option in all the examples.
+`verbose = FALSE` parameter. This also suppresses some warnings generated by RATs when checking the sanity of its input,
+so we do not recommend it for general use. However, to prevent cluttering this tutorial with verbose output, we will use this option in our examples.
If you choose to allow verbose output when trying out the examples below using the emulated data, you will get some **warnings**
about the number of bootstraps. The warning is triggered because the emulated dataset used in the examples immitates only the structure of
@@ -120,7 +120,7 @@ First, let's emulate some data to work with:
```{r, eval=FALSE}
# Simulate some data.
-simdat <- sim_count_data()
+simdat <- sim_count_data(clean=TRUE)
# For convenience let's assign the contents of the list to separate variables
mycond_A <- simdat[[2]] # Simulated abundances for one condition.
mycond_B <- simdat[[3]] # Simulated abundances for other condition.
@@ -162,7 +162,7 @@ First, let's emulate some data, as we did before.
```{r}
# Simulate some data. (Notice it is a different function than before.)
-simdat <- sim_boot_data()
+simdat <- sim_boot_data(clean=TRUE)
# For convenience let's assign the contents of the list to separate variables
mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition.
@@ -273,14 +273,16 @@ The following three main thresholds are used in RATs:
# Calling DTU with custom thresholds.
mydtu <- call_DTU(annot= myannot,
boot_data_A= mycond_A, boot_data_B= mycond_B,
- p_thresh= 0.01, abund_thresh= 10, dprop_thres = 0.25)
+ p_thresh= 0.01, dprop_thresh= 0.15, abund_thresh= 10)
```
-1. `p_thresh` - Statistical significance level. P-values below this will be considered significant. Lower threshold values are stricter. (Default 0.05)
-2. `abund_thresh` - Noise threshold. Transcripts with abundance below that value in both conditions are ignored. Higher threshold values are stricter. (Default 5, assumes abundances scaled to library size)
-3. `dprop_thresh` - Effect size threshold. Transcripts whose proportion changes between conditions by less than the threshold are considered non-DTU, regardless of their statistical significance. Higher threshold values are stricter. (Default 0.20)
+1. `p_thresh` - Statistical significance level. P-values below this will be considered significant. Lower threshold values are stricter, and help reduce low-count low-confidence calls. (Default 0.05, very permissive)
+2. `dprop_thresh` - Effect size threshold. Transcripts whose proportion changes between conditions by less than this threshold are considered uninteresting, regardless of their statistical significance. (Default 0.20, quite strict)
+3. `abund_thresh` - Noise threshold. Minimum mean (across replicates) abundance for a transcript to be considered expressed. Transcripts with mean abundances below this will be ignored. Mean total gene count must also meet this value in both conditions, i.e. at least one expressed isoform must exist in each condition. (Default 5, very permissive)
+
+The default values for these thresholds have been chosen such that they achieve a *median* FDR <5% for a high quality dataset from *Arabidopsis thaliana*, even with only 3 replicates per condition.
+Your mileage may vary and you should give some consideration to selecting appropriate values.
-The default values for these thresholds have been chosen such that they achieve a *median* FDR <5% for a high quality dataset from *Arabidopsis thaliana*, even with only 3 replicates per condition. Your mileage may vary and you should give some consideration to selecting appropriate values.
Depending on the settings, *additional thresholds* are available and will be discussed in their respective sections below.
@@ -378,7 +380,7 @@ mydtu <- call_DTU(annot = myannot,
1. `threads` - The number of threads to use. (Default 1)
-Due to core R implementation limitations, the type of multi-threading used in RATs works only in POSIX-compliant systems.
+Due to core R implementation limitations, the type of multi-threading used in RATs works only in POSIX-compliant systems (Linux and Mac, but not Windows).
Refer to the `parallel` package for details.
@@ -447,14 +449,14 @@ such as those employed by RATs, and reduces the statistical power of the method.
To counter this, RATs provides the option to scale abundaces either equally by a single factor (such as average library size among samples)
or by a vector of factors (one per sample). The former maintains any pre-existing library-size normalisation among samples. This is necessary
for fold-change based methods, but RATs does not require it. Instead, using the respective actual library sizes of the samples allows the
-higher-throughput samples to have a bigger influence than the lower-throughput samples. This is particularly relevant if your samples have dissimilar
+higher-throughput samples to have a bigger influence than the lower-throughput samples. This is particularly relevant if your samples have very dissimilar
library sizes.
For flexibility with different types of input, these scaling options can be applied in either of two stages: The data import step by `fish4rodents()`,
or the actual testing step by `call_DTU()`. In the example examined previously, `fish4rodents()` was instructed to create TPM abundances,
-by normalising to 10000000 reads. Such values are useful with certain other tools that a user may also intend to use.
+by normalising to `1000000` reads. Such values are useful with certain other tools that a user may also intend to use.
Subsequently, these TPMs were re-scaled to meet the library size of each sample, thus providing RATs with count-like abundance values that retain
-the TPM's normalisation by isoform length. However, it is not necessary to scale in two separate steps.
+the normalisation by isoform length. However, it is not necessary to scale in two separate steps.
Both `fish4rodents()` and `call_DTU()` support scaling by a single value or a vector of values. If you don't need the TPMs, you can scale
directly to the desired library size(s), as in the examples below:
@@ -495,26 +497,33 @@ mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A,
```
You can mix and match scaling options as per your needs, so take care to ensure that the scaling you apply is appropriate.
-It is important to note, that if you simply run both methods with their respective defaults, you'll effectively run RATs
+*It is important to note* that if you simply run both methods with their respective defaults, you'll effectively run RATs
on TPM values, which is extremely underpowered and not recommended. Please, provide appropriate scaling factors for your data.
-***
-
-
-# Annotation discrepancies
+## Annotation discrepancies
Different annotation versions often preserve transcript IDs, despite altering the details of the transcript model.
+They also tend to include more or fewer transcripts, affecting the result of quantification.
It is important to use the same annotation throughout the workflow, otherwise the abundances will not be comparable
in a meaningful way.
-All internal operations and the output of RATs are based on the annotation provided:
+RATs will abort the run if the set of feature IDs in the provided annotation does not fully match the set of IDs in the quantifications.
+If this happens, ensure you are using the exact same annotation throughout your workflow.
+
+For special use cases, RATs provides the option to ignore the discrepancy and pretend everything is OK. Do this at your own risk.
+
+```{r, eval=FALSE}
+mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A,
+ boot_data_B= mydata$boot_data_B,
+ reckless=TRUE, verbose=TRUE)
+```
+
-* Any transcripts present in the data but missing from the annotation will be ignored completely and
-will not show up in the output, as there is no reliable way to match them to gene IDs.
-* Any transcript/gene present in the annotation but missing from the data will be included in the output as zero expression.
-* If the samples appear to use different annotations from one another, RATs will abort.
+In reckless mode, all internal operations and the output of RATs are based on the annotation provided:
+* Any transcript IDs present in the data but missing from the annotation will be ignored and will not show up in the output at all, as they cannot be matched to the gene IDs.
+* Any transcript IDs present in the annotation but missing from the data will be included in the output with zero expression. They can be identified later by their `NA` values in `$Transcripts[, .(stdevA, stdevB)]`, as these fields are not used downstream by RATs. `NA` values in other numeric fields are explicitly overwritten with `0` to allow downstream interoperability.
***
diff --git a/vignettes/output.Rmd b/vignettes/output.Rmd
index 2d91416..16f6b28 100644
--- a/vignettes/output.Rmd
+++ b/vignettes/output.Rmd
@@ -1,7 +1,7 @@
---
title: 'RATs: Raw Output'
author: "Kimon Froussios"
-date: "05 JAN 2018"
+date: "09 MAR 2018"
output:
html_document:
keep_md: yes
@@ -31,7 +31,7 @@ Set up an example.
```{r}
# Simulate some data.
-simdat <- sim_boot_data()
+simdat <- sim_boot_data(clean=TRUE)
# For convenience let's assign the contents of the list to separate variables
mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition.
mycond_B <- simdat[[3]] # Simulated bootstrapped data for other condition.
@@ -168,9 +168,10 @@ print( names(mydtu$Parameters) )
* `rep_reprod_thresh` - (num) The value passed to the `rrep_thresh` parameter, if `rboot==TRUE`..
* `rep_boot` - (bool) The value passed to the `rboot` parameter.
* `rep_bootnum` - (int) The number of replicate bootstrapping iterations (currently M*N, where M and N are the numbers of samples in the two conditions), if `rboot==TRUE`.
-* `seed` - (int) Custom seed for the random generator.
+* `seed` - (int) Custom seed for the random generator. `NA` if none was specified.
+* `reckless` - (bool) The value passed to the `reckless` parameter.
-**Note:** If bootstraps are disabled, the bootstrap-related fields may have `NA` values despite any values that were passed to their respective parameters.
+**Note:** If bootstraps are disabled, the bootstrap-related fields may be `NA`, regardless of supplied values, to reflect the fact that they were not used.
## Genes
diff --git a/vignettes/plots.Rmd b/vignettes/plots.Rmd
index 88cd16a..5bd2580 100644
--- a/vignettes/plots.Rmd
+++ b/vignettes/plots.Rmd
@@ -35,7 +35,7 @@ Set up an example.
```{r}
# Simulate some data.
-simdat <- sim_boot_data()
+simdat <- sim_boot_data(clean=TRUE)
# For convenience let's assign the contents of the list to separate variables
mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition.
mycond_B <- simdat[[3]] # Simulated bootstrapped data for other condition.