diff --git a/DESCRIPTION b/DESCRIPTION index 6b8130e..779f6d0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: rats -Version: 0.6.2 -Date: 2018-02-16 +Version: 0.6.3 +Date: 2018-03-13 Title: Relative Abundance of Transcripts Encoding: UTF-8 Authors: c(person("Kimon Froussios", role=c("aut"), email="k.froussios@dundee.ac.uk"), diff --git a/NAMESPACE b/NAMESPACE index f87589a..a460217 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,7 +20,6 @@ export(plot_overview) export(plot_shiny_volcano) export(sim_boot_data) export(sim_count_data) -export(sim_sleuth_data) export(tidy_annot) import(data.table) import(ggplot2) diff --git a/R/data_simulators.R b/R/data_simulators.R index 798ea73..7e7334c 100644 --- a/R/data_simulators.R +++ b/R/data_simulators.R @@ -1,115 +1,10 @@ -#============================================================================== -#' Generate an artificial minimal sleuth-like structure, for code-testing or examples. -#' -#' The default values here should match the default values expected by calculate_DTU(). -#' -#' @param varname Name for the variables by which to compare. -#' @param COUNTS_COL Name for the bootdtraps column containing counts. -#' @param TARGET_COL Name for annotation column containing transcript identification. -#' @param PARENT_COL Name for annotation column containing respective gene identification. -#' @param BS_TARGET_COL Name for bootstraps column containing transcript identification. -#' @param cnames A vector of (two) name values for the comparison variable. -#' @param errannot_inconsistent Logical. Introduces an inconsistency in the transcript IDs, for testing of sanity checks. (FALSE) -#' @param cv_dt Logical. Whether covariates table should be a data.table (FALSE). -#' @return A list with \code{slo} a minimal sleuth-like object, \code{annot} a corresponding annotation data.frame, -#' \code{isx} a vector of trancripts common between generated annotation and bootstraps (useful for code testing). -#' -#' The simulated data will have non-uniform number of bootstraps per sample and non-uniform order of -#' transcript id's among samples and bootstraps. These conditions are unlikely to occur in real data, -#' but this allows testing that the code can handle it. -#' -#' @import data.table -#' -#' @export -sim_sleuth_data <- function(varname="condition", COUNTS_COL="est_counts", TARGET_COL="target_id", - PARENT_COL="parent_id", BS_TARGET_COL="target_id", cnames=c("A","B"), - errannot_inconsistent=FALSE, cv_dt=FALSE) -{ - # !!! Some of the tests of the package are tightly connected to the specifics of the object returned by this function. - # !!! Entry additions might be tolerated. Changes or removals of entries or structure will certainly cause failures. 
- - tx <- data.frame(target_id= c("NIB.1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.1", "1B1C.2", "CC_a", "CC_b", "1NN", "2NN", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "LC1", "LC2", "ALLA1", "ALLB1", "ALLB2"), - parent_id= c("NIB", "1A1N", "1D1C", "1D1C", "1B1C", "1B1C", "CC", "CC", "NN", "NN", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "LC", "LC", "ALLA", "ALLB", "ALLB"), - stringsAsFactors=FALSE) - names(tx) <- c(TARGET_COL, PARENT_COL) - - sl <- list() - sl[["sample_to_covariates"]] <- NaN - if (cv_dt) { - sl[["sample_to_covariates"]] <- data.table("foo"=c(cnames[1], cnames[2], cnames[1], cnames[2]), - "bar"=c("ba", "ba", "bb", "bb")) - } else { - sl[["sample_to_covariates"]] <- data.frame("foo"=c(cnames[1], cnames[2], cnames[1], cnames[2]), - "bar"=c("ba", "ba", "bb", "bb")) - } - names(sl[["sample_to_covariates"]]) <- c(varname, "batch") - - sl[["kal"]] <- list() - sl$kal[[1]] <- list() - sl$kal[[1]]["bootstrap"] <- list() - sl$kal[[1]]$bootstrap[[1]] <- data.frame("target"= c("LC1", "NIA1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(3, 333, 666, 10, 20, 0, 76, 52, 20, 50, 103, 321, 0, 100, 90, 0, 10, 30, 4, 50, 0, 0), - stringsAsFactors=FALSE) - sl$kal[[1]]$bootstrap[[2]] <- data.frame("target"= c("LC1", "NIA1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(2, 310, 680, 11, 21, 0, 80, 55, 22, 52, 165, 320, 0, 130, 80, 0, 11, 29, 5, 40, 0, 0), - stringsAsFactors=FALSE) - sl$kal[[1]]$bootstrap[[3]] <- data.frame("target"= c("LC1", "NIA1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(0, 340, 610, 7, 18, 0, 72, 50, 21, 49, 150, 325, 0, 120, 70, 0, 9, 28, 4, 60, 0, 0), - stringsAsFactors=FALSE) - - sl$kal[[2]] <- list() - sl$kal[[2]]["bootstrap"] <- list() - sl$kal[[2]]$bootstrap[[1]] <- data.frame("target"= c("NIA1", "LC1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(333, 6, 666, 12, 23, 0, 25, 150, 15, 80, 325, 105, 40, 0, 200, 0, 15, 45, 12, 0, 80, 200), - stringsAsFactors=FALSE) - sl$kal[[2]]$bootstrap[[2]] <- data.frame("target"= c("NIA1", "LC1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(323, 1, 606, 15, 22, 0, 23, 190, 20, 90, 270, 115, 30, 0, 150, 0, 20, 60, 15, 0, 120, 250), - stringsAsFactors=FALSE) - - sl$kal[[3]] <- list() - sl$kal[[3]]$bootstrap[[1]] <- data.frame("target"= c("NIA1", "1A1N-2", "LC1", "NIA2", "1D1C:one", "1D1C:two", "1B1C.2", "MIX6.c1", "1A1N-1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.d", "MIX6.nc", "CC_a", "CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(333, 20, 3, 666, 0, 76, 52, 100, 10, 360, 0, 100, 0, 180, 25, 60, 7, 27, 13, 35, 0, 0), - stringsAsFactors=FALSE) - sl$kal[[3]]$bootstrap[[2]] <- data.frame("target"= c("MIX6.c4", "NIA1", "LC1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1B1C.2", 
"MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.nc", "1D1C:two", "MIX6.d", "CC_b", "CC_a", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(80, 330, 4, 560, 11, 21, 0, 55, 90, 380, 0, 240, 80, 0, 55, 23, 10, 31, 2, 55, 0, 0), - stringsAsFactors=FALSE) - - sl$kal[[4]] <- list() - sl$kal[[4]]["bootstrap"] <- list() - sl$kal[[4]]$bootstrap[[1]] <- data.frame("target"= c("NIA2", "1A1N-1", "NIA1", "1A1N-2", "1D1C:one", "1D1C:two", "LC1", "1B1C.2", "MIX6.c2", "MIX6.c3", "MIX6.c1", "MIX6.c4", "MIX6.nc", "MIX6.d", "CC_b", "CC_a", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(666, 12, 333, 23, 0, 25, 0, 150, 155, 40, 300, 0, 33, 0, 93, 22, 22, 61, 11, 0, 110, 210), - stringsAsFactors=FALSE) - sl$kal[[4]]$bootstrap[[2]] <- data.frame("target"= c("NIA1", "MIX6.d", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "LC1", "1D1C:two", "MIX6.c1", "1B1C.2", "MIX6.c3", "MIX6.c2", "MIX6.c4", "MIX6.nc", "CC_a", "CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(323, 0, 656, 15, 22, 0, 2, 23, 280, 160, 35, 120, 0, 95, 18, 119, 19, 58, 7, 0, 150, 220), - stringsAsFactors=FALSE) - sl$kal[[4]]$bootstrap[[3]] <- data.frame("target"= c("NIA1", "NIA2", "1A1N-1", "1A1N-2", "MIX6.nc", "1B1C.2", "LC1", "MIX6.c1", "MIX6.c2", "MIX6.c3", "1D1C:one", "1D1C:two", "MIX6.c4", "MIX6.d", "CC_a", "CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "my_counts"= c(343, 676, 13, 21, 55, 145, 3, 270, 133, 30, 0, 20, 0, 0, 23, 80, 17, 50, 14, 0, 130, 200), - stringsAsFactors=FALSE) - - names(sl$kal[[1]]$bootstrap[[1]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[1]]$bootstrap[[2]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[1]]$bootstrap[[3]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[2]]$bootstrap[[1]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[2]]$bootstrap[[2]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[3]]$bootstrap[[1]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[3]]$bootstrap[[2]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[4]]$bootstrap[[1]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[4]]$bootstrap[[2]]) <- c(BS_TARGET_COL, COUNTS_COL) - names(sl$kal[[4]]$bootstrap[[3]]) <- c(BS_TARGET_COL, COUNTS_COL) - - if (errannot_inconsistent) { - sl$kal[[3]]$bootstrap[[1]][10, BS_TARGET_COL] <- "Unexpected-name" - } - - return(list("annot" = tx, "slo" = sl, "isx" = intersect(tx[[TARGET_COL]], sl$kal[[1]]$bootstrap[[1]][[BS_TARGET_COL]]) )) -} - #============================================================================== #' Generate an artificial dataset of bootstrapped abundance estimates, for code-testing or examples. #' #' @param TARGET_COL Name for annotation column containing transcript identification. #' @param PARENT_COL Name for the bootdtraps column containing counts. #' @param errannot_inconsistent Logical. Introduces an inconsistency in the transcript IDs, for testing of sanity checks. (FALSE) +#' @param clean Logical. Remove the intentional inconsistency of IDs between annotation and abundances. #' @return A list with 3 elements. First a data.frame with the corresponding annotation. #' Then, 2 lists of data.tables. One list per condition. Each data table represents a sample and contains the estimates from the bootstrap iterations. 
#' @@ -117,38 +12,63 @@ sim_sleuth_data <- function(varname="condition", COUNTS_COL="est_counts", TARGET #' #' @export #' -sim_boot_data <- function(errannot_inconsistent=FALSE, PARENT_COL="parent_id", TARGET_COL="target_id") { - tx <- data.frame(target_id= c("NIB.1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.1", "1B1C.2", "CC_a", "CC_b", "1NN", "2NN", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "LC1", "LC2", "ALLA1", "ALLB1", "ALLB2"), - parent_id= c("NIB", "1A1N", "1D1C", "1D1C", "1B1C", "1B1C", "CC", "CC", "NN", "NN", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "LC", "LC", "ALLA", "ALLB", "ALLB"), +sim_boot_data <- function(errannot_inconsistent=FALSE, PARENT_COL="parent_id", TARGET_COL="target_id", clean=FALSE) { + tx <- data.frame(target_id= c("1A1B.a", "1A1B.b", "NIB.1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.1", "1B1C.2", "CC_a", "CC_b", "1NN", "2NN", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "LC1", "LC2", "ALLA1", "ALLB1", "ALLB2"), + parent_id= c("1A1B", "1A1B", "NIB", "1A1N", "1D1C", "1D1C", "1B1C", "1B1C", "CC", "CC", "NN", "NN", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "MIX6", "LC", "LC", "ALLA", "ALLB", "ALLB"), stringsAsFactors=FALSE) names(tx) <- c(TARGET_COL, PARENT_COL) a <- list() - a[[1]] <- data.table("target"= c("LC1", "NIA1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "V1"= c( 3, 333, 666, 10, 20, 0, 76, 52, 20, 50, 103, 321, 0, 100, 90, 0, 10, 30, 4, 50, 0, 0), - "V2"= c( 2, 310, 680, 11, 21, 0, 80, 55, 22, 52, 165, 320, 0, 130, 80, 0, 11, 29, 5, 40, 0, 0), - "V3"= c( 0, 340, 610, 7, 18, 0, 72, 50, 21, 49, 150, 325, 0, 120, 70, 0, 9, 28, 4, 60, 0, 0), - stringsAsFactors=FALSE) - - a[[2]] <- data.table("target"= c("NIA1", "1A1N-2", "LC1", "NIA2", "1D1C:one", "1D1C:two", "1B1C.2", "MIX6.c1", "1A1N-1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.d", "MIX6.nc", "CC_a", "CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "V1"= c( 333, 20, 3, 666, 0, 76, 52, 100, 10, 360, 0, 100, 0, 180, 25, 60, 7, 27, 13, 35, 0, 0), - "V2"= c( 330, 11, 4, 560, 0, 80, 55, 90, 11, 380, 0, 80, 0, 240, 23, 55, 10, 31, 2, 55, 0, 0), - stringsAsFactors=FALSE) - b <- list() - b[[1]] <- data.table("target"= c("NIA1", "LC1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "V1"= c( 333, 6, 666, 12, 23, 0, 25, 150, 15, 80, 325, 105, 40, 0, 200, 0, 15, 45, 12, 0, 80, 200), - "V2"= c( 323, 1, 606, 15, 22, 0, 23, 190, 20, 90, 270, 115, 30, 0, 150, 0, 20, 60, 15, 0, 120, 250), - stringsAsFactors=FALSE) - b[[2]] <- data.table("target"= c("NIA2", "1A1N-1", "NIA1", "1A1N-2", "1D1C:one", "1D1C:two", "LC1", "1B1C.2", "MIX6.c2", "MIX6.c3", "MIX6.c1", "MIX6.c4", "MIX6.nc", "MIX6.d", "CC_b", "CC_a", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), - "V1"= c( 666, 12, 333, 23, 0, 25, 0, 150, 155, 40, 300, 0, 33, 0, 93, 22, 22, 61, 11, 0, 110, 210), - "V2"= c( 656, 15, 323, 22, 0, 23, 2, 160, 120, 35, 280, 0, 95, 0, 119, 18, 19, 58, 7, 0, 150, 220), - "V3"= c( 676, 13, 343, 21, 0, 20, 3, 145, 133, 30, 270, 0, 55, 0, 80, 23, 17, 50, 14, 0, 130, 200), - stringsAsFactors=FALSE) + if (clean) { + a[[1]] <- data.table("target"= c("1A1B.a", "1A1B.b", "LC1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", 
"1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2", "1B1C.1", "NIB.1"), + "V1"= c( 69, 0, 3, 20, 0, 76, 52, 20, 50, 103, 321, 0, 100, 90, 0, 10, 30, 4, 50, 0, 0, 0, 0), + "V2"= c( 96, 0, 2, 21, 0, 80, 55, 22, 52, 165, 320, 0, 130, 80, 0, 11, 29, 5, 40, 0, 0, 0, 0), + "V3"= c( 88, 0, 0, 18, 0, 72, 50, 21, 49, 150, 325, 0, 120, 70, 0, 9, 28, 4, 60, 0, 0, 0, 0), + stringsAsFactors=FALSE) + + a[[2]] <- data.table("target"= c("1A1B.a", "1A1B.b", "1A1N-2", "LC1", "1D1C:one", "1D1C:two", "1B1C.2", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.d", "MIX6.nc", "CC_a", "CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2", "1B1C.1", "NIB.1"), + "V1"= c( 121, 0, 20, 3, 0, 76, 52, 100, 360, 0, 100, 0, 180, 25, 60, 7, 27, 13, 35, 0, 0, 0, 0), + "V2"= c( 144, 0, 11, 4, 0, 80, 55, 90, 380, 0, 80, 0, 240, 23, 55, 10, 31, 2, 55, 0, 0, 0, 0), + stringsAsFactors=FALSE) + + b[[1]] <- data.table("target"= c("1A1B.a", "1A1B.b", "LC1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2", "1B1C.1", "NIB.1"), + "V1"= c( 0, 91, 6, 23, 0, 25, 150, 15, 80, 325, 105, 40, 0, 200, 0, 15, 45, 12, 0, 80, 200, 0, 0), + "V2"= c( 0, 100, 1, 22, 0, 23, 190, 20, 90, 270, 115, 30, 0, 150, 0, 20, 60, 15, 0, 120, 250, 0, 0), + stringsAsFactors=FALSE) + + b[[2]] <- data.table("target"= c("1A1N-2", "1D1C:one", "1D1C:two", "LC1", "1B1C.2", "MIX6.c2", "MIX6.c3", "MIX6.c1", "MIX6.c4", "MIX6.nc", "MIX6.d", "CC_b", "CC_a", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2", "1A1B.b", "1A1B.a", "1B1C.1", "NIB.1"), + "V1"= c( 23, 0, 25, 0, 150, 155, 40, 300, 0, 33, 0, 93, 22, 22, 61, 11, 0, 110, 210, 76, 0, 0, 0), + "V2"= c( 22, 0, 23, 2, 160, 120, 35, 280, 0, 95, 0, 119, 18, 19, 58, 7, 0, 150, 220, 69, 0, 0, 0), + "V3"= c( 21, 0, 20, 3, 145, 133, 30, 270, 0, 55, 0, 80, 23, 17, 50, 14, 0, 130, 200, 36, 0, 0, 0), + stringsAsFactors=FALSE) + } else { + a[[1]] <- data.table("target"= c("1A1B.a", "1A1B.b", "LC1", "NIA1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), + "V1"= c( 69, 0, 3, 333, 666, 10, 20, 0, 76, 52, 20, 50, 103, 321, 0, 100, 90, 0, 10, 30, 4, 50, 0, 0), + "V2"= c( 96, 0, 2, 310, 680, 11, 21, 0, 80, 55, 22, 52, 165, 320, 0, 130, 80, 0, 11, 29, 5, 40, 0, 0), + "V3"= c( 88, 0, 0, 340, 610, 7, 18, 0, 72, 50, 21, 49, 150, 325, 0, 120, 70, 0, 9, 28, 4, 60, 0, 0), + stringsAsFactors=FALSE) + + a[[2]] <- data.table("target"= c("NIA1", "1A1B.a", "1A1B.b", "1A1N-2", "LC1", "NIA2", "1D1C:one", "1D1C:two", "1B1C.2", "MIX6.c1", "1A1N-1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.d", "MIX6.nc", "CC_a", "CC_b", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), + "V1"= c( 333, 121, 0, 20, 3, 666, 0, 76, 52, 100, 10, 360, 0, 100, 0, 180, 25, 60, 7, 27, 13, 35, 0, 0), + "V2"= c( 330, 144, 0, 11, 4, 560, 0, 80, 55, 90, 11, 380, 0, 80, 0, 240, 23, 55, 10, 31, 2, 55, 0, 0), + stringsAsFactors=FALSE) + + b[[1]] <- data.table("target"= c("1A1B.a", "1A1B.b", "NIA1", "LC1", "NIA2", "1A1N-1", "1A1N-2", "1D1C:one", "1D1C:two", "1B1C.2", "CC_a", "CC_b", "MIX6.c1", "MIX6.c2", "MIX6.c3", "MIX6.c4", "MIX6.nc", "MIX6.d", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2"), + "V1"= c( 0, 91, 333, 6, 666, 12, 23, 0, 25, 150, 15, 80, 325, 105, 40, 0, 200, 0, 15, 45, 12, 0, 80, 200), + "V2"= c( 0, 100, 323, 1, 606, 15, 22, 0, 23, 190, 20, 90, 270, 115, 30, 0, 150, 0, 20, 60, 15, 0, 120, 250), + 
stringsAsFactors=FALSE) + + b[[2]] <- data.table("target"= c("NIA2", "1A1N-1", "NIA1", "1A1N-2", "1D1C:one", "1D1C:two", "LC1", "1B1C.2", "MIX6.c2", "MIX6.c3", "MIX6.c1", "MIX6.c4", "MIX6.nc", "MIX6.d", "CC_b", "CC_a", "1NN", "2NN", "LC2", "ALLA1", "ALLB1", "ALLB2", "1A1B.b", "1A1B.a"), + "V1"= c( 666, 12, 333, 23, 0, 25, 0, 150, 155, 40, 300, 0, 33, 0, 93, 22, 22, 61, 11, 0, 110, 210, 76, 0), + "V2"= c( 656, 15, 323, 22, 0, 23, 2, 160, 120, 35, 280, 0, 95, 0, 119, 18, 19, 58, 7, 0, 150, 220, 69, 0), + "V3"= c( 676, 13, 343, 21, 0, 20, 3, 145, 133, 30, 270, 0, 55, 0, 80, 23, 17, 50, 14, 0, 130, 200, 36, 0), + stringsAsFactors=FALSE) + } if (errannot_inconsistent) - a[[2]][14, 1] <- "Unexpected-name" + b[[1]][14, 1] <- "Unexpected-name" return(list('annot'= tx, 'boots_A'= a, 'boots_B'= b)) } @@ -159,134 +79,14 @@ sim_boot_data <- function(errannot_inconsistent=FALSE, PARENT_COL="parent_id", T #' @param PARENT_COL Name for the bootdtraps column containing counts. #' @param TARGET_COL Name for annotation column containing transcript identification. #' @param errannot_inconsistent Logical. Introduces an inconsistency in the transcript IDs, for testing of sanity checks. (FALSE) +#' @param clean Logical. Remove the intentional inconsistency of IDs between annotation and abundances. #' @return A list with 3 elements. First, a data.frame with the corresponding annotation table. #' Then, 2 data.tables, one per condition. Each data table contains the abundance estimates for all the samples for the respective condition. #' #' @export #' -sim_count_data <- function(errannot_inconsistent=FALSE, PARENT_COL="parent_id", TARGET_COL="target_id") { - sim <- sim_boot_data(errannot_inconsistent=errannot_inconsistent, PARENT_COL=PARENT_COL, TARGET_COL=TARGET_COL) +sim_count_data <- function(errannot_inconsistent=FALSE, PARENT_COL="parent_id", TARGET_COL="target_id", clean=FALSE) { + sim <- sim_boot_data(errannot_inconsistent=errannot_inconsistent, PARENT_COL=PARENT_COL, TARGET_COL=TARGET_COL, clean=clean) return(list('annot'=sim$annot, 'counts_A'= sim$boots_A[[1]], 'counts_B'= sim$boots_B[[1]])) } - - -#=============================================================================== -#' Generate artificial read-count data for simulated 2-transcript genes, -#' systematically covering a broad range of proportions and magnitudes. -#' -#' For each level of transcript proportions in condition A, all proportion levels are -#' generated for condition B as possible new states. Therefore the number of transcripts -#' generated is 2 * N^2 * M, where 2 is the number of gene isoforms per paramater combination, -#' N is the number of proportion levels and M the number of magnitude levels. -#' -#' @param proportions Range and step of proportions for 1st transcript. Sibling transcript complemented from these. -#' @param magnitudes vector of read-count sizes to use. -#' @return list containing: \code{data} a minimal sleuth-like object. -#' \code{anno} a dataframe matching \code{target_id} to \code{parent_id}. -#' \code{sim} a dataframe with the simulation parameters per transcript. -#' -# @export -countrange_sim <- function(proportions= seq(0, 1, 0.01), - magnitudes= c(1,5,10,30,100,500,1000)) { - # Combine proportions and magnitudes. - # Start by grid-ing the values for easier visualization and naming, instead of computing the outer products directly. - grd <- expand.grid(c("t1", "t2"), proportions, proportions, magnitudes) - names(grd) <- c("sibl", "propA", "propB", "mag") - # Name the "transcripts". 
- opts <- options() - options(scipen=0, digits=3) - grd["parent_id"] <- paste(grd$mag, "_a", grd$propA,"_b", grd$propB, sep="") - grd["target_id"] <- paste(grd$parent_id, grd$sibl, sep="_") - options(scipen=opts$scipen, digits=opts$digits) - # Complementary proportions - grd[grd$sibl == "t2", c("propA", "propB")] <- c(1-grd$propA[grd$sibl == "t2"], 1-grd$propB[grd$sibl == "t2"]) - - # Make a sleuth-like object. - sl <- list() - sl["kal"] <- list() - sl[["sample_to_covariates"]] <- data.frame("sample"=c("A-1", "B-1"), "condition"=c("A", "B")) - # condition A - sl$kal[[1]] <- list() - sl$kal[[1]]["bootstrap"] <- list() - sl$kal[[1]]$bootstrap[[1]] <- data.frame("target_id"=grd$target_id, "est_counts"=grd$propA * grd$mag) - # condition B - sl$kal[[2]] <- list() - sl$kal[[2]]["bootstrap"] <- list() - sl$kal[[2]]$bootstrap[[1]] <- data.frame("target_id"=grd$target_id, "est_counts"=grd$propB * grd$mag) - - # Create annotation: - anno <- data.frame("target_id"=grd$target_id, "parent_id"=grd$parent_id) - - # Store simulation for ease. - results <- list("data"=sl, "anno"=anno, "sim"=grd) - - return(results) -} - - -#=============================================================================== -#' Combine simulation details with DTU calls, in preparation for plotting. -#' -#' @param sim An object generated with countrange_sim(), containing the simulated data details with which the \code{dtu} object was calculated. -#' @param dtuo The object with the DTU results corresponding to the \code{sim} object's data. -#' @return dataframe -#' -#' @import data.table -# @export -combine_sim_dtu <- function(sim, dtuo) { - with(dtuo, { - results <- data.table(Genes[, list(parent_id, pval_AB, pval_BA, dtu_AB, dtu_BA, dtu, pval_prop_min, dtu_prop)]) - setkey(results, parent_id) - tmp <- data.table(sim$sim[sim$sim["sibl"]=="t2", c("parent_id", "propA", "mag", "propB")]) - setkey(tmp, parent_id) - results <- merge(results, tmp, by="parent_id") - return(results) - }) -} - - -#=============================================================================== -#' Plot relationship of parameters. -#' -#' @param data the output from combine_sim_dtu() -#' @param type (str) "AvBvM", "B/AvM", "AvBvMprop", "B/AvMprop", "B-AvM" -#' -#' @import ggplot2 -#' @import data.table -# @export -plot_sim <- function(data, type = "AvBvM") { - with(data, { - if(type =="AvBvM"){ - return( - ggplot(data[order(dtu, propA, propB), ], aes(x=propA, y=propB, color=dtu)) + - labs(x="Prop t1 in A", y = "Prop t1 in B") + - scale_color_manual(values=c("lightblue","red")) + - geom_point() + - facet_grid(. ~ mag) - ) - } else if(type == "B/AvM") { - return( - ggplot(data[order(dtu, mag), ], aes(x=propB/propA, y=mag, color=dtu)) + - labs(x="Prop t1 in B / Prop t1 in A", y = "Gene magnitude") + - scale_color_manual(values=c("lightblue","red")) + - geom_point() - ) - } else if(type =="AvBvMprop"){ - return( - ggplot(data[order(dtu_prop, propA, propB), ], aes(x=propA, y=propB, color=dtu_prop)) + - labs(x="Prop t1 in A", y = "Prop t1 in B") + - scale_color_manual(values=c("lightblue","red")) + - geom_point() + - facet_grid(. ~ mag) - ) - } else if(type == "B/AvMprop") { - return( - ggplot(data[order(dtu_prop, mag), ], aes(x=propB/propA, y=mag, color=dtu_prop)) + - labs(x="Prop t1 in B / Prop t1 in A", y = "Gene magnitude") + - scale_color_manual(values=c("lightblue","red")) + - geom_point() - ) - } - }) -} diff --git a/R/func.R b/R/func.R index 3ee9031..830fe67 100644 --- a/R/func.R +++ b/R/func.R @@ -21,6 +21,7 @@ #' @param threads Number of threads. 
#' @param seed Seed for random engine. #' @param scaling Abundance scaling factor. +#' @param reckless whether to ignore detected annotation discrepancies #' #' @return List: \itemize{ #' \item{"error"}{logical} @@ -36,7 +37,7 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A, TARGET_COL, PARENT_COL, correction, testmode, scaling, threads, seed, p_thresh, abund_thresh, dprop_thresh, - qboot, qbootnum, qrep_thresh, rboot, rrep_thresh) { + qboot, qbootnum, qrep_thresh, rboot, rrep_thresh, reckless) { warnmsg <- list() # Input format. @@ -48,14 +49,16 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A, return(list("error"=TRUE, "message"="You must specify (the same type of) data for both conditions!")) # Annotation. + if (!is.logical(reckless) || is.na(reckless)) + return(list("error"=TRUE, "message"="Invalid value to reckless!")) if (!is.data.frame(annot)) return(list("error"=TRUE, "message"="The provided annot is not a data.frame!")) if (any(!c(TARGET_COL, PARENT_COL) %in% names(annot))) return(list("error"=TRUE, "message"="The specified target and/or parent IDs field names do not exist in annot!")) - if (length(annot$target_id) != length(unique(annot$target_id))) + if (length(annot[[TARGET_COL]]) != length(unique(annot[[TARGET_COL]]))) return(list("error"=TRUE, "message"="Some transcript identifiers are not unique!")) - - # Parameters. + + # Simple parameters. if (!correction %in% p.adjust.methods) return(list("error"=TRUE, "message"="Invalid p-value correction method name. Refer to stats::p.adjust.methods .")) if (!testmode %in% c("genes", "transc", "both")) @@ -65,7 +68,7 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A, if ((!is.numeric(dprop_thresh)) || dprop_thresh < 0 || dprop_thresh > 1) return(list("error"=TRUE, "message"="Invalid proportion difference threshold! Must be between 0 and 1.")) if ((!is.numeric(abund_thresh)) || abund_thresh < 0) - return(list("error"=TRUE, "message"="Invalid abundance threshold! Must be between 0 and 1.")) + return(list("error"=TRUE, "message"="Invalid abundance threshold! Must be a count >= 0.")) if ((!is.numeric(qbootnum)) || qbootnum < 0) return(list("error"=TRUE, "message"="Invalid number of bootstraps! Must be a positive integer number.")) if (!is.logical(qboot)) @@ -80,6 +83,8 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A, return(list("error"=TRUE, "message"="Invalid number of threads! Must be positive integer.")) if (threads > parallel::detectCores(logical= TRUE)) return(list("error"=TRUE, "message"="Number of threads exceeds system's reported capacity.")) + + # Scaling nsmpl <- NULL if (!is.null(count_data_A)){ nsmpl <- dim(count_data_A)[2] + dim(count_data_B)[2] @@ -103,8 +108,20 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A, if(is.null(boot_data_A)) { if (!is.data.table(count_data_A) | !is.data.table(count_data_B)) return(list("error"=TRUE, "message"="The counts data are not data.tables!")) - if (!all( count_data_A[[1]][order(count_data_A[[1]])] == count_data_B[[1]][order(count_data_B[[1]])] )) - return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs between conditions!")) + if ( !all(count_data_A[[1]] %in% annot[[TARGET_COL]]) && !all(annot[[TARGET_COL]] %in% count_data_A[[1]]) ) { + if(reckless){ + warnmsg["annotation-mismatch"] <- "The transcript IDs in the quantifications and the annotation do not match completely! Did you use different annotations? 
Reckless mode enabled, continuing at your own risk..." + } else { + return(list("error"=TRUE, "message"="The transcript IDs in the quantifications and the annotation do not match completely! Did you use different annotations?")) + } + } + if (!all( count_data_A[[1]][order(count_data_A[[1]])] == count_data_B[[1]][order(count_data_B[[1]])] )) { + if (reckless) { + warnmsg["countIDs"] <- "Transcript IDs do not match completely between conditions! Did you use different annotations? Reckless mode enabled, continuing at your own risk..." + } else { + return(list("error"=TRUE, "message"="Transcript IDs do not match completely between conditions! Did you use different annotations?")) + } + } } else { warnmsg["cnts&boots"] <- "Received multiple input formats! Only the bootstrapped data will be used." } @@ -119,17 +136,33 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A, } maxmatrix <- 2^31 - 1 if (qboot) { - # Consistency, if (!is.null(boot_data_A)) { - # Direct data. + # Compared to annotation. + if ( !all(boot_data_A[[1]][[1]] %in% annot[[TARGET_COL]]) && !all(annot[[TARGET_COL]] %in% boot_data_A[[1]][[1]]) ) { + if(reckless){ + warnmsg["annotation-mismatch"] <- "The transcript IDs in the quantifications and the annotation do not match completely! Did you use different annotations? Reckless mode enabled, continuing at your own risk..." + } else { + return(list("error"=TRUE, "message"="The transcript IDs in the quantifications and the annotation do not match completely! Did you use different annotations?")) + } + } + # Among samples tx <- boot_data_A[[1]][[1]][order(boot_data_A[[1]][[1]])] for (k in 2:length(boot_data_A)){ - if (!all( tx == boot_data_A[[k]][[1]][order(boot_data_A[[k]][[1]])] )) - return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs across samples!")) + if (!all( tx == boot_data_A[[k]][[1]][order(boot_data_A[[k]][[1]])] )) { + if (reckless) { + warnmsg[paste0("bootIDs-A", k)] <- paste0("Inconsistent set of transcript IDs across samples (A", k, ")! Did you use different annotations? Reckless mode enabled, continuing at your own risk...") + } else { + return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs across samples! Did you use different annotations?")) + } + } } for (k in 1:length(boot_data_B)){ if (!all( tx == boot_data_B[[k]][[1]][order(boot_data_B[[k]][[1]])] )) - return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs across samples!")) + if (reckless) { + warnmsg[paste0("bootIDs-B", k)] <- paste0("Inconsistent set of transcript IDs across samples (B", k, ")! Did you use different annotations? Reckless mode enabled, continuing at your own risk...") + } else { + return(list("error"=TRUE, "message"="Inconsistent set of transcript IDs across samples! Did you use different annotations?")) + } } # Number of iterations. @@ -156,7 +189,7 @@ parameters_are_good <- function(annot, count_data_A, count_data_B, boot_data_A, if (rboot & any( length(samples_by_condition[[1]]) * length(samples_by_condition[[2]]) > maxmatrix/dim(annot)[1], length(boot_data_A) * length(boot_data_B) > maxmatrix/dim(annot)[1] ) ) warnmsg["toomanyreplicates"] <- "The number of replicates is too high. Exhaustive 1vs1 would exceed maximum capacity of an R matrix." 
- + return(list("error"=FALSE, "message"="All good!", "maxboots"=minboots, "warn"=(length(warnmsg) > 0), "warnings"= warnmsg)) } @@ -205,7 +238,7 @@ alloc_out <- function(annot, full, n=1){ "abund_scaling"=numeric(length=n), "quant_boot"=NA,"quant_reprod_thresh"=NA_real_, "quant_bootnum"=NA_integer_, "rep_boot"=NA, "rep_reprod_thresh"=NA_real_, "rep_bootnum"=NA_integer_, - "seed"=NA_integer_) + "seed"=NA_integer_, "reckless"=NA) Genes <- data.table("parent_id"=as.vector(unique(annot$parent_id)), "elig"=NA, "sig"=NA, "elig_fx"=NA, "quant_reprod"=NA, "rep_reprod"=NA, "DTU"=NA, "transc_DTU"=NA, "known_transc"=NA_integer_, "detect_transc"=NA_integer_, "elig_transc"=NA_integer_, "maxDprop"=NA_real_, @@ -256,7 +289,7 @@ alloc_out <- function(annot, full, n=1){ #' @param test_transc Whether to do transcript-level test. #' @param test_genes Whether to do gene-level test. #' @param full Either "full" (for complete output structure) or "short" (for bootstrapping). -#' @param count_thresh Minimum number of counts per replicate. +#' @param count_thresh Minimum average count across replicates. #' @param p_thresh The p-value threshold. #' @param dprop_thresh Minimum difference in proportions. #' @param correction Multiple testing correction type. @@ -303,8 +336,11 @@ calculate_DTU <- function(counts_A, counts_B, tx_filter, test_transc, test_genes ctA <- count_thresh * resobj$Parameters[["num_replic_A"]] # Adjust count threshold for number of replicates. ctB <- count_thresh * resobj$Parameters[["num_replic_B"]] Transcripts[, elig_xp := (sumA >= ctA | sumB >= ctB)] - Transcripts[, elig := (elig_xp & totalA != 0 & totalB != 0 & (sumA != totalA | sumB != totalB))] # If the entire gene is shut off in one condition, changes in proportion cannot be defined. - # If sum and total are equal in both conditions the gene has only one expressed isoform, or one isoform altogether. + Transcripts[, elig := (elig_xp & # isoform expressed above the noise threshold in EITHER condition + totalA >= ctA & totalB >= ctB & # at least one eligibly expressed isoform exists in EACH condition (prevent gene-on/off from creating wild proportions from low counts) + totalA != 0 & totalB != 0 & # gene expressed in BOTH conditions (failsafe, in case user sets noise threshold to 0) + (propA != 1 | propB != 1) )] # not the only isoform expressed. + # If sum and total are equal in both conditions, it has no detected siblings and thus cannot change in proportion. Genes[, elig_transc := Transcripts[, as.integer(sum(elig, na.rm=TRUE)), by=parent_id][, V1] ] Genes[, elig := elig_transc >= 2] @@ -494,3 +530,4 @@ group_samples <- function(covariates) { } +#EOF diff --git a/R/rats.R b/R/rats.R index 3e5d2bc..bce8e9e 100644 --- a/R/rats.R +++ b/R/rats.R @@ -19,8 +19,8 @@ #' @param name_B The name for the other condition. (Default "Condition-B") #' @param varname The name of the covariate to which the two conditions belong. (Default \code{"condition"}). #' @param p_thresh The p-value threshold. (Default 0.05) -#' @param abund_thresh Noise threshold. Minimum mean abundance for transcripts to be eligible for testing. (Default 5) #' @param dprop_thresh Effect size threshold. Minimum change in proportion of a transcript for it to be considered meaningful. (Default 0.20) +#' @param abund_thresh Noise threshold. Minimum mean (across replicates) abundance for transcripts (and genes) to be eligible for testing. (Default 5) #' @param correction The p-value correction to apply, as defined in \code{\link[stats]{p.adjust.methods}}. 
(Default \code{"BH"}) #' @param scaling A scaling factor or vector of scaling factors, to be applied to the abundances *prior* to any thresholding and testing. Useful for scaling TPMs (transcripts per 1 million reads) to the actual library sizes of the samples. If a vector is supplied, the order should correspond to the samples in group A followed by the samples in group B. WARNING: Improper use of the scaling factor will artificially inflate/deflate the significances obtained. #' @param testmode One of \itemize{\item{"genes"}, \item{"transc"}, \item{"both" (default)}}. @@ -33,7 +33,8 @@ #' @param verbose Display progress updates and warnings. (Default \code{TRUE}) #' @param threads Number of threads to use. (Default 1) Multi-threading will be ignored on non-POSIX systems. #' @param seed A numeric integer used to initialise the random number engine. Use this only if reproducible bootstrap selections are required. (Default NA) -#' @param dbg Debugging mode. Interrupt execution at the specified flag-point. Used to speed up code-tests by avoiding irrelevant downstream processing. (Default 0: do not interrupt) +#' @param reckless RATs normally aborts if any inconsistency is detected among the transcript IDs found in the annotation and the quantifications. Enabling reckless mode will downgrade this error to a warning and allow RATs to continue the run. Not recommended unless you know why the inconsistency exists and how it will affect the results. (Default FALSE) +#' @param dbg Debugging mode only. Interrupt execution at the specified flag-point. Used to speed up code-tests by avoiding irrelevant downstream processing. (Default 0: do not interrupt) #' @return List of mixed types. Contains a list of runtime settings, a table of gene-level results, a table of transcript-level results, and a list of two tables with the transcript abundaces. #' #' @import utils @@ -46,7 +47,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i name_A= "Condition-A", name_B= "Condition-B", varname= "condition", p_thresh= 0.05, abund_thresh= 5, dprop_thresh= 0.2, correction= "BH", scaling= 1, testmode= "both", qboot= TRUE, qbootnum= 0L, qrep_thresh= 0.95, rboot=TRUE, rrep_thresh= 0.85, - description= NA_character_, verbose= TRUE, threads= 1L, seed=NA_integer_, dbg= "0") + description= NA_character_, verbose= TRUE, threads= 1L, seed=NA_integer_, reckless=FALSE, dbg= "0") { #---------- PREP @@ -60,7 +61,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i TARGET_COL, PARENT_COL, correction, testmode, scaling, threads, seed, p_thresh, abund_thresh, dprop_thresh, - qboot, qbootnum, qrep_thresh, rboot, rrep_thresh) + qboot, qbootnum, qrep_thresh, rboot, rrep_thresh, reckless) if (paramcheck$error) stop(paramcheck$message) if (verbose) @@ -243,6 +244,7 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i resobj$Parameters[["quant_reprod_thresh"]] <- qrep_thresh resobj$Parameters[["quant_bootnum"]] <- qbootnum resobj$Parameters[["rep_reprod_thresh"]] <- rrep_thresh + resobj$Parameters[["reckless"]] <- reckless if (dbg == "info") return(resobj) @@ -469,6 +471,19 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i Transcripts[is.na(rep_reprod), rep_reprod := c(FALSE)] Transcripts[is.na(DTU), DTU := c(FALSE)] Transcripts[is.na(gene_DTU), gene_DTU := c(FALSE)] + # Eradicate NAs from count columns, as they mess up plotting. 
These would be caused by inconsistencies between annotation and quantifications, and would only exist in "reckless" mode. + for (i in 1:Parameters$num_replic_A){ + idx <- is.na(Abundances[[1]][[paste0('V',i)]]) + Abundances[[1]][idx, paste0('V',i) := 0] + } + for (i in 1:Parameters$num_replic_B){ + idx <- is.na(Abundances[[2]][[paste0('V',i)]]) + Abundances[[2]][idx, paste0('V',i) := 0] + } + Transcripts[is.na(meanA), meanA := c(0)] + Transcripts[is.na(meanB), meanB := c(0)] + Transcripts[is.na(propA), propA := c(0)] + Transcripts[is.na(propB), propB := c(0)] # Drop the bootstrap columns, if unused. if (!qboot || !test_transc) @@ -495,4 +510,4 @@ call_DTU <- function(annot= NULL, TARGET_COL= "target_id", PARENT_COL= "parent_i return(resobj) } - +#EOF diff --git a/inst/doc/input.R b/inst/doc/input.R index 1bd5e38..8296540 100644 --- a/inst/doc/input.R +++ b/inst/doc/input.R @@ -21,7 +21,7 @@ head(sim_count_data()[[1]]) ## ---- eval=FALSE--------------------------------------------------------- # # Simulate some data. -# simdat <- sim_count_data() +# simdat <- sim_count_data(clean=TRUE) # # For convenience let's assign the contents of the list to separate variables # mycond_A <- simdat[[2]] # Simulated abundances for one condition. # mycond_B <- simdat[[3]] # Simulated abundances for other condition. @@ -39,7 +39,7 @@ head(sim_count_data()[[1]]) ## ------------------------------------------------------------------------ # Simulate some data. (Notice it is a different function than before.) -simdat <- sim_boot_data() +simdat <- sim_boot_data(clean=TRUE) # For convenience let's assign the contents of the list to separate variables mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition. @@ -83,7 +83,7 @@ myannot <- simdat[[1]] # Transcript and gene IDs for the above data. # # Calling DTU with custom thresholds. # mydtu <- call_DTU(annot= myannot, # boot_data_A= mycond_A, boot_data_B= mycond_B, -# p_thresh= 0.01, abund_thresh= 10, dprop_thres = 0.25) +# p_thresh= 0.01, dprop_thres = 0.15, abund_thresh= 10) ## ---- eval=FALSE--------------------------------------------------------- # # Bootstrap (default). Do 100 iterations. @@ -169,3 +169,8 @@ myannot <- simdat[[1]] # Transcript and gene IDs for the above data. # boot_data_B= mydata$boot_data_B, # scaling=c(25, 26.7, 23, 50.0, 45, 48.46, 52.36)) +## ---- eval=FALSE--------------------------------------------------------- +# mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A, +# boot_data_B= mydata$boot_data_B, +# reckless=TRUE, verbose=TRUE) + diff --git a/inst/doc/input.Rmd b/inst/doc/input.Rmd index 3a383a5..be59177 100644 --- a/inst/doc/input.Rmd +++ b/inst/doc/input.Rmd @@ -1,7 +1,7 @@ --- title: 'RATs: Input and Settings' author: "Kimon Froussios" -date: "19 SEP 2017" +date: "08 MAR 2018" output: html_document: keep_md: yes @@ -103,8 +103,8 @@ RATs comes with data emulators, intended to be used for testing the code. Howeve use for showcasing how RATs works. By default, RATs reports on its progress and produces a summary report. These can be suppressed using the -`verbose = FALSE` parameter to the call. To prevent cluttering this tutorial with verbose output, we will use this -option in all the examples. +`verbose = FALSE` parameter. This also suppresses some warnings generated by RATs when checking the sanity of its input, +so we do not recommend it for general use. However, to prevent cluttering this tutorial with verbose output, we will use this option in our examples. 
If you choose to allow verbose output when trying out the examples below using the emulated data, you will get some **warnings** about the number of bootstraps. The warning is triggered because the emulated dataset used in the examples immitates only the structure of @@ -120,7 +120,7 @@ First, let's emulate some data to work with: ```{r, eval=FALSE} # Simulate some data. -simdat <- sim_count_data() +simdat <- sim_count_data(clean=TRUE) # For convenience let's assign the contents of the list to separate variables mycond_A <- simdat[[2]] # Simulated abundances for one condition. mycond_B <- simdat[[3]] # Simulated abundances for other condition. @@ -162,7 +162,7 @@ First, let's emulate some data, as we did before. ```{r} # Simulate some data. (Notice it is a different function than before.) -simdat <- sim_boot_data() +simdat <- sim_boot_data(clean=TRUE) # For convenience let's assign the contents of the list to separate variables mycond_A <- simdat[[2]] # Simulated bootstrapped data for one condition. @@ -273,14 +273,16 @@ The following three main thresholds are used in RATs: # Calling DTU with custom thresholds. mydtu <- call_DTU(annot= myannot, boot_data_A= mycond_A, boot_data_B= mycond_B, - p_thresh= 0.01, abund_thresh= 10, dprop_thres = 0.25) + p_thresh= 0.01, dprop_thres = 0.15, abund_thresh= 10) ``` -1. `p_thresh` - Statistical significance level. P-values below this will be considered significant. Lower threshold values are stricter. (Default 0.05) -2. `abund_thresh` - Noise threshold. Transcripts with abundance below that value in both conditions are ignored. Higher threshold values are stricter. (Default 5, assumes abundances scaled to library size) -3. `dprop_thresh` - Effect size threshold. Transcripts whose proportion changes between conditions by less than the threshold are considered non-DTU, regardless of their statistical significance. Higher threshold values are stricter. (Default 0.20) +1. `p_thresh` - Statistical significance level. P-values below this will be considered significant. Lower threshold values are stricter, and help reduce low-count low-confidence calls. (Default 0.05, very permissive) +2. `dprop_thresh` - Effect size threshold. Transcripts whose proportion changes between conditions by less than this threshold are considered uninteresting, regardless of their statistical significance. (Default 0.20, quite strict) +3. `abund_thresh` - Noise threshold. Minimum mean (across replicates) abundance for a transcript to be considered expressed. Transcripts with mean abundances below this will be ignored. Mean total gene count must also meet this value in both conditions, a.k.a. at least one expressed isoform must exist in each condition. (Default 5, very permissive) + +The default values for these thresholds have been chosen such that they achieve a *median* FDR <5% for a high quality dataset from *Arabidopsis thaliana*, even with only 3 replicates per condition. +Your mileage may vary and you should give some consideration to selecting appropriate values. -The default values for these thresholds have been chosen such that they achieve a *median* FDR <5% for a high quality dataset from *Arabidopsis thaliana*, even with only 3 replicates per condition. Your mileage may vary and you should give some consideration to selecting appropriate values. Depending on the settings, *additional thresholds* are available and will be discussed in their respective sections below. @@ -378,7 +380,7 @@ mydtu <- call_DTU(annot = myannot, 1. `threads` - The number of threads to use. 
(Default 1) -Due to core R implementation limitations, the type of multi-threading used in RATs works only in POSIX-compliant systems. +Due to core R implementation limitations, the type of multi-threading used in RATs works only in POSIX-compliant systems (Linux, Mac, not Windows). Refer to the `parallel` package for details. @@ -447,14 +449,14 @@ such as those employed by RATs, and reduces the statistical power of the method. To counter this, RATs provides the option to scale abundaces either equally by a single factor (such as average library size among samples) or by a vector of factors (one per sample). The former maintains any pre-existing library-size normalisation among samples. This is necessary for fold-change based methods, but RATs does not require it. Instead, using the respective actual library sizes of the samples allows the -higher-throughput samples to have a bigger influence than the lower-throughput samples. This is particularly relevant if your samples have dissimilar +higher-throughput samples to have a bigger influence than the lower-throughput samples. This is particularly relevant if your samples have very dissimilar library sizes. For flexibility with different types of input, these scaling options can be applied in either of two stages: The data import step by `fish4rodents()`, or the actual testing step by `call_DTU()`. In the example examined previously, `fish4rodents()` was instructed to create TPM abundances, -by normalising to 10000000 reads. Such values are useful with certain other tools that a user may also intend to use. +by normalising to `1000000` reads. Such values are useful with certain other tools that a user may also intend to use. Subsequently, these TPMs were re-scaled to meet the library size of each sample, thus providing RATs with count-like abundance values that retain -the TPM's normalisation by isoform length. However, it is not necessary to scale in two separate steps. +the normalisation by isoform length. However, it is not necessary to scale in two separate steps. Both `fish4rodents()` and `call_DTU()` support scaling by a single value or a vector of values. If you don't need the TPMs, you can scale directly to the desired library size(s), as in the examples below: @@ -495,26 +497,33 @@ mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A, ``` You can mix and match scaling options as per your needs, so take care to ensure that the scaling you apply is appropriate. -It is important to note, that if you simply run both methods with their respective defaults, you'll effectively run RATs +*It is important to note*, that if you simply run both methods with their respective defaults, you'll effectively run RATs on TPM values, which is extremely underpowered and not recommended. Please, provide appropriate scaling factors for your data. -*** - - -# Annotation discrepancies +## Annotation discrepancies Different annotation versions often preserve transcript IDs, despite altering the details of the transcript model. +They also tend to include more or fewer transcripts, affecting the result of quantification. It is important to use the same annotation throughout the workflow, otherwise the abundances will not be comparable in a meaningful way. -All internal operations and the output of RATs are based on the annotation provided: +RATs will abort the run if the set of feature IDs in the provided annotation does not match fully the set of IDs in the quantifications. 
+If this happens, ensure you are using the exact same annotation throughout your workflow. + +For special use cases, RATs provides the option to ignore the discrepancy and pretend everything is OK. Do this at your own risk. + +```{r, eval=FALSE} +mydtu <- call_DTU(annot= myannot, boot_data_A= mydata$boot_data_A, + boot_data_B= mydata$boot_data_B, + reckless=TRUE, verbose=TRUE) +``` + -* Any transcripts present in the data but missing from the annotation will be ignored completely and -will not show up in the output, as there is no reliable way to match them to gene IDs. -* Any transcript/gene present in the annotation but missing from the data will be included in the output as zero expression. -* If the samples appear to use different annotations from one another, RATs will abort. +In reckless mode, all internal operations and the output of RATs are based on the annotation provided: +* Any transcript IDs present in the data but missing from the annotation will be ignored and will not show up in the output at all, as they cannot be matched to the gene IDs. +* Any transcript ID present in the annotation but missing from the data will be included in the output as zero expression. They can be identified later as `NA` values in `$Transcripts[, .(stdevA, stdevB)]` as these fields are not used downstream by RATs. `NA` values in other numeric fields are explicitly overwritten with `0` to allow downstream interoperability. *** diff --git a/inst/doc/input.html b/inst/doc/input.html index 8078e6c..65d0336 100644 --- a/inst/doc/input.html +++ b/inst/doc/input.html @@ -11,7 +11,7 @@ - + RATs: Input and Settings @@ -25,8 +25,8 @@ - - + + @@ -217,7 +219,7 @@

(rendered HTML vignette, regenerated from input.Rmd: title and author unchanged, date updated from 19 SEP 2017 to 08 MAR 2018, and the example abundance table in the "Data" section regenerated from the new simulated dataset)
-##      target  V1  V2  V3
-## 1:      LC1   3   2   0
-## 2:     NIA1 333 310 340
-## 3:     NIA2 666 680 610
-## 4:   1A1N-1  10  11   7
-## 5:   1A1N-2  20  21  18
-## 6: 1D1C:one   0   0   0
+##    target  V1  V2  V3
+## 1: 1A1B.a  69  96  88
+## 2: 1A1B.b   0   0   0
+## 3:    LC1   3   2   0
+## 4:   NIA1 333 310 340
+## 5:   NIA2 666 680 610
+## 6: 1A1N-1  10  11   7
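
Taken together, this patch adds a `clean=` switch to the data emulators and a `reckless=` switch to `call_DTU()`. The following is a minimal sketch of how the two options interact, assuming the package is installed from this revision; the object names are illustrative only.

```r
library(rats)

# Emulated bootstrap data. With clean=TRUE the annotation and the abundance
# tables use exactly the same set of transcript IDs.
simdat   <- sim_boot_data(clean=TRUE)
myannot  <- simdat$annot     # annotation data.frame (target_id, parent_id)
mycond_A <- simdat$boots_A   # list of data.tables, one per sample, condition A
mycond_B <- simdat$boots_B   # list of data.tables, one per sample, condition B

# Matching IDs: runs with the default (strict) consistency checks.
mydtu <- call_DTU(annot= myannot, boot_data_A= mycond_A, boot_data_B= mycond_B,
                  verbose= FALSE)

# With clean=FALSE (the default), the emulated data deliberately contains IDs
# that are absent from the annotation, so the same call aborts unless
# reckless=TRUE downgrades the ID-mismatch error to a warning.
dirty  <- sim_boot_data(clean=FALSE)
mydtu2 <- call_DTU(annot= dirty$annot,
                   boot_data_A= dirty$boots_A, boot_data_B= dirty$boots_B,
                   reckless= TRUE, verbose= TRUE)
```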