update

MelodyJIN-Y · MelodyJIN-Y · commit 7c74743b85d5 · 2022-10-17T21:26:09.000+08:00
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -10,7 +10,7 @@ Authors@R:
            email = "Melody-Jin@outlook.com")
     )
 Depends: R (>= 4.2.0)
-Imports: scuttle, stringr, AnnotationDbi, org.Hs.eg.db, org.Mm.eg.db, stats, grDevices, graphics, speckle, ggplot2, SingleCellExperiment
+Imports: scuttle, stringr, AnnotationDbi, org.Hs.eg.db, org.Mm.eg.db, stats,SingleCellExperiment, speckle
 VignetteBuilder: knitr
 Suggests: knitr, rmarkdown, CellBench, scater, patchwork
 Description: The cellXY package contains functions for predicting sex labels for single cell RNA sequencing data.
diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,7 @@
 export(classifySex)
 export(findMfDoublet)
 export(preprocess)
+export(preprocessDb)
 importFrom(AnnotationDbi,select)
 importFrom(org.Hs.eg.db,org.Hs.eg.db)
 importFrom(org.Mm.eg.db,org.Mm.eg.db)
diff --git a/R/findMFDoublet.R b/R/findMFDoublet.R
@@ -60,7 +60,7 @@ findMfDoublet<-function(x, genome=NULL, qc = FALSE)
     genome <- match.arg(genome,c("Hs","Mm"))
 
     # pre-process
-    processed.data<-preprocess(x, genome = genome, qc = FALSE)
+    processed.data<-preprocessDb(x, genome = genome, qc = FALSE)
 
     # the processed transposed count matrix
     tcm <-processed.data$tcm.final
diff --git a/R/preprocessDb.R b/R/preprocessDb.R
@@ -0,0 +1,127 @@
+
+#' Pre-processing function for male-female doublet detection
+#'
+#' The purpose of this function is to process a single cell counts matrix into
+#' the format used by the \code{findMfDoublet} function.
+#'
+#' Row names are converted to upper case and three features are built for
+#' every cell: the *XIST/Xist* count, \code{superX} (summed counts of the X
+#' chromosome genes listed in the function body) and \code{superY} (summed
+#' counts of the listed Y chromosome genes). If \code{qc=TRUE} cells are
+#' removed as identified by the \code{perCellQCMetrics} and
+#' \code{quickPerCellQC} functions from the \code{scuttle} package. The
+#' resulting counts matrix is then log-normalised with \code{lognormCounts}.
+#' Cells with zero counts on the X and Y chromosome features are not removed
+#' by this function.
+#'
+#' @param x the counts matrix, rows are genes and columns are cells. Row names
+#' must be gene symbols.
+#' @param genome the genome the data arises from. Current options are
+#' human: genome = "Hs" or mouse: genome = "Mm". The value is currently not
+#' used by this function: row names are upper-cased, so mouse symbols are
+#' matched against the same gene lists as human symbols. Default is NULL.
+#' @param qc logical, indicates whether to perform additional quality control
+#' on the cells. qc = TRUE keeps only the cells that pass quality control.
+#' qc = FALSE keeps every cell. Default is TRUE.
+#'
+#' @return outputs a list object with the following components
+#' \item{tcm.final }{A transposed count matrix where rows are cells and columns
+#' are the features \code{XIST}, \code{superX} and \code{superY}, plus
+#' \code{med.ls}, the library size of the cell divided by the median library
+#' size.}
+#' \item{data.df }{The transposed output of \code{lognormCounts} for the same
+#' features, plus the \code{med.ls} column.}
+#' \item{discarded.cells }{Character vector of cell IDs for the cells that are
+#' discarded when \code{qc=TRUE}; \code{NA} when \code{qc=FALSE}.}
+#'
+#' @importFrom AnnotationDbi select
+#' @importFrom stringr str_to_title
+#' @importFrom scuttle perCellQCMetrics
+#' @importFrom scuttle quickPerCellQC
+#' @importFrom org.Hs.eg.db org.Hs.eg.db
+#' @importFrom org.Mm.eg.db org.Mm.eg.db
+#' @export preprocessDb
+#'
+
+preprocessDb <- function(x, genome = NULL, qc = TRUE) {
+
+  x <- as.matrix(x)
+  if (is.null(row.names(x))) {
+    stop("x must have gene symbols as row names", call. = FALSE)
+  }
+  # upper-case the symbols so that mouse names (e.g. Xist) match the lists below
+  row.names(x) <- toupper(row.names(x))
+  # genes located in the X chromosome that have been reported to escape
+  # X-inactivation
+  # http://bioinf.wehi.edu.au/software/GenderGenes/index.html
+  Xgenes <- c("ARHGAP4","STS","ARSD", "ARSL", "AVPR2", "BRS3", "S100G",
+              "CHM", "CLCN4", "DDX3X","EIF1AX","EIF2S3", "GPM6B",
+              "GRPR", "HCFC1", "L1CAM", "MAOA", "MYCLP1", "NAP1L3",
+              "GPR143", "CDK16", "PLXNB3", "PRKX", "RBBP7", "RENBP",
+              "RPS4X", "TRAPPC2", "SH3BGRL", "TBL1X","UBA1", "KDM6A",
+              "XG", "XIST", "ZFX", "PUDP", "PNPLA4", "USP9X", "KDM5C",
+              "SMC1A", "NAA10", "OFD1", "IKBKG", "PIR", "INE2", "INE1",
+              "AP1S2", "GYG2", "MED14", "RAB9A", "ITM2A", "MORF4L2",
+              "CA5B", "SRPX2", "GEMIN8", "CTPS2", "CLTRN", "NLGN4X",
+              "DUSP21", "ALG13","SYAP1", "SYTL4", "FUNDC1", "GAB3",
+              "RIBC1", "FAM9C","CA5BP1")
+
+  # genes belonging to the male-specific region of chromosome Y (unique genes)
+  # http://bioinf.wehi.edu.au/software/GenderGenes/index.html
+  Ygenes <- c("AMELY", "DAZ1", "PRKY", "RBMY1A1", "RBMY1HP", "RPS4Y1", "SRY",
+              "TSPY1", "UTY", "ZFY","KDM5D", "USP9Y", "DDX3Y", "PRY", "XKRY",
+              "BPY2", "VCY", "CDY1", "EIF1AY", "TMSB4Y","CDY2A", "NLGN4Y",
+              "PCDH11Y", "HSFY1", "TGIF2LY", "TBL1Y", "RPS4Y2", "HSFY2",
+              "CDY2B", "TXLNGY","CDY1B", "DAZ3", "DAZ2", "DAZ4")
+
+  # build artificial genes
+  Xgene.set <- Xgenes[Xgenes %in% row.names(x)]
+  Ygene.set <- Ygenes[Ygenes %in% row.names(x)]
+  cm.new <- as.data.frame(matrix(0, nrow = 3, ncol = ncol(x)))
+  row.names(cm.new) <- c("XIST", "superX", "superY")
+  colnames(cm.new) <- colnames(x)
+  if ("XIST" %in% row.names(x)) {
+    cm.new["XIST", ] <- x["XIST", ]
+  } else {
+    # the row keeps its initial zeros instead of failing on a missing gene
+    warning("XIST/Xist was not found in the row names of x; ",
+            "its counts are set to zero", call. = FALSE)
+  }
+  # drop = FALSE keeps a matrix when a single gene or a single cell is
+  # selected, otherwise colSums() fails on the resulting vector
+  cm.new["superX", ] <- colSums(x[Xgene.set, , drop = FALSE])
+  cm.new["superY", ] <- colSums(x[Ygene.set, , drop = FALSE])
+
+  ############################################################################
+  # Pre-processing
+  # perform simple QC
+  discarded.cells <- NA
+  # logical mask of the cells (columns of x) that are kept
+  keep.cells <- rep(TRUE, ncol(x))
+  if (isTRUE(as.logical(qc))) {
+    # NOTE(review): the first (up to) 100 rows of x are passed as the "Mito"
+    # subset, they are not selected by gene name -- confirm this is intended.
+    mito.rows <- seq_len(min(100, nrow(x)))
+    qcstats <- scuttle::perCellQCMetrics(x, subsets = list(Mito = mito.rows))
+    qcfilter <- scuttle::quickPerCellQC(
+      qcstats, percent_subsets = c("subsets_Mito_percent")
+    )
+    discard <- as.logical(qcfilter$discard)
+
+    # save the discarded cells; index the names directly so that a single
+    # discarded cell is still reported
+    discarded.cells <- colnames(x)[discard]
+
+    # cm.new only contains cells that pass the quality control
+    keep.cells <- !discard
+    cm.new <- cm.new[, keep.cells, drop = FALSE]
+  }
+
+  tcm.final <- as.data.frame(t(cm.new))
+
+  # The "do not classify" filter (cells with zero counts on both superX and
+  # superY) is not applied here: every cell that passes QC is kept.
+
+  # library size of the kept cells, computed on the full counts matrix
+  cm.lib.size <- colSums(x[, keep.cells, drop = FALSE], na.rm = TRUE)
+  med.ls <- stats::median(cm.lib.size)
+
+  # log-normalisation performed for each cell
+  normsca.cm <- data.frame(lognormCounts(cm.new, log = TRUE,
+                                         prior.count = 0.5,
+                                         lib.size = cm.lib.size))
+  data.df <- as.data.frame(t(normsca.cm))
+  # library size relative to the median library size, added as a feature
+  data.df$med.ls <- cm.lib.size / med.ls
+  tcm.final$med.ls <- cm.lib.size / med.ls
+
+  list(tcm.final = tcm.final, data.df = data.df,
+       discarded.cells = discarded.cells)
+}
diff --git a/README.md b/README.md
@@ -7,15 +7,6 @@
 The cellXY package currently contains trained models to classify cells as male or 
 female and to predict whether a cell is a male-female doublet or not. 
 
-The propeller function performs statistical tests for differences in cell
-type composition in single cell data. In order to test for differences in cell
-type proportions between multiple experimental conditions at least one of the 
-groups must have some form of biological replication (i.e. at least two 
-samples). For a two group scenario, the absolute minimum sample size is thus 
-three. Since there are many technical aspects which can affect cell type 
-proportion estimates, having biological replication is essential for a 
-meaningful analysis.
-
 The propeller function takes a SingleCellExperiment or Seurat object as input,
 extracts the relevant cell information, and tests whether the cell type 
 proportions are statistically significantly different between experimental
@@ -53,7 +44,7 @@ devtools::install_github("phipsonlab/cellXY")
 
 ## Sex label prediction example 
 
-This is a basic example which shows you how to obtain a sex label preidction for each cell. 
+This is a basic example which shows you how to obtain a sex label prediction for each cell. 
 
 ``` r
 library(speckle)
@@ -75,11 +66,4 @@ sex <- classifySex(counts, genome="Hs")
 table(sex$prediction)
 boxplot(counts["XIST",]~sex$prediction)
 ```
-Please note that this basic implementation is for when you are only modelling
-group information. When you have additional covariates that you would like to 
-account for, please use the propeller.ttest() and propeller.anova() functions
-directly. Please read the vignette for examples on how to model a continuous 
-variable, account for additional covariates and include a random effect in the 
-analysis. 
-
 
diff --git a/man/preprocessDb.Rd b/man/preprocessDb.Rd

Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,7 @@ Authors@R:`
`10`	`10`	`email = "Melody-Jin@outlook.com")`
`11`	`11`	`)`
`12`	`12`	`Depends: R (>= 4.2.0)`
`13`		`-Imports: scuttle, stringr, AnnotationDbi, org.Hs.eg.db, org.Mm.eg.db, stats, grDevices, graphics, speckle, ggplot2, SingleCellExperiment`
	`13`	`+Imports: scuttle, stringr, AnnotationDbi, org.Hs.eg.db, org.Mm.eg.db, stats,SingleCellExperiment, speckle`
`14`	`14`	`VignetteBuilder: knitr`
`15`	`15`	`Suggests: knitr, rmarkdown, CellBench, scater, patchwork`
`16`	`16`	`Description: The cellXY package contains functions for predicting sex labels for single cell RNA sequencing data.`