phipsonlab
diff --git a/‎.DS_Store
0 Bytes b/‎.DS_Store
0 Bytes
diff --git a/‎NAMESPACE
Lines changed: 2 additions & 2 deletions b/‎NAMESPACE
Lines changed: 2 additions & 2 deletions
diff --git a/‎R/classifySex.R
Lines changed: 2 additions & 2 deletions b/‎R/classifySex.R
Lines changed: 2 additions & 2 deletions
diff --git a/‎R/findMFDoublet.R
Lines changed: 94 additions & 0 deletions b/‎R/findMFDoublet.R
Lines changed: 94 additions & 0 deletions
diff --git a/‎R/lognormCounts.R
Lines changed: 32 additions & 0 deletions b/‎R/lognormCounts.R
Lines changed: 32 additions & 0 deletions
diff --git a/‎R/normSca.R
Lines changed: 0 additions & 45 deletions b/‎R/normSca.R
Lines changed: 0 additions & 45 deletions
diff --git a/‎R/preprocess.R
Lines changed: 6 additions & 21 deletions b/‎R/preprocess.R
Lines changed: 6 additions & 21 deletions
diff --git a/‎R/sysdata.rda
-7.72 MB b/‎R/sysdata.rda
-7.72 MB
diff --git a/‎README.md
Lines changed: 21 additions & 0 deletions b/‎README.md
Lines changed: 21 additions & 0 deletions
diff --git a/‎man/findMfDoublet.Rd
Lines changed: 54 additions & 0 deletions b/‎man/findMfDoublet.Rd
Lines changed: 54 additions & 0 deletions
@@ -1,13 +1,13 @@
 # Generated by roxygen2: do not edit by hand
 
 export(classifySex)
-export(normSca)
+export(findMfDoublet)
 export(preprocess)
 importFrom(AnnotationDbi,select)
 importFrom(org.Hs.eg.db,org.Hs.eg.db)
 importFrom(org.Mm.eg.db,org.Mm.eg.db)
 importFrom(scuttle,perCellQCMetrics)
 importFrom(scuttle,quickPerCellQC)
-importFrom(speckle,normCounts)
+importFrom(stats,median)
 importFrom(stats,predict)
 importFrom(stringr,str_to_title)
@@ -94,10 +94,10 @@ classifySex<-function(x, genome=NULL, qc = TRUE)
 
   # load trained models
   if(genome == "Mm"){
-    model <- Mm.model
+    model <- Mm_model
   }
   else{
-    model <- Hs.model
+    model <- Hs_model
   }
 
   preds <- predict(model, newdata = data.df)
 
@@ -0,0 +1,94 @@
+#' Find male-female doublet cells
+#'
+#' This function will identify male-female doublet cells in scRNA-seq data. The
+#' classifier is based on random forest models that have been trained
+#' on mouse and human single cell RNA-seq data.
+#'
+#' @aliases findMfDoublet
+#' @param x counts matrix, rows correspond to genes and columns correspond to
+#' cells. Row names must be gene symbols.
+#' @param genome the genome the data arises from. Current options are
+#' human: genome = "Hs" or mouse: genome = "Mm". If NULL (the default), a
+#' message is printed and the human genome is used.
+#' @param qc logical, indicates whether to perform quality control or not.
+#' qc = TRUE will predict cells that pass quality control only and the filtered
+#' cells will not be classified. qc = FALSE will predict every cell except the
+#' cells with zero counts on *XIST/Xist* and the sum of the Y genes.
+#' Default is FALSE.
+#'
+#' @return a dataframe with one row per column of \code{x} and a single
+#' character column \code{prediction}; cells that were not classified keep
+#' the string "NA".
+#'
+#' @importFrom stats predict
+#' @export findMfDoublet
+#'
+#' @author Xinyi Jin
+#'
+#' @examples
+#'
+#' library(cellXY)
+#' library(SingleCellExperiment)
+#' library(CellBench)
+#' library(org.Hs.eg.db)
+#'
+#' sc_data <- load_sc_data()
+#' sc_10x <- sc_data$sc_10x
+#'
+#' counts <- counts(sc_10x)
+#' ann <- select(org.Hs.eg.db, keys=rownames(sc_10x),
+#'              columns=c("ENSEMBL","SYMBOL"), keytype="ENSEMBL")
+#' m <- match(rownames(counts), ann$ENSEMBL)
+#' rownames(counts) <- ann$SYMBOL[m]
+#'
+#' sex <- findMfDoublet(counts, genome="Hs")
+#'
+#' table(sex$prediction)
+#' boxplot(counts["XIST",]~sex$prediction)
+#'
+findMfDoublet<-function(x, genome=NULL, qc = FALSE)
+#    Classify cells as doublets or single-cell
+#    Xinyi Jin
+#    9 September 2022
+{
+    # Perform some checks on the data
+    if (is.null(x)) stop("Counts matrix missing")
+    x <- as.matrix(x)
+
+    if (is.null(genome)){
+        message("Genome not specified. Human genome used. Options are 'Hs' for
+        human and 'Mm' for mouse. We currently don't support other genomes.")
+    }
+    # match.arg() returns the first choice when genome is NULL, so the
+    # default is Hs
+    genome <- match.arg(genome,c("Hs","Mm"))
+
+    # pre-process; the caller's qc choice is passed through so that qc = TRUE
+    # restricts prediction to cells that pass quality control
+    processed.data<-preprocess(x, genome = genome, qc = qc)
+
+    # the normalised, transposed count matrix
+    data.df <- processed.data$data.df
+
+    # store the final predictions; every cell starts with the string "NA" and
+    # only the rows present in data.df are overwritten below
+    final.pred<-data.frame(prediction=rep("NA", ncol(x)),
+                           stringsAsFactors = FALSE)
+    row.names(final.pred)<- colnames(x)
+
+    # load trained models; Mm_db_model and Hs_db_model are not defined in this
+    # function - presumably internal package data, TODO confirm
+    if(genome == "Mm"){
+        model <- Mm_db_model
+    }
+    else{
+        model <- Hs_db_model
+    }
+
+    # predict on the processed cells and write the labels back by cell name
+    preds <- predict(model, newdata = data.df)
+    final.pred[row.names(data.df), "prediction"]<- as.character(preds)
+
+    final.pred
+}
+
@@ -0,0 +1,32 @@
+#' Normalise a counts matrix to the median library size
+#'
+#' @param x a \code{DGEList} object or matrix of counts.
+#' @param log logical, indicates whether the output should be on the log2 scale
+#' or counts scale. Default is TRUE.
+#' @param prior.count The prior count to add if the data is log2 normalised.
+#' Default is a small count of 0.5.
+#' @param lib.size a vector of library sizes to be used during the normalisation
+#' step. Default is NULL and will be computed from the counts matrix.
+#'
+#' @return a matrix of normalised counts, on the log2 scale when log = TRUE
+#' @importFrom stats median
+#' @author Belinda Phipson
+#'
+lognormCounts <- function(x, log = TRUE, prior.count = 0.5, lib.size = NULL)
+  # Function to log normalise a counts matrix to median library size
+  # Input is counts matrix: genes as rows, cells as columns
+  # Belinda Phipson, 26 February 2020
+{
+  x <- as.matrix(x)
+  # fall back to the column sums when no library sizes are supplied
+  if (is.null(lib.size)) lib.size <- colSums(x)
+  M <- median(lib.size)
+  if(log){
+    # scale the prior count by each library size relative to the mean
+    prior.count.scaled <- lib.size/mean(lib.size)*prior.count
+    lib.size <- lib.size + 2*prior.count.scaled
+    log2(t((t(x)+prior.count.scaled)/lib.size*M))
+  }
+  else t(t(x)/lib.size*M)
+}
+
@@ -1,3 +1,4 @@
+
 #' Pre-processing function for sex classification
 #'
 #' The purpose of this function is to process a single cell counts matrix into
@@ -106,25 +107,6 @@ preprocess<- function(x, genome=genome, qc=qc){
   cm.new["superX", ] <-colSums(x[Xgene.set,])
   cm.new["superY", ] <-colSums(x[Ygene.set,])
 
-  #  if (genome == "Mm"){
-  #    ann <- suppressWarnings(AnnotationDbi::select(org.Mm.eg.db,
-  #                                   keys=str_to_title(row.names(x)),
-  #                                   columns=c("SYMBOL","GENENAME","CHR"),
-  #                                  keytype="SYMBOL"))
-  #  }else{
-  #    ann <- suppressWarnings(AnnotationDbi::select(org.Hs.eg.db,
-  #                                   keys=row.names(x),
-  #                                   columns=c("SYMBOL","GENENAME","CHR"),
-  #                                   keytype="SYMBOL"))
-  #  }
-  #  # create  superY.all
-  #  Ychr.genes<- toupper(unique(ann[which(ann$CHR=="Y"), "SYMBOL"]))
-  #  missing <- setdiff(Ychr.genes, row.names(x))
-  #  if (length(missing) >0){
-  #    Ychr.genes <- Ychr.genes[-match(missing, Ychr.genes)]
-  #  }
-  #  cm.new["superY.all", ] <- colSums(x[Ychr.genes,])
-
   ############################################################################
   # Pre-processing
   # perform simple QC
@@ -148,19 +130,22 @@ preprocess<- function(x, genome=genome, qc=qc){
   #Do Not Classify
   zero.cells <- NA
   dnc <- tcm.final$superY==0 & tcm.final$superX==0
+
   if(any(dnc)==TRUE){
     zero.cells <- row.names(tcm.final)[dnc]
     message(length(zero.cells), "cell/s are unable to be classified
-                due to an abundance of zeroes on X and Y chromosome genes\n")
+              due to an abundance of zeroes on X and Y chromosome genes\n")
   }
   tcm.final <- tcm.final[!dnc, ]
 
   cm.new <- cm.new[,!dnc]
+
   cm.lib.size<- colSums(x[,colnames(cm.new)], na.rm=TRUE)
 
   # log-normalisation performed for each cell
   # scaling performed for each gene
-  normsca.cm <- normSca(cm.new, lib.size=cm.lib.size)
+  normsca.cm <- data.frame(lognormCounts(cm.new, log = TRUE,
+                                         prior.count = 0.5,lib.size=cm.lib.size))
   data.df <- t(normsca.cm)
   data.df <- as.data.frame(data.df)
 
 
@@ -7,6 +7,21 @@
 The cellXY package currently contains trained models to classify cells as male or 
 female and to predict whether a cell is a male-female doublet or not. 
 
+The propeller function performs statistical tests for differences in cell
+type composition in single cell data. In order to test for differences in cell
+type proportions between multiple experimental conditions at least one of the 
+groups must have some form of biological replication (i.e. at least two 
+samples). For a two group scenario, the absolute minimum sample size is thus 
+three. Since there are many technical aspects which can affect cell type 
+proportion estimates, having biological replication is essential for a 
+meaningful analysis.
+
+The propeller function takes a SingleCellExperiment or Seurat object as input,
+extracts the relevant cell information, and tests whether the cell type 
+proportions are statistically significantly different between experimental
+conditions/groups. The user can also explicitly pass the cluster, sample and 
+experimental group information to propeller. P-values and false discovery rates 
+are reported for each cell type.
 
 ## Installation
 
@@ -60,5 +75,11 @@ sex <- classifySex(counts, genome="Hs")
 table(sex$prediction)
 boxplot(counts["XIST",]~sex$prediction)
 ```
+Please note that this basic implementation is for when you are only modelling
+group information. When you have additional covariates that you would like to 
+account for, please use the propeller.ttest() and propeller.anova() functions
+directly. Please read the vignette for examples on how to model a continuous 
+variable, account for additional covariates and include a random effect in the 
+analysis.
Original file line number	Diff line number	Diff line change
`@@ -94,10 +94,10 @@ classifySex<-function(x, genome=NULL, qc = TRUE)`
`94`	`94`
`95`	`95`	`# load trained models`
`96`	`96`	`if(genome == "Mm"){`
`97`		`- model <- Mm.model`
	`97`	`+ model <- Mm_model`
`98`	`98`	`}`
`99`	`99`	`else{`
`100`		`- model <- Hs.model`
	`100`	`+ model <- Hs_model`
`101`	`101`	`}`
`102`	`102`
`103`	`103`	`preds <- predict(model, newdata = data.df)`