Skip to content

Commit 8976049

Browse files
committed
add find M/F doublet
1 parent 8547a7c commit 8976049

File tree

13 files changed

+240
-115
lines changed

13 files changed

+240
-115
lines changed

.DS_Store

0 Bytes
Binary file not shown.

NAMESPACE

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
# Generated by roxygen2: do not edit by hand
22

33
export(classifySex)
4-
export(normSca)
4+
export(findMfDoublet)
55
export(preprocess)
66
importFrom(AnnotationDbi,select)
77
importFrom(org.Hs.eg.db,org.Hs.eg.db)
88
importFrom(org.Mm.eg.db,org.Mm.eg.db)
99
importFrom(scuttle,perCellQCMetrics)
1010
importFrom(scuttle,quickPerCellQC)
11-
importFrom(speckle,normCounts)
11+
importFrom(stats,median)
1212
importFrom(stats,predict)
1313
importFrom(stringr,str_to_title)

R/classifySex.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,10 +94,10 @@ classifySex<-function(x, genome=NULL, qc = TRUE)
9494

9595
# load trained models
9696
if(genome == "Mm"){
97-
model <- Mm.model
97+
model <- Mm_model
9898
}
9999
else{
100-
model <- Hs.model
100+
model <- Hs_model
101101
}
102102

103103
preds <- predict(model, newdata = data.df)

R/findMFDoublet.R

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
#' Find male-female doublet cells
2+
#'
3+
#' This function will identify male-female doublet cells in scRNA-seq data. The
4+
#' classifier is based on random forest models that have been trained
5+
#' on mouse and human single cell RNA-seq data.
6+
#'
7+
#' @aliases findMfDoublet
8+
#' @param x counts matrix, rows correspond to genes and columns correspond to
9+
#' cells. Row names must be gene symbols.
10+
#' @param genome the genome the data arises from. Current options are
11+
#' human: genome = "Hs" or mouse: genome = "Mm".
12+
#' @param qc logical, indicates whether to perform quality control or not.
13+
#' qc = TRUE will predict cells that pass quality control only and the filtered
14+
#' cells will not be classified. qc = FALSE will predict every cell except the
15+
#' cells with zero counts on *XIST/Xist* and the sum of the Y genes.
16+
#' Default is TRUE.
17+
#'
18+
#' @return a dataframe with predicted labels for each cell
19+
#'
20+
#' @importFrom stats predict
21+
#' @export findMfDoublet
22+
#'
23+
#' @author Xinyi Jin
24+
#'
25+
#' @examples
26+
#'
27+
#' library(speckle)
28+
#' library(SingleCellExperiment)
29+
#' library(CellBench)
30+
#' library(org.Hs.eg.db)
31+
#'
32+
#' sc_data <- load_sc_data()
33+
#' sc_10x <- sc_data$sc_10x
34+
#'
35+
#' counts <- counts(sc_10x)
36+
#' ann <- select(org.Hs.eg.db, keys=rownames(sc_10x),
37+
#' columns=c("ENSEMBL","SYMBOL"), keytype="ENSEMBL")
38+
#' m <- match(rownames(counts), ann$ENSEMBL)
39+
#' rownames(counts) <- ann$SYMBOL[m]
40+
#'
41+
#' sex <- findMfDoublet(counts, genome="Hs")
42+
#'
43+
#' table(sex$prediction)
44+
#' boxplot(counts["XIST",]~sex$prediction)
45+
#'
46+
findMfDoublet<-function(x, genome=NULL, qc = FALSE)
47+
# Classify cells as doublets or single-cell
48+
# Xinyi Jin
49+
# 9 September 2022
50+
{
51+
# Perform some checks on the data
52+
if (is.null(x)) stop("Counts matrix missing")
53+
x <- as.matrix(x)
54+
55+
if (is.null(genome)){
56+
message("Genome not specified. Human genome used. Options are 'Hs' for
57+
human and 'Mm' for mouse. We currently don't support other genomes.")
58+
}
59+
# Default is Hs
60+
genome <- match.arg(genome,c("Hs","Mm"))
61+
62+
# pre-process
63+
processed.data<-preprocess(x, genome = genome, qc = FALSE)
64+
65+
# the processed transposed count matrix
66+
tcm <-processed.data$tcm.final
67+
68+
# the normalised, transposed count matrix
69+
data.df <- processed.data$data.df
70+
71+
# cells that filtered by QC
72+
# discarded.cells <- processed.data$discarded.cells
73+
74+
# cells with zero count on superX and superY
75+
# zero.cells <- processed.data$zero.cells
76+
77+
# store the final predictions
78+
final.pred<-data.frame(prediction=rep("NA", ncol(x)))
79+
row.names(final.pred)<- colnames(x)
80+
81+
# load trained models
82+
if(genome == "Mm"){
83+
model <- Mm_db_model
84+
}
85+
else{
86+
model <- Hs_db_model
87+
}
88+
89+
preds <- predict(model, newdata = data.df)
90+
final.pred[row.names(data.df), "prediction"]<- as.character(preds)
91+
92+
final.pred
93+
}
94+

R/lognormCounts.R

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#' Normalise a counts matrix to the median library size
2+
#'
3+
#' @param x a \code{DGEList} object or matrix of counts.
4+
#' @param log logical, indicates whether the output should be on the log2 scale
5+
#' or counts scale. Default is FALSE.
6+
#' @param prior.count The prior count to add if the data is log2 normalised.
7+
#' Default is a small count of 0.5.
8+
#' @param lib.size a vector of library sizes to be used during the normalisation
9+
#' step. Default is NULL and will be computed from the counts matrix.
10+
#'
11+
#' @return a matrix of normalised counts
12+
#' @importFrom stats median
13+
#' @author Belinda Phipson
14+
#'
15+
lognormCounts <- function(x, log = TRUE, prior.count = 0.5, lib.size)
16+
# Function to log normalise a counts matrix to median library size
17+
# Input is counts matrix
18+
# Genes as rows, cells as columns
19+
# Belinda Phipson
20+
# 26 February 2020
21+
{
22+
x <- as.matrix(x)
23+
#lib.size <- colSums(x)
24+
M <- median(lib.size)
25+
if(log){
26+
prior.count.scaled <- lib.size/mean(lib.size)*prior.count
27+
lib.size <- lib.size + 2*prior.count.scaled
28+
log2(t((t(x)+prior.count.scaled)/lib.size*M))
29+
}
30+
else t(t(x)/lib.size*M)
31+
}
32+

R/normSca.R

Lines changed: 0 additions & 45 deletions
This file was deleted.

R/preprocess.R

Lines changed: 6 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
12
#' Pre-processing function for sex classification
23
#'
34
#' The purpose of this function is to process a single cell counts matrix into
@@ -106,25 +107,6 @@ preprocess<- function(x, genome=genome, qc=qc){
106107
cm.new["superX", ] <-colSums(x[Xgene.set,])
107108
cm.new["superY", ] <-colSums(x[Ygene.set,])
108109

109-
# if (genome == "Mm"){
110-
# ann <- suppressWarnings(AnnotationDbi::select(org.Mm.eg.db,
111-
# keys=str_to_title(row.names(x)),
112-
# columns=c("SYMBOL","GENENAME","CHR"),
113-
# keytype="SYMBOL"))
114-
# }else{
115-
# ann <- suppressWarnings(AnnotationDbi::select(org.Hs.eg.db,
116-
# keys=row.names(x),
117-
# columns=c("SYMBOL","GENENAME","CHR"),
118-
# keytype="SYMBOL"))
119-
# }
120-
# # create superY.all
121-
# Ychr.genes<- toupper(unique(ann[which(ann$CHR=="Y"), "SYMBOL"]))
122-
# missing <- setdiff(Ychr.genes, row.names(x))
123-
# if (length(missing) >0){
124-
# Ychr.genes <- Ychr.genes[-match(missing, Ychr.genes)]
125-
# }
126-
# cm.new["superY.all", ] <- colSums(x[Ychr.genes,])
127-
128110
############################################################################
129111
# Pre-processing
130112
# perform simple QC
@@ -148,19 +130,22 @@ preprocess<- function(x, genome=genome, qc=qc){
148130
#Do Not Classify
149131
zero.cells <- NA
150132
dnc <- tcm.final$superY==0 & tcm.final$superX==0
133+
151134
if(any(dnc)==TRUE){
152135
zero.cells <- row.names(tcm.final)[dnc]
153136
message(length(zero.cells), "cell/s are unable to be classified
154-
due to an abundance of zeroes on X and Y chromosome genes\n")
137+
due to an abundance of zeroes on X and Y chromosome genes\n")
155138
}
156139
tcm.final <- tcm.final[!dnc, ]
157140

158141
cm.new <- cm.new[,!dnc]
142+
159143
cm.lib.size<- colSums(x[,colnames(cm.new)], na.rm=TRUE)
160144

161145
# log-normalisation performed for each cell
162146
# scaling performed for each gene
163-
normsca.cm <- normSca(cm.new, lib.size=cm.lib.size)
147+
normsca.cm <- data.frame(lognormCounts(cm.new, log = TRUE,
148+
prior.count = 0.5,lib.size=cm.lib.size))
164149
data.df <- t(normsca.cm)
165150
data.df <- as.data.frame(data.df)
166151

R/sysdata.rda

-7.72 MB
Binary file not shown.

README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,21 @@
77
The cellXY package currently contains trained models to classify cells as male or
88
female and to predict whether a cell is a male-female doublet or not.
99

10+
The propeller function performs statistical tests for differences in cell
11+
type composition in single cell data. In order to test for differences in cell
12+
type proportions between multiple experimental conditions at least one of the
13+
groups must have some form of biological replication (i.e. at least two
14+
samples). For a two group scenario, the absolute minimum sample size is thus
15+
three. Since there are many technical aspects which can affect cell type
16+
proportion estimates, having biological replication is essential for a
17+
meaningful analysis.
18+
19+
The propeller function takes a SingleCellExperiment or Seurat object as input,
20+
extracts the relevant cell information, and tests whether the cell type
21+
proportions are statistically significantly different between experimental
22+
conditions/groups. The user can also explicitly pass the cluster, sample and
23+
experimental group information to propeller. P-values and false discovery rates
24+
are outputted for each cell type.
1025

1126
## Installation
1227

@@ -60,5 +75,11 @@ sex <- classifySex(counts, genome="Hs")
6075
table(sex$prediction)
6176
boxplot(counts["XIST",]~sex$prediction)
6277
```
78+
Please note that this basic implementation is for when you are only modelling
79+
group information. When you have additional covariates that you would like to
80+
account for, please use the propeller.ttest() and propeller.anova() functions
81+
directly. Please read the vignette for examples on how to model a continuous
82+
variable, account for additional covariates and include a random effect in the
83+
analysis.
6384

6485

man/findMfDoublet.Rd

Lines changed: 54 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)