Skip to content

Commit 6ebfc8c

Browse files
committed
add another check for count matrix gene names
1 parent e44c3bc commit 6ebfc8c

File tree

4 files changed

+97
-66
lines changed

4 files changed

+97
-66
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,5 +30,5 @@ Suggests:
3030
Description: The cellXY package contains functions for predicting sex labels for single cell RNA sequencing data.
3131
License: GPL-3
3232
biocViews: SingleCell
33-
RoxygenNote: 7.2.2
33+
RoxygenNote: 7.3.2
3434
Config/testthat/edition: 3

R/preprocess.R

Lines changed: 70 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -72,17 +72,22 @@
7272
#'
7373
preprocess<- function(x, genome=genome, qc=qc){
7474

75-
x <- as.matrix(x)
76-
row.names(x)<- toupper(row.names(x))
77-
78-
if (length(unique(colnames(x))) != ncol(x)){
75+
x <- as.matrix(x)
76+
row.names(x)<- toupper(row.names(x))
77+
78+
if (is.null(row.names(x))){
79+
stop("Missing rownames for the input count matrix.
80+
Please use gene symbols as rownames.")
81+
}
82+
83+
if (length(unique(colnames(x))) != ncol(x)){
7984
message("Cell names are missing/duplicated. Cells are renamed to cell1 - cell", ncol(x))
8085
colnames(x) = paste(rep("cell", ncol(x)), seq(1, ncol(x)), sep="")
81-
}
82-
# genes located in the X chromosome that have been reported to escape
83-
# X-inactivation
84-
# http://bioinf.wehi.edu.au/software/GenderGenes/index.html
85-
Xgenes<- c("ARHGAP4","STS","ARSD", "ARSL", "AVPR2", "BRS3", "S100G",
86+
}
87+
# genes located in the X chromosome that have been reported to escape
88+
# X-inactivation
89+
# http://bioinf.wehi.edu.au/software/GenderGenes/index.html
90+
Xgenes<- c("ARHGAP4","STS","ARSD", "ARSL", "AVPR2", "BRS3", "S100G",
8691
"CHM", "CLCN4", "DDX3X","EIF1AX","EIF2S3", "GPM6B",
8792
"GRPR", "HCFC1", "L1CAM", "MAOA", "MYCLP1", "NAP1L3",
8893
"GPR143", "CDK16", "PLXNB3", "PRKX", "RBBP7", "RENBP",
@@ -93,78 +98,79 @@ preprocess<- function(x, genome=genome, qc=qc){
9398
"CA5B", "SRPX2", "GEMIN8", "CTPS2", "CLTRN", "NLGN4X",
9499
"DUSP21", "ALG13","SYAP1", "SYTL4", "FUNDC1", "GAB3",
95100
"RIBC1", "FAM9C","CA5BP1")
96-
97-
# genes belonging to the male-specific region of chromosome Y (unique genes)
98-
# http://bioinf.wehi.edu.au/software/GenderGenes/index.html
99-
Ygenes<-c("AMELY", "DAZ1", "PRKY", "RBMY1A1", "RBMY1HP", "RPS4Y1", "SRY",
101+
102+
# genes belonging to the male-specific region of chromosome Y (unique genes)
103+
# http://bioinf.wehi.edu.au/software/GenderGenes/index.html
104+
Ygenes<-c("AMELY", "DAZ1", "PRKY", "RBMY1A1", "RBMY1HP", "RPS4Y1", "SRY",
100105
"TSPY1", "UTY", "ZFY","KDM5D", "USP9Y", "DDX3Y", "PRY", "XKRY",
101106
"BPY2", "VCY", "CDY1", "EIF1AY", "TMSB4Y","CDY2A", "NLGN4Y",
102107
"PCDH11Y", "HSFY1", "TGIF2LY", "TBL1Y", "RPS4Y2", "HSFY2",
103108
"CDY2B", "TXLNGY","CDY1B", "DAZ3", "DAZ2", "DAZ4")
104-
105-
# build artificial genes
106-
Xgene.set <-Xgenes[Xgenes %in% row.names(x)]
107-
Ygene.set <-Ygenes[Ygenes %in% row.names(x)]
108-
cm.new<-as.data.frame(matrix(rep(0, 3*ncol(x)), ncol = ncol(x),nrow = 3))
109-
row.names(cm.new) <- c("XIST","superX","superY")
110-
colnames(cm.new) <- colnames(x)
111-
112-
if ("XIST" %in% row.names(x)) {
109+
110+
# build artificial genes
111+
Xgene.set <-Xgenes[Xgenes %in% row.names(x)]
112+
Ygene.set <-Ygenes[Ygenes %in% row.names(x)]
113+
cm.new<-as.data.frame(matrix(rep(0, 3*ncol(x)), ncol = ncol(x),nrow = 3))
114+
row.names(cm.new) <- c("XIST","superX","superY")
115+
colnames(cm.new) <- colnames(x)
116+
117+
if ("XIST" %in% row.names(x)) {
113118
cm.new["XIST", ]<- x["XIST", ]
114-
}else{
115-
119+
}else{
120+
116121
cm.new["XIST", ]<- 0
117-
}
118-
119-
if (length(Xgene.set)>0){
122+
}
123+
124+
if (length(Xgene.set)>0){
120125
cm.new["superX", ] <-colSums(x[Xgene.set,,drop = FALSE])
121-
}
122-
if (length(Ygene.set)>0){
126+
}
127+
if (length(Ygene.set)>0){
123128
cm.new["superY", ] <-colSums(x[Ygene.set,,drop = FALSE])
124-
}
125-
126-
127-
############################################################################
128-
# Pre-processing
129-
# perform simple QC
130-
# keep a copy of library size
131-
discarded.cells <- NA
132-
if (qc == TRUE){
129+
}
130+
131+
132+
############################################################################
133+
# Pre-processing
134+
# perform simple QC
135+
# keep a copy of library size
136+
discarded.cells <- NA
137+
if (qc == TRUE){
133138
#data.sce <-SingleCellExperiment(assays = list(counts = x))
134139
qcstats <- scuttle::perCellQCMetrics(x)
135140
qcfilter <- scuttle::perCellQCFilters(qcstats)
136141
# save the discarded cells
137142
discarded.cells <- colnames(x[,qcfilter$discard])
138-
143+
139144
# cm.new only contains cells that pass the quality control
140145
cm.new <-cm.new[,!qcfilter$discard]
141-
}
142-
143-
tcm.final <- t(cm.new)
144-
tcm.final <- as.data.frame(tcm.final)
145-
146-
#Do Not Classify
147-
zero.cells <- NA
148-
dnc <- tcm.final$superY==0 & tcm.final$superX==0
149-
150-
if(any(dnc)==TRUE){
146+
}
147+
148+
tcm.final <- t(cm.new)
149+
tcm.final <- as.data.frame(tcm.final)
150+
151+
#Do Not Classify
152+
zero.cells <- NA
153+
dnc <- tcm.final$superY==0 & tcm.final$superX==0
154+
155+
if(any(dnc)==TRUE){
151156
zero.cells <- row.names(tcm.final)[dnc]
152157
message(length(zero.cells), "cell/s are unable to be classified
153158
due to an abundance of zeroes on X and Y chromosome genes\n")
154-
}
155-
tcm.final <- tcm.final[!dnc, ]
156-
157-
cm.new <- cm.new[,!dnc]
158-
159-
cm.lib.size<- colSums(x[,colnames(cm.new)], na.rm=TRUE)
160-
161-
# log-normalisation performed for each cell
162-
# scaling performed for each gene
163-
normsca.cm <- data.frame(lognormCounts(cm.new, log = TRUE,
159+
}
160+
tcm.final <- tcm.final[!dnc, ]
161+
162+
cm.new <- cm.new[,!dnc]
163+
164+
cm.lib.size<- colSums(x[,colnames(cm.new)], na.rm=TRUE)
165+
166+
# log-normalisation performed for each cell
167+
# scaling performed for each gene
168+
normsca.cm <- data.frame(lognormCounts(cm.new, log = TRUE,
164169
prior.count = 0.5,lib.size=cm.lib.size))
165-
data.df <- t(normsca.cm)
166-
data.df <- as.data.frame(data.df)
167-
row.names(data.df) = row.names(tcm.final)
168-
return(list(tcm.final=tcm.final, data.df=data.df, discarded.cells=discarded.cells,
169-
zero.cells=zero.cells))
170+
data.df <- t(normsca.cm)
171+
data.df <- as.data.frame(data.df)
172+
row.names(data.df) = row.names(tcm.final)
173+
return(list(tcm.final=tcm.final, data.df=data.df,
174+
discarded.cells=discarded.cells,
175+
zero.cells=zero.cells))
170176
}

README.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ library(speckle)
4848
library(SingleCellExperiment)
4949
library(CellBench)
5050
library(org.Hs.eg.db)
51-
51+
library(scRNAseq)
5252
sc_data <- load_sc_data()
5353
sc_10x <- sc_data$sc_10x
5454

@@ -62,5 +62,17 @@ sex <- classifySex(counts, genome="Hs")
6262

6363
table(sex$prediction)
6464
boxplot(counts["XIST",]~sex$prediction)
65+
66+
67+
# Mouse data example
68+
sce <- fetchDataset("zilionis-lung-2019", "2023-12-20", path="mouse")
69+
mouse_cm <- counts(sce)
70+
# make sure the row names are the gene symbols
71+
row.names(mouse_cm) <- row.names(sce)
72+
# make sure the column (cell) names are unique
73+
colnames(mouse_cm) <- paste("cell", 1:ncol(sce), sep="_")
74+
75+
mouse_pred <- classifySex(mouse_cm, genome="Mm")
76+
table(mouse_pred$prediction)
6577
```
6678

tests/testthat/test-classifySex.R

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,16 @@ test_that("cm with counts on superY only", {
4646
row.names(exp_result) = colnames(cm_male)
4747
expect_identical(result_male, exp_result)
4848
})
49+
50+
###############################
51+
# count matrix with high counts on superY and no counts on superX
52+
wrong_cm <- as.data.frame(matrix(10, ncol=100, nrow=length(Ygenes)))
53+
54+
row.names(wrong_cm) <- NULL
55+
# make sure the column (cell) names are unique
56+
colnames(wrong_cm) <- paste("cell", 1:ncol(wrong_cm), sep="_")
57+
58+
test_that("Invalid input", {
59+
expect_error(classifySex(x=wrong_cm, genome = "Mm"))
60+
})
61+

0 commit comments

Comments
 (0)