add another check for count matrix gene names

MelodyJIN-Y · MelodyJIN-Y · commit 6ebfc8c853ce · 2024-11-11T18:19:21.000+11:00
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -30,5 +30,5 @@ Suggests:
 Description: The cellXY package contains functions for predicting sex labels for single cell RNA sequencing data.
 License: GPL-3
 biocViews: SingleCell 
-RoxygenNote: 7.2.2
+RoxygenNote: 7.3.2
 Config/testthat/edition: 3
diff --git a/R/preprocess.R b/R/preprocess.R
@@ -72,17 +72,22 @@
 #'
 preprocess<- function(x, genome=genome, qc=qc){
 
-  x <- as.matrix(x)
-  row.names(x)<- toupper(row.names(x))
-
-  if (length(unique(colnames(x))) != ncol(x)){
+    x <- as.matrix(x)
+    row.names(x)<- toupper(row.names(x))
+    
+    if (is.null(row.names(x))){
+        stop("Missing rownames for the input count matrix. 
+Please use gene symbols as rownames.")
+    }
+    
+    if (length(unique(colnames(x))) != ncol(x)){
     message("Cell names are missing/duplicated. Cells are renamed to cell1 - cell", ncol(x))
     colnames(x) = paste(rep("cell", ncol(x)), seq(1, ncol(x)), sep="")
-  }
-  # genes located in the X chromosome that have been reported to escape
-  # X-inactivation
-  # http://bioinf.wehi.edu.au/software/GenderGenes/index.html
-  Xgenes<- c("ARHGAP4","STS","ARSD", "ARSL", "AVPR2", "BRS3", "S100G",
+    }
+    # genes located in the X chromosome that have been reported to escape
+    # X-inactivation
+    # http://bioinf.wehi.edu.au/software/GenderGenes/index.html
+    Xgenes<- c("ARHGAP4","STS","ARSD", "ARSL", "AVPR2", "BRS3", "S100G",
              "CHM", "CLCN4", "DDX3X","EIF1AX","EIF2S3", "GPM6B",
              "GRPR", "HCFC1", "L1CAM", "MAOA", "MYCLP1", "NAP1L3",
              "GPR143", "CDK16", "PLXNB3", "PRKX", "RBBP7", "RENBP",
@@ -93,78 +98,79 @@ preprocess<- function(x, genome=genome, qc=qc){
              "CA5B", "SRPX2", "GEMIN8", "CTPS2", "CLTRN", "NLGN4X",
              "DUSP21", "ALG13","SYAP1", "SYTL4", "FUNDC1", "GAB3",
              "RIBC1", "FAM9C","CA5BP1")
-
-  # genes belonging to the male-specific region of chromosome Y (unique genes)
-  # http://bioinf.wehi.edu.au/software/GenderGenes/index.html
-  Ygenes<-c("AMELY", "DAZ1", "PRKY", "RBMY1A1", "RBMY1HP", "RPS4Y1", "SRY",
+    
+    # genes belonging to the male-specific region of chromosome Y (unique genes)
+    # http://bioinf.wehi.edu.au/software/GenderGenes/index.html
+    Ygenes<-c("AMELY", "DAZ1", "PRKY", "RBMY1A1", "RBMY1HP", "RPS4Y1", "SRY",
             "TSPY1", "UTY", "ZFY","KDM5D", "USP9Y", "DDX3Y", "PRY", "XKRY",
             "BPY2", "VCY", "CDY1", "EIF1AY", "TMSB4Y","CDY2A", "NLGN4Y",
             "PCDH11Y", "HSFY1", "TGIF2LY", "TBL1Y", "RPS4Y2", "HSFY2",
             "CDY2B", "TXLNGY","CDY1B", "DAZ3", "DAZ2", "DAZ4")
-
-  # build artificial genes
-  Xgene.set <-Xgenes[Xgenes %in% row.names(x)]
-  Ygene.set <-Ygenes[Ygenes %in% row.names(x)]
-  cm.new<-as.data.frame(matrix(rep(0, 3*ncol(x)), ncol = ncol(x),nrow = 3))
-  row.names(cm.new) <- c("XIST","superX","superY")
-  colnames(cm.new) <- colnames(x)
-
-  if ("XIST" %in% row.names(x)) {
+    
+    # build artificial genes
+    Xgene.set <-Xgenes[Xgenes %in% row.names(x)]
+    Ygene.set <-Ygenes[Ygenes %in% row.names(x)]
+    cm.new<-as.data.frame(matrix(rep(0, 3*ncol(x)), ncol = ncol(x),nrow = 3))
+    row.names(cm.new) <- c("XIST","superX","superY")
+    colnames(cm.new) <- colnames(x)
+    
+    if ("XIST" %in% row.names(x)) {
     cm.new["XIST", ]<- x["XIST", ]
-  }else{
-
+    }else{
+    
     cm.new["XIST", ]<- 0
-  }
-
-  if (length(Xgene.set)>0){
+    }
+    
+    if (length(Xgene.set)>0){
     cm.new["superX", ] <-colSums(x[Xgene.set,,drop = FALSE])
-  }
-  if (length(Ygene.set)>0){
+    }
+    if (length(Ygene.set)>0){
     cm.new["superY", ] <-colSums(x[Ygene.set,,drop = FALSE])
-  }
-
-
-  ############################################################################
-  # Pre-processing
-  # perform simple QC
-  # keep a copy of library size
-  discarded.cells <- NA
-  if (qc == TRUE){
+    }
+    
+    
+    ############################################################################
+    # Pre-processing
+    # perform simple QC
+    # keep a copy of library size
+    discarded.cells <- NA
+    if (qc == TRUE){
     #data.sce <-SingleCellExperiment(assays = list(counts = x))
     qcstats <- scuttle::perCellQCMetrics(x)
     qcfilter <- scuttle::perCellQCFilters(qcstats)
     # save the discarded cells
     discarded.cells <- colnames(x[,qcfilter$discard])
-
+    
     # cm.new only contains cells that pass the quality control
     cm.new <-cm.new[,!qcfilter$discard]
-  }
-
-  tcm.final <- t(cm.new)
-  tcm.final <- as.data.frame(tcm.final)
-
-  #Do Not Classify
-  zero.cells <- NA
-  dnc <- tcm.final$superY==0 & tcm.final$superX==0
-
-  if(any(dnc)==TRUE){
+    }
+    
+    tcm.final <- t(cm.new)
+    tcm.final <- as.data.frame(tcm.final)
+    
+    #Do Not Classify
+    zero.cells <- NA
+    dnc <- tcm.final$superY==0 & tcm.final$superX==0
+    
+    if(any(dnc)==TRUE){
     zero.cells <- row.names(tcm.final)[dnc]
     message(length(zero.cells), "cell/s are unable to be classified
               due to an abundance of zeroes on X and Y chromosome genes\n")
-  }
-  tcm.final <- tcm.final[!dnc, ]
-
-  cm.new <- cm.new[,!dnc]
-
-  cm.lib.size<- colSums(x[,colnames(cm.new)], na.rm=TRUE)
-
-  # log-normalisation performed for each cell
-  # scaling performed for each gene
-  normsca.cm <- data.frame(lognormCounts(cm.new, log = TRUE,
+    }
+    tcm.final <- tcm.final[!dnc, ]
+    
+    cm.new <- cm.new[,!dnc]
+    
+    cm.lib.size<- colSums(x[,colnames(cm.new)], na.rm=TRUE)
+    
+    # log-normalisation performed for each cell
+    # scaling performed for each gene
+    normsca.cm <- data.frame(lognormCounts(cm.new, log = TRUE,
                                          prior.count = 0.5,lib.size=cm.lib.size))
-  data.df <- t(normsca.cm)
-  data.df <- as.data.frame(data.df)
-  row.names(data.df) = row.names(tcm.final)
-  return(list(tcm.final=tcm.final, data.df=data.df, discarded.cells=discarded.cells,
-       zero.cells=zero.cells))
+    data.df <- t(normsca.cm)
+    data.df <- as.data.frame(data.df)
+    row.names(data.df) = row.names(tcm.final)
+    return(list(tcm.final=tcm.final, data.df=data.df, 
+                discarded.cells=discarded.cells,
+                zero.cells=zero.cells))
 }
diff --git a/README.md b/README.md
@@ -48,7 +48,7 @@ library(speckle)
 library(SingleCellExperiment)
 library(CellBench)
 library(org.Hs.eg.db)
-
+library(scRNAseq)
 sc_data <- load_sc_data()
 sc_10x <- sc_data$sc_10x
 
@@ -62,5 +62,17 @@ sex <- classifySex(counts, genome="Hs")
 
 table(sex$prediction)
 boxplot(counts["XIST",]~sex$prediction)
+
+
+# Mouse data example 
+sce <- fetchDataset("zilionis-lung-2019", "2023-12-20", path="mouse")
+mouse_cm <- counts(sce)
+# make sure the row names are the gene symbols
+row.names(mouse_cm) <- row.names(sce)
+# make sure the column (cell) names are unique
+colnames(mouse_cm) <- paste("cell", 1:ncol(sce), sep="_")
+
+mouse_pred <- classifySex(mouse_cm, genome="Mm")
+table(mouse_pred$prediction)
 ```
 
diff --git a/tests/testthat/test-classifySex.R b/tests/testthat/test-classifySex.R
@@ -46,3 +46,16 @@ test_that("cm with counts on superY only", {
   row.names(exp_result) = colnames(cm_male)
   expect_identical(result_male, exp_result)
 })
+
+###############################
+# count matrix without row names (no gene symbols): classifySex() should throw an error
+wrong_cm <-  as.data.frame(matrix(10, ncol=100, nrow=length(Ygenes)))
+
+row.names(wrong_cm) <- NULL
+# make sure the column (cell) names are unique
+colnames(wrong_cm) <- paste("cell", 1:ncol(wrong_cm), sep="_")
+
+test_that("Invalid input", {
+    expect_error(classifySex(x=wrong_cm, genome = "Mm"))
+})
+