|
| 1 | + |
| 2 | +#' Pre-processing function for sex classification |
| 3 | +#' |
| 4 | +#' The purpose of this function is to process a single cell counts matrix into |
| 5 | +#' the appropriate format for the \code{classifySex} function. |
| 6 | +#' |
| 7 | +#' This function will filter out cells that are unable to be classified due to |
| 8 | +#' zero counts on *XIST/Xist* and all of the Y chromosome genes. If |
| 9 | +#' \code{qc=TRUE} additional cells are removed as identified by the |
| 10 | +#' \code{perCellQCMetrics} and \code{quickPerCellQC} functions from the |
| 11 | +#' \code{scuttle} package. The resulting counts matrix is then log-normalised |
| 12 | +#' and scaled. |
| 13 | +#' |
| 14 | +#' @param x the counts matrix, rows are genes and columns are cells. Row names |
| 15 | +#' must be gene symbols. |
| 16 | +#' @param genome the genome the data arises from. Current options are |
| 17 | +#' human: genome = "Hs" or mouse: genome = "Mm". |
| 18 | +#' @param qc logical, indicates whether to perform additional quality control |
| 19 | +#' on the cells. qc = TRUE will predict cells that pass quality control only |
| 20 | +#' and the filtered cells will not be classified. qc = FALSE will predict |
| 21 | +#' every cell except the cells with zero counts on *XIST/Xist* and the sum |
| 22 | +#' of the Y genes. Default is TRUE. |
| 23 | +#' |
| 24 | +#' @return outputs a list object with the following components |
| 25 | +#' \item{tcm.final }{A transposed count matrix where rows are cells and columns |
| 26 | +#' are the features used for classification.} |
| 27 | +#' \item{data.df }{The normalised and scaled \code{tcm.final} matrix.} |
| 28 | +#' \item{discarded.cells }{Character vector of cell IDs for the cells that are |
| 29 | +#' discarded when \code{qc=TRUE}.} |
| 30 | +#' \item{zero.cells }{Character vector of cell IDs for the cells that can not |
| 31 | +#' be classified as male/female due to zero counts on *Xist* and all the |
| 32 | +#' Y chromosome genes.} |
| 33 | +#' |
| 34 | +#' @importFrom AnnotationDbi select |
| 35 | +#' @importFrom stringr str_to_title |
| 36 | +#' @importFrom scuttle perCellQCMetrics |
| 37 | +#' @importFrom scuttle quickPerCellQC |
| 38 | +#' @importFrom org.Hs.eg.db org.Hs.eg.db |
| 39 | +#' @importFrom org.Mm.eg.db org.Mm.eg.db |
| 40 | +#' @export preprocessDb |
| 41 | +#' |
| 42 | + |
| 43 | +preprocessDb<- function(x, genome=genome, qc=qc){ |
| 44 | + |
| 45 | + x <- as.matrix(x) |
| 46 | + row.names(x)<- toupper(row.names(x)) |
| 47 | + # genes located in the X chromosome that have been reported to escape |
| 48 | + # X-inactivation |
| 49 | + # http://bioinf.wehi.edu.au/software/GenderGenes/index.html |
| 50 | + Xgenes<- c("ARHGAP4","STS","ARSD", "ARSL", "AVPR2", "BRS3", "S100G", |
| 51 | + "CHM", "CLCN4", "DDX3X","EIF1AX","EIF2S3", "GPM6B", |
| 52 | + "GRPR", "HCFC1", "L1CAM", "MAOA", "MYCLP1", "NAP1L3", |
| 53 | + "GPR143", "CDK16", "PLXNB3", "PRKX", "RBBP7", "RENBP", |
| 54 | + "RPS4X", "TRAPPC2", "SH3BGRL", "TBL1X","UBA1", "KDM6A", |
| 55 | + "XG", "XIST", "ZFX", "PUDP", "PNPLA4", "USP9X", "KDM5C", |
| 56 | + "SMC1A", "NAA10", "OFD1", "IKBKG", "PIR", "INE2", "INE1", |
| 57 | + "AP1S2", "GYG2", "MED14", "RAB9A", "ITM2A", "MORF4L2", |
| 58 | + "CA5B", "SRPX2", "GEMIN8", "CTPS2", "CLTRN", "NLGN4X", |
| 59 | + "DUSP21", "ALG13","SYAP1", "SYTL4", "FUNDC1", "GAB3", |
| 60 | + "RIBC1", "FAM9C","CA5BP1") |
| 61 | + |
| 62 | + # genes belonging to the male-specific region of chromosome Y (unique genes) |
| 63 | + # http://bioinf.wehi.edu.au/software/GenderGenes/index.html |
| 64 | + Ygenes<-c("AMELY", "DAZ1", "PRKY", "RBMY1A1", "RBMY1HP", "RPS4Y1", "SRY", |
| 65 | + "TSPY1", "UTY", "ZFY","KDM5D", "USP9Y", "DDX3Y", "PRY", "XKRY", |
| 66 | + "BPY2", "VCY", "CDY1", "EIF1AY", "TMSB4Y","CDY2A", "NLGN4Y", |
| 67 | + "PCDH11Y", "HSFY1", "TGIF2LY", "TBL1Y", "RPS4Y2", "HSFY2", |
| 68 | + "CDY2B", "TXLNGY","CDY1B", "DAZ3", "DAZ2", "DAZ4") |
| 69 | + |
| 70 | + # build artificial genes |
| 71 | + Xgene.set <-Xgenes[Xgenes %in% row.names(x)] |
| 72 | + Ygene.set <-Ygenes[Ygenes %in% row.names(x)] |
| 73 | + cm.new<-as.data.frame(matrix(rep(0, 3*ncol(x)), ncol = ncol(x),nrow = 3)) |
| 74 | + row.names(cm.new) <- c("XIST","superX","superY") |
| 75 | + colnames(cm.new) <- colnames(x) |
| 76 | + cm.new["XIST", ]<- x["XIST", ] |
| 77 | + cm.new["superX", ] <-colSums(x[Xgene.set,]) |
| 78 | + cm.new["superY", ] <-colSums(x[Ygene.set,]) |
| 79 | + |
| 80 | + ############################################################################ |
| 81 | + # Pre-processing |
| 82 | + # perform simple QC |
| 83 | + # keep a copy of library size |
| 84 | + discarded.cells <- NA |
| 85 | + if (qc == TRUE){ |
| 86 | + #data.sce <-SingleCellExperiment(assays = list(counts = x)) |
| 87 | + qcstats <- scuttle::perCellQCMetrics(x,subsets=list(Mito=1:100)) |
| 88 | + qcfilter <- scuttle::quickPerCellQC(qcstats, |
| 89 | + percent_subsets=c("subsets_Mito_percent")) |
| 90 | + # save the discarded cells |
| 91 | + discarded.cells <- colnames(x[,qcfilter$discard]) |
| 92 | + |
| 93 | + # cm.new only contains cells that pass the quality control |
| 94 | + cm.new <-cm.new[,!qcfilter$discard] |
| 95 | + } |
| 96 | + |
| 97 | + tcm.final <- t(cm.new) |
| 98 | + tcm.final <- as.data.frame(tcm.final) |
| 99 | + |
| 100 | + # Do Not Classify |
| 101 | + # zero.cells <- NA |
| 102 | + # dnc <- tcm.final$superY==0 & tcm.final$superX==0 |
| 103 | + # |
| 104 | + # if(any(dnc)==TRUE){ |
| 105 | + # zero.cells <- row.names(tcm.final)[dnc] |
| 106 | + # message(length(zero.cells), "cell/s are unable to be classified |
| 107 | + # due to an abundance of zeroes on X and Y chromosome genes\n") |
| 108 | + # } |
| 109 | + # tcm.final <- tcm.final[!dnc, ] |
| 110 | + # |
| 111 | + # cm.new <- cm.new[,!dnc] |
| 112 | + |
| 113 | + cm.lib.size<- colSums(x[,colnames(cm.new)], na.rm=TRUE) |
| 114 | + med.ls = median(cm.lib.size) |
| 115 | + |
| 116 | + |
| 117 | + # log-normalisation performed for each cell |
| 118 | + # scaling performed for each gene |
| 119 | + normsca.cm <- data.frame(lognormCounts(cm.new, log = TRUE, |
| 120 | + prior.count = 0.5,lib.size=cm.lib.size)) |
| 121 | + data.df <- t(normsca.cm) |
| 122 | + data.df <- as.data.frame(data.df) |
| 123 | + data.df$med.ls = cm.lib.size/med.ls |
| 124 | + tcm.final$med.ls = cm.lib.size/med.ls |
| 125 | + |
| 126 | + list(tcm.final=tcm.final, data.df=data.df, discarded.cells=discarded.cells) |
| 127 | +} |
0 commit comments