72
72
# '
73
73
preprocess <- function (x , genome = genome , qc = qc ){
74
74
75
- x <- as.matrix(x )
76
- row.names(x )<- toupper(row.names(x ))
77
-
78
- if (length(unique(colnames(x ))) != ncol(x )){
75
+ x <- as.matrix(x )
76
+ row.names(x )<- toupper(row.names(x ))
77
+
78
+ if (is.null(row.names(x ))){
79
+ stop(" Missing rownames for the input count matrix.
80
+ Please use gene symbols as rownames." )
81
+ }
82
+
83
+ if (length(unique(colnames(x ))) != ncol(x )){
79
84
message(" Cell names are missing/duplicated. Cells are renamed to cell1 - cell" , ncol(x ))
80
85
colnames(x ) = paste(rep(" cell" , ncol(x )), seq(1 , ncol(x )), sep = " " )
81
- }
82
- # genes located in the X chromosome that have been reported to escape
83
- # X-inactivation
84
- # http://bioinf.wehi.edu.au/software/GenderGenes/index.html
85
- Xgenes <- c(" ARHGAP4" ," STS" ," ARSD" , " ARSL" , " AVPR2" , " BRS3" , " S100G" ,
86
+ }
87
+ # genes located in the X chromosome that have been reported to escape
88
+ # X-inactivation
89
+ # http://bioinf.wehi.edu.au/software/GenderGenes/index.html
90
+ Xgenes <- c(" ARHGAP4" ," STS" ," ARSD" , " ARSL" , " AVPR2" , " BRS3" , " S100G" ,
86
91
" CHM" , " CLCN4" , " DDX3X" ," EIF1AX" ," EIF2S3" , " GPM6B" ,
87
92
" GRPR" , " HCFC1" , " L1CAM" , " MAOA" , " MYCLP1" , " NAP1L3" ,
88
93
" GPR143" , " CDK16" , " PLXNB3" , " PRKX" , " RBBP7" , " RENBP" ,
@@ -93,78 +98,79 @@ preprocess<- function(x, genome=genome, qc=qc){
93
98
" CA5B" , " SRPX2" , " GEMIN8" , " CTPS2" , " CLTRN" , " NLGN4X" ,
94
99
" DUSP21" , " ALG13" ," SYAP1" , " SYTL4" , " FUNDC1" , " GAB3" ,
95
100
" RIBC1" , " FAM9C" ," CA5BP1" )
96
-
97
- # genes belonging to the male-specific region of chromosome Y (unique genes)
98
- # http://bioinf.wehi.edu.au/software/GenderGenes/index.html
99
- Ygenes <- c(" AMELY" , " DAZ1" , " PRKY" , " RBMY1A1" , " RBMY1HP" , " RPS4Y1" , " SRY" ,
101
+
102
+ # genes belonging to the male-specific region of chromosome Y (unique genes)
103
+ # http://bioinf.wehi.edu.au/software/GenderGenes/index.html
104
+ Ygenes <- c(" AMELY" , " DAZ1" , " PRKY" , " RBMY1A1" , " RBMY1HP" , " RPS4Y1" , " SRY" ,
100
105
" TSPY1" , " UTY" , " ZFY" ," KDM5D" , " USP9Y" , " DDX3Y" , " PRY" , " XKRY" ,
101
106
" BPY2" , " VCY" , " CDY1" , " EIF1AY" , " TMSB4Y" ," CDY2A" , " NLGN4Y" ,
102
107
" PCDH11Y" , " HSFY1" , " TGIF2LY" , " TBL1Y" , " RPS4Y2" , " HSFY2" ,
103
108
" CDY2B" , " TXLNGY" ," CDY1B" , " DAZ3" , " DAZ2" , " DAZ4" )
104
-
105
- # build artificial genes
106
- Xgene.set <- Xgenes [Xgenes %in% row.names(x )]
107
- Ygene.set <- Ygenes [Ygenes %in% row.names(x )]
108
- cm.new <- as.data.frame(matrix (rep(0 , 3 * ncol(x )), ncol = ncol(x ),nrow = 3 ))
109
- row.names(cm.new ) <- c(" XIST" ," superX" ," superY" )
110
- colnames(cm.new ) <- colnames(x )
111
-
112
- if (" XIST" %in% row.names(x )) {
109
+
110
+ # build artificial genes
111
+ Xgene.set <- Xgenes [Xgenes %in% row.names(x )]
112
+ Ygene.set <- Ygenes [Ygenes %in% row.names(x )]
113
+ cm.new <- as.data.frame(matrix (rep(0 , 3 * ncol(x )), ncol = ncol(x ),nrow = 3 ))
114
+ row.names(cm.new ) <- c(" XIST" ," superX" ," superY" )
115
+ colnames(cm.new ) <- colnames(x )
116
+
117
+ if (" XIST" %in% row.names(x )) {
113
118
cm.new [" XIST" , ]<- x [" XIST" , ]
114
- }else {
115
-
119
+ }else {
120
+
116
121
cm.new [" XIST" , ]<- 0
117
- }
118
-
119
- if (length(Xgene.set )> 0 ){
122
+ }
123
+
124
+ if (length(Xgene.set )> 0 ){
120
125
cm.new [" superX" , ] <- colSums(x [Xgene.set ,,drop = FALSE ])
121
- }
122
- if (length(Ygene.set )> 0 ){
126
+ }
127
+ if (length(Ygene.set )> 0 ){
123
128
cm.new [" superY" , ] <- colSums(x [Ygene.set ,,drop = FALSE ])
124
- }
125
-
126
-
127
- # ###########################################################################
128
- # Pre-processing
129
- # perform simple QC
130
- # keep a copy of library size
131
- discarded.cells <- NA
132
- if (qc == TRUE ){
129
+ }
130
+
131
+
132
+ # ###########################################################################
133
+ # Pre-processing
134
+ # perform simple QC
135
+ # keep a copy of library size
136
+ discarded.cells <- NA
137
+ if (qc == TRUE ){
133
138
# data.sce <-SingleCellExperiment(assays = list(counts = x))
134
139
qcstats <- scuttle :: perCellQCMetrics(x )
135
140
qcfilter <- scuttle :: perCellQCFilters(qcstats )
136
141
# save the discarded cells
137
142
discarded.cells <- colnames(x [,qcfilter $ discard ])
138
-
143
+
139
144
# cm.new only contains cells that pass the quality control
140
145
cm.new <- cm.new [,! qcfilter $ discard ]
141
- }
142
-
143
- tcm.final <- t(cm.new )
144
- tcm.final <- as.data.frame(tcm.final )
145
-
146
- # Do Not Classify
147
- zero.cells <- NA
148
- dnc <- tcm.final $ superY == 0 & tcm.final $ superX == 0
149
-
150
- if (any(dnc )== TRUE ){
146
+ }
147
+
148
+ tcm.final <- t(cm.new )
149
+ tcm.final <- as.data.frame(tcm.final )
150
+
151
+ # Do Not Classify
152
+ zero.cells <- NA
153
+ dnc <- tcm.final $ superY == 0 & tcm.final $ superX == 0
154
+
155
+ if (any(dnc )== TRUE ){
151
156
zero.cells <- row.names(tcm.final )[dnc ]
152
157
message(length(zero.cells ), " cell/s are unable to be classified
153
158
due to an abundance of zeroes on X and Y chromosome genes\n " )
154
- }
155
- tcm.final <- tcm.final [! dnc , ]
156
-
157
- cm.new <- cm.new [,! dnc ]
158
-
159
- cm.lib.size <- colSums(x [,colnames(cm.new )], na.rm = TRUE )
160
-
161
- # log-normalisation performed for each cell
162
- # scaling performed for each gene
163
- normsca.cm <- data.frame (lognormCounts(cm.new , log = TRUE ,
159
+ }
160
+ tcm.final <- tcm.final [! dnc , ]
161
+
162
+ cm.new <- cm.new [,! dnc ]
163
+
164
+ cm.lib.size <- colSums(x [,colnames(cm.new )], na.rm = TRUE )
165
+
166
+ # log-normalisation performed for each cell
167
+ # scaling performed for each gene
168
+ normsca.cm <- data.frame (lognormCounts(cm.new , log = TRUE ,
164
169
prior.count = 0.5 ,lib.size = cm.lib.size ))
165
- data.df <- t(normsca.cm )
166
- data.df <- as.data.frame(data.df )
167
- row.names(data.df ) = row.names(tcm.final )
168
- return (list (tcm.final = tcm.final , data.df = data.df , discarded.cells = discarded.cells ,
169
- zero.cells = zero.cells ))
170
+ data.df <- t(normsca.cm )
171
+ data.df <- as.data.frame(data.df )
172
+ row.names(data.df ) = row.names(tcm.final )
173
+ return (list (tcm.final = tcm.final , data.df = data.df ,
174
+ discarded.cells = discarded.cells ,
175
+ zero.cells = zero.cells ))
170
176
}
0 commit comments