updating plots and preparing for release of version 4

Edwin · Edwin · commit 6a7b7ea26391 · 2021-07-15T12:31:27.000-07:00
diff --git a/doc.md b/doc.md
@@ -1,12 +1,12 @@
-# Seurat.Preprocessing (v1)
+# Seurat.Preprocessing (v4.0)
 ---
 **Description**: GenePattern module which implements the preprocessing steps for Seurat. You may need to run this module multiple times if you want to change the filtering step.
 
-**Author**: Edwin Juárez
+**Author**: Edwin Juárez and Jonathan Zamora
 
 **Contact**: [Forum Link](https://groups.google.com/forum/?utm_medium=email&utm_source=footer#!forum/genepattern-help)
 
-**Algorithm Version**: Seurat 3.2.0
+**Algorithm Version**: Seurat 4.0.3
 
 ---
 
@@ -22,6 +22,10 @@ The `Seurat.Preprocessing` Module aims to provide a way to integrate the multipl
 
 [Seurat](https://satijalab.org/seurat/)
 
+[Module's GitHub repository](https://github.com/genepattern/Seurat.Preprocessing/tree/v4)
+
+[Module's Docker container](https://hub.docker.com/layers/genepattern/seurat-suite/4.0.3/images/sha256-8d3f5fcae1cf4034cfc9aa87a2e3ea352b89073c948a4fa885670a1eebf16721?context=repo)
+
 ### Technical Notes
 
 
@@ -30,7 +34,7 @@ The `Seurat.Preprocessing` Module aims to provide a way to integrate the multipl
 
 | Name | Description |
 -------|--------------
-| tenx_data_dir         | `.tar.gz` or `.zip` file input that contains the  raw single cell data -- currently only 10x data is supported.|
+| input_rds         | RDS file created by Seurat.QC|
 | column_name            | 	Column name for the percentage of mitochondrial genes [note: not all datasets have this column; those that do often name it percent.mt].|
 | pattern        | 	what pattern to use to label mitochondrial genes [often times this is annotated as MT-].|
 | file_name      | 	Basename of the file to be saved.|
@@ -48,6 +52,7 @@ The `Seurat.Preprocessing` Module aims to provide a way to integrate the multipl
 |feat_sel_method|Method for feature selection. You should probably not change this unless you really know what you are doing.|
 |num_features|	Number of top features to color during feature selection.|
 |num_to_label|Number of top features to label.|
+|numpcs|Number of principal components to compute (default=50).|
 |vdl_num_dims|Number of PCA dimensions to visualize.|
 |vdhm_num_dims|	Number of dimensions for the dimensional reduction heatmap and the elbow plot.|
 |cells|Number of top cells to plot.|
@@ -81,4 +86,5 @@ The `Seurat.Preprocessing` Module aims to provide a way to integrate the multipl
 
 | Version | Release Date | Description                                 |
 ----------|--------------|---------------------------------------------|
+| 4.0       | 2021-07-15          | Updating to Seurat 4.0.3 |
 | 1       | 2020-11-16          | Initial Release of `Seurat.Preprocessing` |
diff --git a/manifest b/manifest
@@ -1,13 +1,13 @@
-# Seurat.Preprocessing  urn:lsid:genepattern.org:module.analysis:00415:3.2
+# Seurat.Preprocessing  urn:lsid:genepattern.org:module.analysis:00415:4.0
 #Wed Jul 14 04:57:38 UTC 2021
 JVMLevel=
-LSID=urn\:lsid\:genepattern.org\:module.analysis\:00415\:3.2
+LSID=urn\:lsid\:genepattern.org\:module.analysis\:00415\:4.0
 author=Edwin F. Juarez;UCSD - Mesirov Lab
 categories=preprocess & utilities;single-cell
-commandLine=Rscript --no-save --quiet --no-restore <libdir>seurat_preprocess.R --input_rds <input_rds> --min_n_features <min_n_features> --max_n_features <max_n_features> --max_percent_mitochondrial <max_percent_mitochondrial> --norm_method <norm_method> --scale_factor <scale_factor> --feat_sel_method <feat_sel_method> --num_features <num_features> --num_to_label <num_to_label> --vdl_num_dims <vdl_num_dims> --vdhm_num_dims <vdhm_num_dims> --cells <cells> --file_name <file_name> --keep_scale_data <keep_scale_data>
+commandLine=Rscript --no-save --quiet --no-restore <libdir>seurat_preprocess.R --input_rds <input_rds> --min_n_features <min_n_features> --max_n_features <max_n_features> --max_percent_mitochondrial <max_percent_mitochondrial> --norm_method <norm_method> --scale_factor <scale_factor> --feat_sel_method <feat_sel_method> --num_features <num_features> --num_to_label <num_to_label> --vdl_num_dims <vdl_num_dims> --vdhm_num_dims <vdhm_num_dims> --cells <cells> --file_name <file_name> --keep_scale_data <keep_scale_data> --numpcs <numpcs>
 cpuType=any
 description=Seurat preprocessing
-documentationUrl=https\://github.com/genepattern/Seurat.Preprocessing/blob/develop/doc.md
+documentationUrl=https://github.com/genepattern/Seurat.Preprocessing/blob/v4/doc.md
 fileFormat=rds
 job.docker.image=genepattern/seurat-suite\:4.0.3
 language=any
@@ -29,7 +29,7 @@ p10_value=
 p11_MODE=
 p11_TYPE=Integer
 p11_default_value=2
-p11_description=Number of PCA dimensions to visualize.
+p11_description=Number of PCA dimensions to visualize in the dimension loadings plot.
 p11_fileFormat=
 p11_flag=--vdl_num_dims
 p11_name=vdl_num_dims
@@ -78,6 +78,21 @@ p14_prefix=
 p14_prefix_when_specified=
 p14_type=java.lang.String
 p14_value=TRUE\=TRUE;FALSE\=FALSE
+
+p15_MODE=
+p15_TYPE=Integer
+p15_default_value=50
+p15_description=Number of principal components to compute (default=50).
+p15_fileFormat=
+p15_flag=--numpcs
+p15_name=numpcs
+p15_numValues=0..1
+p15_optional=
+p15_prefix=
+p15_prefix_when_specified=
+p15_type=java.lang.Integer
+p15_value=
+
 p1_MODE=IN
 p1_TYPE=FILE
 p1_default_value=
@@ -197,8 +212,8 @@ p9_type=java.lang.Integer
 p9_value=
 privacy=public
 publicationDate=11/20/2020 01\:10 
-quality=development
-src.repo=https\://github.com/genepattern/Seurat.Preprocessing
+quality=production
+src.repo=https://github.com/genepattern/Seurat.Preprocessing/tree/v4
 taskDoc=doc.html
 taskType=preprocess & utilities
 userid=edjuaro
diff --git a/paramgroups.json b/paramgroups.json
@@ -8,12 +8,6 @@
             "file_name"
         ]
     },
-    {
-        "name": "Other",
-        "description": "The rest",
-        "hidden": false,
-        "parameters": []
-    },
     {
         "name": "Filtering",
         "description": "Parameters related to filtering for QC.",
@@ -37,7 +31,8 @@
             "vdl_num_dims",
             "vdhm_num_dims",
             "cells",
-            "keep_scale_data"
+            "keep_scale_data",
+            "numpcs"
         ]
     }
 ]
diff --git a/run_seurat.sh b/run_seurat.sh
@@ -1,5 +1,5 @@
 docker run -v $PWD:/LOCAL -w /LOCAL/Job_1 -t genepattern/seurat-suite:4.0.3 Rscript --no-save --quiet --slave --no-restore  /LOCAL/src/seurat_preprocess.R\
- --input_rds '/LOCAL/data/test_run.rds' \
+ --input_rds '/LOCAL/data/pbmc_preprocessed.rds' \
  --column_name "percent.mt" --pattern 'MT-'\
  --first_feature 'nFeature_RNA' --second_feature 'nCount_RNA' --third_feature 'percent.mt'\
  --min_n_features 2 --max_n_features 6000 --max_percent_mitochondrial 25\
@@ -8,4 +8,4 @@ docker run -v $PWD:/LOCAL -w /LOCAL/Job_1 -t genepattern/seurat-suite:4.0.3 Rscr
  --vdl_num_dims 2\
  --vdhm_num_dims 15 --cells 500\
  --file_name "test_run"\
- --keep_scale_data "TRUE"
+ --keep_scale_data "TRUE" --numpcs 50
diff --git a/src/seurat_preprocess.R b/src/seurat_preprocess.R
@@ -24,7 +24,7 @@ load_rds <- function(input.file){
   if (file.exists(input.file)){
   	pbmc = readRDS(input.file)
   } else {
-  Print('Input file could not be found!')
+  print('Input file could not be found!')
   }
   return(pbmc)
 }
@@ -161,9 +161,9 @@ myscale <- function(pbmc){
 
 ## PCA
 
-mypca <-function(pbmc){
+mypca <-function(pbmc,numpcs){
     feats <- VariableFeatures(object = pbmc, verbose = F)
-    pbmc <-RunPCA(pbmc, features = feats, nfeatures.print=5, verbose = F)
+    pbmc <-RunPCA(pbmc, features = feats, nfeatures.print=5, verbose = F, npcs = numpcs)
 
     return(pbmc)
 }
@@ -185,9 +185,8 @@ vdl <- function(nDims){
 
 
 ## ELBOW PLOT
-
-ebp <- function(){
-    plot(ElbowPlot(pbmc))
+ebp <- function(nDims){
+    plot(ElbowPlot(pbmc,ndims=nDims))
     return(pbmc)
 }
 
@@ -248,10 +247,11 @@ parser <- add_option(parser, c("--num_features"),type='integer',default=2000, he
 parser <- add_option(parser, c("--num_to_label"),type='integer',default=10, help = "Number of top features to label.")
 # ====================================
 # Parameters for PCA (numpcs) and for Visualize Dimension Loadings (vdl)
+parser <- add_option(parser, c("--numpcs"),type='integer',default=50, help = "Number of PCA dimensions to compute (default=50).")
 parser <- add_option(parser, c("--vdl_num_dims"),type='integer',default=2, help = "Number of PCA dimensions to visualize.")
 # ====================================
 # Parameters for Heat Map, vdhm
-parser <- add_option(parser, c("--vdhm_num_dims"),type='integer',default=15, help = "Number of dimensions for the dimensional reduction heatmap.")
+parser <- add_option(parser, c("--vdhm_num_dims"),type='integer',default=15, help = "Number of dimensions for the dimensional reduction heatmap and elbow plots.")
 parser <- add_option(parser, c("--cells"),type='integer',default=500, help = "Number of top cells to plot.")
 # ====================================
 #parameter for save_it
@@ -266,6 +266,8 @@ print('Parameters used:')
 print(args)
 print('==========================================================')
 
+# Setting up the PDF file for the plots
+pdf(file=paste(args$file_name,'.pdf',sep=''))
 
 ################################################################################
 #Begin Running the functions
@@ -359,7 +361,7 @@ job_list <- append(job_list, "MY PCA")
 input_size_list <- append(input_size_list, object.size(pbmc))
 print(object.size(pbmc), units="auto")
 start <- proc.time()
-pbmc <- mypca(pbmc)
+pbmc <- mypca(pbmc,args$numpcs)
 proc.time() - start
 output_size_list <- append(output_size_list, object.size(pbmc))
 print(object.size(pbmc), units="auto")
@@ -395,7 +397,7 @@ job_list <- append(job_list, "EBP")
 input_size_list <- append(input_size_list, object.size(pbmc))
 print(object.size(pbmc), units="auto")
 start <- proc.time()
-ebp()
+ebp(args$vdhm_num_dims)
 proc.time() - start
 output_size_list <- append(output_size_list, object.size(pbmc))
 print(object.size(pbmc), units="auto")
@@ -412,6 +414,32 @@ proc.time() - start
 output_size_list <- append(output_size_list, object.size(pbmc))
 print(object.size(pbmc), units="auto")
 
+print("******************************************************")
+print("************   Explained variability   ***************")
+print("******************************************************")
+
+print('Computing percent of variance explained for each PC')
+mat <- Seurat::GetAssayData(pbmc, assay = "RNA", slot = "scale.data")
+pca <- pbmc[["pca"]]
+# Get the total variance:
+total_variance <- sum(matrixStats::rowVars(mat))
+eigValues = (pca@stdev)^2  ## EigenValues
+varExplained = 100*eigValues / total_variance #Percent of variance explainde
+print(varExplained)
+
+cat('Creating plots for Variance explained')
+cvar<-cumsum(varExplained)
+
+##default#par(mfrow=c(2,1),mar = c(5.1, 4.1, 4.1, 2.1))
+# par(mfrow=c(2,1),mar = c(5.1, 5, 4.1, 2.1))
+par(mfrow=c(1,1),mar = c(5.1, 4.1, 4.1, 2.1))
+plot(1:length(varExplained),varExplained,main='Percent of the variance explained by each PC',ylab='Percent',xlab='Principal Components')
+plot(1:length(cvar),cvar,main='Cumulative Variance explained',ylab='Percent',xlab='Principal Components')
+par(mfrow=c(1,1))
+# plot(1:length(varExplained),varExplained)
+# plot(1:length(cvar),cvar)
+print('... done')
+
 
 if (args$keep_scale_data == 'FALSE') {
   # Why are we calling Diet Seurat? Read this issue:
@@ -434,6 +462,7 @@ if (args$keep_scale_data == 'FALSE') {
 print("*************************************")
 print("************ SAVE RDS ***************")
 print("*************************************")
+dev.off() # Close the PDF file
 job_list <- append(job_list, "SAVE RDS")
 input_size_list <- append(input_size_list, object.size(pbmc))
 start <- proc.time()

Original file line number	Diff line number	Diff line change
`@@ -8,12 +8,6 @@`
`8`	`8`	`"file_name"`
`9`	`9`	`]`
`10`	`10`	`},`
`11`		`- {`
`12`		`- "name": "Other",`
`13`		`- "description": "The rest",`
`14`		`- "hidden": false,`
`15`		`- "parameters": []`
`16`		`- },`
`17`	`11`	`{`
`18`	`12`	`"name": "Filtering",`
`19`	`13`	`"description": "Parameters related to filtering for QC.",`
`@@ -37,7 +31,8 @@`
`37`	`31`	`"vdl_num_dims",`
`38`	`32`	`"vdhm_num_dims",`
`39`	`33`	`"cells",`
`40`		`- "keep_scale_data"`
	`34`	`+ "keep_scale_data",`
	`35`	`+ "numpcs"`
`41`	`36`	`]`
`42`	`37`	`}`
`43`	`38`	`]`