metavannier · LeaChabot · Jan 3, 2024 · Jan 3, 2024 · Jan 3, 2024 · Jan 3, 2024
diff --git a/.gitignore b/.gitignore
@@ -4,7 +4,6 @@
 02_Container/cellranger.sif
 02_Container/mambaforge:23.1.0-1.sif
 05_Output
-05_Output_june
 .vscode/
 .bash_history/
 .cache/
@@ -18,4 +17,6 @@ test
 S004647_to_S004650_unoptimized_ref/
 S004647_to_S004650/
 __pycache__
-
+.netrc
+*.errs
+*.out
diff --git a/02_Container/DE_analysis.yaml b/02_Container/DE_analysis.yaml
@@ -0,0 +1,11 @@
+channels :
+    - r
+    - bioconda
+    - conda-forge
+    - defaults
+dependencies :
+    - r-ggplot2 =3.3.5
+    - r-data.table =1.14.2
+    - r-seurat =4.0.3
+    - r-dplyr =1.0.9
+    - r-ggrepel
diff --git a/02_Container/knnor.yaml b/02_Container/knnor.yaml
@@ -0,0 +1,12 @@
+channels :
+    - main
+    - defaults
+    - bioconda
+    - conda-forge
+dependencies :
+    - python = 3.9
+    - pandas
+    - numpy
+    - pip
+    - pip :
+        - knnor
diff --git a/02_Container/labelTransfer.yaml b/02_Container/labelTransfer.yaml
@@ -0,0 +1,14 @@
+channels :
+    - r
+    - bioconda
+    - conda-forge
+    - defaults
+dependencies :
+    - r-ggplot2 =3.5.1
+    - r-seurat =5.1.0
+    - r-dplyr =1.1.4
+    - r-sctransform=0.4.1  # Include sctransform
+    - r-devtools  # Required for installing packages from GitHub
+    - bioconductor-glmgampoi
+
+
diff --git a/02_Container/remi.yaml b/02_Container/remi.yaml
@@ -0,0 +1,10 @@
+channels :
+    - r
+    - bioconda
+    - conda-forge
+    - defaults
+dependencies :
+    - r-seurat = 4.0.3
+    - r-dplyr = 1.0.9
+    - r-ggplot2 =3.3.5
+    - r-cowplot = 1.1.1
diff --git a/02_Container/sims.yaml b/02_Container/sims.yaml
@@ -8,4 +8,5 @@ dependencies :
   - pip
   - pip:
     - --upgrade pip setuptools wheel
-    - --use-pep517 git+https://github.com/braingeneers/SIMS.git
+    # Sims version Nov 17, 2023
+    - --use-pep517 git+https://github.com/braingeneers/SIMS.git@c3cc547e9223e979fdc70f9af2ae932b729da88e
diff --git a/03_Script/01_prepare_data.R b/03_Script/01_prepare_data.R
@@ -23,23 +23,41 @@ STEP2 = "02_seurat/"
 source(file.path(dirname(DIRECTORY), "03_Script/00_general_deps.R"))
 
 in_data_dir = file.path( OUTPUTDIR, STEP1)
+in_data_dir_demux = file.path( OUTPUTDIR, STEP2)
 samples <- dir(in_data_dir)
 
-
-
 MIN_CELLS <- as.numeric(MIN_CELLS)
 MIN_FEATURES <- as.numeric(MIN_FEATURES)
+# HTO <- unlist(strsplit(HTO, ","))
+
+options(Seurat.object.assay.version = 'v3')
 
 #........................................
 # Read data
 #........................................
-seurat_data <- Read10X(paste0(in_data_dir,SAMPLE_ID,CELL_RANGER_COUNT_PATH))
-# Need a condition if data are depultiplexed with cell ranger multi
 if( RUN_DEMULTIPLEX == TRUE){
+        seurat_data <- Read10X(paste0(in_data_dir,SAMPLE_ID,CELL_RANGER_COUNT_PATH))
         seurat_obj <- CreateSeuratObject(counts = seurat_data, min.cells = MIN_CELLS, min.features = MIN_FEATURES, project = SAMPLE_ID)
-} else {
-        seurat_obj <- CreateSeuratObject(counts = seurat_data$`Gene Expression`, min.cells = MIN_CELLS, min.features = MIN_FEATURES, project = SAMPLE_ID) 
+} else if (RUN_DEMULTIPLEX == FALSE) {
+        seurat_obj <- readRDS(paste0(in_data_dir_demux,SAMPLE_ID, "/demultiplex_", SAMPLE_ID, ".rds"))     
 }
+
+
+
+# # Need a condition if data are depultiplexed with cell ranger multi
+# if( RUN_DEMULTIPLEX == TRUE){
+#         seurat_obj <- CreateSeuratObject(counts = seurat_data, min.cells = MIN_CELLS, min.features = MIN_FEATURES, project = SAMPLE_ID)
+# } else if (as.logical(MULTIPLEX) == TRUE) {
+#         seurat_obj <- CreateSeuratObject(counts = seurat_data$`Gene Expression`, min.cells = MIN_CELLS, min.features = MIN_FEATURES, project = SAMPLE_ID) 
+#         hto <- seurat_data$`Antibody Capture`
+#         hto <- hto[c(HTO), ]
+#         seurat_obj[["HTO"]] <- CreateAssayObject(counts = hto)
+#         seurat_obj <- subset(x = seurat_obj, subset = nCount_HTO > 0)
+#         seurat_obj <- NormalizeData(seurat_obj, assay = "HTO", normalization.method = "CLR")       
+# } else{
+#         seurat_obj <- CreateSeuratObject(counts = seurat_data$`Gene Expression`, min.cells = MIN_CELLS, min.features = MIN_FEATURES, project = SAMPLE_ID) 
+# }
+
 seurat_obj$SampleID <- SAMPLE_ID
 
 # ..........................................................................................................
@@ -371,18 +389,20 @@ ggplot( seurat_obj[[]][order( seurat_obj[["outlier"]]),], # Plot FALSE first and
   labs( x = "# Genes", y = "# UMIs") +
   theme( legend.position = "none")
 
-# Mitochondrial vs ribosomal distributions
-if(exists( "mito.drop") && exists( "ribo.drop"))
+if(exists( "mito.drop"))
 {
   ggplot( seurat_obj[[]][order( seurat_obj[["outlier"]]),], # Plot FALSE first and TRUE after
-          aes( x = percent.rb, 
+          aes( x = nCount_RNA, 
                y = percent.mt, 
                color = outlier)) + 
     geom_point( size = 0.5) +
-    geom_vline( xintercept = FILTER_PERCENT_RB, linetype = 2, color = "blue", alpha = 0.5) +
+    geom_vline( xintercept = c( FILTER_FEATURE_MIN, FILTER_FEATURE_MAX), 
+              linetype = 2, 
+              color = c( "blue", "red")[!sapply( list( FILTER_FEATURE_MIN, FILTER_FEATURE_MAX), is.null)], 
+              alpha = 0.5) +
     geom_hline( yintercept = FILTER_PERCENT_MT, linetype = 2, color = "red", alpha = 0.5) +
     scale_color_manual( values = c( "TRUE" = "#FF0000AA", "FALSE" = "#44444444")) + 
-    labs( x = "% Ribosomal genes", y = "% Mitochondrial genes") +
+    labs( x = "# UMIs", y = "% Mitochondrial genes") +
     theme( legend.position = "none")
 }
 
@@ -405,15 +425,8 @@ if( QC_EXPLORATION_MODE == FALSE){
 #              sep="\t");
 
 # ..........................................................................................................
-## @knitr normalizeData
+## @knitr SCTransform
 # ..........................................................................................................
 
-# Normalize the count data present in a given assay
-seurat_obj = NormalizeData( object = seurat_obj,
-                       normalization.method = NORM_METHOD,
-                       scale.factor = NORM_SCALE_FACTOR,
-                       verbose = .VERBOSE)
-
-# # Scales and centers features in the dataset
-# seurat_obj = ScaleData( object = seurat_obj,
-#                    verbose = .VERBOSE)
+# # Normalize the count data present in a given assay
+seurat_obj = SCTransform(seurat_obj, vars.to.regress = "percent.mt", verbose = FALSE)
diff --git a/03_Script/02_variables_genes.R b/03_Script/02_variables_genes.R
@@ -35,7 +35,11 @@ suppressMessages( suppressWarnings( LabelPoints( plot = VariableFeaturePlot( seu
 ## @knitr findVariableGenes_summaryTable
 # ..........................................................................................................
 
-# Extract variable genes info as data.frame
+# Extract variable genes info as data.frame    feature_select_method: "vst"
+    # Number of features to select as top variable features; only used when select_method is set to 'dispersion' or 'vst'
+    variable_features: 2000
+    # For table in report
+    variable_features_showtop : 200
 variableAnnotationsDT = head( HVFInfo( object = seurat_obj, assay = "RNA", selection.method = FEATURE_SELECT_METHOD)[VariableFeatures( seurat_obj),], VARIABLE_FEATURES_SHOWTOP);
 variableAnnotationsDT = cbind("Gene" = rownames(variableAnnotationsDT), variableAnnotationsDT);
 

diff --git a/03_Script/03_cell_heterogeneity_PCA.R b/03_Script/03_cell_heterogeneity_PCA.R
@@ -3,122 +3,66 @@
 #  of cells using dimension reduction techniques
 # ########################################################
 
-## @knitr ldr
+# ..........................................................................................................
+## @knitr HTODemux
+# ..........................................................................................................
+
+# if(as.logical(MULTIPLEX) == TRUE){
+#   # Demultiplexing data 
+#   seurat_obj <- HTODemux(seurat_obj, assay = "HTO", positive.quantile = 0.99)
+#   cat("<br><br>Removed cells after filtering Doublet et Negative:", sum(seurat_obj[["hash.ID"]] == "Negative" | seurat_obj[["hash.ID"]] == "Doublet"));
+#   cat("<br><br>Remaining cells after filtering:", sum(seurat_obj$hash.ID %in% HTO));
+#   cat("\n<br>\n");
+#   seurat_obj <- subset(seurat_obj, idents = c("Doublet","Negative"), invert = TRUE) ### Keep only the Singlet cells (Remove Negative and Doublet)
+
+#   seurat_obj <- FindVariableFeatures(seurat_obj, selection.method = "mean.var.plot")
+#   seurat_obj <- ScaleData(seurat_obj, features = VariableFeatures(seurat_obj))
+#   seurat_obj <- RunPCA(seurat_obj)
+#   seurat_obj <- RunUMAP(seurat_obj, dims = 1:25)
+#   print(DimPlot(seurat_obj))
+# }
 
-# Scales and centers features in the dataset
-seurat_obj = ScaleData( object = seurat_obj,
-                   verbose = .VERBOSE)
+# ..........................................................................................................
+## @knitr ldr
+# ..........................................................................................................
 
 # Perform linear dimensional reduction
-seurat_obj <- RunPCA(seurat_obj, features = VariableFeatures(object = seurat_obj))
+seurat_obj <- RunPCA(seurat_obj, assay = "SCT", verbose = FALSE)
 VizDimLoadings(seurat_obj, dims = 1:DIMS, reduction = "pca")
 # DimPlot(seurat_obj, reduction = "pca", group.by = "orig.ident")
 # DimPlot(seurat_obj, reduction = "pca", group.by = "categorie")
 DimHeatmap(seurat_obj, dims = 1:DIMS, cells = 500, balanced = TRUE)
 
+# ..........................................................................................................
 ## @knitr elbowplot
+# ..........................................................................................................
+
 ElbowPlot(seurat_obj, ndims = DIMS)
 
+# https://hbctraining.github.io/scRNA-seq/lessons/elbow_plot_metric.html
+pct <- seurat_obj[["pca"]]@stdev / sum(seurat_obj[["pca"]]@stdev) * 100
+cumu <- cumsum(pct)
+co1 <- which(cumu > 90 & pct < 5)[1]
+co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1
+pcs <- min(co1, co2)
+# Create a dataframe with values
+plot_df <- data.frame(pct = pct, 
+           cumu = cumu, 
+           rank = 1:length(pct))
+
+# Elbow plot to visualize 
+  ggplot(plot_df, aes(cumu, pct, label = rank, color = rank > pcs)) + 
+  geom_text() + 
+  geom_vline(xintercept = 90, color = "grey") + 
+  geom_hline(yintercept = min(pct[pct > 5]), color = "grey") +
+  theme_bw()
+
 # ..........................................................................................................
 ## @knitr heterogeneity_pca
 # ..........................................................................................................
 
 # Compute PCA on selected variable genes
-nbPC = PCA_NPC
-if(PCA_NPC > length(Cells(seurat_obj)))
-{
-  warning( paste0( "Number of cells in object (", length(Cells(seurat_obj)), ") smaller than requested number of PCs (", PCA_NPC,"), setting lower PC number..." ))
-  nbPC = length(Cells(seurat_obj))
-}           
 seurat_obj <- RunPCA( object = seurat_obj,
-                 features = VariableFeatures( seurat_obj),
-                 npcs     = nbPC,
-                 verbose  = .VERBOSE);
-
-# Plot PCA, highlighting seurat clusters for combination of dimensions
-invisible( apply( combn( 1:PCA_PLOTS_NBDIMS, 2), 2, function(dims)
-{
-  print( DimPlot( object = seurat_obj, reduction = "pca", dims = dims, group.by = "orig.ident") +
-           theme( legend.position = "none"));                              
-}));
-
-# ..........................................................................................................
-## @knitr heterogeneity_pca_umisCounts
-# ..........................................................................................................
-
-# Same PCA plots but highlight UMI counts
-invisible( apply( combn( 1:PCA_PLOTS_NBDIMS, 2), 2, function(dims)
-{
-  print( FeaturePlot( seurat_obj, feature = "nCount_RNA", reduction = "pca", dims = dims) +
-           theme( legend.position = "none",
-                  plot.margin = margin(0, 0, 0, 0, "cm"),
-                  plot.title = element_blank()));
-}));
-
-# ..........................................................................................................
-## @knitr heterogeneity_pca_genesCounts
-# ..........................................................................................................
-
-# Same PCA plots but highlight feature counts
-invisible( apply( combn( 1:PCA_PLOTS_NBDIMS, 2), 2, function(dims)
-{
-  print( FeaturePlot( seurat_obj, feature = "nFeature_RNA", reduction = "pca", dims = dims) +
-           theme( legend.position = "none",
-                  plot.margin = margin(0, 0, 0, 0, "cm"),
-                  plot.title = element_blank()));
-}));
-
-# ..........................................................................................................
-## @knitr heterogeneity_pca_correlations
-# ..........................................................................................................
-
-# Isolate the PCA location of cells in first dimensions, together with UMI and genes counts for correlation analysis (makes use of cbind recycling to repeat values for each stacked PC)
-relationToPC = suppressWarnings( cbind( stack( as.data.frame( Embeddings( seurat_obj, reduction = "pca")[ rownames( seurat_obj[[]]), paste0( "PC_", 1:PCA_PLOTS_NBDIMS) ])),
-                                        seurat_obj[[ c( "nCount_RNA", "nFeature_RNA") ]],
-                                        Cluster = Idents( seurat_obj)));
-
-# Plot relationship of UMIs and genes counts with PCs (combine plots using '/' from 'patchwork' lib)
-print( (ggplot( data = relationToPC, aes( x = values, y = nCount_RNA)) +
-          facet_wrap( ~ind) +
-          stat_binhex( bins = 60) +
-          #geom_point( aes(col = Cluster), alpha = 0.5) +
-          geom_smooth( method = 'lm') +
-          stat_cor( method = "spearman") +
-          ylab( "# UMIs") +
-          theme( axis.title.x = element_blank(),
-                 axis.text.x = element_blank(),
-                 axis.ticks.x = element_blank(),
-                 plot.margin = unit( c( 1, 1, -0.5, 0.5), "lines")))
-       /
-         (ggplot( data = relationToPC, aes( x = values, y = nFeature_RNA)) +
-            facet_wrap( ~ind) +
-            stat_binhex( bins = 60) +
-            #geom_point( aes(col = Cluster), alpha = 0.5) +
-            geom_smooth( method = 'lm') +
-            stat_cor( method = "spearman") +
-            xlab( "PC values") +
-            ylab( "# Genes")));
-
-# ..........................................................................................................
-## @knitr heterogeneity_pca_loadings
-# ..........................................................................................................
-
-# Plot PCA loadings
-invisible( apply( combn( 1:PCA_PLOTS_NBDIMS, 2), 2, function(dims)
-{
-  namesPC=paste0( "PC_", dims);
-  # Get the loading values for concerned PCs
-  loadingsMatrix = Loadings( seurat_obj, reduction = "pca")[ , namesPC ];
-  # Sort features by average absolute value and convert as DF with features names as column
-  loadingsMatrix = head( loadingsMatrix[ order( apply( loadingsMatrix, 1, function(x){ mean( abs( x)) }), decreasing = TRUE), ], PCA_PLOTS_NBFEATURES);
-  loadingsDF = data.frame( loadingsMatrix, features = rownames( loadingsMatrix));
-
-  # Define symmetric and consistent axes for group of plots
-  axesLimit = max( abs( loadingsMatrix));
-
-  # Plot arrows and features name
-  print( ggplot( data = loadingsDF, aes_( x = as.name( namesPC[1]), y = as.name( namesPC[2]))) +
-           coord_cartesian( xlim = c( -axesLimit, axesLimit), ylim = c( -axesLimit, axesLimit)) +
-           geom_text_repel( aes( label = features), max.iter = 10000) + #, max.overlaps = Inf
-           geom_segment( x = 0 , y = 0, aes_( xend = as.name(namesPC[1]), yend = as.name(namesPC[2])), col = "#00000044", arrow = arrow( length = unit( 2, "mm"))));
-}));
+                 npcs     = pcs,
+                 verbose  = .VERBOSE,
+                 assay = "SCT");