Skip to content

Commit

Permalink
Merge pull request #351 from clemente-lab/Enhancement-SummaryImprovement
Browse files Browse the repository at this point in the history
Enhancement summary improvement
  • Loading branch information
adamcantor22 authored Dec 10, 2021
2 parents 8f9884d + d2af659 commit b06f488
Show file tree
Hide file tree
Showing 31 changed files with 483 additions and 118 deletions.
4 changes: 2 additions & 2 deletions docs/summary_env.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
`conda activate jupyter`

# Then install the necessary R packages and version of nbconvert from conda-forge
# NB: nbconvert was previously kept at version 5.6.1, now kept current
`conda install nbconvert r-ggplot2 r-ggally r-ggrepel r-rcolorbrewer -c conda-forge`
# NB: you will regret attempting to use a more recent version than nbconvert 5.6.1
`conda install nbconvert=5.6.1 r-ggplot2 r-ggally r-ggrepel r-rcolorbrewer -c conda-forge`

# Now use pip to install the PDF template for jupyter and the Pillow python library

Expand Down
3 changes: 3 additions & 0 deletions mmeds/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,12 +412,15 @@
USER_GUIDE = str(TEST_PATH / 'User_Guide.txt')
SUBJECT_TEMPLATE = str(TEST_PATH / 'subject_template.tsv')
SPECIMEN_TEMPLATE = str(TEST_PATH / 'specimen_template.tsv')
CONFIG_EXAMPLE = str(TEST_PATH / 'config_example.yaml')


# Demultiplexing Qiime Defaults
QIIME_SAMPLE_ID_CATS = ('#SampleID', '#q2:types')
QIIME_FORWARD_BARCODE_CATS = ('BarcodeSequence', 'categorical')
QIIME_REVERSE_BARCODE_CATS = ('BarcodeSequenceR', 'categorical')
FASTQ_FILENAME_TEMPLATE = '{}_S1_L001_R{}_001.fastq.gz'

TEST_FILES = {
'barcodes': TEST_BARCODES,
'for_reads': TEST_READS,
Expand Down
8 changes: 8 additions & 0 deletions mmeds/html/analysis_select_tool.html
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,12 @@
<p> <button class="w3-btn w3-blue" type='submit' method='post'>Run</button> </p>
</form>
</p>

<div class="w3-container w3-padding w3-card-4">
<div class='w3-container w3-card w3-white'>
<h3 class='w3-center'> Config File Examples </h3>
<a class="w3-btn w3-padding w3-half w3-hover-blue" href="{download_page}?file_name=Config_example">Qiime2 DADA2 Config</a>
<a class="w3-btn w3-padding w3-half w3-hover-blue" href="{download_page}?file_name=Config_default">Default Config</a>
</div>
</div>
</div>
177 changes: 166 additions & 11 deletions mmeds/resources/summary_code.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,15 @@ config = load_config(Path('config_file.yaml'), Path('metadata.tsv'), True)

# Load metadata file
if '{analysis_type}' == 'qiime2':
mdf = pd.read_csv('qiime_mapping_file.tsv', skiprows=[1], sep='\t')
mdf = pd.read_csv('qiime_mapping_file.tsv', skiprows=[1], sep='\t', dtype={{'#SampleID': 'str'}})
else:
mdf = pd.read_csv('qiime_mapping_file.tsv', sep='\t')
mdf.set_index('#SampleID', inplace=True)

# Load the columns to use for analysis
metadata_columns = sorted(config['metadata'])
metadata_discrete = sorted([x for x in config['metadata'] if not config['metadata_continuous'][x]])
metadata_continuous = sorted([x for x in config['metadata'] if config['metadata_continuous'][x]])

# Stores a list of values shared across groups but unique within (for graphing)
max_colors = 0
Expand Down Expand Up @@ -94,7 +96,7 @@ colFill <- scale_fill_manual(name = ~GroupID, values = allColors)
allRGB <- data.frame(apply(data.frame(allColors), 1, col2rgb))
=====
page_break<source>
<div style="page-break-after: always;"></div>
\pagebreak
=====
otu_py<source>
df = pd.read_csv('otu_table.tsv', skiprows=1, header=0, sep='\t')
Expand Down Expand Up @@ -122,6 +124,7 @@ df = df.apply(lambda x: x / x.sum(), axis='index')
df = df.T.reset_index(level=0).melt('index')
# Rename the columns
df.rename({{'index': 'variable', 'variable': 'X.OTU.ID'}}, axis='columns', inplace=True)
df = df.astype({{'variable': 'str'}})
# Modify the metadata
mdf_lite = mdf.reset_index()
mdf_lite = mdf_lite[['#SampleID', '{group}']].rename({{'#SampleID': 'variable'}}, axis='columns')
Expand Down Expand Up @@ -211,7 +214,7 @@ with open('mod_revtex.tplx', 'w') as f:
for line in new_lines:
f.write(line)
=====
alpha_py_qiime1<source>
alpha_py_discrete_qiime1<source>
# Read in the data
df = pd.read_csv('{file1}', sep='\t')

Expand All @@ -225,7 +228,7 @@ df.set_index('sequences per sample', inplace=True)
# Create groupings based on metadata values
group_means = []

for group_name in metadata_columns:
for group_name in metadata_discrete:
grouping = mdf[group_name]
    # Calculate the means across iterations
groups = df.groupby('sequences per sample')
Expand Down Expand Up @@ -264,7 +267,7 @@ for group_name in metadata_columns:
# Stack all the different groups into a single dataframe
df = pd.concat(group_means, axis=0, sort=False)
=====
alpha_py_qiime2<source>
alpha_py_discrete_qiime2<source>
# Read in the data
df = pd.read_csv('{file1}', sep=',')

Expand All @@ -279,7 +282,7 @@ df.variable.replace(replacements, inplace=True)
# For storing
group_means = []

for group_name in metadata_columns:
for group_name in metadata_discrete:
# Remove the metadata not relevant to this grouping
groups = df[['sample-id', 'variable', 'value', group_name]]

Expand Down Expand Up @@ -319,6 +322,60 @@ df.Grouping = df.Grouping.astype(str)
df.GroupID = df.GroupID.astype(str)
df.GroupName = df.GroupName.astype(str)
=====
alpha_py_continuous<source>
# Read in the data
df = pd.read_csv('{file1}', sep=',')

# Reshape the data into (mostly) long format
headers = list(set(mdf.columns).intersection(set(df.columns)))
df = df.melt(id_vars=['sample-id'] + headers)

# Remove info on specific iterations to allow for grouping by value
replacements = {{x:int(x.split('_')[0].split('-')[1]) for x in df.variable.unique()}}
df.variable.replace(replacements, inplace=True)

# For storing
group_means = []

for group_name in metadata_continuous:
# Remove the metadata not relevant to this grouping
groups = df[['sample-id', 'variable', 'value', group_name]]

    # Calculate the means across iterations
agger = {{'value': 'mean', group_name: 'first'}}
groups = groups.groupby(['sample-id', 'variable']).agg(agger).reset_index()

# Add a column to store the errors
groups = groups.assign(Error=groups.value)

# Group by metadata value and calculate the means and error
agger = {{'Error': 'sem', 'value': 'mean'}}
group = groups.groupby([group_name, 'variable']).agg(agger).reset_index()

# Assign information for the colors
colors = [color_maps[group_name][str(x)] for x in group[group_name]]
group = group.assign(GroupID=colors)

# Assign information for grouping
group_names = [group_name for x in group[group_name]]
group = group.assign(GroupName=group_names)

# Rename columns and append to the list of dataframes
new_names = {{
'variable': 'SamplingDepth',
'value': 'AverageValue',
group_name: 'Grouping'
}}
group_means.append(group.rename(index=str, columns=new_names))

# Stack all the different groups into a single dataframe
df = pd.concat(group_means, axis=0, sort=False)
df.SamplingDepth = df.SamplingDepth.astype(float)
df.Error = df.Error.astype(float)
df.AverageValue = df.AverageValue.astype(float)
df.GroupID = df.GroupID.astype(str)
df.GroupName = df.GroupName.astype(str)
=====
alpha_r<source>
%%R -i df
pd <- position_dodge(width = 50)
Expand All @@ -329,7 +386,7 @@ p <- ggplot(data = df, aes(x = {xaxis}, y = AverageValue, color = GroupID)) +
geom_line(stat='identity', position = pd) +
facet_wrap(~GroupName) + colFill + colScale +
labs(title = 'Alpha Diversity',
subtitle = 'Grouped by Metadata Catagory') +
subtitle = 'Grouped by Discrete Metadata Categories') +
theme_bw() +
theme(legend.position = 'none',
plot.title = element_text(hjust = 0.5),
Expand All @@ -338,7 +395,26 @@ p <- ggplot(data = df, aes(x = {xaxis}, y = AverageValue, color = GroupID)) +
# Save plots
ggsave('{file1}', height = 6, width = 6)
=====
beta_py<source>
alpha_r_continuous<source>
%%R -i df

pd <- position_dodge(width = 50)
df.var <- subset(df, GroupName=="{cat}")
p <- ggplot(data = df.var, aes(x = {xaxis}, y = AverageValue, color = Grouping,group=Grouping)) +
geom_errorbar(aes(ymin=AverageValue-Error, ymax=AverageValue+Error), width=100, position = pd) +
geom_point(stat='identity', position = pd, size = 1) +
geom_line(stat='identity', position = pd) +
labs(title = 'Alpha Diversity',
subtitle = 'Grouped by {cat}') +
theme_bw() +
theme(legend.position = 'bottom',
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)) + scale_color_gradient2(low="#0000FF", high="#FF0000", mid="#FFFFFF", midpoint=mean(df.var$Grouping, na.rm=TRUE), name = '{cat}', space = "Lab", na.value = "#888888", guide = "colorbar", aesthetics = "color")

# Save plots
ggsave('{file1}', height = 6, width = 6)
=====
beta_py_discrete<source>
import pandas as pd
with open('{file1}') as f:
page = f.read()
Expand All @@ -363,7 +439,7 @@ df = df.rename(index=str, columns=cols)
samples = mdf['{group}'][[x for x in df.axes[0]]]
df = df.assign(GroupID=[color_maps['{group}'][str(x)] for x in samples])
=====
beta_r<source>
beta_r_discrete<source>
%%R -i df

# Create the plots for the first three PCs
Expand Down Expand Up @@ -398,7 +474,7 @@ for(i in 1:p$nrow) {{
p[i, j]$labels$x,
p[i, j]$labels$y)
png(filename, width = 6, height = 6, unit='in', res=200)
sp <- p[i,j] + geom_text_repel() +
sp <- p[i,j] + geom_text_repel(max.overlaps = Inf) +
theme(legend.position = 'none',
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)) +
Expand All @@ -412,8 +488,87 @@ for(i in 1:p$nrow) {{
}}
}}
=====
beta_py_continuous<source>
import pandas as pd
with open('{file1}') as f:
page = f.read()

store = {{}}
# Parse the PCA information file
for i, line in enumerate(page.split('\n')):
parts = line.split('\t')
if i == 0:
length = int(parts[1])
if i > 9 :
if line == '':
break
store[parts[0]] = list(map(float, parts[1:length]))

# Create a dataframe and name the axes
df = pd.DataFrame.from_dict(store).T
cols = {{x:'PC{{}}'.format(x + 1) for x in df.columns}}
df = df.rename(index=str, columns=cols)

# Assign variable to DataFrame
df = df.assign(variable=mdf['{group}'][[x for x in df.axes[0]]])
=====
beta_r_continuous<source>
%%R -i df

# Create the plots for the first three PCs
png('{plot}', width = 6, height = 6, unit='in', res=200)
p <- ggpairs(df[,c(1:3)],
legend = 4,
upper = list(continuous = "points", combo = "box_no_facet"),
lower = list(continuous = "points", combo = "dot_no_facet"),
aes(color = df$variable, label = rownames(df))) +
theme_bw() +
theme(legend.position = 'bottom',
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)) +
labs(title = 'PCA plot',
subtitle = 'Colored by {cat}') +
scale_color_gradient2(low="#0000FF", high="#FF0000", mid="#FFFFFF", midpoint=mean(df$variable, na.rm=TRUE), name = '{cat}', space = "Lab", na.value = "#888888", guide = "colorbar", aesthetics = "color")

print(p)
out <- dev.off()

# Print the individual PCA plots with labels
for(i in 1:p$nrow) {{
for(j in 1:p$ncol){{
# Only print the PCAs not the frequency distributions
if (i > 2 && j < 3 || i > 1 && j < 2) {{
# Setup and save each individual PCA plot
filename <- sprintf('{subplot}',
p[i, j]$labels$x,
p[i, j]$labels$y)
png(filename, width = 6, height = 6, unit='in', res=200)
sp <- p[i,j] + geom_text_repel(max.overlaps = Inf) +
theme(legend.position = 'bottom',
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)) +
labs(title = sprintf('%s vs. %s',
p[i, j]$labels$x,
p[i, j]$labels$y),
subtitle = 'Colored by {cat}') +
scale_color_gradient2(low="#0000FF", high="#FF0000", mid="#FFFFFF", midpoint=mean(df$variable, na.rm=TRUE), name = '{cat}', space = "Lab", na.value = "#888888", guide = "colorbar", aesthetics = "color")
print(sp)
out <- dev.off()
}}
}}
}}
=====
taxa_description<source>
Taxonomy plots represent the abundance of different taxa using stacked plots on a per-sample or per-group (averaged) basis. Data is normalized so that abundances per sample or per group add up to 100%. When using group-based taxonomy plots, it should be noted that only average abundances are shown per group and taxa: this can induce visual biases when a small number of samples in a group have significantly higher abundance of a given taxon compared to the rest of samples in the group, and give the (incorrect) impression that the group as a whole has a high abundance of the taxon.
=====
alpha_description<source>
Alpha diversity estimates the amount of microbial diversity present in a sample or group of samples. There are several measures that can be used for alpha diversity, including observed features, Shannon's diversity or Faith's phylogenetic diversity. Because diversity estimates depend on the total number of sequences assigned to each sample, rarefaction curves are constructed to show the relation between alpha diversity (on the vertical axis) and sequencing depth (on the horizontal axis). Curves that gradually plateau as sequencing depth increases suggest that additional sequencing effort would not substantially yield additional results in terms of currently not observed diversity; curves that continue to increase suggest additional sequencing effort might be required to saturate the estimate.
=====
beta_description<source>
Beta diversity estimates how similar or dissimilar samples are based on their microbiome composition. Different to alpha diversity, which is estimated per sample, beta diversity is a distance that is calculated between pairs of samples. Samples that are similar to each other in their microbiome composition will have a low distance between them based on beta diversity, while those that are very different in their composition will have a large distance. Principal Coordinate Analysis (PCoA) is an ordination technique that visually represents the samples based on their beta diversity distances to facilitate the identification of clusters or gradients of samples. By default, the first three principal coordinates are shown in PCoA plots.
=====
taxa_caption<source>
The above plot represents the percentage of each sample belonging to particular taxon summarized at level {level}.
The above plot represents the percentage of each sample belonging to particular taxon summarized at the {level} level.
=====
alpha_caption_qiime1<source>
Add this
Expand Down
10 changes: 8 additions & 2 deletions mmeds/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,8 +487,12 @@ def handle_errors_warnings(self, metadata_copy, errors, warnings):

# Append Host Subject IDs to list without duplicates
sub_count = 0
if df is not None and 'HostSubjectId' in df.columns:
sub_count = df['HostSubjectId'].nunique()
if cp.session['subject_type'] == 'animal':
subjectName = 'AnimalSubjectID'
else:
subjectName = 'HostSubjectId'
if df is not None and subjectName in df.columns:
sub_count = df[subjectName].nunique()

# If there are errors report them and return the error page
if errors:
Expand Down Expand Up @@ -1094,6 +1098,8 @@ def view_corrections(self):
@cp.expose
def analysis_page(self):
""" Page for running analysis of previous uploads. """
cp.session['download_files']['Config_default'] = fig.DEFAULT_CONFIG
cp.session['download_files']['Config_example'] = fig.CONFIG_EXAMPLE
study_html = ''' <tr class="w3-hover-blue">
<th>
<a href="{select_specimen_page}?access_code={access_code}"> {study_name} </a>
Expand Down
Loading

0 comments on commit b06f488

Please sign in to comment.