Skip to content

Commit

Permalink
Merge pull request #351 from clemente-lab/Enhancement-SummaryImprovement
Browse files Browse the repository at this point in the history
Enhancement summary improvement
  • Loading branch information
adamcantor22 authored Dec 10, 2021
2 parents 8f9884d + d2af659 commit b06f488
Show file tree
Hide file tree
Showing 31 changed files with 483 additions and 118 deletions.
4 changes: 2 additions & 2 deletions docs/summary_env.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
`conda activate jupyter`

# Then install the necessary R packages and version of nbconvert from conda-forge
# NB: nbconvert was previously kept at version 5.6.1, now kept current
`conda install nbconvert r-ggplot2 r-ggally r-ggrepel r-rcolorbrewer -c conda-forge`
# NB: you will regret attempting to use a more recent version than nbconvert 5.6.1
`conda install nbconvert=5.6.1 r-ggplot2 r-ggally r-ggrepel r-rcolorbrewer -c conda-forge`

# Now use pip to install the PDF template for jupyter and the Pillow python library

Expand Down
3 changes: 3 additions & 0 deletions mmeds/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,12 +412,15 @@
USER_GUIDE = str(TEST_PATH / 'User_Guide.txt')
SUBJECT_TEMPLATE = str(TEST_PATH / 'subject_template.tsv')
SPECIMEN_TEMPLATE = str(TEST_PATH / 'specimen_template.tsv')
CONFIG_EXAMPLE = str(TEST_PATH / 'config_example.yaml')


# Demultiplexing Qiime Defaults
QIIME_SAMPLE_ID_CATS = ('#SampleID', '#q2:types')
QIIME_FORWARD_BARCODE_CATS = ('BarcodeSequence', 'categorical')
QIIME_REVERSE_BARCODE_CATS = ('BarcodeSequenceR', 'categorical')
FASTQ_FILENAME_TEMPLATE = '{}_S1_L001_R{}_001.fastq.gz'

TEST_FILES = {
'barcodes': TEST_BARCODES,
'for_reads': TEST_READS,
Expand Down
8 changes: 8 additions & 0 deletions mmeds/html/analysis_select_tool.html
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,12 @@
<p> <button class="w3-btn w3-blue" type='submit' method='post'>Run</button> </p>
</form>
</p>

<div class="w3-container w3-padding w3-card-4">
<div class='w3-container w3-card w3-white'>
<h3 class='w3-center'> Config File Examples </h3>
<a class="w3-btn w3-padding w3-half w3-hover-blue" href="{download_page}?file_name=Config_example">Qiime2 DADA2 Config</a>
<a class="w3-btn w3-padding w3-half w3-hover-blue" href="{download_page}?file_name=Config_default">Default Config</a>
</div>
</div>
</div>
177 changes: 166 additions & 11 deletions mmeds/resources/summary_code.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,15 @@ config = load_config(Path('config_file.yaml'), Path('metadata.tsv'), True)

# Load metadata file
if '{analysis_type}' == 'qiime2':
mdf = pd.read_csv('qiime_mapping_file.tsv', skiprows=[1], sep='\t')
mdf = pd.read_csv('qiime_mapping_file.tsv', skiprows=[1], sep='\t', dtype={{'#SampleID': 'str'}})
else:
mdf = pd.read_csv('qiime_mapping_file.tsv', sep='\t')
mdf.set_index('#SampleID', inplace=True)

# Load the columns to use for analysis
metadata_columns = sorted(config['metadata'])
metadata_discrete = sorted([x for x in config['metadata'] if not config['metadata_continuous'][x]])
metadata_continuous = sorted([x for x in config['metadata'] if config['metadata_continuous'][x]])

# Stores a list of values shared across groups but unique within (for graphing)
max_colors = 0
Expand Down Expand Up @@ -94,7 +96,7 @@ colFill <- scale_fill_manual(name = ~GroupID, values = allColors)
allRGB <- data.frame(apply(data.frame(allColors), 1, col2rgb))
=====
page_break<source>
<div style="page-break-after: always;"></div>
\pagebreak
=====
otu_py<source>
df = pd.read_csv('otu_table.tsv', skiprows=1, header=0, sep='\t')
Expand Down Expand Up @@ -122,6 +124,7 @@ df = df.apply(lambda x: x / x.sum(), axis='index')
df = df.T.reset_index(level=0).melt('index')
# Rename the columns
df.rename({{'index': 'variable', 'variable': 'X.OTU.ID'}}, axis='columns', inplace=True)
df = df.astype({{'variable': 'str'}})
# Modify the metadata
mdf_lite = mdf.reset_index()
mdf_lite = mdf_lite[['#SampleID', '{group}']].rename({{'#SampleID': 'variable'}}, axis='columns')
Expand Down Expand Up @@ -211,7 +214,7 @@ with open('mod_revtex.tplx', 'w') as f:
for line in new_lines:
f.write(line)
=====
alpha_py_qiime1<source>
alpha_py_discrete_qiime1<source>
# Read in the data
df = pd.read_csv('{file1}', sep='\t')

Expand All @@ -225,7 +228,7 @@ df.set_index('sequences per sample', inplace=True)
# Create groupings based on metadata values
group_means = []

for group_name in metadata_columns:
for group_name in metadata_discrete:
grouping = mdf[group_name]
    # Calculate the means across iterations
groups = df.groupby('sequences per sample')
Expand Down Expand Up @@ -264,7 +267,7 @@ for group_name in metadata_columns:
# Stack all the different groups into a single dataframe
df = pd.concat(group_means, axis=0, sort=False)
=====
alpha_py_qiime2<source>
alpha_py_discrete_qiime2<source>
# Read in the data
df = pd.read_csv('{file1}', sep=',')

Expand All @@ -279,7 +282,7 @@ df.variable.replace(replacements, inplace=True)
# For storing
group_means = []

for group_name in metadata_columns:
for group_name in metadata_discrete:
# Remove the metadata not relevant to this grouping
groups = df[['sample-id', 'variable', 'value', group_name]]

Expand Down Expand Up @@ -319,6 +322,60 @@ df.Grouping = df.Grouping.astype(str)
df.GroupID = df.GroupID.astype(str)
df.GroupName = df.GroupName.astype(str)
=====
alpha_py_continuous<source>
# Read in the data
df = pd.read_csv('{file1}', sep=',')

# Reshape the data into (mostly) long format
headers = list(set(mdf.columns).intersection(set(df.columns)))
df = df.melt(id_vars=['sample-id'] + headers)

# Remove info on specific iterations to allow for grouping by value
replacements = {{x:int(x.split('_')[0].split('-')[1]) for x in df.variable.unique()}}
df.variable.replace(replacements, inplace=True)

# For storing
group_means = []

for group_name in metadata_continuous:
# Remove the metadata not relevant to this grouping
groups = df[['sample-id', 'variable', 'value', group_name]]

    # Calculate the means across iterations
agger = {{'value': 'mean', group_name: 'first'}}
groups = groups.groupby(['sample-id', 'variable']).agg(agger).reset_index()

# Add a column to store the errors
groups = groups.assign(Error=groups.value)

# Group by metadata value and calculate the means and error
agger = {{'Error': 'sem', 'value': 'mean'}}
group = groups.groupby([group_name, 'variable']).agg(agger).reset_index()

# Assign information for the colors
colors = [color_maps[group_name][str(x)] for x in group[group_name]]
group = group.assign(GroupID=colors)

# Assign information for grouping
group_names = [group_name for x in group[group_name]]
group = group.assign(GroupName=group_names)

# Rename columns and append to the list of dataframes
new_names = {{
'variable': 'SamplingDepth',
'value': 'AverageValue',
group_name: 'Grouping'
}}
group_means.append(group.rename(index=str, columns=new_names))

# Stack all the different groups into a single dataframe
df = pd.concat(group_means, axis=0, sort=False)
df.SamplingDepth = df.SamplingDepth.astype(float)
df.Error = df.Error.astype(float)
df.AverageValue = df.AverageValue.astype(float)
df.GroupID = df.GroupID.astype(str)
df.GroupName = df.GroupName.astype(str)
=====
alpha_r<source>
%%R -i df
pd <- position_dodge(width = 50)
Expand All @@ -329,7 +386,7 @@ p <- ggplot(data = df, aes(x = {xaxis}, y = AverageValue, color = GroupID)) +
geom_line(stat='identity', position = pd) +
facet_wrap(~GroupName) + colFill + colScale +
labs(title = 'Alpha Diversity',
subtitle = 'Grouped by Metadata Catagory') +
subtitle = 'Grouped by Discrete Metadata Categories') +
theme_bw() +
theme(legend.position = 'none',
plot.title = element_text(hjust = 0.5),
Expand All @@ -338,7 +395,26 @@ p <- ggplot(data = df, aes(x = {xaxis}, y = AverageValue, color = GroupID)) +
# Save plots
ggsave('{file1}', height = 6, width = 6)
=====
beta_py<source>
alpha_r_continuous<source>
%%R -i df

pd <- position_dodge(width = 50)
df.var <- subset(df, GroupName=="{cat}")
p <- ggplot(data = df.var, aes(x = {xaxis}, y = AverageValue, color = Grouping,group=Grouping)) +
geom_errorbar(aes(ymin=AverageValue-Error, ymax=AverageValue+Error), width=100, position = pd) +
geom_point(stat='identity', position = pd, size = 1) +
geom_line(stat='identity', position = pd) +
labs(title = 'Alpha Diversity',
subtitle = 'Grouped by {cat}') +
theme_bw() +
theme(legend.position = 'bottom',
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)) + scale_color_gradient2(low="#0000FF", high="#FF0000", mid="#FFFFFF", midpoint=mean(df.var$Grouping, na.rm=TRUE), name = '{cat}', space = "Lab", na.value = "#888888", guide = "colorbar", aesthetics = "color")

# Save plots
ggsave('{file1}', height = 6, width = 6)
=====
beta_py_discrete<source>
import pandas as pd
with open('{file1}') as f:
page = f.read()
Expand All @@ -363,7 +439,7 @@ df = df.rename(index=str, columns=cols)
samples = mdf['{group}'][[x for x in df.axes[0]]]
df = df.assign(GroupID=[color_maps['{group}'][str(x)] for x in samples])
=====
beta_r<source>
beta_r_discrete<source>
%%R -i df

# Create the plots for the first three PCs
Expand Down Expand Up @@ -398,7 +474,7 @@ for(i in 1:p$nrow) {{
p[i, j]$labels$x,
p[i, j]$labels$y)
png(filename, width = 6, height = 6, unit='in', res=200)
sp <- p[i,j] + geom_text_repel() +
sp <- p[i,j] + geom_text_repel(max.overlaps = Inf) +
theme(legend.position = 'none',
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)) +
Expand All @@ -412,8 +488,87 @@ for(i in 1:p$nrow) {{
}}
}}
=====
beta_py_continuous<source>
import pandas as pd
with open('{file1}') as f:
page = f.read()

store = {{}}
# Parse the PCA information file
for i, line in enumerate(page.split('\n')):
parts = line.split('\t')
if i == 0:
length = int(parts[1])
if i > 9 :
if line == '':
break
store[parts[0]] = list(map(float, parts[1:length]))

# Create a dataframe and name the axes
df = pd.DataFrame.from_dict(store).T
cols = {{x:'PC{{}}'.format(x + 1) for x in df.columns}}
df = df.rename(index=str, columns=cols)

# Assign variable to DataFrame
df = df.assign(variable=mdf['{group}'][[x for x in df.axes[0]]])
=====
beta_r_continuous<source>
%%R -i df

# Create the plots for the first three PCs
png('{plot}', width = 6, height = 6, unit='in', res=200)
p <- ggpairs(df[,c(1:3)],
legend = 4,
upper = list(continuous = "points", combo = "box_no_facet"),
lower = list(continuous = "points", combo = "dot_no_facet"),
aes(color = df$variable, label = rownames(df))) +
theme_bw() +
theme(legend.position = 'bottom',
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)) +
labs(title = 'PCA plot',
subtitle = 'Colored by {cat}') +
scale_color_gradient2(low="#0000FF", high="#FF0000", mid="#FFFFFF", midpoint=mean(df$variable, na.rm=TRUE), name = '{cat}', space = "Lab", na.value = "#888888", guide = "colorbar", aesthetics = "color")

print(p)
out <- dev.off()

# Print the individual PCA plots with labels
for(i in 1:p$nrow) {{
for(j in 1:p$ncol){{
# Only print the PCAs not the frequency distributions
if (i > 2 && j < 3 || i > 1 && j < 2) {{
# Setup and save each individual PCA plot
filename <- sprintf('{subplot}',
p[i, j]$labels$x,
p[i, j]$labels$y)
png(filename, width = 6, height = 6, unit='in', res=200)
sp <- p[i,j] + geom_text_repel(max.overlaps = Inf) +
theme(legend.position = 'bottom',
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)) +
labs(title = sprintf('%s vs. %s',
p[i, j]$labels$x,
p[i, j]$labels$y),
subtitle = 'Colored by {cat}') +
scale_color_gradient2(low="#0000FF", high="#FF0000", mid="#FFFFFF", midpoint=mean(df$variable, na.rm=TRUE), name = '{cat}', space = "Lab", na.value = "#888888", guide = "colorbar", aesthetics = "color")
print(sp)
out <- dev.off()
}}
}}
}}
=====
taxa_description<source>
Taxonomy plots represent the abundance of different taxa using stacked plots on a per-sample or per-group (averaged) basis. Data is normalized so that abundances per sample or per group add up to 100%. When using group-based taxonomy plots, it should be noted that only average abundances are shown per group and taxa: this can induce visual biases when a small number of samples in a group have significantly higher abundance of a given taxon compared to the rest of samples in the group, and give the (incorrect) impression that the group as a whole has a high abundance of the taxon.
=====
alpha_description<source>
Alpha diversity estimates the amount of microbial diversity present in a sample or group of samples. There are several measures that can be used for alpha diversity, including observed features, Shannon's diversity or Faith's phylogenetic diversity. Because diversity estimates depend on the total number of sequences assigned to each sample, rarefaction curves are constructed to show the relation between alpha diversity (on the vertical axis) and sequencing depth (on the horizontal axis). Curves that gradually plateau as sequencing depth increases suggest that additional sequencing effort would not substantially yield additional results in terms of currently not observed diversity; curves that continue to increase suggest additional sequencing effort might be required to saturate the estimate.
=====
beta_description<source>
Beta diversity estimates how similar or dissimilar samples are based on their microbiome composition. Different to alpha diversity, which is estimated per sample, beta diversity is a distance that is calculated between pairs of samples. Samples that are similar to each other in their microbiome composition will have a low distance between them based on beta diversity, while those that are very different in their composition will have a large distance. Principal Coordinate Analysis (PCoA) is an ordination technique that visually represents the samples based on their beta diversity distances to facilitate the identification of clusters or gradients of samples. By default, the first three principal coordinates are shown in PCoA plots.
=====
taxa_caption<source>
The above plot represents the percentage of each sample belonging to particular taxon summarized at level {level}.
The above plot represents the percentage of each sample belonging to particular taxon summarized at the {level} level.
=====
alpha_caption_qiime1<source>
Add this
Expand Down
10 changes: 8 additions & 2 deletions mmeds/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,8 +487,12 @@ def handle_errors_warnings(self, metadata_copy, errors, warnings):

# Append Host Subject IDs to list without duplicates
sub_count = 0
if df is not None and 'HostSubjectId' in df.columns:
sub_count = df['HostSubjectId'].nunique()
if cp.session['subject_type'] == 'animal':
subjectName = 'AnimalSubjectID'
else:
subjectName = 'HostSubjectId'
if df is not None and subjectName in df.columns:
sub_count = df[subjectName].nunique()

# If there are errors report them and return the error page
if errors:
Expand Down Expand Up @@ -1094,6 +1098,8 @@ def view_corrections(self):
@cp.expose
def analysis_page(self):
""" Page for running analysis of previous uploads. """
cp.session['download_files']['Config_default'] = fig.DEFAULT_CONFIG
cp.session['download_files']['Config_example'] = fig.CONFIG_EXAMPLE
study_html = ''' <tr class="w3-hover-blue">
<th>
<a href="{select_specimen_page}?access_code={access_code}"> {study_name} </a>
Expand Down
Loading

0 comments on commit b06f488

Please sign in to comment.