Skip to content

Commit 9d8165a

Browse files
authored
Merge pull request #30 from B-UMMI/add_snps
add number of snps to per reference table data
2 parents faf9a33 + 233ac5f commit 9d8165a

File tree

6 files changed

+84
-11
lines changed

6 files changed

+84
-11
lines changed

main.nf

+2-1
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,8 @@ workflow {
9898
postprocessing_wf.out.ngx_json,
9999
postprocessing_wf.out.misassembly_reference_json,
100100
postprocessing_wf.out.misassembly_plot_json,
101-
assembly_wf.out.all_versions)
101+
assembly_wf.out.all_versions,
102+
postprocessing_wf.out.snp_report_json)
102103
}
103104

104105
workflow.onComplete {

modules/postprocessing/postprocessing.nf

+6-2
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ process SNP_ASSESSMENT {
164164
output:
165165
path('*.tsv'), emit: tsv
166166
path('*_snps.csv'), emit: csv
167+
path('*_snps.json'), emit: json
167168

168169
script:
169170
template "snp_assessment.py"
@@ -176,10 +177,12 @@ process PLOT_SNP_REFERENCE {
176177

177178
input:
178179
path snp_coords_dataframes
180+
path snps_jsons
179181

180182
output:
181183
path('*.html') optional true
182-
path('*.json'), emit: json
184+
path('snps_in_reference.json'), emit: json
185+
path('snps_report_per_ref.json'), emit: reference_snps_json
183186

184187
script:
185188
template "plot_snp.py"
@@ -278,7 +281,7 @@ workflow postprocessing_wf {
278281
PLOT_GAP_BOXPLOT(GAP_ASSESSMENT.out.json | collect)
279282
PLOT_GAP_REFERENCE(GAP_ASSESSMENT.out.csv | collect)
280283
SNP_ASSESSMENT(paf, triple_reference)
281-
PLOT_SNP_REFERENCE(SNP_ASSESSMENT.out.csv | collect)
284+
PLOT_SNP_REFERENCE(SNP_ASSESSMENT.out.csv | collect, SNP_ASSESSMENT.out.json | collect)
282285
MISASSEMBLY(misassembly_paf)
283286
PROCESS_MISASSEMBLY(MISASSEMBLY.out.trace_pkl | collect, MISASSEMBLY.out.contig_length_pkl | collect, MISASSEMBLY.out.misassembly_json | collect, MISASSEMBLY.out.misassembled_reference_json | collect)
284287
PLOT_MISASSEMBLY(MISASSEMBLY.out.csv | collect)
@@ -290,6 +293,7 @@ workflow postprocessing_wf {
290293
phred_json = PROCESS_SHRIMP_PLOT.out.json
291294
gap_reference_json = PLOT_GAP_REFERENCE.out.json
292295
snp_reference_json = PLOT_SNP_REFERENCE.out.json
296+
snp_report_json = PLOT_SNP_REFERENCE.out.reference_snps_json
293297
gap_boxplot_json = PLOT_GAP_BOXPLOT.out.json
294298
misassembly_json = PROCESS_MISASSEMBLY.out.json
295299
misassembly_report_json = PROCESS_MISASSEMBLY.out.report_json

modules/report/report.nf

+3-1
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ process COMPILE_REPORT {
4444
file plot_misassembly_per_ref
4545
file about_md
4646
file containers_config
47+
file snp_report_json
4748

4849
output:
4950
path('pipeline_report*.json')
@@ -84,9 +85,10 @@ workflow report_wf {
8485
misassembly_reference
8586
plot_misassembly
8687
version_file
88+
snp_report_json
8789

8890
main:
8991
COMPILE_VERSIONS(version_file)
90-
COMPILE_REPORT(reads_info, stats_global, pipeline_stats, js, lmas_png, reference, plot_contig_distribution, stats_mapping, plot_completness, plot_lx, plot_phred, plot_gap_reference, plot_snp_reference, plot_gap_boxplot, misassembly_info, misassembly_report, plot_nax, plot_ngx, COMPILE_VERSIONS.out, misassembly_reference, plot_misassembly, IN_MD, containers_config)
92+
COMPILE_REPORT(reads_info, stats_global, pipeline_stats, js, lmas_png, reference, plot_contig_distribution, stats_mapping, plot_completness, plot_lx, plot_phred, plot_gap_reference, plot_snp_reference, plot_gap_boxplot, misassembly_info, misassembly_report, plot_nax, plot_ngx, COMPILE_VERSIONS.out, misassembly_reference, plot_misassembly, IN_MD, containers_config, snp_report_json)
9193

9294
}

templates/compile_reports.py

+27-2
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
ABOUT_MD = "$about_md"
4242
CONTAINERS = "$containers_config"
4343
PLOT_MISASSEMBLY_PER_REFERENCE = "$plot_misassembly_per_ref"
44+
SNP_REPORT_REFERENCE = "$snp_report_json"
4445

4546
logger.debug("Running {} with parameters:".format(
4647
os.path.basename(__file__)))
@@ -69,6 +70,7 @@
6970
logger.debug("ABOUT_MD: {}".format(ABOUT_MD))
7071
logger.debug("CONTAINERS: {}".format(CONTAINERS))
7172
logger.debug("CONTAINERS: {}".format(PLOT_MISASSEMBLY_PER_REFERENCE))
73+
logger.debug("SNP_REPORT_REFERENCE: {}".format(SNP_REPORT_REFERENCE))
7274

7375

7476
html_template = """
@@ -327,7 +329,7 @@ def process_sample_reads(reads_jsons):
327329
def main(main_js, pipeline_stats, assembly_stats_report, contig_size_plots, mapping_stats_report, completness_plot,
328330
lmas_logo, reference_file, lx_json, shrimp_json, gap_reference_json, gap_histogram, plot_misassembly, misassembly_report,
329331
min_contig_size, nax_json, ngx_json, reads_json, snp_reference_json, versions_json, misassembly_per_ref, about_md,
330-
containers_config, plot_misassembly_per_reference_json):
332+
containers_config, plot_misassembly_per_reference_json, snp_per_ref):
331333

332334
metadata = {
333335
"nfMetadata": {
@@ -433,6 +435,29 @@ def main(main_js, pipeline_stats, assembly_stats_report, contig_size_plots, mapp
433435
except KeyError:
434436
item['misassembled_contigs'] = 0
435437
item['misassembly_events'] = 0
438+
439+
# add snp stats
440+
with open(snp_per_ref) as snp_fh:
441+
snp_stats = json.load(snp_fh)
442+
for sample_id in main_data_tables_js.keys():
443+
for reference in main_data_tables_js[sample_id]["ReferenceTables"].keys():
444+
for item_row in main_data_tables_js[sample_id]["ReferenceTables"][reference]:
445+
for item in item_row:
446+
assembler = item['assembler']
447+
try:
448+
references = list(
449+
snp_stats[sample_id][assembler][0].keys())
450+
if reference in references:
451+
index = references.index(reference)
452+
ref_name = references[index]
453+
try:
454+
item['snps'] = snp_stats[sample_id][assembler][0][ref_name]["snps"]
455+
except KeyError:
456+
item['snps'] = 0
457+
else:
458+
item['snps'] = 0
459+
except KeyError:
460+
item['snps'] = 0
436461

437462
for sample_id in main_data_tables_js.keys():
438463
main_data_plots_js[sample_id] = {}
@@ -628,5 +653,5 @@ def main(main_js, pipeline_stats, assembly_stats_report, contig_size_plots, mapp
628653
main(MAIN_JS, PIPELINE_STATS, ASSEMBLY_STATS_REPORT, CONTIG_SIZE_DISTRIBUTION, MAPPING_STATS_REPORT,
629654
COMPLETNESS_JSON, LMAS_LOGO, REFERENCE_FILE, LX_JSON, SHRIMP_JSON, GAP_REFERENCE_JSON, GAP_HISTOGRAM,
630655
MISASSEMBLY_PLOT, MISASSEMBLY_REPORT, MIN_CONTIG_SIZE, NAX_JSON, NGX_JSON, READS_NUMBER, SNP_REFERENCE_JSON,
631-
VERSIONS_JSON, MISASSEMBLY_PER_REF, ABOUT_MD, CONTAINERS, PLOT_MISASSEMBLY_PER_REFERENCE)
656+
VERSIONS_JSON, MISASSEMBLY_PER_REF, ABOUT_MD, CONTAINERS, PLOT_MISASSEMBLY_PER_REFERENCE, SNP_REPORT_REFERENCE)
632657

templates/plot_snp.py

+32-4
Original file line numberDiff line numberDiff line change
@@ -50,12 +50,37 @@
5050

5151
if __file__.endswith(".command.sh"):
5252
DATAFRAME_LIST = '$snp_coords_dataframes'.split()
53+
JSON_LIST = '$snps_jsons'.split()
5354
logger.debug("Running {} with parameters:".format(
5455
os.path.basename(__file__)))
5556
logger.debug("DATAFRAME_LIST: {}".format(DATAFRAME_LIST))
56-
57-
58-
def main(dataframes):
57+
logger.debug("JSON_LIST: {}".format(JSON_LIST))
58+
59+
def snps_per_ref(report_per_reference):
60+
"""
61+
"""
62+
master_report_data_per_reference = {}
63+
for report_reference in report_per_reference:
64+
with open(report_reference) as json_fh:
65+
data_json = json.load(json_fh)
66+
print(data_json)
67+
if data_json["sample"] not in master_report_data_per_reference.keys():
68+
master_report_data_per_reference[data_json["sample"]] = {
69+
data_json["assembler"]: [data_json["reference"]]}
70+
elif data_json["assembler"] not in master_report_data_per_reference[data_json["sample"]].keys():
71+
master_report_data_per_reference[data_json["sample"]][data_json["assembler"]] = [
72+
data_json["reference"]]
73+
else:
74+
master_report_data_per_reference[data_json["sample"]][data_json["assembler"]].append(
75+
data_json["reference"])
76+
77+
logger.debug("Global report data: {}". format(master_report_data_per_reference))
78+
79+
with open("snps_report_per_ref.json", "w") as json_report:
80+
json_report.write(json.dumps(
81+
master_report_data_per_reference, separators=(",", ":")))
82+
83+
def main(dataframes, jsons):
5984

6085
li = []
6186
for filename in dataframes:
@@ -154,7 +179,10 @@ def main(dataframes):
154179

155180
with open('snps_in_reference.json', 'w') as json_report:
156181
json_report.write(json.dumps(report_dict, separators=(",", ":")))
182+
183+
# SNPS STATS PER REF
184+
snps_per_ref(jsons)
157185

158186

159187
if __name__ == '__main__':
160-
main(DATAFRAME_LIST)
188+
main(DATAFRAME_LIST, JSON_LIST)

templates/snp_assessment.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from itertools import groupby
3434
from numpy.lib.function_base import append
3535
import pandas as pd
36+
import json
3637
try:
3738
import utils
3839
except ImportError:
@@ -123,6 +124,9 @@ def main(sample_id, assembler, assembly, mapping, reference):
123124

124125
df = pd.DataFrame(columns=COLUMNS)
125126

127+
reference_report = {"sample": sample_id,
128+
"assembler": assembler, "reference": {}}
129+
126130
# iterator for reference files (sequence length is needed)
127131
references = (x[1] for x in groupby(open(reference, "r"), lambda line: line[0] == ">"))
128132

@@ -139,7 +143,16 @@ def main(sample_id, assembler, assembly, mapping, reference):
139143
#print(coord, substitution)
140144
df = df.append({'Sample': sample_id, 'Assembler': assembler, 'Reference': reference_name,
141145
'Reference Length': len(seq)/3, 'SNP Location': coord, 'Substitution Type': substitution}, ignore_index=True)
142-
146+
147+
if reference_name not in reference_report['reference'].keys():
148+
reference_report['reference'][reference_name] = {"snps": 1}
149+
else:
150+
reference_report['reference'][reference_name]["snps"] += 1
151+
152+
# Write files for report
153+
with open("{}_{}_snps.json".format(sample_id, assembler), "w") as json_report:
154+
json_report.write(json.dumps(reference_report, separators=(",", ":")))
155+
143156
df.to_csv(sample_id + '_' + assembler + '_snps.csv')
144157

145158

0 commit comments

Comments
 (0)