Skip to content

Commit 6825d1d

Browse files
author
Amanda Sullivan
committed
add subtype to the summary csv for flu only
1 parent fd8f1c4 commit 6825d1d

File tree

6 files changed

+176
-1
lines changed

6 files changed

+176
-1
lines changed

CHANGELOG.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,24 @@
33
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
44
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
55

6+
## v1.4.0 -
7+
8+
### Credits
9+
10+
- [Amanda Sullivan](https://github.com/mandysulli)
11+
12+
### Enhancements
13+
14+
### `Added`
15+
16+
- [PR #41](https://github.com/CDCgov/MIRA-NF/pull/41) - Adds the subtype assigned by IRMA to the summary csv for the flu modules. This only works for flu, and the subtype is only added to the csv outputs.
17+
18+
### `Fixed`
19+
20+
### `Dependencies`
21+
22+
### `Deprecated`
23+
624
## v1.3.1 - 2024.01.08
725

826
### Credits

bin/extract_subtypes.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
#!/usr/bin/env python
2+
3+
import pandas as pd
4+
import numpy as np
5+
from sys import argv, path, exit, executable
6+
from os.path import dirname, basename, isfile
7+
from glob import glob
8+
9+
# Positional command-line arguments:
#   1. irma_path    - directory holding one IRMA result folder per sample
#   2. variant_xlsx - amino-acid variants workbook (file name contains "aavars.xlsx")
#   3. summary_csv  - run summary table (file name contains "summary.txt")
#   4. outfi        - path of the CSV to write
# Any arguments after the fourth are ignored.
try:
    irma_path, variant_xlsx, summary_csv, outfi = argv[1:5]
except ValueError:
    # Fewer than four arguments were supplied: print usage and stop.
    exit(
        f"\n\tUSAGE: python {__file__} <path/to/irma/results/> <aavars.xlsx> <summary.txt> <output.csv>\n"
        f"\n\t\t*Inside path/to/irma/results should be the individual samples-irma-dir results\n"
        f"\n\tYou entered:\n\t{executable} {' '.join(argv)}\n\n"
    )

# Maps a neuraminidase label to an IRMA segment name.
# NOTE(review): nothing in the visible part of this script reads this
# mapping - confirm whether it is still needed.
na_subtypes = {
    "N1": "A_NA_N1",
    "N2": "A_NA_N2",
    "Bvic": "B_NA"
}
23+
24+
def irmabam2df(bamFiles):
    """Build a two-column table of sample names and BAM paths.

    The sample name is the directory three levels above each file, i.e. for
    ``<root>/<sample>/IRMA/<sample>/<segment>.bam`` it is ``<sample>``.

    Args:
        bamFiles: iterable of BAM file paths.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Sample`` and ``Files``.
        Rows are in the reverse order of ``bamFiles``.
    """
    # Reverse once up front instead of repeatedly inserting at index 0,
    # which is quadratic; the resulting row order is the same.
    ordered = list(bamFiles)[::-1]
    samples = [basename(dirname(dirname(dirname(f)))) for f in ordered]
    # Explicit object dtype: without it an empty input may produce numeric
    # columns (depending on the pandas version), and the string accessor
    # used on these columns later would then fail.
    return pd.DataFrame(
        {
            "Sample": pd.Series(samples, dtype="object"),
            "Files": pd.Series(ordered, dtype="object"),
        }
    )
35+
36+
37+
# Ordered (file-name token, label) pairs; the first token found wins.
HA_TOKENS = (
    ("A_HA_H1.bam", "H1"),
    ("A_HA_H3.bam", "H3"),
    ("A_HA_H5.bam", "H5"),
    ("B_HA.bam", "BVIC"),
)
NA_TOKENS = (
    ("A_NA_N1.bam", "N1"),
    ("A_NA_N2.bam", "N2"),
    ("B_NA.bam", "BVIC"),
)


def _label_segment(file_name, tokens):
    """Return the label of the first token contained in file_name, else "Missing"."""
    # Plain substring test: the "." in ".bam" is a literal dot here, whereas a
    # regex-based match would treat it as "any character".
    for token, label in tokens:
        if token in file_name:
            return label
    return "Missing"


# Collect the HA / NA alignment files of every sample.
# Expected layout: <irma_path>/<sample>/IRMA/<sample>/<segment>.bam
ha_files = glob(irma_path + "/*/IRMA/*/*HA*bam")
na_files = glob(irma_path + "/*/IRMA/*/*NA*bam")

ha_df = irmabam2df(ha_files)
na_df = irmabam2df(na_files)

# Label each file. Building the column as an object Series keeps this working
# when no file was found (an empty table).
ha_df["HA_subtype"] = pd.Series(
    [_label_segment(str(f), HA_TOKENS) for f in ha_df["Files"]],
    index=ha_df.index,
    dtype="object",
)
na_df["NA_subtype"] = pd.Series(
    [_label_segment(str(f), NA_TOKENS) for f in na_df["Files"]],
    index=na_df.index,
    dtype="object",
)

# One row per HA/NA file pairing of a sample; a sample with only one of the
# two segments keeps a missing value on the other side.
combined = ha_df.merge(na_df, how="outer", on=["Sample"])

# Concatenate the two labels (missing -> ""), e.g. "H1" + "N1" -> "H1N1".
# Column-wise concatenation also behaves on an empty table, unlike a
# row-wise aggregate.
combined["Subtype"] = combined["HA_subtype"].fillna("") + combined["NA_subtype"].fillna("")

# Influenza B is labelled on both segments; collapse the doubled label.
combined["Subtype"] = combined["Subtype"].str.replace(r"^BVICBVIC$", "BVIC", regex=True)
# A lone HA or NA label of influenza A is not a complete subtype.
combined["Subtype"] = combined["Subtype"].str.replace(
    r"^(?:H1|H3|H5|N1|N2)$", "Undetermined", regex=True
)

# Rows provisionally labelled as influenza B; the variants workbook is used
# below to decide between BVIC and BYAM.
check_b = combined[combined["Subtype"].str.contains("BVIC", na=False)]

if check_b.empty:
    print("No B's found")
elif "aavars.xlsx" in variant_xlsx:
    table = pd.read_excel(variant_xlsx, header=0, engine="openpyxl")
    for sample_id in check_b["Sample"]:
        # regex=False so characters such as "." or "+" in a sample ID are
        # matched literally.
        # NOTE(review): this is still a substring match, so "S1" also selects
        # rows of "S10" - confirm whether an exact comparison is intended.
        b_samples = table[table["Sample"].str.contains(sample_id, na=False, regex=False)]
        b_samples_ha = b_samples[b_samples["Protein"].str.contains("HA1", na=False)]
        # Any cell equal to BRISBANE60 keeps the BVIC label; any cell equal to
        # PHUKET3073 relabels the sample as BYAM.
        if (b_samples_ha == "BRISBANE60").any().any():
            print("BVIC")
        elif (b_samples_ha == "PHUKET3073").any().any():
            combined.loc[combined["Sample"] == sample_id, "Subtype"] = "BYAM"
            print("BYAM")
else:
    print("AA variant table not found!")

# Fail with a clear message instead of a NameError further down when the
# third argument is not the expected summary table.
if "summary.txt" not in summary_csv:
    exit(f"\n\tERROR: expected a summary table named '*summary.txt', got: {summary_csv}\n")

# Read sample IDs as text so purely numeric IDs still merge with the
# directory-derived (string) sample names.
summary_table = pd.read_csv(summary_csv, header=0, dtype={"Sample": str})

# NOTE(review): fillna is applied to the whole merged table, so every empty
# cell (not only Subtype) becomes "Undetermined" - confirm this is intended.
merged_df = summary_table.merge(
    combined[["Sample", "Subtype"]], on="Sample", how="outer"
).fillna("Undetermined")

merged_df.to_csv(outfi, sep=",", index=False, header=True)

conf/modules.config

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,4 +223,14 @@ process {
223223
]
224224
}
225225

226+
withName: 'ADDFLUSUBTYPE' {
227+
publishDir = [
228+
[
229+
path: { "${params.outdir}/aggregate_outputs/csv-reports" },
230+
pattern: '*.csv',
231+
mode: params.publish_dir_mode
232+
]
233+
]
234+
}
235+
226236
}

modules/local/addflusubtype.nf

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
// Appends the influenza subtype to the run summary and emits it as
// <run_name>_summary.csv (run_name is the base name of run_path).
process ADDFLUSUBTYPE {
    label 'process_single'

    container 'cdcgov/mira-nf:python3.10-alpine'

    input:
    path irma_dir
    path run_path
    path aavars
    path input_summary

    output:
    path '*.csv', emit: updatedcsv
    path 'versions.yml' , emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    def run_name = run_path.getBaseName()

    // The step runs a Python script, so the recorded version is Python's.
    """
    extract_subtypes.py ${irma_dir} ${aavars} ${input_summary} ${run_name}_summary.csv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        addflusubtype: \$(python --version |& sed '1!d ; s/Python //')
    END_VERSIONS
    """

    stub:
    def args = task.ext.args ?: ''
    def run_name = run_path.getBaseName()

    // Create the same file name the real script writes so the declared
    // '*.csv' output is satisfied in stub runs.
    """
    touch ${run_name}_summary.csv

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        addflusubtype: \$(python --version |& sed '1!d ; s/Python //')
    END_VERSIONS
    """
}

modules/local/parquetmaker.nf

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@ process PARQUETMAKER {
1212

1313
output:
1414
path('*.{parq,csv}'), emit: summary_parq
15-
path 'versions.yml' , emit: versions
15+
path 'versions.yml', emit: versions
16+
path 'aavars.xlsx', emit: aavars
17+
path 'input_summary.txt', emit: input_summary
1618

1719
when:
1820
task.ext.when == null || task.ext.when
@@ -54,6 +56,9 @@ process PARQUETMAKER {
5456
head -n 65 run_info_setup.txt > run_info.txt
5557
parquet_maker.py -f run_info.txt -o ${run_name}_irma_config -r ${run_name} -i ${instrument}
5658
59+
cp MIRA_${run_name}_aavars.xlsx ./aavars.xlsx
60+
cp ${run_name}_summary.csv ./input_summary.txt
61+
5762
cat <<-END_VERSIONS > versions.yml
5863
"${task.process}":
5964
parquetmaker: \$(python3 --version |& sed '1!d ; s/python3 //')

subworkflows/local/preparereports.nf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ include { CHECKMIRAVERSION } from '../../modules/local/checkmiraversion'
77
include { PREPAREIRMAJSON } from '../../modules/local/prepareirmajson'
88
include { STATICHTML } from '../../modules/local/statichtml'
99
include { PARQUETMAKER } from '../../modules/local/parquetmaker'
10+
include { ADDFLUSUBTYPE } from '../../modules/local/addflusubtype'
1011

1112
workflow PREPAREREPORTS {
1213
take:
@@ -115,6 +116,11 @@ workflow PREPAREREPORTS {
115116
PARQUETMAKER(STATICHTML.out.reports, run_ID_ch, input_ch, instrment_ch, irma_dir_ch)
116117
ch_versions = ch_versions.mix(PARQUETMAKER.out.versions)
117118

119+
if (params.e == 'Flu-Illumina' || params.e == 'Flu-ONT') {
120+
ADDFLUSUBTYPE(irma_dir_ch, run_ID_ch, PARQUETMAKER.out.aavars, PARQUETMAKER.out.input_summary)
121+
ch_versions = ch_versions.mix(ADDFLUSUBTYPE.out.versions)
122+
}
123+
118124
versions_path_ch = ch_versions.distinct().collectFile(name: 'collated_versions.yml')
119125
} else if (params.reformat_tables == false) {
120126
versions_path_ch = ch_versions.distinct().collectFile(name: 'collated_versions.yml')

0 commit comments

Comments
 (0)