add subtype to the summary csv for flu only

Amanda Sullivan · Amanda Sullivan · commit 6825d1d68670 · 2025-01-21T16:45:45.000-05:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,24 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## v1.4.0 -
+
+### Credits
+
+- [Amanda Sullivan](https://github.com/mandysulli)
+
+### Enhancements
+
+### `Added`
+
+- [PR #41](https://github.com/CDCgov/MIRA-NF/pull/41) - Adds the subtype assigned by IRMA to the summary csv for the flu modules. Works for flu only, and only the csv outputs are updated.
+
+### `Fixed`
+
+### `Dependencies`
+
+### `Deprecated`
+
 ## v1.3.1 - 2024.01.08
 
 ### Credits
diff --git a/bin/extract_subtypes.py b/bin/extract_subtypes.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+
+import pandas as pd
+import numpy as np
+from sys import argv, path, exit, executable
+from os.path import dirname, basename, isfile
+from glob import glob
+
+# Positional arguments, in order: IRMA results directory, amino-acid variants
+# workbook, input summary table, output CSV path.
+try:
+    irma_path, variant_xlsx, summary_csv, outfi = argv[1], argv[2], argv[3], argv[4]
+except IndexError:
+    exit(
+        f"\n\tUSAGE: python {__file__} <path/to/irma/results/> <aavars.xlsx> <summary.txt> <output.csv>\n"
+        f"\n\t\t*Inside path/to/irma/results should be the individual samples-irma-dir results\n"
+        f"\n\tYou entered:\n\t{executable} {' '.join(argv)}\n\n"
+    )
+
+# NOTE(review): this mapping is not referenced anywhere in the visible script;
+# confirm it is unused before removing it.
+na_subtypes = {
+    "N1": "A_NA_N1",
+    "N2": "A_NA_N2",
+    "Bvic": "B_NA"
+}
+
+def irmabam2df(bamFiles):
+    """Build a two-column table (Sample, Files) from IRMA BAM paths.
+
+    The sample name is the directory three levels above each file. Rows come
+    out in the reverse of the order in which ``bamFiles`` is iterated.
+    """
+    # Reverse once up front; this matches prepending each entry in turn.
+    paths = list(bamFiles)[::-1]
+    frame = pd.DataFrame()
+    frame['Sample'] = [basename(dirname(dirname(dirname(p)))) for p in paths]
+    frame['Files'] = paths
+    return frame
+
+
+# Collect the HA and NA BAM files found under each sample's IRMA directory.
+ha_files = glob(irma_path + "/*/IRMA/*/*HA*bam")
+na_files = glob(irma_path + "/*/IRMA/*/*NA*bam")
+
+ha_df = irmabam2df(ha_files)
+na_df = irmabam2df(na_files)
+
+# Label each file from the reference name embedded in its path. regex=False
+# makes the "." in the file names match literally instead of any character;
+# astype(str) keeps the .str accessor usable when no files were found.
+ha_paths = ha_df['Files'].astype(str)
+ha_df['HA_subtype'] = np.where(ha_paths.str.contains("A_HA_H1.bam", regex=False), "H1",
+                   np.where(ha_paths.str.contains("A_HA_H3.bam", regex=False), "H3",
+                   np.where(ha_paths.str.contains("A_HA_H5.bam", regex=False), "H5",
+                   np.where(ha_paths.str.contains("B_HA.bam", regex=False), "BVIC", "Missing"))))
+
+na_paths = na_df['Files'].astype(str)
+na_df['NA_subtype'] = np.where(na_paths.str.contains("A_NA_N1.bam", regex=False), "N1",
+                   np.where(na_paths.str.contains("A_NA_N2.bam", regex=False), "N2",
+                   np.where(na_paths.str.contains("B_NA.bam", regex=False), "BVIC", "Missing")))
+
+# Outer merge keeps samples that have only an HA or only an NA file; the
+# missing half is treated as an empty string when the two labels are joined.
+combined = ha_df.merge(na_df, how="outer", on=["Sample"])
+combined['Subtype'] = combined['HA_subtype'].fillna("") + combined['NA_subtype'].fillna("")
+
+# A B sample labelled on both segments collapses to a single "BVIC"; a lone
+# HA or NA label is reported as "Undetermined".
+combined['Subtype'] = combined['Subtype'].str.replace(r'^BVICBVIC$', 'BVIC', regex=True)
+combined['Subtype'] = combined['Subtype'].str.replace(r'^(?:H1|H3|H5|N1|N2)$', 'Undetermined', regex=True)
+
+check_b = combined[combined['Subtype'].str.contains('BVIC', na=False)]
+
+if check_b.empty:
+    print ("No B's found")
+elif "aavars.xlsx" in variant_xlsx:
+    table = pd.read_excel(variant_xlsx, header=0, engine="openpyxl")
+    for i in check_b['Sample']:
+        # regex=False: sample names are matched as plain text, not as patterns.
+        # NOTE(review): this is still a substring match, so sample "S1" also
+        # selects rows for "S10" — confirm whether an exact match is intended.
+        b_samples = table[table['Sample'].str.contains(i, na=False, regex=False)]
+        b_samples_ha = b_samples[b_samples['Protein'].str.contains('HA1', na=False)]
+        # Any cell equal to the reference name decides the call: BRISBANE60
+        # leaves the sample as BVIC, PHUKET3073 relabels it BYAM.
+        if (b_samples_ha == 'BRISBANE60').any().any():
+            print("BVIC")
+        elif (b_samples_ha == 'PHUKET3073').any().any():
+            combined.loc[combined['Sample'] == i, 'Subtype'] = 'BYAM'
+            print("BYAM")
+else:
+    print("AA variant table not found!")
+
+# Stop with a clear message instead of failing later with a NameError on an
+# undefined summary table.
+if "summary.txt" not in summary_csv:
+    exit(f"\n\tERROR: expected a summary table whose name contains 'summary.txt', got: {summary_csv}\n")
+
+summary_table = pd.read_csv(summary_csv, header=0)
+
+# NOTE(review): fillna applies to every column of the merged table, not only
+# 'Subtype' — confirm that blank summary fields should read "Undetermined".
+merged_df = summary_table.merge(combined[['Sample', 'Subtype']], on='Sample', how='outer').fillna("Undetermined")
+
+merged_df.to_csv(outfi, sep=",", index=False, header=True)
diff --git a/conf/modules.config b/conf/modules.config
@@ -223,4 +223,14 @@ process {
         ]
     }
 
+    // Publish ADDFLUSUBTYPE's csv output to the aggregate csv-reports directory.
+    withName: 'ADDFLUSUBTYPE' {
+        publishDir = [
+            [
+                path: { "${params.outdir}/aggregate_outputs/csv-reports" },
+                pattern: '*.csv',
+                mode: params.publish_dir_mode
+            ]
+        ]
+    }
+
 }
diff --git a/modules/local/addflusubtype.nf b/modules/local/addflusubtype.nf
@@ -0,0 +1,43 @@
+// Runs extract_subtypes.py on the IRMA results, the aavars workbook and the
+// input summary, and emits <run_name>_summary.csv.
+process ADDFLUSUBTYPE {
+    label 'process_single'
+
+    container 'cdcgov/mira-nf:python3.10-alpine'
+
+    input:
+    // Passed to extract_subtypes.py as its first argument.
+    path irma_dir
+    // Only the base name is used, to name the output csv.
+    path run_path
+    // Passed to extract_subtypes.py as its second argument.
+    path aavars
+    // Passed to extract_subtypes.py as its third argument.
+    path input_summary
+
+    output:
+    path '*.csv'       , emit: updatedcsv
+    path 'versions.yml', emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def run_name = run_path.getBaseName()
+
+    """
+    extract_subtypes.py ${irma_dir} ${aavars} ${input_summary} ${run_name}_summary.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        addflusubtype: \$(python --version |& sed '1!d ; s/Python //')
+    END_VERSIONS
+    """
+
+    stub:
+    def run_name = run_path.getBaseName()
+
+    // The stub creates the same csv name as the real script so the declared
+    // '*.csv' output is satisfied.
+    """
+    touch ${run_name}_summary.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        addflusubtype: \$(python --version |& sed '1!d ; s/Python //')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/parquetmaker.nf b/modules/local/parquetmaker.nf
@@ -12,7 +12,9 @@ process PARQUETMAKER {
 
     output:
     path('*.{parq,csv}'), emit: summary_parq
-    path 'versions.yml' , emit: versions
+    path 'versions.yml', emit: versions
+    path 'aavars.xlsx', emit: aavars
+    path 'input_summary.txt', emit: input_summary
 
     when:
     task.ext.when == null || task.ext.when
@@ -54,6 +56,9 @@ process PARQUETMAKER {
     head -n 65 run_info_setup.txt > run_info.txt
     parquet_maker.py -f run_info.txt -o ${run_name}_irma_config -r ${run_name} -i ${instrument}
 
+    cp MIRA_${run_name}_aavars.xlsx ./aavars.xlsx
+    cp ${run_name}_summary.csv ./input_summary.txt
+
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
         parquetmaker: \$(python3 --version |& sed '1!d ; s/python3 //')
diff --git a/subworkflows/local/preparereports.nf b/subworkflows/local/preparereports.nf
@@ -7,6 +7,7 @@ include { CHECKMIRAVERSION   } from '../../modules/local/checkmiraversion'
 include { PREPAREIRMAJSON   } from '../../modules/local/prepareirmajson'
 include { STATICHTML        } from '../../modules/local/statichtml'
 include { PARQUETMAKER      } from '../../modules/local/parquetmaker'
+include { ADDFLUSUBTYPE      } from '../../modules/local/addflusubtype'
 
 workflow PREPAREREPORTS {
     take:
@@ -115,6 +116,11 @@ workflow PREPAREREPORTS {
         PARQUETMAKER(STATICHTML.out.reports, run_ID_ch, input_ch, instrment_ch, irma_dir_ch)
         ch_versions = ch_versions.mix(PARQUETMAKER.out.versions)
 
+        if (params.e == 'Flu-Illumina' || params.e == 'Flu-ONT') {
+            ADDFLUSUBTYPE(irma_dir_ch, run_ID_ch, PARQUETMAKER.out.aavars, PARQUETMAKER.out.input_summary)
+            ch_versions = ch_versions.mix(ADDFLUSUBTYPE.out.versions)
+        }
+
         versions_path_ch = ch_versions.distinct().collectFile(name: 'collated_versions.yml')
     } else if (params.reformat_tables == false) {
         versions_path_ch = ch_versions.distinct().collectFile(name: 'collated_versions.yml')

Original file line number	Diff line number	Diff line change
`@@ -223,4 +223,14 @@ process {`
`223`	`223`	`]`
`224`	`224`	`}`
`225`	`225`
	`226`	`+ withName: 'ADDFLUSUBTYPE' {`
	`227`	`+ publishDir = [`
	`228`	`+ [`
	`229`	`+ path: { "${params.outdir}/aggregate_outputs/csv-reports" },`
	`230`	`+ pattern: '*.csv',`
	`231`	`+ mode: params.publish_dir_mode`
	`232`	`+ ]`
	`233`	`+ ]`
	`234`	`+ }`
	`235`	`+`
`226`	`236`	`}`