adding support for age and gender, thanks to Luicer Olubayo

maximilianh · maximilianh · commit bf9ae86cc1d6 · 2021-12-08T16:48:40.000+01:00
diff --git a/README.md b/README.md
@@ -156,9 +156,10 @@ files for NCBI and GISAID into mySub/:
 There are four types of possible field names for the meta data:
 
 1) the minimal ones: "isolate", "date" (alias "collection_date") and "location" (alias "country")
-2) the following GISAID field names: covv_collection_date, covv_virus_name, covv_location, covv_assembly_method, covv_coverage, covv_seq_technology
+2) the following GISAID field names: covv_collection_date, covv_virus_name, covv_location, covv_assembly_method, covv_coverage, covv_seq_technology. Additional fields that are accepted in the input file and converted to GISAID fields are: addHost, gender, age, specimen, status, outbreak, last_vaccinated, treatment, coverage, sampleId, subm_lab_sample_id
 3) NCBI source tags, listed here: https://www.ncbi.nlm.nih.gov/WebSub/html/help/genbank-source-table.html, most importantly isolate, collection_date and country (misnomer, it usually includes region and town)
 4) the NCBI Structured comment field names: "Assembly Name", "Assembly Method", "Genome Coverage", "Sequencing Technology", see: https://www.ncbi.nlm.nih.gov/genbank/structuredcomment
+5) For ENA, *all* fields are written to the Biosample. A few special names from the input file are converted to ENA required fields, which are otherwise set to empty or default values: 'gender' -> 'host sex', 'status' -> 'host health state', 'subject' -> 'host subject id'. All others are passed through to the ENA Biosample.
 
 You can also rename any of your own input fields with other names but similar
 content to the NCBI names using the name mapping table in the configuration
diff --git a/multiSub b/multiSub
@@ -1246,14 +1246,19 @@ def writeGisaid(seqs, meta, seqFn, metaFn):
             location = cf["gisaidLocation"]
         addLocation = ""
         host = "Human"
-        addHost = ""
-        gender = "unknown"
-        age = "unknown"
-        status = "unknown"
-        specimen = ""
-        outbreak = ""
-        last_vaccinated = ""
-        treatment = ""
+
+        addHost = metaDict.get("addHost", "")
+        gender = metaDict.get("gender", "unknown")
+        age = metaDict.get("age", "unknown")
+        specimen = metaDict.get("specimen", "")
+        status = metaDict.get("status", "unknown")
+        outbreak = metaDict.get("outbreak", "")
+        last_vaccinated = metaDict.get("last_vaccinated", "")
+        treatment = metaDict.get("treatment", "")
+        coverage = metaDict.get("coverage", "")
+        sampleId = metaDict.get("sampleId", "")
+        subm_lab_sample_id = metaDict.get("subm_lab_sample_id", "")
+
         if "gisaid_technology" in cf:
             technology = cf["gisaid_technology"]
         else:
@@ -1264,13 +1269,11 @@ def writeGisaid(seqs, meta, seqFn, metaFn):
         else:
             assembly_method = ""
 
-        coverage = ""
+
         orig_lab = cf["gisaid_orig_lab"]
         orig_lab_addr = cf["gisaid_orig_lab_addr"]
-        sampleId = ""
         subm_lab = cf["gisaid_orig_lab"]
         subm_lab_addr = cf["gisaid_orig_lab_addr"]
-        subm_lab_sample_id = ""
 
         auStrs = []
         for au in cf["authors"]:
@@ -1503,12 +1506,19 @@ def writeEnaXml(meta, ofh, prefix):
             u"host common name" : host,
             u"sample capture status" : u"active surveillance in response to outbreak",
             u"host scientific name" : u"Homo sapiens",
-            u"host subject id" : seqId, # not correct, but we have no other identifier
-            u"host health state" : u"diseased",
-            u"host sex" : u"not provided",
+            u"host subject id" : seqMeta.get("subject", seqId), # seqId, not correct, but we have no other identifier
+            u"host health state" : seqMeta.get("status", u"diseased")
+            u"host sex" : seqMeta.get("gender", u"not provided")
             u"isolate" : seqMeta["isolate"],
         }
 
+        skipAttrs = ["gender", "subject", "status", "isolate", "collection_date"]
+
+        # merge all the metaDict attributes into the ENA attributes: we pass everything through here
+        for key, val in seqMeta.items():
+            if key not in skipAttrs:
+                attributes[key] = val
+
         for tag, value in attributes.items():
             ofh.write(u'      <SAMPLE_ATTRIBUTE>')
             ofh.write(u'<TAG>%s</TAG>' % tag)