Skip to content

Commit bf9ae86

Browse files
committed
adding support for age and gender, thanks to Luicer Olubayo
1 parent 36efe07 commit bf9ae86

File tree

2 files changed

+26
-15
lines changed

2 files changed

+26
-15
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,9 +156,10 @@ files for NCBI and GISAID into mySub/:
156156
There are four types of possible field names for the meta data:
157157

158158
1) the minimal ones: "isolate", "date" (alias "collection_date") and "location" (alias "country")
159-
2) the following GISAID field names: covv_collection_date, covv_virus_name, covv_location, covv_assembly_method, covv_coverage, covv_seq_technology
159+
2) the following GISAID field names: covv_collection_date, covv_virus_name, covv_location, covv_assembly_method, covv_coverage, covv_seq_technology. Additional fields that are accepted in the input file and converted to GISAID fields are: addHost, gender, age, specimen, status, outbreak, last_vaccinated, treatment, coverage, sampleId, subm_lab_sample_id
160160
3) NCBI source tags, listed here: https://www.ncbi.nlm.nih.gov/WebSub/html/help/genbank-source-table.html, most importantly isolate, collection_date and country (misnomer, it usually includes region and town)
161161
4) the NCBI Structured comment field names: "Assembly Name", "Assembly Method", "Genome Coverage", "Sequencing Technology", see: https://www.ncbi.nlm.nih.gov/genbank/structuredcomment
162+
5) For ENA, *all* fields are written to the Biosample. A few special names from the input file are converted to ENA required fields, which are otherwise set to empty or default values: 'gender' -> 'host sex', 'status' -> 'host health state', 'subject' -> 'host subject id'. All other fields are passed through unchanged to the ENA Biosample.
162163

163164
You can also rename any of your own input fields with other names but similar
164165
content to the NCBI names using the name mapping table in the configuration

multiSub

Lines changed: 24 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1246,14 +1246,19 @@ def writeGisaid(seqs, meta, seqFn, metaFn):
12461246
location = cf["gisaidLocation"]
12471247
addLocation = ""
12481248
host = "Human"
1249-
addHost = ""
1250-
gender = "unknown"
1251-
age = "unknown"
1252-
status = "unknown"
1253-
specimen = ""
1254-
outbreak = ""
1255-
last_vaccinated = ""
1256-
treatment = ""
1249+
1250+
addHost = metaDict.get("addHost", "")
1251+
gender = metaDict.get("gender", "unknown")
1252+
age = metaDict.get("age", "unknown")
1253+
specimen = metaDict.get("specimen", "")
1254+
status = metaDict.get("status", "unknown")
1255+
outbreak = metaDict.get("outbreak", "")
1256+
last_vaccinated = metaDict.get("last_vaccinated", "")
1257+
treatment = metaDict.get("treatment", "")
1258+
coverage = metaDict.get("coverage", "")
1259+
sampleId = metaDict.get("sampleId", "")
1260+
subm_lab_sample_id = metaDict.get("subm_lab_sample_id", "")
1261+
12571262
if "gisaid_technology" in cf:
12581263
technology = cf["gisaid_technology"]
12591264
else:
@@ -1264,13 +1269,11 @@ def writeGisaid(seqs, meta, seqFn, metaFn):
12641269
else:
12651270
assembly_method = ""
12661271

1267-
coverage = ""
1272+
12681273
orig_lab = cf["gisaid_orig_lab"]
12691274
orig_lab_addr = cf["gisaid_orig_lab_addr"]
1270-
sampleId = ""
12711275
subm_lab = cf["gisaid_orig_lab"]
12721276
subm_lab_addr = cf["gisaid_orig_lab_addr"]
1273-
subm_lab_sample_id = ""
12741277

12751278
auStrs = []
12761279
for au in cf["authors"]:
@@ -1503,12 +1506,19 @@ def writeEnaXml(meta, ofh, prefix):
15031506
u"host common name" : host,
15041507
u"sample capture status" : u"active surveillance in response to outbreak",
15051508
u"host scientific name" : u"Homo sapiens",
1506-
u"host subject id" : seqId, # not correct, but we have no other identifier
1507-
u"host health state" : u"diseased",
1508-
u"host sex" : u"not provided",
1509+
u"host subject id" : seqMeta.get("subject", seqId), # seqId, not correct, but we have no other identifier
1510+
u"host health state" : seqMeta.get("status", u"diseased")
1511+
u"host sex" : seqMeta.get("gender", u"not provided")
15091512
u"isolate" : seqMeta["isolate"],
15101513
}
15111514

1515+
skipAttrs = ["gender", "subject", "status", "isolate", "collection_date"]
1516+
1517+
# merge all the metaDict attributes into the ENA attributes: we pass everything through here
1518+
for key, val in seqMeta.items():
1519+
if key not in skipAttrs:
1520+
attributes[key] = val
1521+
15121522
for tag, value in attributes.items():
15131523
ofh.write(u' <SAMPLE_ATTRIBUTE>')
15141524
ofh.write(u'<TAG>%s</TAG>' % tag)

0 commit comments

Comments
 (0)