Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make metadata homogeneizer work properly #368

Draft
wants to merge 6 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions relecov_tools/conf/laboratory_address.json
Original file line number Diff line number Diff line change
Expand Up @@ -11087,5 +11087,94 @@
"submitting_institution": "Hospital Comarcal de Melilla",
"submitting_institution_address": "Remonta, 2",
"submitting_institution_email": ""
},
"Ministerio Sanidad": {
"collecting_institution_address": "Paseo del Prado, 18",
"collecting_institution_email": "",
"geo_loc_state": "Comunidad de Madrid",
"geo_loc_region": "Madrid",
"geo_loc_city": "Madrid",
"geo_loc_country": "Spain",
"submitting_institution": "Ministerio Sanidad",
"submitting_institution_address": "Paseo del Prado, 18",
"submitting_institution_email": ""
},
"Analiza, S.L. Hospital Moncloa": {
"collecting_institution_address": "Av. de Valladolid, 83",
"collecting_institution_email": "[email protected]",
"geo_loc_state": "Comunidad de Madrid",
"geo_loc_region": "Madrid",
"geo_loc_city": "Madrid",
"geo_loc_country": "Spain",
"submitting_institution": "Analiza, S.L. Hospital Moncloa",
"submitting_institution_address": "Av. de Valladolid, 83",
"submitting_institution_email": "[email protected]"
},
"Atimmunolab": {
"collecting_institution_address": "Calle de la Salud, 14",
"collecting_institution_email": "[email protected]",
"geo_loc_state": "Comunidad Valenciana",
"geo_loc_region": "Valencia",
"geo_loc_city": "Valencia",
"geo_loc_country": "Spain",
"submitting_institution": "Atimmunolab",
"submitting_institution_address": "Calle de la Salud, 14",
"submitting_institution_email": "[email protected]"
},
"Centro Nacional De Microbiologia": {
"collecting_institution_address": "Carretera Majadahonda-Pozuelo, Km 2",
"collecting_institution_email": "[email protected]",
"geo_loc_state": "Comunidad de Madrid",
"geo_loc_region": "Madrid",
"geo_loc_city": "Madrid",
"geo_loc_country": "Spain",
"submitting_institution": "Centro Nacional De Microbiologia",
"submitting_institution_address": "Carretera Majadahonda-Pozuelo, Km 2",
"submitting_institution_email": "[email protected]"
},
"Presidencia Del Gobierno": {
"collecting_institution_address": "Complejo de la Moncloa, Av. Puerta de Hierro, s/n",
"collecting_institution_email": "[email protected]",
"geo_loc_state": "Comunidad de Madrid",
"geo_loc_region": "Madrid",
"geo_loc_city": "Madrid",
"geo_loc_country": "Spain",
"submitting_institution": "Presidencia Del Gobierno",
"submitting_institution_address": "Complejo de la Moncloa, Av. Puerta de Hierro, s/n",
"submitting_institution_email": "[email protected]"
},
"Xerencia De Xestión Integrada De Pontevedra": {
"collecting_institution_address": "Rúa Loureiro Crespo, 2",
"collecting_institution_email": "[email protected]",
"geo_loc_state": "Galicia",
"geo_loc_region": "Pontevedra",
"geo_loc_city": "Pontevedra",
"geo_loc_country": "Spain",
"submitting_institution": "Xerencia De Xestión Integrada De Pontevedra",
"submitting_institution_address": "Rúa Loureiro Crespo, 2",
"submitting_institution_email": "[email protected]"
},
"Consejeria De Sanidad": {
"collecting_institution_address": "Calle de Aduana, 29",
"collecting_institution_email": "[email protected]",
"geo_loc_state": "Comunidad de Madrid",
"geo_loc_region": "Madrid",
"geo_loc_city": "Madrid",
"geo_loc_country": "Spain",
"submitting_institution": "Consejeria De Sanidad",
"submitting_institution_address": "Calle de Aduana, 29",
"submitting_institution_email": "[email protected]"
},
"Instituto De Medicina Legal De Toledo": {
"collecting_institution_address": "Calle de Dinamarca, 1",
"collecting_institution_email": "[email protected]",
"geo_loc_state": "Castilla-La Mancha",
"geo_loc_region": "Toledo",
"geo_loc_city": "Toledo",
"geo_loc_country": "Spain",
"submitting_institution": "Instituto De Medicina Legal De Toledo",
"submitting_institution_address": "Calle de Dinamarca, 1",
"submitting_institution_email": "[email protected]"
}

}
24 changes: 9 additions & 15 deletions relecov_tools/institution_scripts/ISCIII.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ def replace_originating_lab(metadata, f_data, mapped_fields, heading):
except KeyError as e:
log.error("Value %s does not exist ", e)
stderr.print(f"[red] Value {e} does not exist")
sys.exit(1)
return metadata


Expand All @@ -41,7 +40,7 @@ def added_seq_inst_model(metadata, f_data, mapped_fields, heading):
except KeyError as e:
log.error("Value %s does not exist ", e)
stderr.print(f"[red] Value {e} does not exist")
sys.exit(1)
continue
if "nextseq" in run_name:
row[m_idx] = "Illumina NextSeq 500"
elif "next_seq" in run_name:
Expand All @@ -55,7 +54,6 @@ def added_seq_inst_model(metadata, f_data, mapped_fields, heading):
else:
log.error("Value %s is not defined in the mapping ", run_name)
stderr.print(f"[red] Value {run_name} is not defined in the mapping")
sys.exit(1)
return metadata


Expand All @@ -70,20 +68,19 @@ def translate_gender_to_english(metadata, f_data, mapped_fields, heading):
"unknown": "Not Provided",
}
for row in metadata[1:]:
for key, val in mapped_fields.items():
for key, _ in mapped_fields.items():
m_idx = heading.index(key)
if row[m_idx] is None or row[m_idx] == "":
row[m_idx] = "Not Provided"
continue
item = row[m_idx].lower()
item = str(row[m_idx]).lower()
if item in map_dict:
row[m_idx] = map_dict[item]
else:
log.error("The '%s' is not a valid data for translation", row[m_idx])
log.error("The %s is not a valid data for translation", row[m_idx])
stderr.print(
"f[red] The '{row[m_idx]}' is not a valid data for translation"
f"[red] The '{row[m_idx]}' is not a valid data for translation"
)
sys.exit(1)
return metadata


Expand All @@ -93,7 +90,7 @@ def translate_specimen_source(metadata, f_data, mapped_fields, heading):
for key, val in mapped_fields.items():
m_idx = heading.index(key)
if row[m_idx] is None:
row[m_idx] = "not provided"
row[m_idx] = "Not Provided"
elif "ASPIRADO NASOFARÍNGEO" in row[m_idx].upper():
row[m_idx] = "Nasopharynx Aspiration"
elif "ASPIRADO BRONQUIAL" in row[m_idx].upper():
Expand All @@ -103,19 +100,18 @@ def translate_specimen_source(metadata, f_data, mapped_fields, heading):
elif "EXTRACTO" in row[m_idx].upper():
row[m_idx] = "Scraping"
elif "EXUDADO FARÍNGEO" in row[m_idx].upper():
row[m_idx] = "Pharynx Swabbing"
row[m_idx] = "Pharynx Swab"
elif "EXUDADO NASOFARÍNGEO" in row[m_idx].upper():
row[m_idx] = "Nasopharynx Swabbing"
row[m_idx] = "Nasopharynx swab"
elif "EXUDADO OROFARINGEO" in row[m_idx].upper():
row[m_idx] = "Oropharynx Swabbing"
row[m_idx] = "Oropharynx Swab"
elif "PLACENTA" in row[m_idx].upper():
row[m_idx] = "Placenta"
elif "SALIVA" in row[m_idx].upper():
row[m_idx] = "Saliva"
else:
log.error("The field is not correctly written or is not filled")
stderr.print("The field is not correctly written or not filled")
sys.exit(1)
return metadata


Expand Down Expand Up @@ -157,7 +153,6 @@ def translate_purpose_seq_to_english(metadata, f_data, mapped_fields, heading):
stderr.print(
"f[red] The {row[m_idx]} is not a valid data for translation"
)
sys.exit(1)
return metadata


Expand Down Expand Up @@ -195,5 +190,4 @@ def findout_library_layout(metadata, f_data, mapped_fields, heading):
stderr.print(
f"[red] {e} is not defined in function findout_library_layout"
)
sys.exit(1)
return metadata
21 changes: 14 additions & 7 deletions relecov_tools/metadata_homogeneizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
import os
import sys
import logging
import json
import rich.console

import relecov_tools.utils
import pandas as pd
from relecov_tools.config_json import ConfigJson

log = logging.getLogger(__name__)
Expand All @@ -26,7 +28,9 @@ def __init__(self, institution=None, directory=None, output_folder=None):
self.heading = self.config_json.get_topic_data(
"lab_metadata", "metadata_lab_heading"
)

self.metadata_processing = self.config_json.get_topic_data(
"sftp_handle", "metadata_processing"
)
# handle institution
if institution is None:
self.institution = relecov_tools.utils.prompt_selection(
Expand Down Expand Up @@ -164,9 +168,9 @@ def handling_files(self, file_data, data_to_add):
elif f_name.endswith(".csv"):
data = relecov_tools.utils.read_csv_file_return_dict(f_name, ",")
elif f_name.endswith(".xlsx"):
header_flag = self.metadata_processing.get("header_flag")
data = relecov_tools.utils.read_excel_file(
f_name, "Sheet", header_flag, leave_empty=True
excel_sheet = self.metadata_processing.get("excel_sheet")
data, _ = relecov_tools.utils.read_excel_file(
f_name, excel_sheet, "ID CNM", leave_empty=True
)
else:
log.error("Additional file extension %s is not supported ", f_name)
Expand Down Expand Up @@ -200,17 +204,21 @@ def handling_files(self, file_data, data_to_add):
+ str(s_value)
)
continue

# sys.exit(1)
for m_field, f_field in file_data["mapped_fields"].items():
try:
meta_idx = self.heading.index(m_field)
except ValueError as e:
log.error("Field %s does not exist in Metadata ", e)
log.error("Field %s does not exist in Metadata heading, check config", e)
stderr.print(f"[red] Field {e} does not exist")
sys.exit(1)
break
row[meta_idx] = item_data[f_field]


else:
if data == {'ERROR': 'not valid format'}:
raise ValueError(f"Unknown error during processing of {file_data["file_name"]}")
func_name = file_data["function"]
stderr.print("[yellow] Start processing function " + func_name)
exec(
Expand All @@ -224,7 +232,6 @@ def handling_files(self, file_data, data_to_add):
func_name
+ "(data_to_add, data, file_data['mapped_fields'], self.heading)"
)

stderr.print("[green] Succesful processing of additional file ")
return data_to_add

Expand Down
10 changes: 1 addition & 9 deletions relecov_tools/schema/institution_schemas/ISCIII.json
Original file line number Diff line number Diff line change
Expand Up @@ -88,14 +88,6 @@
"mapped_key": "Runid",
"function": "None"
},
"read_length": {
"file_name": "samples_run_services_length.tsv",
"mapped_fields": {
"Read Length" : "read1_cycles"
},
"mapped_key": "Sample ID given for sequencing",
"function": "None"
},
"samples_in_run": {
"file_name": "run_and_num_of_samples.csv",
"mapped_fields": {
Expand All @@ -114,7 +106,7 @@
},
"purpose_of_sequencing": {
"file_name": "",
"mapped_fields" : {"Purpose of Sequencing" : "" },
"mapped_fields" : {"Purpose of sampling" : "" },
"mapped_key" : "",
"function": "translate_purpose_seq_to_english"
},
Expand Down
10 changes: 9 additions & 1 deletion relecov_tools/schema/relecov_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -975,7 +975,15 @@
"Centro Sanitario Cinco Villas",
"Hospital Viamed Montecanal",
"Hospital Universitario De Ceuta",
"Hospital Comarcal"
"Hospital Comarcal",
"Analiza, S.L. Hospital Moncloa",
"Atimmunolab",
"Ministerio Sanidad",
"Centro Nacional De Microbiologia",
"Presidencia Del Gobierno",
"Xerencia De Xestión Integrada De Pontevedra",
"Consejeria De Sanidad",
"Instituto De Medicina Legal De Toledo"
],
"ontology": "GENEPIO:0001153",
"type": "string",
Expand Down
25 changes: 24 additions & 1 deletion relecov_tools/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,29 @@ def read_json_file(j_file):
return data


def write_to_excel_file(data, f_name, sheet_name, post_process=None):
    """Write rows to a new Excel workbook and save it to disk.

    Args:
        data: Sized iterable of rows; each row is appended to the active
            sheet in order. The first row is treated as the heading when
            the numbering column is added.
        f_name: Path of the Excel file to write.
        sheet_name: Title given to the active sheet.
        post_process: Optional dict of post-processing options. Supported
            keys:
            - "insert_cols": column index passed to ``sheet.insert_cols``.
              After the insertion, cell A1 is set to "CAMPO" and column A
              is filled with consecutive numbers (1, 2, ...) starting on
              the second row.
            - "insert_rows": number of empty rows inserted at the top of
              the sheet.
            ``None`` (the default) means no post-processing.

    Returns:
        None. The workbook is saved to ``f_name``.
    """
    # The default used to be evaluated with ``in`` directly, which raised
    # TypeError for None; treat a missing mapping as "no options".
    if post_process is None:
        post_process = {}

    book = openpyxl.Workbook()
    sheet = book.active
    for row in data:
        sheet.append(row)

    # Add one column holding the row number of every data row.
    if "insert_cols" in post_process:
        sheet.insert_cols(post_process["insert_cols"])
        # NOTE(review): the label and numbering are always written to
        # column A, so this presumably expects insert_cols == 1 - confirm.
        sheet["A1"] = "CAMPO"
        for number in range(1, len(data)):
            sheet["A" + str(number + 1)] = number

    # Add the requested number of empty rows above the existing content.
    if "insert_rows" in post_process:
        for _ in range(post_process["insert_rows"]):
            sheet.insert_rows(1)

    sheet.title = sheet_name
    book.save(f_name)


def read_excel_file(f_name, sheet_name, header_flag, leave_empty=True):
"""Read the input excel file and give the information in a list
of dictionaries
Expand All @@ -73,7 +96,7 @@ def read_excel_file(f_name, sheet_name, header_flag, leave_empty=True):
idx + 1 for idx, x in enumerate(ws_metadata_lab.values) if header_flag in x
][0]
except IndexError:
raise KeyError(f"Header flag '{header_flag}' could not be found in {f_name}")
raise IndexError(f"Header flag '{header_flag}' could not be found in {f_name}")
heading = [str(i.value).strip() for i in ws_metadata_lab[heading_row] if i.value]
ws_data = []
for row in islice(ws_metadata_lab.values, heading_row, ws_metadata_lab.max_row):
Expand Down
Loading