Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Quality Control to sample metadata "host_age" column #385

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Code contributions to the release:

- Added more robust datatype handling to the read_csv_file_return_dict() method in utils.py [#379](https://github.com/BU-ISCIII/relecov-tools/pull/379)
- Improved relecov template generator and version control [#382](https://github.com/BU-ISCIII/relecov-tools/pull/382)
- Added "host_age" control in validation module [#385](https://github.com/BU-ISCIII/relecov-tools/pull/385)

#### Fixes

Expand Down
34 changes: 34 additions & 0 deletions relecov_tools/assets/pipeline_utils/viralrecon.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,3 +469,37 @@ def quality_control_evaluation(data):
f"Error processing sample {sample.get('sequencing_sample_id', 'unknown')}: {e}"
)
return data


def property_evaluation(data):
    """Validate the 'host_age' field for each sample in the dataset.

    A sample is valid when its 'host_age' value can be converted to a float
    and lies in the closed interval [0, 120]. Every other sample (missing
    value, non-numeric value, out-of-range value) is reported as invalid.

    Args:
        data (list): List of dictionaries representing JSON data.

    Returns:
        tuple: (valid_data, invalid_data, errors) where
            valid_data (list): samples whose 'host_age' passed the check.
            invalid_data (list): samples whose 'host_age' failed the check.
            errors (dict): error message -> number of times it was produced.
    """
    host_age_field = "host_age"
    valid_data = []
    invalid_data = []
    errors = {}
    for sample in data:
        sample_id = sample.get("sequencing_sample_id")
        raw_age = sample.get(host_age_field)
        try:
            parsed_age = float(raw_age)
        except (TypeError, ValueError):
            parsed_age = None

        # NaN fails the chained comparison, so it is rejected as well.
        if parsed_age is not None and 0 <= parsed_age <= 120:
            valid_data.append(sample)
            continue

        # Report the value as it was received when it could not be parsed;
        # otherwise the message would always read "Invalid age None" and hide
        # the offending input from whoever reads the log.
        reported_age = raw_age if parsed_age is None else parsed_age
        error_text = f"Sample {sample_id}: Error in column ({host_age_field}) - Invalid age {reported_age}. Must be a number between 0 and 120."
        errors[error_text] = errors.get(error_text, 0) + 1
        invalid_data.append(sample)

    return valid_data, invalid_data, errors
69 changes: 42 additions & 27 deletions relecov_tools/json_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def get_sample_id_field(self):
return sample_id_field

def validate_instances(self):
"""Validate data instances against a validated json schema"""
"""Validate data instances against a validated JSON schema"""

# create validator
validator = Draft202012Validator(self.json_schema)
Expand All @@ -117,50 +117,65 @@ def validate_instances(self):
invalid_json = []
errors = {}
error_keys = {}

if self.sample_id_field is None:
log_text = f"Logs keys set to None. Reason: {self.SAMPLE_FIELD_ERROR}"
self.logsum.add_warning(sample=self.sample_id_field, entry=log_text)

stderr.print("[blue] Start processing the json file")

valid_data, invalid_data, host_age_errors = (
relecov_tools.assets.pipeline_utils.viralrecon.property_evaluation(
self.json_data
)
)

for error_text, count in host_age_errors.items():
self.logsum.add_error(entry=error_text)
errors[error_text] = errors.get(error_text, 0) + count
error_keys[error_text] = "host_age"

for item_row in self.json_data:
# validate(instance=item_row, schema=json_schema)
sample_id_value = item_row.get(self.sample_id_field)
errors_list = []

for error_text, count in host_age_errors.items():
if str(sample_id_value) in error_text:
errors_list.append(error_text)
self.logsum.add_error(sample=sample_id_value, entry=error_text)

if validator.is_valid(item_row):
validated_json_data.append(item_row)
self.logsum.feed_key(sample=sample_id_value)
if errors_list:
item_row["errors"] = errors_list
invalid_json.append(item_row)
else:
validated_json_data.append(item_row)
self.logsum.feed_key(sample=sample_id_value)
else:
# Count error types
for error in validator.iter_errors(item_row):
if error.validator == "required":
error_field = [
f for f in error.validator_value if f in error.message
][0]
else:
error_field = error.absolute_path[0]
try:
err_field_label = schema_props[error_field]["label"]
except KeyError:
log.error("Could not extract label for %s" % error_field)
err_field_label = error_field
error.message.replace(error_field, err_field_label)
error_field = (
error.absolute_path[0] if error.absolute_path else "Unknown"
)
err_field_label = schema_props.get(error_field, {}).get(
"label", error_field
)
error_text = f"Error in column {err_field_label}: {error.message}"
error_keys[error.message] = error_field
if error.message in errors:
errors[error.message] += 1
else:
errors[error.message] = 1
errors[error.message] = errors.get(error.message, 0) + 1
self.logsum.add_error(sample=sample_id_value, entry=error_text)
# append row with errors
errors_list.append(error_text)

item_row["errors"] = errors_list
invalid_json.append(item_row)

# Summarize errors
stderr.print("[blue] --------------------")
stderr.print("[blue] VALIDATION SUMMARY")
stderr.print("[blue] --------------------")
for error_type in errors.keys():
num_of_errors = str(errors[error_type])
field_with_error = str(error_keys[error_type])
error_text = "{} samples failed validation for {}:\n{}"
error_text = error_text.format(num_of_errors, field_with_error, error_type)

for error_type, count in errors.items():
field_with_error = error_keys.get(error_type, "Unknown Field")
error_text = f"{count} samples failed validation for {field_with_error}:\n{error_type}"
self.logsum.add_warning(entry=error_text)
stderr.print(f"[red]{error_text}")
stderr.print("[red] --------------------")
Expand Down
Loading