Skip to content

Commit

Permalink
Implemented host_age control
Browse files Browse the repository at this point in the history
  • Loading branch information
Aberdur committed Feb 10, 2025
1 parent d07f942 commit b3daea2
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 27 deletions.
36 changes: 36 additions & 0 deletions relecov_tools/assets/pipeline_utils/viralrecon.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,3 +469,39 @@ def quality_control_evaluation(data):
f"Error processing sample {sample.get('sequencing_sample_id', 'unknown')}: {e}"
)
return data


def property_evaluation(data):
    """Validate the 'host_age' field for each sample in the dataset.

    A sample is valid when its 'host_age' value can be converted with
    ``float()`` and the result lies in the closed range [0, 120]. Missing,
    non-numeric, NaN, infinite and out-of-range values are all reported as
    invalid instead of raising.

    Args:
        data (list): List of dictionaries representing JSON data.

    Returns:
        tuple: (valid_data, invalid_data, errors) where ``valid_data`` and
            ``invalid_data`` are lists holding the original sample dicts
            (input order preserved) and ``errors`` maps each error message
            to the number of times it was produced.
    """
    host_age_field = "host_age"
    valid_data = []
    invalid_data = []
    errors = {}
    for sample in data:
        sample_id = sample.get("sequencing_sample_id")
        raw_age = sample.get(host_age_field)
        try:
            host_age = float(raw_age)
        except (TypeError, ValueError):
            # Missing (None) or non-numeric value.
            host_age = None

        # NaN fails both comparisons, so it is rejected by the range check.
        if host_age is not None and 0 <= host_age <= 120:
            valid_data.append(sample)
            continue

        # Report the value exactly as received: converting with int() would
        # raise for None/NaN/inf and would truncate e.g. 120.5 to a
        # seemingly valid 120.
        error_text = (
            f"Sample {sample_id}: Error in column ({host_age_field}) - "
            f"Invalid age {raw_age}. Must be a number between 0 and 120."
        )
        errors[error_text] = errors.get(error_text, 0) + 1
        invalid_data.append(sample)

    return valid_data, invalid_data, errors
69 changes: 42 additions & 27 deletions relecov_tools/json_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def get_sample_id_field(self):
return sample_id_field

def validate_instances(self):
"""Validate data instances against a validated json schema"""
"""Validate data instances against a validated JSON schema"""

# create validator
validator = Draft202012Validator(self.json_schema)
Expand All @@ -117,50 +117,65 @@ def validate_instances(self):
invalid_json = []
errors = {}
error_keys = {}

if self.sample_id_field is None:
log_text = f"Logs keys set to None. Reason: {self.SAMPLE_FIELD_ERROR}"
self.logsum.add_warning(sample=self.sample_id_field, entry=log_text)

stderr.print("[blue] Start processing the json file")

valid_data, invalid_data, host_age_errors = (
relecov_tools.assets.pipeline_utils.viralrecon.property_evaluation(
self.json_data
)
)

for error_text, count in host_age_errors.items():
self.logsum.add_error(entry=error_text)
errors[error_text] = errors.get(error_text, 0) + count
error_keys[error_text] = "host_age"

for item_row in self.json_data:
# validate(instance=item_row, schema=json_schema)
sample_id_value = item_row.get(self.sample_id_field)
errors_list = []

for error_text, count in host_age_errors.items():
if str(sample_id_value) in error_text:
errors_list.append(error_text)
self.logsum.add_error(sample=sample_id_value, entry=error_text)

if validator.is_valid(item_row):
validated_json_data.append(item_row)
self.logsum.feed_key(sample=sample_id_value)
if errors_list:
item_row["errors"] = errors_list
invalid_json.append(item_row)
else:
validated_json_data.append(item_row)
self.logsum.feed_key(sample=sample_id_value)
else:
# Count error types
for error in validator.iter_errors(item_row):
if error.validator == "required":
error_field = [
f for f in error.validator_value if f in error.message
][0]
else:
error_field = error.absolute_path[0]
try:
err_field_label = schema_props[error_field]["label"]
except KeyError:
log.error("Could not extract label for %s" % error_field)
err_field_label = error_field
error.message.replace(error_field, err_field_label)
error_field = (
error.absolute_path[0] if error.absolute_path else "Unknown"
)
err_field_label = schema_props.get(error_field, {}).get(
"label", error_field
)
error_text = f"Error in column {err_field_label}: {error.message}"
error_keys[error.message] = error_field
if error.message in errors:
errors[error.message] += 1
else:
errors[error.message] = 1
errors[error.message] = errors.get(error.message, 0) + 1
self.logsum.add_error(sample=sample_id_value, entry=error_text)
# append row with errors
errors_list.append(error_text)

item_row["errors"] = errors_list
invalid_json.append(item_row)

# Summarize errors
stderr.print("[blue] --------------------")
stderr.print("[blue] VALIDATION SUMMARY")
stderr.print("[blue] --------------------")
for error_type in errors.keys():
num_of_errors = str(errors[error_type])
field_with_error = str(error_keys[error_type])
error_text = "{} samples failed validation for {}:\n{}"
error_text = error_text.format(num_of_errors, field_with_error, error_type)

for error_type, count in errors.items():
field_with_error = error_keys.get(error_type, "Unknown Field")
error_text = f"{count} samples failed validation for {field_with_error}:\n{error_type}"
self.logsum.add_warning(entry=error_text)
stderr.print(f"[red]{error_text}")
stderr.print("[red] --------------------")
Expand Down

0 comments on commit b3daea2

Please sign in to comment.