def property_evaluation(data):
    """Validate the 'host_age' field for each sample in the dataset.

    A sample is accepted when its 'host_age' value converts to a float that
    lies in the inclusive range 0-120. Missing, non-numeric, NaN and
    out-of-range values are rejected.

    Args:
        data (list): List of dictionaries representing JSON data.

    Returns:
        tuple: (valid_data, invalid_data, errors), where ``errors`` maps each
            error message to the number of samples that produced it.
    """
    host_age_field = "host_age"
    valid_data = []
    invalid_data = []
    errors = {}
    for sample in data:
        sample_id = sample.get("sequencing_sample_id")
        raw_age = sample.get(host_age_field)
        try:
            host_age = float(raw_age)
        except (TypeError, ValueError):
            host_age = None

        # NaN fails the chained comparison, so it is rejected here as well.
        if host_age is not None and 0 <= host_age <= 120:
            valid_data.append(sample)
            continue

        # Show the parsed number when there is one; otherwise show the raw
        # input so the message identifies what was actually in the data
        # instead of always printing "None" for unparseable values.
        shown_age = repr(raw_age) if host_age is None else host_age
        error_text = (
            f"Sample {sample_id}: Error in column ({host_age_field}) - "
            f"Invalid age {shown_age}. Must be a number between 0 and 120."
        )
        errors[error_text] = errors.get(error_text, 0) + 1
        invalid_data.append(sample)

    return valid_data, invalid_data, errors
self.logsum.feed_key(sample=sample_id_value) else: - # Count error types for error in validator.iter_errors(item_row): - if error.validator == "required": - error_field = [ - f for f in error.validator_value if f in error.message - ][0] - else: - error_field = error.absolute_path[0] - try: - err_field_label = schema_props[error_field]["label"] - except KeyError: - log.error("Could not extract label for %s" % error_field) - err_field_label = error_field - error.message.replace(error_field, err_field_label) + error_field = ( + error.absolute_path[0] if error.absolute_path else "Unknown" + ) + err_field_label = schema_props.get(error_field, {}).get( + "label", error_field + ) error_text = f"Error in column {err_field_label}: {error.message}" error_keys[error.message] = error_field - if error.message in errors: - errors[error.message] += 1 - else: - errors[error.message] = 1 + errors[error.message] = errors.get(error.message, 0) + 1 self.logsum.add_error(sample=sample_id_value, entry=error_text) - # append row with errors + errors_list.append(error_text) + + item_row["errors"] = errors_list invalid_json.append(item_row) # Summarize errors stderr.print("[blue] --------------------") stderr.print("[blue] VALIDATION SUMMARY") stderr.print("[blue] --------------------") - for error_type in errors.keys(): - num_of_errors = str(errors[error_type]) - field_with_error = str(error_keys[error_type]) - error_text = "{} samples failed validation for {}:\n{}" - error_text = error_text.format(num_of_errors, field_with_error, error_type) + + for error_type, count in errors.items(): + field_with_error = error_keys.get(error_type, "Unknown Field") + error_text = f"{count} samples failed validation for {field_with_error}:\n{error_type}" self.logsum.add_warning(entry=error_text) stderr.print(f"[red]{error_text}") stderr.print("[red] --------------------")