added fixed and remaining reviewer suggestions on #258

BU-ISCIII · Apr 15, 2024 · 2de9f15 · 2de9f15
1 parent 1f28585
commit 2de9f15
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 35 deletions.
diff --git a/relecov_tools/assets/pipeline_utils/viralrecon.py b/relecov_tools/assets/pipeline_utils/viralrecon.py
@@ -137,7 +137,6 @@ def convert_to_json(self, samp_dict):
         result_regex = re.search(
             "variants_long_table(?:_\d{8})?\.csv", os.path.basename(self.file_path)
         )
-        stderr.print(result_regex.group(0))
         if result_regex is None:
             stderr.print(
                 "[red]\tWARN: Couldn't find variants long table file. Expected file name is:"

diff --git a/relecov_tools/conf/bioinfo_config.json b/relecov_tools/conf/bioinfo_config.json
@@ -33,7 +33,7 @@
                 "lineage_analysis_software_version": "pangolin_version",
                 "lineage_analysis_scorpio_version": "scorpio_version",
                 "lineage_analysis_constellation_version": "constellation_version",
-                "lineage_analysis_date":"Not Provided [GENEPIO:0001668]"
+                "lineage_analysis_date":"lineage_analysis_date"
             }
         },
         "variants_long_table": {

diff --git a/relecov_tools/read_bioinfo_metadata.py b/relecov_tools/read_bioinfo_metadata.py
@@ -274,7 +274,7 @@ def handling_files(self, file_list):
         """Handles different file formats to extract data regardless of their structure. The goal is to extract the data contained in files specified in ${file_list}, using either 'standard' handlers defined in this class or pipeline-specific file handlers.
         (inspired from ./metadata_homogenizer.py)
 
-        A file handler method must generate a data structure as follow:
+            A file handler method must generate a data structure as follow:
             {
                 'SAMPLE1': {
                     'field1': 'value1'
@@ -288,7 +288,7 @@ def handling_files(self, file_list):
                 },
                 ...
             }
-        Note: ensure that 'field1','field2','field3' corresponds with the values especifies in the 'content' section of each software configuration scope (see: conf/bioinfo_config.json).
+            Note: ensure that 'field1','field2','field3' corresponds with the values especifies in the 'content' section of each software configuration scope (see: conf/bioinfo_config.json).
 
         Args:
             file_list (list): A list of file path/s to be processed.
@@ -366,7 +366,15 @@ def mapping_over_table(self, j_data, map_data, mapping_fields, table_name):
         field_errors = {}
         field_valid = {}
         for row in j_data:
-            sample_name = row["sequencing_sample_id"]
+            # TODO: We should consider an independent module that verifies that sample's name matches this pattern.
+            #       If we add warnings within this module, every time mapping_over_table is invoked it will print redundant warings
+            sample_match = re.match(
+                r"^(.*?)_R1\.fastq\.gz", row["sequence_file_R1_fastq"]
+            )
+            if sample_match:
+                sample_name = sample_match.group(1)
+            else:
+                continue
             if sample_name in map_data.keys():
                 for field, value in mapping_fields.items():
                     try:
@@ -380,22 +388,27 @@ def mapping_over_table(self, j_data, map_data, mapping_fields, table_name):
                 errors.append(sample_name)
                 for field in mapping_fields.keys():
                     row[field] = "Not Provided [GENEPIO:0001668]"
+        # work around when map_data comes from several per-sample tables/files instead of single table
+        if len(table_name) > 2:
+            table_name = os.path.dirname(table_name[0])
+        else:
+            table_name = table_name[0]
+        # Parse missing sample errors
         if errors:
             lenerrs = len(errors)
-            # work around when map_data comes from several per-sample tables/files instead of single table (from list to str)
-            if len(table_name) > 1:
-                table_name = os.path.dirname(table_name[0])
-            else:
-                table_name = table_name[0]
             self.log_report.update_log_report(
                 method_name,
                 "warning",
                 f"{lenerrs} samples missing in '{table_name}': {', '.join(errors)}.",
             )
         else:
             self.log_report.update_log_report(
-                method_name, "valid", "Successfully mapped data."
+                method_name,
+                "valid",
+                f"All samples were successfully found in {table_name}.",
             )
+        # Parse missing fields errors
+        # TODO: this stdout can be improved
         if len(field_errors) > 0:
             self.log_report.update_log_report(
                 method_name,
@@ -406,8 +419,9 @@ def mapping_over_table(self, j_data, map_data, mapping_fields, table_name):
             self.log_report.update_log_report(
                 method_name,
                 "valid",
-                f"Successfully mapped fields in {', '.join(field_valid.keys())} - {table_name}.",
+                f"Successfully mapped fields in {', '.join(field_valid.keys())}.",
             )
+        # Print report
         self.log_report.print_log_report(method_name, ["valid", "warning"])
         return j_data
 
@@ -468,7 +482,13 @@ def get_multiqc_software_versions(self, file_list, j_data):
         # Mapping multiqc sofware versions to j_data
         field_errors = {}
         for row in j_data:
-            sample_name = row["sequencing_sample_id"]
+            sample_match = re.match(
+                r"^(.*?)_R1\.fastq\.gz", row["sequence_file_R1_fastq"]
+            )
+            if sample_match:
+                sample_name = sample_match.group(1)
+            else:
+                continue
             for field, values in (
                 self.software_config["workflow_summary"].get("content").items()
             ):
@@ -488,7 +508,7 @@ def get_multiqc_software_versions(self, file_list, j_data):
             )
         else:
             self.log_report.update_log_report(
-                method_name, "valid", "Successfully mapped data."
+                method_name, "valid", "Successfully field mapped data."
             )
         self.log_report.print_log_report(method_name, ["valid", "warning"])
         return j_data
@@ -533,38 +553,32 @@ def add_bioinfo_files_path(self, files_found_dict, j_data):
             j_data: Updated j_data with file paths mapped for bioinformatic metadata.
         """
         method_name = f"{self.add_bioinfo_files_path.__name__}"
-        sample_error = 0
+        sample_name_error = 0
         for row in j_data:
-            try:
-                sample_name = re.match(
-                    r"^(.*?)_R1\.fastq\.gz", row["sequence_file_R1_fastq"]
-                ).group(1)
-            except AttributeError as e:
-                sample_error += 1
-                self.log_report.update_log_report(
-                    method_name,
-                    "warning",
-                    f" {row['sequence_file_R1_fastq']} doesn't match pattern '*_R1.fastq.gz'. Cannot add file paths (error: {e})",
-                )
+            sample_match = re.match(
+                r"^(.*?)_R1\.fastq\.gz", row["sequence_file_R1_fastq"]
+            )
+            if sample_match:
+                sample_name = sample_match.group(1)
+            else:
                 continue
             for key, value in files_found_dict.items():
                 file_path = "Not Provided [GENEPIO:0001668]"
                 if value:  # Check if value is not empty
-                    if len(value) > 1:
-                        for file in value:
-                            if sample_name in file:
-                                file_path = file
-                                break  # Exit loop if match found
-                    else:
-                        file_path = value[0]
+                    for file in value:
+                        if sample_name in file:
+                            file_path = file
+                            break  # Exit loop if match found
+                else:
+                    file_path = value[0]
                 path_key = f"{self.software_name}_filepath_{key}"
                 row[path_key] = file_path
         self.log_report.print_log_report(method_name, ["warning"])
-        if sample_error == 0:
+        if sample_name_error == 0:
             self.log_report.update_log_report(
                 method_name, "valid", "File paths added successfully."
             )
-            self.log_report.print_log_report(method_name, ["valid"])
+        self.log_report.print_log_report(method_name, ["valid", "warning"])
         return j_data
 
     def collect_info_from_lab_json(self):