From cdfb0a4f88627dafb174fe16019b8079466e04bb Mon Sep 17 00:00:00 2001
From: Parameswaran Selvam
Date: Sat, 26 Oct 2024 09:28:07 -0700
Subject: [PATCH] Added exception handling to code quality transform

Wrap the per-row statistics computation of transform() in try/except so
that a single document that raises does not abort the whole table.

When computing the statistics for a row raises, the row's document_id is
recorded and a placeholder value is appended to every statistics list, so
each list still has exactly one entry per row when it is added to the
table as a column. After the columns are added, the ids of the failed
documents are printed together with the file name.

Signed-off-by: Parameswaran Selvam
---
 .../python/src/code_quality_transform.py      | 54 +++++++++++++------
 1 file changed, 39 insertions(+), 15 deletions(-)

diff --git a/transforms/code/code_quality/python/src/code_quality_transform.py b/transforms/code/code_quality/python/src/code_quality_transform.py
index 4defb43fe..18cd02281 100644
--- a/transforms/code/code_quality/python/src/code_quality_transform.py
+++ b/transforms/code/code_quality/python/src/code_quality_transform.py
@@ -223,25 +223,47 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab
 
         contents = table.column(self.code_quality["contents_column_name"]).to_pylist()
         languages = table.column(self.code_quality["language_column_name"]).to_pylist()
+        document_id = table.column("document_id").to_pylist()
+        failed_doc_ids = []
 
         # loop over rows and compute filter stats
         for i, c in enumerate(contents):
             # compute lines statistics
-            stats = calculate_line_stats(c)
-            line_mean_values.append(stats["line_mean"])
-            line_max_values.append(stats["line_max"])
-            no_lines_values.append(stats["num_lines"])
-            avg_longest_lines_values.append(stats["avg_longest_lines"])
-
-            alphanum_frac_values.append(calculate_alpha_stats(c)["alphanum_frac"])
-            char_token_ratio_values.append(calculate_char_token_ratio(c, self.tokenizer)["char_token_ratio"])
-
-            is_autogenerated_values.append(is_autogenerated(c))
-            is_config_or_test_values.append(is_config_or_test(c))
-            has_no_keywords_values.append(has_no_keywords(c, languages[i]))
-            has_few_assignments_values.append(has_few_assignments(c, languages[i]))
-            is_xml_values.append(is_xml(c, languages[i]))
-            is_html_values.append(is_html(c, languages[i]))
+            try:
+                stats = calculate_line_stats(c)
+                line_mean_values.append(stats["line_mean"])
+                line_max_values.append(stats["line_max"])
+                no_lines_values.append(stats["num_lines"])
+                avg_longest_lines_values.append(stats["avg_longest_lines"])
+
+                alphanum_frac_values.append(calculate_alpha_stats(c)["alphanum_frac"])
+                char_token_ratio_values.append(calculate_char_token_ratio(c, self.tokenizer)["char_token_ratio"])
+
+                is_autogenerated_values.append(is_autogenerated(c))
+                is_config_or_test_values.append(is_config_or_test(c))
+                has_no_keywords_values.append(has_no_keywords(c, languages[i]))
+                has_few_assignments_values.append(has_few_assignments(c, languages[i]))
+                is_xml_values.append(is_xml(c, languages[i]))
+                is_html_values.append(is_html(c, languages[i]))
+            except Exception:
+                # Remember the failing document and append a placeholder to every
+                # list so each column still gets one value per table row.
+                failed_doc_ids.append(document_id[i])
+                line_mean_values.append(0)
+                line_max_values.append(0)
+                no_lines_values.append(0)
+                avg_longest_lines_values.append(0)
+
+                alphanum_frac_values.append(0)
+                char_token_ratio_values.append(0)
+
+                is_autogenerated_values.append(False)
+                is_config_or_test_values.append(False)
+                has_no_keywords_values.append(False)
+                has_few_assignments_values.append(True)
+                is_xml_values.append(False)
+                is_html_values.append(False)
+
 
         table = TransformUtils.add_column(table=table, name="line_mean", content=line_mean_values)
         table = TransformUtils.add_column(table=table, name="line_max", content=line_max_values)
@@ -256,6 +278,8 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab
         table = TransformUtils.add_column(table=table, name="is_xml", content=is_xml_values)
         table = TransformUtils.add_column(table=table, name="is_html", content=is_html_values)
 
+        if failed_doc_ids:
+            print(f"Failed docs: {failed_doc_ids} in {file_name}")
         return [table], {}
 
 