From 3f61c516692b340e34480dae9532b7fa523ef380 Mon Sep 17 00:00:00 2001
From: Romain Primet
Date: Fri, 18 Aug 2023 17:21:02 +0200
Subject: [PATCH] Improve diffing (#58)

* remove all double spaces
* escape markdown special characters from raw legifrance text when diffing
---
 src/catleg/find_changes.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/catleg/find_changes.py b/src/catleg/find_changes.py
index 35b30f6..3fc9d30 100644
--- a/src/catleg/find_changes.py
+++ b/src/catleg/find_changes.py
@@ -39,7 +39,7 @@ async def find_changes(f: TextIO, *, file_path: Path | None = None):
 
         diff, retcode = wdiff(
             _reformat(article.text),
-            _reformat(ref_article.text_and_nota()),
+            _reformat(_escape_ref_text(ref_article.text_and_nota())),
             return_exit_code=True,
             line_offset=article.start_line,
         )
@@ -62,4 +62,11 @@ def _reformat(paragraph: str):
     Catala has a 80-char line limit, so law texts will often be manually
     reformatted. We attempt to remove extra line breaks before comparison.
     """
-    return paragraph.replace("\n", " ").strip().replace("  ", " ")
+    paragraph = paragraph.replace("\n", " ")
+    while "  " in paragraph:
+        paragraph = paragraph.replace("  ", " ")
+    return paragraph.strip()
+
+
+def _escape_ref_text(paragraph: str):
+    return paragraph.replace("[", r"\[").replace("]", r"\]").replace("*", r"\*")