From b5721388c14f9a5c8238d735cfb3c7a47ac9bd2b Mon Sep 17 00:00:00 2001
From: Chen Wu
Date: Thu, 17 Oct 2024 15:26:39 +0800
Subject: [PATCH] chore: Added fullwidth characters check in cn docs

---
 docs/check_doc_chars.py | 98 +++++++++++++++++++++++++++++------------
 1 file changed, 69 insertions(+), 29 deletions(-)

diff --git a/docs/check_doc_chars.py b/docs/check_doc_chars.py
index 127525da0..7d34e4c63 100755
--- a/docs/check_doc_chars.py
+++ b/docs/check_doc_chars.py
@@ -16,24 +16,27 @@ def ESP_LOGI(x):
 def ESP_LOGE(x):
     print('\033[31m{}\033[0m'.format(x))
 
-# allowed characters, include some chinese characters, symbol, and punctuation
+# allowed characters, include some chinese characters, symbol, and punctuation in en document
 at_allowed_chars_list = ['中文', '®', '℃', '…', '✅', '❌', '√', '×', '├', '└', '│', '–', '—']
-at_not_allowed_chars_list = re.compile(b'[^\x0a\x0d\x20-\x7e]')
 at_file_white_list = ['index_of_abbreviations.rst']
+at_not_allowed_chars_list = re.compile(b'[^\x0a\x0d\x20-\x7e]')
 
-def at_get_file_list(doc_path, subdir_file_list):
-    if os.path.isdir(doc_path):
-        file_list = os.listdir(doc_path)
-    else:
-        subdir_file_list.append(doc_path)
-        return subdir_file_list
-    for file in file_list:
-        cur_path = os.path.join(doc_path, file)
-        if os.path.isdir(cur_path):
-            at_get_file_list(cur_path, subdir_file_list)
-        else:
-            subdir_file_list.append(cur_path)
-    return subdir_file_list
+# fullwidth space, letters, digits, punctuations that are not allowed in cn document
+# (written as \uXXXX escapes: the glyphs look like ASCII and are silently folded by width-normalizing tools)
+at_fullwidth_space = r'\u3000'
+at_fullwidth_letters = r'[\uff21-\uff3a\uff41-\uff5a]'
+at_fullwidth_digits = r'[\uff10-\uff19]'
+at_fullwidth_punctuations = r'[\uff20\uff03\uff04\uff05\uff3e\uff06\uff0a\uff0d\uff0b\uff1d\u3008\u3009\u300c\u300d\u300e\u300f\u3010\u3011\uff3c\uff5c\uff07\uff02\uff1c\uff0e\uff1e\uff0f]'
+at_fullwidth_file_white_list = ['index_of_abbreviations.rst']
+
+# recursively collect the absolute paths of all .rst files under path
+def at_get_rst_list(path):
+    rst_files = []
+    for root, _, files in os.walk(path):
+        for file in files:
+            if file.endswith('.rst'):
+                rst_files.append(os.path.abspath(os.path.join(root, file)))
+    return rst_files
 
 def at_data_is_allowed_chars(match_info, data):
     to_check_idx = match_info.span()
@@ -57,33 +60,70 @@ def at_data_is_allowed_chars(match_info, data):
 
 def at_check_doc_chars_validity(doc_name):
     with open(doc_name, 'rb') as fp:
-        for (lineno, data) in enumerate(fp):
+        for (lineno, data) in enumerate(fp, start=1):
             match_info = re.search(at_not_allowed_chars_list, data)
             if match_info:
                 if not at_data_is_allowed_chars(match_info, data):
-                    ESP_LOGE('Error: illegal character detected at {}:{}'.format(doc_name, lineno + 1))
-                    print('raw data ----> {}\r\n'.format(data))
-                    print('Allowed chars:')
+                    ESP_LOGE('Error: illegal character detected at {}:{}'.format(doc_name, lineno))
+                    print(f'Line {lineno}: {data.strip()}')
+                    print('\r\nBut allowed chars:')
                     for x in at_allowed_chars_list:
                         print(x, '---->', x.encode())
                     return False
             pass
     return True
 
-def _main():
+# run the character check on every .rst file under <root>/en; return False on the first failure, else True
+def at_check_en_format():
     if len(sys.argv) == 2:
-        dst_path = os.path.abspath(sys.argv[1])
+        dst_path = os.path.abspath(sys.argv[1]) + '/en'
     else:
         dst_path = os.path.abspath('.') + '/en'
-    at_en_doc_file_list = at_get_file_list(dst_path, [])
+    at_en_doc_file_list = at_get_rst_list(dst_path)
     for current_file in at_en_doc_file_list:
-        for file_basename in at_file_white_list:
-            if os.path.basename(current_file) == file_basename:
-                continue
-            else:
-                if at_check_doc_chars_validity(current_file) == False:
-                    sys.exit(-1)
-    ESP_LOGI('Document characters check passed! ({})'.format(dst_path))
+        # white-listed files are skipped; every other file is checked exactly once
+        if os.path.basename(current_file) in at_file_white_list:
+            continue
+        if not at_check_doc_chars_validity(current_file):
+            return False
+    return True
+
+# report the first line of file_path that contains a fullwidth character; return False if one is found
+def at_check_fullwidth_chars(file_path):
+    pattern = re.compile(f'{at_fullwidth_space}|{at_fullwidth_letters}|{at_fullwidth_digits}|{at_fullwidth_punctuations}')
+    with open(file_path, 'r', encoding='utf-8') as file:
+        for line_number, line in enumerate(file, start=1):
+            matches = pattern.findall(line)
+            if matches:
+                ESP_LOGE('Error: illegal fullwidth character detected at {}:{}'.format(file_path, line_number))
+                print(f'Line {line_number}: {line.strip()}')
+                print(f"Full-width characters found: {', '.join(matches)}")
+                return False
+    return True
+
+# run the fullwidth check on every .rst file under <root>/zh_CN; return False on the first failure, else True
+def at_check_cn_format():
+    if len(sys.argv) == 2:
+        dst_path = os.path.abspath(sys.argv[1]) + '/zh_CN'
+    else:
+        dst_path = os.path.abspath('.') + '/zh_CN'
+    at_cn_doc_file_list = at_get_rst_list(dst_path)
+    for current_file in at_cn_doc_file_list:
+        # white-listed files are skipped; every other file is checked exactly once
+        if os.path.basename(current_file) in at_fullwidth_file_white_list:
+            continue
+        if not at_check_fullwidth_chars(current_file):
+            return False
+    return True
+
+def _main():
+    if not at_check_en_format():
+        sys.exit(-1)
+
+    if not at_check_cn_format():
+        sys.exit(-1)
+
+    ESP_LOGI('Document characters check passed!')
 
 if __name__ == '__main__':
     _main()