Skip to content

Commit

Permalink
chore: Added fullwidth characters check in cn docs
Browse files Browse the repository at this point in the history
  • Loading branch information
ustccw committed Oct 17, 2024
1 parent 4a3ce5f commit b572138
Showing 1 changed file with 63 additions and 24 deletions.
87 changes: 63 additions & 24 deletions docs/check_doc_chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,24 +16,25 @@ def ESP_LOGI(x):
def ESP_LOGE(x):
    """Print *x* to stdout wrapped in the ANSI escape codes for red text."""
    # \033[31m switches the foreground to red, \033[0m resets the attributes.
    print(f'\033[31m{x}\033[0m')

# Characters (some Chinese characters, symbols, and punctuation) that are
# allowed in the en documents.
at_allowed_chars_list = ['中文', '®', '℃', '…', '✅', '❌', '√', '×', '├', '└', '│', '–', '—']
# Base names of files that the en check skips.
at_file_white_list = ['index_of_abbreviations.rst']
# Matches any single byte that is not LF, CR, or in the range 0x20-0x7e.
at_not_allowed_chars_list = re.compile(b'[^\x0a\x0d\x20-\x7e]')

def at_get_file_list(doc_path, subdir_file_list):
    """Recursively collect every non-directory path under *doc_path*.

    Args:
        doc_path: A directory to descend into, or any other path, which is
            recorded as-is.
        subdir_file_list: List that found paths are appended to in place.

    Returns:
        The same *subdir_file_list* object, after appending.
    """
    if not os.path.isdir(doc_path):
        subdir_file_list.append(doc_path)
        return subdir_file_list
    for entry in os.listdir(doc_path):
        # The recursive call appends plain files itself and descends into
        # sub-directories, so no separate file/dir branch is needed here.
        at_get_file_list(os.path.join(doc_path, entry), subdir_file_list)
    return subdir_file_list
# Fullwidth space, letters, digits, and punctuation that are not allowed in
# cn documents. The characters are spelled as \uXXXX escapes so the fullwidth
# forms cannot be silently folded to their ASCII look-alikes by editors or
# text-processing tools; the ASCII forms would match ordinary text.
at_fullwidth_space = '\u3000'                          # ideographic space
at_fullwidth_letters = '[\uff21-\uff3a\uff41-\uff5a]'  # fullwidth A-Z and a-z
at_fullwidth_digits = '[\uff10-\uff19]'                # fullwidth 0-9
at_fullwidth_punctuations = (
    '['
    # fullwidth forms of @ # $ % ^ & * - + =
    '\uff20\uff03\uff04\uff05\uff3e\uff06\uff0a\uff0d\uff0b\uff1d'
    # CJK angle, corner, white-corner, and lenticular brackets
    '\u3008\u3009\u300c\u300d\u300e\u300f\u3010\u3011'
    # fullwidth forms of \ | ' " < . > /
    # NOTE(review): the backslash is assumed to be the fullwidth character
    # too (not a regex escape for the following bar) -- confirm.
    '\uff3c\uff5c\uff07\uff02\uff1c\uff0e\uff1e\uff0f'
    ']'
)
# Base names of files that the fullwidth check skips.
at_fullwidth_file_white_list = ['index_of_abbreviations.rst']

def at_get_rst_list(path):
    """Return the absolute paths of all ``.rst`` files found under *path*.

    The tree is walked recursively with ``os.walk``. The result is sorted so
    that the order in which files are checked (and therefore which offending
    file is reported first) does not depend on directory enumeration order.
    A *path* that does not exist yields an empty list.
    """
    return sorted(
        os.path.abspath(os.path.join(root, name))
        for root, _, names in os.walk(path)
        for name in names
        if name.endswith('.rst')
    )

def at_data_is_allowed_chars(match_info, data):
to_check_idx = match_info.span()
Expand All @@ -57,33 +58,71 @@ def at_data_is_allowed_chars(match_info, data):

def at_check_doc_chars_validity(doc_name):
    """Check one document for bytes outside the allowed set.

    The file is read in binary mode, line by line. For each line the first
    match of ``at_not_allowed_chars_list`` is handed to
    ``at_data_is_allowed_chars``; if that helper rejects it, the location, the
    raw line, and the list of allowed characters are printed.

    NOTE(review): only the first match on a line is passed to the helper;
    whether the helper inspects the rest of the line is not visible here.

    Args:
        doc_name: Path of the document to scan.

    Returns:
        False as soon as one offending line is found, True otherwise.
    """
    with open(doc_name, 'rb') as fp:
        # Line numbers are reported 1-based.
        for lineno, data in enumerate(fp, start=1):
            match_info = at_not_allowed_chars_list.search(data)
            if match_info is None or at_data_is_allowed_chars(match_info, data):
                continue
            ESP_LOGE('Error: illegal character detected at {}:{}'.format(doc_name, lineno))
            print(f'Line {lineno}: {data.strip()}')
            print('\r\nBut allowed chars:')
            for x in at_allowed_chars_list:
                print(x, '---->', x.encode())
            return False
    return True

def at_check_en_format():
    """Run the character check over every ``.rst`` file of the en documents.

    The documents are looked up in the ``en`` sub-directory of the directory
    given as the single command-line argument, or of the current directory
    when no (or more than one) argument is given. Files whose base name is in
    ``at_file_white_list`` are skipped.

    Returns:
        False as soon as one file fails the check, True otherwise.
    """
    base_dir = sys.argv[1] if len(sys.argv) == 2 else '.'
    dst_path = os.path.join(os.path.abspath(base_dir), 'en')
    for current_file in at_get_rst_list(dst_path):
        # A membership test skips whitelisted files regardless of how many
        # entries the whitelist has; looping over the entries would check a
        # file once per entry and check nothing at all for an empty whitelist.
        if os.path.basename(current_file) in at_file_white_list:
            continue
        if not at_check_doc_chars_validity(current_file):
            return False
    return True

def at_check_fullwidth_chars(file_path):
    """Check one UTF-8 document for disallowed fullwidth characters.

    The file is read line by line; the first line containing a match of any
    of the ``at_fullwidth_*`` patterns is reported together with the matched
    characters.

    Args:
        file_path: Path of the document to scan.

    Returns:
        False as soon as one offending line is found, True otherwise.
    """
    alternatives = (
        at_fullwidth_space,
        at_fullwidth_letters,
        at_fullwidth_digits,
        at_fullwidth_punctuations,
    )
    pattern = re.compile('|'.join(alternatives))
    with open(file_path, 'r', encoding='utf-8') as doc:
        for line_number, line in enumerate(doc, start=1):
            matches = pattern.findall(line)
            if not matches:
                continue
            ESP_LOGE('Error: illegal fullwidth character detected at {}:{}'.format(file_path, line_number))
            print(f'Line {line_number}: {line.strip()}')
            print(f"Full-width characters found: {', '.join(matches)}")
            return False
    return True

def at_check_cn_format():
    """Run the fullwidth-character check over every ``.rst`` file of the cn documents.

    The documents are looked up in the ``zh_CN`` sub-directory of the
    directory given as the single command-line argument, or of the current
    directory when no (or more than one) argument is given. Files whose base
    name is in ``at_fullwidth_file_white_list`` are skipped.

    Returns:
        False as soon as one file fails the check, True otherwise.
    """
    base_dir = sys.argv[1] if len(sys.argv) == 2 else '.'
    dst_path = os.path.join(os.path.abspath(base_dir), 'zh_CN')
    for current_file in at_get_rst_list(dst_path):
        # Skip whitelisted files. The previous test on the whitelist's
        # emptiness was inverted: a non-empty whitelist was never consulted,
        # and an empty one caused every file to be skipped.
        if os.path.basename(current_file) in at_fullwidth_file_white_list:
            continue
        if not at_check_fullwidth_chars(current_file):
            return False
    return True

def _main():
    """Run the en check, then the cn check; exit with status -1 on the first failure."""
    for run_check in (at_check_en_format, at_check_cn_format):
        # The checks signal failure with False; any other result, including
        # None from falling off the end of a check, counts as a pass.
        if run_check() is False:
            sys.exit(-1)

    ESP_LOGI('Document characters check passed!')


if __name__ == '__main__':
    _main()

0 comments on commit b572138

Please sign in to comment.