Skip to content

Commit d383b8e

Browse files
mworrell and Copilot authored
Improve word splitting for string normalization (#113)
* Also handle nbsp as word separators
* Improve word splitting for string normalization
* Update src/z_string_normalize.erl

Co-authored-by: Copilot <[email protected]>

---------

Co-authored-by: Copilot <[email protected]>
1 parent 7a8142e commit d383b8e

File tree

2 files changed

+50
-18
lines changed

2 files changed

+50
-18
lines changed

src/z_string_normalize.erl

Lines changed: 46 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,43 @@
1212

1313
-export([ normalize/1 ]).
1414

15+
%% File with all the word mappings in the priv folder.
1516
-define(WORD_MAPPING_FILE, "normalize-words-mapping.csv").
1617

18+
%% Separators in a lowercased string.
19+
-define(is_sep(C),
20+
C < $0
21+
orelse (C > $9 andalso C < $a)
22+
orelse (C > $z andalso C < 127)
23+
orelse C =:= 160 % non breaking space
24+
orelse C =:= 8220 % left double quote
25+
orelse C =:= 8221 % right double quote
26+
orelse C =:= 8216 % left single quote
27+
orelse C =:= 8217 % right single quote
28+
orelse C =:= 8230 % ellipsis
29+
orelse C =:= 8232 % line separator
30+
orelse C =:= 8233 % paragraph separator
31+
orelse C =:= 8212 % mdash
32+
orelse C =:= 8211 % ndash
33+
).
34+
35+
-define(is_map_space(C),
36+
C =:= 65279 % byte order mark
37+
orelse C =:= 8232 % line separator
38+
orelse C =:= 8233 % paragraph separator
39+
orelse C =:= 8203 % zero width space
40+
orelse C =:= 8204 % zero width non-joiner
41+
orelse C =:= 8239 % narrow no-break space
42+
).
43+
44+
-define(is_word_ignore_char(C),
45+
C =:= 8205 % zero width joiner
46+
orelse C =:= 8288 % word joiner
47+
orelse C =:= 173 % soft hyphen
48+
).
49+
50+
51+
1752

1853
%% @doc Transliterate an unicode string to an ascii string with lowercase characters.
1954
%% Tries to transliterate some characters to a..z
@@ -35,26 +70,18 @@ normalize({trans, [{_, First} | _] = Tr}) ->
3570
V = proplists:get_value(en, Tr, First),
3671
normalize(V).
3772

38-
%% Separators in a lowercased string
39-
-define(is_sep(C),
40-
C < $0
41-
orelse (C > $9 andalso C < $a)
42-
orelse (C > $z andalso C < 127)
43-
orelse C =:= 8023 % non breaking zero width space
44-
orelse C =:= 8212 % mdash
45-
orelse C =:= 8211 % ndash
46-
).
47-
4873
%% Normalize specific words using custom mappings from CSV file.
49-
%% This allows language-specific transliterations that differ from
74+
%% This allows language-specific transliterations that differ from
5075
%% standard romanization rules.
5176
normalize_words(B) when is_binary(B) ->
5277
Ws = normalize_words_word(B, <<>>, []),
5378
normalize(erlang:iolist_to_binary(Ws), <<>>).
5479

5580
normalize_words_word(<<>>, W, Acc) ->
5681
lists:reverse([map_word(W)|Acc]);
57-
normalize_words_word(<<C/utf8, T/binary>>, W, Acc) when ?is_sep(C) ->
82+
normalize_words_word(<<C/utf8, T/binary>>, W, Acc) when ?is_word_ignore_char(C) ->
83+
normalize_words_word(T, W, Acc);
84+
normalize_words_word(<<C/utf8, T/binary>>, W, Acc) when ?is_sep(C) orelse ?is_map_space(C) ->
5885
normalize_words_sep(T, <<C/utf8>>, [map_word(W)|Acc]);
5986
normalize_words_word(<<C/utf8, T/binary>>, W, Acc) ->
6087
normalize_words_word(T, <<W/binary, C/utf8>>, Acc);
@@ -64,7 +91,9 @@ normalize_words_word(<<_Byte, T/binary>>, W, Acc) ->
6491

6592
normalize_words_sep(<<>>, W, Acc) ->
6693
lists:reverse([W|Acc]);
67-
normalize_words_sep(<<C/utf8, T/binary>>, W, Acc) when not (?is_sep(C)) ->
94+
normalize_words_sep(<<C/utf8, T/binary>>, W, Acc) when ?is_word_ignore_char(C) ->
95+
normalize_words_sep(T, W, Acc);
96+
normalize_words_sep(<<C/utf8, T/binary>>, W, Acc) when not (?is_sep(C) orelse ?is_map_space(C)) ->
6897
normalize_words_word(T, <<C/utf8>>, [W|Acc]);
6998
normalize_words_sep(<<C/utf8, T/binary>>, W, Acc) ->
7099
normalize_words_sep(T, <<W/binary, C/utf8>>, Acc);
@@ -289,11 +318,11 @@ normalize(<<C/utf8,T/binary>>, Acc) when C >= 32, C =< 126 ->
289318
normalize(<<C, T/binary>>, Acc) when C =:= $\n; C =:= $\t ->
290319
% Map newlines and tabs to a single space
291320
normalize(T, <<Acc/binary, " ">>);
292-
normalize(<<C/utf8,T/binary>>, Acc) when C < 32 ->
293-
% Replace control characters with spaces
321+
normalize(<<C/utf8,T/binary>>, Acc) when ?is_map_space(C) ->
322+
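% Replace space-like characters (byte order mark, line/paragraph separators,
% zero width space, zero width non-joiner, narrow no-break space) with spaces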
294323
normalize(T, <<Acc/binary, " ">>);
295-
normalize(<<C/utf8,T/binary>>, Acc) when C =:= 8023 ->
296-
% Zero width space
324+
normalize(<<C/utf8,T/binary>>, Acc) when ?is_word_ignore_char(C) ->
325+
% Drop characters that are ignored inside words: zero width joiner, word joiner, soft hyphen
297326
normalize(T, Acc);
298327
normalize(<<C/utf8,T/binary>>, Acc) ->
299328
% Try to remove any accents.

test/z_string_test.erl

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,9 +114,12 @@ truncatechars_test() ->
114114
truncatewords_test() ->
115115
?assertEqual(<<"foo bar x">>, z_string:truncatewords(<<"foo bar bla">>, 2, <<"x">>)).
116116

117-
normalize_map_words_test() ->
117+
normalize_map_word_test() ->
118118
?assertEqual(<<"odesa">>, z_string:normalize(<<"Одесса"/utf8>>)).
119119

120+
normalize_map_words_test() ->
121+
?assertEqual(<<"the city odesa is ukrainian">>, z_string:normalize(<<"the city Одесса is Ukrainian"/utf8>>)).
122+
120123
normalize_test() ->
121124
% binary()
122125
?assertEqual(<<"a"/utf8>>, z_string:normalize(<<"ä"/utf8>>)),

0 commit comments

Comments
 (0)