1212
1313-export ([ normalize /1 ]).
1414
15+ % % Name of the CSV file in the priv folder that holds all the word mappings.
1516-define (WORD_MAPPING_FILE , " normalize-words-mapping.csv" ).
1617
18+ % % Separators in a lowercased string. Guard-safe test on a single code point C.
% % The range between $9 and $a also contains the uppercase letters A-Z, so in a
% % string that is not lowercased the capitals would be treated as separators.
19+ -define (is_sep (C ),
20+ C < $0 % control characters, space and the ASCII punctuation before the digits
21+ orelse (C > $9 andalso C < $a ) % ASCII 58..96: punctuation and A-Z
22+ orelse (C > $z andalso C < 127 ) % ASCII 123..126; 127 itself is not matched
23+ orelse C =:= 160 % non breaking space
24+ orelse C =:= 8220 % left double quote
25+ orelse C =:= 8221 % right double quote
26+ orelse C =:= 8216 % left single quote
27+ orelse C =:= 8217 % right single quote
28+ orelse C =:= 8230 % ellipsis
29+ orelse C =:= 8232 % line separator
30+ orelse C =:= 8233 % paragraph separator
31+ orelse C =:= 8212 % mdash
32+ orelse C =:= 8211 % ndash
33+ ).
34+
% % Space-like and zero-width code points. Guard-safe test on a single code point C.
% % normalize_words_word/3 ends the current word on them, exactly as it does for
% % ?is_sep, and normalize/2 replaces each of them with a space.
35+ -define (is_map_space (C ),
36+ C =:= 65279 % byte order mark
37+ orelse C =:= 8232 % line separator
38+ orelse C =:= 8233 % paragraph separator
39+ orelse C =:= 8203 % zero width space
40+ orelse C =:= 8204 % zero width non-joiner
41+ orelse C =:= 8239 % narrow no-break space
42+ ).
43+
% % Code points that are dropped from the text. Guard-safe test on a single code point C.
% % normalize_words_word/3, normalize_words_sep/3 and normalize/2 all skip a matching
% % character without adding anything to their accumulator.
44+ -define (is_word_ignore_char (C ),
45+ C =:= 8205 % zero width joiner
46+ orelse C =:= 8288 % word joiner
47+ orelse C =:= 173 % soft hyphen
48+ ).
49+
50+
51+
1752
1853% % @doc Transliterate a Unicode string to an ASCII string with lowercase characters.
1954% % Tries to transliterate some characters to a..z
@@ -35,26 +70,18 @@ normalize({trans, [{_, First} | _] = Tr}) ->
3570 V = proplists :get_value (en , Tr , First ),
3671 normalize (V ).
3772
38- % % Separators in a lowercased string
39- -define (is_sep (C ),
40- C < $0
41- orelse (C > $9 andalso C < $a )
42- orelse (C > $z andalso C < 127 )
43- orelse C =:= 8023 % non breaking zero width space
44- orelse C =:= 8212 % mdash
45- orelse C =:= 8211 % ndash
46- ).
47-
4873% % Normalize specific words using custom mappings from CSV file.
49- % % This allows language-specific transliterations that differ from
74+ % % This allows language-specific transliterations that differ from
5075% % standard romanization rules.
5176normalize_words (B ) when is_binary (B ) ->
    % Walk the binary word by word: every word is passed through map_word/1 and
    % the separator runs between the words are collected alongside the results.
5277 Ws = normalize_words_word (B , <<>>, []),
    % Join the collected pieces into one binary and run normalize/2 on it,
    % starting with an empty accumulator.
5378 normalize (erlang :iolist_to_binary (Ws ), <<>>).
5479
5580normalize_words_word (<<>>, W , Acc ) ->
5681 lists :reverse ([map_word (W )|Acc ]);
57- normalize_words_word (<<C /utf8 , T /binary >>, W , Acc ) when ? is_sep (C ) ->
82+ normalize_words_word (<<C /utf8 , T /binary >>, W , Acc ) when ? is_word_ignore_char (C ) ->
83+ normalize_words_word (T , W , Acc );
84+ normalize_words_word (<<C /utf8 , T /binary >>, W , Acc ) when ? is_sep (C ) orelse ? is_map_space (C ) ->
5885 normalize_words_sep (T , <<C /utf8 >>, [map_word (W )|Acc ]);
5986normalize_words_word (<<C /utf8 , T /binary >>, W , Acc ) ->
6087 normalize_words_word (T , <<W /binary , C /utf8 >>, Acc );
@@ -64,7 +91,9 @@ normalize_words_word(<<_Byte, T/binary>>, W, Acc) ->
6491
6592normalize_words_sep (<<>>, W , Acc ) ->
6693 lists :reverse ([W |Acc ]);
67- normalize_words_sep (<<C /utf8 , T /binary >>, W , Acc ) when not (? is_sep (C )) ->
94+ normalize_words_sep (<<C /utf8 , T /binary >>, W , Acc ) when ? is_word_ignore_char (C ) ->
95+ normalize_words_sep (T , W , Acc );
96+ normalize_words_sep (<<C /utf8 , T /binary >>, W , Acc ) when not (? is_sep (C ) orelse ? is_map_space (C )) ->
6897 normalize_words_word (T , <<C /utf8 >>, [W |Acc ]);
6998normalize_words_sep (<<C /utf8 , T /binary >>, W , Acc ) ->
7099 normalize_words_sep (T , <<W /binary , C /utf8 >>, Acc );
@@ -289,11 +318,11 @@ normalize(<<C/utf8,T/binary>>, Acc) when C >= 32, C =< 126 ->
289318normalize (<<C , T /binary >>, Acc ) when C =:= $\n ; C =:= $\t ->
290319 % Newlines and tabs are not dropped: each one is replaced by a space
291320 normalize (T , <<Acc /binary , " " >>);
292- normalize (<<C /utf8 ,T /binary >>, Acc ) when C < 32 ->
293- % Replace control characters with spaces
321+ normalize (<<C /utf8 ,T /binary >>, Acc ) when ? is_map_space ( C ) ->
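322+ % Replace space-like characters (see is_map_space) with spaces. NOTE(review): this guard no longer matches control characters (C < 32) - confirm they are still handled as intended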
294323 normalize (T , <<Acc /binary , " " >>);
295- normalize (<<C /utf8 ,T /binary >>, Acc ) when C =:= 8023 ->
296- % Zero width space
324+ normalize (<<C /utf8 ,T /binary >>, Acc ) when ? is_word_ignore_char ( C ) ->
325+ % Drop zero width joiner, word joiner and soft hyphen (see is_word_ignore_char)
297326 normalize (T , Acc );
298327normalize (<<C /utf8 ,T /binary >>, Acc ) ->
299328 % Try to remove any accents.
0 commit comments