Improve word splitting for string normalization (#113)

mworrell · Copilot · web-flow · commit d383b8e0868c · 2025-11-13T15:40:54.000+01:00
* Also handle nbsp as a word separator

* Improve word splitting for string normalization

* Update src/z_string_normalize.erl

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
diff --git a/src/z_string_normalize.erl b/src/z_string_normalize.erl
@@ -12,8 +12,43 @@
 
 -export([ normalize/1 ]).
 
+%% File with all the word mappings in the priv folder.
 -define(WORD_MAPPING_FILE, "normalize-words-mapping.csv").
 
+%% Separators in a lowercased string.
+-define(is_sep(C),
+        C < $0
+        orelse (C > $9 andalso C < $a)
+        orelse (C > $z andalso C < 127)
+        orelse C =:= 160    % non breaking space
+        orelse C =:= 8220   % left double quote
+        orelse C =:= 8221   % right double quote
+        orelse C =:= 8216   % left single quote
+        orelse C =:= 8217   % right single quote
+        orelse C =:= 8230   % ellipsis
+        orelse C =:= 8232   % line separator
+        orelse C =:= 8233   % paragraph separator
+        orelse C =:= 8212   % mdash
+        orelse C =:= 8211   % ndash
+    ).
+
+-define(is_map_space(C),
+               C =:= 65279  % byte order mark
+        orelse C =:= 8232   % line separator
+        orelse C =:= 8233   % paragraph separator
+        orelse C =:= 8203   % zero width space
+        orelse C =:= 8204   % zero width non-joiner
+        orelse C =:= 8239   % narrow no-break space
+    ).
+
+-define(is_word_ignore_char(C),
+               C =:= 8205   % zero width joiner
+        orelse C =:= 8288   % word joiner
+        orelse C =:= 173    % soft hyphen
+    ).
+
+
+
 
 %% @doc Transliterate an unicode string to an ascii string with lowercase characters.
 %% Tries to transliterate some characters to a..z
@@ -35,26 +70,18 @@ normalize({trans, [{_, First} | _] = Tr}) ->
     V = proplists:get_value(en, Tr, First),
     normalize(V).
 
-%% Separators in a lowercased string
--define(is_sep(C),
-        C < $0
-        orelse (C > $9 andalso C < $a)
-        orelse (C > $z andalso C < 127)
-        orelse C =:= 8023   % non breaking zero width space
-        orelse C =:= 8212   % mdash
-        orelse C =:= 8211   % ndash
-    ).
-
 %% Normalize specific words using custom mappings from CSV file.
-%% This allows language-specific transliterations that differ from 
+%% This allows language-specific transliterations that differ from
 %% standard romanization rules.
 normalize_words(B) when is_binary(B) ->
     Ws = normalize_words_word(B, <<>>, []),
     normalize(erlang:iolist_to_binary(Ws), <<>>).
 
 normalize_words_word(<<>>, W, Acc) ->
     lists:reverse([map_word(W)|Acc]);
-normalize_words_word(<<C/utf8, T/binary>>, W, Acc) when ?is_sep(C) ->
+normalize_words_word(<<C/utf8, T/binary>>, W, Acc) when ?is_word_ignore_char(C) ->
+    normalize_words_word(T, W, Acc);
+normalize_words_word(<<C/utf8, T/binary>>, W, Acc) when ?is_sep(C) orelse ?is_map_space(C) ->
     normalize_words_sep(T, <<C/utf8>>, [map_word(W)|Acc]);
 normalize_words_word(<<C/utf8, T/binary>>, W, Acc) ->
     normalize_words_word(T, <<W/binary, C/utf8>>, Acc);
@@ -64,7 +91,9 @@ normalize_words_word(<<_Byte, T/binary>>, W, Acc) ->
 
 normalize_words_sep(<<>>, W, Acc) ->
     lists:reverse([W|Acc]);
-normalize_words_sep(<<C/utf8, T/binary>>, W, Acc) when not (?is_sep(C)) ->
+normalize_words_sep(<<C/utf8, T/binary>>, W, Acc) when ?is_word_ignore_char(C) ->
+    normalize_words_sep(T, W, Acc);
+normalize_words_sep(<<C/utf8, T/binary>>, W, Acc) when not (?is_sep(C) orelse ?is_map_space(C)) ->
     normalize_words_word(T, <<C/utf8>>, [W|Acc]);
 normalize_words_sep(<<C/utf8, T/binary>>, W, Acc) ->
     normalize_words_sep(T, <<W/binary, C/utf8>>, Acc);
@@ -289,11 +318,11 @@ normalize(<<C/utf8,T/binary>>, Acc) when C >= 32, C =< 126 ->
 normalize(<<C, T/binary>>, Acc) when C =:= $\n; C =:= $\t ->
     % Keep newlines and tabs
     normalize(T, <<Acc/binary, " ">>);
-normalize(<<C/utf8,T/binary>>, Acc) when C < 32 ->
-    % Replace control characters with spaces
+normalize(<<C/utf8,T/binary>>, Acc) when ?is_map_space(C) ->
+    % Replace space-like characters (BOM, zero width space, line/paragraph separators et al) with spaces
     normalize(T, <<Acc/binary, " ">>);
-normalize(<<C/utf8,T/binary>>, Acc) when C =:= 8023 ->
-    % Zero width space
+normalize(<<C/utf8,T/binary>>, Acc) when ?is_word_ignore_char(C) ->
+    % Drop characters ignored inside words: zero width joiner, word joiner, soft hyphen
     normalize(T, Acc);
 normalize(<<C/utf8,T/binary>>, Acc) ->
     % Try to remove any accents.
diff --git a/test/z_string_test.erl b/test/z_string_test.erl
@@ -114,9 +114,12 @@ truncatechars_test() ->
 truncatewords_test() ->
     ?assertEqual(<<"foo bar x">>, z_string:truncatewords(<<"foo bar bla">>, 2, <<"x">>)).
 
-normalize_map_words_test() ->
+normalize_map_word_test() ->
     ?assertEqual(<<"odesa">>, z_string:normalize(<<"Одесса"/utf8>>)).
 
+normalize_map_words_test() ->
+    ?assertEqual(<<"the city odesa is ukrainian">>, z_string:normalize(<<"the city Одесса is Ukrainian"/utf8>>)).
+
 normalize_test() ->
     % binary()
     ?assertEqual(<<"a"/utf8>>, z_string:normalize(<<"ä"/utf8>>)),