Skip to content

Commit 610c1ae

Browse files
committed
Add string:jaro_similarity/2
Calculate word similarity, can for example be used to provide potential alternatives in error messages.
1 parent fbe53ad commit 610c1ae

File tree

3 files changed

+144
-5
lines changed

3 files changed

+144
-5
lines changed

lib/stdlib/doc/src/string.xml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,30 @@ true</pre>
244244
</desc>
245245
</func>
246246

247+
<func>
248+
<name name="jaro_similarity" arity="2" since="OTP 27.0"/>
249+
<fsummary>Calculate the Jaro similarity of two strings.</fsummary>
250+
<desc>
251+
<p>Returns a float between <c>+0.0</c> and <c>1.0</c> representing the
252+
<url href="https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">
253+
Jaro similarity</url> between the given strings. Strings with many letters
254+
in common relative to their lengths will score closer to <c>1.0</c>.
255+
</p>
256+
<p>The Jaro distance between two strings can be calculated with <c>JaroDistance = 1.0-JaroSimilarity</c>.
257+
</p>
258+
<p><em>Example:</em></p>
259+
<pre>
260+
1> <input>string:jaro_similarity("ditto", "ditto").</input>
261+
1.0
262+
2> <input>string:jaro_similarity("foo", "bar").</input>
263+
+0.0
264+
3> <input>string:jaro_similarity("michelle", "michael").</input>
265+
0.8690476190476191
266+
4> <input>string:jaro_similarity(&lt;&lt;"Édouard"/utf8>>, &lt;&lt;"Claude">>).</input>
267+
0.5317460317460317</pre>
268+
</desc>
269+
</func>
270+
247271
<func>
248272
<name name="length" arity="1" since="OTP 20.0"/>
249273
<fsummary>Calculate length of the string.</fsummary>

lib/stdlib/src/string.erl

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
prefix/2,
5959
split/2,split/3,replace/3,replace/4,
6060
find/2,find/3,
61+
jaro_similarity/2,
6162
next_codepoint/1, next_grapheme/1
6263
]).
6364

@@ -85,7 +86,7 @@
8586
-type grapheme_cluster() :: char() | [char()].
8687
-type direction() :: 'leading' | 'trailing'.
8788

88-
-dialyzer({no_improper_lists, [stack/2, length_b/3]}).
89+
-dialyzer({no_improper_lists, [stack/2, length_b/3, str_to_map/2]}).
8990
%%% BIFs internal (not documented) should not to be used outside of this module
9091
%%% May be removed
9192
-export([list_to_float/1, list_to_integer/1]).
@@ -563,6 +564,52 @@ find(String, SearchPattern, leading) ->
563564
find(String, SearchPattern, trailing) ->
564565
find_r(String, unicode:characters_to_list(SearchPattern), nomatch).
565566

567+
-spec jaro_similarity(String1, String2) -> Similarity when
568+
String1 :: unicode:chardata(),
569+
String2 :: unicode:chardata(),
570+
Similarity :: float(). %% Between +0.0 and 1.0
571+
jaro_similarity(A0, B0) ->
572+
{A, ALen} = str_to_gcl_and_length(A0),
573+
{B, BLen} = str_to_indexmap(B0),
574+
Dist = max(ALen, BLen) div 2,
575+
{AM, BM} = jaro_match(A, B, -Dist, Dist, [], []),
576+
if
577+
ALen =:= 0 andalso BLen =:= 0 ->
578+
1.0;
579+
ALen =:= 0 orelse BLen =:= 0 ->
580+
0.0;
581+
AM =:= [] ->
582+
0.0;
583+
true ->
584+
{M,T} = jaro_calc_mt(AM, BM, 0, 0),
585+
(M/ALen + M/BLen + (M-T/2)/M) / 3
586+
end.
587+
588+
jaro_match([A|As], B0, Min, Max, AM, BM) ->
589+
case jaro_detect(maps:get(A, B0, []), Min, Max) of
590+
false ->
591+
jaro_match(As, B0, Min+1, Max+1, AM, BM);
592+
{J, Remain} ->
593+
B = B0#{A => Remain},
594+
jaro_match(As, B, Min+1, Max+1, [A|AM], add_rsorted({J,A},BM))
595+
end;
596+
jaro_match(_A, _B, _Min, _Max, AM, BM) ->
597+
{AM, BM}.
598+
599+
jaro_detect([Idx|Rest], Min, Max) when Min < Idx, Idx < Max ->
600+
{Idx, Rest};
601+
jaro_detect([Idx|Rest], Min, Max) when Idx < Max ->
602+
jaro_detect(Rest, Min, Max);
603+
jaro_detect(_, _, _) ->
604+
false.
605+
606+
jaro_calc_mt([CharA|AM], [{_, CharA}|BM], M, T) ->
607+
jaro_calc_mt(AM, BM, M+1, T);
608+
jaro_calc_mt([_|AM], [_|BM], M, T) ->
609+
jaro_calc_mt(AM, BM, M+1, T+1);
610+
jaro_calc_mt([], [], M, T) ->
611+
{M, T}.
612+
566613
%% Fetch first grapheme cluster and return rest in tail
567614
-spec next_grapheme(String::unicode:chardata()) ->
568615
maybe_improper_list(grapheme_cluster(),unicode:chardata()) |
@@ -1795,6 +1842,37 @@ bin_search_str_2(Bin0, Start, Cont, First, SearchCPs) ->
17951842
end.
17961843

17971844

1845+
%% Returns GC list and length
1846+
str_to_gcl_and_length(S0) ->
1847+
gcl_and_length(unicode_util:gc(S0), [], 0).
1848+
1849+
gcl_and_length([C|Str], Acc, N) ->
1850+
gcl_and_length(unicode_util:gc(Str), [C|Acc], N+1);
1851+
gcl_and_length([], Acc, N) ->
1852+
{lists:reverse(Acc), N};
1853+
gcl_and_length({error, Err}, _, _) ->
1854+
error({badarg, Err}).
1855+
1856+
%% Returns GC map with index and length
1857+
str_to_indexmap(S) ->
1858+
[M|L] = str_to_map(unicode_util:gc(S), 0),
1859+
{M,L}.
1860+
1861+
str_to_map([], L) -> [#{}|L];
1862+
str_to_map([G | Gs], I) ->
1863+
[M|L] = str_to_map(unicode_util:gc(Gs), I+1),
1864+
[maps:put(G, [I | maps:get(G, M, [])], M)| L];
1865+
str_to_map({error,Error}, _) ->
1866+
error({badarg, Error}).
1867+
1868+
%% Add in decreasing order
1869+
add_rsorted(A, [H|_]=BM) when A > H ->
1870+
[A|BM];
1871+
add_rsorted(A, [H|BM]) ->
1872+
[H|add_rsorted(A,BM)];
1873+
add_rsorted(A, []) ->
1874+
[A].
1875+
17981876
%%---------------------------------------------------------------------------
17991877
%% OLD lists API kept for backwards compability
18001878
%%---------------------------------------------------------------------------

lib/stdlib/test/string_SUITE.erl

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,9 @@
3636
uppercase/1, lowercase/1, titlecase/1, casefold/1,
3737
to_integer/1,to_float/1,
3838
prefix/1, split/1, replace/1, find/1,
39-
lexemes/1, nth_lexeme/1, cd_gc/1, meas/1
39+
lexemes/1, nth_lexeme/1, cd_gc/1,
40+
jaro_similarity/1,
41+
meas/1
4042
]).
4143

4244
-export([len/1,old_equal/1,old_concat/1,chr_rchr/1,str_rstr/1]).
@@ -66,6 +68,7 @@ groups() ->
6668
to_integer, to_float,
6769
uppercase, lowercase, titlecase, casefold,
6870
prefix, find, split, replace, cd_gc,
71+
jaro_similarity,
6972
meas]},
7073
{list_string,
7174
[len, old_equal, old_concat, chr_rchr, str_rstr, span_cspan,
@@ -788,6 +791,36 @@ nth_lexeme(_) ->
788791
?TEST([<<"aae">>,778,"öeeåäö"], [2,"e"], "åäö"),
789792
ok.
790793

794+
jaro_similarity(_Config) ->
795+
?TEST("", [""], 1.0),
796+
?TEST("", [["", <<"">>]], 1.0),
797+
%% From https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance#Jaro_similarity
798+
?TEST("faremviel", ["farmville"], 0.8842592592592592),
799+
?TEST("michelle", ["michael"], 0.8690476190476191),
800+
?TEST("michelle", [<<"michael">>], 0.8690476190476191),
801+
?TEST(<<"Édouard"/utf8>>, ["Claude"], 0.5317460317460317),
802+
803+
804+
?TEST("farmville", ["farmville"], 1.0),
805+
?TEST("farmville", ["zxzxzx"], +0.0),
806+
807+
?TEST("Saturday", ["Sunday"], 0.71944444),
808+
?TEST("Sunday", ["Saturday"], 0.71944444),
809+
810+
%% Short strings (no translations counted)
811+
?TEST("ca", ["abc"], 0.0),
812+
?TEST("ca", ["cb"], ((1/2+1/2+1)/3)),
813+
?TEST("ca", ["cab"], ((2/2+2/3+1)/3)),
814+
?TEST("caa", ["cab"], ((2/3+2/3+1)/3)),
815+
%% With one translation
816+
?TEST("caabx", ["caba"], ((4/5+4/4+((4-2/2)/4))/3)),
817+
818+
InvalidUTF8 = <<192,192>>,
819+
{'EXIT', {badarg, _}} = ?TRY(string:jaro_similarity("foo", InvalidUTF8)),
820+
{'EXIT', {badarg, _}} = ?TRY(string:jaro_similarity("foo", <<$a, InvalidUTF8/binary, $z>>)),
821+
822+
ok.
823+
791824

792825
meas(Config) ->
793826
Parent = self(),
@@ -956,7 +989,7 @@ test_1(Line, Func, Str, Args, Exp) ->
956989
check_types(Line, Func, Args, Res),
957990
case res(Res, Exp) of
958991
true -> ok;
959-
{Res1,Exp1} when is_tuple(Exp1) ->
992+
{Res1,Exp1} when is_tuple(Exp1); is_float(Exp1) ->
960993
io:format("~p~n",[Args]),
961994
io:format("~p:~p: ~ts~w =>~n :~w:~w~n",
962995
[Func,Line, Str,Str,Res1,Exp1]),
@@ -999,6 +1032,8 @@ res({S1,S2}=S, {Exp1,Exp2}=E) -> %% For take
9991032
{true, true} -> true;
10001033
_ -> {S, E}
10011034
end;
1035+
res(Float, Exp) when is_float(Exp) ->
1036+
abs(Float - Exp) < 0.0000001 orelse {Float, Exp};
10021037
res(Int, Exp) ->
10031038
Int == Exp orelse {Int, Exp}.
10041039

@@ -1007,8 +1042,10 @@ check_types(_Line, _Func, _Str, Res)
10071042
when is_integer(Res); is_boolean(Res); Res =:= nomatch ->
10081043
%% length or equal
10091044
ok;
1010-
check_types(Line, Func, [S1,S2], Res)
1011-
when Func =:= concat ->
1045+
check_types(_Line, jaro_similarity, _Str, Res)
1046+
when is_float(Res) ->
1047+
ok;
1048+
check_types(Line, concat = Func, [S1,S2], Res) ->
10121049
case check_types_1(type(S1),type(S2)) of
10131050
ok ->
10141051
case check_types_1(type(S1),type(Res)) of

0 commit comments

Comments
 (0)