[#179] Add documentation to core.model_metrics and refactor a bit
riley-harper committed Dec 13, 2024
1 parent bd934f5 commit b2cf14c
Showing 2 changed files with 89 additions and 33 deletions.
100 changes: 67 additions & 33 deletions hlink/linking/core/model_metrics.py
@@ -2,10 +2,32 @@
 # For copyright and licensing information, see the NOTICE and LICENSE files
 # in this project's top-level directory, and also on-line at:
 # https://github.com/ipums/hlink
+"""
+Metrics for evaluating the performance of a machine learning model. These
+metrics operate on the "confusion matrix", which contains the four counts of
+true positives, true negatives, false positives, and false negatives.
+
+Throughout this module, we use the abbreviations true_pos, true_neg, false_pos,
+and false_neg for these confusion matrix values.
+
+All of these functions return math.nan in cases where they are not well-defined,
+such as cases with division by zero.
+"""
 
 import math
 
 
 def f_measure(true_pos: int, false_pos: int, false_neg: int) -> float:
+    """
+    Compute the F-measure, which is defined as the harmonic mean of precision
+    and recall:
+
+        2 * precision * recall / (precision + recall)
+
+    Using the definitions of precision and recall, we can write this in terms of
+    the confusion matrix entries as
+
+        2 * true_pos / (2 * true_pos + false_pos + false_neg)
+    """
     denominator = 2 * true_pos + false_pos + false_neg
     if denominator == 0:
         return math.nan
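
As a quick numerical check of the harmonic-mean identity in the new f_measure docstring (the counts below are illustrative, not taken from the commit):

    tp, fp, fn = 8, 2, 4                     # illustrative confusion-matrix counts
    p = tp / (tp + fp)                       # precision = 0.8
    r = tp / (tp + fn)                       # recall = 2/3
    harmonic_mean = 2 * p * r / (p + r)      # about 0.7273
    direct = 2 * tp / (2 * tp + fp + fn)     # 16 / 22, also about 0.7273
    assert abs(harmonic_mean - direct) < 1e-12

The identity assumes true_pos > 0; when all three counts are 0 the denominator is 0 and f_measure returns math.nan, per the module docstring.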
@@ -14,44 +36,56 @@ def f_measure(true_pos: int, false_pos: int, false_neg: int) -> float:

 def mcc(true_pos: int, true_neg: int, false_pos: int, false_neg: int) -> float:
     """
-    Given the counts of true positives (true_pos), true negatives (true_neg), false
-    positives (false_pos), and false negatives (false_neg) for a model run, compute the
-    Matthews Correlation Coefficient (MCC).
-    """
-    if (
-        math.sqrt(
-            (true_pos + false_pos)
-            * (true_pos + false_neg)
-            * (true_neg + false_pos)
-            * (true_neg + false_neg)
-        )
-    ) != 0:
-        mcc = ((true_pos * true_neg) - (false_pos * false_neg)) / (
-            math.sqrt(
-                (true_pos + false_pos)
-                * (true_pos + false_neg)
-                * (true_neg + false_pos)
-                * (true_neg + false_neg)
-            )
-        )
-    else:
-        mcc = math.nan
-    return mcc
+    Compute the Matthews Correlation Coefficient (MCC). This can be written as
+    numerator / denominator, where
+
+        numerator = true_pos * true_neg - false_pos * false_neg
+
+    and
+
+        denominator = sqrt(
+            (true_pos + false_pos) *
+            (true_pos + false_neg) *
+            (true_neg + false_pos) *
+            (true_neg + false_neg)
+        )
+    """
+    denominator = math.sqrt(
+        (true_pos + false_pos)
+        * (true_pos + false_neg)
+        * (true_neg + false_pos)
+        * (true_neg + false_neg)
+    )
+    if denominator == 0:
+        return math.nan
+
+    numerator = true_pos * true_neg - false_pos * false_neg
+    return numerator / denominator
 
 
 def precision(true_pos: int, false_pos: int) -> float:
-    if (true_pos + false_pos) == 0:
-        precision = math.nan
-    else:
-        precision = true_pos / (true_pos + false_pos)
+    """
+    Compute the precision, also known as the positive predictive value (PPV).
+    This can be written in terms of the entries of the confusion matrix as
+
+        true_pos / (true_pos + false_pos)
+    """
+    denominator = true_pos + false_pos
+    if denominator == 0:
+        return math.nan
 
-    return precision
+    return true_pos / denominator
 
 
 def recall(true_pos: int, false_neg: int) -> float:
-    if (true_pos + false_neg) == 0:
-        recall = math.nan
-    else:
-        recall = true_pos / (true_pos + false_neg)
+    """
+    Compute the recall, which can be written in terms of the entries of the
+    confusion matrix as
+
+        true_pos / (true_pos + false_neg)
+    """
+    denominator = true_pos + false_neg
+    if denominator == 0:
+        return math.nan
 
-    return recall
+    return true_pos / denominator
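
A few spot checks of the refactored functions, as a standalone sketch (the import path follows the file's location; the specific counts are illustrative):

    import math

    from hlink.linking.core.model_metrics import f_measure, mcc, precision, recall

    # A perfect classifier pins MCC at 1, and an always-wrong one at -1.
    assert mcc(true_pos=1, true_neg=1, false_pos=0, false_neg=0) == 1.0
    assert mcc(true_pos=0, true_neg=0, false_pos=1, false_neg=1) == -1.0

    # With no predicted positives, precision's denominator is 0, so it is NaN.
    assert math.isnan(precision(true_pos=0, false_pos=0))

    # f_measure matches 2 * true_pos / (2 * true_pos + false_pos + false_neg).
    assert abs(f_measure(8, 2, 4) - 16 / 22) < 1e-12
    assert recall(true_pos=8, false_neg=4) == 8 / 12

One side effect of the early-return style: each function now computes its denominator exactly once, where the old mcc evaluated the math.sqrt expression twice.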
22 changes: 22 additions & 0 deletions hlink/tests/core/model_metrics_test.py
@@ -72,6 +72,28 @@ def test_mcc_example() -> None:
     assert abs(mcc_score - 0.8111208) < 0.0001, "expected MCC to be near 0.8111208"
 
 
+@given(
+    true_pos=NonNegativeInt,
+    true_neg=NonNegativeInt,
+    false_pos=NonNegativeInt,
+    false_neg=NonNegativeInt,
+)
+def test_mcc_is_between_negative_1_and_positive_1(
+    true_pos: int, true_neg: int, false_pos: int, false_neg: int
+) -> None:
+    """
+    Under "normal circumstances", where the denominator of the Matthews Correlation
+    Coefficient isn't 0, its range is the interval [-1, 1].
+    """
+    assume(true_pos + false_pos > 0)
+    assume(true_pos + false_neg > 0)
+    assume(true_neg + false_pos > 0)
+    assume(true_neg + false_neg > 0)
+
+    mcc_score = mcc(true_pos, true_neg, false_pos, false_neg)
+    assert -1.0 <= mcc_score <= 1.0
+
+
 @pytest.mark.parametrize(
     "true_pos,true_neg,false_pos,false_neg",
     [(0, 0, 0, 0), (0, 1, 0, 1), (0, 1, 1, 0), (1, 0, 0, 1), (1, 0, 1, 0)],
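The property test leans on Hypothesis's given/assume. The NonNegativeInt strategy isn't shown in this diff; presumably it is defined near the top of model_metrics_test.py as a strategy over integers >= 0, along the lines of this sketch (a hypothetical reconstruction; the real definition may differ):

    from hypothesis import strategies as st

    # Hypothetical: the actual definition lives elsewhere in the test module.
    NonNegativeInt = st.integers(min_value=0)

The four assume() calls discard any generated counts that would zero the MCC denominator, so Hypothesis only exercises inputs where mcc is well-defined and the [-1, 1] bound applies.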
