[#179] Add documentation to core.model_metrics and refactor a bit
riley-harper committed Dec 13, 2024
1 parent bd934f5 commit b2cf14c
Showing 2 changed files with 89 additions and 33 deletions.
100 changes: 67 additions & 33 deletions hlink/linking/core/model_metrics.py
@@ -2,10 +2,32 @@
 # For copyright and licensing information, see the NOTICE and LICENSE files
 # in this project's top-level directory, and also on-line at:
 # https://github.com/ipums/hlink
+"""
+Metrics for evaluating the performance of a machine learning model. These
+metrics operate on the "confusion matrix", which contains the four counts of
+true positives, true negatives, false positives, and false negatives.
+
+Throughout this module, we use the abbreviations true_pos, true_neg, false_pos,
+and false_neg for these confusion matrix values.
+
+All of these functions return math.nan in cases where they are not well-defined,
+such as cases with division by zero.
+"""
 
 import math
 
 
 def f_measure(true_pos: int, false_pos: int, false_neg: int) -> float:
+    """
+    Compute the F-measure, which is defined as the harmonic mean of precision
+    and recall:
+
+        2 * precision * recall / (precision + recall)
+
+    Using the definitions of precision and recall, we can write this in terms of
+    the confusion matrix entries as
+
+        2 * true_pos / (2 * true_pos + false_pos + false_neg)
+    """
     denominator = 2 * true_pos + false_pos + false_neg
     if denominator == 0:
         return math.nan
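
As a quick numerical check of the harmonic-mean identity in the new f_measure docstring (the counts below are illustrative, not taken from the commit):

    tp, fp, fn = 8, 2, 4                     # illustrative confusion-matrix counts
    p = tp / (tp + fp)                       # precision = 0.8
    r = tp / (tp + fn)                       # recall = 2/3
    harmonic_mean = 2 * p * r / (p + r)      # about 0.7273
    direct = 2 * tp / (2 * tp + fp + fn)     # 16 / 22, also about 0.7273
    assert abs(harmonic_mean - direct) < 1e-12

The identity assumes true_pos > 0; when all three counts are 0 the denominator is 0 and f_measure returns math.nan, per the module docstring.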
@@ -14,44 +36,56 @@ def f_measure(true_pos: int, false_pos: int, false_neg: int) -> float:

 def mcc(true_pos: int, true_neg: int, false_pos: int, false_neg: int) -> float:
     """
-    Given the counts of true positives (true_pos), true negatives (true_neg), false
-    positives (false_pos), and false negatives (false_neg) for a model run, compute the
-    Matthews Correlation Coefficient (MCC).
-    """
-    if (
-        math.sqrt(
-            (true_pos + false_pos)
-            * (true_pos + false_neg)
-            * (true_neg + false_pos)
-            * (true_neg + false_neg)
-        )
-    ) != 0:
-        mcc = ((true_pos * true_neg) - (false_pos * false_neg)) / (
-            math.sqrt(
-                (true_pos + false_pos)
-                * (true_pos + false_neg)
-                * (true_neg + false_pos)
-                * (true_neg + false_neg)
-            )
-        )
-    else:
-        mcc = math.nan
-    return mcc
+    Compute the Matthews Correlation Coefficient (MCC). This can be written as
+    numerator / denominator, where
+
+        numerator = true_pos * true_neg - false_pos * false_neg
+
+    and
+
+        denominator = sqrt(
+            (true_pos + false_pos) *
+            (true_pos + false_neg) *
+            (true_neg + false_pos) *
+            (true_neg + false_neg)
+        )
+    """
+    denominator = math.sqrt(
+        (true_pos + false_pos)
+        * (true_pos + false_neg)
+        * (true_neg + false_pos)
+        * (true_neg + false_neg)
+    )
+    if denominator == 0:
+        return math.nan
+
+    numerator = true_pos * true_neg - false_pos * false_neg
+    return numerator / denominator
 
 
 def precision(true_pos: int, false_pos: int) -> float:
-    if (true_pos + false_pos) == 0:
-        precision = math.nan
-    else:
-        precision = true_pos / (true_pos + false_pos)
+    """
+    Compute the precision, also known as the positive predictive value (PPV).
+    This can be written in terms of the entries of the confusion matrix as
+
+        true_pos / (true_pos + false_pos)
+    """
+    denominator = true_pos + false_pos
+    if denominator == 0:
+        return math.nan
 
-    return precision
+    return true_pos / denominator
 
 
 def recall(true_pos: int, false_neg: int) -> float:
-    if (true_pos + false_neg) == 0:
-        recall = math.nan
-    else:
-        recall = true_pos / (true_pos + false_neg)
+    """
+    Compute the recall, which can be written in terms of the entries of the
+    confusion matrix as
+
+        true_pos / (true_pos + false_neg)
+    """
+    denominator = true_pos + false_neg
+    if denominator == 0:
+        return math.nan
 
-    return recall
+    return true_pos / denominator
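
A few spot checks of the refactored functions, as a standalone sketch (the import path follows the file's location; the specific counts are illustrative):

    import math

    from hlink.linking.core.model_metrics import f_measure, mcc, precision, recall

    # A perfect classifier pins MCC at 1, and an always-wrong one at -1.
    assert mcc(true_pos=1, true_neg=1, false_pos=0, false_neg=0) == 1.0
    assert mcc(true_pos=0, true_neg=0, false_pos=1, false_neg=1) == -1.0

    # With no predicted positives, precision's denominator is 0, so it is NaN.
    assert math.isnan(precision(true_pos=0, false_pos=0))

    # f_measure matches 2 * true_pos / (2 * true_pos + false_pos + false_neg).
    assert abs(f_measure(8, 2, 4) - 16 / 22) < 1e-12
    assert recall(true_pos=8, false_neg=4) == 8 / 12

One side effect of the early-return style: each function now computes its denominator exactly once, where the old mcc evaluated the math.sqrt expression twice.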
22 changes: 22 additions & 0 deletions hlink/tests/core/model_metrics_test.py
@@ -72,6 +72,28 @@ def test_mcc_example() -> None:
     assert abs(mcc_score - 0.8111208) < 0.0001, "expected MCC to be near 0.8111208"
 
 
+@given(
+    true_pos=NonNegativeInt,
+    true_neg=NonNegativeInt,
+    false_pos=NonNegativeInt,
+    false_neg=NonNegativeInt,
+)
+def test_mcc_is_between_negative_1_and_positive_1(
+    true_pos: int, true_neg: int, false_pos: int, false_neg: int
+) -> None:
+    """
+    Under "normal circumstances", where the denominator of the Matthews Correlation
+    Coefficient isn't 0, its range is the interval [-1, 1].
+    """
+    assume(true_pos + false_pos > 0)
+    assume(true_pos + false_neg > 0)
+    assume(true_neg + false_pos > 0)
+    assume(true_neg + false_neg > 0)
+
+    mcc_score = mcc(true_pos, true_neg, false_pos, false_neg)
+    assert -1.0 <= mcc_score <= 1.0
+
+
 @pytest.mark.parametrize(
     "true_pos,true_neg,false_pos,false_neg",
     [(0, 0, 0, 0), (0, 1, 0, 1), (0, 1, 1, 0), (1, 0, 0, 1), (1, 0, 1, 0)],
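The property test leans on Hypothesis's given/assume. The NonNegativeInt strategy isn't shown in this diff; presumably it is defined near the top of model_metrics_test.py as a strategy over integers >= 0, along the lines of this sketch (a hypothetical reconstruction; the real definition may differ):

    from hypothesis import strategies as st

    # Hypothetical: the actual definition lives elsewhere in the test module.
    NonNegativeInt = st.integers(min_value=0)

The four assume() calls discard any generated counts that would zero the MCC denominator, so Hypothesis only exercises inputs where mcc is well-defined and the [-1, 1] bound applies.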
