Merge pull request #147 from ipums/add_tests

Add tests to cover several untested sections of code
ipums · Aug 27, 2024 · f10b822 · f10b822
2 parents 54d4820 + e4c9941
commit f10b822
Show file tree

Hide file tree

Showing 9 changed files with 405 additions and 47 deletions.
diff --git a/docs/_sources/feature_selection_transforms.md.txt b/docs/_sources/feature_selection_transforms.md.txt
@@ -1,16 +1,26 @@
-# Feature Selection transforms
-
-Each header below represents a feature selection transform.  These transforms are used in the context of `feature_selections`.
-
-```
-[[feature_selections]]
-input_column = "clean_birthyr"
-output_column = "replaced_birthyr"
-condition = "case when clean_birthyr is null or clean_birthyr == '' then year - age else clean_birthyr end"
-transform = "sql_condition"
-```
-
-There are some additional attributes available for all transforms: `checkpoint`, `override_column_a`, `override_column_b`, `set_value_column_a`, `set_value_column_b`.
+# Feature Selection Transforms
+
+Each feature selection in the `[[feature_selections]]` list must have a
+`transform` attribute which tells hlink which transform it uses. The available
+feature selection transforms are listed below. The attributes of the feature
+selection often vary with the feature selection transform. However, there are a
+few utility attributes which are available for all transforms:
+
+- `override_column_a` - Type: `string`. Optional. Given the name of a column in
+  dataset A, copy that column into the output column instead of computing the
+  feature selection for dataset A. This does not affect dataset B.
+- `override_column_b` - Type: `string`. Optional. Given the name of a column in
+  dataset B, copy that column into the output column instead of computing the
+  feature selection for dataset B. This does not affect dataset A.
+- `set_value_column_a` - Type: any. Optional. Instead of computing the feature
+  selection for dataset A, use the given value for every row in the output
+  column. This does not affect dataset B.
+- `set_value_column_b` - Type: any. Optional. Instead of computing the feature
+  selection for dataset B, use the given value for every row in the output
+  column. This does not affect dataset A.
+- `checkpoint` - Type: `boolean`. Optional. If set to true, checkpoint the
+  dataset in Spark before computing the feature selection. This can reduce some
+  resource usage for very complex workflows, but should not be necessary.
 
 ## bigrams
 

diff --git a/docs/feature_selection_transforms.html b/docs/feature_selection_transforms.html
@@ -5,7 +5,7 @@
     <meta charset="utf-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
 
-    <title>Feature Selection transforms &#8212; hlink 3.6.1 documentation</title>
+    <title>Feature Selection Transforms &#8212; hlink 3.6.1 documentation</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=d1102ebc" />
     <link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=12dfc556" />
     <script src="_static/documentation_options.js?v=f731707b"></script>
@@ -33,16 +33,29 @@
           <div class="body" role="main">
 
   <section id="feature-selection-transforms">
-<h1>Feature Selection transforms<a class="headerlink" href="#feature-selection-transforms" title="Link to this heading">¶</a></h1>
-<p>Each header below represents a feature selection transform.  These transforms are used in the context of <code class="docutils literal notranslate"><span class="pre">feature_selections</span></code>.</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">[[</span><span class="n">feature_selections</span><span class="p">]]</span>
-<span class="n">input_column</span> <span class="o">=</span> <span class="s2">&quot;clean_birthyr&quot;</span>
-<span class="n">output_column</span> <span class="o">=</span> <span class="s2">&quot;replaced_birthyr&quot;</span>
-<span class="n">condition</span> <span class="o">=</span> <span class="s2">&quot;case when clean_birthyr is null or clean_birthyr == &#39;&#39; then year - age else clean_birthyr end&quot;</span>
-<span class="n">transform</span> <span class="o">=</span> <span class="s2">&quot;sql_condition&quot;</span>
-</pre></div>
-</div>
-<p>There are some additional attributes available for all transforms: <code class="docutils literal notranslate"><span class="pre">checkpoint</span></code>, <code class="docutils literal notranslate"><span class="pre">override_column_a</span></code>, <code class="docutils literal notranslate"><span class="pre">override_column_b</span></code>, <code class="docutils literal notranslate"><span class="pre">set_value_column_a</span></code>, <code class="docutils literal notranslate"><span class="pre">set_value_column_b</span></code>.</p>
+<h1>Feature Selection Transforms<a class="headerlink" href="#feature-selection-transforms" title="Link to this heading">¶</a></h1>
+<p>Each feature selection in the <code class="docutils literal notranslate"><span class="pre">[[feature_selections]]</span></code> list must have a
+<code class="docutils literal notranslate"><span class="pre">transform</span></code> attribute which tells hlink which transform it uses. The available
+feature selection transforms are listed below. The attributes of the feature
+selection often vary with the feature selection transform. However, there are a
+few utility attributes which are available for all transforms:</p>
+<ul class="simple">
+<li><p><code class="docutils literal notranslate"><span class="pre">override_column_a</span></code> - Type: <code class="docutils literal notranslate"><span class="pre">string</span></code>. Optional. Given the name of a column in
+dataset A, copy that column into the output column instead of computing the
+feature selection for dataset A. This does not affect dataset B.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">override_column_b</span></code> - Type: <code class="docutils literal notranslate"><span class="pre">string</span></code>. Optional. Given the name of a column in
+dataset B, copy that column into the output column instead of computing the
+feature selection for dataset B. This does not affect dataset A.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">set_value_column_a</span></code> - Type: any. Optional. Instead of computing the feature
+selection for dataset A, use the given value for every row in the output
+column. This does not affect dataset B.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">set_value_column_b</span></code> - Type: any. Optional. Instead of computing the feature
+selection for dataset B, use the given value for every row in the output
+column. This does not affect dataset A.</p></li>
+<li><p><code class="docutils literal notranslate"><span class="pre">checkpoint</span></code> - Type: <code class="docutils literal notranslate"><span class="pre">boolean</span></code>. Optional. If set to true, checkpoint the
+dataset in Spark before computing the feature selection. This can reduce some
+resource usage for very complex workflows, but should not be necessary.</p></li>
+</ul>
 <section id="bigrams">
 <h2>bigrams<a class="headerlink" href="#bigrams" title="Link to this heading">¶</a></h2>
 <p>Split the given string column into <a class="reference external" href="https://en.wikipedia.org/wiki/Bigram">bigrams</a>.</p>

diff --git a/docs/objects.inv b/docs/objects.inv
diff --git a/docs/searchindex.js b/docs/searchindex.js
diff --git a/hlink/linking/core/transforms.py b/hlink/linking/core/transforms.py
@@ -26,7 +26,7 @@
 )
 from pyspark.sql.types import ArrayType, LongType, StringType
 from pyspark.ml import Pipeline
-from pyspark.sql import DataFrame, SparkSession, Window
+from pyspark.sql import Column, DataFrame, SparkSession, Window
 from pyspark.ml.feature import NGram, RegexTokenizer, CountVectorizer, MinHashLSH
 
 
@@ -402,13 +402,18 @@ def get_transforms(name: str, is_a: bool) -> list[dict[str, Any]]:
 
 
 #  These apply to the column mappings in the current config
-def apply_transform(column_select, transform, is_a):
-    """Given a dataframe select string return a new string having applied the given transform.
-    column_select: A PySpark column type
-    transform: The transform info from the current config
-    is_a: Is running on dataset 'a' or 'b ?
-
-    See the json_schema config file in config_schemas/config.json for definitions on each transform type.
+def apply_transform(
+    column_select: Column, transform: dict[str, Any], is_a: bool
+) -> Column:
+    """Return a new column that is the result of applying the given transform
+    to the given input column (column_select). The is_a parameter controls the
+    behavior of the transforms like "add_to_a" which act differently on
+    datasets A and B.
+
+    Args:
+    column_select: a PySpark Column
+    transform: the transform to apply
+    is_a: whether this is dataset A (True) or B (False)
     """
     transform_type = transform["type"]
     if transform_type == "add_to_a":

diff --git a/hlink/tests/core/comparison_feature_test.py b/hlink/tests/core/comparison_feature_test.py
@@ -2,10 +2,12 @@
 # For copyright and licensing information, see the NOTICE and LICENSE files
 # in this project's top-level directory, and also on-line at:
 #   https://github.com/ipums/hlink
+import pytest
 
 import hlink.linking.core.comparison_feature as comparison_feature_core
 import hlink.linking.core.pipeline as pipeline_core
 from pyspark.ml import Pipeline
+from pyspark.sql import Row
 
 
 def test_rel_jaro_winkler_comparison(spark, conf, datasource_rel_jw_input):
@@ -374,3 +376,81 @@ def test_multi_jaro_winkler_search_column_templating():
     assert "static_column" in sql_expr
     assert "static_column1" not in sql_expr
     assert "static_colum1" not in sql_expr
+
+
+def test_b_minus_a_comparison(spark) -> None:
+    """Check the "b_minus_a" comparison type on a small cross join.
+
+    For every pair of rows from table_a and table_b, the generated SQL
+    expression is expected to yield b.age - a.age under the alias "agediff".
+    """
+    schema = "id:integer, age:integer"
+    spark.createDataFrame([[0, 15], [1, 77]], schema).write.saveAsTable("table_a")
+    spark.createDataFrame([[100, 15], [101, 70]], schema).write.saveAsTable("table_b")
+
+    sql_expr = comparison_feature_core.generate_comparison_feature(
+        {
+            "alias": "agediff",
+            "column_name": "age",
+            "comparison_type": "b_minus_a",
+        },
+        "id",
+        include_as=True,
+    )
+
+    # Sort by both ids so that the collected rows have a deterministic order.
+    query = f"SELECT a.id AS id_a, b.id AS id_b, {sql_expr} FROM table_a a CROSS JOIN table_b b"
+    rows = spark.sql(query).sort("id_a", "id_b").collect()
+
+    expected = [
+        Row(id_a=0, id_b=100, agediff=0),
+        Row(id_a=0, id_b=101, agediff=55),
+        Row(id_a=1, id_b=100, agediff=-62),
+        Row(id_a=1, id_b=101, agediff=-7),
+    ]
+    assert rows == expected
+
+
+def test_b_minus_a_comparison_with_not_equals(spark) -> None:
+    """Check the "b_minus_a" comparison type when "not_equals" is set.
+
+    With not_equals = 99, every pair in which either age is 99 is expected to
+    get agediff = -1. All other pairs are expected to get b.age - a.age.
+    """
+    schema = "id:integer, age:integer"
+    rows_a = [[0, 15], [1, 77], [2, 99]]
+    rows_b = [[100, 15], [101, 70], [102, 99]]
+    spark.createDataFrame(rows_a, schema).write.saveAsTable("table_a")
+    spark.createDataFrame(rows_b, schema).write.saveAsTable("table_b")
+
+    sql_expr = comparison_feature_core.generate_comparison_feature(
+        {
+            "alias": "agediff",
+            "column_name": "age",
+            "comparison_type": "b_minus_a",
+            "not_equals": 99,
+        },
+        "id",
+        include_as=True,
+    )
+
+    # Sort by both ids so that the collected rows have a deterministic order.
+    query = f"SELECT a.id AS id_a, b.id AS id_b, {sql_expr} FROM table_a a CROSS JOIN table_b b"
+    rows = spark.sql(query).sort("id_a", "id_b").collect()
+
+    expected = [
+        Row(id_a=0, id_b=100, agediff=0),
+        Row(id_a=0, id_b=101, agediff=55),
+        Row(id_a=0, id_b=102, agediff=-1),
+        Row(id_a=1, id_b=100, agediff=-62),
+        Row(id_a=1, id_b=101, agediff=-7),
+        Row(id_a=1, id_b=102, agediff=-1),
+        Row(id_a=2, id_b=100, agediff=-1),
+        Row(id_a=2, id_b=101, agediff=-1),
+        Row(id_a=2, id_b=102, agediff=-1),
+    ]
+    assert rows == expected
+
+
+def test_generate_comparison_feature_error_on_unknown_comparison_type() -> None:
+    """An unrecognized comparison_type should raise a ValueError."""
+    unsupported_feature = {"comparison_type": "not_supported"}
+    expected_error = pytest.raises(ValueError, match="No comparison type")
+    with expected_error:
+        comparison_feature_core.generate_comparison_feature(unsupported_feature, "id")