Add a test for the substring column mapping transform

This test confirms that the transform raises an error when the values list does not have exactly 2 elements. Writing it prompted me to open issue #146, which I think would really simplify this transform.
ipums · Aug 27, 2024 · e4c9941 · e4c9941
1 parent 4158841
commit e4c9941
Showing 1 changed file with 29 additions and 0 deletions.
diff --git a/hlink/tests/core/transforms_test.py b/hlink/tests/core/transforms_test.py
@@ -260,6 +260,35 @@ def test_apply_transform_remove_punctuation(spark: SparkSession, is_a: bool) ->
     ]
 
 
+@pytest.mark.parametrize("values", [[1], [1, 2, 3]])
+@pytest.mark.parametrize("is_a", [True, False])
+def test_apply_transform_substring_error_when_not_exactly_2_values(
+    values: list[int], is_a: bool
+) -> None:
+    """
+    The substring transform takes a list of exactly two values, which are the
+    start position of the substring and its length. If the list has the wrong
+    number of values, then apply_transform() raises an error.
+
+    TODO: It would be simpler to have two separate attributes for the substring
+    start and length, like this:
+
+    {
+        "type": "substring",
+        "start_index": 0,
+        "length": 4,
+    }
+
+    See issue #146. Making these changes would eliminate the need for this
+    test.
+    """
+    input_col = col("input")
+    transform = {"type": "substring", "values": values}
+
+    with pytest.raises(ValueError, match="Length of substr transform should be 2"):
+        apply_transform(input_col, transform, is_a)
+
+
 @pytest.mark.parametrize("is_a", [True, False])
 def test_apply_transform_error_when_unrecognized_transform_type(is_a: bool) -> None:
     column_select = col("test")