1
1
import os

import pytest
from pyspark.sql import SparkSession

from hlink.configs.load_config import load_conf_file
from hlink.linking.link_run import LinkRun
from hlink.scripts.lib.conf_validations import analyze_conf, check_column_mappings
7
9
8
10
@@ -25,3 +27,199 @@ def test_invalid_conf(conf_dir_path, spark, conf_name, error_msg):
25
27
26
28
with pytest .raises (ValueError , match = error_msg ):
27
29
analyze_conf (link_run )
30
+
31
+
32
def test_check_column_mappings_mappings_missing(spark: SparkSession) -> None:
    """A config with no column_mappings section at all is rejected."""
    empty_config = {}
    frame_a = spark.createDataFrame([[1], [2], [3]], ["a"])
    frame_b = spark.createDataFrame([[4], [5], [6]], ["b"])

    pattern = r"No \[\[column_mappings\]\] exist in the conf file"
    with pytest.raises(ValueError, match=pattern):
        check_column_mappings(empty_config, frame_a, frame_b)
44
+
45
+
46
def test_check_column_mappings_no_column_name(spark: SparkSession) -> None:
    """Every [[column_mappings]] entry must carry a column_name attribute."""
    config = {
        "column_mappings": [{"column_name": "AGE", "alias": "age"}, {"alias": "height"}]
    }
    frame_a = spark.createDataFrame([[20], [40], [60]], ["AGE"])
    frame_b = spark.createDataFrame([[70], [50], [30]], ["AGE"])

    pattern = r"The following \[\[column_mappings\]\] has no 'column_name' attribute:"
    with pytest.raises(ValueError, match=pattern):
        check_column_mappings(config, frame_a, frame_b)
61
+
62
+
63
def test_check_column_mappings_column_name_not_available_datasource_a(
    spark: SparkSession,
) -> None:
    """A column_name must exist in datasource A or be the alias of an earlier mapping."""
    config = {"column_mappings": [{"column_name": "HEIGHT"}]}

    # HEIGHT is present only in datasource B.
    frame_a = spark.createDataFrame([[20], [40], [60]], ["AGE"])
    frame_b = spark.createDataFrame([[70, 123], [50, 123], [30, 123]], ["AGE", "HEIGHT"])

    pattern = (
        r"Within a \[\[column_mappings\]\] the column_name 'HEIGHT' "
        r"does not exist in datasource_a and no previous \[\[column_mapping\]\] "
        "alias exists for it"
    )

    with pytest.raises(ValueError, match=pattern):
        check_column_mappings(config, frame_a, frame_b)
83
+
84
+
85
def test_check_column_mappings_set_value_column_a_does_not_need_column(
    spark: SparkSession,
) -> None:
    """set_value_column_a exempts the mapped column from existing in datasource A."""
    config = {"column_mappings": [{"column_name": "HEIGHT", "set_value_column_a": 125}]}

    # HEIGHT is missing from datasource A, but set_value_column_a makes that OK.
    frame_a = spark.createDataFrame([[20], [40], [60]], ["AGE"])
    frame_b = spark.createDataFrame([[70, 123], [50, 123], [30, 123]], ["AGE", "HEIGHT"])

    # Should not raise.
    check_column_mappings(config, frame_a, frame_b)
98
+
99
+
100
def test_check_column_mappings_column_name_not_available_datasource_b(
    spark: SparkSession,
) -> None:
    """A column_name must exist in datasource B or be the alias of an earlier mapping."""
    config = {"column_mappings": [{"column_name": "HEIGHT"}]}

    # HEIGHT is present only in datasource A.
    frame_a = spark.createDataFrame([[70, 123], [50, 123], [30, 123]], ["AGE", "HEIGHT"])
    frame_b = spark.createDataFrame([[20], [40], [60]], ["AGE"])

    pattern = (
        r"Within a \[\[column_mappings\]\] the column_name 'HEIGHT' "
        r"does not exist in datasource_b and no previous \[\[column_mapping\]\] "
        "alias exists for it"
    )

    with pytest.raises(ValueError, match=pattern):
        check_column_mappings(config, frame_a, frame_b)
120
+
121
+
122
def test_check_column_mappings_set_value_column_b_does_not_need_column(
    spark: SparkSession,
) -> None:
    """set_value_column_b exempts the mapped column from existing in datasource B."""
    config = {"column_mappings": [{"column_name": "HEIGHT", "set_value_column_b": 125}]}

    # HEIGHT is missing from datasource B, but set_value_column_b makes that OK.
    frame_a = spark.createDataFrame([[70, 123], [50, 123], [30, 123]], ["AGE", "HEIGHT"])
    frame_b = spark.createDataFrame([[20], [40], [60]], ["AGE"])

    # Should not raise.
    check_column_mappings(config, frame_a, frame_b)
135
+
136
+
137
def test_check_column_mappings_previous_mappings_are_available(
    spark: SparkSession,
) -> None:
    """An alias created by one mapping is usable as column_name by later mappings."""
    config = {
        "column_mappings": [
            {"column_name": "AGE", "alias": "AGE_HLINK"},
            # AGE_HLINK is not in either datasource; it comes from the mapping above.
            {"column_name": "AGE_HLINK", "alias": "AGE_HLINK2"},
        ]
    }
    frame_a = spark.createDataFrame([[70], [50], [30]], ["AGE"])
    frame_b = spark.createDataFrame([[20], [40], [60]], ["AGE"])

    # Should not raise.
    check_column_mappings(config, frame_a, frame_b)
154
+
155
+
156
def test_check_column_mappings_override_column_a(spark: SparkSession) -> None:
    """override_column_a selects which datasource A column backs the mapping."""
    config = {
        "column_mappings": [{"column_name": "AGE", "override_column_a": "ageColumn"}]
    }
    # Datasource A has no AGE column, but the override points at ageColumn.
    frame_a = spark.createDataFrame([[20], [40], [60]], ["ageColumn"])
    frame_b = spark.createDataFrame([[70], [50], [30]], ["AGE"])

    # Should not raise.
    check_column_mappings(config, frame_a, frame_b)
168
+
169
+
170
def test_check_column_mappings_override_column_b(spark: SparkSession) -> None:
    """override_column_b selects which datasource B column backs the mapping."""
    config = {
        "column_mappings": [{"column_name": "ageColumn", "override_column_b": "AGE"}]
    }
    # Datasource B has no ageColumn, but the override points at AGE.
    frame_a = spark.createDataFrame([[20], [40], [60]], ["ageColumn"])
    frame_b = spark.createDataFrame([[70], [50], [30]], ["AGE"])

    # Should not raise.
    check_column_mappings(config, frame_a, frame_b)
182
+
183
+
184
def test_check_column_mappings_override_column_a_not_present(
    spark: SparkSession,
) -> None:
    """An override_column_a that names a missing datasource A column is rejected."""
    config = {
        "column_mappings": [
            {"column_name": "AGE", "override_column_a": "oops_not_there"}
        ]
    }
    frame_a = spark.createDataFrame([[20], [40], [60]], ["ageColumn"])
    frame_b = spark.createDataFrame([[70], [50], [30]], ["AGE"])

    pattern = (
        r"Within a \[\[column_mappings\]\] the override_column_a column "
        "'oops_not_there' does not exist in datasource_a"
    )
    with pytest.raises(ValueError, match=pattern):
        check_column_mappings(config, frame_a, frame_b)
204
+
205
+
206
def test_check_column_mappings_override_column_b_not_present(
    spark: SparkSession,
) -> None:
    """An override_column_b that names a missing datasource B column is rejected."""
    config = {
        "column_mappings": [
            {"column_name": "AGE", "override_column_b": "oops_not_there"}
        ]
    }
    frame_a = spark.createDataFrame([[20], [40], [60]], ["AGE"])
    frame_b = spark.createDataFrame([[70], [50], [30]], ["AGE"])

    pattern = (
        r"Within a \[\[column_mappings\]\] the override_column_b column "
        "'oops_not_there' does not exist in datasource_b"
    )
    with pytest.raises(ValueError, match=pattern):
        check_column_mappings(config, frame_a, frame_b)
0 commit comments