feat: better numeric field conversion

agrc · Jul 21, 2023 · c4c6cf0 · c4c6cf0
1 parent 785be92
commit c4c6cf0
Show file tree

Hide file tree

Showing 3 changed files with 65 additions and 10 deletions.
diff --git a/src/palletjack/__init__.py b/src/palletjack/__init__.py
@@ -2,6 +2,11 @@
 
 .. include:: ../../docs/README.md
 """
+import locale
 
 from . import extract, load, transform, utils
 from .errors import IntFieldAsFloatError, TimezoneAwareDatetimeError
+
+#: If LC_NUMERIC is not set explicitly, copy the locale from locale.getlocale() so text-to-number conversions use it
+if not locale.getlocale(locale.LC_NUMERIC)[0]:
+    locale.setlocale(locale.LC_NUMERIC, locale.getlocale())
diff --git a/src/palletjack/transform.py b/src/palletjack/transform.py
@@ -1,5 +1,6 @@
 """Transform pandas dataframes in preparation for loading to AGOL.
 """
+import locale
 import logging
 import warnings
 from datetime import datetime
@@ -186,10 +187,8 @@ def switch_to_nullable_int(dataframe, fields_that_should_be_ints):
         retyped = dataframe.copy()
         try:
             for field in fields_that_should_be_ints:
-                retyped[field] = retyped[field].astype(str).str.replace(',', '')
-                retyped[field].replace('', None, inplace=True)
-                retyped[field] = retyped[field].astype('Int64')
-        except TypeError as error:
+                retyped[field] = DataCleaning._switch_series_to_numeric_dtype(retyped[field], 'Int64')
+        except ValueError as error:
             raise TypeError(
                 'Cannot convert one or more fields to nullable ints. Check for non-int/non-np.nan values.'
             ) from error
@@ -214,15 +213,33 @@ def switch_to_float(dataframe, fields_that_should_be_floats):
         retyped = dataframe.copy()
         try:
             for field in fields_that_should_be_floats:
-                retyped[field] = retyped[field].astype(str).str.replace(',', '')
-                retyped[field].replace('', None, inplace=True)
-                retyped[field] = retyped[field].astype(float)
-        except TypeError as error:
+                retyped[field] = DataCleaning._switch_series_to_numeric_dtype(retyped[field], 'float')
+        except ValueError as error:
             raise TypeError(
                 'Cannot convert one or more fields to floats. Check for non-float/non-null values.'
             ) from error
         return retyped
 
+    @staticmethod
+    def _switch_series_to_numeric_dtype(series, dtype):
+        """Switch the dtype of a series to the specified dtype
+
+        Series of dtype 'object' (ie, series of strings or mixed strings and numbers) are converted to str so that they
+        can be de-localized with locale.delocalize, which removes the LC_NUMERIC thousands separator (eg, commas)
+
+        Args:
+            series (pd.Series): The series to be converted
+            dtype (str): The dtype to convert to
+
+        Returns:
+            pd.Series: The converted series
+        """
+
+        if series.dtype == 'object':
+            series = series.astype(str).apply(locale.delocalize)
+            series.replace('', None, inplace=True)
+        return series.astype(dtype)
+
     @staticmethod
     def switch_to_datetime(dataframe, date_fields, **to_datetime_kwargs):
         """Convert specified fields to datetime dtypes to ensure proper date formatting for AGOL

diff --git a/tests/test_transform.py b/tests/test_transform.py
@@ -416,7 +416,7 @@ def test_switch_to_nullable_int_comma_thousands_separator(self):
 
         retyped_df = palletjack.transform.DataCleaning.switch_to_nullable_int(df, ['a'])
 
-        test_df = pd.DataFrame([1., 2., 3000.], columns=['a'], dtype='float')
+        test_df = pd.DataFrame([1, 2, 3000], columns=['a'], dtype='Int64')
 
         tm.assert_frame_equal(retyped_df, test_df)
 
@@ -427,7 +427,7 @@ def test_switch_to_nullable_int_comma_thousands_separator_mixed_input_types(self
 
         retyped_df = palletjack.transform.DataCleaning.switch_to_nullable_int(df, ['a'])
 
-        test_df = pd.DataFrame([1., 2., 3000.], columns=['a'], dtype='float')
+        test_df = pd.DataFrame([1, 2, 3000], columns=['a'], dtype='Int64')
 
         tm.assert_frame_equal(retyped_df, test_df)
 
@@ -501,6 +501,39 @@ def test_switch_to_float_comma_thousands_separator_mixed_input_types(self):
         tm.assert_frame_equal(retyped_df, test_df)
 
 
+class TestSwitchSeriesToNumericDtype:
+
+    def test_switch_series_to_numeric_dtype_ints_to_Int64(self):
+        series1 = pd.Series([1, 2, 3, 4, 5])
+        test_series = pd.Series([1, 2, 3, 4, 5], dtype='Int64')
+        assert palletjack.transform.DataCleaning._switch_series_to_numeric_dtype(series1, 'Int64').equals(test_series)
+
+    def test_switch_series_to_numeric_dtype_str_to_float_with_thousands(self):
+        series2 = pd.Series(['1,000', '2,000', '3,000', '4,000', '5,000'])
+        test_series = pd.Series([1000.0, 2000.0, 3000.0, 4000.0, 5000.0])
+        assert palletjack.transform.DataCleaning._switch_series_to_numeric_dtype(series2, 'float').equals(test_series)
+
+    def test_switch_series_to_numeric_dtype_mixed_strings_ints_to_Int64(self):
+        series3 = pd.Series(['1,000', 2, '3,000', 4, '5,000'])
+        test_series = pd.Series([1000, 2, 3000, 4, 5000], dtype='Int64')
+        assert palletjack.transform.DataCleaning._switch_series_to_numeric_dtype(series3, 'Int64').equals(test_series)
+
+    def test_switch_series_to_numeric_dtype_mixed_strings_ints_to_float(self):
+        series3 = pd.Series(['1,000', 2, '3,000', 4, '5,000'])
+        test_series = pd.Series([1000.0, 2.0, 3000.0, 4.0, 5000.0])
+        assert palletjack.transform.DataCleaning._switch_series_to_numeric_dtype(series3, 'float').equals(test_series)
+
+    def test_switch_series_to_numeric_dtype_ints_with_nan_to_Int64(self):
+        series4 = pd.Series([1, 2, np.nan])
+        test_series = pd.Series([1, 2, pd.NA], dtype='Int64')
+        assert palletjack.transform.DataCleaning._switch_series_to_numeric_dtype(series4, 'Int64').equals(test_series)
+
+    def test_switch_series_to_numeric_dtype_raises_on_non_numeric(self):
+        series5 = pd.Series(['a', 'b', 'c', 'd', 'e'])
+        with pytest.raises(ValueError):
+            palletjack.transform.DataCleaning._switch_series_to_numeric_dtype(series5, 'float')
+
+
 class TestDatetimeSwitching:
 
     def test_switch_to_datetime_handles_multiple_fields(self):