Skip to content

Commit

Permalink
feat: better numeric field conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
jacobdadams committed Jul 21, 2023
1 parent 785be92 commit c4c6cf0
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 10 deletions.
5 changes: 5 additions & 0 deletions src/palletjack/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
.. include:: ../../docs/README.md
"""
import locale

from . import extract, load, transform, utils
from .errors import IntFieldAsFloatError, TimezoneAwareDatetimeError

#: If LC_NUMERIC has not been set explicitly (getlocale reports no language for it), align it with the
#: interpreter's current locale so text to number conversions use that locale's thousands/decimal separators.
if not locale.getlocale(locale.LC_NUMERIC)[0]:
    try:
        locale.setlocale(locale.LC_NUMERIC, locale.getlocale())
    except (locale.Error, ValueError) as error:
        # getlocale() raises ValueError for a locale name it cannot parse and setlocale() raises locale.Error
        # for a locale the platform rejects. This default is best-effort, so warn and leave LC_NUMERIC as it
        # was instead of making the whole package unimportable.
        import warnings

        warnings.warn(f'Could not set LC_NUMERIC from the current locale: {error}', RuntimeWarning)
33 changes: 25 additions & 8 deletions src/palletjack/transform.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Transform pandas dataframes in preparation for loading to AGOL.
"""
import locale
import logging
import warnings
from datetime import datetime
Expand Down Expand Up @@ -186,10 +187,8 @@ def switch_to_nullable_int(dataframe, fields_that_should_be_ints):
retyped = dataframe.copy()
try:
for field in fields_that_should_be_ints:
retyped[field] = retyped[field].astype(str).str.replace(',', '')
retyped[field].replace('', None, inplace=True)
retyped[field] = retyped[field].astype('Int64')
except TypeError as error:
retyped[field] = DataCleaning._switch_series_to_numeric_dtype(retyped[field], 'Int64')
except ValueError as error:
raise TypeError(
'Cannot convert one or more fields to nullable ints. Check for non-int/non-np.nan values.'
) from error
Expand All @@ -214,15 +213,33 @@ def switch_to_float(dataframe, fields_that_should_be_floats):
retyped = dataframe.copy()
try:
for field in fields_that_should_be_floats:
retyped[field] = retyped[field].astype(str).str.replace(',', '')
retyped[field].replace('', None, inplace=True)
retyped[field] = retyped[field].astype(float)
except TypeError as error:
retyped[field] = DataCleaning._switch_series_to_numeric_dtype(retyped[field], 'float')
except ValueError as error:
raise TypeError(
'Cannot convert one or more fields to floats. Check for non-float/non-null values.'
) from error
return retyped

@staticmethod
def _switch_series_to_numeric_dtype(series, dtype):
    """Switch the dtype of a series to the specified numeric dtype

    Series of dtype 'object' (ie, series of strings or mixed strings and numbers) have each non-missing value
    converted to str and passed through locale.delocalize, which strips the thousands separator of the current
    LC_NUMERIC locale and normalizes its decimal point to '.'. Missing values are left missing (rather than being
    turned into the text 'None'/'nan') and empty strings are treated as missing. Series of any other dtype are
    passed straight to astype.

    Args:
        series (pd.Series): The series to be converted
        dtype (str): The dtype to convert to

    Returns:
        pd.Series: The converted series

    Raises:
        Whatever series.astype raises for values that cannot be converted; nothing is caught here.
    """

    if series.dtype == 'object':
        #: na_action='ignore' skips missing values so they are not stringified; everything else becomes a
        #: de-localized str so the whole column converts through the same string-parsing path.
        series = series.map(lambda value: locale.delocalize(str(value)), na_action='ignore')
        #: An empty string can't be parsed as a number; mask it to missing. This returns a new series, so the
        #: caller's data is never modified in place.
        series = series.mask(series == '')
    return series.astype(dtype)

@staticmethod
def switch_to_datetime(dataframe, date_fields, **to_datetime_kwargs):
"""Convert specified fields to datetime dtypes to ensure proper date formatting for AGOL
Expand Down
37 changes: 35 additions & 2 deletions tests/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ def test_switch_to_nullable_int_comma_thousands_separator(self):

retyped_df = palletjack.transform.DataCleaning.switch_to_nullable_int(df, ['a'])

test_df = pd.DataFrame([1., 2., 3000.], columns=['a'], dtype='float')
test_df = pd.DataFrame([1, 2, 3000], columns=['a'], dtype='Int64')

tm.assert_frame_equal(retyped_df, test_df)

Expand All @@ -427,7 +427,7 @@ def test_switch_to_nullable_int_comma_thousands_separator_mixed_input_types(self

retyped_df = palletjack.transform.DataCleaning.switch_to_nullable_int(df, ['a'])

test_df = pd.DataFrame([1., 2., 3000.], columns=['a'], dtype='float')
test_df = pd.DataFrame([1, 2, 3000], columns=['a'], dtype='Int64')

tm.assert_frame_equal(retyped_df, test_df)

Expand Down Expand Up @@ -501,6 +501,39 @@ def test_switch_to_float_comma_thousands_separator_mixed_input_types(self):
tm.assert_frame_equal(retyped_df, test_df)


class TestSwitchSeriesToNumericDtype:
    """Checks for DataCleaning._switch_series_to_numeric_dtype with int, str, mixed, and null-bearing input."""

    def test_switch_series_to_numeric_dtype_ints_to_Int64(self):
        plain_ints = pd.Series([1, 2, 3, 4, 5])
        expected = pd.Series([1, 2, 3, 4, 5], dtype='Int64')

        converted = palletjack.transform.DataCleaning._switch_series_to_numeric_dtype(plain_ints, 'Int64')

        assert converted.equals(expected)

    def test_switch_series_to_numeric_dtype_str_to_float_with_thousands(self):
        # NOTE: assumes the active LC_NUMERIC locale uses ',' as its thousands separator
        thousands_text = pd.Series(['1,000', '2,000', '3,000', '4,000', '5,000'])
        expected = pd.Series([1000.0, 2000.0, 3000.0, 4000.0, 5000.0])

        converted = palletjack.transform.DataCleaning._switch_series_to_numeric_dtype(thousands_text, 'float')

        assert converted.equals(expected)

    def test_switch_series_to_numeric_dtype_mixed_strings_ints_to_Int64(self):
        # Strings and ints side by side give the series an object dtype
        mixed_values = pd.Series(['1,000', 2, '3,000', 4, '5,000'])
        expected = pd.Series([1000, 2, 3000, 4, 5000], dtype='Int64')

        converted = palletjack.transform.DataCleaning._switch_series_to_numeric_dtype(mixed_values, 'Int64')

        assert converted.equals(expected)

    def test_switch_series_to_numeric_dtype_mixed_strings_ints_to_float(self):
        mixed_values = pd.Series(['1,000', 2, '3,000', 4, '5,000'])
        expected = pd.Series([1000.0, 2.0, 3000.0, 4.0, 5000.0])

        converted = palletjack.transform.DataCleaning._switch_series_to_numeric_dtype(mixed_values, 'float')

        assert converted.equals(expected)

    def test_switch_series_to_numeric_dtype_ints_with_nan_to_Int64(self):
        ints_with_gap = pd.Series([1, 2, np.nan])
        expected = pd.Series([1, 2, pd.NA], dtype='Int64')

        converted = palletjack.transform.DataCleaning._switch_series_to_numeric_dtype(ints_with_gap, 'Int64')

        assert converted.equals(expected)

    def test_switch_series_to_numeric_dtype_raises_on_non_numeric(self):
        letters = pd.Series(['a', 'b', 'c', 'd', 'e'])

        # Text that can't be parsed as a number must surface as a ValueError
        with pytest.raises(ValueError):
            palletjack.transform.DataCleaning._switch_series_to_numeric_dtype(letters, 'float')


class TestDatetimeSwitching:

def test_switch_to_datetime_handles_multiple_fields(self):
Expand Down

0 comments on commit c4c6cf0

Please sign in to comment.