Skip to content

Commit

Permalink
fix problem with missing values for string columns.
Browse files Browse the repository at this point in the history
  • Loading branch information
wendycwong committed Sep 19, 2023
1 parent ecfc2c5 commit 585f836
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 9 deletions.
5 changes: 3 additions & 2 deletions h2o-py/h2o/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1962,7 +1962,8 @@ def as_data_frame(self, use_pandas=True, header=True, multi_thread=False):
try:
tmpdir = tempfile.mkdtemp()
fileName = os.path.join(tmpdir, "h2oframe2Convert.csv")
h2o.export_file(self, fileName)
#h2o.export_file(self, fileName)
h2o.download_csv(self, fileName)
import datatable as dt
frameTypes = self.types
validFrameTypes = {}
Expand All @@ -1971,7 +1972,7 @@ def as_data_frame(self, use_pandas=True, header=True, multi_thread=False):
validFrameTypes[key] = dt.int64
elif value.startswith("real"):
validFrameTypes[key] = dt.float64
dt_frame = dt.fread(fileName, columns=validFrameTypes)
dt_frame = dt.fread(fileName, na_strings=[""], columns=validFrameTypes)
return dt_frame.to_pandas()
finally:
os.remove(fileName)
Expand Down
24 changes: 17 additions & 7 deletions h2o-py/tests/testdir_misc/pyunit_gh_15729_datatable_2_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from tests import pyunit_utils
from h2o.utils.shared_utils import (can_use_pandas, can_use_datatable)
import time
import numpy as np
import pandas as pd

# If datatable is installed, this test shows that using datatable to convert an H2O frame to a pandas frame is
# much faster.
Expand All @@ -27,21 +29,29 @@ def test_frame_conversion(dataset, compareTime):
new_types = new_pandas_frame.dtypes
old_types = original_pandas_frame.dtypes
ncol = h2oFrame.ncol
nrow = h2oFrame.nrow

for ind in range(ncol):
for ind in list(range(ncol)):
assert new_types[ind] == old_types[ind], "Expected column types: {0}, actual column types: " \
"{1}".format(old_types[ind], new_types[ind])


if new_types[ind] == "object":
diff = new_pandas_frame.iloc[:, ind] == original_pandas_frame.iloc[:, ind]
if not diff.all(): # difference caused by the presence of NAs
newSeries = pd.Series(new_pandas_frame.iloc[:, ind])
newNA = newSeries.isna()
oldSeries = pd.Series(original_pandas_frame.iloc[:, ind])
oldNA = oldSeries.isna()
assert (newNA==oldNA).all()
else:
diff = (new_pandas_frame.iloc[:, ind] - original_pandas_frame.iloc[:, ind]).abs()
assert diff.max() < 1e-10

def test_polars_pandas():
if not(can_use_pandas()):
pyunit_utils.install("pandas")
import pandas
if not(can_use_datatable()):
pyunit_utils.install("datatable")
import datatable
test_frame_conversion("smalldata/glm_test/multinomial_3Class_10KRow.csv", True)
test_frame_conversion("smalldata/titanic/titanic_expanded.csv", False)
test_frame_conversion("smalldata/glm_test/multinomial_3Class_10KRow.csv", True)
test_frame_conversion("smalldata/timeSeries/CreditCard-ts_train.csv", False)


Expand Down

0 comments on commit 585f836

Please sign in to comment.