From 585f836617dd69699529d67fb1ec6419dae55073 Mon Sep 17 00:00:00 2001 From: wendycwong Date: Tue, 19 Sep 2023 10:45:19 -0700 Subject: [PATCH] Fix missing values in string columns when converting an H2OFrame to pandas via datatable. --- h2o-py/h2o/frame.py | 5 ++-- .../pyunit_gh_15729_datatable_2_pandas.py | 24 +++++++++++++------ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/h2o-py/h2o/frame.py b/h2o-py/h2o/frame.py index 6594af69829f..d1826d6ad0bd 100644 --- a/h2o-py/h2o/frame.py +++ b/h2o-py/h2o/frame.py @@ -1962,7 +1962,8 @@ def as_data_frame(self, use_pandas=True, header=True, multi_thread=False): try: tmpdir = tempfile.mkdtemp() fileName = os.path.join(tmpdir, "h2oframe2Convert.csv") - h2o.export_file(self, fileName) + #h2o.export_file(self, fileName) + h2o.download_csv(self, fileName) import datatable as dt frameTypes = self.types validFrameTypes = {} @@ -1971,7 +1972,7 @@ def as_data_frame(self, use_pandas=True, header=True, multi_thread=False): validFrameTypes[key] = dt.int64 elif value.startswith("real"): validFrameTypes[key] = dt.float64 - dt_frame = dt.fread(fileName, columns=validFrameTypes) + dt_frame = dt.fread(fileName, na_strings=[""], columns=validFrameTypes) return dt_frame.to_pandas() finally: os.remove(fileName) diff --git a/h2o-py/tests/testdir_misc/pyunit_gh_15729_datatable_2_pandas.py b/h2o-py/tests/testdir_misc/pyunit_gh_15729_datatable_2_pandas.py index accf54ea63b1..307962a6741f 100644 --- a/h2o-py/tests/testdir_misc/pyunit_gh_15729_datatable_2_pandas.py +++ b/h2o-py/tests/testdir_misc/pyunit_gh_15729_datatable_2_pandas.py @@ -4,6 +4,8 @@ from tests import pyunit_utils from h2o.utils.shared_utils import (can_use_pandas, can_use_datatable) import time +import numpy as np +import pandas as pd # if datatable is installed, this test will show that using datatable to convert h2o frame to pandas frame is # much faster. 
@@ -27,21 +29,29 @@ def test_frame_conversion(dataset, compareTime): new_types = new_pandas_frame.dtypes old_types = original_pandas_frame.dtypes ncol = h2oFrame.ncol + nrow = h2oFrame.nrow - for ind in range(ncol): + for ind in list(range(ncol)): assert new_types[ind] == old_types[ind], "Expected column types: {0}, actual column types: " \ "{1}".format(old_types[ind], new_types[ind]) - - + if new_types[ind] == "object": + diff = new_pandas_frame.iloc[:, ind] == original_pandas_frame.iloc[:, ind] + if not diff.all(): # difference caused by the presence of NAs + newSeries = pd.Series(new_pandas_frame.iloc[:, ind]) + newNA = newSeries.isna() + oldSeries = pd.Series(original_pandas_frame.iloc[:, ind]) + oldNA = oldSeries.isna() + assert (newNA==oldNA).all() + else: + diff = (new_pandas_frame.iloc[:, ind] - original_pandas_frame.iloc[:, ind]).abs() + assert diff.max() < 1e-10 + def test_polars_pandas(): - if not(can_use_pandas()): - pyunit_utils.install("pandas") - import pandas if not(can_use_datatable()): pyunit_utils.install("datatable") import datatable - test_frame_conversion("smalldata/glm_test/multinomial_3Class_10KRow.csv", True) test_frame_conversion("smalldata/titanic/titanic_expanded.csv", False) + test_frame_conversion("smalldata/glm_test/multinomial_3Class_10KRow.csv", True) test_frame_conversion("smalldata/timeSeries/CreditCard-ts_train.csv", False)