Fix problem with missing values in string columns when converting an H2OFrame to a pandas DataFrame via datatable.

h2oai · Sep 19, 2023 · 585f836 · 585f836
1 parent ecfc2c5
commit 585f836
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 9 deletions.
diff --git a/h2o-py/h2o/frame.py b/h2o-py/h2o/frame.py
@@ -1962,7 +1962,8 @@ def as_data_frame(self, use_pandas=True, header=True, multi_thread=False):
                     try:
                         tmpdir = tempfile.mkdtemp()
                         fileName = os.path.join(tmpdir, "h2oframe2Convert.csv")
-                        h2o.export_file(self, fileName)
+                        #h2o.export_file(self, fileName)
+                        h2o.download_csv(self, fileName)
                         import datatable as dt
                         frameTypes = self.types
                         validFrameTypes = {}
@@ -1971,7 +1972,7 @@ def as_data_frame(self, use_pandas=True, header=True, multi_thread=False):
                                 validFrameTypes[key] = dt.int64
                             elif value.startswith("real"):
                                 validFrameTypes[key] = dt.float64
-                        dt_frame = dt.fread(fileName, columns=validFrameTypes)
+                        dt_frame = dt.fread(fileName, na_strings=[""], columns=validFrameTypes)
                         return dt_frame.to_pandas()
                     finally:
                         os.remove(fileName)

diff --git a/h2o-py/tests/testdir_misc/pyunit_gh_15729_datatable_2_pandas.py b/h2o-py/tests/testdir_misc/pyunit_gh_15729_datatable_2_pandas.py
@@ -4,6 +4,8 @@
 from tests import pyunit_utils
 from h2o.utils.shared_utils import (can_use_pandas, can_use_datatable)
 import time
+import numpy as np
+import pandas as pd
 
 # if datatable is installed, this test will show that using datatable to convert h2o frame to pandas frame is
 # much faster.
@@ -27,21 +29,29 @@ def test_frame_conversion(dataset, compareTime):
     new_types = new_pandas_frame.dtypes
     old_types = original_pandas_frame.dtypes
     ncol = h2oFrame.ncol
+    nrow = h2oFrame.nrow
 
-    for ind in range(ncol):
+    for ind in list(range(ncol)):
         assert new_types[ind] == old_types[ind], "Expected column types: {0}, actual column types: " \
                                                  "{1}".format(old_types[ind], new_types[ind])
-
-
+        if new_types[ind] == "object":
+            diff = new_pandas_frame.iloc[:, ind] == original_pandas_frame.iloc[:, ind]
+            if not diff.all(): # difference caused by the presence of NAs
+                newSeries = pd.Series(new_pandas_frame.iloc[:, ind])
+                newNA = newSeries.isna()
+                oldSeries = pd.Series(original_pandas_frame.iloc[:, ind])
+                oldNA = oldSeries.isna()
+                assert (newNA==oldNA).all()       
+        else:
+            diff = (new_pandas_frame.iloc[:, ind] - original_pandas_frame.iloc[:, ind]).abs()
+            assert diff.max() < 1e-10
+
 def test_polars_pandas():
-    if not(can_use_pandas()):
-        pyunit_utils.install("pandas")
-    import pandas
     if not(can_use_datatable()):
         pyunit_utils.install("datatable")
     import datatable
-    test_frame_conversion("smalldata/glm_test/multinomial_3Class_10KRow.csv", True)
     test_frame_conversion("smalldata/titanic/titanic_expanded.csv", False)
+    test_frame_conversion("smalldata/glm_test/multinomial_3Class_10KRow.csv", True)
     test_frame_conversion("smalldata/timeSeries/CreditCard-ts_train.csv", False)