make_batch_reader loses dtype with list-of-strings columns, causing Tensorflow error when lists contain a None value #765

arhan-gunel · 2022-07-25T17:43:44Z

The dtype is lost using make_batch_reader when a column is a list of strings. If a batch contains no None values, then Tensorflow is still able to infer the string type of the array. But if the batch contains any None values, then Tensorflow produces the following error:

InternalError: Unsupported object type NoneType

Note that this is similar to, but different from, issue #744.

Example

from pathlib import Path
import numpy as np
import pandas as pd
import petastorm
from petastorm.unischema import UnischemaField
from petastorm import tf_utils
from petastorm.transform import TransformSpec

# Create parquet dataset
# row_group_size=2 splits the four rows into two row groups; the reader output
# below shows the row containing None arriving in the second batch.
data_path = Path('/path/to/data.parquet')
data_pd = pd.DataFrame({'list_of_str': [['A', 'B'], ['C', 'D'], ['E', None], ['G', 'H']]})
data_pd.to_parquet(data_path, row_group_size=2)

# Identity transform: the function returns its input unchanged and only
# declares list_of_str as a nullable np.str_ field of shape (2,).
noop_transform_spec = TransformSpec(lambda x: x, edit_fields=[UnischemaField('list_of_str', np.str_, (2, ), nullable=True)])

reader = petastorm.make_batch_reader(data_path.as_uri(),
                                     workers_count=1,
                                     shuffle_row_groups=False,
                                     num_epochs=2,
                                     transform_spec=noop_transform_spec)

# Reading straight from the reader succeeds, but both batches come back with
# dtype=object rather than a string dtype.
reader.next() # output: inferred_schema_view(list_of_str=array([['A', 'B'], ['C', 'D']], dtype=object))
reader.next() # output: inferred_schema_view(list_of_str=array([['E', None], ['G', 'H']], dtype=object))

# Read with tensorflow
dataset = tf_utils.make_petastorm_dataset(reader)
dataset_itr = dataset.as_numpy_iterator()

# The first batch contains no None and is converted to bytes; the second batch
# contains None and raises the error reported in this issue.
dataset_itr.next() # output: inferred_schema_view(list_of_str=array([[b'A', b'B'], [b'C', b'D']], dtype=object))
dataset_itr.next() # InternalError: Unsupported object type NoneType

Workaround

Modify every TransformSpec function so that, in each list-of-strings column, any missing (None) element is replaced with the string 'None'.

def is_string_list(column_type: pyarrow.DataType) -> bool:
    """Return True when ``column_type`` is a pyarrow list type whose elements are strings."""
    # Guard first: only list types are inspected for their element type.
    if not isinstance(column_type, pyarrow.ListType):
        return False
    return pyarrow.types.is_string(column_type.value_type)

# Names of the columns for which is_string_list() reports a list-of-strings type.
# NOTE(review): `table` is not defined in this snippet — presumably a pyarrow
# Table read from the same dataset; confirm before reusing.
fields_str_list = [f for f in table.schema.names if is_string_list(table.column(f).type)]

def transform_spec_with_workaround(rows_pd: pd.DataFrame) -> pd.DataFrame:
    """Apply the custom transformation, then replace None entries with the string 'None'.

    Every column named in ``fields_str_list`` is rewritten in place on
    ``rows_pd``: each cell is masked wherever it equals None and the masked
    positions are filled with the literal string ``'None'``.

    Returns the same ``rows_pd`` object that was passed in.
    """
    ...  # custom transformation

    def _fill_missing(values):
        # Mask the None entries, then substitute the placeholder string.
        masked = np.ma.masked_values(values, None)
        return masked.filled('None')

    for column_name in fields_str_list:
        rows_pd[column_name] = rows_pd[column_name].map(_fill_missing)

    return rows_pd

Full Trace

---------------------------------------------------------------------------
InternalError                             Traceback (most recent call last)
<ipython-input-50-55cc7ab782db> in <module>
----> 1 dataset_itr.next()

~/conda/lmigpuv0_17_1/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py in next(self)
   4693 
   4694   def next(self):
-> 4695     return self.__next__()
   4696 
   4697 

~/conda/lmigpuv0_17_1/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py in __next__(self)
   4690       return numpy
   4691 
-> 4692     return nest.map_structure(to_numpy, next(self._iterator))
   4693 
   4694   def next(self):

~/conda/lmigpuv0_17_1/lib/python3.7/site-packages/tensorflow/python/data/ops/iterator_ops.py in __next__(self)
    759   def __next__(self):
    760     try:
--> 761       return self._next_internal()
    762     except errors.OutOfRangeError:
    763       raise StopIteration

~/conda/lmigpuv0_17_1/lib/python3.7/site-packages/tensorflow/python/data/ops/iterator_ops.py in _next_internal(self)
    745           self._iterator_resource,
    746           output_types=self._flat_output_types,
--> 747           output_shapes=self._flat_output_shapes)
    748 
    749       try:

~/conda/lmigpuv0_17_1/lib/python3.7/site-packages/tensorflow/python/ops/gen_dataset_ops.py in iterator_get_next(iterator, output_types, output_shapes, name)
   2726       return _result
   2727     except _core._NotOkStatusException as e:
-> 2728       _ops.raise_from_not_ok_status(e, name)
   2729     except _core._FallbackException:
   2730       pass

~/conda/lmigpuv0_17_1/lib/python3.7/site-packages/tensorflow/python/framework/ops.py in raise_from_not_ok_status(e, name)
   6939   message = e.message + (" name: " + name if name is not None else "")
   6940   # pylint: disable=protected-access
-> 6941   six.raise_from(core._status_to_exception(e.code, message), None)
   6942   # pylint: enable=protected-access
   6943 

~/conda/lmigpuv0_17_1/lib/python3.7/site-packages/six.py in raise_from(value, from_value)

InternalError: Unsupported object type NoneType
	 [[{{node PyFunc}}]] [Op:IteratorGetNext]

The text was updated successfully, but these errors were encountered:

arhan-gunel changed the title ~~make_batch_reader loses dtype with list of strings columns, causing Tensorflow error when lists contain a None value~~ make_batch_reader loses dtype with list-of-strings columns, causing Tensorflow error when lists contain a None value Jul 25, 2022

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

make_batch_reader loses dtype with list-of-strings columns, causing Tensorflow error when lists contain a None value #765

make_batch_reader loses dtype with list-of-strings columns, causing Tensorflow error when lists contain a None value #765

arhan-gunel commented Jul 25, 2022 •

edited

make_batch_reader loses dtype with list-of-strings columns, causing Tensorflow error when lists contain a None value #765

make_batch_reader loses dtype with list-of-strings columns, causing Tensorflow error when lists contain a None value #765

Comments

arhan-gunel commented Jul 25, 2022 • edited

Example

Workaround

Full Trace

arhan-gunel commented Jul 25, 2022 •

edited