Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add from_indexed_dicts utility to create event-sets from pre-indexed data (w8) #412

Merged
merged 5 commits into from
Apr 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/public_api_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
"to_tensorflow_dataset",
"from_tensorflow_record",
"to_tensorflow_record",
"from_indexed_dicts",
# DTYPES
"float64",
"float32",
Expand Down
Empty file.
2 changes: 1 addition & 1 deletion temporian/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@

# EventSets
from temporian.implementation.numpy.data.event_set import EventSet, IndexData
from temporian.implementation.numpy.data.io import event_set
from temporian.implementation.numpy.data.io import event_set, from_indexed_dicts

# Serialization
from temporian.core.serialization import save
Expand Down
120 changes: 118 additions & 2 deletions temporian/implementation/numpy/data/io.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, List, Optional, Union, Dict
from typing import Any, List, Optional, Union, Dict, Tuple

import logging
import numpy as np
Expand All @@ -12,7 +12,13 @@
from temporian.implementation.numpy.data.event_set import EventSet, IndexData
from temporian.core.evaluation import run
from temporian.core.operators.add_index import add_index
from temporian.core.data.schema import Schema
from temporian.core.data.schema import Schema, IndexSchema, FeatureSchema
from temporian.core.data.dtype import DType
from temporian.implementation.numpy.data.dtype_normalization import (
numpy_dtype_to_tp_dtype,
normalize_index_item,
normalize_features,
)

# Type alias for an array of values as fed by the user: a Python list, a NumPy
# array, or a pandas Series (the pandas type is given as a string forward
# reference).
DataArray = Union[List[Any], np.ndarray, "pandas.Series"]
Expand Down Expand Up @@ -199,3 +205,113 @@ def event_set(
evset.node()._sampling = same_sampling_as.node().sampling_node

return evset


@typecheck
def from_indexed_dicts(
    data: List[Tuple[Dict[str, Any], Dict[str, DataArray]]],
    timestamps: str = "timestamp",
    is_unix_timestamp: bool = False,
) -> EventSet:
    """Creates an [`EventSet`][temporian.EventSet] from indexed data.

    Unlike `event_set`, `from_indexed_dicts` expects the data to be already
    split by index value. Supported values for timestamps, indexes, and
    features are similar to `event_set`.

    The schema (index names and dtypes, feature names and dtypes) is inferred
    from the first item of `data` only. Every other item is expected to
    contain the same index keys and the same feature keys.

    Usage examples:

        ```python
        >>> evset = tp.from_indexed_dicts(
        ...     [
        ...         (
        ...             {"i1": 1, "i2": "A"},
        ...             {"timestamp": [1, 2], "f1": [10, 11], "f2": ["X", "Y"]},
        ...         ),
        ...         (
        ...             {"i1": 1, "i2": "B"},
        ...             {"timestamp": [3, 4], "f1": [12, 13], "f2": ["X", "X"]},
        ...         ),
        ...     ])

        ```

    Args:
        data: Indexed data, as a list of `(index_value, values)` tuples.
            `index_value` maps each index name to a single value. `values`
            maps each feature name to its values, and also contains the
            timestamps under the key given by `timestamps`.
        timestamps: Name of the entry of `values` to be used as timestamps
            for the EventSet. This entry is not turned into a feature.
        is_unix_timestamp: Whether the timestamps correspond to unix time. Unix
            times are required for calendar operators. Defaults to `False`.
            The value is stored in the schema as given.

    Returns:
        An EventSet.

    Raises:
        ValueError: If `data` is not a list, is empty, or its first item is
            not a tuple; if `timestamps` is missing from the first item's
            values; if a feature of the first item is neither a numpy array
            nor a list; or if a list feature of the first item is empty (its
            dtype cannot be inferred).
    """

    if not isinstance(data, list):
        raise ValueError("data is expected to be a list of two-items tuples")

    if len(data) == 0:
        raise ValueError("Cannot create eventset without any values")

    if not isinstance(data[0], tuple):
        raise ValueError("data is expected to be a list of two-items tuples")

    # The first item is the reference from which the schema is inferred.
    first_index_value, first_feature_values = data[0][0], data[0][1]

    index_schema = [
        IndexSchema(name=k, dtype=DType.from_python_value(v))
        for k, v in first_index_value.items()
    ]

    if timestamps not in first_feature_values:
        raise ValueError(f"No value with name timestamps={timestamps!r}")

    # Build schema
    features_schema = []
    for k, v in first_feature_values.items():
        if k == timestamps:
            # Timestamps are stored separately from the features.
            continue
        if isinstance(v, np.ndarray):
            tp_dtype = numpy_dtype_to_tp_dtype(k, v.dtype.type)
        else:
            if not isinstance(v, list):
                raise ValueError(
                    "Feature values are expected to be numpy arrays or lists."
                    f" Instead feature {k} has type {type(v)}"
                )
            if len(v) == 0:
                # The dtype of a list is inferred from its first element.
                raise ValueError(f"Feature {k} has zero observations.")
            tp_dtype = DType.from_python_value(v[0])
        features_schema.append(FeatureSchema(name=k, dtype=tp_dtype))

    schema = Schema(
        features=features_schema,
        indexes=index_schema,
        is_unix_timestamp=is_unix_timestamp,
    )

    # Build content
    evtset_data = {}
    for src_index_value, src_feature_value in data:
        # The second value returned by `normalize_timestamps` is ignored.
        dst_timestamps, _ = normalize_timestamps(src_feature_value[timestamps])
        # Index values and features follow the order of the schema, not the
        # order of the keys in each item.
        dst_index_value = tuple(
            normalize_index_item(src_index_value[k.name]) for k in index_schema
        )
        dst_feature_value = [
            normalize_features(src_feature_value[k.name], k)
            for k in features_schema
        ]
        evtset_data[dst_index_value] = IndexData(
            features=dst_feature_value,
            timestamps=dst_timestamps,
            schema=schema,
        )

    return EventSet(
        schema=schema,
        data=evtset_data,
    )
8 changes: 4 additions & 4 deletions temporian/implementation/numpy/data/plotter.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,14 +181,14 @@ def _list_index_values(
"""Lists all the index values to plot."""

flat_indexes = set(normalize_index_key_list(indexes, None))
index_values = []
index_values = set()
for evtset in _unroll_evsets(evsets):
for index_value in evtset.data:
if indexes is None or index_value in flat_indexes:
index_values.append(index_value)
index_values.add(index_value)
if len(index_values) >= max_values:
return index_values
return index_values
return list(index_values)
return list(index_values)


def plot(
Expand Down
35 changes: 34 additions & 1 deletion temporian/implementation/numpy/data/test/io_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from numpy.testing import assert_array_equal
from datetime import datetime

from temporian.implementation.numpy.data.io import event_set
from temporian.implementation.numpy.data.io import event_set, from_indexed_dicts
from temporian.implementation.numpy.data.event_set import IndexData, EventSet
from temporian.core.data.schema import Schema
from temporian.core.data.dtype import DType
Expand Down Expand Up @@ -199,6 +199,39 @@ def test_feature_wrong_type(self):
},
)

def test_from_indexed_dicts(self):
    """Checks that `from_indexed_dicts` matches `event_set` on equal data."""
    # One (index value, values) pair per index group.
    indexed_data = [
        (
            {"i1": 1, "i2": "A"},
            {"timestamp": [1, 2], "f1": [10, 11], "f2": ["X", "Y"]},
        ),
        (
            {"i1": 1, "i2": "B"},
            {"timestamp": [3, 4], "f1": [12, 13], "f2": ["X", "X"]},
        ),
        (
            {"i1": 2, "i2": "A"},
            {"timestamp": [5, 6], "f1": [14, 15], "f2": ["Y", "Y"]},
        ),
        (
            {"i1": 2, "i2": "B"},
            {"timestamp": [7, 8], "f1": [16, 17], "f2": ["Y", "Z"]},
        ),
    ]
    actual = from_indexed_dicts(indexed_data)

    # The same events, flattened and indexed through `event_set`.
    flat_features = {
        "f1": [10, 11, 12, 13, 14, 15, 16, 17],
        "f2": ["X", "Y", "X", "X", "Y", "Y", "Y", "Z"],
        "i1": [1, 1, 1, 1, 2, 2, 2, 2],
        "i2": ["A", "A", "B", "B", "A", "A", "B", "B"],
    }
    reference = event_set(
        timestamps=[1, 2, 3, 4, 5, 6, 7, 8],
        features=flat_features,
        indexes=["i1", "i2"],
    )

    self.assertEqual(actual, reference)


if __name__ == "__main__":
absltest.main()
Loading