WIP: Add windowing and aggregation #18

Draft · wants to merge 10 commits into main

Changes from 1 commit

10 changes: 9 additions & 1 deletion py/sentry_streams/adapters/stream_adapter.py
@@ -1,7 +1,7 @@
from abc import ABC, abstractmethod
from typing import Any, Optional, assert_never

from sentry_streams.pipeline import Map, Sink, Source, Step, StepType
from sentry_streams.pipeline import Map, Reduce, Sink, Source, Step, StepType


class StreamAdapter(ABC):
@@ -23,6 +23,10 @@ def sink(self, step: Sink, stream: Any) -> Any:
def map(self, step: Map, stream: Any) -> Any:
raise NotImplementedError

@abstractmethod
def reduce(self, step: Reduce, stream: Any) -> Any:
raise NotImplementedError


class RuntimeTranslator:
"""
@@ -51,5 +55,9 @@ def translate_step(self, step: Step, stream: Optional[Any] = None) -> Any:
assert isinstance(step, Map)
return self.adapter.map(step, stream)

elif step_type is StepType.REDUCE:
assert isinstance(step, Reduce)
return self.adapter.reduce(step, stream)

else:
assert_never(step_type)
17 changes: 14 additions & 3 deletions py/sentry_streams/example_config.py
@@ -1,4 +1,7 @@
from sentry_streams.pipeline import KafkaSink, KafkaSource, Map, Pipeline
from sentry_streams.pipeline import KafkaSink, KafkaSource, Map, Pipeline, Reduce
from sentry_streams.user_functions.sample_agg import WordCounter
from sentry_streams.user_functions.sample_group_by import my_group_by
from sentry_streams.user_functions.sample_map import EventsPipelineMapFunction

# pipeline: special name
pipeline = Pipeline()
@@ -13,12 +16,20 @@
name="mymap",
ctx=pipeline,
inputs=[source],
function="sentry_streams.sample_function.EventsPipelineMapFunction.simple_map",
function=EventsPipelineMapFunction.simple_map,
)

reduce = Reduce(
name="myreduce",
ctx=pipeline,
inputs=[map],
group_by_key=my_group_by,
aggregate_fn=WordCounter(),
)

sink = KafkaSink(
name="kafkasink",
ctx=pipeline,
inputs=[map],
inputs=[reduce],
logical_topic="transformed-events",
)
50 changes: 37 additions & 13 deletions py/sentry_streams/flink/flink_adapter.py
@@ -1,7 +1,8 @@
from typing import Any, MutableMapping
from typing import Any, Callable, MutableMapping

from pyflink.common import Types
from pyflink.common import Time, Types, WatermarkStrategy
from pyflink.common.serialization import SimpleStringSchema
from pyflink.common.time import Duration
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import ( # type: ignore[attr-defined]
FlinkKafkaConsumer,
@@ -10,9 +11,11 @@
KafkaRecordSerializationSchema,
KafkaSink,
)
from pyflink.datastream.window import TumblingEventTimeWindows
from sentry_streams.adapters.stream_adapter import StreamAdapter
from sentry_streams.modules import get_module
from sentry_streams.flink.flink_agg_fn import FlinkAggregate
from sentry_streams.pipeline import Step
from sentry_streams.user_functions.agg_template import Accumulator


class FlinkAdapter(StreamAdapter):
@@ -67,17 +70,38 @@ def sink(self, step: Step, stream: Any) -> Any:
def map(self, step: Step, stream: Any) -> Any:

assert hasattr(step, "function")
fn_path = step.function
mod, cls, fn = fn_path.rsplit(".", 2)
imported_fn = step.function

try:
module = get_module(mod)
# TODO: Ensure output type is configurable like the schema above
return stream.map(
func=lambda msg: imported_fn(msg),
output_type=Types.TUPLE([Types.STRING(), Types.INT()]),
)

except ImportError:
raise
# receives a DataStream, returns a DataStream
# optional: group by, windowing
# required: aggregation
def reduce(self, step: Step, stream: Any) -> Any:

imported_cls = getattr(module, cls)
imported_fn = getattr(imported_cls, fn)
# group by and agg are required
# windowing is optional and inserted between those 2

# TODO: Ensure output type is configurable like the schema above
return stream.map(func=lambda msg: imported_fn(msg), output_type=Types.STRING())
assert hasattr(step, "group_by_key")
key: Callable[[tuple[str, int]], str] = step.group_by_key

assert hasattr(step, "aggregate_fn")
agg: Accumulator = step.aggregate_fn

watermark_strategy = WatermarkStrategy.for_monotonous_timestamps().with_idleness(
Duration.of_seconds(5)
)
time_stream = stream.assign_timestamps_and_watermarks(watermark_strategy)

keyed_stream = time_stream.key_by(key)
windowed_stream = keyed_stream.window(TumblingEventTimeWindows.of(Time.seconds(1)))

return windowed_stream.aggregate(
FlinkAggregate(agg),
accumulator_type=Types.TUPLE([Types.STRING(), Types.INT()]),
output_type=Types.STRING(),
)
26 changes: 26 additions & 0 deletions py/sentry_streams/flink/flink_agg_fn.py
@@ -0,0 +1,26 @@
from typing import Any

from pyflink.datastream.functions import AggregateFunction
from sentry_streams.user_functions.agg_template import Accumulator


class FlinkAggregate(AggregateFunction):

def __init__(self, acc: Accumulator) -> None:
self.acc = acc

def create_accumulator(self) -> Any:
print("CREATED")
return self.acc.create()

def add(self, value: Any, accumulator: Any) -> Any:
print("ADDED")
return self.acc.add(accumulator, value)

def get_result(self, accumulator: Any) -> Any:
print("RESULT")
return self.acc.get_output(accumulator)

def merge(self, acc_a: Any, acc_b: Any) -> Any:
print("MERGE")
return self.acc.merge(acc_a, acc_b)
17 changes: 0 additions & 17 deletions py/sentry_streams/modules.py

This file was deleted.

32 changes: 30 additions & 2 deletions py/sentry_streams/pipeline.py
@@ -3,13 +3,25 @@
from collections import defaultdict
from dataclasses import dataclass
from enum import Enum
from typing import MutableMapping
from typing import Any, Callable, MutableMapping

from sentry_streams.user_functions.agg_template import Accumulator


class StepType(Enum):
SINK = "sink"
SOURCE = "source"
MAP = "map"
REDUCE = "reduce"


class StateBackend(Enum):
HASH_MAP = "hash_map"


class Window(Enum):
SLIDING = "sliding"
TUMBLING = "tumbling"


class Pipeline:
@@ -114,5 +126,21 @@ class Map(WithInput):
# instead of a raw string
# TODO: Allow product to both enable and access
# configuration (e.g. a DB that is used as part of Map)
function: str
function: Callable[..., Any]
step_type: StepType = StepType.MAP


@dataclass
class Reduce(WithInput):
# group_by_key: refactor to Callable reference
group_by_key: Callable[..., Any]
Collaborator commented:

I think the group by key may not be mandatory. Let's say you want to simulate something like

SELECT count(*) from table

Basically accumulate data in a counter or any other data sketch without grouping in any way.
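
A minimal sketch of what that ungrouped case could look like with the Accumulator template added in this PR; the EventCounter name is a hypothetical illustration, not part of this change:

from typing import Any

from sentry_streams.user_functions.agg_template import Accumulator


class EventCounter(Accumulator):
    # Counts every message it sees, regardless of key (the SELECT count(*) case).

    def create(self) -> int:
        return 0

    def add(self, acc: int, value: Any) -> int:
        # The payload itself is ignored; every incoming message bumps the count.
        return acc + 1

    def get_output(self, acc: int) -> str:
        return f"total {acc}"

    def merge(self, acc1: int, acc2: int) -> int:
        return acc1 + acc2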

# windowing mechanism, is this going to be mandatory?
Collaborator @fpacifici commented (Feb 20, 2025):

Not necessarily.
Windowing is needed to produce intermediate results on an unbounded stream. So every time this step emits results we need windowing. But there are stateful primitives that do not emit results. Think about the counter in postgres. We just count and periodically store in postgres.
Think about how you would model this: https://www.notion.so/sentry/Data-workers-1808b10e4b5d806983f3f0e22a2c2758?pvs=4#1858b10e4b5d80c98207e29786a1317a

Member Author replied:

as discussed offline, we'll support that use case (for now) using windowed aggregation + sink.

So windowing will be required in this iteration of the API
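
As a rough sketch, that workaround could be expressed with the pipeline API from this PR by keying the whole stream to a constant; the step names, the lambda key, and the EventCounter accumulator are assumptions for illustration:

from sentry_streams.pipeline import KafkaSink, Reduce

# `pipeline` and `source` as defined in example_config.py
count_all = Reduce(
    name="countall",
    ctx=pipeline,
    inputs=[source],
    group_by_key=lambda msg: "all",  # constant key: the whole stream becomes one group
    aggregate_fn=EventCounter(),     # hypothetical counting accumulator sketched above
)

sink = KafkaSink(
    name="kafkasink",
    ctx=pipeline,
    inputs=[count_all],
    logical_topic="counts",
)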

# windowing: Window
Collaborator commented:

I think we are missing a watermark concept. Basically the windowing algorithm decides the window semantics, but the user has to specify how/when to close a window.

Member Author replied:

Added by allowing users to configure windows and triggers

Member Author @ayirr7 commented (Feb 21, 2025):

Another point. Windows that close by event time are inherently tied to the watermarking strategy.

This strategy can be set directly on the source especially if we're using something like Kafka. We can also allow users to configure their own as part of a step like Reduce, though I believe that's not recommended because that would mean a new watermarked stream is being produced as part of the step (at least for Flink).

WDYT about how we should configure watermarking strategy? I feel that it makes sense to set it up directly on the source.
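
A minimal sketch of setting the strategy at the source, assuming the adapter keeps its StreamExecutionEnvironment on self.env and builds the consumer the way the rest of this adapter does (Step and Any imports as already present in flink_adapter.py); the topic name and Kafka properties are placeholders:

from pyflink.common import WatermarkStrategy
from pyflink.common.serialization import SimpleStringSchema
from pyflink.common.time import Duration
from pyflink.datastream.connectors import FlinkKafkaConsumer  # type: ignore[attr-defined]


def source(self, step: Step, stream: Any) -> Any:
    consumer = FlinkKafkaConsumer(
        topics="logical-events",  # placeholder topic
        deserialization_schema=SimpleStringSchema(),
        properties={"bootstrap.servers": "localhost:9092", "group.id": "pipeline"},
    )
    raw_stream = self.env.add_source(consumer)

    # Attach the watermark strategy once, at the source, so every downstream step
    # (including Reduce) shares the same event-time semantics.
    watermark_strategy = WatermarkStrategy.for_monotonous_timestamps().with_idleness(
        Duration.of_seconds(5)
    )
    return raw_stream.assign_timestamps_and_watermarks(watermark_strategy)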

# aggregation (use standard accumulator)
aggregate_fn: Accumulator
step_type: StepType = StepType.REDUCE
# storage: a fixed (enum?) set of storage backends we provide
# consider making this a class
storage: StateBackend = StateBackend.HASH_MAP
Collaborator commented:

Isn't the storage hidden by the accumulator at times?
Think about these cases:

  • Aggregation with windowing and producing the output. Here we can separate the aggregation function from the store, as we can keep the aggregation state wherever we want.
  • Global state: accumulating counts in postgres. Here the storage is application specific, so there is no point in separating the two.

Member Author replied:

I think this is the same discussion we had related to #18 (comment)

I can leave a TODO to come back to this in a next PR since this one is getting large.


# keyed stream --> windowed stream --> reduce to datastream
1 change: 1 addition & 0 deletions py/sentry_streams/runner.py
@@ -50,6 +50,7 @@ def iterate_edges(p_graph: Pipeline, translator: RuntimeTranslator) -> None:
next_step: WithInput = cast(WithInput, p_graph.steps[output_step_name])
print(f"Apply step: {next_step.name}")
next_step_stream = translator.translate_step(next_step, input_stream)
print(f"stream type {type(next_step_stream)}")
step_streams[next_step.name] = next_step_stream


15 changes: 0 additions & 15 deletions py/sentry_streams/sample_function.py

This file was deleted.

28 changes: 28 additions & 0 deletions py/sentry_streams/user_functions/agg_template.py
@@ -0,0 +1,28 @@
from abc import ABC, abstractmethod
from typing import Any


class Accumulator(ABC):
Collaborator commented:

Do you have a sense of which accumulators we will provide out of the box?

Member Author replied:

Not yet. I think that might be a good idea for next PR. Leaving a TODO


@abstractmethod
def create(self) -> Any:
raise NotImplementedError

@abstractmethod
def add(self, acc: Any, value: Any) -> Any:
raise NotImplementedError

@abstractmethod
def get_output(self, acc: Any) -> Any:
raise NotImplementedError

@abstractmethod
def merge(self, acc1: Any, acc2: Any) -> Any:
raise NotImplementedError
Collaborator commented:

Hint. Try to make this generic rather than have Any as type everywhere.

Member Author replied:

will do, right now just getting things working with Flink
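
A minimal sketch of the generic version hinted at here, parameterizing the input, accumulator state, and output types; the TypeVar names are illustrative and not part of this PR:

from abc import ABC, abstractmethod
from typing import Generic, TypeVar

TIn = TypeVar("TIn")    # incoming message type
TAcc = TypeVar("TAcc")  # accumulator state type
TOut = TypeVar("TOut")  # emitted result type


class Accumulator(ABC, Generic[TIn, TAcc, TOut]):

    @abstractmethod
    def create(self) -> TAcc:
        raise NotImplementedError

    @abstractmethod
    def add(self, acc: TAcc, value: TIn) -> TAcc:
        raise NotImplementedError

    @abstractmethod
    def get_output(self, acc: TAcc) -> TOut:
        raise NotImplementedError

    @abstractmethod
    def merge(self, acc1: TAcc, acc2: TAcc) -> TAcc:
        raise NotImplementedError


# WordCounter would then declare its types explicitly:
# class WordCounter(Accumulator[tuple[str, int], tuple[str, int], str]): ...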



# class GroupBy(ABC):

# @abstractmethod
# def get_key(payload):
# pass
16 changes: 16 additions & 0 deletions py/sentry_streams/user_functions/sample_agg.py
@@ -0,0 +1,16 @@
from sentry_streams.user_functions.agg_template import Accumulator


class WordCounter(Accumulator):

def create(self) -> tuple[str, int]:
return "", 0

def add(self, acc: tuple[str, int], value: tuple[str, int]) -> tuple[str, int]:
return value[0], acc[1] + value[1]

def get_output(self, acc: tuple[str, int]) -> str:
return f"{acc[0]} {acc[1]}"

def merge(self, acc1: tuple[str, int], acc2: tuple[str, int]) -> tuple[str, int]:
return acc1[0], acc1[1] + acc2[1]
9 changes: 9 additions & 0 deletions py/sentry_streams/user_functions/sample_group_by.py
@@ -0,0 +1,9 @@
def my_group_by(msg_payload: tuple[str, int]) -> str:
return msg_payload[0]


def dumb_group_by(msg: str) -> str:
return msg


# lambda x: x[0] simplest
28 changes: 28 additions & 0 deletions py/sentry_streams/user_functions/sample_map.py
@@ -0,0 +1,28 @@
import json


class EventsPipelineMapFunction:
"""
Sample user-defined functions to
plug into pipeline
"""

@staticmethod
def dumb_map(value: str) -> str:
d = json.loads(value)
word: str = d.get("word", "null_word")

return "hello." + word

@staticmethod
def simple_map(value: str) -> tuple[str, int]:
d = json.loads(value)
word: str = d.get("word", "null_word")

return (word, 1)

@staticmethod
def str_convert(value: tuple[str, int]) -> str:
word, count = value

return f"{word} {count}"