refactor: Apply WHERE clauses to return value of upstream SQLStream.build_query

edgarrmondragon · edgarrmondragon · commit 5b32aa04c8d3 · 2025-05-21T13:51:47.000-06:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -29,7 +29,7 @@ dependencies = [
     "psycopg2-binary==2.9.10",
     "sqlalchemy==2.0.41",
     "sshtunnel==0.4.0",
-    "singer-sdk[faker] @ git+https://github.com/meltano/sdk.git",
+    "singer-sdk[faker] @ git+https://github.com/meltano/sdk.git@refs/pull/3050/head",
 ]
 
 [project.urls]
@@ -52,7 +52,7 @@ lint = [
 testing = [
     "hypothesis>=6.122.1",
     "pytest>=8",
-    "singer-sdk[testing] @ git+https://github.com/meltano/sdk.git",
+    "singer-sdk[testing] @ git+https://github.com/meltano/sdk.git@refs/pull/3050/head",
 ]
 typing = [
     "mypy>=1.8.0",
diff --git a/tap_postgres/client.py b/tap_postgres/client.py
@@ -203,42 +203,13 @@ def max_record_count(self) -> int | None:
         """Return the maximum number of records to fetch in a single query."""
         return self.config.get("max_record_count")
 
-    def build_query(self, context: Context | None) -> sa.sql.Select:
+    def build_query(self, table: sa.Table) -> sa.sql.Select:
         """Build a SQLAlchemy query for the stream."""
-        selected_column_names = self.get_selected_schema()["properties"].keys()
-        table = self.connector.get_table(
-            full_table_name=self.fully_qualified_name,
-            column_names=selected_column_names,
-        )
-        query = table.select()
-
-        if self.replication_key:
-            replication_key_col = table.columns[self.replication_key]
-            order_by = (
-                sa.nulls_first(replication_key_col.asc())
-                if self.supports_nulls_first
-                else replication_key_col.asc()
-            )
-            query = query.order_by(order_by)
-
-            start_val = self.get_starting_replication_key_value(context)
-            if start_val:
-                query = query.where(replication_key_col >= start_val)
-
+        query = super().build_query(table)
         stream_options = self.config.get("stream_options", {}).get(self.name, {})
         if clauses := stream_options.get("custom_where_clauses"):
             query = query.where(*(sa.text(clause.strip()) for clause in clauses))
 
-        if self.ABORT_AT_RECORD_COUNT is not None:
-            # Limit record count to one greater than the abort threshold. This ensures
-            # `MaxRecordsLimitException` exception is properly raised by caller
-            # `Stream._sync_records()` if more records are available than can be
-            # processed.
-            query = query.limit(self.ABORT_AT_RECORD_COUNT + 1)
-
-        if self.max_record_count():
-            query = query.limit(self.max_record_count())
-
         return query
 
     # Get records from stream
@@ -264,8 +235,18 @@ def get_records(self, context: Context | None) -> t.Iterable[dict[str, t.Any]]:
             msg = f"Stream '{self.name}' does not support partitioning."
             raise NotImplementedError(msg)
 
+        selected_column_names = self.get_selected_schema()["properties"].keys()
+        table = self.connector.get_table(
+            full_table_name=self.fully_qualified_name,
+            column_names=selected_column_names,
+        )
+
+        query = self.build_query(table)
+        query = self.apply_replication_filter(query, table, context=context)
+        query = self.apply_abort_query_limit(query)
+
         with self.connector._connect() as conn:
-            for record in conn.execute(self.build_query(context)).mappings():
+            for record in conn.execute(query).mappings():
                 # TODO: Standardize record mapping type
                 # https://github.com/meltano/sdk/issues/2096
                 transformed_record = self.post_process(dict(record))
diff --git a/tests/test_stream_class.py b/tests/test_stream_class.py
@@ -59,7 +59,8 @@ def test_build_query():
         table="test_table",
     )
     stream = PostgresStream(tap, catalog_entry.to_dict(), connector=DummyConnector())
+    table = sa.Table("test_table", sa.MetaData(), sa.Column("id", sa.Integer))
     assert (
-        str(stream.build_query(None).compile()).replace("\n", "")
+        str(stream.build_query(table).compile()).replace("\n", "")
         == "SELECT test_table.id FROM test_table WHERE id % 2 = 0 AND id % 3 = 0"
     )
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -59,7 +59,8 @@ def test_build_query():`
`59`	`59`	`table="test_table",`
`60`	`60`	`)`
`61`	`61`	`stream = PostgresStream(tap, catalog_entry.to_dict(), connector=DummyConnector())`
	`62`	`+ table = sa.Table("test_table", sa.MetaData(), sa.Column("id", sa.Integer))`
`62`	`63`	`assert (`
`63`		`- str(stream.build_query(None).compile()).replace("\n", "")`
	`64`	`+ str(stream.build_query(table).compile()).replace("\n", "")`
`64`	`65`	`== "SELECT test_table.id FROM test_table WHERE id % 2 = 0 AND id % 3 = 0"`
`65`	`66`	`)`