From c6721b8e08c178088eee16001ae9516fb1fdcf16 Mon Sep 17 00:00:00 2001 From: johncmerfeld Date: Thu, 12 Sep 2024 16:57:20 -0500 Subject: [PATCH 01/14] minimal changes required --- CHANGELOG.md | 4 ++++ earthmover/__init__.py | 3 ++- requirements.txt | 6 ++++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b34a583..d7dea01e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +### Unreleased changes + +* feature: add support for Python 3.12, with corresponding updates to core dataframe dependencies + ### v0.3.6
Released 2024-08-07 diff --git a/earthmover/__init__.py b/earthmover/__init__.py index 8b137891..af904155 100644 --- a/earthmover/__init__.py +++ b/earthmover/__init__.py @@ -1 +1,2 @@ - +import dask +dask.config.set({'dataframe.query-planning': False}) diff --git a/requirements.txt b/requirements.txt index 343eae91..603ed4c8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,10 @@ wheel aiohttp>=3.8.1 -dask[dataframe]~=2023.5.0 +dask[dataframe]~=2024.8.0; python_version>="3.10" +dask[dataframe]~=2023.5.0; python_version<"3.10" Jinja2>=2.11.3 networkx>=2.6.3 -pandas>=1.3.5,<=2.2.1 +pandas~=2.2.1; python_version>="3.10" +pandas>=1.3.5,<=2.2.1; python_version<"3.10" requests>=2.23.0 setuptools>=44.0.0 From fb77791ea0d80778edf47ce06a654b78e8ca138c Mon Sep 17 00:00:00 2001 From: johncmerfeld Date: Thu, 12 Sep 2024 17:18:58 -0500 Subject: [PATCH 02/14] bump pandas a bit --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 603ed4c8..b7ee2fa7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ dask[dataframe]~=2024.8.0; python_version>="3.10" dask[dataframe]~=2023.5.0; python_version<"3.10" Jinja2>=2.11.3 networkx>=2.6.3 -pandas~=2.2.1; python_version>="3.10" +pandas~=2.2.2; python_version>="3.10" pandas>=1.3.5,<=2.2.1; python_version<"3.10" requests>=2.23.0 setuptools>=44.0.0 From 32725704da9d2dccc6d47cb61c90b6595a71da32 Mon Sep 17 00:00:00 2001 From: johncmerfeld Date: Thu, 12 Sep 2024 17:21:22 -0500 Subject: [PATCH 03/14] add a couple explicit support lines to setup.py --- setup.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/setup.py b/setup.py index 27bd9a7f..098fbb5a 100644 --- a/setup.py +++ b/setup.py @@ -52,6 +52,11 @@ "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming 
Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Office/Business", "Topic :: Scientific/Engineering", "Topic :: Utilities" From 58e9dc0d16ba9d2356992d281cb6370635e7684b Mon Sep 17 00:00:00 2001 From: johncmerfeld Date: Fri, 13 Sep 2024 09:17:10 -0500 Subject: [PATCH 04/14] add comment explaining why the optimizer needs to be turned off --- earthmover/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/earthmover/__init__.py b/earthmover/__init__.py index af904155..799f5745 100644 --- a/earthmover/__init__.py +++ b/earthmover/__init__.py @@ -1,2 +1,7 @@ +# September 2024 - for now we need to do this in order to turn off the Dask +# query optimizer - see https://blog.dask.org/2023/08/25/dask-expr-introduction +# For reasons unknown, it doesn't yet work with Earthmover. A future Dask +# version may force us to use the query optimizer, but hopefully by then, +# the bugs that emerge when we use it with Earthmover will have been fixed. 
import dask dask.config.set({'dataframe.query-planning': False}) From 66d63e8cdfaec01a7a53623e43a6df72c4836173 Mon Sep 17 00:00:00 2001 From: johncmerfeld Date: Tue, 24 Sep 2024 11:24:14 -0500 Subject: [PATCH 05/14] make regex pattern a raw string --- earthmover/operations/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earthmover/operations/groupby.py b/earthmover/operations/groupby.py index 8cb211f1..b559e194 100644 --- a/earthmover/operations/groupby.py +++ b/earthmover/operations/groupby.py @@ -93,7 +93,7 @@ def execute(self, data: 'DataFrame', **kwargs) -> 'DataFrame': for new_col_name, func in self.create_columns_dict.items(): _pieces = re.findall( - "([A-Za-z0-9_]*)\(([A-Za-z0-9_]*)?,?(.*)?\)", + r"([A-Za-z0-9_]*)\(([A-Za-z0-9_]*)?,?(.*)?\)", func )[0] From 9f73a464ecdc567a2a355068fae13ac0b36adc35 Mon Sep 17 00:00:00 2001 From: johncmerfeld Date: Wed, 25 Sep 2024 12:16:19 -0500 Subject: [PATCH 06/14] apply performance-enhancing settings --- earthmover/__init__.py | 6 ++++++ requirements.txt | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/earthmover/__init__.py b/earthmover/__init__.py index 799f5745..cdc1624d 100644 --- a/earthmover/__init__.py +++ b/earthmover/__init__.py @@ -5,3 +5,9 @@ # the bugs that emerge when we use it with Earthmover will have been fixed. 
import dask dask.config.set({'dataframe.query-planning': False}) + +# performance enhancements +dask.config.set({"dataframe.convert-string": True}) +import pandas as pd +pd.options.mode.copy_on_write = True +pd.options.mode.string_storage = "pyarrow" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index b7ee2fa7..e6e8381d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ dask[dataframe]~=2024.8.0; python_version>="3.10" dask[dataframe]~=2023.5.0; python_version<"3.10" Jinja2>=2.11.3 networkx>=2.6.3 -pandas~=2.2.2; python_version>="3.10" +pandas[performance]~=2.2.2; python_version>="3.10" pandas>=1.3.5,<=2.2.1; python_version<"3.10" requests>=2.23.0 setuptools>=44.0.0 From 067ebd9aea2a9437b3fa578d7412c0797215523c Mon Sep 17 00:00:00 2001 From: gnguyen87 Date: Fri, 4 Oct 2024 11:50:21 -0500 Subject: [PATCH 07/14] add an inLineSource class --- earthmover/nodes/source.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py index 8ec3c4a6..5b3075e0 100644 --- a/earthmover/nodes/source.py +++ b/earthmover/nodes/source.py @@ -447,3 +447,16 @@ def _verify_packages(self, connection: str): "connecting to a database requires additional libraries... 
please install using `pip install earthmover[sql]`" ) raise + +class inLineSource(Source): + mode: str = 'inLineSource' + is_remote: bool = False + allowed_configs: Tuple[str] = ('file', 'orientation') + + def __init(self): + super().__init() + self.data = self.error_handler.assert_get_key(self.config, 'data', dtype=list) + + def read_inLineSource(self): + df = pd.DataFrame(self.data) + return df From 201d211e38d24843707557e9f060fe5dc9addee1 Mon Sep 17 00:00:00 2001 From: gnguyen87 Date: Sat, 5 Oct 2024 20:13:25 -0500 Subject: [PATCH 08/14] modify inLineSource class --- earthmover/nodes/source.py | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py index 5b3075e0..64894de6 100644 --- a/earthmover/nodes/source.py +++ b/earthmover/nodes/source.py @@ -5,6 +5,7 @@ import os import pandas as pd import re +import yaml from earthmover.nodes.node import Node from earthmover import util @@ -44,10 +45,13 @@ def __new__(cls, name: str, config: 'YamlMapping', *, earthmover: 'Earthmover'): elif 'file' in config: return object.__new__(FileSource) + + elif 'data' in config: + return object.__new__(inLineSource) else: earthmover.error_handler.throw( - "sources must specify either a `file` and/or `connection` string and `query`" + "sources must specify either a `file` and/or `connection` string and `query` or `data`" ) raise @@ -451,12 +455,31 @@ def _verify_packages(self, connection: str): class inLineSource(Source): mode: str = 'inLineSource' is_remote: bool = False - allowed_configs: Tuple[str] = ('file', 'orientation') + allowed_configs: Tuple[str] = ('debug', 'expect', 'show_progress', 'repartition', 'chunksize', 'optional', 'optional_fields', + 'data', 'orientation') - def __init(self): - super().__init() - self.data = self.error_handler.assert_get_key(self.config, 'data', dtype=list) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.df = 
self.error_handler.assert_get_key(self.config, 'data') + print(self.df) + self.orientation = self.error_handler.assert_get_key(self.config, 'orientation') + + + def execute(self): + super().execute() + + try: + self.data = self.read_inLineSource() + self.logger.debug(f"source `{self.name}` loaded )" + ) + + except Exception as err: + self.error_handler.throw( + f"source {self.name} error ({err}); check `data`" + ) + raise def read_inLineSource(self): - df = pd.DataFrame(self.data) + df = pd.DataFrame(self.df) + print(df) return df From 573edf95b4d83a9659a3e2df009ca75ad0ed3e3c Mon Sep 17 00:00:00 2001 From: gnguyen87 Date: Sat, 5 Oct 2024 20:13:48 -0500 Subject: [PATCH 09/14] import inLineSource instance to handle file hashing logic --- earthmover/runs_file.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/earthmover/runs_file.py b/earthmover/runs_file.py index 9da67bba..a06f73f8 100644 --- a/earthmover/runs_file.py +++ b/earthmover/runs_file.py @@ -6,6 +6,8 @@ from typing import Dict, List, Optional from typing import TYPE_CHECKING + +from earthmover.nodes.source import inLineSource if TYPE_CHECKING: from earthmover.earthmover import Earthmover from earthmover.nodes.node import Node @@ -157,9 +159,10 @@ def _build_hashes(self) -> Dict[str, str]: if f"$sources.{source.name}" not in node_data.keys(): continue - - if not source.is_remote and source.file and not os.path.isdir(source.file): - sources_hash += self._get_file_hash(source.file) + + if not isinstance(source, inLineSource): + if not source.is_remote and source.file and not os.path.isdir(source.file): + sources_hash += self._get_file_hash(source.file) if sources_hash: sources_hash = self._get_string_hash(sources_hash) From da7f22d25633294ee5746a86b6eb7e8066fb04f3 Mon Sep 17 00:00:00 2001 From: gnguyen87 Date: Wed, 9 Oct 2024 10:26:52 -0500 Subject: [PATCH 10/14] Take care of logic to handle different types of inLineSource Orientation --- earthmover/nodes/source.py | 27 
++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py index 64894de6..81bff0b3 100644 --- a/earthmover/nodes/source.py +++ b/earthmover/nodes/source.py @@ -461,12 +461,12 @@ class inLineSource(Source): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.df = self.error_handler.assert_get_key(self.config, 'data') - print(self.df) - self.orientation = self.error_handler.assert_get_key(self.config, 'orientation') + self.orientation = self.error_handler.assert_get_key(self.config, 'orientation', dtype= str) def execute(self): super().execute() + print(self.orientation) try: self.data = self.read_inLineSource() @@ -480,6 +480,23 @@ def execute(self): raise def read_inLineSource(self): - df = pd.DataFrame(self.df) - print(df) - return df + try: + if self.orientation == 'rows': + df = pd.DataFrame(self.df.to_dict()) + + elif self.orientation == 'columns': + YamlMappingList = [] + for YamlMappingObject in self.df: + YamlMappingList.append(YamlMappingObject.to_dict()) + df = pd.DataFrame(YamlMappingList) + else: + self.error_handler.throw( + f"Invalid {self.orientation}. 
Must be `rows` or `columns`" + ) + return df + + except Exception as err: + self.error_handler.throw( + f"source {self.orientation} error ({err}); check `orientation`" + ) + raise From 98d1a66c4ce723eea9cfc4d1f7ac90649ab73a55 Mon Sep 17 00:00:00 2001 From: gnguyen87 Date: Wed, 9 Oct 2024 10:29:51 -0500 Subject: [PATCH 11/14] Cast dict vals to str type to appropriately join row values together --- earthmover/operations/column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earthmover/operations/column.py b/earthmover/operations/column.py index d943ac06..def7f36d 100644 --- a/earthmover/operations/column.py +++ b/earthmover/operations/column.py @@ -284,7 +284,7 @@ def execute(self, data: 'DataFrame', **kwargs) -> 'DataFrame': raise data[self.new_column] = data.apply( - lambda x: self.separator.join(x[col] for col in self.columns_list), + lambda x: self.separator.join(str(x[col]) for col in self.columns_list), axis=1, meta=pd.Series(dtype='str', name=self.new_column) ) From 719975f510f3eab55658e40c097a0f8c39e01027 Mon Sep 17 00:00:00 2001 From: gnguyen87 Date: Wed, 9 Oct 2024 14:27:17 -0500 Subject: [PATCH 12/14] Rename orientation --- earthmover/nodes/source.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py index 81bff0b3..d0799a04 100644 --- a/earthmover/nodes/source.py +++ b/earthmover/nodes/source.py @@ -481,10 +481,10 @@ def execute(self): def read_inLineSource(self): try: - if self.orientation == 'rows': + if self.orientation == 'columns': df = pd.DataFrame(self.df.to_dict()) - elif self.orientation == 'columns': + elif self.orientation == 'rows': YamlMappingList = [] for YamlMappingObject in self.df: YamlMappingList.append(YamlMappingObject.to_dict()) From 120d0ef80f1f5ce7ec1d03a95d1ffdce8dbdc22f Mon Sep 17 00:00:00 2001 From: gnguyen87 Date: Wed, 9 Oct 2024 14:53:21 -0500 Subject: [PATCH 13/14] Remove print statement --- earthmover/nodes/source.py | 1 - 1 file 
changed, 1 deletion(-) diff --git a/earthmover/nodes/source.py b/earthmover/nodes/source.py index d0799a04..1a799ca2 100644 --- a/earthmover/nodes/source.py +++ b/earthmover/nodes/source.py @@ -466,7 +466,6 @@ def __init__(self, *args, **kwargs): def execute(self): super().execute() - print(self.orientation) try: self.data = self.read_inLineSource() From 785c71e6b03a15a1efcfb0c62a7506f82649aef1 Mon Sep 17 00:00:00 2001 From: gnguyen87 Date: Wed, 9 Oct 2024 15:15:13 -0500 Subject: [PATCH 14/14] Add test cases for inLineSource --- earthmover/tests/earthmover.yaml | 75 ++++++++++++++++++++++++ earthmover/tests/expected/animals.jsonl | 10 ++++ earthmover/tests/expected/families.jsonl | 3 +- 3 files changed, 87 insertions(+), 1 deletion(-) diff --git a/earthmover/tests/earthmover.yaml b/earthmover/tests/earthmover.yaml index 58e3a550..51b38470 100644 --- a/earthmover/tests/earthmover.yaml +++ b/earthmover/tests/earthmover.yaml @@ -17,6 +17,69 @@ config: sources: + invertebrate_species: + orientation: rows + data: + - name: Monarch Butterfly + genus: Danaus + species: plexippus + avg_lifespan_yrs: 0.25 + avg_weight_kg: 0.0005 + id: 1 + - name: Giant African Snail + genus: Achatina + species: fulica + avg_lifespan_yrs: 5 + avg_weight_kg: 0.32 + id: 2 + - name: Common Octopus + genus: Octopus + species: vulgaris + avg_lifespan_yrs: 1.5 + avg_weight_kg: 10 + id: 3 + - name: Atlantic Horseshoe Crab + genus: Limulus + species: polyphemus + avg_lifespan_yrs: 20 + avg_weight_kg: 1.5 + id: 4 + - name: Black Widow Spider + genus: Latrodectus + species: mactans + avg_lifespan_yrs: 3 + avg_weight_kg: 0.001 + id: 5 + - name: American Lobster + genus: Homarus + species: americanus + avg_lifespan_yrs: 50 + avg_weight_kg: 7 + id: 6 + - name: Honeybee + genus: Apis + species: mellifera + avg_lifespan_yrs: 0.2 + avg_weight_kg: 0.0001 + id: 7 + - name: Blue Ringed Octopus + genus: Hapalochlaena + species: maculosa + avg_lifespan_yrs: 0.5 + avg_weight_kg: 0.026 + id: 8 + - name: Japanese 
Spider Crab + genus: Macrocheira + species: kaempferi + avg_lifespan_yrs: 100 + avg_weight_kg: 19 + id: 9 + - name: European Garden Spider + genus: Araneus + species: diadematus + avg_lifespan_yrs: 1 + avg_weight_kg: 0.0007 + id: 10 mammal_species: file: ${BASE_DIR}/sources/mammals.csv @@ -92,6 +155,17 @@ transformations: - operation: modify_columns columns: id: {%raw%}4_{{value}}{%endraw%} + + invertebrate_species: + source: $sources.invertebrate_species + operations: + - operation: add_columns + columns: + family: invertebrate + - operation: modify_columns + columns: + id: {%raw%}5_{{value}}{%endraw%} + "*": "{%raw%}{{value|trim}}{%endraw%}" animal_species: source: $transformations.mammal_species @@ -101,6 +175,7 @@ transformations: - $transformations.bird_species - $transformations.fish_species - $transformations.reptile_species + - $transformations.invertebrate_species debug: True joined_inventories: diff --git a/earthmover/tests/expected/animals.jsonl b/earthmover/tests/expected/animals.jsonl index ddee7c3f..53d325d0 100644 --- a/earthmover/tests/expected/animals.jsonl +++ b/earthmover/tests/expected/animals.jsonl @@ -48,3 +48,13 @@ { "id": "4_7", "name": "Northern Crested Newt", "family": "reptile", "genus": "Triturus", "species": "cristatus", "avg_lifespan_yrs": 8, "avg_weight_kg": 0.1 } { "id": "4_8", "name": "Timber Rattlesnake", "family": "reptile", "genus": "Crotalus", "species": "horridus", "avg_lifespan_yrs": 15, "avg_weight_kg": 1.1 } { "id": "4_9", "name": "Leatherback Sea Turtle", "family": "reptile", "genus": "Dermochelys", "species": "coriacea", "avg_lifespan_yrs": 45, "avg_weight_kg": 500 } +{ "id": "5_1", "name": "Monarch Butterfly", "family": "invertebrate", "genus": "Danaus", "species": "plexippus", "avg_lifespan_yrs": 0.25, "avg_weight_kg": 0.0005 } +{ "id": "5_2", "name": "Giant African Snail", "family": "invertebrate", "genus": "Achatina", "species": "fulica", "avg_lifespan_yrs": 5.0, "avg_weight_kg": 0.32 } +{ "id": "5_3", "name": "Common 
Octopus", "family": "invertebrate", "genus": "Octopus", "species": "vulgaris", "avg_lifespan_yrs": 1.5, "avg_weight_kg": 10.0 } +{ "id": "5_4", "name": "Atlantic Horseshoe Crab", "family": "invertebrate", "genus": "Limulus", "species": "polyphemus", "avg_lifespan_yrs": 20.0, "avg_weight_kg": 1.5 } +{ "id": "5_5", "name": "Black Widow Spider", "family": "invertebrate", "genus": "Latrodectus", "species": "mactans", "avg_lifespan_yrs": 3.0, "avg_weight_kg": 0.001 } +{ "id": "5_6", "name": "American Lobster", "family": "invertebrate", "genus": "Homarus", "species": "americanus", "avg_lifespan_yrs": 50.0, "avg_weight_kg": 7.0 } +{ "id": "5_7", "name": "Honeybee", "family": "invertebrate", "genus": "Apis", "species": "mellifera", "avg_lifespan_yrs": 0.2, "avg_weight_kg": 0.0001 } +{ "id": "5_8", "name": "Blue Ringed Octopus", "family": "invertebrate", "genus": "Hapalochlaena", "species": "maculosa", "avg_lifespan_yrs": 0.5, "avg_weight_kg": 0.026 } +{ "id": "5_9", "name": "Japanese Spider Crab", "family": "invertebrate", "genus": "Macrocheira", "species": "kaempferi", "avg_lifespan_yrs": 100.0, "avg_weight_kg": 19.0 } +{ "id": "5_10", "name": "European Garden Spider", "family": "invertebrate", "genus": "Araneus", "species": "diadematus", "avg_lifespan_yrs": 1.0, "avg_weight_kg": 0.0007 } diff --git a/earthmover/tests/expected/families.jsonl b/earthmover/tests/expected/families.jsonl index 0d0959e8..b831bdab 100644 --- a/earthmover/tests/expected/families.jsonl +++ b/earthmover/tests/expected/families.jsonl @@ -1,4 +1,5 @@ +{ "family": "reptile", "all_lifespans": [40,6,55,45,30,18,8,15,45], "all_lifespans_quoted": ["40","6","55","45","30","18","8","15","45"] } +{ "family": "invertebrate", "all_lifespans": [0.25,5.0,1.5,20.0,3.0,50.0,0.2,0.5,100.0,1.0], "all_lifespans_quoted": ["0.25","5.0","1.5","20.0","3.0","50.0","0.2","0.5","100.0","1.0"] } { "family": "fish", "all_lifespans": [15,8,4,7,5], "all_lifespans_quoted": ["15","8","4","7","5"] } { "family": "mammal", 
"all_lifespans": [14,14,7,40,12,65,4,25,40,45,8,15,12,40,20,22,20,40,18,40,20], "all_lifespans_quoted": ["14","14","7","40","12","65","4","25","40","45","8","15","12","40","20","22","20","40","18","40","20"] } { "family": "bird", "all_lifespans": [50,20,30,25,4,9,35,20,23,12,20,30,8,18,12], "all_lifespans_quoted": ["50","20","30","25","4","9","35","20","23","12","20","30","8","18","12"] } -{ "family": "reptile", "all_lifespans": [40,6,55,45,30,18,8,15,45], "all_lifespans_quoted": ["40","6","55","45","30","18","8","15","45"] } \ No newline at end of file