Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/inLineSource: adding another source type #131

Draft
wants to merge 15 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
### Unreleased changes

* feature: add support for Python 3.12, with corresponding updates to core dataframe dependencies

### v0.3.8
<details>
<summary>Released 2024-09-06</summary>
Expand Down
12 changes: 12 additions & 0 deletions earthmover/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,13 @@
# September 2024 - for now we need to do this in order to turn off the Dask
# query optimizer - see https://blog.dask.org/2023/08/25/dask-expr-introduction
# For reasons unknown, it doesn't yet work with Earthmover. A future Dask
# version may force us to use the query optimizer, but hopefully by then,
# the bugs that emerge when we use it with Earthmover will have been fixed.
import dask
dask.config.set({'dataframe.query-planning': False})

# performance enhancements
dask.config.set({"dataframe.convert-string": True})
import pandas as pd
pd.options.mode.copy_on_write = True
pd.options.mode.string_storage = "pyarrow"
54 changes: 53 additions & 1 deletion earthmover/nodes/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import pandas as pd
import re
import yaml

from earthmover.nodes.node import Node
from earthmover import util
Expand Down Expand Up @@ -44,10 +45,13 @@ def __new__(cls, name: str, config: 'YamlMapping', *, earthmover: 'Earthmover'):

elif 'file' in config:
return object.__new__(FileSource)

elif 'data' in config:
return object.__new__(inLineSource)

else:
earthmover.error_handler.throw(
"sources must specify either a `file` and/or `connection` string and `query`"
"sources must specify either a `file` and/or `connection` string and `query` or `data`"
)
raise

Expand Down Expand Up @@ -447,3 +451,51 @@ def _verify_packages(self, connection: str):
"connecting to a database requires additional libraries... please install using `pip install earthmover[sql]`"
)
raise

class inLineSource(Source):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider InlineSource to fit convention of other nodes.

mode: str = 'inLineSource'
is_remote: bool = False
allowed_configs: Tuple[str] = ('debug', 'expect', 'show_progress', 'repartition', 'chunksize', 'optional', 'optional_fields',
'data', 'orientation')

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.df = self.error_handler.assert_get_key(self.config, 'data')
self.orientation = self.error_handler.assert_get_key(self.config, 'orientation', dtype= str)


def execute(self):
super().execute()

try:
self.data = self.read_inLineSource()
self.logger.debug(f"source `{self.name}` loaded )"
)

except Exception as err:
self.error_handler.throw(
f"source {self.name} error ({err}); check `data`"
)
raise

def read_inLineSource(self):
try:
if self.orientation == 'columns':
df = pd.DataFrame(self.df.to_dict())

elif self.orientation == 'rows':
YamlMappingList = []
for YamlMappingObject in self.df:
YamlMappingList.append(YamlMappingObject.to_dict())
df = pd.DataFrame(YamlMappingList)
else:
self.error_handler.throw(
f"Invalid {self.orientation}. Must be `rows` or `columns`"
)
return df

except Exception as err:
self.error_handler.throw(
f"source {self.orientation} error ({err}); check `orientation`"
)
raise
2 changes: 1 addition & 1 deletion earthmover/operations/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def execute(self, data: 'DataFrame', **kwargs) -> 'DataFrame':
raise

data[self.new_column] = data.apply(
lambda x: self.separator.join(x[col] for col in self.columns_list),
lambda x: self.separator.join(str(x[col]) for col in self.columns_list),
axis=1,
meta=pd.Series(dtype='str', name=self.new_column)
)
Expand Down
2 changes: 1 addition & 1 deletion earthmover/operations/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def execute(self, data: 'DataFrame', **kwargs) -> 'DataFrame':
for new_col_name, func in self.create_columns_dict.items():

_pieces = re.findall(
"([A-Za-z0-9_]*)\(([A-Za-z0-9_]*)?,?(.*)?\)",
r"([A-Za-z0-9_]*)\(([A-Za-z0-9_]*)?,?(.*)?\)",
func
)[0]

Expand Down
9 changes: 6 additions & 3 deletions earthmover/runs_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

from typing import Dict, List, Optional
from typing import TYPE_CHECKING

from earthmover.nodes.source import inLineSource
if TYPE_CHECKING:
from earthmover.earthmover import Earthmover
from earthmover.nodes.node import Node
Expand Down Expand Up @@ -157,9 +159,10 @@ def _build_hashes(self) -> Dict[str, str]:

if f"$sources.{source.name}" not in node_data.keys():
continue

if not source.is_remote and source.file and not os.path.isdir(source.file):
sources_hash += self._get_file_hash(source.file)

if not isinstance(source, inLineSource):
if not source.is_remote and source.file and not os.path.isdir(source.file):
sources_hash += self._get_file_hash(source.file)

if sources_hash:
sources_hash = self._get_string_hash(sources_hash)
Expand Down
75 changes: 75 additions & 0 deletions earthmover/tests/earthmover.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,69 @@ config:


sources:
invertebrate_species:
orientation: rows
data:
- name: Monarch Butterfly
genus: Danaus
species: plexippus
avg_lifespan_yrs: 0.25
avg_weight_kg: 0.0005
id: 1
- name: Giant African Snail
genus: Achatina
species: fulica
avg_lifespan_yrs: 5
avg_weight_kg: 0.32
id: 2
- name: Common Octopus
genus: Octopus
species: vulgaris
avg_lifespan_yrs: 1.5
avg_weight_kg: 10
id: 3
- name: Atlantic Horseshoe Crab
genus: Limulus
species: polyphemus
avg_lifespan_yrs: 20
avg_weight_kg: 1.5
id: 4
- name: Black Widow Spider
genus: Latrodectus
species: mactans
avg_lifespan_yrs: 3
avg_weight_kg: 0.001
id: 5
- name: American Lobster
genus: Homarus
species: americanus
avg_lifespan_yrs: 50
avg_weight_kg: 7
id: 6
- name: Honeybee
genus: Apis
species: mellifera
avg_lifespan_yrs: 0.2
avg_weight_kg: 0.0001
id: 7
- name: Blue Ringed Octopus
genus: Hapalochlaena
species: maculosa
avg_lifespan_yrs: 0.5
avg_weight_kg: 0.026
id: 8
- name: Japanese Spider Crab
genus: Macrocheira
species: kaempferi
avg_lifespan_yrs: 100
avg_weight_kg: 19
id: 9
- name: European Garden Spider
genus: Araneus
species: diadematus
avg_lifespan_yrs: 1
avg_weight_kg: 0.0007
id: 10

mammal_species:
file: ${BASE_DIR}/sources/mammals.csv
Expand Down Expand Up @@ -92,6 +155,17 @@ transformations:
- operation: modify_columns
columns:
id: {%raw%}4_{{value}}{%endraw%}

invertebrate_species:
source: $sources.invertebrate_species
operations:
- operation: add_columns
columns:
family: invertebrate
- operation: modify_columns
columns:
id: {%raw%}5_{{value}}{%endraw%}
"*": "{%raw%}{{value|trim}}{%endraw%}"

animal_species:
source: $transformations.mammal_species
Expand All @@ -101,6 +175,7 @@ transformations:
- $transformations.bird_species
- $transformations.fish_species
- $transformations.reptile_species
- $transformations.invertebrate_species
debug: True

joined_inventories:
Expand Down
10 changes: 10 additions & 0 deletions earthmover/tests/expected/animals.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,13 @@
{ "id": "4_7", "name": "Northern Crested Newt", "family": "reptile", "genus": "Triturus", "species": "cristatus", "avg_lifespan_yrs": 8, "avg_weight_kg": 0.1 }
{ "id": "4_8", "name": "Timber Rattlesnake", "family": "reptile", "genus": "Crotalus", "species": "horridus", "avg_lifespan_yrs": 15, "avg_weight_kg": 1.1 }
{ "id": "4_9", "name": "Leatherback Sea Turtle", "family": "reptile", "genus": "Dermochelys", "species": "coriacea", "avg_lifespan_yrs": 45, "avg_weight_kg": 500 }
{ "id": "5_1", "name": "Monarch Butterfly", "family": "invertebrate", "genus": "Danaus", "species": "plexippus", "avg_lifespan_yrs": 0.25, "avg_weight_kg": 0.0005 }
{ "id": "5_2", "name": "Giant African Snail", "family": "invertebrate", "genus": "Achatina", "species": "fulica", "avg_lifespan_yrs": 5.0, "avg_weight_kg": 0.32 }
{ "id": "5_3", "name": "Common Octopus", "family": "invertebrate", "genus": "Octopus", "species": "vulgaris", "avg_lifespan_yrs": 1.5, "avg_weight_kg": 10.0 }
{ "id": "5_4", "name": "Atlantic Horseshoe Crab", "family": "invertebrate", "genus": "Limulus", "species": "polyphemus", "avg_lifespan_yrs": 20.0, "avg_weight_kg": 1.5 }
{ "id": "5_5", "name": "Black Widow Spider", "family": "invertebrate", "genus": "Latrodectus", "species": "mactans", "avg_lifespan_yrs": 3.0, "avg_weight_kg": 0.001 }
{ "id": "5_6", "name": "American Lobster", "family": "invertebrate", "genus": "Homarus", "species": "americanus", "avg_lifespan_yrs": 50.0, "avg_weight_kg": 7.0 }
{ "id": "5_7", "name": "Honeybee", "family": "invertebrate", "genus": "Apis", "species": "mellifera", "avg_lifespan_yrs": 0.2, "avg_weight_kg": 0.0001 }
{ "id": "5_8", "name": "Blue Ringed Octopus", "family": "invertebrate", "genus": "Hapalochlaena", "species": "maculosa", "avg_lifespan_yrs": 0.5, "avg_weight_kg": 0.026 }
{ "id": "5_9", "name": "Japanese Spider Crab", "family": "invertebrate", "genus": "Macrocheira", "species": "kaempferi", "avg_lifespan_yrs": 100.0, "avg_weight_kg": 19.0 }
{ "id": "5_10", "name": "European Garden Spider", "family": "invertebrate", "genus": "Araneus", "species": "diadematus", "avg_lifespan_yrs": 1.0, "avg_weight_kg": 0.0007 }
3 changes: 2 additions & 1 deletion earthmover/tests/expected/families.jsonl
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{ "family": "reptile", "all_lifespans": [40,6,55,45,30,18,8,15,45], "all_lifespans_quoted": ["40","6","55","45","30","18","8","15","45"] }
{ "family": "invertebrate", "all_lifespans": [0.25,5.0,1.5,20.0,3.0,50.0,0.2,0.5,100.0,1.0], "all_lifespans_quoted": ["0.25","5.0","1.5","20.0","3.0","50.0","0.2","0.5","100.0","1.0"] }
{ "family": "fish", "all_lifespans": [15,8,4,7,5], "all_lifespans_quoted": ["15","8","4","7","5"] }
{ "family": "mammal", "all_lifespans": [14,14,7,40,12,65,4,25,40,45,8,15,12,40,20,22,20,40,18,40,20], "all_lifespans_quoted": ["14","14","7","40","12","65","4","25","40","45","8","15","12","40","20","22","20","40","18","40","20"] }
{ "family": "bird", "all_lifespans": [50,20,30,25,4,9,35,20,23,12,20,30,8,18,12], "all_lifespans_quoted": ["50","20","30","25","4","9","35","20","23","12","20","30","8","18","12"] }
{ "family": "reptile", "all_lifespans": [40,6,55,45,30,18,8,15,45], "all_lifespans_quoted": ["40","6","55","45","30","18","8","15","45"] }
6 changes: 4 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
wheel
aiohttp>=3.8.1
dask[dataframe]~=2023.5.0
dask[dataframe]~=2024.8.0; python_version>="3.10"
dask[dataframe]~=2023.5.0; python_version<"3.10"
Jinja2>=2.11.3
networkx>=2.6.3
pandas>=1.3.5,<=2.2.1
pandas[performance]~=2.2.2; python_version>="3.10"
pandas>=1.3.5,<=2.2.1; python_version<"3.10"
requests>=2.23.0
setuptools>=44.0.0
5 changes: 5 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Topic :: Office/Business",
"Topic :: Scientific/Engineering",
"Topic :: Utilities"
Expand Down