Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Data modeling improvements #129

Merged
merged 20 commits into from
Oct 16, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
add allow_parallel_relationships arg
a-s-g93 committed Oct 15, 2024
commit 00aa6a043f698a885e22340f289435b3bb3fbd3f
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -8,9 +8,11 @@

### Changed

* Change `DataModel` generation logic to first generate nodes, then generate relationships
* Change initial `DataModel` generation logic to first generate nodes, then generate relationships
* Updated examples
* Add `DataModel` validator to check for parallel relationships
* Add `allow_parallel_relationships` arg to `DataModel` generating methods
* Remove `use_yaml_data_model` arg from `DataModel` generation methods, as it is no longer relevant
* Update `DataModel`, `Node`, `Relationship` and `Property` validation logic to better utilize Pydantic

### Added
8 changes: 4 additions & 4 deletions neo4j_runway/llm/base.py
Original file line number Diff line number Diff line change
@@ -135,10 +135,10 @@ def _get_initial_data_model_response(
use_advanced_data_model_generation_rules: bool,
data_dictionary: Dict[str, Any],
max_retries: int = 3,
use_yaml_data_model: bool = False,
allow_duplicate_properties: bool = False,
enforce_uniqueness: bool = True,
) -> Union[DataModel, Dict[str, Any]]:
allow_parallel_relationships: bool = False,
) -> DataModel:
"""
Performs at least 2 LLM calls:
1. Request the LLM to find nodes, relationships and properties that should be in the data model.
@@ -157,6 +157,7 @@ def _get_initial_data_model_response(
valid_columns=valid_columns,
allow_duplicate_column_mappings=allow_duplicate_properties,
enforce_uniqueness=enforce_uniqueness,
allow_parallel_relationships=allow_parallel_relationships,
)
formatted_prompt = create_initial_nodes_prompt(
discovery_text=discovery_text,
@@ -209,10 +210,10 @@ def _get_initial_data_model_response(
formatted_prompt=formatted_prompt,
valid_columns=valid_columns,
max_retries=max_retries,
use_yaml_data_model=use_yaml_data_model,
data_dictionary=data_dictionary,
allow_duplicate_properties=allow_duplicate_properties,
enforce_uniqueness=enforce_uniqueness,
allow_parallel_relationships=allow_parallel_relationships,
)

return initial_data_model
@@ -223,7 +224,6 @@ def _get_data_model_response(
valid_columns: dict[str, list[str]],
data_dictionary: Dict[str, Any],
max_retries: int = 3,
use_yaml_data_model: bool = False,
allow_duplicate_properties: bool = False,
enforce_uniqueness: bool = True,
apply_neo4j_naming_conventions: bool = True,
29 changes: 12 additions & 17 deletions neo4j_runway/modeler/modeler.py
Original file line number Diff line number Diff line change
@@ -269,11 +269,11 @@ def get_model(
def create_initial_model(
self,
max_retries: int = 3,
use_yaml_data_model: bool = False,
use_advanced_data_model_generation_rules: bool = True,
allow_duplicate_properties: bool = False,
enforce_uniqueness: bool = True,
) -> Union[DataModel, Dict[str, Any]]:
allow_parallel_relationships: bool = False,
) -> DataModel:
"""
Generate the initial model.
You may access this model with the `get_model` method and providing `version=1`.
@@ -282,21 +282,20 @@ def create_initial_model(
----------
max_retries : int, optional
The max number of retries for generating the initial model, by default 3
use_yaml_data_model : bool, optional
Whether to pass the data model in YAML format while making corrections, by default False
use_advanced_data_model_generation_rules : bool, optional
Whether to include advanced data modeling rules, by default True
allow_duplicate_properties : bool, optional
Whether to allow a property to exist on multiple node labels or relationship types, by default False
enforce_uniqueness : bool, optional
Whether to error if a node has no unique identifiers (unique or node key).
Setting this to false may be detrimental during code generation and ingestion. By default True
allow_parallel_relationships : bool, optional
Whether to allow parallel relationships to exist in the data model, by default False

Returns
-------
Union[DataModel, str]
The generated data model if a valid model is generated, or
A dictionary containing information about the failed generation attempt.
DataModel
The generated data model.
"""

response = self.llm._get_initial_data_model_response(
@@ -307,14 +306,11 @@ def create_initial_model(
multifile=self.is_multifile,
use_advanced_data_model_generation_rules=use_advanced_data_model_generation_rules,
max_retries=max_retries,
use_yaml_data_model=use_yaml_data_model,
allow_duplicate_properties=allow_duplicate_properties,
enforce_uniqueness=enforce_uniqueness,
allow_parallel_relationships=allow_parallel_relationships,
)

if isinstance(response, dict):
return response

self.model_history.append(response)

self._initial_model_created = True
@@ -326,10 +322,10 @@ def iterate_model(
iterations: int = 1,
corrections: Optional[str] = None,
use_advanced_data_model_generation_rules: bool = True,
use_yaml_data_model: bool = False,
max_retries: int = 3,
allow_duplicate_properties: bool = False,
enforce_uniqueness: bool = True,
allow_parallel_relationships: bool = False,
) -> DataModel:
"""
Iterate on the current model. A data model must exist in the `model_history` property to run.
@@ -344,20 +340,20 @@ def iterate_model(
What changes the user would like the LLM to address in the next model, by default None
max_retries : int, optional
The max number of retries for generating the initial model, by default 3
use_yaml_data_model : bool, optional
Whether to pass the data model in YAML format while making corrections, by default False
use_advanced_data_model_generation_rules : bool, optional
Whether to include advanced data modeling rules, by default True
allow_duplicate_properties : bool, optional
Whether to allow a property to exist on multiple node labels or relationship types, by default False
enforce_uniqueness : bool, optional
Whether to error if a node has no unique identifiers (unique or node key).
Setting this to false may be detrimental during code generation and ingestion. By default True
allow_parallel_relationships : bool, optional
Whether to allow parallel relationships to exist in the data model, by default False

Returns
-------
DataModel
The most recent generated data model.
The most recently generated data model.
"""

assert self._initial_model_created, "No data model present to iterate on."
@@ -371,18 +367,17 @@ def iterate() -> DataModel:
corrections=corrections,
data_dictionary=self._data_dictionary,
use_cases=self.user_input.pretty_use_cases,
use_yaml_data_model=use_yaml_data_model,
advanced_rules=use_advanced_data_model_generation_rules,
valid_columns=self.allowed_columns,
)
response = self.llm._get_data_model_response(
formatted_prompt=formatted_prompt,
max_retries=max_retries,
valid_columns=self.allowed_columns,
use_yaml_data_model=use_yaml_data_model,
data_dictionary=self._data_dictionary,
allow_duplicate_properties=allow_duplicate_properties,
enforce_uniqueness=enforce_uniqueness,
allow_parallel_relationships=allow_parallel_relationships,
)

self.model_history.append(response)