Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Data modeling improvements #129

Merged
merged 20 commits on Oct 16, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Previous commit
Next commit
remove EntityPool and Error COT code
a-s-g93 committed Oct 15, 2024
commit 71cfdf2a88ac3b93ed2cfde91bf5e48fc6f2693c
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -8,8 +8,10 @@

### Changed

* Change `DataModel` generation logic to first generate nodes, then generate relationships.
* Change `DataModel` generation logic to first generate nodes, then generate relationships
* Updated examples
* Add `DataModel` validator to check for parallel relationships
* Update `DataModel`, `Node`, `Relationship` and `Property` validation logic to better utilize Pydantic

### Added

23 changes: 0 additions & 23 deletions neo4j_runway/llm/base.py
Original file line number Diff line number Diff line change
@@ -10,23 +10,18 @@
from instructor.exceptions import InstructorRetryException
from tenacity import Retrying, stop_after_attempt

from ..inputs import UserInput
from ..models import DataModel
from ..models.core.node import Nodes
from ..resources.llm_response_types import (
DataModelEntityPool,
DiscoveryResponse,
ErrorRecommendations,
)
from ..resources.prompts import (
SYSTEM_PROMPTS,
)
from ..resources.prompts.data_modeling import (
create_data_model_iteration_prompt,
create_initial_data_model_cot_prompt,
create_initial_data_model_prompt,
create_initial_nodes_prompt,
create_retry_data_model_generation_prompt,
)
from .context import create_context

@@ -276,21 +271,3 @@ def _get_data_model_response(
)

return response

def _get_chain_of_thought_for_error_recommendations_response(
self, formatted_prompt: str
) -> str:
"""
Generate fixes for the previous data model.
"""
print("Analyzing errors...")
response: ErrorRecommendations = self.client.chat.completions.create(
model=self.model_name,
messages=[
{"role": "system", "content": SYSTEM_PROMPTS["retry"]},
{"role": "user", "content": formatted_prompt},
],
response_model=ErrorRecommendations,
**self.model_params,
)
return response.recommendations
36 changes: 1 addition & 35 deletions neo4j_runway/models/core/data_model.py
Original file line number Diff line number Diff line change
@@ -4,15 +4,14 @@

import json
from ast import literal_eval
from typing import Any, Dict, List, Optional, Set, Tuple, Union
from typing import Any, Dict, List, Optional, Tuple, Union

import yaml
from graphviz import Digraph
from pydantic import (
BaseModel,
ValidationError,
ValidationInfo,
field_validator,
model_validator,
)
from pydantic_core import InitErrorDetails, PydanticCustomError
@@ -21,7 +20,6 @@
InvalidArrowsDataModelError,
InvalidSolutionsWorkbenchDataModelError,
)
from ...resources.prompts.data_modeling import create_data_model_errors_cot_prompt
from ..arrows import ArrowsDataModel, ArrowsNode, ArrowsRelationship
from ..solutions_workbench import (
SolutionsWorkbenchDataModel,
@@ -51,38 +49,6 @@ class DataModel(BaseModel):
relationships: List[Relationship]
metadata: Optional[Dict[str, Any]] = None

# def __init__(
# self,
# nodes: List[Node],
# relationships: List[Relationship],
# metadata: Optional[Dict[str, Any]] = None,
# use_neo4j_naming_conventions: bool = True,
# ) -> None:
# """
# The standard Graph Data Model representation in Neo4j Runway.

# Parameters
# ----------
# nodes : List[Node]
# A list of the nodes in the data model.
# relationships : List[Relationship]
# A list of the relationships in the data model.
# metadata: Optional[Dict[str, Any]]
# Metadata from an import source such as Solutions Workbench, by default None
# use_neo4j_naming_conventions : bool, optional
# Whether to convert labels, relationships and properties to Neo4j naming conventions, by default True
# """
# super().__init__(
# nodes=nodes,
# relationships=relationships,
# metadata=metadata,
# use_neo4j_naming_conventions=True,
# )

# # default apply Neo4j naming conventions.
# if use_neo4j_naming_conventions:
# self.apply_neo4j_naming_conventions()

@property
def node_labels(self) -> List[str]:
"""
10 changes: 0 additions & 10 deletions neo4j_runway/models/core/node.py
Original file line number Diff line number Diff line change
@@ -428,16 +428,6 @@ def validate_nodes(cls, nodes: List[Node]) -> List[Node]:
def advanced_validation(self, info: ValidationInfo) -> "Nodes":
errors: List[InitErrorDetails] = list()

def _retrieve_duplicated_property_column_mapping(
context: Tuple[str, str, int, str, int, str],
) -> str:
"""Retrieve a `Property` in the data model that shares a `column_mapping` attribute."""

prop: Property = self.__getattribute__(context[1])[context[2]].properties[
context[4]
]
return prop.column_mapping

def _parse_duplicated_property_location(
context: Tuple[str, str, int, str, int, str],
) -> Tuple[str, int, str, int, str]:

This file was deleted.

146 changes: 0 additions & 146 deletions neo4j_runway/resources/llm_response_types/initial_model_pool.py

This file was deleted.

10 changes: 0 additions & 10 deletions neo4j_runway/resources/prompts/data_modeling/__init__.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,10 @@
from .data_model_error_handling import (
create_data_model_errors_cot_prompt,
create_retry_data_model_generation_prompt,
create_retry_initial_data_model_prep_generation_prompt,
)
from .initial_data_model import (
create_initial_data_model_cot_prompt,
create_initial_data_model_prompt,
create_initial_nodes_prompt,
)
from .iterative_data_model import create_data_model_iteration_prompt

__all__ = [
"create_retry_initial_data_model_prep_generation_prompt",
"create_data_model_errors_cot_prompt",
"create_retry_data_model_generation_prompt",
"create_initial_data_model_cot_prompt",
"create_initial_data_model_prompt",
"create_data_model_iteration_prompt",
"create_initial_nodes_prompt",
16 changes: 0 additions & 16 deletions neo4j_runway/resources/prompts/data_modeling/constants.py
Original file line number Diff line number Diff line change
@@ -78,22 +78,6 @@
}
"""

ENTITY_POOL_GENERATION_RULES = """Based upon the above information and of high-quality graph data models,
return the following:
* Any possible Nodes and their respective properties
* Any possible Relationships and their respective source Nodes and target Nodes
* Relationships and their respective properties, if any
* Explanations for each decision and how it will benefit the data model
* All possible relationships for nodes
* `source_name` for all nodes and relationships

Remember
* All properties must be found in the data dictionary above!
* A property may have an alias in another file as a foreign key.
* A node may not have properties from multiple files!
* A relationship may not have properties from multiple files!
* Find properties that may uniquely identify Nodes"""

NODE_GENERATION_RULES = """Please follow these rules strictly! Billions of dollars depend on you.
Nodes
* Each node must have a unique property or node key pair

This file was deleted.

32 changes: 0 additions & 32 deletions neo4j_runway/resources/prompts/data_modeling/initial_data_model.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,14 @@
from typing import Any, Dict, Optional

from ...llm_response_types.initial_model_pool import DataModelEntityPool
from .constants import (
DATA_MODEL_FORMAT,
ENTITY_POOL_GENERATION_RULES,
NODE_GENERATION_RULES,
NODES_FORMAT,
)
from .formatters import get_rules
from .template import create_data_modeling_prompt


def create_initial_data_model_cot_prompt(
discovery_text: str,
multifile: bool,
use_cases: Optional[str],
valid_columns: Dict[str, Any],
data_dictionary: Optional[Dict[str, Any]] = None,
) -> str:
"""
Generate a prompt to find nodes, relationships and properties to include in a data model.
This is only for brainstorming, result of prompt should not be a DataModel.
Returns
-------
str
The prompt.
"""
prefix = "Please generate a pool of entities that will be used to construct a graph data model."

return create_data_modeling_prompt(
prefix=prefix,
discovery=discovery_text,
multifile=multifile,
use_cases=use_cases,
valid_columns=valid_columns,
data_dictionary=data_dictionary,
rules=ENTITY_POOL_GENERATION_RULES,
)


def create_initial_nodes_prompt(
discovery_text: str,
multifile: bool,
@@ -88,7 +57,6 @@ def create_initial_data_model_prompt(
return create_data_modeling_prompt(
prefix=prefix,
discovery=discovery_text,
entity_pool=data_model_recommendations,
valid_columns=valid_columns,
data_dictionary=data_dictionary,
use_cases=use_cases,
3 changes: 0 additions & 3 deletions neo4j_runway/resources/prompts/data_modeling/template.py
Original file line number Diff line number Diff line change
@@ -31,7 +31,6 @@ def create_data_modeling_prompt(
errors: Optional[List[str]] = None,
corrections: Optional[str] = None,
data_model: Optional["DataModel"] = None, # type: ignore
entity_pool: Optional["DataModelEntityPool"] = None, # type: ignore
nodes: Optional["Nodes"] = None, # type: ignore
use_cases: Optional[str] = None,
data_model_format: Optional[str] = None,
@@ -52,8 +51,6 @@ def create_data_modeling_prompt(
res += format_corrections(corrections=corrections)
if data_model is not None:
res += format_data_model(data_model=data_model, yaml_format=data_model_as_yaml)
if entity_pool is not None:
res += format_entity_pool(entity_pool=entity_pool, retry_prompt=retry_prompt)
if nodes is not None:
res += format_nodes(nodes=nodes, retry_prompt=retry_prompt)
if use_cases is not None: