Skip to content

Commit

Permalink
adding filter and word embedding API and support for using word embed… (
Browse files Browse the repository at this point in the history
#42)

* adding filter and word embedding API and support for using word embeddings in Torque

* read me update
  • Loading branch information
arun1729 committed Mar 13, 2023
1 parent a90b3e6 commit fc64c19
Show file tree
Hide file tree
Showing 10 changed files with 695 additions and 71 deletions.
91 changes: 45 additions & 46 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,10 @@
# CogDB - Micro Graph Database for Python Applications
> Documents and examples at [cogdb.io](https://cogdb.io)
> New release!: 3.0.2
>
> - Ability to put JSON into a graph.
> - Ability to update JSON in a graph.
> - Ability to drop edges.
> - Option to disable in memory caching.
> New release: 3.0.5
> - New word embeddings API
> - Similarity filtering using word embeddings
> - Filter step
![ScreenShot](notes/ex2.png)

Expand Down Expand Up @@ -147,21 +145,21 @@ g.v("bob").inc().all()
```
> {'result': [{'id': 'alice'}, {'id': 'charlie'}, {'id': 'dani'}]}
#### Using lambda to choose vertices while traversing the graph.
#### Filtering

```python
g.v(func=lambda x: x.startswith("d")).all()
g.v().filter(func=lambda x: x.startswith("d")).all()
```
> {'result': [{'id': 'dani'}]}

```python
g.v().out("score", func=lambda x: int(x) > 5).inc().all()
g.v().out("score").filter(func=lambda x: int(x) > 5).inc().all()
```
> {'result': [{'id': 'alice'}, {'id': 'dani'}, {'id': 'greg'}]}
```python
g.v("emily").out("follows", func=lambda x: x.startswith("f")).all()
g.v("emily").out("follows").filter(func=lambda x: x.startswith("f")).all()
```
> {'result': [{'id': 'fred'}]}
Expand Down Expand Up @@ -190,6 +188,43 @@ f.v().has('name','fred').out('follows').all()
In a JSON document, CogDB treats the `_id` property as a unique identifier for each object. If `_id` is not provided, a randomly generated `_id` is created for each object within the JSON document.
The `_id` field is used to update a JSON object; see the example below.

## Using word embeddings

CogDB supports word embeddings. Word embeddings are a way to represent words as vectors. Word embeddings are useful for many NLP tasks.
There are various types of word embeddings, including popular ones like [GloVe](https://nlp.stanford.edu/projects/glove/) and [FastText](https://fasttext.cc/).

#### Add a word embedding:

```python
g.put_embedding("orange", [0.1, 0.2, 0.3, 0.4, 0.5])
```

#### Get a word embedding:

```python
g.get_embedding("orange")
```

> [0.1, 0.2, 0.3, 0.4, 0.5]
#### Delete a word embedding:

```python
g.delete_embedding("orange")
```

#### Use word embeddings in a query:

```python
g.v().sim('orange', '>', 0.35).all()
```
> {'result': [{'id': 'clementines'}, {'id': 'tangerine'}, {'id': 'orange'}]}
```python
g.v().sim('orange', 'in', [0.25, 0.35]).all()
```
> {'result': [{'id': 'banana'}, {'id': 'apple'}]}
In the above code, the sim method is used to filter vertices based on their cosine similarity with the word embedding for "orange". The operator and threshold arguments determine how the similarity is compared to the threshold value, which can be a single value or a range.

## Loading data from a file

Expand Down Expand Up @@ -228,42 +263,6 @@ g = Graph(graph_name="people")
g.load_edgelist("/path/to/edgelist", "people")
```

## Low level key-value store API:
Every record inserted into Cog's key-value store is persisted directly to disk. It stores and retrieves data based
on hash values of the keys, so it can perform fast lookups (O(1) avg) and fast inserts (O(1) avg).

```python

from cog.database import Cog

cogdb = Cog('path/to/dbdir')

# create a namespace
cogdb.create_or_load_namespace("my_namespace")

# create new table
cogdb.create_table("new_db", "my_namespace")

# put some data
cogdb.put(('key', 'val'))

# retrieve data
cogdb.get('key')

# put some more data
cogdb.put(('key2', 'val2'))

# scan
scanner = cogdb.scanner()
for r in scanner:
print
r

# delete data
cogdb.delete('key1')

```

## Config

If no config is provided when creating a Cog instance, it will use the defaults:
Expand Down
1 change: 1 addition & 0 deletions cog/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
''' TORQUE '''
GRAPH_NODE_SET_TABLE_NAME = 'TOR_NODE_SET'
GRAPH_EDGE_SET_TABLE_NAME = 'TOR_EDGE_SET'
EMBEDDING_SET_TABLE_NAME = 'EMBEDDING_SET'

''' CUSTOM COG DB PATH '''
CUSTOM_COG_DB_PATH = None
Expand Down
6 changes: 5 additions & 1 deletion cog/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ def put(self, key, store_position, store):


def get_index(self, key):
num = self.cog_hash(key) % ((sys.maxsize + 1) * 2)
num = cog_hash(key, self.config.INDEX_CAPACITY) % ((sys.maxsize + 1) * 2)
self.logger.debug("hash for: " + key + " : " + str(num))
# there may be diff when using mem slice vs write (+1 needed)
index = (self.config.INDEX_BLOCK_LEN *
Expand Down Expand Up @@ -489,3 +489,7 @@ def delete(self, key, store):
return True
else:
return False


def cog_hash(string, index_capacity):
    """
    Hashes ``string`` with 32-bit xxHash (seed 2) and reduces the digest
    modulo ``index_capacity``.

    Because of the modulo, distinct strings can produce the same result.
    """
    digest = xxhash.xxh32(string, seed=2).intdigest()
    return digest % index_capacity
3 changes: 1 addition & 2 deletions cog/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,7 @@ def use_table(self, name):
return self

def put(self, data):
assert type(data.key) is str, "Only string type is supported."
assert type(data.value) is str, "Only string type is supported."
assert type(data.key) is str, "key must be a string."
position = self.current_table.store.save(data)
self.current_table.indexer.put(data.key, position, self.current_table.store)

Expand Down
134 changes: 132 additions & 2 deletions cog/torque.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from cog.database import Cog
from cog.database import in_nodes, out_nodes, hash_predicate
from cog.core import cog_hash, Record
import json
import logging
from logging.config import dictConfig
Expand All @@ -9,6 +10,8 @@
from os import listdir
import time
import random
from math import isclose
import warnings

NOTAG = "NOTAG"

Expand All @@ -35,7 +38,6 @@ def __str__(self):


class BlankNode(object):

ID_PREFIX = "_id_"

def __init__(self, label=None):
Expand All @@ -51,7 +53,7 @@ def __str__(self):
@classmethod
def is_id(cls, label):
# print("--- > is_id", label)
return label.startswith("_:"+BlankNode.ID_PREFIX)
return label.startswith("_:" + BlankNode.ID_PREFIX)


class Graph:
Expand Down Expand Up @@ -235,6 +237,8 @@ def update(self, vertex1, predicate, vertex2):
return self

def v(self, vertex=None, func=None):
if func:
warnings.warn("The use of func is deprecated, please use filter instead.", DeprecationWarning)
if vertex is not None:
if isinstance(vertex, list):
self.last_visited_vertices = [Vertex(v) for v in vertex]
Expand All @@ -258,6 +262,7 @@ def out(self, predicates=None, func=None):
'''

if func:
warnings.warn("The use of func is deprecated, please use filter instead.", DeprecationWarning)
assert callable(func), "func must be a lambda. Example: func = lambda d: int(d) > 5"
assert not isinstance(predicates, list), "func cannot be used with a list of predicates"

Expand All @@ -280,6 +285,7 @@ def inc(self, predicates=None, func=None):
'''

if func:
warnings.warn("The use of func is deprecated, please use filter instead.", DeprecationWarning)
assert callable(func), "func must be a lambda. Example: func = lambda d: int(d) > 5"
assert not isinstance(predicates, list), "func cannot be used with a list of predicates"

Expand Down Expand Up @@ -412,6 +418,102 @@ def __hop(self, direction, predicates=None, func=None):
traverse_vertex.append(v_adjacent_obj)
self.last_visited_vertices = traverse_vertex

def filter(self, func):
    '''
    Applies a filter function to the vertices and removes any vertices that do not pass the filter.

    :param func: callable that receives a vertex id and returns a truthy value
        for vertices that should be kept.
    :return: self, to allow method chaining.
    '''
    # Build a fresh list rather than calling remove() on the list being
    # iterated: removing during iteration shifts the remaining items and makes
    # the loop skip the element that follows every removed vertex.
    self.last_visited_vertices = [v for v in self.last_visited_vertices if func(v.id)]
    return self

def sim(self, word, operator, threshold, strict=False):
    """
    Applies cosine similarity filter to the vertices and removes any vertices that do not pass the filter.
    Parameters:
    -----------
    word: str
        The word to compare to the other vertices.
    operator: str
        The comparison operator to use. One of "=", "==", ">", "<", ">=", "<=", or "in".
        "=" and "==" are equivalent and compare with math.isclose.
    threshold: float or list of 2 floats
        The threshold value(s) to use for the comparison. If operator is "in", threshold should be a
        list of 2 numbers [low, high] (inclusive range); otherwise it should be a single number.
    strict: bool, optional
        If True, raises an exception if a word embedding is not found for either word. If False, assigns a similarity of 0.0 to any word embedding that is not found.
    Returns:
    --------
    self
        Returns self to allow for method chaining.
    Raises:
    -------
    ValueError:
        If operator is not a valid comparison operator or if threshold is not a valid threshold value for the given operator.
        If strict is True and a word embedding is not found for either word.
    """
    if not isinstance(threshold, (float, int, list)):
        raise ValueError("Invalid threshold value: {}".format(threshold))

    # One predicate per supported operator; each takes (similarity, threshold).
    comparators = {
        '=': isclose,
        '==': isclose,
        '>': lambda s, t: s > t,
        '<': lambda s, t: s < t,
        '>=': lambda s, t: s >= t,
        '<=': lambda s, t: s <= t,
        'in': lambda s, t: t[0] <= s <= t[1],
    }

    # Validate the operator up front so that an invalid one is reported even
    # when there are no vertices to iterate over.
    if operator not in comparators:
        raise ValueError("Invalid operator: {}".format(operator))

    if operator == 'in':
        if not isinstance(threshold, list) or len(threshold) != 2:
            raise ValueError("Invalid threshold value: {}".format(threshold))
        if not all(isinstance(t, (float, int)) for t in threshold):
            raise ValueError("Invalid threshold value: {}".format(threshold))
    elif isinstance(threshold, list):
        # Scalar operators cannot be compared against a list.
        raise ValueError("Invalid threshold value: {}".format(threshold))

    compare = comparators[operator]
    filtered_vertices = []
    for v in self.last_visited_vertices:
        similarity = self.__cosine_similarity(word, v.id)
        # Test for None explicitly: a similarity of exactly 0.0 is a valid
        # result and must not be mistaken for a missing embedding.
        if similarity is None:
            if strict:
                raise ValueError("Missing word embedding for either '{}' or '{}'".format(word, v.id))
            # Treat vertices without word embeddings as if they have no similarity to any other vertex.
            similarity = 0.0
        if compare(similarity, threshold):
            filtered_vertices.append(v)
    self.last_visited_vertices = filtered_vertices
    return self

def __cosine_similarity(self, word1, word2):
    """
    Computes the cosine similarity between the stored embeddings of two words.

    :param word1: first word; its embedding is looked up with get_embedding.
    :param word2: second word; its embedding is looked up with get_embedding.
    :return: the cosine similarity, None if either embedding is missing, or
        0.0 if either embedding has zero magnitude (the cosine is undefined
        there, so the pair is treated as having no similarity).
    :raises ValueError: if the two embeddings have different lengths.
    """
    x = self.get_embedding(word1)
    y = self.get_embedding(word2)

    if x is None or y is None:
        return None

    # Refuse to compare vectors of different dimensions instead of raising an
    # IndexError or silently ignoring the extra components.
    if len(x) != len(y):
        raise ValueError("Embedding dimensions differ for '{}' ({}) and '{}' ({})".format(
            word1, len(x), word2, len(y)))

    dot_product = sum(a * b for a, b in zip(x, y))
    x_norm = sum(a ** 2 for a in x) ** (1 / 2)
    y_norm = sum(b ** 2 for b in y) ** (1 / 2)

    denominator = x_norm * y_norm
    if denominator == 0:
        # Zero-magnitude vector: avoid ZeroDivisionError.
        return 0.0
    return dot_product / denominator

def tag(self, tag_name):
'''
Saves vertices with a tag name. Used to capture vertices while traversing a graph.
Expand Down Expand Up @@ -475,6 +577,34 @@ def lsv(self):
def get_new_graph_instance(self):
return Graph(self.graph_name, self.config.COG_HOME, self.config.COG_PATH_PREFIX)

def put_embedding(self, word, embedding):
    """
    Stores ``embedding`` for ``word`` in this graph's embedding table.

    The record key is the string form of cog_hash(word, INDEX_CAPACITY).
    NOTE(review): cog_hash reduces the hash modulo INDEX_CAPACITY, so two
    different words may map to the same key -- confirm whether collisions
    are acceptable here.
    """
    assert isinstance(word, str), "word must be a string"
    namespace = self.cog.use_namespace(self.graph_name)
    embedding_table = namespace.use_table(self.config.EMBEDDING_SET_TABLE_NAME)
    key = str(cog_hash(word, self.config.INDEX_CAPACITY))
    embedding_table.put(Record(key, embedding))

def get_embedding(self, word):
    """
    Looks up the embedding stored for ``word``.

    Returns the stored record's value, or None when the embedding table has
    no record under the word's hashed key.
    """
    assert isinstance(word, str), "word must be a string"
    namespace = self.cog.use_namespace(self.graph_name)
    embedding_table = namespace.use_table(self.config.EMBEDDING_SET_TABLE_NAME)
    key = str(cog_hash(word, self.config.INDEX_CAPACITY))
    found = embedding_table.get(key)
    return None if found is None else found.value

def delete_embedding(self, word):
    """
    Removes the embedding stored for ``word`` from this graph's embedding table.

    The record is addressed by the string form of
    cog_hash(word, INDEX_CAPACITY), the same key used when storing it.
    """
    assert isinstance(word, str), "word must be a string"
    namespace = self.cog.use_namespace(self.graph_name)
    embedding_table = namespace.use_table(self.config.EMBEDDING_SET_TABLE_NAME)
    key = str(cog_hash(word, self.config.INDEX_CAPACITY))
    embedding_table.delete(key)


class View(object):

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


setup(name='cogdb',
version='3.0.4',
version='3.0.5',
description='Persistent Embedded Graph Database',
url='http://github.com/arun1729/cog',
author='Arun Mahendra',
Expand Down

0 comments on commit fc64c19

Please sign in to comment.