adding filter and word embedding API and support for using word embed… (

#42) * adding filter and word embedding API and support for using word embeddings in Torque * read me update
arun1729 · Mar 13, 2023 · fc64c19 · fc64c19
1 parent a90b3e6
commit fc64c19
Show file tree

Hide file tree

Showing 10 changed files with 695 additions and 71 deletions.
diff --git a/README.md b/README.md
@@ -5,12 +5,10 @@
 # CogDB - Micro Graph Database for Python Applications
 > Documents and examples at [cogdb.io](https://cogdb.io)
 
-> New release!: 3.0.2
->
-> - Ability to put JSON into a graph.
-> - Ability to update JSON in a graph.
-> - Ability to drop edges.
-> - Option to disable in memory caching.
+> New release: 3.0.5
+> - New word embeddings API
+> - Similarity filtering using word embeddings
+> - Filter step
 
 ![ScreenShot](notes/ex2.png)
 
@@ -147,21 +145,21 @@ g.v("bob").inc().all()
 ```
 > {'result': [{'id': 'alice'}, {'id': 'charlie'}, {'id': 'dani'}]}
 
-#### Using lambda to chose vertices while traversing the graph.
+#### Filtering
 
 ```python
-g.v(func=lambda x: x.startswith("d")).all()
+g.v().filter(func=lambda x: x.startswith("d")).all()
 ```
 > {'result': [{'id': 'dani'}]}
 
 
 ```python
-g.v().out("score", func=lambda x: int(x) > 5).inc().all()
+g.v().out("score").filter(func=lambda x: int(x) > 5).inc().all()
 ```
 > {'result': [{'id': 'alice'}, {'id': 'dani'}, {'id': 'greg'}]}
 
 ```python
-g.v("emily").out("follows", func=lambda x: x.startswith("f")).all()
+g.v("emily").out("follows").filter(func=lambda x: x.startswith("f")).all()
 ```
 > {'result': [{'id': 'fred'}]}
 
@@ -190,6 +188,43 @@ f.v().has('name','fred').out('follows').all()
 In JSON, CogDB treats the `_id` property as a unique identifier for each object. If `_id` is not provided, a randomly generated `_id` is created for each object within a JSON object.
 The `_id` field is used to update a JSON object; see the example below.
 
+## Using word embeddings
+
+CogDB supports word embeddings. Word embeddings are a way to represent words as vectors. Word embeddings are useful for many NLP tasks. 
+There are various types of word embeddings, including popular ones like [GloVe](https://nlp.stanford.edu/projects/glove/) and [FastText](https://fasttext.cc/).
+
+#### Add a word embedding:
+
+```python
+g.put_embedding("orange", [0.1, 0.2, 0.3, 0.4, 0.5])
+```
+
+#### Get a word embedding:
+
+```python
+g.get_embedding("orange")
+```
+
+> [0.1, 0.2, 0.3, 0.4, 0.5]
+
+#### Delete a word embedding:
+
+```python
+g.delete_embedding("orange")
+```
+
+#### Use word embeddings in a query:
+
+```python 
+g.v().sim('orange', '>', 0.35).all()
+```
+> {'result': [{'id': 'clementines'}, {'id': 'tangerine'}, {'id': 'orange'}]}
+
+```python
+g.v().sim('orange', 'in', [0.25, 0.35]).all()
+```
+> {'result': [{'id': 'banana'}, {'id': 'apple'}]}
+
+In the above code, the `sim` method filters vertices by the cosine similarity between each vertex's word embedding and the word embedding for "orange". The operator argument selects the comparison and the threshold argument supplies the value to compare against: a single number for `=`, `>`, `<`, `>=` and `<=`, or a two-element list `[low, high]` (both ends inclusive) for `in`.
 
 ## Loading data from a file
 
@@ -228,42 +263,6 @@ g = Graph(graph_name="people")
 g.load_edgelist("/path/to/edgelist", "people")
 ```
 
-## Low level key-value store API:
-Every record inserted into Cog's key-value store is directly persisted on to disk. It stores and retrieves data based 
-on hash values of the keys, it can perform fast look ups (O(1) avg) and fast (O(1) avg) inserts.
-
-```python
-
-from cog.database import Cog
-
-cogdb = Cog('path/to/dbdir')
-
-# create a namespace
-cogdb.create_or_load_namespace("my_namespace")
-
-# create new table
-cogdb.create_table("new_db", "my_namespace")
-
-# put some data
-cogdb.put(('key', 'val'))
-
-# retrieve data 
-cogdb.get('key')
-
-# put some more data
-cogdb.put(('key2', 'val2'))
-
-# scan
-scanner = cogdb.scanner()
-for r in scanner:
- print
- r
-
-# delete data
-cogdb.delete('key1')
-
-```
-
 ## Config
 
 If no config is provided when creating a Cog instance, it will use the defaults:

diff --git a/cog/config.py b/cog/config.py
@@ -14,6 +14,7 @@
 ''' TORQUE '''
 GRAPH_NODE_SET_TABLE_NAME = 'TOR_NODE_SET'
 GRAPH_EDGE_SET_TABLE_NAME = 'TOR_EDGE_SET'
+EMBEDDING_SET_TABLE_NAME = 'EMBEDDING_SET'
 
 ''' CUSTOM COG DB PATH '''
 CUSTOM_COG_DB_PATH = None

diff --git a/cog/core.py b/cog/core.py
@@ -243,7 +243,7 @@ def put(self, key, store_position, store):
 
 
  def get_index(self, key):
- num = self.cog_hash(key) % ((sys.maxsize + 1) * 2)
+ num = cog_hash(key, self.config.INDEX_CAPACITY) % ((sys.maxsize + 1) * 2)
  self.logger.debug("hash for: " + key + " : " + str(num))
  # there may be diff when using mem slice vs write (+1 needed)
  index = (self.config.INDEX_BLOCK_LEN *
@@ -489,3 +489,7 @@ def delete(self, key, store):
  return True
  else:
  return False
+
+
+def cog_hash(string, index_capacity):
+    """
+    Hashes a string and reduces the result modulo index_capacity.
+
+    The digest comes from xxhash.xxh32 with a fixed seed of 2, so the same
+    string and capacity always produce the same value.
+    """
+    digest = xxhash.xxh32(string, seed=2).intdigest()
+    return digest % index_capacity
diff --git a/cog/database.py b/cog/database.py
@@ -207,8 +207,7 @@ def use_table(self, name):
  return self
 
  def put(self, data):
- assert type(data.key) is str, "Only string type is supported."
- assert type(data.value) is str, "Only string type is supported."
+ assert type(data.key) is str, "key must be a string."
  position = self.current_table.store.save(data)
  self.current_table.indexer.put(data.key, position, self.current_table.store)
 

diff --git a/cog/torque.py b/cog/torque.py
@@ -1,5 +1,6 @@
 from cog.database import Cog
 from cog.database import in_nodes, out_nodes, hash_predicate
+from cog.core import cog_hash, Record
 import json
 import logging
 from logging.config import dictConfig
@@ -9,6 +10,8 @@
 from os import listdir
 import time
 import random
+from math import isclose
+import warnings
 
 NOTAG = "NOTAG"
 
@@ -35,7 +38,6 @@ def __str__(self):
 
 
 class BlankNode(object):
-
  ID_PREFIX = "_id_"
 
  def __init__(self, label=None):
@@ -51,7 +53,7 @@ def __str__(self):
  @classmethod
  def is_id(cls, label):
  # print("--- > is_id", label)
- return label.startswith("_:"+BlankNode.ID_PREFIX)
+ return label.startswith("_:" + BlankNode.ID_PREFIX)
 
 
 class Graph:
@@ -235,6 +237,8 @@ def update(self, vertex1, predicate, vertex2):
  return self
 
  def v(self, vertex=None, func=None):
+ if func:
+ warnings.warn("The use of func is deprecated, please use filter instead.", DeprecationWarning)
  if vertex is not None:
  if isinstance(vertex, list):
  self.last_visited_vertices = [Vertex(v) for v in vertex]
@@ -258,6 +262,7 @@ def out(self, predicates=None, func=None):
  '''
 
  if func:
+ warnings.warn("The use of func is deprecated, please use filter instead.", DeprecationWarning)
  assert callable(func), "func must be a lambda. Example: func = lambda d: int(d) > 5"
  assert not isinstance(predicates, list), "func cannot be used with a list of predicates"
 
@@ -280,6 +285,7 @@ def inc(self, predicates=None, func=None):
  '''
 
  if func:
+ warnings.warn("The use of func is deprecated, please use filter instead.", DeprecationWarning)
  assert callable(func), "func must be a lambda. Example: func = lambda d: int(d) > 5"
  assert not isinstance(predicates, list), "func cannot be used with a list of predicates"
 
@@ -412,6 +418,102 @@ def __hop(self, direction, predicates=None, func=None):
  traverse_vertex.append(v_adjacent_obj)
  self.last_visited_vertices = traverse_vertex
 
+    def filter(self, func):
+        '''
+        Applies a filter function to the vertices and removes any vertices that do not pass the filter.
+
+        :param func: callable that is given a vertex id and returns a truthy value when the vertex should be kept.
+        :return: self, to allow method chaining.
+        '''
+        # Build a new list instead of calling remove() on the list being iterated:
+        # removing during iteration shifts the remaining items left, so the vertex
+        # right after every removed one was never tested and could wrongly survive.
+        self.last_visited_vertices = [v for v in self.last_visited_vertices if func(v.id)]
+        return self
+
+    def sim(self, word, operator, threshold, strict=False):
+        """
+        Applies cosine similarity filter to the vertices and removes any vertices that do not pass the filter.
+
+        Parameters:
+        -----------
+        word: str
+            The word to compare to the other vertices.
+        operator: str
+            The comparison operator to use. One of "=", "==", ">", "<", ">=", "<=", or "in".
+            "=" and "==" are equivalent; both compare with math.isclose rather than exact equality.
+        threshold: float or list of 2 floats
+            The threshold value(s) to use for the comparison. If operator is "=", "==", ">", "<", ">=", or "<=",
+            threshold should be a number. If operator is "in", threshold should be a list of 2 numbers
+            [low, high]; a vertex passes when low <= similarity <= high.
+        strict: bool, optional
+            If True, raises an exception if a word embedding is not found for either word. If False, assigns a
+            similarity of 0.0 to any word embedding that is not found.
+
+        Returns:
+        --------
+        self
+            Returns self to allow for method chaining.
+
+        Raises:
+        -------
+        ValueError:
+            If operator is not a valid comparison operator or if threshold is not a valid threshold value for the
+            given operator.
+            If strict is True and a word embedding is not found for either word.
+        """
+        if not isinstance(threshold, (float, int, list)):
+            raise ValueError("Invalid threshold value: {}".format(threshold))
+
+        # Validate the operator once, up front, so an invalid operator is reported
+        # even when there are no vertices to filter.
+        if operator not in ('=', '==', '>', '<', '>=', '<=', 'in'):
+            raise ValueError("Invalid operator: {}".format(operator))
+
+        if operator == 'in':
+            if not isinstance(threshold, list) or len(threshold) != 2:
+                raise ValueError("Invalid threshold value: {}".format(threshold))
+            if not all(isinstance(t, (float, int)) for t in threshold):
+                raise ValueError("Invalid threshold value: {}".format(threshold))
+        elif isinstance(threshold, list):
+            # Scalar operators cannot be compared against a list.
+            raise ValueError("Invalid threshold value: {}".format(threshold))
+
+        filtered_vertices = []
+        for v in self.last_visited_vertices:
+            similarity = self.__cosine_similarity(word, v.id)
+            # Test for None explicitly: a similarity of exactly 0.0 is a valid result
+            # and must not be mistaken for a missing embedding.
+            if similarity is None:
+                # similarity is None if a word embedding is not found for either word.
+                if strict:
+                    raise ValueError("Missing word embedding for either '{}' or '{}'".format(word, v.id))
+                # Treat vertices without word embeddings as if they have no similarity to any other vertex.
+                similarity = 0.0
+
+            if operator in ('=', '=='):
+                passes = isclose(similarity, threshold)
+            elif operator == '>':
+                passes = similarity > threshold
+            elif operator == '<':
+                passes = similarity < threshold
+            elif operator == '>=':
+                passes = similarity >= threshold
+            elif operator == '<=':
+                passes = similarity <= threshold
+            else:
+                # operator == 'in' (the only remaining validated value)
+                passes = threshold[0] <= similarity <= threshold[1]
+
+            if passes:
+                filtered_vertices.append(v)
+        self.last_visited_vertices = filtered_vertices
+        return self
+
+    def __cosine_similarity(self, word1, word2):
+        """
+        Returns the cosine similarity between the stored embeddings of two words.
+
+        Returns None when get_embedding returns None for either word.
+        Returns 0.0 when either embedding has zero magnitude, since the cosine is undefined there.
+        Raises ValueError when the two embeddings have different lengths.
+        """
+        x = self.get_embedding(word1)
+        y = self.get_embedding(word2)
+
+        if x is None or y is None:
+            return None
+
+        # Vectors of different dimensions have no meaningful cosine similarity; indexing by
+        # len(x) alone would either raise IndexError or silently ignore the tail of y.
+        if len(x) != len(y):
+            raise ValueError("Embedding length mismatch for '{}' ({}) and '{}' ({})".format(
+                word1, len(x), word2, len(y)))
+
+        dot_product = sum(a * b for a, b in zip(x, y))
+        x_norm = sum(a ** 2 for a in x) ** (1 / 2)
+        y_norm = sum(b ** 2 for b in y) ** (1 / 2)
+
+        # Guard the division: an all-zero (or empty) embedding has zero norm.
+        if x_norm == 0 or y_norm == 0:
+            return 0.0
+        return dot_product / (x_norm * y_norm)
+
  def tag(self, tag_name):
  '''
  Saves vertices with a tag name. Used to capture vertices while traversing a graph.
@@ -475,6 +577,34 @@ def lsv(self):
  def get_new_graph_instance(self):
  return Graph(self.graph_name, self.config.COG_HOME, self.config.COG_PATH_PREFIX)
 
+    def put_embedding(self, word, embedding):
+        """
+        Stores an embedding for a word in this graph's embedding table.
+
+        The record key is the string form of cog_hash(word, INDEX_CAPACITY), not the word itself.
+        NOTE(review): cog_hash reduces its digest modulo INDEX_CAPACITY, so two different words can
+        produce the same record key - confirm that sharing a key between words is acceptable here.
+        """
+        assert isinstance(word, str), "word must be a string"
+        embedding_store = self.cog.use_namespace(self.graph_name).use_table(
+            self.config.EMBEDDING_SET_TABLE_NAME)
+        record_key = str(cog_hash(word, self.config.INDEX_CAPACITY))
+        embedding_store.put(Record(record_key, embedding))
+
+    def get_embedding(self, word):
+        """
+        Looks up the embedding stored for a word in this graph's embedding table.
+
+        Returns the stored record's value, or None when no record is found.
+        NOTE(review): the lookup key is str(cog_hash(word, INDEX_CAPACITY)); because that hash is
+        reduced modulo INDEX_CAPACITY, different words can share a key - confirm this is intended.
+        """
+        assert isinstance(word, str), "word must be a string"
+        embedding_store = self.cog.use_namespace(self.graph_name).use_table(
+            self.config.EMBEDDING_SET_TABLE_NAME)
+        record_key = str(cog_hash(word, self.config.INDEX_CAPACITY))
+        found = embedding_store.get(record_key)
+        return None if found is None else found.value
+
+    def delete_embedding(self, word):
+        """
+        Removes the embedding record for a word from this graph's embedding table.
+
+        NOTE(review): the record is addressed by str(cog_hash(word, INDEX_CAPACITY)); because that
+        hash is reduced modulo INDEX_CAPACITY, different words can share a key - confirm this is intended.
+        """
+        assert isinstance(word, str), "word must be a string"
+        embedding_store = self.cog.use_namespace(self.graph_name).use_table(
+            self.config.EMBEDDING_SET_TABLE_NAME)
+        record_key = str(cog_hash(word, self.config.INDEX_CAPACITY))
+        embedding_store.delete(record_key)
+
 
 class View(object):
 

diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 
 setup(name='cogdb',
- version='3.0.4',
+ version='3.0.5',
  description='Persistent Embedded Graph Database',
  url='http://github.com/arun1729/cog',
  author='Arun Mahendra',