From 2821fafc854abcfb279e1a48573ab3dd5d0ebaeb Mon Sep 17 00:00:00 2001 From: my-vegetable-has-exploded Date: Tue, 19 Mar 2024 04:54:57 +0000 Subject: [PATCH] rm perf.sh & refactor examples. Signed-off-by: my-vegetable-has-exploded --- bindings/python/examples/perf.sh | 5 - .../python/examples/psycopg_copy_dense.py | 57 +++++++++ .../python/examples/psycopg_copy_example.py | 117 ------------------ .../python/examples/psycopg_copy_sparse.py | 52 ++++++++ 4 files changed, 109 insertions(+), 122 deletions(-) delete mode 100644 bindings/python/examples/perf.sh create mode 100644 bindings/python/examples/psycopg_copy_dense.py delete mode 100644 bindings/python/examples/psycopg_copy_example.py create mode 100644 bindings/python/examples/psycopg_copy_sparse.py diff --git a/bindings/python/examples/perf.sh b/bindings/python/examples/perf.sh deleted file mode 100644 index 26baa4bba..000000000 --- a/bindings/python/examples/perf.sh +++ /dev/null @@ -1,5 +0,0 @@ -/bin/python3 /home/envd/pgvecto.rs/bindings/python/examples/psycopg_testcpy.py & -PID=$(ps aux | grep "/usr/lib/postgresql/15/bin/postgres" | grep -v grep | awk '{print $2}') -# use perf to collect data -sudo /usr/lib/linux-tools/5.15.0-97-generic/perf record -e cpu-clock -F 1000 -g -p $PID -- sleep 30 -sudo /usr/lib/linux-tools/5.15.0-97-generic/perf script -i perf.data > insert.out.perf \ No newline at end of file diff --git a/bindings/python/examples/psycopg_copy_dense.py b/bindings/python/examples/psycopg_copy_dense.py new file mode 100644 index 000000000..c711910f9 --- /dev/null +++ b/bindings/python/examples/psycopg_copy_dense.py @@ -0,0 +1,57 @@ +import os + +import numpy as np +import psycopg + +from pgvecto_rs.psycopg import register_vector + +URL = "postgresql://{username}:{password}@{host}:{port}/{db_name}".format( + port=os.getenv("DB_PORT", "5432"), + host=os.getenv("DB_HOST", "localhost"), + username=os.getenv("DB_USER", "postgres"), + password=os.getenv("DB_PASS", "mysecretpassword"), + db_name=os.getenv("DB_NAME", "postgres"), +) + +# Connect to the DB and init things +with psycopg.connect(URL) as conn: + conn.execute("CREATE EXTENSION IF NOT EXISTS vectors;") + register_vector(conn) + conn.execute("DROP TABLE IF EXISTS documents;") + conn.execute( + "CREATE TABLE documents (id SERIAL PRIMARY KEY, embedding vector(3) NOT NULL);", + ) + conn.commit() + try: + embeddings = [ + np.array([1, 2, 3]), + np.array([1.0, 2.0, 4.0]), + np.array([1, 3, 4]), + ] + + with conn.cursor() as cursor, cursor.copy( + "COPY documents (embedding) FROM STDIN (FORMAT BINARY)" + ) as copy: + # write row by row + for e in embeddings: + copy.write_row([e]) + copy.write_row([[1, 3, 5]]) + conn.commit() + + # Select the rows using binary format + cur = conn.execute( + "SELECT * FROM documents;", + binary=True, + ) + for row in cur.fetchall(): + print(row[0], ": ", row[1]) + + # output will be: + # 1 : [1.0, 2.0, 3.0] + # 2 : [1.0, 2.0, 4.0] + # 3 : [1.0, 3.0, 4.0] + # 4 : [1.0, 3.0, 5.0] + finally: + # Drop the table + conn.execute("DROP TABLE IF EXISTS documents;") + conn.commit() diff --git a/bindings/python/examples/psycopg_copy_example.py b/bindings/python/examples/psycopg_copy_example.py deleted file mode 100644 index 6543e2b8b..000000000 --- a/bindings/python/examples/psycopg_copy_example.py +++ /dev/null @@ -1,117 +0,0 @@ -import os - -import numpy as np -import pandas as pd -import psycopg -import pyarrow as pa -import pyarrow.parquet as pq - -from pgvecto_rs.psycopg import register_vector -from pgvecto_rs.types import SparseVector - -URL = 
"postgresql://{username}:{password}@{host}:{port}/{db_name}".format( - port=os.getenv("DB_PORT", "5432"), - host=os.getenv("DB_HOST", "localhost"), - username=os.getenv("DB_USER", "postgres"), - password=os.getenv("DB_PASS", "mysecretpassword"), - db_name=os.getenv("DB_NAME", "postgres"), -) - - -def copy_by_row(conn: psycopg.Connection): - conn.execute("DROP TABLE IF EXISTS documents;") - conn.execute( - "CREATE TABLE documents (id SERIAL PRIMARY KEY, embedding vector(3) NOT NULL);", - ) - conn.commit() - try: - # create a parquet file , and write row into it. - table = pa.Table.from_pandas( - pd.DataFrame( - { - "embedding": [ - np.array([1, 2, 3]), - np.array([1.0, 2.0, 4.0]), - np.array([1, 3, 4]), - ] - } - ) - ) - pq.write_table(table, "test.parquet") - - # load vectors from parquet file - table = pq.read_table("test.parquet") - # TODO: Is there a better way to convert pyarrow table to numpy array to reduce copy overhead? - embeddings = table.column("embedding").to_numpy() - - with conn.cursor() as cursor, cursor.copy( - "COPY documents (embedding) FROM STDIN (FORMAT BINARY)" - ) as copy: - # write row by row - for e in embeddings: - copy.write_row([e]) - copy.write_row([np.array([1, 3, 5])]) - conn.commit() - - # Select the rows using binary format - cur = conn.execute( - "SELECT * FROM documents;", - binary=True, - ) - for row in cur.fetchall(): - print(row[0], ": ", row[1]) - - # output will be: - # 1 : [1.0, 2.0, 3.0] - # 2 : [1.0, 2.0, 4.0] - # 3 : [1.0, 3.0, 4.0] - # 4 : [1.0, 3.0, 5.0] - finally: - # Drop the table - conn.execute("DROP TABLE IF EXISTS documents;") - conn.commit() - - -def copy_sparse_by_row(conn: psycopg.Connection): - conn.execute("DROP TABLE IF EXISTS documents;") - conn.execute( - "CREATE TABLE documents (id SERIAL PRIMARY KEY, embedding svector NOT NULL);", - ) - conn.commit() - try: - with conn.cursor() as cursor, cursor.copy( - "COPY documents (embedding) FROM STDIN (FORMAT BINARY)" - ) as copy: - copy.write_row([SparseVector(3, [0, 2], [1.0, 3.0])]) - copy.write_row([SparseVector(3, np.array([0, 1, 2]), [1.0, 2.0, 3.0])]) - copy.write_row([SparseVector(3, np.array([1, 2]), np.array([2.0, 3.0]))]) - conn.commit() - - # Select the rows using binary format - cur = conn.execute( - "SELECT * FROM documents;", - binary=True, - ) - for row in cur.fetchall(): - print(row[0], ": ", row[1]) - - # output will be: - # 1 : [1.0, 0.0, 3.0] - # 2 : [1.0, 2.0, 3.0] - # 3 : [0.0, 2.0, 3.0] - finally: - # Drop the table - conn.execute("DROP TABLE IF EXISTS documents;") - conn.commit() - - -# Connect to the DB and init things -with psycopg.connect(URL) as conn: - conn.execute("CREATE EXTENSION IF NOT EXISTS vectors;") - register_vector(conn) - - # example for vectorf32 - copy_by_row(conn) - - # example for sparse vector - copy_sparse_by_row(conn) diff --git a/bindings/python/examples/psycopg_copy_sparse.py b/bindings/python/examples/psycopg_copy_sparse.py new file mode 100644 index 000000000..861a71658 --- /dev/null +++ b/bindings/python/examples/psycopg_copy_sparse.py @@ -0,0 +1,52 @@ +import os + +import numpy as np +import psycopg + +from pgvecto_rs.psycopg import register_vector +from pgvecto_rs.types import SparseVector + +URL = "postgresql://{username}:{password}@{host}:{port}/{db_name}".format( + port=os.getenv("DB_PORT", "5432"), + host=os.getenv("DB_HOST", "localhost"), + username=os.getenv("DB_USER", "postgres"), + password=os.getenv("DB_PASS", "mysecretpassword"), + db_name=os.getenv("DB_NAME", "postgres"), +) + + +# Connect to the DB and init things +with 
psycopg.connect(URL) as conn: + conn.execute("CREATE EXTENSION IF NOT EXISTS vectors;") + register_vector(conn) + conn.execute("DROP TABLE IF EXISTS documents;") + conn.execute( + "CREATE TABLE documents (id SERIAL PRIMARY KEY, embedding svector NOT NULL);", + ) + conn.commit() + try: + with conn.cursor() as cursor, cursor.copy( + "COPY documents (embedding) FROM STDIN (FORMAT BINARY)" + ) as copy: + copy.write_row([SparseVector(3, [0, 2], [1.0, 3.0])]) + copy.write_row([SparseVector(3, np.array([0, 1, 2]), [1.0, 2.0, 3.0])]) + copy.write_row([SparseVector(3, np.array([1, 2]), np.array([2.0, 3.0]))]) + conn.pgconn.flush() + conn.commit() + + # Select the rows using binary format + cur = conn.execute( + "SELECT * FROM documents;", + binary=True, + ) + for row in cur.fetchall(): + print(row[0], ": ", row[1]) + + # output will be: + # 1 : [1.0, 0.0, 3.0] + # 2 : [1.0, 2.0, 3.0] + # 3 : [0.0, 2.0, 3.0] + finally: + # Drop the table + conn.execute("DROP TABLE IF EXISTS documents;") + conn.commit()
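
Note: both new examples load rows with binary COPY. For a handful of vectors a plain parameterized INSERT is enough, and the loaded data can be queried back with a distance operator. The sketch below reuses the same URL and connection setup as the examples; it is only a sketch, and it assumes that the dumpers registered by register_vector also adapt query parameters and that `<->` is the squared-Euclidean distance operator of the vectors extension. The `items` table name and the probe vector are illustrative, not part of this patch.

    import os

    import numpy as np
    import psycopg

    from pgvecto_rs.psycopg import register_vector

    URL = "postgresql://{username}:{password}@{host}:{port}/{db_name}".format(
        port=os.getenv("DB_PORT", "5432"),
        host=os.getenv("DB_HOST", "localhost"),
        username=os.getenv("DB_USER", "postgres"),
        password=os.getenv("DB_PASS", "mysecretpassword"),
        db_name=os.getenv("DB_NAME", "postgres"),
    )

    with psycopg.connect(URL) as conn:
        conn.execute("CREATE EXTENSION IF NOT EXISTS vectors;")
        register_vector(conn)
        conn.execute("DROP TABLE IF EXISTS items;")
        conn.execute(
            "CREATE TABLE items (id SERIAL PRIMARY KEY, embedding vector(3) NOT NULL);",
        )
        conn.commit()
        try:
            # Plain INSERT path: the registered dumpers are assumed to adapt
            # numpy arrays to the vector type for %s parameters.
            with conn.cursor() as cursor:
                cursor.executemany(
                    "INSERT INTO items (embedding) VALUES (%s);",
                    [
                        (np.array([1, 2, 3]),),
                        (np.array([1.0, 2.0, 4.0]),),
                        (np.array([1, 3, 4]),),
                    ],
                )
            conn.commit()

            # Query the rows nearest to a probe vector (assumes `<->` is the
            # squared-Euclidean operator provided by the vectors extension).
            cur = conn.execute(
                "SELECT id, embedding FROM items ORDER BY embedding <-> %s LIMIT 2;",
                (np.array([1.0, 2.0, 3.0]),),
                binary=True,
            )
            for row in cur.fetchall():
                print(row[0], ": ", row[1])
        finally:
            # Drop the table
            conn.execute("DROP TABLE IF EXISTS items;")
            conn.commit()

For bulk loads the binary COPY path used by the new examples is the faster option; the INSERT path above is mainly useful as a quick check that the vector adapters are registered correctly.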