Skip to content

Commit

Permalink
Tokenizer (#92)
Browse files Browse the repository at this point in the history
  • Loading branch information
coilysiren committed Oct 23, 2023
1 parent 59bf008 commit 0e54131
Show file tree
Hide file tree
Showing 24 changed files with 626 additions and 76 deletions.
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -5,3 +5,4 @@ data/sort_by_*
data/sorted_by_*
src/rust/target/
__pycache__/
**/.mypy_cache/
3 changes: 2 additions & 1 deletion data/sql_input_2.sql
Expand Up @@ -14,8 +14,9 @@ VALUES ('San Francisco', 852469, -8);
INSERT INTO city (name, population, timezone)
VALUES ('New York', 8405837, -5);

SELECT
SELECT (
name,
population,
timezone
)
FROM city;
3 changes: 2 additions & 1 deletion data/sql_input_3.sql
Expand Up @@ -14,8 +14,9 @@ VALUES ('San Francisco', -8);
INSERT INTO city (name, population)
VALUES ('New York', 8405837);

SELECT
SELECT (
name,
population,
timezone
)
FROM city;
22 changes: 22 additions & 0 deletions data/tokenizer_input_0.sql
@@ -0,0 +1,22 @@
CREATE TABLE town ();

CREATE TABLE city (
name VARCHAR,
population INT,
timezone INT
);

INSERT INTO city (name, population, timezone)
VALUES ('San Francisco', 852469, -8);

INSERT INTO city (name, population)
VALUES ('New York', 8405837);

SELECT (
name,
population,
timezone
)
FROM city;

SELECT name FROM city;
1 change: 1 addition & 0 deletions data/tokenizer_input_1.sql
@@ -0,0 +1 @@
CREATE TABLE town ();
1 change: 1 addition & 0 deletions data/tokenizer_input_2.sql
@@ -0,0 +1 @@
SELECT name FROM city;
2 changes: 2 additions & 0 deletions data/tokenizer_input_3.sql
@@ -0,0 +1,2 @@
INSERT INTO city (name, population, timezone)
VALUES ('San Francisco', 852469, -8);
2 changes: 2 additions & 0 deletions data/tokenizer_input_4.sql
@@ -0,0 +1,2 @@
INSERT INTO items (type)
VALUES ('"d"r"u"g"s"');
6 changes: 6 additions & 0 deletions data/tokenizer_input_5.sql
@@ -0,0 +1,6 @@
SELECT (
name,
population,
timezone
)
FROM city;
5 changes: 5 additions & 0 deletions data/tokenizer_input_6.sql
@@ -0,0 +1,5 @@
CREATE TABLE city (
name VARCHAR,
population INT,
timezone INT
);
33 changes: 33 additions & 0 deletions data/tokenizer_output_0.json
@@ -0,0 +1,33 @@
[
{ "worker": "CREATE TABLE", "args": ["town", []] },
{
"worker": "CREATE TABLE",
"args": [
"city",
["name", "VARCHAR", "population", "INT", "timezone", "INT"]
]
},
{
"worker": "INSERT INTO",
"args": [
"city",
["name", "population", "timezone"],
"VALUES",
["'San Francisco'", "852469", "-8"]
]
},
{
"worker": "INSERT INTO",
"args": [
"city",
["name", "population"],
"VALUES",
["'New York'", "8405837"]
]
},
{
"worker": "SELECT",
"args": [["name", "population", "timezone"], "FROM", "city"]
},
{ "worker": "SELECT", "args": ["name", "FROM", "city"] }
]
6 changes: 6 additions & 0 deletions data/tokenizer_output_1.json
@@ -0,0 +1,6 @@
[
{
"worker": "CREATE TABLE",
"args": ["town", []]
}
]
6 changes: 6 additions & 0 deletions data/tokenizer_output_2.json
@@ -0,0 +1,6 @@
[
{
"worker": "SELECT",
"args": ["name", "FROM", "city"]
}
]
11 changes: 11 additions & 0 deletions data/tokenizer_output_3.json
@@ -0,0 +1,11 @@
[
{
"worker": "INSERT INTO",
"args": [
"city",
["name", "population", "timezone"],
"VALUES",
["'San Francisco'", "852469", "-8"]
]
}
]
6 changes: 6 additions & 0 deletions data/tokenizer_output_4.json
@@ -0,0 +1,6 @@
[
{
"worker": "INSERT INTO",
"args": ["items", ["type"], "VALUES", ["'\"d\"r\"u\"g\"s\"'"]]
}
]
6 changes: 6 additions & 0 deletions data/tokenizer_output_5.json
@@ -0,0 +1,6 @@
[
{
"worker": "SELECT",
"args": [["name", "population", "timezone"], "FROM", "city"]
}
]
9 changes: 9 additions & 0 deletions data/tokenizer_output_6.json
@@ -0,0 +1,9 @@
[
{
"worker": "CREATE TABLE",
"args": [
"city",
["name", "VARCHAR", "population", "INT", "timezone", "INT"]
]
}
]
146 changes: 146 additions & 0 deletions snippets/python/sql_script.py
@@ -0,0 +1,146 @@

import dataclasses
import json
import re
import typing

import tokenizer_script


@dataclasses.dataclass(frozen=True)
class SQLState:
    """Snapshot of every table's metadata and rows.

    ``state`` maps a table name to a dict that may hold two keys:
    ``"metadata"`` (a dict describing the table) and ``"rows"`` (a list of
    row dicts).

    The ``write_*`` methods never modify the instance they are called on.
    They copy the containers they touch and return a new ``SQLState``, so a
    previously obtained state keeps its contents.  Callers must use the
    returned value.
    """

    state: dict

    def read_table_meta(self, table_name: str) -> dict:
        """Return the metadata dict of *table_name*, or ``{}`` if absent."""
        return self.state.get(table_name, {}).get("metadata", {})

    def read_table_rows(self, table_name: str) -> list[dict]:
        """Return the row list of *table_name*, or ``[]`` if absent."""
        return self.state.get(table_name, {}).get("rows", [])

    def read_information_schema(self) -> list[dict]:
        """Return the metadata dict of every table that has metadata.

        Entries that only carry rows (no ``"metadata"`` key) are skipped
        instead of raising ``KeyError``.
        """
        return [
            table["metadata"]
            for table in self.state.values()
            if "metadata" in table
        ]

    def write_table_meta(self, table_name: str, data: dict) -> "SQLState":
        """Return a new state with *data* merged into the table's metadata.

        Existing metadata keys are kept unless *data* overrides them.
        """
        # Copy the table entry so the receiver's dict is left untouched.
        table = dict(self.state.get(table_name, {}))
        table["metadata"] = {**table.get("metadata", {}), **data}
        return self.__class__({**self.state, table_name: table})

    def write_table_rows(self, table_name: str, data: dict) -> "SQLState":
        """Return a new state with the row *data* appended to the table."""
        # Build a fresh row list; appending in place would also change
        # every earlier state that shares the list.
        table = dict(self.state.get(table_name, {}))
        table["rows"] = [*table.get("rows", []), data]
        return self.__class__({**self.state, table_name: table})


class SQLType:
    """Converters from raw SQL literal tokens to Python values."""

    @staticmethod
    def varchar(data) -> str:
        """Return *data* as text without surrounding whitespace or quotes.

        At most one leading and one trailing quote character (``'`` or
        ``"``) is removed; quotes inside the value are preserved.
        """
        text = str(data).strip()
        text = re.sub(r'^["\']', "", text)  # leading ' or "
        text = re.sub(r'["\']$', "", text)  # trailing ' or "
        return text

    @staticmethod
    def int(data) -> int:
        """Return *data* parsed as an integer.

        The value is converted to text first, as ``varchar`` does, so
        non-string inputs such as an ``int`` are accepted as well.

        Raises:
            ValueError: if the stripped text is not a valid integer.
        """
        # Inside a method body ``int`` resolves to the builtin, not to this
        # static method: class scope is not searched from function bodies.
        return int(str(data).strip())


class SQLFunctions:
    """Workers that each execute one tokenized SQL statement.

    Every worker receives the current ``SQLState`` followed by the
    statement's positional token arguments and returns a tuple of
    ``(output_rows, resulting_state)``.
    """

    @staticmethod
    def create_table(state: SQLState, *args, table_schema="public") -> typing.Tuple[list, SQLState]:
        """Register a table unless metadata for it already exists.

        ``args[0]`` is the table name.  ``args[1]`` is a flat list that
        alternates column name and column type; it may be empty.
        Always produces an empty output list.
        """
        table_name = args[0]
        column_tokens = args[1]

        # Pair each even-indexed name with the type that follows it.
        columns = {}
        if column_tokens:
            for idx in range(0, len(column_tokens), 2):
                columns[column_tokens[idx]] = column_tokens[idx + 1]

        if state.read_table_meta(table_name):
            # The table is already known: leave the state untouched.
            return ([], state)

        # NOTE: the key is spelled "colums"; insert_into reads it with the
        # same spelling, so the two must stay in sync.
        table_meta = {
            "table_name": table_name,
            "table_schema": table_schema,
            "colums": columns,
        }
        return ([], state.write_table_meta(table_name, table_meta))

    @staticmethod
    def insert_into(state: SQLState, *args) -> typing.Tuple[list, SQLState]:
        """Append one row to a table that has metadata.

        ``args[0]`` is the table name, ``args[1]`` the column names and
        ``args[3]`` the raw values (``args[2]`` is not read).  Each value is
        converted with the converter matching the column's declared type.
        When the table has no metadata the state is returned unchanged.
        Always produces an empty output list.
        """
        table_name = args[0]
        column_names = args[1]
        raw_values = args[3]
        raw_by_column = dict(zip(column_names, raw_values))

        converters = {
            "VARCHAR": SQLType.varchar,
            "INT": SQLType.int,
        }

        table_meta = state.read_table_meta(table_name)
        if not table_meta:
            return ([], state)

        row = {
            column: converters[table_meta["colums"][column]](raw)
            for column, raw in raw_by_column.items()
        }
        return ([], state.write_table_rows(table_name, row))

    @staticmethod
    def select(state: SQLState, *args) -> typing.Tuple[list, SQLState]:
        """Project the requested columns out of every row of a source.

        ``args[0]`` is either a list of column names or a single column
        name, and ``args[2]`` is the source name (``args[1]`` is not read).
        Columns missing from a row come back as ``None``.  The state is
        returned as received.
        """
        if isinstance(args[0], list):
            wanted = args[0]
        else:
            wanted = [args[0]]
        source = args[2]

        # `information_schema.tables` is a special case: it lists table
        # metadata instead of the rows of a stored table.
        records = (
            state.read_information_schema()
            if source == "information_schema.tables"
            else state.read_table_rows(source)
        )

        projected = [
            {column: record.get(column) for column in wanted}
            for record in records
        ]
        return (projected, state)


def run_sql(input_sql: list[str]) -> list[str]:
    """Execute the given SQL and return the last statement's output.

    The input is tokenized into statements, each of which is run by its
    worker function against the state left by the previous one.  Only the
    output of the final statement is kept.

    Returns:
        A one-element list holding that output serialized as JSON.  The
        serialized value is an empty list when no statement was run.
    """
    workers = {
        "CREATE TABLE": SQLFunctions.create_table,
        "INSERT INTO": SQLFunctions.insert_into,
        "SELECT": SQLFunctions.select,
    }
    tokenizer = tokenizer_script.SQLTokenizer(workers)

    current_state = SQLState(state={})
    last_output = []

    # Run the statements in order, threading the state through each worker.
    for statement in tokenizer.tokenize_sql(input_sql):
        last_output, current_state = statement.worker_func(current_state, *statement.args)

    return [json.dumps(last_output)]

0 comments on commit 0e54131

Please sign in to comment.