Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement full text search #4416

Open
wants to merge 16 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.15)

project(Kuzu VERSION 0.6.0.6 LANGUAGES CXX C)
project(Kuzu VERSION 0.6.0.7 LANGUAGES CXX C)

option(SINGLE_THREADED "Single-threaded mode" FALSE)
if(SINGLE_THREADED)
Expand Down
15 changes: 8 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ allconfig:
$(call config-cmake-release, \
-DBUILD_BENCHMARK=TRUE \
-DBUILD_EXAMPLES=TRUE \
-DBUILD_EXTENSIONS="httpfs;duckdb;json;postgres;sqlite" \
-DBUILD_EXTENSIONS="httpfs;duckdb;json;postgres;sqlite;fts" \
-DBUILD_JAVA=TRUE \
-DBUILD_NODEJS=TRUE \
-DBUILD_PYTHON=TRUE \
Expand All @@ -98,7 +98,7 @@ alldebug:
$(call run-cmake-debug, \
-DBUILD_BENCHMARK=TRUE \
-DBUILD_EXAMPLES=TRUE \
-DBUILD_EXTENSIONS="httpfs;duckdb;json;postgres;sqlite" \
-DBUILD_EXTENSIONS="httpfs;duckdb;json;postgres;sqlite;fts" \
-DBUILD_JAVA=TRUE \
-DBUILD_NODEJS=TRUE \
-DBUILD_PYTHON=TRUE \
Expand Down Expand Up @@ -186,7 +186,7 @@ example:

extension-test-build:
$(call run-cmake-release, \
-DBUILD_EXTENSIONS="httpfs;duckdb;json;postgres;sqlite" \
-DBUILD_EXTENSIONS="httpfs;duckdb;json;postgres;sqlite;fts" \
-DBUILD_EXTENSION_TESTS=TRUE \
)

Expand All @@ -207,13 +207,13 @@ extension-json-test: extension-json-test-build

extension-debug:
$(call run-cmake-debug, \
-DBUILD_EXTENSIONS="httpfs;duckdb;json;postgres;sqlite" \
-DBUILD_EXTENSIONS="httpfs;duckdb;json;postgres;sqlite;fts" \
-DBUILD_KUZU=FALSE \
)

extension-release:
$(call run-cmake-release, \
-DBUILD_EXTENSIONS="httpfs;duckdb;json;postgres;sqlite" \
-DBUILD_EXTENSIONS="httpfs;duckdb;json;postgres;sqlite;fts" \
-DBUILD_KUZU=FALSE \
)

Expand All @@ -230,11 +230,11 @@ shell-test:
# parallelism.
tidy: | allconfig java_native_header
run-clang-tidy -p build/release -quiet -j $(NUM_THREADS) \
"^$(realpath src)|$(realpath extension)|$(realpath tools)/(?!shell/linenoise.cpp)"
"^$(realpath src)|$(realpath extension)/(?!fts/third_party/snowball/)|$(realpath tools)/(?!shell/linenoise.cpp)"

tidy-analyzer: | allconfig java_native_header
run-clang-tidy -config-file .clang-tidy-analyzer -p build/release -quiet -j $(NUM_THREADS) \
"^$(realpath src)|$(realpath extension)|$(realpath tools)/(?!shell/linenoise.cpp)"
"^$(realpath src)|$(realpath extension)/(?!fts/third_party/snowball/)|$(realpath tools)/(?!shell/linenoise.cpp)"

clangd-diagnostics: | allconfig java_native_header
find src -name *.h -or -name *.cpp | xargs \
Expand All @@ -253,6 +253,7 @@ clean-extension:
cmake -E rm -rf extension/duckdb/build
cmake -E rm -rf extension/postgres/build
cmake -E rm -rf extension/sqlite/build
cmake -E rm -rf extension/fts/build

clean-python-api:
cmake -E rm -rf tools/python_api/build
Expand Down
1 change: 1 addition & 0 deletions dataset/fts-small/copy.cypher
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
COPY doc from "dataset/fts-small/vDoc.csv";
1 change: 1 addition & 0 deletions dataset/fts-small/schema.cypher
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CREATE NODE TABLE doc (ID UINT64, content STRING, author STRING, name STRING, PRIMARY KEY (ID))
3 changes: 3 additions & 0 deletions dataset/fts-small/vDoc.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
0,alice studys in waterloo and toronto,alice,toronto
acquamarin marked this conversation as resolved.
Show resolved Hide resolved
3,bob is studying in vancouver,alice,vancouver
20,dan has not study at any places,bob,waterloo
1 change: 1 addition & 0 deletions dataset/ms-passage/copy.cypher
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
COPY doc from "dataset/ms-passage/vDoc.csv";
1 change: 1 addition & 0 deletions dataset/ms-passage/schema.cypher
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CREATE NODE TABLE doc (ID UINT64, content STRING, PRIMARY KEY (ID))
500 changes: 500 additions & 0 deletions dataset/ms-passage/vDoc.csv

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions extension/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ if ("sqlite" IN_LIST BUILD_EXTENSIONS)
endif ()
endif ()

if ("fts" IN_LIST BUILD_EXTENSIONS)
add_subdirectory(fts)
endif ()

if (${BUILD_EXTENSION_TESTS})
include_directories(${CMAKE_SOURCE_DIR}/third_party/spdlog)
add_definitions(-DTEST_FILES_DIR="extension")
Expand Down
27 changes: 27 additions & 0 deletions extension/fts/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Build script for the fts extension shared library.

# Header search paths: the core headers, the extension headers, and the libstemmer headers
# that ship under third_party/snowball.
include_directories(
        ${PROJECT_SOURCE_DIR}/src/include
        src/include
        third_party/snowball/libstemmer)

# src/ publishes the object files of the extension through FTS_OBJECT_FILES.
add_subdirectory(src)

add_library(fts_extension
        SHARED
        ${FTS_OBJECT_FILES})

# The snowball target linked below is expected to be defined in this subdirectory.
add_subdirectory(third_party/snowball)

target_link_libraries(fts_extension
        PRIVATE
        snowball)

set_extension_properties(fts_extension fts)

if (WIN32)
    # See comment in extension/httpfs/CMakeLists.txt
    target_link_libraries(fts_extension PRIVATE kuzu)
endif ()

if (APPLE)
    set_apple_dynamic_lookup(fts_extension)
endif ()
9 changes: 9 additions & 0 deletions extension/fts/src/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# The function/ subdirectory is processed first; the set() below appends to whatever value
# of FTS_OBJECT_FILES is visible in this scope afterwards.
add_subdirectory(function)

# Object library holding the extension entry point (fts_extension.cpp).
add_library(fts_extension_main
OBJECT
fts_extension.cpp)

# Append the objects of this directory and hand the list up to the parent directory scope.
set(FTS_OBJECT_FILES
${FTS_OBJECT_FILES} $<TARGET_OBJECTS:fts_extension_main>
PARENT_SCOPE)
38 changes: 38 additions & 0 deletions extension/fts/src/fts_extension.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#include "fts_extension.h"

#include "catalog/catalog_entry/catalog_entry_type.h"
#include "function/create_fts_index.h"
#include "function/drop_fts_index.h"
#include "function/fts.h"
#include "function/query_fts_index.h"
#include "function/stem.h"
#include "main/client_context.h"
#include "main/database.h"

namespace kuzu {
namespace fts_extension {

// Registers the FTS functions with the database that owns the given client context:
// the stem scalar function, the FTS GDS function, the create and drop index standalone call
// functions, and the query index table function.
void FTSExtension::load(main::ClientContext* context) {
auto& db = *context->getDatabase();
// NOTE(review): the ADD_* macros are defined outside this file; presumably they refer to the
// local variable db - confirm before renaming it.
ADD_SCALAR_FUNC(StemFunction);
ADD_GDS_FUNC(FTSFunction);
db.addStandaloneCallFunction(CreateFTSFunction::name, CreateFTSFunction::getFunctionSet());
db.addTableFunction(QueryFTSFunction::name, QueryFTSFunction::getFunctionSet());
db.addStandaloneCallFunction(DropFTSFunction::name, DropFTSFunction::getFunctionSet());
}

} // namespace fts_extension
} // namespace kuzu

extern "C" {
// KUZU_API cannot be used for the export below: on Windows the extension links against the
// static library and therefore implicitly inherits KUZU_STATIC_DEFINE, which cancels out any
// exporting. The export attribute is spelled out per platform instead.
#if !defined(_WIN32)
#define INIT_EXPORT __attribute__((visibility("default")))
#else
#define INIT_EXPORT __declspec(dllexport)
#endif
// Exported C-linkage entry point of the extension; forwards the client context to
// FTSExtension::load.
INIT_EXPORT void init(kuzu::main::ClientContext* context) {
    using kuzu::fts_extension::FTSExtension;
    FTSExtension::load(context);
}
}
12 changes: 12 additions & 0 deletions extension/fts/src/function/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Object library with the sources of the FTS functions in this directory.
add_library(kuzu_fts_function
OBJECT
stem.cpp
create_fts_index.cpp
query_fts_index.cpp
drop_fts_index.cpp
fts.cpp
fts_utils.cpp)

# Append the objects of this directory and hand the list up to the parent directory scope.
set(FTS_OBJECT_FILES
${FTS_OBJECT_FILES} $<TARGET_OBJECTS:kuzu_fts_function>
PARENT_SCOPE)
170 changes: 170 additions & 0 deletions extension/fts/src/function/create_fts_index.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
#include "function/create_fts_index.h"

#include "binder/ddl/bound_alter_info.h"
#include "binder/expression/expression_util.h"
#include "catalog/catalog.h"
#include "common/exception/binder.h"
#include "common/types/value/nested.h"
#include "fts_extension.h"
#include "function/fts_utils.h"
#include "function/table/bind_input.h"

namespace kuzu {
namespace fts_extension {

using namespace kuzu::common;
using namespace kuzu::main;
using namespace kuzu::function;

struct CreateFTSBindData final : public StandaloneTableFuncBindData {
std::string tableName;
std::string indexName;
std::vector<std::string> properties;

CreateFTSBindData(std::string tableName, std::string indexName,
std::vector<std::string> properties)
: StandaloneTableFuncBindData{}, tableName{std::move(tableName)},
indexName{std::move(indexName)}, properties{std::move(properties)} {}

std::unique_ptr<TableFuncBindData> copy() const override {
return std::make_unique<CreateFTSBindData>(tableName, indexName, properties);
}
};

// Ensures that the node table entry does not already hold an index called indexName.
// Returns normally when the name is free; otherwise throws a BinderException that names both
// the index and the table.
static void validateIndexNotExist(const catalog::NodeTableCatalogEntry& entry,
    const std::string& indexName) {
    if (!entry.containsIndex(indexName)) {
        return;
    }
    throw common::BinderException{common::stringFormat("Index: {} already exists in table: {}.",
        indexName, entry.getName())};
}

// Converts the children of the nested value `properties` into a list of property names and
// checks each one against the node table entry.
// Returns the names in the order in which they appear in `properties`.
// Throws a BinderException for the first name that is not a property of the table.
static std::vector<std::string> bindProperties(const catalog::NodeTableCatalogEntry& entry,
    const common::Value& properties) {
    const auto numProperties = properties.getChildrenSize();
    std::vector<std::string> result;
    // The final size is known up front, so allocate once instead of growing per element.
    result.reserve(numProperties);
    for (auto i = 0u; i < numProperties; i++) {
        auto propertyName = NestedVal::getChildVal(&properties, i)->toString();
        if (!entry.containsProperty(propertyName)) {
            throw BinderException{common::stringFormat("Property: {} does not exist in table {}.",
                propertyName, entry.getName())};
        }
        result.push_back(std::move(propertyName));
    }
    return result;
}

// Bind step of the create-FTS-index function. The inputs are read positionally:
// inputs[0] is handed to FTSUtils::bindTable to resolve the node table, inputs[1] is the index
// name, and inputs[2] is the nested value holding the property names.
// The checks run in this order: table binding, property binding, then the duplicate index
// check. Each of them reports failure by throwing.
static std::unique_ptr<TableFuncBindData> bindFunc(ClientContext* context,
    ScanTableFuncBindInput* input) {
    auto& args = input->inputs;
    auto indexName = args[1].toString();
    auto& tableEntry =
        FTSUtils::bindTable(args[0], context, indexName, FTSUtils::IndexOperation::CREATE);
    auto boundProperties = bindProperties(tableEntry, args[2]);
    validateIndexNotExist(tableEntry, indexName);
    return std::make_unique<CreateFTSBindData>(tableEntry.getName(), indexName,
        std::move(boundProperties));
}

// Builds the text of the statements that materialise an FTS index and returns it as a single
// string. bindData must be a CreateFTSBindData; its table name, index name and property list
// drive the generated statements.
//
// Before any statement text is produced, this function
//   1. commits through the transaction context of the client context,
//   2. begins a write auto transaction and registers the index on the table entry via
//      Catalog::alterTableEntry with an ADD_INDEX alter info,
//   3. commits again and begins a read-only auto transaction.
//
// The returned string then contains, in order:
//   - a CREATE MACRO statement for tokenize, only when the catalog has no macro of that name,
//   - a stopwords node table plus one CREATE statement per entry of FTSExtension::STOP_WORDS,
//   - a terms_in_doc node table, filled by one COPY statement per indexed property,
//   - a docs node table, a dict node table, a terms rel table from dict to docs, and a stats
//     node table, each filled by a COPY statement.
// Every generated table name is prefixed with the table name and the index name joined by an
// underscore.
std::string createFTSIndexQuery(ClientContext& context, const TableFuncBindData& bindData) {
auto createFTSBindData = bindData.constPtrCast<CreateFTSBindData>();
auto tableName = createFTSBindData->tableName;
auto indexName = createFTSBindData->indexName;
auto properties = createFTSBindData->properties;
// Alter info that adds an index entry named indexName to the table named tableName.
binder::BoundAlterInfo boundAlterInfo{common::AlterType::ADD_INDEX, tableName,
std::make_unique<binder::BoundExtraIndexInfo>(indexName)};
// TODO(Ziyi): Copy statement can't be wrapped in manual transaction, so we can't wrap all
// statements in a single transaction there.
context.getTransactionContext()->commit();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you committing something on behalf of the user? You don't know if the user wants to do that. We should never do this. Instead, we need to error if the user has started a manual transaction. So we need a mechanism to tell whether the current function is being called inside a manual transaction or auto transaction. And if it is manual, we should error saying something like: "You cannot create an FTS index inside a manual transaction. CREATE_FTS_INDEX statements must be standalone queries."

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For transaction related issues, i think it is better to leave them after we support copy in manual transaction.

context.getTransactionContext()->beginAutoTransaction(false /* readOnly */);
context.getCatalog()->alterTableEntry(context.getTx(), std::move(boundAlterInfo));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is this alterTableEntry call doing? Why do you have to manually alter the catalog? Let's talk about this.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have to create an index entry in nodetables, that's what the alterTableEntry is doing.

context.getTransactionContext()->commit();
context.getTransactionContext()->beginAutoTransaction(true /* readOnly */);
auto tablePrefix = common::stringFormat("{}_{}", tableName, indexName);
// Create the tokenize macro.
std::string query = "";
if (!context.getCatalog()->containsMacro(context.getTx(), "tokenize")) {
query += R"(CREATE MACRO tokenize(query) AS
string_split(lower(regexp_replace(
CAST(query as STRING),
'[0-9!@#$%^&*()_+={}\\[\\]:;<>,.?~\\\\/\\|\'"`-]+',
' ',
'g')), ' ');)";
}

// Create the stop words table.
query += common::stringFormat("CREATE NODE TABLE {}_stopwords (sw STRING, PRIMARY KEY(sw));",
tablePrefix);
for (auto i = 0u; i < FTSExtension::NUM_STOP_WORDS; i++) {
query += common::stringFormat("CREATE (s:{}_stopwords {sw: \"{}\"});", tablePrefix,
FTSExtension::STOP_WORDS[i]);
}

// Create the terms_in_doc table which servers as a temporary table to store the relationship
// between terms and docs.
query += common::stringFormat(
"CREATE NODE TABLE {}_terms_in_doc (ID SERIAL, term string, docID INT64, primary "
"key(ID));",
tablePrefix);
for (auto& property : properties) {
query += common::stringFormat("COPY {}_terms_in_doc FROM "
"(MATCH (b:{}) "
"WITH tokenize(b.{}) AS tk, OFFSET(ID(b)) AS id "
"UNWIND tk AS t "
"WITH t AS t1, id AS id1 "
"WHERE t1 is NOT NULL AND SIZE(t1) > 0 AND "
"NOT EXISTS {MATCH (s:{}_stopwords {sw: t1})} "
"RETURN STEM(t1, 'porter'), id1);",
acquamarin marked this conversation as resolved.
Show resolved Hide resolved
tablePrefix, tableName, property, tablePrefix);
}
// Create the docs table which records the number of words in each document.
query += common::stringFormat(
"CREATE NODE TABLE {}_docs (docID INT64, len UINT64, primary key(docID));", tablePrefix);
query += common::stringFormat("COPY {}_docs FROM "
"(MATCH (t:{}_terms_in_doc) "
"RETURN t.docID, CAST(count(t) AS UINT64) "
"ORDER BY t.docID);",
tablePrefix, tablePrefix);
// Create the dic table which records all distinct terms and their document frequency.
query += common::stringFormat(
"CREATE NODE TABLE {}_dict (term STRING, df UINT64, PRIMARY KEY(term));", tablePrefix);
query += common::stringFormat("COPY {}_dict FROM "
"(MATCH (t:{}_terms_in_doc) "
"RETURN t.term, CAST(count(distinct t.docID) AS UINT64));",
tablePrefix, tablePrefix);
// Finally, create a terms table that records the documents in which the terms appear, along
// with the frequency of each term.
query += common::stringFormat(
"CREATE REL TABLE {}_terms (FROM {}_dict TO {}_docs, tf UINT64, MANY_MANY);", tablePrefix,
tablePrefix, tablePrefix);
query += common::stringFormat("COPY {}_terms FROM ("
"MATCH (b:{}_terms_in_doc) "
"RETURN b.term, b.docID, CAST(count(*) as UINT64));",
tablePrefix, tablePrefix);
// Stats table records the number of documents and the average document length.
query += common::stringFormat(
"CREATE NODE TABLE {}_stats (ID SERIAL, num_docs UINT64, avg_dl DOUBLE, PRIMARY KEY(ID));",
tablePrefix);
query += common::stringFormat("COPY {}_stats FROM (MATCH (d:{}_docs) "
"RETURN CAST(count(d) AS UINT64), "
"CAST(SUM(d.len) AS DOUBLE) / CAST(COUNT(d.len) AS DOUBLE));",
tablePrefix, tablePrefix);
return query;
}

// Execution callback passed to the TableFunction constructor for the create-FTS-index
// function. Both arguments are ignored; the body is marked unreachable.
static common::offset_t tableFunc(TableFuncInput&, TableFuncOutput&) {
    KU_UNREACHABLE;
}

// Builds the function set of the create-FTS-index function. The set holds a single
// TableFunction that takes (STRING, STRING, LIST) arguments, is bound by bindFunc, and has
// createFTSIndexQuery installed as its rewrite function.
function_set CreateFTSFunction::getFunctionSet() {
    function_set functionSet;
    auto func = std::make_unique<TableFunction>(name, tableFunc, bindFunc, initSharedState,
        initEmptyLocalState,
        std::vector<LogicalTypeID>{LogicalTypeID::STRING, LogicalTypeID::STRING,
            LogicalTypeID::LIST});
    func->rewriteFunc = createFTSIndexQuery;
    // The function reports that it cannot run in parallel.
    func->canParallelFunc = []() { return false; };
    functionSet.push_back(std::move(func));
    return functionSet;
}

} // namespace fts_extension
} // namespace kuzu
Loading
Loading