Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(java): create ivf pq index via Java #2103

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions java/lance-jni/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ crate-type = ["cdylib"]
[dependencies]
lance.workspace = true
lance-io.workspace = true
lance-index.workspace = true
lance-linalg.workspace = true
arrow = { workspace = true, features = ["ffi"] }
arrow-schema.workspace = true
datafusion.workspace = true
Expand Down
99 changes: 99 additions & 0 deletions java/lance-jni/src/index.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// Copyright 2024 Lance Developers.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Dataset Indexing.

use jni::objects::{JObject, JString};
use jni::sys::jboolean;
use jni::{sys::jint, JNIEnv};
use lance::index::vector::VectorIndexParams;
use lance_index::{DatasetIndexExt, IndexType};
use lance_linalg::distance::DistanceType;
use snafu::{location, Location};

use crate::blocking_dataset::NATIVE_DATASET;
use crate::error::{Error, Result};
use crate::{BlockingDataset, RT};

fn parse_distance_type(val: i32) -> Result<DistanceType> {
match val {
1 => Ok(DistanceType::L2),
2 => Ok(DistanceType::Cosine),
3 => Ok(DistanceType::Dot),
_ => Err(Error::Index {
message: format!("invalid distance type: {val}"),
location: location!(),
}),
}
}

#[no_mangle]
pub extern "system" fn Java_com_lancedb_lance_index_IndexBuilder_createIvfPQ(
mut env: JNIEnv,
_builder: JObject,
jdataset: JObject,
column: JString,
num_partitions: jint,
num_sub_vectors: jint,
distance_type: jint,
replace: jboolean,
) {
let mut dataset = {
unsafe { env.get_rust_field::<_, _, BlockingDataset>(jdataset, NATIVE_DATASET) }
.expect("Failed to get native dataset handle")
.clone()
};

let column: String = if let Ok(js) = env.get_string(&column) {
js.into()
} else {
env.throw_new("java/lang/IllegalArgumentException", "Invalid column name")
.expect("Failed to throw exception");
return;
};

let Ok(distance_type) = parse_distance_type(distance_type) else {
env.throw_new(
"java/lang/IllegalArgumentException",
format!("Invalid distance type: {distance_type}"),
)
.expect("Failed to throw exception");
return;
};

let params = VectorIndexParams::ivf_pq(
num_partitions as usize,
8,
num_sub_vectors as usize,
false,
distance_type,
50,
);

let res = RT
.block_on(dataset.inner.create_index(
&[&column],
IndexType::Vector,
None,
&params,
replace == 1,
))
.map_err(|e| Error::Index {
message: e.to_string(),
location: location!(),
});
if let Err(e) = res {
e.throw(&mut env);
}
}
1 change: 1 addition & 0 deletions java/lance-jni/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ mod blocking_dataset;
pub mod error;
mod ffi;
mod fragment;
mod index;
mod traits;
mod utils;

Expand Down
34 changes: 25 additions & 9 deletions java/src/main/java/com/lancedb/lance/Dataset.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

package com.lancedb.lance;

import com.lancedb.lance.index.IndexBuilder;
import io.questdb.jar.jni.JarJniLoader;
import java.io.Closeable;
import java.io.IOException;
Expand All @@ -25,9 +26,12 @@
import org.apache.arrow.memory.BufferAllocator;

/**
* Class representing a Lance dataset, interfacing with the native lance library. This class
* provides functionality to open and manage datasets with native code. The native library is loaded
* statically and utilized through native methods. It implements the {@link java.io.Closeable}
* Class representing a Lance dataset, interfacing with the native lance
* library. This class
* provides functionality to open and manage datasets with native code. The
* native library is loaded
* statically and utilized through native methods. It implements the
* {@link java.io.Closeable}
* interface to ensure proper resource management.
*/
public class Dataset implements Closeable {
Expand All @@ -39,13 +43,14 @@ public class Dataset implements Closeable {

BufferAllocator allocator;

private Dataset() {}
private Dataset() {
}

/**
* Write a dataset to the specified path.
*
* @param stream arrow stream
* @param path dataset uri
* @param path dataset uri
* @param params write parameters
* @return Dataset
*/
Expand All @@ -62,7 +67,7 @@ private static native Dataset writeWithFfiStream(long arrowStreamMemoryAddress,
/**
* Open a dataset from the specified path.
*
* @param path file path
* @param path file path
* @param allocator Arrow buffer allocator.
* @return Dataset
*/
Expand Down Expand Up @@ -93,7 +98,8 @@ public static Dataset open(String path, BufferAllocator allocator) throws IOExce
* @return A list of {@link Fragment}.
*/
public List<Fragment> getFragments() {
// Set a pointer in Fragment to dataset, to make it is easier to issue IOs later.
// Set a pointer in Fragment to dataset, to make it is easier to issue IOs
// later.
//
// We do not need to close Fragments.
return Arrays.stream(this.getFragmentsIds())
Expand All @@ -103,8 +109,17 @@ public List<Fragment> getFragments() {

private native int[] getFragmentsIds();

public IndexBuilder createIndex(String column) {
return createIndex(List.of(column));
}

public IndexBuilder createIndex(List<String> columns) {
return new IndexBuilder(this, columns);
}

/**
* Closes this dataset and releases any system resources associated with it. If the dataset is
* Closes this dataset and releases any system resources associated with it. If
* the dataset is
* already closed, then invoking this method has no effect.
*/
@Override
Expand All @@ -116,7 +131,8 @@ public void close() {
}

/**
* Native method to release the Lance dataset resources associated with the given handle.
* Native method to release the Lance dataset resources associated with the
* given handle.
*
* @param handle The native handle to the dataset resource.
*/
Expand Down
32 changes: 32 additions & 0 deletions java/src/main/java/com/lancedb/lance/index/DistanceType.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.lancedb.lance.index;

/** Distance Type. How to calculate distance between two vectors. */
public enum DistanceType {
L2(1),
COSINE(2),
DOT(3);

private final int value;

DistanceType(int value) {
this.value = value;
}

int getValue() {
return value;
}
}
124 changes: 124 additions & 0 deletions java/src/main/java/com/lancedb/lance/index/IndexBuilder.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.lancedb.lance.index;

import com.lancedb.lance.Dataset;
import java.io.IOException;
import java.util.List;
import java.util.Optional;

/** Builder for creating Index on lance dataset. */
public class IndexBuilder {
private final Dataset dataset;
private final List<String> columns;

private Optional<IndexParams> params;

private DistanceType distanceType = DistanceType.L2;

/** Set true to replace existing index. */
private boolean replace = false;

/** Constructor. */
public IndexBuilder(Dataset dataset, List<String> columns) {
this.dataset = dataset;
this.columns = columns;
}

/** Set true to replace existing index. */
public IndexBuilder replace() {
return this.replace(true);
}

/** Set true to replace existing index. */
public IndexBuilder replace(boolean flag) {
this.replace = flag;
return this;
}

/**
* Build Ivf_PQ index.
*
* @param numPartitions Number of IVF Partitions.
* @param numSubVectors Number of PQ sub-vectors.
*/
public IndexBuilder ivfPq(int numPartitions, int numSubVectors) throws IOException {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is buildIvfPqIndex() a better method name?

if (this.params.isPresent()) {
throw new IOException("A different index parameter already set.");
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it might be better to throw a runtime exception or IllegalState exception, because it's not something from IO, what do you think?

}
this.params = Optional.of(new IvfPqParams(numPartitions, numSubVectors));
return this;
}

/**
* Set the distance type.
*
* <p>Default type is {@link DistanceType.L2}
*/
public IndexBuilder distanceType(DistanceType dt) {
this.distanceType = dt;
return this;
}

/** Build a Scalar index. */
public IndexBuilder scalar() throws IOException {
if (this.params.isPresent()) {
throw new IOException("A different index parameter already set.");
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto, IOException might be not appropriate here

}
this.params = Optional.of(new ScalarParams());
return this;
}

/** Build the index. */
public void build() throws IOException {
if (this.params.isEmpty()) {
throw new IOException("Index parameters are not set");
}
var params = this.params.get();
if (params instanceof IvfPqParams) {
if (columns.size() != 1) {
throw new IOException("Can only create IVF_PQ on one column, got: " + columns);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto

}
createIvfPq(
dataset,
columns.get(0),
((IvfPqParams) params).getNumPartitions(),
((IvfPqParams) params).getNumSubVectors(),
distanceType.getValue(),
this.replace);

} else {
throw new IOException("Unsupported Index Parameter");
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto

}
}

/**
* Explicitly call create ivf pq with primitive parameters for simplicity.
*
* @param dataset the dataset instance.
* @param column the column name.
* @param numPartitions the number of IVF Partition.
* @param numSubVectors the number of PQ sub vectors.
* @param replace whether to replace existing index.
*/
private native void createIvfPq(
Dataset dataset,
String column,
int numPartitions,
int numSubVectors,
int distanceType,
boolean replace)
throws IOException;
}
18 changes: 18 additions & 0 deletions java/src/main/java/com/lancedb/lance/index/IndexParams.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.lancedb.lance.index;

/** Interface for Index Parameters. */
interface IndexParams {}