-
Notifications
You must be signed in to change notification settings - Fork 175
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(java): create ivf pq index via Java #2103
base: main
Are you sure you want to change the base?
Changes from all commits
fc5817c
1676825
b2dc507
92c3386
c710b21
c45b07d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
// Copyright 2024 Lance Developers. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
//! Dataset Indexing. | ||
|
||
use jni::objects::{JObject, JString}; | ||
use jni::sys::jboolean; | ||
use jni::{sys::jint, JNIEnv}; | ||
use lance::index::vector::VectorIndexParams; | ||
use lance_index::{DatasetIndexExt, IndexType}; | ||
use lance_linalg::distance::DistanceType; | ||
use snafu::{location, Location}; | ||
|
||
use crate::blocking_dataset::NATIVE_DATASET; | ||
use crate::error::{Error, Result}; | ||
use crate::{BlockingDataset, RT}; | ||
|
||
fn parse_distance_type(val: i32) -> Result<DistanceType> { | ||
match val { | ||
1 => Ok(DistanceType::L2), | ||
2 => Ok(DistanceType::Cosine), | ||
3 => Ok(DistanceType::Dot), | ||
_ => Err(Error::Index { | ||
message: format!("invalid distance type: {val}"), | ||
location: location!(), | ||
}), | ||
} | ||
} | ||
|
||
#[no_mangle] | ||
pub extern "system" fn Java_com_lancedb_lance_index_IndexBuilder_createIvfPQ( | ||
mut env: JNIEnv, | ||
_builder: JObject, | ||
jdataset: JObject, | ||
column: JString, | ||
num_partitions: jint, | ||
num_sub_vectors: jint, | ||
distance_type: jint, | ||
replace: jboolean, | ||
) { | ||
let mut dataset = { | ||
unsafe { env.get_rust_field::<_, _, BlockingDataset>(jdataset, NATIVE_DATASET) } | ||
.expect("Failed to get native dataset handle") | ||
.clone() | ||
}; | ||
|
||
let column: String = if let Ok(js) = env.get_string(&column) { | ||
js.into() | ||
} else { | ||
env.throw_new("java/lang/IllegalArgumentException", "Invalid column name") | ||
.expect("Failed to throw exception"); | ||
return; | ||
}; | ||
|
||
let Ok(distance_type) = parse_distance_type(distance_type) else { | ||
env.throw_new( | ||
"java/lang/IllegalArgumentException", | ||
format!("Invalid distance type: {distance_type}"), | ||
) | ||
.expect("Failed to throw exception"); | ||
return; | ||
}; | ||
|
||
let params = VectorIndexParams::ivf_pq( | ||
num_partitions as usize, | ||
8, | ||
num_sub_vectors as usize, | ||
false, | ||
distance_type, | ||
50, | ||
); | ||
|
||
let res = RT | ||
.block_on(dataset.inner.create_index( | ||
&[&column], | ||
IndexType::Vector, | ||
None, | ||
¶ms, | ||
replace == 1, | ||
)) | ||
.map_err(|e| Error::Index { | ||
message: e.to_string(), | ||
location: location!(), | ||
}); | ||
if let Err(e) = res { | ||
e.throw(&mut env); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,6 +52,7 @@ mod blocking_dataset; | |
pub mod error; | ||
mod ffi; | ||
mod fragment; | ||
mod index; | ||
mod traits; | ||
mod utils; | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
/* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.lancedb.lance.index; | ||
|
||
/** Distance Type. How to calculate distance between two vectors. */ | ||
public enum DistanceType { | ||
L2(1), | ||
COSINE(2), | ||
DOT(3); | ||
|
||
private final int value; | ||
|
||
DistanceType(int value) { | ||
this.value = value; | ||
} | ||
|
||
int getValue() { | ||
return value; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
/* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.lancedb.lance.index; | ||
|
||
import com.lancedb.lance.Dataset; | ||
import java.io.IOException; | ||
import java.util.List; | ||
import java.util.Optional; | ||
|
||
/** Builder for creating Index on lance dataset. */ | ||
public class IndexBuilder { | ||
private final Dataset dataset; | ||
private final List<String> columns; | ||
|
||
private Optional<IndexParams> params; | ||
|
||
private DistanceType distanceType = DistanceType.L2; | ||
|
||
/** Set true to replace existing index. */ | ||
private boolean replace = false; | ||
|
||
/** Constructor. */ | ||
public IndexBuilder(Dataset dataset, List<String> columns) { | ||
this.dataset = dataset; | ||
this.columns = columns; | ||
} | ||
|
||
/** Set true to replace existing index. */ | ||
public IndexBuilder replace() { | ||
return this.replace(true); | ||
} | ||
|
||
/** Set true to replace existing index. */ | ||
public IndexBuilder replace(boolean flag) { | ||
this.replace = flag; | ||
return this; | ||
} | ||
|
||
/** | ||
* Build Ivf_PQ index. | ||
* | ||
* @param numPartitions Number of IVF Partitions. | ||
* @param numSubVectors Number of PQ sub-vectors. | ||
*/ | ||
public IndexBuilder ivfPq(int numPartitions, int numSubVectors) throws IOException { | ||
if (this.params.isPresent()) { | ||
throw new IOException("A different index parameter already set."); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it might be better to throw a runtime exception or IllegalState exception, because it's not something from IO, what do you think? |
||
} | ||
this.params = Optional.of(new IvfPqParams(numPartitions, numSubVectors)); | ||
return this; | ||
} | ||
|
||
/** | ||
* Set the distance type. | ||
* | ||
* <p>Default type is {@link DistanceType.L2} | ||
*/ | ||
public IndexBuilder distanceType(DistanceType dt) { | ||
this.distanceType = dt; | ||
return this; | ||
} | ||
|
||
/** Build a Scalar index. */ | ||
public IndexBuilder scalar() throws IOException { | ||
if (this.params.isPresent()) { | ||
throw new IOException("A different index parameter already set."); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto, IOException might be not appropriate here |
||
} | ||
this.params = Optional.of(new ScalarParams()); | ||
return this; | ||
} | ||
|
||
/** Build the index. */ | ||
public void build() throws IOException { | ||
if (this.params.isEmpty()) { | ||
throw new IOException("Index parameters are not set"); | ||
} | ||
var params = this.params.get(); | ||
if (params instanceof IvfPqParams) { | ||
if (columns.size() != 1) { | ||
throw new IOException("Can only create IVF_PQ on one column, got: " + columns); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto |
||
} | ||
createIvfPq( | ||
dataset, | ||
columns.get(0), | ||
((IvfPqParams) params).getNumPartitions(), | ||
((IvfPqParams) params).getNumSubVectors(), | ||
distanceType.getValue(), | ||
this.replace); | ||
|
||
} else { | ||
throw new IOException("Unsupported Index Parameter"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto |
||
} | ||
} | ||
|
||
/** | ||
* Explicitly call create ivf pq with primitive parameters for simplicity. | ||
* | ||
* @param dataset the dataset instance. | ||
* @param column the column name. | ||
* @param numPartitions the number of IVF Partition. | ||
* @param numSubVectors the number of PQ sub vectors. | ||
* @param replace whether to replace existing index. | ||
*/ | ||
private native void createIvfPq( | ||
Dataset dataset, | ||
String column, | ||
int numPartitions, | ||
int numSubVectors, | ||
int distanceType, | ||
boolean replace) | ||
throws IOException; | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
/* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.lancedb.lance.index; | ||
|
||
/** Interface for Index Parameters. */ | ||
interface IndexParams {} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is
buildIvfPqIndex()
a better method name?