Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add cosine_distance for vector similarity compute #10737

Merged
merged 13 commits into from
Mar 24, 2023
38 changes: 38 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ members = [
"src/common/tracing",
"src/common/storage",
"src/common/profile",
"src/common/vector",
# Query
"src/query/ast",
"src/query/codegen",
Expand Down
1 change: 1 addition & 0 deletions src/common/base/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,6 @@ pub mod containers;
pub mod mem_allocator;
pub mod rangemap;
pub mod runtime;

pub use runtime::match_join_handle;
pub use runtime::set_alloc_error_hook;
23 changes: 23 additions & 0 deletions src/common/vector/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
[package]
name = "common-vector"
version = { workspace = true }
authors = { workspace = true }
license = { workspace = true }
publish = { workspace = true }
edition = { workspace = true }

[lib]
doctest = false
test = false

[dependencies] # In alphabetical order
common-exception = { path = "../exception" }

ndarray = "0.15.6"

[build-dependencies]

[features]

[dev-dependencies]
approx = "0.5.1"
34 changes: 34 additions & 0 deletions src/common/vector/src/distance.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Copyright 2023 Datafuse Labs.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use common_exception::ErrorCode;
use common_exception::Result;
use ndarray::ArrayView;

/// Computes the cosine of the angle between two equal-length `f32` vectors.
///
/// Despite the name, the value returned is the cosine *similarity*
/// `dot(from, to) / (|from| * |to|)`; it is not `1 - similarity`.
///
/// The division is not guarded: if either input has zero norm (which includes
/// two empty slices) the result is `NaN`.
///
/// # Errors
///
/// Returns `ErrorCode::InvalidArgument` when `from` and `to` differ in length.
pub fn cosine_distance(from: &[f32], to: &[f32]) -> Result<f32> {
    let (from_len, to_len) = (from.len(), to.len());
    if from_len != to_len {
        return Err(ErrorCode::InvalidArgument(format!(
            "Vector length not equal: {:} != {:}",
            from_len, to_len,
        )));
    }

    let lhs = ArrayView::from(from);
    let rhs = ArrayView::from(to);

    // Element-wise products summed up: the dot product and both Euclidean norms.
    let dot = (&lhs * &rhs).sum();
    let lhs_norm = (&lhs * &lhs).sum().sqrt();
    let rhs_norm = (&rhs * &rhs).sum().sqrt();

    Ok(dot / (lhs_norm * rhs_norm))
}
17 changes: 17 additions & 0 deletions src/common/vector/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// Copyright 2023 Datafuse Labs.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

mod distance;

pub use distance::cosine_distance;
41 changes: 41 additions & 0 deletions src/common/vector/tests/it/distance.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Copyright 2023 Datafuse Labs.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use common_vector::cosine_distance;

#[test]
fn test_cosine() {
    // Case 1: two ascending ramps, 1..=8 against 100..=107.
    // The reference value was attributed to scipy.spatial.distance.cosine.
    // NOTE(review): 0.900957 is the cosine *similarity* of these inputs
    // (scipy's `cosine` reports 1 - similarity) — confirm the attribution.
    let ramp_low: Vec<f32> = (1u8..=8).map(f32::from).collect();
    let ramp_high: Vec<f32> = (100u8..=107).map(f32::from).collect();
    let ramp_result = cosine_distance(&ramp_low, &ramp_high).unwrap();
    approx::assert_relative_eq!(ramp_result, 0.900_957);

    // Case 2: arbitrary vectors; reference value from
    // sklearn.metrics.pairwise.cosine_similarity.
    let sample_a: [f32; 8] = [3.0, 45.0, 7.0, 2.0, 5.0, 20.0, 13.0, 12.0];
    let sample_b: [f32; 8] = [2.0, 54.0, 13.0, 15.0, 22.0, 34.0, 50.0, 1.0];
    let sample_result = cosine_distance(&sample_a, &sample_b).unwrap();
    approx::assert_relative_eq!(sample_result, 0.873_580_6);

    // Case 3: inputs of different lengths must be rejected with an error.
    let long_input: [f32; 8] = [3.0, 45.0, 7.0, 2.0, 5.0, 20.0, 13.0, 12.0];
    let short_input: [f32; 2] = [2.0, 54.0];
    assert!(cosine_distance(&long_input, &short_input).is_err());
}
15 changes: 15 additions & 0 deletions src/common/vector/tests/it/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Copyright 2023 Datafuse Labs.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

mod distance;
1 change: 1 addition & 0 deletions src/query/functions/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ common-exception = { path = "../../common/exception" }
common-expression = { path = "../expression" }
common-hashtable = { path = "../../common/hashtable" }
common-io = { path = "../../common/io" }
common-vector = { path = "../../common/vector" }
jsonb = { workspace = true }

# Crates.io dependencies
Expand Down
2 changes: 2 additions & 0 deletions src/query/functions/src/scalars/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ mod map;
mod math;
mod tuple;
mod variant;
mod vector;

mod comparison;
mod decimal;
Expand Down Expand Up @@ -55,4 +56,5 @@ pub fn register(registry: &mut FunctionRegistry) {
hash::register(registry);
other::register(registry);
decimal::register(registry);
vector::register(registry);
}
47 changes: 47 additions & 0 deletions src/query/functions/src/scalars/vector.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// Copyright 2023 Datafuse Labs.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use common_arrow::arrow::buffer::Buffer;
use common_expression::types::ArrayType;
use common_expression::types::Float32Type;
use common_expression::types::F32;
use common_expression::vectorize_with_builder_2_arg;
use common_expression::FunctionDomain;
use common_expression::FunctionRegistry;
use common_vector::cosine_distance;

/// Registers the vector scalar functions on `registry`.
///
/// Currently this adds `cosine_distance(Array(Float32), Array(Float32)) -> Float32`
/// through the passthrough-nullable two-argument registration. The domain is
/// declared as `MayThrow` because the underlying computation can fail.
pub fn register(registry: &mut FunctionRegistry) {
    registry.register_passthrough_nullable_2_arg::<ArrayType<Float32Type>, ArrayType<Float32Type>, Float32Type, _, _>(
        "cosine_distance",
        |_, _| FunctionDomain::MayThrow,
        vectorize_with_builder_2_arg::<ArrayType<Float32Type>, ArrayType<Float32Type>, Float32Type>(
            |lhs, rhs, output, ctx| {
                // SAFETY: NOTE(review) — these transmutes assume `Buffer<F32>` and
                // `Buffer<f32>` share the same layout, i.e. that `F32` is a
                // transparent wrapper around `f32`. Confirm against the
                // definitions of `F32` and `Buffer`.
                let lhs_raw = unsafe { std::mem::transmute::<Buffer<F32>, Buffer<f32>>(lhs) };
                let rhs_raw = unsafe { std::mem::transmute::<Buffer<F32>, Buffer<f32>>(rhs) };

                // On failure, record the error against the row about to be written
                // (`output.len()` before the push) and still push a 0.0 placeholder.
                let value = cosine_distance(lhs_raw.as_slice(), rhs_raw.as_slice())
                    .unwrap_or_else(|err| {
                        ctx.set_error(output.len(), err.to_string());
                        0.0
                    });
                output.push(F32::from(value));
            },
        ),
    );
}
1 change: 1 addition & 0 deletions src/query/functions/tests/it/scalars/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ mod regexp;
mod string;
mod tuple;
mod variant;
mod vector;

pub fn run_ast(file: &mut impl Write, text: impl AsRef<str>, columns: &[(&str, Column)]) {
let text = text.as_ref();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1098,6 +1098,8 @@ Functions overloads:
28 contains(Array(T0), T0) :: Boolean
0 cos(Float64) :: Float64
1 cos(Float64 NULL) :: Float64 NULL
0 cosine_distance(Array(Float32), Array(Float32)) :: Float32
1 cosine_distance(Array(Float32) NULL, Array(Float32) NULL) :: Float32 NULL
0 cot(Float64) :: Float64
1 cot(Float64 NULL) :: Float64 NULL
0 crc32(String) :: UInt32
Expand Down
23 changes: 23 additions & 0 deletions src/query/functions/tests/it/scalars/testdata/vector.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
ast : cosine_distance([a], [b])
raw expr : cosine_distance(array(a::Float32), array(b::Float32))
checked expr : cosine_distance<Array(Float32), Array(Float32)>(array<T0=Float32><T0>(a), array<T0=Float32><T0>(b))
evaluation:
+--------+---------+---------+---------+
| | a | b | Output |
+--------+---------+---------+---------+
| Type | Float32 | Float32 | Float32 |
| Domain | {0..=2} | {3..=5} | Unknown |
| Row 0 | 0 | 3 | NaN |
| Row 1 | 1 | 4 | 1 |
| Row 2 | 2 | 5 | 1 |
+--------+---------+---------+---------+
evaluation (internal):
+--------+----------------------+
| Column | Data |
+--------+----------------------+
| a | Float32([0, 1, 2]) |
| b | Float32([3, 4, 5]) |
| Output | Float32([NaN, 1, 1]) |
+--------+----------------------+


36 changes: 36 additions & 0 deletions src/query/functions/tests/it/scalars/vector.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Copyright 2023 Datafuse Labs.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::io::Write;

use common_expression::types::*;
use common_expression::FromData;
use goldenfile::Mint;

use super::run_ast;

/// Golden-file driver: runs the vector function cases and records their output
/// in `tests/it/scalars/testdata/vector.txt`.
#[test]
fn test_vector() {
    let mut mint = Mint::new("tests/it/scalars/testdata");
    let mut golden = mint.new_goldenfile("vector.txt").unwrap();

    test_vector_cosine_distance(&mut golden);
}

/// Evaluates `cosine_distance([a], [b])` over two three-row `Float32` columns
/// and writes the result to `file` via `run_ast`.
fn test_vector_cosine_distance(file: &mut impl Write) {
    // Each row becomes a pair of one-element arrays; the first row has a = 0.
    let columns = [
        ("a", Float32Type::from_data(vec![0f32, 1.0, 2.0])),
        ("b", Float32Type::from_data(vec![3f32, 4.0, 5.0])),
    ];

    run_ast(file, "cosine_distance([a], [b])", &columns);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Expected value is the reference result from sklearn.metrics.pairwise.cosine_similarity
query F
select cosine_distance([3.0, 45.0, 7.0, 2.0, 5.0, 20.0, 13.0, 12.0], [2.0, 54.0, 13.0, 15.0, 22.0, 34.0, 50.0, 1.0]) as sim
----
0.8735807