Skip to content

Commit

Permalink
Enable PECOS HNSW with command line interface
Browse files Browse the repository at this point in the history
  • Loading branch information
OctoberChang committed Oct 16, 2021
1 parent 4cfb9d5 commit 597724e
Show file tree
Hide file tree
Showing 5 changed files with 333 additions and 10 deletions.
14 changes: 10 additions & 4 deletions pecos/ann/hnsw/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,15 @@ X_tst = np.random.randn(1000, 100).astype(np.float32)
Note that the data type needs to be `np.float32`.

#### HNSW Training
Train the HNSW model (i.e., building the graph-based indexing data structure) with maximum number of threads available on your machine (`threads=0`):
Train the HNSW model (i.e., building the graph-based indexing data structure) with maximum number of threads available on your machine (`threads=-1`):
```python
from pecos.ann.hnsw import HNSW
train_params = HNSW.TrainParams(M=32, efC=300, metric_type="ip", threads=-1)
model = HNSW.train(X_trn, train_params=train_params)
model = HNSW.train(X_trn, train_params=train_params, pred_params=None)
```
Users are also welcome to train with the default parameters via
```python
model = HNSW.train(X_trn)
```

#### HNSW Save and Load
Expand All @@ -47,11 +51,13 @@ searchers = model.searchers_create(num_searcher=4)
Finally, we conduct ANN inference by passing the searchers to the HNSW model.
```python
pred_params = HNSW.PredParams(efS=100, topk=10)
indices, distances = model.predict(X_tst, pred_params=pred_params, searchers=searchers, ret_csr=False)
Yt_pred = model.predict(X_tst, pred_params=pred_params, searchers=searchers)
```
where `Yt_pred` is a `scipy.sparse.csr_matrix` in which the indices of each row are sorted by distance in ascending order.

Alternatively, it is also feasible to do inference without pre-allocating searchers, which may have larger overhead since it will **re-allocate** intermediate graph-searching variables for each query matrix `X_tst`.
```python
pred_params.threads = 2
indices, distances = model.predict(X_tst, pred_params=pred_params, ret_csr=False)
```
When `ret_csr=True`, the prediction function will return a single csr matrix that combines the indices and distances numpy arrays.
When `ret_csr=False`, the prediction function will return the indices and distances numpy arrays.
8 changes: 4 additions & 4 deletions pecos/ann/hnsw/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,14 @@ class TrainParams(pecos.BaseParams):
"""Training Parameters of HNSW class
Attributes:
M (int): maximum number of edges per node for layer l=1,...,L. For layer l=0, its 2*M.
M (int): maximum number of edges per node for layer l=1,...,L. For layer l=0, its 2*M. Default 32
efC (int): size of the priority queue when performing best first search during construction. Default 100
threads (int): number of threads to use for training HNSW indexer. Default -1 to use all
max_level_upper_bound (int): number of maximum layers in the hierarchical graph. Default -1 to ignore
metric_type (str): distance metric type, can be "ip" for inner product or "l2" for Euclidean distance
"""

M: int = 24
M: int = 32
efC: int = 100
threads: int = -1
max_level_upper_bound: int = -1
Expand Down Expand Up @@ -213,13 +213,13 @@ def get_pred_params(self):
"""
return copy.deepcopy(self.pred_params)

def predict(self, X, pred_params=None, searchers=None, ret_csr=False):
def predict(self, X, pred_params=None, searchers=None, ret_csr=True):
"""predict with multi-thread. If searchers are provided, less overhead for online inference.
Args:
X (nd.array/ScipyDrmF32, scipy.sparse.csr_matrix/ScipyCsrF32): query matrix to be predicted. (num_query x feat_dim).
pred_params (HNSW.PredParams, optional): instance of pecos.ann.hnsw.HNSW.PredParams
searchers (c_void_p): pointer to C/C++ std::vector<pecos::ann::HNSW:Searcher>. It's an object returned by self.create_searcher().
ret_csr (bool): if true, the returns will be csr matrix. if false, return indices/distances np.array
ret_csr (bool): if true, the returns will be csr matrix. if false, return indices/distances np.array (default true)
Returns:
indices (np.array): returned indices array, sorted by smallest-to-largest distances. (num_query x pred_params.topk)
distances (np.array): returned distances array, sorted by smallest-to-largest distances (num_query x pred_params.topk)
Expand Down
137 changes: 137 additions & 0 deletions pecos/ann/hnsw/predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
# with the License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
# and limitations under the License.
import argparse
import os
import numpy as np
from pecos.utils import smat_util
from .model import HNSW


def parse_arguments():
    """Build the command line argument parser for HNSW inference.

    Note that this function does not parse anything itself; the caller is
    expected to invoke `parser.parse_args()` on the returned object.

    Returns:
        parser (argparse.ArgumentParser): parser with all inference options registered
    """

    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "-i",
        "--inst-path",
        type=str,
        required=True,
        metavar="PATH",
        # At inference time this matrix is queried against the index, not indexed.
        help="path to the CSR npz or Row-majored npy file of the query feature matrix (nr_insts * nr_feats) to be searched against the HNSW index",
    )
    parser.add_argument(
        "-m",
        "--model-folder",
        type=str,
        required=True,
        metavar="DIR",
        help="path to the model folder to load the HNSW index for inference",
    )

    # Optional
    parser.add_argument(
        "-efS",
        "--efSearch",
        type=int,
        default=100,
        metavar="INT",
        help="size of the priority queue when performing best first search during inference (default 100)",
    )
    parser.add_argument(
        "-k",
        "--topk",
        type=int,
        default=10,
        metavar="INT",
        help="maximum number of candidates (sorted by distances, nearest first) to be returned (default 10)",
    )
    parser.add_argument(
        "-n",
        "--threads",
        type=int,
        default=-1,
        metavar="INT",
        help="number of threads to use for inference of hnsw indexer (default -1 to use all)",
    )
    parser.add_argument(
        "-y",
        "--label-path",
        type=str,
        default=None,
        metavar="PATH",
        help="path to the npz file of the ground truth label matrix (CSR, nr_tst * nr_items)",
    )
    parser.add_argument(
        "-o",
        "--save-pred-path",
        type=str,
        default=None,
        metavar="PATH",
        help="path to save the predictions (CSR sorted by distances, nr_tst * nr_items)",
    )

    return parser


def do_predict(args):
    """Predict with a saved HNSW model, then optionally save and evaluate.

    The predictions are written to `args.save_pred_path` when it is set, and
    Recall@k against `args.label_path` is printed when that is set.

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """

    # Load the query matrix and cast it to float32
    Xt = smat_util.load_matrix(args.inst_path).astype(np.float32)

    # Load model
    model = HNSW.load(args.model_folder)

    # Setup HNSW Searchers for thread-safe inference.
    # Non-positive --threads means "use all cores". os.cpu_count() may return
    # None when the count cannot be determined, so fall back to one searcher.
    if args.threads > 0:
        threads = args.threads
    else:
        threads = os.cpu_count() or 1
    searchers = model.searchers_create(num_searcher=threads)

    # Setup prediction params
    # pred_params.threads will be overridden if searchers are provided in model.predict()
    pred_params = HNSW.PredParams(
        efS=args.efSearch,
        topk=args.topk,
        threads=threads,
    )

    # Model Predicting
    Yt_pred = model.predict(
        Xt,
        pred_params=pred_params,
        searchers=searchers,
        ret_csr=True,
    )

    # Save prediction (done before the in-place distance->similarity
    # conversion below, so the saved matrix still holds distances)
    if args.save_pred_path:
        smat_util.save_matrix(args.save_pred_path, Yt_pred)

    # Evaluate Recall@k
    if args.label_path:
        Yt = smat_util.load_matrix(args.label_path)
        # assuming ground truth is similarity-based (larger the better)
        Yt_topk = smat_util.sorted_csr(Yt, only_topk=args.topk)
        # assuming prediction matrix is distance-based, so need 1-dist=similarity
        Yt_pred.data = 1.0 - Yt_pred.data
        metric = smat_util.Metrics.generate(Yt_topk, Yt_pred, topk=args.topk)
        print("Recall{}@{} {:.6f}%".format(args.topk, args.topk, 100. * metric.recall[-1]))


if __name__ == "__main__":
    # Script entry point: build the CLI parser, parse sys.argv, run inference.
    do_predict(parse_arguments().parse_args())
147 changes: 147 additions & 0 deletions pecos/ann/hnsw/train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
# with the License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
# and limitations under the License.
import argparse
import os
import numpy as np
from pecos.utils import smat_util
from .model import HNSW


def parse_arguments():
    """Build the command line argument parser for HNSW training.

    Note that this function does not parse anything itself; the caller is
    expected to invoke `parser.parse_args()` on the returned object.

    Returns:
        parser (argparse.ArgumentParser): parser with all training options registered
    """

    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "-i",
        "--inst-path",
        type=str,
        required=True,
        metavar="PATH",
        help="path to the CSR npz or Row-majored npy file of the item matrix (nr_items * nr_feats) to be indexed by HNSW",
    )
    parser.add_argument(
        "-m",
        "--model-folder",
        type=str,
        required=True,
        metavar="DIR",
        help="path to the model folder that saved the HNSW index",
    )

    # Optional

    # HNSW Indexing parameters
    parser.add_argument(
        "--metric-type",
        type=str,
        default="ip",
        metavar="STR",
        help="distance metric type, can be ip (inner product) or l2 (Euclidean distance), default is set to ip",
    )
    parser.add_argument(
        "-maxM",
        "--max-edge-per-node",
        type=int,
        default=32,
        metavar="INT",
        help="maximum number of edges per node for layer l=1,...,L. For l=0, it becomes 2*M (default 32)",
    )
    parser.add_argument(
        "-efC",
        "--efConstruction",
        type=int,
        default=100,
        metavar="INT",
        help="size of the priority queue when performing best first search during construction (default 100)",
    )
    parser.add_argument(
        "-n",
        "--threads",
        type=int,
        default=-1,
        metavar="INT",
        help="number of threads to use for training and inference of hnsw indexer (default -1 to use all)",
    )
    parser.add_argument(
        "-maxL",
        "--max-level-upper-bound",
        type=int,
        default=-1,
        metavar="INT",
        help="number of maximum layers in the hierarchical graph (default -1 to ignore)",
    )

    # HNSW Prediction kwargs (stored with the model; can be overridden at inference time)
    parser.add_argument(
        "-efS",
        "--efSearch",
        type=int,
        default=100,
        metavar="INT",
        help="size of the priority queue when performing best first search during inference (default 100)",
    )
    parser.add_argument(
        "-k",
        "--topk",
        type=int,
        default=10,
        metavar="INT",
        help="maximum number of candidates (sorted by distances, nearest first) to be returned (default 10)",
    )

    return parser


def do_train(args):
    """Train an HNSW model and save it to `args.model_folder`.

    Args:
        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
    """

    # Create model folder. exist_ok=True avoids the check-then-create race of
    # testing os.path.exists() first.
    os.makedirs(args.model_folder, exist_ok=True)

    # Load training inputs and cast them to float32
    X = smat_util.load_matrix(args.inst_path).astype(np.float32)

    # Setup training and prediction params
    # Note that prediction params can be overridden at inference time
    train_params = HNSW.TrainParams(
        M=args.max_edge_per_node,
        efC=args.efConstruction,
        metric_type=args.metric_type,
        max_level_upper_bound=args.max_level_upper_bound,
        threads=args.threads,
    )
    pred_params = HNSW.PredParams(
        efS=args.efSearch,
        topk=args.topk,
        threads=args.threads,
    )

    # train and save HNSW indexer
    model = HNSW.train(
        X,
        train_params=train_params,
        pred_params=pred_params,
    )

    model.save(args.model_folder)


if __name__ == "__main__":
    # Script entry point: build the CLI parser, parse sys.argv, run training.
    do_train(parse_arguments().parse_args())
Loading

0 comments on commit 597724e

Please sign in to comment.