Enable PECOS HNSW with command line interface

amzn · Oct 16, 2021 · 597724e · 597724e
1 parent 4cfb9d5
commit 597724e
Show file tree

Hide file tree

Showing 5 changed files with 333 additions and 10 deletions.
diff --git a/pecos/ann/hnsw/README.md b/pecos/ann/hnsw/README.md
@@ -20,11 +20,15 @@ X_tst = np.random.randn(1000, 100).astype(np.float32)
 Note that the data type needs to be `np.float32`.   
 
 #### HNSW Training
-Train the HNSW model (i.e., building the graph-based indexing data structure) with maximum number of threads available on your machine (`threads=0`):
+Train the HNSW model (i.e., building the graph-based indexing data structure) with maximum number of threads available on your machine (`threads=-1`):
 ```python
 from pecos.ann.hnsw import HNSW
 train_params = HNSW.TrainParams(M=32, efC=300, metric_type="ip", threads=-1)
-model = HNSW.train(X_trn, train_params=train_params)
+model = HNSW.train(X_trn, train_params=train_params, pred_params=None)
+```
+Users are also welcome to train with the default parameters via
+```python
+model = HNSW.train(X_trn)
 ```
 
 #### HNSW Save and Load
@@ -47,11 +51,13 @@ searchers = model.searchers_create(num_searcher=4)
 Finally, we conduct ANN inference by inputting searchers to the HNSW model.
 ```python
 pred_params = HNSW.PredParams(efS=100, topk=10)
-indices, distances = model.predict(X_tst, pred_params=pred_params, searchers=searchers, ret_csr=False)
+Yt_pred = model.predict(X_tst, pred_params=pred_params, searchers=searchers)
 ```
+where `Yt_pred` is a `scipy.sparse.csr_matrix` in which the indices of each row are sorted by their distances in ascending order.  
+
 Alternatively, it is also feasible to do inference without pre-allocating searchers, which may have larger overhead since it will **re-allocate** intermediate graph-searching variables for each query matrix `X_tst`.
 ```python
 pred_params.threads = 2
 indices, distances = model.predict(X_tst, pred_params=pred_params, ret_csr=False)
 ```
-When `ret_csr=True`, the prediction function will return a single csr matrix that combines the indices and distances numpy array.
+When `ret_csr=False`, the prediction function will return the indices and distances numpy arrays.
diff --git a/pecos/ann/hnsw/model.py b/pecos/ann/hnsw/model.py
@@ -34,14 +34,14 @@ class TrainParams(pecos.BaseParams):
         """Training Parameters of HNSW class
 
         Attributes:
-            M (int): maximum number of edges per node for layer l=1,...,L. For layer l=0, its 2*M.
+            M (int): maximum number of edges per node for layer l=1,...,L. For layer l=0, it is 2*M. Default 32
             efC (int): size of the priority queue when performing best first search during construction. Default 100
             threads (int): number of threads to use for training HNSW indexer. Default -1 to use all
             max_level_upper_bound (int): number of maximum layers in the hierarchical graph. Default -1 to ignore
             metric_type (str): distance metric type, can be "ip" for inner product or "l2" for Euclidean distance
         """
 
-        M: int = 24
+        M: int = 32
         efC: int = 100
         threads: int = -1
         max_level_upper_bound: int = -1
@@ -213,13 +213,13 @@ def get_pred_params(self):
         """
         return copy.deepcopy(self.pred_params)
 
-    def predict(self, X, pred_params=None, searchers=None, ret_csr=False):
+    def predict(self, X, pred_params=None, searchers=None, ret_csr=True):
         """predict with multi-thread. If searchers are provided, less overhead for online inference.
         Args:
             X (nd.array/ScipyDrmF32, scipy.sparse.csr_matrix/ScipyCsrF32): query matrix to be predicted. (num_query x feat_dim).
             pred_params (HNSW.PredParams, optional): instance of pecos.ann.hnsw.HNSW.PredParams
             searchers (c_void_p): pointer to C/C++ std::vector<pecos::ann::HNSW:Searcher>. It's an object returned by self.create_searcher().
-            ret_csr (bool): if true, the returns will be csr matrix. if false, return indices/distances np.array
+            ret_csr (bool): if true, the returns will be csr matrix. if false, return indices/distances np.array (default true)
         Returns:
             indices (np.array): returned indices array, sorted by smallest-to-largest distances. (num_query x pred_params.topk)
             distances (np.array): returned distances array, sorted by smallest-to-largest distances (num_query x pred_params.topk)

diff --git a/pecos/ann/hnsw/predict.py b/pecos/ann/hnsw/predict.py
@@ -0,0 +1,137 @@
+#  Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+#  with the License. A copy of the License is located at
+#
+#  http://aws.amazon.com/apache2.0/
+#
+#  or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+#  OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
+#  and limitations under the License.
+import argparse
+import os
+import numpy as np
+from pecos.utils import smat_util
+from .model import HNSW
+
+
+def parse_arguments():
+    """Parse Inference arguments"""
+
+    parser = argparse.ArgumentParser()
+
+    # Required parameters
+    parser.add_argument(
+        "-i",
+        "--inst-path",
+        type=str,
+        required=True,
+        metavar="PATH",
+        help="path to the CSR npz or Row-majored npy file of the feature matrix (nr_insts * nr_feats) to be indexed by HNSW",
+    )
+    parser.add_argument(
+        "-m",
+        "--model-folder",
+        type=str,
+        required=True,
+        metavar="DIR",
+        help="path to the model folder to load the HNSW index for inference",
+    )
+
+    # Optional
+    parser.add_argument(
+        "-efS",
+        "--efSearch",
+        type=int,
+        default=100,
+        metavar="INT",
+        help="size of the priority queue when performing best first search during inference. (Default 100)"
+    )
+    parser.add_argument(
+        "-k",
+        "--topk",
+        type=int,
+        default=10,
+        metavar="INT",
+        help="maximum number of candidates (sorted by distances, nearest first) to be returned",
+    )
+    parser.add_argument(
+        "-n",
+        "--threads",
+        type=int,
+        default=-1,
+        metavar="int",
+        help= "number of threads to use for inference of hnsw indexer (default -1 to use all)"
+    )
+    parser.add_argument(
+        "-y",
+        "--label-path",
+        type=str,
+        default=None,
+        metavar="PATH",
+        help="path to the npz file of the ground truth label matrix (CSR, nr_tst * nr_items)",
+    )
+    parser.add_argument(
+        "-o",
+        "--save-pred-path",
+        type=str,
+        default=None,
+        metavar="PATH",
+        help="path to save the predictions (CSR sorted by distances, nr_tst * nr_items)",
+    )
+
+    return parser
+
+
+def do_predict(args):
+    """Predict and Evaluate for HNSW model
+
+    Args:
+        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
+    """
+
+    # Load data
+    Xt = smat_util.load_matrix(args.inst_path).astype(np.float32)
+
+    # Load model
+    model = HNSW.load(args.model_folder)
+
+    # Setup HNSW Searchers for thread-safe inference
+    threads = os.cpu_count() if args.threads <= 0 else args.threads
+    searchers = model.searchers_create(num_searcher=threads)
+
+    # Setup prediction params
+    # pred_params.threads will be overridden if searchers are provided in model.predict()
+    pred_params = HNSW.PredParams(
+        efS=args.efSearch,
+        topk=args.topk,
+        threads=threads,
+    )
+
+    # Model Predicting
+    Yt_pred = model.predict(
+        Xt,
+        pred_params=pred_params,
+        searchers=searchers,
+        ret_csr=True,
+    )
+
+    # Save prediction
+    if args.save_pred_path:
+        smat_util.save_matrix(args.save_pred_path, Yt_pred)
+
+    # Evaluate Recallk@k
+    if args.label_path:
+        Yt = smat_util.load_matrix(args.label_path)
+        # assuming ground truth is similarity-based (larger the better)
+        Yt_topk = smat_util.sorted_csr(Yt, only_topk=args.topk)
+        # assuming prediction matrix is distance-based, so need 1-dist=similarity
+        Yt_pred.data = 1.0 - Yt_pred.data
+        metric = smat_util.Metrics.generate(Yt_topk, Yt_pred, topk=args.topk)
+        print("Recall{}@{} {:.6f}%".format(args.topk, args.topk, 100. * metric.recall[-1]))
+
+
+if __name__ == "__main__":
+    parser = parse_arguments()
+    args = parser.parse_args()
+    do_predict(args)
diff --git a/pecos/ann/hnsw/train.py b/pecos/ann/hnsw/train.py
@@ -0,0 +1,147 @@
+#  Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance
+#  with the License. A copy of the License is located at
+#
+#  http://aws.amazon.com/apache2.0/
+#
+#  or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
+#  OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions
+#  and limitations under the License.
+import argparse
+import os
+import numpy as np
+from pecos.utils import smat_util
+from .model import HNSW
+
+
+def parse_arguments():
+    """Parse training arguments"""
+
+    parser = argparse.ArgumentParser()
+
+    # Required parameters
+    parser.add_argument(
+        "-i",
+        "--inst-path",
+        type=str,
+        required=True,
+        metavar="PATH",
+        help="path to the CSR npz or Row-majored npy file of the item matrix (nr_items * nr_feats) to be indexed by HNSW",
+    )
+    parser.add_argument(
+        "-m",
+        "--model-folder",
+        type=str,
+        required=True,
+        metavar="DIR",
+        help="path to the model folder that saved the HNSW index",
+    )
+
+    # Optional
+
+    # HNSW Indexing parameters
+    parser.add_argument(
+        "--metric-type",
+        type=str,
+        default="ip",
+        metavar="STR",
+        help="distance metric type, can be ip (inner product) or l2 (Euclidean distance), default is set to ip",
+    )
+    parser.add_argument(
+        "-maxM",
+        "--max-edge-per-node",
+        type=int,
+        default=32,
+        metavar="INT",
+        help="maximum number of edges per node for layer l=1,...,L. For l=0, it becomes 2*M (default 32)"
+    )
+    parser.add_argument(
+        "-efC",
+        "--efConstruction",
+        type=int,
+        default=100,
+        metavar="INT",
+        help="size of the priority queue when performing best first search during construction (default 100)"
+    )
+    parser.add_argument(
+        "-n",
+        "--threads",
+        type=int,
+        default=-1,
+        metavar="int",
+        help= "number of threads to use for training and inference of hnsw indexer (default -1 to use all)"
+    )
+    parser.add_argument(
+        "-maxL",
+        "--max-level-upper-bound",
+        type=int,
+        default=-1,
+        metavar="int",
+        help= "number of maximum layers in the hierarchical graph (default -1 to ignore)",
+    )
+
+    # HNSW Prediction kwargs
+    parser.add_argument(
+        "-efS",
+        "--efSearch",
+        type=int,
+        default=100,
+        metavar="INT",
+        help="size of the priority queue when performing best first search during inference (default 100)"
+    )
+    parser.add_argument(
+        "-k",
+        "--topk",
+        type=int,
+        default=10,
+        metavar="INT",
+        help="maximum number of candidates (sorted by distances, nearest first) to be returned (default 10)",
+    )
+
+    return parser
+
+
+def do_train(args):
+    """Train and Save HNSW model
+
+    Args:
+        args (argparse.Namespace): Command line arguments parsed by `parser.parse_args()`
+    """
+
+    # Create model folder
+    if not os.path.exists(args.model_folder):
+        os.makedirs(args.model_folder)
+
+    # Load training inputs
+    X = smat_util.load_matrix(args.inst_path).astype(np.float32)
+
+    # Setup training and prediction params
+    # Note that prediction params can be overridden at inference time
+    train_params = HNSW.TrainParams(
+        M=args.max_edge_per_node,
+        efC=args.efConstruction,
+        metric_type=args.metric_type,
+        max_level_upper_bound=args.max_level_upper_bound,
+        threads=args.threads,
+    )
+    pred_params = HNSW.PredParams(
+        efS=args.efSearch,
+        topk=args.topk,
+        threads=args.threads,
+    )
+
+    # train and save HNSW indexer
+    model = HNSW.train(
+        X,
+        train_params=train_params,
+        pred_params=pred_params,
+    )
+
+    model.save(args.model_folder)
+
+
+if __name__ == "__main__":
+    parser = parse_arguments()
+    args = parser.parse_args()
+    do_train(args)