
init, add scripts, update readme
matchyc committed May 13, 2024
1 parent cde202e commit 9707c0e
Showing 10 changed files with 195 additions and 53 deletions.
86 changes: 86 additions & 0 deletions .github/workflows/main.yml
@@ -0,0 +1,86 @@
name: GitHub Clone Count Update Everyday

on:
  schedule:
    - cron: "0 */24 * * *"
  workflow_dispatch:

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v2
        with:
          persist-credentials: false # otherwise, the token used is the GITHUB_TOKEN, instead of your personal access token.
          fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository.

      - name: gh login
        run: echo "${{ secrets.SECRET_TOKEN }}" | gh auth login --with-token

      - name: parse latest clone count
        run: |
          curl --user "${{ github.actor }}:${{ secrets.SECRET_TOKEN }}" \
            -H "Accept: application/vnd.github.v3+json" \
            https://api.github.com/repos/${{ github.repository }}/traffic/clones \
            > clone.json

      - name: create gist and download previous count
        id: set_id
        run: |
          if gh secret list | grep -q "GIST_ID"
          then
            echo "GIST_ID found"
            echo ::set-output name=GIST::${{ secrets.GIST_ID }}
            curl https://gist.githubusercontent.com/${{ github.actor }}/${{ secrets.GIST_ID }}/raw/clone.json > clone_before.json
            if cat clone_before.json | grep '404: Not Found'; then
              echo "GIST_ID not valid anymore. Creating another gist..."
              gist_id=$(gh gist create clone.json | awk -F / '{print $NF}')
              echo $gist_id | gh secret set GIST_ID
              echo ::set-output name=GIST::$gist_id
              cp clone.json clone_before.json
              git rm --ignore-unmatch CLONE.md
            fi
          else
            echo "GIST_ID not found. Creating a gist..."
            gist_id=$(gh gist create clone.json | awk -F / '{print $NF}')
            echo $gist_id | gh secret set GIST_ID
            echo ::set-output name=GIST::$gist_id
            cp clone.json clone_before.json
          fi

      - name: update clone.json
        run: |
          curl https://raw.githubusercontent.com/MShawon/github-clone-count-badge/master/main.py > main.py
          python3 main.py

      - name: Update gist with latest count
        run: |
          content=$(sed -e 's/\\/\\\\/g' -e 's/\t/\\t/g' -e 's/\"/\\"/g' -e 's/\r//g' "clone.json" | sed -E ':a;N;$!ba;s/\r{0,1}\n/\\n/g')
          echo '{"description": "${{ github.repository }} clone statistics", "files": {"clone.json": {"content": "'"$content"'"}}}' > post_clone.json
          curl -s -X PATCH \
            --user "${{ github.actor }}:${{ secrets.SECRET_TOKEN }}" \
            -H "Content-Type: application/json" \
            -d @post_clone.json https://api.github.com/gists/${{ steps.set_id.outputs.GIST }} > /dev/null 2>&1
          if [ ! -f CLONE.md ]; then
            shields="https://img.shields.io/badge/dynamic/json?color=success&label=Clone&query=count&url="
            url="https://gist.githubusercontent.com/${{ github.actor }}/${{ steps.set_id.outputs.GIST }}/raw/clone.json"
            repo="https://github.com/MShawon/github-clone-count-badge"
            echo ''> CLONE.md
            echo '
          **Markdown**
          ```markdown' >> CLONE.md
            echo "[![GitHub Clones]($shields$url&logo=github)]($repo)" >> CLONE.md
            echo '
          ```
          **HTML**
          ```html' >> CLONE.md
            echo "<a href='$repo'><img alt='GitHub Clones' src='$shields$url&logo=github'></a>" >> CLONE.md
            echo '```' >> CLONE.md
            git add CLONE.md
            git config --global user.name "GitHub Action"
            git config --global user.email "action@github.com"
            git commit -m "create clone count badge"
          fi

      - name: Push
        uses: ad-m/github-push-action@master
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
11 changes: 11 additions & 0 deletions CLONE.md
@@ -0,0 +1,11 @@


**Markdown**
```markdown
[![GitHub Clones](https://img.shields.io/badge/dynamic/json?color=success&label=Clone&query=count&url=https://gist.githubusercontent.com/matchyc/c4295bccf42f4b2be4b7777a43bd65e9/raw/clone.json&logo=github)](https://github.com/MShawon/github-clone-count-badge)

```
**HTML**
```html
<a href='https://github.com/MShawon/github-clone-count-badge'><img alt='GitHub Clones' src='https://img.shields.io/badge/dynamic/json?color=success&label=Clone&query=count&url=https://gist.githubusercontent.com/matchyc/c4295bccf42f4b2be4b7777a43bd65e9/raw/clone.json&logo=github'></a>
```
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Meng Chen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
78 changes: 41 additions & 37 deletions README.md
@@ -1,71 +1,76 @@
# RoarGraph: A Projected Bipartite Graph for Efficient Cross-Modal Approximate Nearest Neighbor Search | MysteryANN
# RoarGraph: A Projected Bipartite Graph for Efficient Cross-Modal Approximate Nearest Neighbor Search

This repository includes the codes for VLDB 2024 paper RoarGraph, it also 🏆 Winning NeurIPS' Competition Track: Big ANN, Practical Vector Search Challenge. (OOD Track) (Our other solution won the Sparse Track).
This repository includes the code for the VLDB 2024 paper RoarGraph.

[![NIPS Big-ANN Benchmark 2023](https://img.shields.io/badge/NIPS%20Big--ANN%20Benchmark-2023-blue)](https://big-ann-benchmarks.com/neurips23.html)
![](https://api.visitorbadge.io/api/VisitorHit?user=matchyc&repo=RoarGraph&countColor=%237B1E7A)

![](https://api.visitorbadge.io/api/VisitorHit?user=matchyc&repo=mysteryann&countColor=%237B1E7A)



[![GitHub Clones](https://img.shields.io/badge/dynamic/json?color=success&label=Clone&query=count&url=https://gist.githubusercontent.com/matchyc/daf1f1c1372416a529003f91b5562fdc/raw/clone.json&logo=github)](https://github.com/MShawon/github-clone-count-badge)
[![GitHub Clones](https://img.shields.io/badge/dynamic/json?color=success&label=Clone&query=count&url=https://gist.githubusercontent.com/matchyc/c4295bccf42f4b2be4b7777a43bd65e9/raw/clone.json&logo=github)](https://github.com/MShawon/github-clone-count-badge)


This code builds upon the NSG repo and incorporates other open-source implementations.

The main branch is the codebase of the RoarGraph paper.
The code for the NeurIPS 2023 challenge is available in separate branches.

## Getting Started & Reproducing the Experiments in the Paper
File format: all `fbin` files begin with number of vectors (uint32, 4 bytes), dimension (uint32, 4 bytes), and followed by the vector data.
File format: all `fbin` files begin with the number of vectors (uint32, 4 bytes) and the dimension (uint32, 4 bytes), followed by the vector data. (Same format as the big-ann competition.)
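For illustration only (this helper is not part of the repository; the function name is a placeholder), a minimal numpy sketch that reads a file in this format could look like:

```python
# Sketch: read an fbin file, i.e. num_points (uint32), dim (uint32),
# then num_points * dim float32 values, as described above.
import numpy as np

def read_fbin(path):
    with open(path, "rb") as f:
        num, dim = np.fromfile(f, dtype=np.uint32, count=2)
        data = np.fromfile(f, dtype=np.float32, count=int(num) * int(dim))
    return data.reshape(int(num), int(dim))
```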

We use zenodo `https://zenodo.org/` to save indexes files online (50GB for free), however, it may take a while to download file with x GB size (tested 500KB/s) since its a free platform for publishing research data.
We use Zenodo (`https://zenodo.org/`) to publish research data and index files online (Zenodo provides 50 GB for free).

0. Prerequisites
```
cmake >= 3.24
g++ >= 9.4
CPU supports AVX-512
Python >= 3.8
Python package:
numpy
urllib
tarfile
```

```bash
git clone --recursive https://github.com/matchyc/RoarGraph.git
```

1. Prepare datasets
The script will download datasets used in the paper and save them in the `./data` directory.
- dataset name:
- t2i-10M
- LAION-10M
- WebVid-2.5M
- laion-10M
- webvid-2.5M

Taking the Yandex text-to-image dataset as an example.

```bash
bash prepare_data.sh <dataset name>
bash prepare_data.sh t2i-10M
```

2. Compile and build
```bash
mkdir build
mkdir -p build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Release && make -j
```


3. Build Index

3.1 Compute groundtruth for forming a bipartite graph
We use program provided by [here](https://github.com/matchyc/DiskANN/tree/master/tests/utils), which utilizes MKL on CPU.
You can change the code in the `compute_groundtruth.cpp` file to adjust the memory comsumption. (This program will save both vector ids and distances, we don't need the later one.)
3.1 Compute groundtruth for forming a bipartite graph.
We use the program provided [here](https://github.com/matchyc/DiskANN/tree/master/tests/utils), which utilizes MKL on the CPU.
You can change the code in the `compute_groundtruth.cpp` file to adjust the memory consumption. (This program saves both vector IDs and distances; we don't need the latter.)
- base_file: the base data.
- query_file: the training queries.
- gt_file: save path.
- K: $N_q$ in the paper.
```bash
prefix=../data/t2i-10M
./compute_groundtruth --data_type float --dist_fn l2 --base_file ${prefix}/base.10M.fbin --query_file ${prefix}/query.train.10M.fbin --gt_file ${prefix}/train.gt.bin --K 100
cp ./thirdparty/DiskANN/tests/utils/compute_groundtruth compute_groundtruth
mkdir -p ${prefix}
./compute_groundtruth --data_type float --dist_fn mips --base_file ${prefix}/base.10M.fbin --query_file ${prefix}/query.train.10M.fbin --gt_file ${prefix}/train.gt.bin --K 100
```
However, it can take hours (see Section 5 in the paper). You can just leverage GPU for faster computation, like [raft](https://github.com/rapidsai/raft). It is easy to use by following the instructions, you can reach me out for getting the python scripts to use raft on GPU. Otherwise, you can slice the training query file and use 10% of it, wihch can also deliver decent performance.
This step can take hours (see the evaluations in Section 5 of the paper). You can leverage a GPU for faster computation, e.g., [raft](https://github.com/rapidsai/raft). It is easy to use by following its instructions; feel free to reach out to me for the Python scripts that run raft on GPU for these three datasets.

Otherwise, you can slice the training query set and use, say, 10% of it to save a lot of time and to evaluate the effect of different training set sizes; this can still deliver decent performance.
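As a hypothetical example of such slicing (this helper is not part of the repository; file names and the function are placeholders), the sketch below keeps the first 10% of the training queries and writes them to a new `fbin` file with a corrected header:

```python
# Sketch: keep the first 10% of training queries in a new fbin file.
import numpy as np

def slice_fbin(src, dst, fraction=0.1):
    with open(src, "rb") as f:
        num, dim = np.fromfile(f, dtype=np.uint32, count=2)
        keep = int(int(num) * fraction)
        data = np.fromfile(f, dtype=np.float32, count=keep * int(dim))
    with open(dst, "wb") as f:
        np.array([keep, dim], dtype=np.uint32).tofile(f)  # rewrite the header
        data.tofile(f)

slice_fbin("../data/t2i-10M/query.train.10M.fbin", "../data/t2i-10M/query.train.1M.fbin")
```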

3.2 Build the graph index
- base_data_path: base data path.
@@ -77,15 +82,14 @@ However, it can take hours (see Section 5 in the paper). You can just leverage G
- L_pjpq: $L$ in the paper.
- T: number of threads employed to construct the graph.
```bash
mkdir -p build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Release && make -j
prefix=../data/t2i-10M
./tests/test_build_roargraph --data_type float --dist ip \
--base_data_path ${prefix}/base.10M.fbin \
--sampled_query_data_path ${prefix}/query.train.10M.fbin \
--projection_index_save_path ${prefix}/t2i_10M_roar.index \
--learn_base_nn_path ${prefix}/t2i.train.in.base.nn.dist.10M.ibin \
--learn_base_nn_path ${prefix}/train.gt.bin \
--M_sq 100 --M_pjbp 35 --L_pjpq 500 -T 64
```

@@ -94,43 +98,43 @@ prefix=../data/t2i-10M
- num_threads: number of threads for searching.
- topk: $K$ answers will be returned for evaluation.
- gt_path: the groundtruths for queries in evaluations.
- query_path: queries file for evalution.
- query_path: queries file for evaluation.
- L_pq: capacities of priority queue during the search phase.
- evaluation_save_path: file path to save performance statistics (optional).

```bash
num_threads=16
topk=10
prefix=../data/t2i-10M
./tests/test_search_roargraph --data_type float \
--dist ip --base_data_path ${prefix}/base.10M.fbin \
--projection_index_save_path ${prefix}/t2i_10M_roar.index \
--gt_path ${prefix}/groundtruth.base.10M.query.10k.ibin \
--query_path ${prefix}/query.public.10k.fbin \
--gt_path ${prefix}/gt.10k.ibin \
--query_path ${prefix}/query.10k.fbin \
--L_pq 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 110 120 130 140 150 160 170 180 190 200 220 240 260 280 300 350 400 450 500 550 600 650 700 750 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 \
--k ${topk} -T ${num_threads} \
--evaluation_save_path ${prefix}/test_search_t2i_10M_top${topk}_T${num_threads}.csv
```

## Reproduce the Experiment
If rigorously reproduce is needed, we provide the constructed indexes for three datasets.
First, download them and then search on these indexes can reproduce the performance experiment in the paper.
- https://zenodo.org/records/11073098/files/t2i_10M_roar.index?download=1
- https://zenodo.org/records/11073098/files/laion_10M_roar.index?download=1
- https://zenodo.org/records/11073098/files/webvid_2.5M_roar.index?download=1
## Reproducing the Experiments with Pre-Constructed Indexes
If rigorous reproduction is needed, we provide the constructed indexes for all three datasets.
First, download the built indexes used for evaluations.
- https://zenodo.org/records/11090378/files/t2i_10M_roar.index?download=1
- https://zenodo.org/records/11090378/files/laion_10M_roar.index?download=1
- https://zenodo.org/records/11090378/files/webvid_2.5M_roar.index?download=1

Simply download the index and set the projection_index_save_path as the index path to perform searches. If downloading takes too long, you can request that I upload/send the index files to you for strict reproduction, provided you can offer a suitable file sharing platform.
Download the query file and the ground-truth file, then set projection_index_save_path to the path of the downloaded index to perform searches.

If you plan to use a pre-constructed index, you can skip downloading the large base dataset files and only download the query file and ground-truth file from the links found in the `prepare_data.sh` script.
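For example, assuming the searches are run from the `build` directory as above (this snippet is only a sketch and is not part of the repository), one pre-built index can be fetched like this:

```python
# Sketch: download a pre-built RoarGraph index from Zenodo to the path that the
# search command expects (run from the build directory, so ../data/t2i-10M exists).
import urllib.request

url = "https://zenodo.org/records/11090378/files/t2i_10M_roar.index?download=1"
urllib.request.urlretrieve(url, "../data/t2i-10M/t2i_10M_roar.index")
```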

## License
MIT License



## Contact
Please consider leaving a message if you plan to use this idea.

For questions or inquiries, feel free to reach out to Meng Chen at
[mengchen22@m.fudan.edu.cn](mailto:mengchen22@m.fudan.edu.cn)
<!-- [mengchen9909@gmail.com](mailto:mengchen9909@gmail.com) -->



16 changes: 16 additions & 0 deletions change_meta_data_in_file.py
@@ -0,0 +1,16 @@
# Overwrite the first 4 bytes (the vector-count field) of an fbin file with the given value as uint32,
# e.g. set ./data/t2i-10M/query.train.10M.fbin to 10000000. This is needed because prepare_data.sh
# downloads only a prefix of a larger file via a byte-range request, so the header still reports the original count.

import os
import sys
import numpy

def fit_meta_data_in_file(data_file, data_size):
with open(data_file, 'r+b') as f:
f.seek(0)
f.write(numpy.array([data_size], dtype=numpy.uint32).tobytes())

if __name__ == '__main__':
data_file = sys.argv[1]
data_size = int(sys.argv[2])
fit_meta_data_in_file(data_file, data_size)
print(f"change meta data in file {data_file} to {data_size}")
2 changes: 1 addition & 1 deletion include/efanna2e/util.h
@@ -206,7 +206,7 @@ void load_data(const char *filename, uint32_t &points_num, uint32_t &dim, T *&da
std::cerr << "Read file incompleted! filename:" << std::string(filename) << std::endl;
throw std::runtime_error("Data file size wrong!");
}
std::cout << "load data from file: " << filename << " points_num: " << points_num << " dim: " << dim << std::endl;
std::cout << "Finish load data from file: " << filename << " points_num: " << points_num << " dim: " << dim << std::endl;
in.close();
}

21 changes: 14 additions & 7 deletions prepare_data.sh
@@ -19,15 +19,18 @@ mkdir -p data/$1

if [ "$1" == "t2i-10M" ]; then
echo "dataset t2i"
need_size=$((200*4*10000000+8))
query_10k_size=$((200*4*10000+8))
need_size=$((200*4*10000000+8-1))
query_10k_size=$((200*4*10000+8-1))
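# Assumed rationale (not stated in the script): each vector is 200 float32 dims
# (200*4 bytes) and the file has an 8-byte header; the trailing -1 turns the byte
# count into the last byte offset, since curl -r ranges are inclusive.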
# download the dataset
if [ ! -e ./data/$1/gt.10k.ibin ]; then
curl -r 0-${need_size} -o data/$1/base.10M.fbin https://storage.yandexcloud.net/yandex-research/ann-datasets/T2I/base.10M.fbin
curl -r 0-${need_size} -o data/$1/query.train.10M.fbin https://storage.yandexcloud.net/yandex-research/ann-datasets/T2I/query.learn.50M.fbin
curl -r 0-${query_10k_size} -o data/$1/query.10k.fbin https://storage.yandexcloud.net/yandex-research/ann-datasets/T2I/query.public.100K.fbin
curl -o data/$1/gt.10k.ibin https://zenodo.org/records/11073098/files/t2i.gt.10k.ibin
curl -o data/$1/gt.10k.ibin https://zenodo.org/records/11090378/files/t2i.gt.10k.ibin
fi
curl -o data/$1/gt.10k.ibin https://zenodo.org/records/11090378/files/t2i.gt.10k.ibin
python3 change_meta_data_in_file.py ./data/t2i-10M/query.train.10M.fbin 10000000
python3 change_meta_data_in_file.py ./data/t2i-10M/query.10k.fbin 10000
elif [ "$1" == "laion-10M" ]; then
echo "dataset laion"
# download the dataset
@@ -48,8 +51,8 @@ elif [ "$1" == "laion-10M" ]; then
# export text and img simultaneously, watch out the DRAM.
python3 export_fbin_from_npy.py
if [ ! -e ./data/$1/gt.10k.ibin ]; then
curl -o data/$1/query.10k.fbin https://zenodo.org/records/11073098/files/laion.query.10k.fbin
curl -o data/$1/gt.10k.ibin https://zenodo.org/records/11073098/files/laion.gt.10k.ibin
curl -o data/$1/query.10k.fbin https://zenodo.org/records/11090378/files/laion.query.10k.fbin
curl -o data/$1/gt.10k.ibin https://zenodo.org/records/11090378/files/laion.gt.10k.ibin
fi
elif [ "$1" == "webvid-2.5M" ]; then
echo "dataset webvid"
@@ -60,8 +63,12 @@ elif [ "$1" == "webvid-2.5M" ]; then
# python3 prepare_for_clip_webvid.py
fi

if [ ! -e ./data/clip-webvid-2.5M/query.train.2.5M.fbin ]; then
curl -o data/clip-webvid-2.5M/query.train.2.5M.fbin https://zenodo.org/records/11090378/files/webvid.query.train.2.5M.fbin
fi

if [ ! -e ./data/clip-webvid-2.5M/gt.10k.ibin ]; then
curl -o data/clip-webvid-2.5M/query.10k.fbin https://zenodo.org/records/11073098/files/webvid.query.10k.fbin
curl -o data/clip-webvid-2.5M/gt.10k.ibin https://zenodo.org/records/11073098/files/webvid.gt.10k.ibin
curl -o data/clip-webvid-2.5M/query.10k.fbin https://zenodo.org/records/11090378/files/webvid.query.10k.fbin
curl -o data/clip-webvid-2.5M/gt.10k.ibin https://zenodo.org/records/11090378/files/webvid.gt.10k.ibin
fi
fi
7 changes: 2 additions & 5 deletions run_roargraph_search_test.sh
@@ -8,11 +8,8 @@ prefix=../data/t2i-10M
./tests/test_search_roargraph --data_type float \
--dist ip --base_data_path ${prefix}/base.10M.fbin \
--projection_index_save_path ${prefix}/t2i_10M_roar.index \
--gt_path ${prefix}/groundtruth.base.10M.query.10k.ibin \
--query_path ${prefix}/query.public.10k.fbin \
--gt_path ${prefix}/gt.10k.ibin \
--query_path ${prefix}/query.10k.fbin \
--L_pq 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 110 120 130 140 150 160 170 180 190 200 220 240 260 280 300 350 400 450 500 550 600 650 700 750 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 \
--k ${topk} -T ${num_threads} \
--evaluation_save_path ${prefix}/test_search_t2i_10M_top${topk}_T${num_threads}.csv
# --sampled_query_data_path ${prefix}/query.10M.fbin \
# --bipartite_index_save_path ${prefix}/t2i_10M_bipartite.index \
# --evaluation_save_path /home/cm/projects/ann/exp_result/cross_modal/t2i_10M_evaluation/t2i_10M_design_v2.5_learn_${learn_size}_r${one_k}_M_35_L_500_q_10k_with_rderr_T${num_threads}.csv
4 changes: 2 additions & 2 deletions run_roargraph_test.sh
@@ -4,7 +4,7 @@ cmake .. -DCMAKE_BUILD_TYPE=Release && make -j
prefix=../data/t2i-10M
./tests/test_build_roargraph --data_type float --dist ip \
--base_data_path ${prefix}/base.10M.fbin \
--sampled_query_data_path ${prefix}/query.learn.10M.fbin \
--sampled_query_data_path ${prefix}/query.train.10M.fbin \
--projection_index_save_path ${prefix}/t2i_10M_roar.index \
--learn_base_nn_path ${prefix}/t2i.train.in.base.nn.dist.10M.ibin \
--learn_base_nn_path ${prefix}/train.gt.bin \
--M_sq 100 --M_pjbp 35 --L_pjpq 500 -T 64
