diff --git a/.circleci/config.yml b/.circleci/config.yml index 6a55622..5bac982 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,113 +1,77 @@ -# Golang CircleCI 2.0 configuration file -# -# Check https://circleci.com/docs/2.0/language-go/ for more details -version: 2 -jobs: - build-edge: # test with redisearch:edge +version: 2.1 + +executors: + edge: docker: - image: circleci/golang:1.13 - image: redislabs/redisearch:edge - - working_directory: /go/src/github.com/RediSearch/ftsb - steps: - - checkout - - run: make test - - run: bash <(curl -s https://codecov.io/bash) -t ${CODECOV_TOKEN} - - build-latest: # test with redisearch:latest + latest: docker: - image: circleci/golang:1.13 - image: redislabs/redisearch:latest - working_directory: /go/src/github.com/RediSearch/ftsb - steps: - - checkout - - run: make test - - ci-benchmark-edge: # test nightly with redisearch:edge - docker: - - image: circleci/golang:1.13 - - image: redislabs/redisearch:edge - - working_directory: /go/src/github.com/RediSearch/ftsb +jobs: + ci-benchmark: + parameters: + redisearch_version: + type: executor + use_case: + type: string + executor: << parameters.redisearch_version >> steps: - checkout - run: make - run: sudo apt install python3.6 -y - run: sudo apt install python3-pip -y - - run: python3 -m pip install wheel redisbench-admin==0.1.10 + - run: python3 -m pip install wheel redisbench-admin==0.1.12 - run: - name: ecommerce-inventory use case + name: << parameters.use_case >> use case command: | redisbench-admin run \ - --repetitions 7 \ + --repetitions 3 \ --output-file-prefix circleci \ --upload-results-s3 \ - --benchmark-config-file https://s3.amazonaws.com/benchmarks.redislabs/redisearch/datasets/ecommerce-inventory/ecommerce-inventory.redisearch.cfg.json - - run: - name: nyc_taxis CI use case with HSET - command: | - redisbench-admin run \ - --repetitions 3 \ - --output-file-prefix circleci \ - --benchmark-requests 1000000 \ - --upload-results-s3 \ - 
--benchmark-config-file https://s3.amazonaws.com/benchmarks.redislabs/redisearch/datasets/nyc_taxis-hashes-CI/nyc_taxis-hashes-CI.redisearch.cfg.json - no_output_timeout: 30m - - run: - name: nyc_taxis CI use case with FT.ADD - command: | - redisbench-admin run \ - --repetitions 3 \ - --output-file-prefix circleci \ - --benchmark-requests 1000000 \ - --upload-results-s3 \ - --benchmark-config-file https://s3.amazonaws.com/benchmarks.redislabs/redisearch/datasets/nyc_taxis-ftadd-CI/nyc_taxis-ftadd-CI.redisearch.cfg.json - no_output_timeout: 30m + --benchmark-config-file https://s3.amazonaws.com/benchmarks.redislabs/redisearch/datasets/<< parameters.use_case >>/<< parameters.use_case >>.redisearch.cfg.json - ci-benchmark-latest: # test nightly with redisearch:edge + build-edge: # test with redisearch:edge docker: - image: circleci/golang:1.13 - - image: redislabs/redisearch:latest - - working_directory: /go/src/github.com/RediSearch/ftsb + - image: redislabs/redisearch:edge steps: - checkout - - run: make - - run: sudo apt install python3.6 -y - - run: sudo apt install python3-pip -y - - run: python3 -m pip install wheel redisbench-admin==0.1.10 - - run: | - redisbench-admin run \ - --repetitions 7 \ - --output-file-prefix circleci \ - --upload-results-s3 \ - --benchmark-config-file https://s3.amazonaws.com/benchmarks.redislabs/redisearch/datasets/ecommerce-inventory/ecommerce-inventory.redisearch.cfg.json + - run: make test + - run: bash <(curl -s https://codecov.io/bash) -t ${CODECOV_TOKEN} + build-latest: # test with redisearch:latest + docker: + - image: circleci/golang:1.13 + - image: redislabs/redisearch:latest - build-multiarch-docker: - machine: - enabled: true steps: - checkout - - run: | - echo "$DOCKER_REDISBENCH_PWD" | base64 --decode | docker login --username $DOCKER_REDISBENCH_USER --password-stdin - - run: - name: Build - command: | - make docker-release - no_output_timeout: 20m + - run: make test workflows: - version: 2 commit: jobs: - build-edge - 
build-latest - - ci-benchmark-edge - - ci-benchmark-latest: + - ci-benchmark: + name: edge-ecommerce-inventory + redisearch_version: edge + use_case: "ecommerce-inventory" + - ci-benchmark: + name: edge-nyc_taxis-ft.add + redisearch_version: edge + use_case: "nyc_taxis-ft.add" requires: - - ci-benchmark-edge + - edge-ecommerce-inventory + - ci-benchmark: + name: edge-nyc_taxis-hashes + redisearch_version: edge + use_case: "nyc_taxis-hashes" + requires: + - edge-nyc_taxis-ft.add ci_benchmarks: triggers: @@ -118,7 +82,19 @@ workflows: only: - master jobs: - - ci-benchmark-edge - - ci-benchmark-latest: + - ci-benchmark: + name: edge-ecommerce-inventory + redisearch_version: edge + use_case: "ecommerce-inventory" + - ci-benchmark: + name: edge-nyc_taxis-ft.add + redisearch_version: edge + use_case: "nyc_taxis-ft.add" + requires: + - edge-ecommerce-inventory + - ci-benchmark: + name: edge-nyc_taxis-hashes + redisearch_version: edge + use_case: "nyc_taxis-hashes" requires: - - ci-benchmark-edge + - edge-nyc_taxis-ft.add diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index 7052f6e..0000000 --- a/.dockerignore +++ /dev/null @@ -1,8 +0,0 @@ - -contrib/ -docs/ -bin/ -.circleci/ -README.md -LICENSE -coverage.txt \ No newline at end of file diff --git a/.github/workflows/ci-benchmarks.yml b/.github/workflows/ci-benchmarks.yml index 3f26179..c781bdc 100644 --- a/.github/workflows/ci-benchmarks.yml +++ b/.github/workflows/ci-benchmarks.yml @@ -12,13 +12,14 @@ jobs: strategy: matrix: go: [ '1.14'] - redisearch_version: ['edge','latest'] + redisearch_version: ['edge'] + use_case: ['ecommerce-inventory','nyc_taxis-ft.add','nyc_taxis-hashes'] services: redis: image: redislabs/redisearch:${{ matrix.redisearch_version }} ports: - 6379:6379 - name: Benchmark redisearch:${{ matrix.redisearch_version }} with Go ${{ matrix.go }} + name: Benchmark ${{ matrix.use_case }} redisearch:${{ matrix.redisearch_version }} with Go ${{ matrix.go }} steps: - uses: 
actions/checkout@v2 - name: Build and Run Benchmark @@ -35,18 +36,16 @@ jobs: mkdir -p $GOPATH/src/github.com/$GITHUB_REPOSITORY mv $(pwd)/* $GOPATH/src/github.com/$GITHUB_REPOSITORY cd $GOPATH/src/github.com/$GITHUB_REPOSITORY - go get ./... - go test ./... - go install ./... + make test sudo apt install python3.6 -y sudo apt install python3-pip -y sudo apt-get install python3-setuptools -y cd $GOPATH/src/github.com/$GITHUB_REPOSITORY sudo python3 -m pip install wheel - python3 -m pip install redisbench-admin==0.1.10 + python3 -m pip install redisbench-admin==0.1.12 ~/.local/bin/redisbench-admin run \ - --repetitions 7 \ + --repetitions 3 \ --output-file-prefix github-actions \ --upload-results-s3 \ - --benchmark-config-file https://s3.amazonaws.com/benchmarks.redislabs/redisearch/datasets/ecommerce-inventory/ecommerce-inventory.redisearch.cfg.json + --benchmark-config-file https://s3.amazonaws.com/benchmarks.redislabs/redisearch/datasets/${{ matrix.use_case }}/${{ matrix.use_case }}.redisearch.cfg.json diff --git a/.gitignore b/.gitignore index c578455..d1563e2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,14 +1,19 @@ # FTSB outputs # ################ -cmd/ftsb_generate_redisearch/__pycache__/* -cmd/ftsb_generate_redisearch/nyc_taxis/tmp/* cmd/ftsb_redisearch/ftsb_redisearch +################### +# Data generators # +################### *.txt *.csv -cmd/ftsb_generate_data/*.csv +*.pyc +*__pycache__* +scripts/datagen_redisearch/__pycache__/** +scripts/datagen_redisearch/nyc_taxis/tmp/* +################### # Idea / others # ################# diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 7a35dbd..0000000 --- a/Dockerfile +++ /dev/null @@ -1,24 +0,0 @@ -FROM golang:1.13.5-buster AS builder - -# Copy the code from the host and compile it -WORKDIR $GOPATH/src/github.com/RediSearch/ftsb -COPY . ./ -RUN cd $GOPATH/src/github.com/RediSearch/ftsb/cmd && CGO_ENABLED=0 GOOS=linux go get ./... 
-RUN cd $GOPATH/src/github.com/RediSearch/ftsb/cmd/ftsb_generate_data && CGO_ENABLED=0 GOOS=linux go build -o /tmp/ftsb_generate_data -RUN cd $GOPATH/src/github.com/RediSearch/ftsb/cmd/ftsb_generate_queries && CGO_ENABLED=0 GOOS=linux go build -o /tmp/ftsb_generate_queries -RUN cd $GOPATH/src/github.com/RediSearch/ftsb/cmd/ftsb_load_redisearch && CGO_ENABLED=0 GOOS=linux go build -o /tmp/ftsb_load_redisearch -RUN cd $GOPATH/src/github.com/RediSearch/ftsb/cmd/ftsb_run_queries_redisearch && CGO_ENABLED=0 GOOS=linux go build -o /tmp/ftsb_run_queries_redisearch - -FROM golang:1.13.5-buster -ENV PATH ./:$PATH -COPY --from=builder /tmp/ftsb_generate_data ./ -COPY --from=builder /tmp/ftsb_generate_queries ./ -COPY --from=builder /tmp/ftsb_load_redisearch ./ -COPY --from=builder /tmp/ftsb_run_queries_redisearch ./ -RUN apt-get update && apt-get install redis-tools -y -RUN apt-get update && apt-get install bzip2 -y -COPY docker_entrypoint.sh ./ -COPY scripts ./scripts -RUN chmod -R 751 scripts -RUN chmod 751 docker_entrypoint.sh -ENTRYPOINT ["./docker_entrypoint.sh"] \ No newline at end of file diff --git a/README.md b/README.md index 8870ead..1bf42ee 100644 --- a/README.md +++ b/README.md @@ -14,9 +14,7 @@ including RediSearch. This code is based on a fork of work initially made public by TSBS at https://github.com/timescale/tsbs. -Current databases supported: -+ RediSearch ## Overview The Full-Text Search Benchmark (FTSB) is a collection of Python and Go programs that are used to generate datasets (Python) and then benchmark(Go) read and write performance of various databases. The intent is to make the FTSB extensible so that a variety of use cases (e.g., ecommerce, jsondata, logs, etc.), query types, and databases can be included and benchmarked. @@ -27,6 +25,26 @@ To this end, we hope to help SAs, and prospective database administrators find t FTSB is used to benchmark bulk load performance and query execution performance. 
To accomplish this in a fair way, the data to be inserted and the queries to run are always pre-generated and native Go clients are used wherever possible to connect to each database. +## Current databases supported + ++ RediSearch + +### Current use cases + +Currently, FTSB supports three use cases: +- **nyc_taxis** [[details here](docs/nyc_taxis-benchmark/description.md)]. This benchmark focuses on write performance, making use of TLC Trip Record Data that contains the rides that have been performed in yellow taxis in New York in 2015. In total, the benchmark loads >12M documents + + +- **enwiki-abstract** [[details here](docs/enwiki-abstract-benchmark/description.md)], from English-language [Wikipedia:Database](https://en.wikipedia.org/wiki/Wikipedia:Database_download) page abstracts. This use case generates +3 TEXT fields per document, and focuses on full text query performance. + + +- **ecommerce-inventory** [[details here](docs/ecommerce-inventory-benchmark/description.md)], from a base dataset of [10K fashion products on Amazon.com](https://data.world/promptcloud/fashion-products-on-amazon-com/workspace/file?filename=amazon_co-ecommerce_sample.csv) which are then multiplexed by categories, sellers, and countries to produce larger datasets > 1M docs. This benchmark focuses on updates and aggregate performance, splitting into Reads (FT.AGGREGATE), Cursor Reads (FT.CURSOR), and Updates (FT.ADD) the performance numbers. +The use case generates an index with 10 TAG fields (3 sortable and 1 non indexed), and 16 NUMERIC sortable non indexed fields per document. +The aggregate queries are designed to be extremely costly both on computation and network TX, given that on each query we're aggregating and filtering over a large portion of the dataset while additionally loading 21 fields. +Both the update and read rates can be adjusted. + + ### Installation @@ -40,6 +58,9 @@ cd $GOPATH/src/github.com/RediSearch/ftsb make ``` + + + ## How to use it? 
Using FTSB for benchmarking involves 2 phases: data and query generation, and query execution. diff --git a/docker_entrypoint.sh b/docker_entrypoint.sh deleted file mode 100755 index 383bb2a..0000000 --- a/docker_entrypoint.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh -echo "------------------------------------------------" -echo "Full-Text Search Benchmark (FTSB) - Docker Image" -echo "------------------------------------------------" -echo "Checking if request binary $1 exists" -if [ -f ./$1 ]; then - ./"$@" - echo - echo "...done." - exit 0 -else - echo "$1 binary does not exist." - exit 1 -fi diff --git a/docs/ecommerce-inventory-benchmark/description.md b/docs/ecommerce-inventory-benchmark/description.md new file mode 100644 index 0000000..1a3ec06 --- /dev/null +++ b/docs/ecommerce-inventory-benchmark/description.md @@ -0,0 +1,102 @@ +## Ecommerce inventory use case + +From a base dataset of 10K fashion products on Amazon.com which are then multiplexed by categories, sellers, and countries to produce larger datasets > 1M docs, this benchmark focuses on updates and aggregate performance, splitting into Reads (FT.AGGREGATE), and Updates (FT.ADD) the performance numbers. + +The aggregate queries are designed to be extremely costly both on computation and network TX, given that on each query we're aggregating and filtering over a large portion of the dataset while additionally loading 21 fields. + +Both the update and read rates can be adjusted. For a sample of each benchmark query see the Sample benchmark queries section. 
+ +### Example Document + +``` +{ + "market" : "US", + "nodeId" : "1", + "skuId" : "eac7efa5dbd3d667f26eb3d3ab504464", + "onhand" : "12433", + "onhandLastUpdatedTimestamp" : "1600104589", + "allocated" : "10520", + "allocatedLastUpdatedTimestamp" : "1600122420", + "reserved" : "5364", + "reservedLastUpdatedTimestamp" : "1600117671", + "storeAllocated" : "9361", + "storeAllocatedLastUpdatedTimestamp" : "1600170143", + "transferAllocated" : "53579", + "transferAllocatedLastUpdatedTimestamp" : "1600106905", + "storeReserved" : "20087", + "storeReservedLastUpdatedTimestamp" : "1600147472", + "confirmedQuantity" : "15220", + "standardSafetyStock" : "21812", + "bopisSafetyStock" : "27806", + "virtualHold" : "13166", + "availableToSource" : "true", + "standardAvailableToPromise" : "true", + "bopisAvailableToPromise" : "true", + "nodeType" : "store", + "brand" : "Hornby", + "onHold" : "false", + "exclusionType" : "false", +} +``` + +## Query types + +### Aggregate queries + +Aggregations are a way to process the results of a search query, group, sort and transform them - and extract analytic insights from them. Much like aggregation queries in other databases and search engines, they can be used to create analytics reports, or perform Faceted Search style queries. 
+ +|Query type|Description|Clauses included|Status| +|:---|:---|:---|:---| +|q1| Aggregate across 21 fields | | :heavy_check_mark: + +#### Sample q1 +``` +"FT.AGGREGATE" "inventory" "@market:{US} @skuId:{eee6f264563e61d690ceabb6a8bb28d6|2fe6e513acb7ace5aa7b89229083177b|d96328b0c08be37483282fed691e4d50|101778de5f9a4c8f134dd9abe60438c7|645963128e1f3df9fa6d6b0532108692|8879776477df76e540c82ca36ebd8b9e|35f998be32160c459d1d7b299e1cd7ec|f15ad197f43696982851a459ea1d98f8|58a29f6572df84c2bde26e473cc6fb0f|25d789a3b0651447a5a292670c971c21|0450c568b52f0e654616eef013d04746|052b9bc93b18747309483128313a2b76|72eeb30838e409980c344ed8608a7768|2953bdb96b2450c28c4ded0f9d30c2fc|d9f74437ffff1b055e79ae86dae2ff49|995777a914a1edb958a0da25b145643b|9aa711b367533c012cb110d7ebca844b|4ea4a1cf2b2136a8ba1c31f2708d8962|a015329569179c7775dcf5ce263330bb|f25ec41379c99e5e4fecfd967af82847|808d68823c96d75ee1ae869b6bb98133|556ac5522c39f050bd8a20767d8e1a4b|65beb05bc8e212af16a20a6970ac7b0c|8aa01bfb947d2aa80e65bc2dc34ec3da|fe62d40eda72db44d39d74f94a1168ff|2131da14a8f4f6d75270bdeb29b926cc|7474cf8f2da17c55263b4b192f18d77b|3e232649e15d7bf87055d30c372e3c8a|6ff572b7a995330f6bfd41a7d139da17|a50caf5a58d8a62571e2186c22c14654|1c2fb56fa30f483e270cb8a5441f7e8b|4b7bb58d0635bbfee6997658db1c1039|e7ac7e1effaa54cc409a09b711659416|42fccdc1368987b8b10486d060504d54|a801a071909b6561962df79b40ef28b0|ffee9727cfb5f53e12dd9d9fca0cf8aa|3a0fec0884138388e4c15ae8efca8d15|bf1143be4c3665e7ef1923428ca96cb4|74d2f09e30529fe19645cdaa86d5a79b|bd9749a58b96ebe1b4e9d66fad08ae7d|bdf9d6874160d07df78424271bdea6d7|d19c9fca1b537ed5bfe6f3ddd1ecd4fb|f157293c89ba2e116475d555992c778a|0eb15184a4050bbfa3bdc0c8789e57bf|25cdbda21e0f7e165c0136a7034e9304|0e3152aba3f7eb9cf2a84c2a0b56150c|76673c4eae2d8ecb09bdb9c880de40ab|d00536420b49a24024d1478353930c08|135c298448acf57a99fb36c785cb812c|1f8bbd31df2ceab3b67c9ac66e4729d3|3b78bdce914b8767a80203adad2f3ec7|8e6623cff18270ec508df41c93cc112a|be5728309319c53188ef6a40e7fe9bbc|808861a3a7355a080e0e1ad3cd51ba3f|8430d6b58ced27
973b22c318d9557139|b4cb7a143965035274bb0b7cdf245bf7|27b7e4f72488d6345bdcac8d2a03dbb6|29f0058f73d4aff6008ebb434ca65433|dfbc15b579cfb269c07ecaab8d3c2c6a|227db01ce18777bbf75dd39677bcb38a|b46bd73106737d8c2804d34aa04edcaf|92df5b755346feb7f3b0a93498e363e1|5e1126991cc8cb793549f3e0f199e7c1|9e9c317f3c2e4337d0f85f0ecb0c8a0d|a3b2210b1e090d68dc883ef120a7dcd3|eff388ea4bfddbbdc1b86cd5738f66e7|f69f3a25f851f3f56acfbc1bc2d5029e|dda2d6752683937a8c3cd338ccd034a7|42b2dc99bba449644798be12c197b959|342d2b01ed22bc6a889a4b33949d516f|87132b02061eac84e9261d9635996f8d|9cd7af6fbba4d82159c61183ecc3d8c4|e17d1b120237c88edfa1bfbb4e72cd09|95534049f7431eea318a6018bc1eff3f|4ea4a1cf2b2136a8ba1c31f2708d8962|c89a713edaf634e80ab5fcd349c0035d|7d47627e91b6fa1d313da8151e9d8ffd|7ad3004492c90f1cff628097ac52303b|a20b236a5a7e92515ba07d91bf32c0ab|227b29472146ddbe9cabd9d98ad453c5|a9a5ca5c6241deb0c809d006ee3ae26b|eeadf21a379a4c1d47eaf674b5ba321b|c94992bd3a1075962c5c566187826ce5|65a69f7accc778a9f7d576f45701d8ac|e67c3eb27ed74448111e9c8686606ddb|2657a9d69f0a695f23f6e227dc559eb8|76c13e372a882b41dd5232ef198bf886|cf8cbdc12ebc3f75dc145f2b665b429b|ee68a4d2e5ea7b6cc03a2634bac2f58e|a6ba5e11b1d3414e5d624448f2fc2e1f|59045d3273dd03bef983f4f0eefcb31b|c98df0aee64e88a3a2cb9a9b530fe72e|89ea26d69fe164bc4c96c0fd3a61b84c|555c900756b846482ed5821ee37e412d|1affc5987967a343ecf4aaff9e3d43c0|a904949d92b880508aa0ca19827db3a7|bae24935ccf8743d4e2a77a6d1df04ae|584acaeb3b2b569637f2dab72eb39b23|416e1c8974407729c41983cbfe08d3a1|33e64eaacbfc53e484f46fd4f79d3c5e} @nodeId:{482|1630|1138|964|92|587|1354|920|967|197|1312|642|358|952|1117|1211|1308|344|1532|1280|1109|1663|638|769|455|1032|233|527|417|127|1521|1447|614|208|329|1009|1632|1005|1260|1581|652|1012|1458|366|1438|1154|763|1017|248|1639|196|1073|39|297|1692|1304|1598|451|708|570|21|30|1344|1209|492|1316|524|508|1413|1499|538|1698|218|147|1184|277|5|1182|955|774|417|1378|845|835|475|667|653|288|1042|852|1059|1568|674|1162|917|942|1147|976|1306|325}" "LOAD" "21" "@market" "@skuId" "@nodeId" 
"@brand" "@nodeType" "@onhand" "@allocated" "@confirmedQuantity" "@reserved" "@virtualHold" "@availableToSource" "@standardAvailableToPromise" "@bopisAvailableToPromise" "@storeAllocated" "@bopisSafetyStock" "@transferAllocated" "@standardSafetyStock" "@storeReserved" "@availableToSource" "@exclusionType" "@onHold" "WITHCURSOR" "COUNT" "500" +``` + +## How to benchmark + +Using FTSB for benchmarking involves 2 phases: data and query generation, and query execution. +The following steps focus on how to retrieve the data and generate the commands for the nyc_taxis use case. + +## Generating the dataset + +To generate the required dataset command file issue: +``` +cd $GOPATH/src/github.com/RediSearch/ftsb/scripts/datagen_redisearch/enwiki_abstract +python3 ftsb_generate_enwiki-abstract.py +``` + +### Index properties +The use case generates an secondary index with 3 fields per document: +- 3 TEXT sortable fields. + +## Running the benchmark + +Assuming you have `redisbench-admin` and `ftsb_redisearch` installed, for the default dataset, run: + +``` +redisbench-admin run \ + --repetitions 3 \ + --benchmark-config-file https://s3.amazonaws.com/benchmarks.redislabs/redisearch/datasets/enwiki-abstract/enwiki-abstract.redisearch.cfg.json +``` + +### Key Metrics: +After running the benchmark you should have a result json file generated, containing key information about the benchmark run(s). 
+Focusing specifically on this benchmark the following metrics should be taken into account and will be used to automatically choose the best run and assess results variance, ordered by the following priority ( in case of results comparison ): + +#### Setup step key metrics +| Metric Family | Metric Name | Unit | Comparison mode | +|---------------|------------------------|--------------|------------------| +| Throughput | Overall Ingestion rate | docs/sec | higher is better | +| Latency | Overall ingestion p50 | milliseconds | lower is better | + +#### Benchmarking step key metrics +| Metric Family | Metric Name | Unit | Comparison mode | +|---------------|------------------------|--------------|------------------| +| Throughput | Overall Updates and Aggregates query rate | docs/sec | Higher is better | +| Latency | Overall Updates and Aggregates query q50 latency | milliseconds | Lower is better | +| Throughput | Overall Aggregates query rate | docs/sec | Higher is better | +| Latency | Overall Aggregates query q50 latency | milliseconds | Lower is better | +| Throughput | Overall Updates query rate | docs/sec | Higher is better | +| Latency | Overall Updates query q50 latency | milliseconds | Lower is better | \ No newline at end of file diff --git a/docs/enwiki-abstract-benchmark/description.md b/docs/enwiki-abstract-benchmark/description.md new file mode 100644 index 0000000..b1e14d3 --- /dev/null +++ b/docs/enwiki-abstract-benchmark/description.md @@ -0,0 +1,90 @@ +## English-language [Wikipedia:Database](https://en.wikipedia.org/wiki/Wikipedia:Database_download) page abstracts + + +### Example Document +This use case generates 5.9 million docs, with 3 Text fields per document. +On average each added document will have a size of 227 bytes. 
+ +``` +TODO: Add document snippet +``` + +## Query types + +### Full text search queries +|Query type|Description|Example|Status| +|:---|:---|:---|:---| +|simple-1word-query| Simple 1 Word Query | `Abraham` | :heavy_check_mark: +|2word-union-query| 2 Word Union Query | `Abraham Lincoln` | :heavy_check_mark: +|2word-intersection-query| 2 Word Intersection Query| `Abraham`|`Lincoln` | :heavy_check_mark: +|exact-3word-match| Exact 3 Word Match| `"President Abraham Lincoln"` |:heavy_multiplication_x: +|autocomplete-1100-top3| Autocomplete -1100 Top 2-3 Letter Prefixes| | :heavy_multiplication_x: +|2field-2word-intersection-query| 2 Fields, one word each, Intersection query | `@text_field1: text_value1 @text_field2: text_value2` | :heavy_multiplication_x: +|2field-1word-intersection-1numeric-range-query| 2 Fields, one text and another numeric, Intersection and numeric range query | `@text_field: text_value @numeric_field:[{min} {max}]` |:heavy_multiplication_x: + +### Spell Check queries + +Performs spelling correction on a query, returning suggestions for misspelled terms. +To simulate misspelled terms, for each word a deterministic random number of edits in the range 0..Min(word.length/2 , 4) is chosen. + +For each edit a random type of edit (delete, insert random char, replace with random char, switch adjacent chars). + +|Query type|Description|Example|Status| +|:---|:---|:---|:---| +| simple-1word-spellcheck | Simple 1 Word Spell Check Query | `FT.SPELLCHECK {index} reids DISTANCE 1` | :heavy_check_mark: + +### Autocomplete queries +|Query type|Description|Example|Status| +|:---|:---|:---|:---| +| | | `` | :heavy_multiplication_x: + + +### Aggregate queries + +Aggregations are a way to process the results of a search query, group, sort and transform them - and extract analytic insights from them. Much like aggregation queries in other databases and search engines, they can be used to create analytics reports, or perform Faceted Search style queries. 
+ +|Query type|Description|Clauses included|Status| +|:---|:---|:---|:---| +| | | `` | :heavy_multiplication_x: + +### Synonym queries +|Query type|Description|Example|Status| +|:---|:---|:---|:---| +| | | `` | :heavy_multiplication_x: + + +## How to benchmark + +Using FTSB for benchmarking involves 2 phases: data and query generation, and query execution. +The following steps focus on how to retrieve the data and generate the commands for the enwiki-abstract use case. + +## Generating the dataset + +To generate the required dataset command file issue: +``` +cd $GOPATH/src/github.com/RediSearch/ftsb/scripts/datagen_redisearch/enwiki_abstract +python3 ftsb_generate_enwiki-abstract.py +``` + +### Index properties +The use case generates a secondary index with 3 fields per document: +- 3 TEXT sortable fields. + +## Running the benchmark + +Assuming you have `redisbench-admin` and `ftsb_redisearch` installed, for the default dataset, run: + +``` +redisbench-admin run \ + --repetitions 3 \ + --benchmark-config-file https://s3.amazonaws.com/benchmarks.redislabs/redisearch/datasets/enwiki-abstract/enwiki-abstract.redisearch.cfg.json +``` + + +### Key Metrics: +After running the benchmark you should have a result json file generated, containing key information about the benchmark run(s). 
+Focusing specifically on this benchmark the following metrics should be taken into account and will be used to automatically choose the best run and assess results variance, ordered by the following priority ( in case of results comparison ): + +| Metric Family | Metric Name | Unit | Comparison mode | +|---------------|------------------------|--------------|------------------| +| | | | | diff --git a/scripts/datagen_redisearch/nyc_taxis/Readme.md b/docs/nyc_taxis-benchmark/description.md similarity index 72% rename from scripts/datagen_redisearch/nyc_taxis/Readme.md rename to docs/nyc_taxis-benchmark/description.md index 7459a58..be91cdf 100644 --- a/scripts/datagen_redisearch/nyc_taxis/Readme.md +++ b/docs/nyc_taxis-benchmark/description.md @@ -1,7 +1,7 @@ ## NYC taxis use case This benchmark focus himself on write performance, making usage of TLC Trip Record Data that contains the rides that have been performed in yellow taxis in New York in 2015. -On total, the benchmark loads >140M documents like the following one: +On total, the benchmark loads >12M documents like the following one: ### Example Document On average each added document will have a size of 500 bytes. @@ -27,7 +27,7 @@ On average each added document will have a size of 500 bytes. "vendor_id": "2" } ``` -Depending on the benchmark variation it uses either `FT.ADD` or `HSET` commands. +Depending on the benchmark variation it uses either `FT.ADD` or `HSET` commands. By default HSET will be used. ## How to benchmark @@ -39,17 +39,13 @@ The original dataset is present in https://www1.nyc.gov/site/tlc/about/tlc-trip- To generate the required dataset command file issue: ``` -cd $GOPATH/src/github.com/RediSearch/ftsb/cmd/ftsb_generate_redisearch/nyc_taxis +cd $GOPATH/src/github.com/RediSearch/ftsb/scripts/datagen_redisearch/nyc_taxis python3 ftsb_generate_nyc_taxis.py ``` -This will download 12 files for a temporary folder and preprocess them to be ingested. 
On total you should expected a large `nyc_taxis.redisearch.commands.ALL.tar.gz` file to be generated with 140M commands to be issued to the DB, alongside it's config json `nyc_taxis.redisearch.cfg.json`. +This will download 1 to 12 files (depending on the start and end date) to a temporary folder and preprocess them to be ingested. +In total you should expect a large `nyc_taxis.redisearch.commands.ALL.tar.gz` file to be generated with >12M commands to be issued to the DB, alongside its config json `nyc_taxis.redisearch.cfg.json`. - -Note: to generate a dataset proper for CI runs, issue the following command that will produce 12M commands to be issued to the DB. -``` -python3 ftsb_generate_nyc_taxis.py --yellow-tripdata-end-month 1 \ - --test-name nyc_taxis-hashes-CI -``` ### FT.ADD variation To generate the FT.ADD variations you just need to include the `use-ftadd` flag, as follow: @@ -67,15 +63,20 @@ The use case generates an secondary index with 18 fields per document: ## Running the benchmark -Assuming you have `redisbench-admin` and `ftsb_redisearch` installed, for the default dataset with 140M documents, run: +Assuming you have `redisbench-admin` and `ftsb_redisearch` installed, for the default dataset with >12M documents, run: ### HSET variation ``` - https://s3.amazonaws.com/benchmarks.redislabs/redisearch/datasets/nyc_taxis-hashes/nyc_taxis-hashes.redisearch.cfg.json +redisbench-admin run \ + --repetitions 3 \ + --benchmark-config-file https://s3.amazonaws.com/benchmarks.redislabs/redisearch/datasets/nyc_taxis-hashes/nyc_taxis-hashes.redisearch.cfg.json ``` + ### FT.ADD variation ``` - https://s3.amazonaws.com/benchmarks.redislabs/redisearch/datasets/nyc_taxis-ftadd/nyc_taxis-ftadd.redisearch.cfg.json +redisbench-admin run \ + --repetitions 3 \ + --benchmark-config-file https://s3.amazonaws.com/benchmarks.redislabs/redisearch/datasets/nyc_taxis-ft.add/nyc_taxis-ft.add.redisearch.cfg.json ``` ### Key Metrics: diff --git a/docs/use-cases.md 
b/docs/use-cases.md deleted file mode 100644 index 9ae33ed..0000000 --- a/docs/use-cases.md +++ /dev/null @@ -1,85 +0,0 @@ - -## Current use cases - -Currently, FTSB supports three use cases: - - **ecommerce-inventory**, From a base dataset of [10K fashion products on Amazon.com](https://data.world/promptcloud/fashion-products-on-amazon-com/workspace/file?filename=amazon_co-ecommerce_sample.csv) which are then multiplexed by categories, sellers, and countries to produce larger datasets > 1M docs. This benchmark focuses on updates and aggregate performance, splitting into Reads (FT.AGGREGATE), Cursor Reads (FT.CURSOR), and Updates (FT.ADD) the performance numbers. - The use case generates an index with 10 TAG fields (3 sortable and 1 non indexed), and 16 NUMERIC sortable non indexed fields per document. - The aggregate queries are designed to be extremely costly both on computation and network TX, given that on each query we're aggregating and filtering over a large portion of the dataset while additionally loading 21 fields. - Both the update and read rates can be adjusted. - - - - **enwiki-abstract**, From English-language [Wikipedia:Database](https://en.wikipedia.org/wiki/Wikipedia:Database_download) page abstracts. This use case generates -3 TEXT fields per document. - - - - **enwiki-pages**, From English-language [Wikipedia:Database](https://en.wikipedia.org/wiki/Wikipedia:Database_download) last page revisions, containing processed metadata extracted from the full Wikipedia XML dump. - This use case generates 4 TEXT fields ( 2 sortable ), 1 sortable TAG field, and 6 sortable NUMERIC fields per document. - - - -## Appendix I: Query types - -### Appendix I.I - English-language [Wikipedia:Database](https://en.wikipedia.org/wiki/Wikipedia:Database_download) page abstracts. 
-#### Full text search queries -|Query type|Description|Example|Status| -|:---|:---|:---|:---| -|simple-1word-query| Simple 1 Word Query | `Abraham` | :heavy_check_mark: -|2word-union-query| 2 Word Union Query | `Abraham Lincoln` | :heavy_check_mark: -|2word-intersection-query| 2 Word Intersection Query| `Abraham`|`Lincoln` | :heavy_check_mark: -|exact-3word-match| Exact 3 Word Match| `"President Abraham Lincoln"` |:heavy_multiplication_x: -|autocomplete-1100-top3| Autocomplete -1100 Top 2-3 Letter Prefixes| | :heavy_multiplication_x: -|2field-2word-intersection-query| 2 Fields, one word each, Intersection query | `@text_field1: text_value1 @text_field2: text_value2` | :heavy_multiplication_x: -|2field-1word-intersection-1numeric-range-query| 2 Fields, one text and another numeric, Intersection and numeric range query | `@text_field: text_value @numeric_field:[{min} {max}]` |:heavy_multiplication_x: - -#### Spell Check queries - -Performs spelling correction on a query, returning suggestions for misspelled terms. -To simmulate misspelled terms, for each word a deterministic random number of edits in the range 0..Min(word.length/2 , 4) is chosen. - - -For each edit a random type of edit (delete, insert random char, replace with random char, switch adjacent chars). - -|Query type|Description|Example|Status| -|:---|:---|:---|:---| -| simple-1word-spellcheck | Simple 1 Word Spell Check Query | `FT.SPELLCHECK {index} reids DISTANCE 1` | :heavy_check_mark: - -#### Autocomplete queries -|Query type|Description|Example|Status| -|:---|:---|:---|:---| -| | | `` | :heavy_multiplication_x: - - -#### Aggregate queries - -Aggregations are a way to process the results of a search query, group, sort and transform them - and extract analytic insights from them. Much like aggregation queries in other databases and search engines, they can be used to create analytics reports, or perform Faceted Search style queries. 
- -|Query type|Description|Clauses included|Status| -|:---|:---|:---|:---| -| | | `` | :heavy_multiplication_x: - -#### Synonym queries -|Query type|Description|Example|Status| -|:---|:---|:---|:---| -| | | `` | :heavy_multiplication_x: - - -### Appendix I.II - English-language [Wikipedia:Database](https://en.wikipedia.org/wiki/Wikipedia:Database_download) last page revisions. - -#### Aggregate queries - -Aggregations are a way to process the results of a search query, group, sort and transform them - and extract analytic insights from them. Much like aggregation queries in other databases and search engines, they can be used to create analytics reports, or perform Faceted Search style queries. - -|Query #|Query type|Description| Status| -|:---|:---|:---|:---| -| 1 | agg-1-editor-1year-exact-page-contributions-by-day | One year period, Exact Number of contributions by day, ordered chronologically, for a given editor [(supplemental docs)](docs/redisearch.md#Q1) | :heavy_check_mark: -| 2 | agg-2-*-1month-exact-distinct-editors-by-hour | One month period, Exact Number of distinct editors contributions by hour, ordered chronologically [(supplemental docs)](docs/redisearch.md#Q2) |:heavy_check_mark: -| 3 | agg-3-*-1month-approximate-distinct-editors-by-hour | One month period, Approximate Number of distinct editors contributions by hour, ordered chronologically [(supplemental docs)](docs/redisearch.md#Q3) | :heavy_check_mark: -| 4 | agg-4-*-1day-approximate-page-contributions-by-5minutes-by-editor-username | One day period, Approximate Number of contributions by 5minutes interval by editor username, ordered first chronologically and second alphabetically by Revision editor username [(supplemental docs)](docs/redisearch.md#Q4) |:heavy_check_mark: -| 5 | agg-5-*-1month-approximate-top10-editor-usernames | One month period, Approximate All time Top 10 Revision editor usernames. 
[(supplemental docs)](docs/redisearch.md#Q5) | :heavy_check_mark: -| 6 | agg-6-*-1month-approximate-top10-editor-usernames-by-namespace | One month period, Approximate All time Top 10 Revision editor usernames by number of Revisions broken by namespace (TAG field) [(supplemental docs)](docs/redisearch.md#Q6) | :heavy_check_mark: -| 7 | agg-7-*-1month-avg-revision-content-length-by-editor-username | One month period, Top 10 editor username by average revision content [(supplemental docs)](docs/redisearch.md#Q7) | :heavy_check_mark: -| 8 | agg-8-editor-approximate-avg-editor-contributions-by-year | Approximate average number of contributions by year each editor makes [(supplemental docs)](docs/redisearch.md#Q8) | :heavy_check_mark: - - - - \ No newline at end of file diff --git a/go.mod b/go.mod index 8056db3..2107c4f 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/RediSearch/ftsb -go 1.14 +go 1.13 require ( code.cloudfoundry.org/bytefmt v0.0.0-20200131002437-cf55d5288a48 diff --git a/scripts/datagen_redisearch/__pycache__/common.cpython-38.pyc b/scripts/datagen_redisearch/__pycache__/common.cpython-38.pyc deleted file mode 100644 index 3402fb4..0000000 Binary files a/scripts/datagen_redisearch/__pycache__/common.cpython-38.pyc and /dev/null differ diff --git a/scripts/datagen_redisearch/ecommerce_inventory/ftsb_generate_ecommerce_inventory.py b/scripts/datagen_redisearch/ecommerce_inventory/ftsb_generate_ecommerce_inventory.py index 03b657f..38a4897 100644 --- a/scripts/datagen_redisearch/ecommerce_inventory/ftsb_generate_ecommerce_inventory.py +++ b/scripts/datagen_redisearch/ecommerce_inventory/ftsb_generate_ecommerce_inventory.py @@ -167,7 +167,7 @@ def generate_ft_aggregate_row(index, countries_alpha_3, countries_alpha_p, maxSk skuId_list = random.choices(skus, k=maxSkusList) nodeId_list = random.choices(nodes, k=maxNodesList) - cmd = ["READ", "R1", "FT.AGGREGATE", "{index}".format(index=index), + cmd = ["READ", "R1", 1, "FT.AGGREGATE", 
"{index}".format(index=index), "@market:{{{0}}} @skuId:{{{1}}} @nodeId:{{{2}}}".format(market, "|".join(skuId_list), "|".join(nodeId_list)) @@ -180,7 +180,7 @@ def generate_ft_aggregate_row(index, countries_alpha_3, countries_alpha_p, maxSk def generate_ft_add_row(index, doc): - cmd = ["SETUP_WRITE", "S1", "FT.ADD", "{index}".format(index=index), + cmd = ["SETUP_WRITE", "S1", 2, "FT.ADD", "{index}".format(index=index), "{index}-{doc_id}".format(index=index, doc_id=doc["doc_id"]), 1.0, "REPLACE", "FIELDS"] for f, v in doc["schema"].items(): cmd.append(f) @@ -204,7 +204,7 @@ def generate_ft_drop_row(index): def generate_ft_add_update_row(indexname, doc): - cmd = ["UPDATE", "U1", "FT.ADD", "{index}".format(index=indexname), + cmd = ["UPDATE", "U1", 2, "FT.ADD", "{index}".format(index=indexname), "{index}-{doc_id}".format(index=indexname, doc_id=doc["doc_id"]), 1.0, "REPLACE", "PARTIAL", "FIELDS"] TRUES = "true" @@ -538,7 +538,7 @@ def generate_benchmark_commands(): inputs = {"all": inputs_entry_all, "setup": inputs_entry_setup, "benchmark": inputs_entry_benchmark} deployment_requirements = init_deployment_requirement() - add_deployment_requirements_redis_server_module(deployment_requirements, "ft", {}) + add_deployment_requirements_redis_server_module(deployment_requirements, "search", {}) add_deployment_requirements_utilities(deployment_requirements, "ftsb_redisearch", {}) add_deployment_requirements_benchmark_tool(deployment_requirements, "ftsb_redisearch") diff --git a/scripts/datagen_redisearch/enwiki_abstract/ftsb_generate_data_wiki.py b/scripts/datagen_redisearch/enwiki_abstract/ftsb_generate_enwiki_abstract.py similarity index 100% rename from scripts/datagen_redisearch/enwiki_abstract/ftsb_generate_data_wiki.py rename to scripts/datagen_redisearch/enwiki_abstract/ftsb_generate_enwiki_abstract.py diff --git a/scripts/datagen_redisearch/nyc_taxis/ftsb_generate_nyc_taxis.py b/scripts/datagen_redisearch/nyc_taxis/ftsb_generate_nyc_taxis.py index e1e023b..1432837 
100644 --- a/scripts/datagen_redisearch/nyc_taxis/ftsb_generate_nyc_taxis.py +++ b/scripts/datagen_redisearch/nyc_taxis/ftsb_generate_nyc_taxis.py @@ -314,7 +314,7 @@ def use_case_csv_row_to_cmd(row, index_types, use_ftadd, total_amount_pos, impro inputs = {"all": inputs_entry_all, "benchmark": inputs_entry_all} deployment_requirements = init_deployment_requirement() - add_deployment_requirements_redis_server_module(deployment_requirements, "ft", {}) + add_deployment_requirements_redis_server_module(deployment_requirements, "search", {}) add_deployment_requirements_utilities(deployment_requirements, "ftsb_redisearch", {}) add_deployment_requirements_benchmark_tool(deployment_requirements, "ftsb_redisearch")