Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create-Workload Improvements: Write Test Procedures and Operations into Separate Directories and Files #397

Closed
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -115,3 +115,6 @@ recipes/ccr/ccr-target-hosts.json

# Tracker tracks
tracks/

# Visual Studio Code for Contributors
.vscode/
37 changes: 6 additions & 31 deletions osbenchmark/resources/base-workload.json.j2
Original file line number Diff line number Diff line change
Expand Up @@ -22,35 +22,10 @@
]
}{% endfor %}
],
"schedule": [
{
"operation": "delete-index"
},{% raw %}
{
"operation": {
"operation-type": "create-index",
"settings": {{index_settings | default({}) | tojson}}
}
},{% endraw %}
{
"operation": {
"operation-type": "cluster-health",
"index": {{ indices | map(attribute='name') | list | join(',') | tojson }},{% raw %}
"request-params": {
"wait_for_status": "{{cluster_health | default('green')}}",
"wait_for_no_relocating_shards": "true"
},
"retry-until-success": true
}
},
{
"operation": {
"operation-type": "bulk",
"bulk-size": {{bulk_size | default(5000)}},
"ingest-percentage": {{ingest_percentage | default(100)}}
},
"clients": {{bulk_indexing_clients | default(8)}}
},{% endraw -%}
{% block queries %}{% endblock %}
"operations": [
{% raw %}{{ benchmark.collect(parts="operations/*.json") }}{% endraw %}
],
"test_procedures": [
{% raw %}{{ benchmark.collect(parts="test_procedures/*.json") }}{% endraw %}
]
}
}
22 changes: 22 additions & 0 deletions osbenchmark/resources/custom-operations.json.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"operation": {
Copy link
Collaborator

@IanHoang IanHoang Oct 18, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The operations file includes extraneous fields for each operation.
Ingest operations should include at minimum name, operation-type, bulk-size, and ingest-percentage fields.

      "name": "index",
      "operation-type": "bulk",
      "bulk-size": {{bulk_size | default(10000)}},
      "ingest-percentage": {{ingest_percentage | default(100)}}
    }

Each search operation should include just the name, operation-type, index, and body fields.

        "name": "default",
        "operation-type": "search",
        "index": {{ indices | map(attribute='name') | list | join(',') | tojson }},
        "body": {
          "query": {
            "match_all": {}
          }
        }

Fields like search_clients or bulk_indexing_clients belong in test_procedures file. For reference, see NYC_Taxis workload's operations file: https://github.com/opensearch-project/opensearch-benchmark-workloads/blob/main/nyc_taxis/operations/default.json

"name": "index-append",
"operation-type": "bulk",
"bulk-size": {{bulk_size | default(5000)}},
"ingest-percentage": {{ingest_percentage | default(100)}}
},{% raw %}
"clients": {{bulk_indexing_clients | default(8)}}
},{% endraw %}
{
"operation": {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For custom operations, we need to remove the default operation. This should only be included in the default operations file. Users might not want a match_all query included in their workload if they are already providing their own queries.

"name": "default",
"operation-type": "search",
"index": {{ indices | map(attribute='name') | list | join(',') | tojson }},
"body": {
"query": {
"match_all": {}
}
}
},{% raw %}
"clients": {{search_clients | default(8)}}
}{% endraw %}
14 changes: 0 additions & 14 deletions osbenchmark/resources/custom-query-workload.json.j2

This file was deleted.

42 changes: 42 additions & 0 deletions osbenchmark/resources/custom-test-procedures.json.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{
"name": "append-no-conflicts",
"description": "Indexes the whole document corpus using OpenSearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Benchmark will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only.",
"default": true,
"schedule": [
{
"operation": "delete-index"
},{% raw %}
{
"operation": {
"operation-type": "create-index",
"settings": {{index_settings | default({}) | tojson}}
}
},{% endraw %}
{
"operation": {
"operation-type": "cluster-health",
"index": {{ indices | map(attribute='name') | list | join(',') | tojson }},{% raw %}
"request-params": {
"wait_for_status": "{{cluster_health | default('green')}}",
"wait_for_no_relocating_shards": "true"
},
"retry-until-success": true
}
},
{% endraw -%}
{%- block queries -%}
{% for query in custom_queries %}
{
"operation": {
"name": "{{query.name}}",
"operation-type": "{{query['operation-type']}}",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We just need to specify the name field and not the operation-type, index, and body since the operations are already defined in the operations/default.json directory / file. Instead, we should add parameters that the user can insert such as warmup-iterations, iterations, and search_clients. Use this search operation as reference: https://github.com/opensearch-project/opensearch-benchmark-workloads/blob/main/nyc_taxis/test_procedures/default.json#L56-L69

"index": {{ indices | map(attribute='name') | list | join(',') | tojson }},
"body": {{query.body | replace("'", '"') }}
}
}{% if not loop.last %},{% endif -%}
{% endfor %}
{%- endblock %}
}
]
}

16 changes: 16 additions & 0 deletions osbenchmark/resources/default-operations.json.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"name": "index-append",
"operation-type": "bulk",
"bulk-size": {{bulk_size | default(5000)}},
"ingest-percentage": {{ingest_percentage | default(100)}}
},
{
"name": "default",
"operation-type": "search",
"index": {{ indices | map(attribute='name') | list | join(',') | tojson }},
"body": {
"query": {
"match_all": {}
}
}
}
16 changes: 0 additions & 16 deletions osbenchmark/resources/default-query-workload.json.j2

This file was deleted.

29 changes: 29 additions & 0 deletions osbenchmark/resources/default-test-procedures.json.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"name": "append-no-conflicts",
"description": "Indexes the whole document corpus using OpenSearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Benchmark will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only.",
"default": true,
"schedule": [
{
"operation": "delete-index"
},{% raw %}
{
"operation": {
"operation-type": "create-index",
"settings": {{index_settings | default({}) | tojson}}
}
},{% endraw %}
{
"operation": {
"operation-type": "cluster-health",
"index": {{ indices | map(attribute='name') | list | join(',') | tojson }},{% raw %}
"request-params": {
"wait_for_status": "{{cluster_health | default('green')}}",
"wait_for_no_relocating_shards": "true"
},
"retry-until-success": true
}
}
{% endraw -%}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is why the test is only running delete-index, create-index, and cluster-health. It also needs the default ingestion and search operations.

]
}

21 changes: 21 additions & 0 deletions osbenchmark/resources/test-procedures.json.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"operation": "delete-index"
},{% raw %}
{
"operation": {
"operation-type": "create-index",
"settings": {{index_settings | default({}) | tojson}}
}
},{% endraw %}
{
"operation": {
"operation-type": "cluster-health",
"index": {{ indices | map(attribute='name') | list | join(',') | tojson }},{% raw %}
"request-params": {
"wait_for_status": "{{cluster_health | default('green')}}",
"wait_for_no_relocating_shards": "true"
},
"retry-until-success": true
}
},{% endraw -%}
{% block queries %}{% endblock %}
49 changes: 38 additions & 11 deletions osbenchmark/workload_generator/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
Expand Down Expand Up @@ -42,12 +42,12 @@ def template_vars(index_name, out_path, doc_count):
"path": comp_outpath,
"doc_count": doc_count,
"uncompressed_bytes": os.path.getsize(out_path),
"compressed_bytes": os.path.getsize(comp_outpath)
"compressed_bytes": os.path.getsize(comp_outpath),
}


def get_doc_outpath(outdir, name, suffix=""):
return os.path.join(outdir, f"{name}-documents{suffix}.json")
def get_doc_outpath(outdir, suffix=""):
    """Build the path to a corpus documents file inside *outdir*.

    The optional *suffix* (e.g. "-1k") is inserted before the ".json"
    extension to distinguish variants such as the test-mode subset.
    """
    filename = "documents{}.json".format(suffix)
    return os.path.join(outdir, filename)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why was name removed?



def extract(client, output_path, index, number_of_docs_requested=None):
Expand All @@ -64,16 +64,34 @@ def extract(client, output_path, index, number_of_docs_requested=None):

number_of_docs = client.count(index=index)["count"]

total_docs = number_of_docs if not number_of_docs_requested else min(number_of_docs, number_of_docs_requested)
total_docs = (
number_of_docs
if not number_of_docs_requested
else min(number_of_docs, number_of_docs_requested)
)

if total_docs > 0:
logger.info("[%d] total docs in index [%s]. Extracting [%s] docs.", number_of_docs, index, total_docs)
docs_path = get_doc_outpath(output_path, index)
dump_documents(client, index, get_doc_outpath(output_path, index, "-1k"), min(total_docs, 1000), " for test mode")
logger.info(
"[%d] total docs in index [%s]. Extracting [%s] docs.",
number_of_docs,
index,
total_docs,
)
docs_path = get_doc_outpath(output_path)
dump_documents(
client,
index,
get_doc_outpath(output_path, "-1k"),
min(total_docs, 1000),
" for test mode",
)
dump_documents(client, index, docs_path, total_docs)
return template_vars(index, docs_path, total_docs)
else:
logger.info("Skipping corpus extraction fo index [%s] as it contains no documents.", index)
logger.info(
"Skipping corpus extraction fo index [%s] as it contains no documents.",
index,
)
return None


Expand All @@ -94,12 +112,21 @@ def dump_documents(client, index, out_path, number_of_docs, progress_message_suf
for n, doc in enumerate(helpers.scan(client, query=query, index=index)):
if n >= number_of_docs:
break
data = (json.dumps(doc["_source"], separators=(",", ":")) + "\n").encode("utf-8")
data = (
json.dumps(doc["_source"], separators=(",", ":")) + "\n"
).encode("utf-8")

outfile.write(data)
comp_outfile.write(compressor.compress(data))

render_progress(progress, progress_message_suffix, index, n + 1, number_of_docs, freq)
render_progress(
progress,
progress_message_suffix,
index,
n + 1,
number_of_docs,
freq,
)

comp_outfile.write(compressor.flush())
progress.finish()
Expand Down
38 changes: 21 additions & 17 deletions osbenchmark/workload_generator/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
Expand All @@ -26,14 +26,16 @@
import logging
import os

INDEX_SETTINGS_EPHEMERAL_KEYS = ["uuid",
"creation_date",
"version",
"provided_name",
"store"]
INDEX_SETTINGS_EPHEMERAL_KEYS = [
"uuid",
"creation_date",
"version",
"provided_name",
"store",
]
INDEX_SETTINGS_PARAMETERS = {
"number_of_replicas": "{{{{number_of_replicas | default({orig})}}}}",
"number_of_shards": "{{{{number_of_shards | default({orig})}}}}"
"number_of_shards": "{{{{number_of_shards | default({orig})}}}}",
}


Expand Down Expand Up @@ -81,13 +83,13 @@ def extract_index_mapping_and_settings(client, index_pattern):
valid, reason = is_valid(index)
if valid:
mappings = details["mappings"]
index_settings = filter_ephemeral_index_settings(details["settings"]["index"])
index_settings = filter_ephemeral_index_settings(
details["settings"]["index"]
)
update_index_setting_parameters(index_settings)
results[index] = {
"mappings": mappings,
"settings": {
"index": index_settings
}
"settings": {"index": index_settings},
}
else:
logger.info("Skipping index [%s] (reason: %s).", index, reason)
Expand All @@ -107,14 +109,16 @@ def extract(client, outdir, index_pattern):

index_obj = extract_index_mapping_and_settings(client, index_pattern)
for index, details in index_obj.items():
filename = f"{index}.json"
filename = f"index.json"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did you simplify this to just be named as index.json?

outpath = os.path.join(outdir, filename)
with open(outpath, "w") as outfile:
json.dump(details, outfile, indent=4, sort_keys=True)
outfile.write("\n")
results.append({
"name": index,
"path": outpath,
"filename": filename,
})
results.append(
{
"name": index,
"path": outpath,
"filename": filename,
}
)
return results
Loading