Create-Workload Improvements: Write Test Procedures and Operations into Separate Directories and Files #397
Changes from 7 commits
@@ -115,3 +115,6 @@ recipes/ccr/ccr-target-hosts.json

# Tracker tracks
tracks/

# Visual Studio Code for Contributors
.vscode/
@@ -0,0 +1,22 @@
{
  "operation": {
    "name": "index-append",
    "operation-type": "bulk",
    "bulk-size": {{bulk_size | default(5000)}},
    "ingest-percentage": {{ingest_percentage | default(100)}}
  },{% raw %}
  "clients": {{bulk_indexing_clients | default(8)}}
},{% endraw %}
{
  "operation": {
Review comment: For custom operations, we need to remove the
"name": "default", | ||
"operation-type": "search", | ||
"index": {{ indices | map(attribute='name') | list | join(',') | tojson }}, | ||
"body": { | ||
"query": { | ||
"match_all": {} | ||
} | ||
} | ||
},{% raw %} | ||
"clients": {{search_clients | default(8)}} | ||
}{% endraw %} |
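To make the comment above concrete (a sketch only, inferred from the reviewer's later guidance at the end of this page rather than taken from this PR), each entry in the operations template would keep just the operation fields, with the clients wrapper dropped:

{
  "name": "index-append",
  "operation-type": "bulk",
  "bulk-size": {{bulk_size | default(5000)}},
  "ingest-percentage": {{ingest_percentage | default(100)}}
}

Concurrency settings such as bulk_indexing_clients and search_clients would then be configured where the test procedure schedules the operation (see the sketch after the final review comment).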
This file was deleted.
@@ -0,0 +1,42 @@
{
  "name": "append-no-conflicts",
  "description": "Indexes the whole document corpus using OpenSearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Benchmark will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only.",
  "default": true,
  "schedule": [
    {
      "operation": "delete-index"
    },{% raw %}
    {
      "operation": {
        "operation-type": "create-index",
        "settings": {{index_settings | default({}) | tojson}}
      }
    },{% endraw %}
    {
      "operation": {
        "operation-type": "cluster-health",
        "index": {{ indices | map(attribute='name') | list | join(',') | tojson }},{% raw %}
        "request-params": {
          "wait_for_status": "{{cluster_health | default('green')}}",
          "wait_for_no_relocating_shards": "true"
        },
        "retry-until-success": true
      }
    },
    {% endraw -%}
    {%- block queries -%}
    {% for query in custom_queries %}
    {
      "operation": {
        "name": "{{query.name}}",
        "operation-type": "{{query['operation-type']}}",
Review comment: We just need to specify the
"index": {{ indices | map(attribute='name') | list | join(',') | tojson }}, | ||
"body": {{query.body | replace("'", '"') }} | ||
} | ||
}{% if not loop.last %},{% endif -%} | ||
{% endfor %} | ||
{%- endblock %} | ||
} | ||
] | ||
} | ||
|
@@ -0,0 +1,16 @@
{
  "name": "index-append",
  "operation-type": "bulk",
IanHoang marked this conversation as resolved.
"bulk-size": {{bulk_size | default(5000)}}, | ||
"ingest-percentage": {{ingest_percentage | default(100)}} | ||
}, | ||
{ | ||
"name": "default", | ||
"operation-type": "search", | ||
"index": {{ indices | map(attribute='name') | list | join(',') | tojson }}, | ||
"body": { | ||
"query": { | ||
"match_all": {} | ||
} | ||
} | ||
} |
This file was deleted.
@@ -0,0 +1,29 @@
{
  "name": "append-no-conflicts",
  "description": "Indexes the whole document corpus using OpenSearch default settings. We only adjust the number of replicas as we benchmark a single node cluster and Benchmark will only start the benchmark if the cluster turns green. Document ids are unique so all index operations are append only.",
  "default": true,
  "schedule": [
    {
      "operation": "delete-index"
    },{% raw %}
    {
      "operation": {
        "operation-type": "create-index",
        "settings": {{index_settings | default({}) | tojson}}
      }
    },{% endraw %}
    {
      "operation": {
        "operation-type": "cluster-health",
        "index": {{ indices | map(attribute='name') | list | join(',') | tojson }},{% raw %}
        "request-params": {
          "wait_for_status": "{{cluster_health | default('green')}}",
          "wait_for_no_relocating_shards": "true"
        },
        "retry-until-success": true
      }
    }
    {% endraw -%}
Review comment: This is why the test is only running
  ]
}
@@ -0,0 +1,21 @@
{
  "operation": "delete-index"
},{% raw %}
{
  "operation": {
    "operation-type": "create-index",
    "settings": {{index_settings | default({}) | tojson}}
  }
},{% endraw %}
{
  "operation": {
    "operation-type": "cluster-health",
    "index": {{ indices | map(attribute='name') | list | join(',') | tojson }},{% raw %}
    "request-params": {
      "wait_for_status": "{{cluster_health | default('green')}}",
      "wait_for_no_relocating_shards": "true"
    },
    "retry-until-success": true
  }
},{% endraw -%}
IanHoang marked this conversation as resolved.
{% block queries %}{% endblock %}
@@ -13,7 +13,7 @@
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an

@@ -42,12 +42,12 @@ def template_vars(index_name, out_path, doc_count):
        "path": comp_outpath,
        "doc_count": doc_count,
        "uncompressed_bytes": os.path.getsize(out_path),
        "compressed_bytes": os.path.getsize(comp_outpath)
        "compressed_bytes": os.path.getsize(comp_outpath),
    }


def get_doc_outpath(outdir, name, suffix=""):
    return os.path.join(outdir, f"{name}-documents{suffix}.json")
def get_doc_outpath(outdir, suffix=""):
    return os.path.join(outdir, f"documents{suffix}.json")
Review comment: Why was

def extract(client, output_path, index, number_of_docs_requested=None):

@@ -64,16 +64,34 @@ def extract(client, output_path, index, number_of_docs_requested=None):

    number_of_docs = client.count(index=index)["count"]

    total_docs = number_of_docs if not number_of_docs_requested else min(number_of_docs, number_of_docs_requested)
    total_docs = (
        number_of_docs
        if not number_of_docs_requested
        else min(number_of_docs, number_of_docs_requested)
    )

    if total_docs > 0:
        logger.info("[%d] total docs in index [%s]. Extracting [%s] docs.", number_of_docs, index, total_docs)
        docs_path = get_doc_outpath(output_path, index)
        dump_documents(client, index, get_doc_outpath(output_path, index, "-1k"), min(total_docs, 1000), " for test mode")
        logger.info(
            "[%d] total docs in index [%s]. Extracting [%s] docs.",
            number_of_docs,
            index,
            total_docs,
        )
        docs_path = get_doc_outpath(output_path)
        dump_documents(
            client,
            index,
            get_doc_outpath(output_path, "-1k"),
            min(total_docs, 1000),
            " for test mode",
        )
        dump_documents(client, index, docs_path, total_docs)
        return template_vars(index, docs_path, total_docs)
    else:
        logger.info("Skipping corpus extraction fo index [%s] as it contains no documents.", index)
        logger.info(
            "Skipping corpus extraction fo index [%s] as it contains no documents.",
            index,
        )
        return None


@@ -94,12 +112,21 @@ def dump_documents(client, index, out_path, number_of_docs, progress_message_suf
    for n, doc in enumerate(helpers.scan(client, query=query, index=index)):
        if n >= number_of_docs:
            break
        data = (json.dumps(doc["_source"], separators=(",", ":")) + "\n").encode("utf-8")
        data = (
            json.dumps(doc["_source"], separators=(",", ":")) + "\n"
        ).encode("utf-8")

        outfile.write(data)
        comp_outfile.write(compressor.compress(data))

        render_progress(progress, progress_message_suffix, index, n + 1, number_of_docs, freq)
        render_progress(
            progress,
            progress_message_suffix,
            index,
            n + 1,
            number_of_docs,
            freq,
        )

    comp_outfile.write(compressor.flush())
    progress.finish()
@@ -13,7 +13,7 @@
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an

@@ -26,14 +26,16 @@
import logging
import os

INDEX_SETTINGS_EPHEMERAL_KEYS = ["uuid",
                                 "creation_date",
                                 "version",
                                 "provided_name",
                                 "store"]
INDEX_SETTINGS_EPHEMERAL_KEYS = [
    "uuid",
    "creation_date",
    "version",
    "provided_name",
    "store",
]
INDEX_SETTINGS_PARAMETERS = {
    "number_of_replicas": "{{{{number_of_replicas | default({orig})}}}}",
    "number_of_shards": "{{{{number_of_shards | default({orig})}}}}"
    "number_of_shards": "{{{{number_of_shards | default({orig})}}}}",
}


@@ -81,13 +83,13 @@ def extract_index_mapping_and_settings(client, index_pattern):
    valid, reason = is_valid(index)
    if valid:
        mappings = details["mappings"]
        index_settings = filter_ephemeral_index_settings(details["settings"]["index"])
        index_settings = filter_ephemeral_index_settings(
            details["settings"]["index"]
        )
        update_index_setting_parameters(index_settings)
        results[index] = {
            "mappings": mappings,
            "settings": {
                "index": index_settings
            }
            "settings": {"index": index_settings},
        }
    else:
        logger.info("Skipping index [%s] (reason: %s).", index, reason)

@@ -107,14 +109,16 @@ def extract(client, outdir, index_pattern):

    index_obj = extract_index_mapping_and_settings(client, index_pattern)
    for index, details in index_obj.items():
        filename = f"{index}.json"
        filename = f"index.json"
Review comment: Why did you simplify this to just be named as
        outpath = os.path.join(outdir, filename)
        with open(outpath, "w") as outfile:
            json.dump(details, outfile, indent=4, sort_keys=True)
            outfile.write("\n")
        results.append({
            "name": index,
            "path": outpath,
            "filename": filename,
        })
        results.append(
            {
                "name": index,
                "path": outpath,
                "filename": filename,
            }
        )
    return results
Review comment: The operations file includes extraneous fields for each operation. Ingest operations should include, at minimum, the name, operation-type, bulk-size, and ingest-percentage fields. Each search operation should just include the name, operation-type, index, and body fields. Fields like search_clients or bulk_indexing_clients belong in the test_procedures file. For reference, see the NYC_Taxis workload's operations file: https://github.com/opensearch-project/opensearch-benchmark-workloads/blob/main/nyc_taxis/operations/default.json
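As a rough illustration of that last point (a sketch, not code from this PR; the task shape follows the nyc_taxis workload linked above, and the parameter names are reused from this PR's templates), the schedule entries in the test_procedures file would reference each operation by name and carry the client counts:

{
  "operation": "index-append",
  "clients": {{bulk_indexing_clients | default(8)}}
},
{
  "operation": "default",
  "clients": {{search_clients | default(8)}}
}

This keeps the operations file limited to describing what each operation does, while per-run concurrency stays configurable in the test procedure.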