GoogleCloudPlatform · gargnitingoogle · Aug 26, 2024 · Aug 7, 2024 · Aug 12, 2024
@@ -23,7 +23,7 @@
 sys.path.append("../")
 from utils.utils import get_memory, get_cpu, standard_timestamp, is_mash_installed
 
-_LOCAL_LOGS_LOCATION = "../../bin/dlio-logs"
+_LOCAL_LOGS_LOCATION = "../../bin/dlio-logs/logs"
 
 record = {
  "pod_name": "",
@@ -44,7 +44,7 @@
 }
 
 
-def downloadDlioOutputs(dlioWorkloads):
+def downloadDlioOutputs(dlioWorkloads: set, instanceId: str):
  for dlioWorkload in dlioWorkloads:
  print(f"Downloading DLIO logs from the bucket {dlioWorkload.bucket}...")
  result = subprocess.run(
@@ -55,7 +55,7 @@ def downloadDlioOutputs(dlioWorkloads):
  "cp",
  "-r",
  "--no-user-output-enabled", # do not print names of files being copied
- f"gs://{dlioWorkload.bucket}/logs",
+ f"gs://{dlioWorkload.bucket}/logs/{instanceId}",
  _LOCAL_LOGS_LOCATION,
  ],
  capture_output=False,
@@ -92,6 +92,11 @@ def downloadDlioOutputs(dlioWorkloads):
  ),
  required=True,
  )
+ parser.add_argument(
+ "--instance-id",
+ help="unique string ID for current test-run",
+ required=True,
+ )
  args = parser.parse_args()
 
  try:
@@ -102,7 +107,7 @@ def downloadDlioOutputs(dlioWorkloads):
  dlioWorkloads = dlio_workload.ParseTestConfigForDlioWorkloads(
  args.workload_config
  )
- downloadDlioOutputs(dlioWorkloads)
+ downloadDlioOutputs(dlioWorkloads, args.instance_id)
 
  """
  "{num_files_train}-{mean_file_size}-{batch_size}":
@@ -120,7 +125,7 @@ def downloadDlioOutputs(dlioWorkloads):
  if not mash_installed:
  print("Mash is not installed, will skip parsing CPU and memory usage.")
 
- for root, _, files in os.walk(_LOCAL_LOGS_LOCATION):
+ for root, _, files in os.walk(_LOCAL_LOGS_LOCATION + "/" + args.instance_id):
  if files:
  print(f"Parsing directory {root} ...")
  per_epoch_stats_file = root + "/per_epoch_stats.json"
@@ -153,9 +158,9 @@ def downloadDlioOutputs(dlioWorkloads):
 
  if key not in output:
  output[key] = {
- "num_files_train": part_list[2],
- "mean_file_size": part_list[3],
- "batch_size": part_list[4],
+ "num_files_train": part_list[-3],
+ "mean_file_size": part_list[-2],
+ "batch_size": part_list[-1],
  "records": {
  "local-ssd": [],
  "gcsfuse-generic": [],
@@ -167,7 +172,7 @@ def downloadDlioOutputs(dlioWorkloads):
  r = record.copy()
  r["pod_name"] = summary_data["hostname"]
  r["epoch"] = i + 1
- r["scenario"] = "-".join(part_list[5:])
+ r["scenario"] = root.split("/")[-1]
  r["train_au_percentage"] = round(
  summary_data["metric"]["train_au_percentage"][i], 2
  )
@@ -221,7 +226,7 @@ def downloadDlioOutputs(dlioWorkloads):
  " (s),GPU Utilization (%),Throughput (sample/s),Throughput"
  " (MB/s),Throughput over Local SSD (%),GCSFuse Lowest Memory (MB),GCSFuse"
  " Highest Memory (MB),GCSFuse Lowest CPU (core),GCSFuse Highest CPU"
- " (core),Pod,Start,End,GcsfuseMountOptions\n"
+ " (core),Pod,Start,End,GcsfuseMountOptions,InstanceID\n"
  )
 
  for key in output:
@@ -242,19 +247,25 @@ def downloadDlioOutputs(dlioWorkloads):
  ):
  for i in range(len(record_set["records"]["local-ssd"])):
  r = record_set["records"][scenario][i]
- r["throughput_over_local_ssd"] = round(
- r["train_throughput_mb_per_second"]
- / record_set["records"]["local-ssd"][i][
- "train_throughput_mb_per_second"
- ]
- * 100,
- 2,
- )
+ try:
+ r["throughput_over_local_ssd"] = round(
+ r["train_throughput_mb_per_second"]
+ / record_set["records"]["local-ssd"][i][
+ "train_throughput_mb_per_second"
+ ]
+ * 100,
+ 2,
+ )
+ except ZeroDivisionError:
+ print("Got ZeroDivisionError. Ignoring it.")
+ r["throughput_over_local_ssd"] = 0
+ except:
+ raise
  output_file.write(
  f"{record_set['mean_file_size']},{record_set['num_files_train']},{total_size},{record_set['batch_size']},{scenario},"
  )
  output_file.write(
- f"{r['epoch']},{r['duration']},{r['train_au_percentage']},{r['train_throughput_samples_per_second']},{r['train_throughput_mb_per_second']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\"\n"
+ f"{r['epoch']},{r['duration']},{r['train_au_percentage']},{r['train_throughput_samples_per_second']},{r['train_throughput_mb_per_second']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{args.instance_id}\n"
  )
  else:
  for i in range(len(record_set["records"][scenario])):
@@ -264,7 +275,7 @@ def downloadDlioOutputs(dlioWorkloads):
  f"{record_set['mean_file_size']},{record_set['num_files_train']},{total_size},{record_set['batch_size']},{scenario},"
  )
  output_file.write(
- f"{r['epoch']},{r['duration']},{r['train_au_percentage']},{r['train_throughput_samples_per_second']},{r['train_throughput_mb_per_second']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\"\n"
+ f"{r['epoch']},{r['duration']},{r['train_au_percentage']},{r['train_throughput_samples_per_second']},{r['train_throughput_mb_per_second']},{r['throughput_over_local_ssd']},{r['lowest_memory']},{r['highest_memory']},{r['lowest_cpu']},{r['highest_cpu']},{r['pod_name']},{r['start']},{r['end']},\"{r['gcsfuse_mount_options']}\",{args.instance_id}\n"
  )
 
  output_file.close()
@@ -34,21 +34,22 @@ def run_command(command: str):
  print(result.stderr)
 
 
-def createHelmInstallCommands(dlioWorkloads: set) -> list:
- """Create helm install commands for the given dlioWorkload objects."""
+def createHelmInstallCommands(dlioWorkloads: set, instanceId: str):
+ """Create helm install commands for the given set of dlioWorkload objects."""
  helm_commands = []
  for dlioWorkload in dlioWorkloads:
  for batchSize in dlioWorkload.batchSizes:
  commands = [
  (
  'helm install'
- f' {dlioWorkload.bucket}-{batchSize}-{dlioWorkload.scenario} unet3d-loading-test'
+ f' dlio-unet3d-{dlioWorkload.scenario}-{dlioWorkload.numFilesTrain}-{dlioWorkload.recordLength}-{batchSize} unet3d-loading-test'
  ),
  f'--set bucketName={dlioWorkload.bucket}',
  f'--set scenario={dlioWorkload.scenario}',
  f'--set dlio.numFilesTrain={dlioWorkload.numFilesTrain}',
  f'--set dlio.recordLength={dlioWorkload.recordLength}',
  f'--set dlio.batchSize={batchSize}',
+ f'--set instanceId={instanceId}',
  ]
 
  helm_command = ' '.join(commands)
@@ -60,7 +61,9 @@ def main(args) -> None:
  dlioWorkloads = dlio_workload.ParseTestConfigForDlioWorkloads(
  args.workload_config
  )
- helmInstallCommands = createHelmInstallCommands(dlioWorkloads)
+ helmInstallCommands = createHelmInstallCommands(
+ dlioWorkloads, args.instance_id
+ )
  for helmInstallCommand in helmInstallCommands:
  print(f'{helmInstallCommand}')
  if not args.dry_run:
@@ -81,6 +84,11 @@ def main(args) -> None:
  help='Runs DLIO Unet3d tests using this JSON workload configuration.',
  required=True,
  )
+ parser.add_argument(
+ '--instance-id',
+ help='unique string ID for current test-run',
+ required=True,
+ )
  parser.add_argument(
  '-n',
  '--dry-run',

@@ -16,7 +16,7 @@
 apiVersion: v1
 kind: Pod
 metadata:
- name: dlio-tester-{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}-{{ .Values.scenario }}
+ name: dlio-tester-{{ .Values.scenario }}-{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}
  {{- if ne .Values.scenario "local-ssd" }}
  annotations:
  gke-gcsfuse/volumes: "true"
@@ -73,6 +73,8 @@ spec:
  sleep 300
  {{ end }}
 
+ outputDir=/logs/{{ .Values.instanceId }}/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }}
+
  echo "Testing {{ .Values.scenario }}"
  mpirun -np 8 dlio_benchmark workload=unet3d_a100 \
  ++workload.train.epochs=4 \
@@ -84,14 +86,14 @@ spec:
  ++workload.reader.batch_size={{ .Values.dlio.batchSize }} \
  ++workload.dataset.record_length={{ .Values.dlio.recordLength }} \
  ++workload.reader.read_threads={{ .Values.dlio.readThreads }} \
- ++workload.output.folder=/logs/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }}
+ ++workload.output.folder=${outputDir}
 
  # dump the gcsfuse-mount-configuration to a file in output-directory.
  {{ if eq .Values.scenario "gcsfuse-generic"}}
- echo "{{ .Values.gcsfuse.mountOptions }}" > /logs/{{ .Values.dlio.numFilesTrain }}-{{ .Values.dlio.recordLength }}-{{ .Values.dlio.batchSize }}/{{ .Values.scenario }}/gcsfuse_mount_options
+ echo "{{ .Values.gcsfuse.mountOptions }}" > ${outputDir}/gcsfuse_mount_options
  {{ end }}
 
- gsutil -m cp -R /logs gs://{{ .Values.bucketName }}/logs/$(date +"%Y-%m-%d-%H-%M")
+ gsutil -m cp -R /logs/{{ .Values.instanceId }} gs://{{ .Values.bucketName }}/logs/{{ .Values.instanceId }}/$(date +"%Y-%m-%d-%H-%M")
  volumeMounts:
  - name: dshm
  mountPath: /dev/shm

@@ -22,6 +22,7 @@ bucketName: gke-dlio-test-data
 # scenario controls the kind of storage that is used for the load testing. local-ssd means directly on LSSD; gcsfuse-generic means on a gcsfuse mount with gcsfuse.mountOptions sent from the caller; gcsfuse-no-file-cache and gcsfuse-file-cache behave as their names suggest.
 scenario: local-ssd
 nodeType: n2-standard-96
+instanceId: ldap-yyyymmdd-hhmmss
 
 resourceLimits:
  cpu: 0

@@ -16,7 +16,7 @@
 apiVersion: v1
 kind: Pod
 metadata:
- name: fio-tester-{{ .Values.fio.readType }}-{{ lower .Values.fio.fileSize }}-{{ lower .Values.fio.blockSize }}-{{ .Values.scenario }}
+ name: fio-tester-{{ .Values.instanceId }}-{{ .Values.scenario }}-{{ .Values.fio.readType }}-{{ lower .Values.fio.fileSize }}-{{ lower .Values.fio.blockSize }}-{{ .Values.fio.numThreads }}-{{ .Values.fio.filesPerThread }}
  {{- if ne .Values.scenario "local-ssd" }}
  annotations:
  gke-gcsfuse/volumes: "true"
@@ -45,7 +45,12 @@ spec:
  echo "Install dependencies..."
  apt-get update
  apt-get install -y libaio-dev gcc make git time wget
-
+
+ no_of_files_per_thread={{ .Values.fio.filesPerThread }}
+ block_size={{ .Values.fio.blockSize }}
+ file_size={{ .Values.fio.fileSize }}
+ num_of_threads={{ .Values.fio.numThreads }}
+
  {{ if eq .Values.scenario "local-ssd" }}
  echo "Installing gsutil..."
  apt-get update && apt-get install -y apt-transport-https ca-certificates gnupg curl
@@ -105,12 +110,8 @@ spec:
 
  echo "Setup default values..."
  epoch=4
- no_of_files_per_thread={{ .Values.fio.filesPerThread }}
  read_type={{ .Values.fio.readType }}
  pause_in_seconds=20
- block_size={{ .Values.fio.blockSize }}
- file_size={{ .Values.fio.fileSize }}
- num_of_threads={{ .Values.fio.numThreads }}
  workload_dir=/data
 
  # Clean the pagecache, dentries and inode cache before starting the workload.
@@ -125,18 +126,18 @@ spec:
  time ls -R $workload_dir 1> /dev/null
 
  echo "Run fio tests..."
- mkdir -p /data/fio-output/{{ .Values.scenario }}/$read_type
+ output_dir=/data/fio-output/{{ .Values.instanceId }}/${file_size}-{{ lower .Values.fio.blockSize}}-${num_of_threads}-${no_of_files_per_thread}/{{ .Values.scenario }}/$read_type
+ mkdir -p ${output_dir}
 
  # dump the gcsfuse-mount-configuration to a file in output-directory.
  {{ if eq .Values.scenario "gcsfuse-generic" }}
- echo "{{ .Values.gcsfuse.mountOptions }}" > /data/fio-output/{{ .Values.scenario }}/$read_type/gcsfuse_mount_options
+ echo "{{ .Values.gcsfuse.mountOptions }}" > ${output_dir}/gcsfuse_mount_options
  {{ end }}
 
  for i in $(seq $epoch); do
-
  echo "[Epoch ${i}] start time:" `date +%s`
  free -mh # Memory usage before workload start.
- NUMJOBS=$num_of_threads NRFILES=$no_of_files_per_thread FILE_SIZE=$file_size BLOCK_SIZE=$block_size READ_TYPE=$read_type DIR=$workload_dir fio ${filename} --alloc-size=1048576 --output-format=json --output="/data/fio-output/{{ .Values.scenario }}/${read_type}/epoch${i}.json"
+ NUMJOBS=$num_of_threads NRFILES=$no_of_files_per_thread FILE_SIZE=$file_size BLOCK_SIZE=$block_size READ_TYPE=$read_type DIR=$workload_dir fio ${filename} --alloc-size=1048576 --output-format=json --output="${output_dir}/epoch${i}.json"
  free -mh # Memory usage after workload completion.
  echo "[Epoch ${i}] end time:" `date +%s`
 
@@ -154,7 +155,7 @@ spec:
  done
 
  {{ if eq .Values.scenario "local-ssd" }}
- gsutil -m cp -R /data/fio-output/local-ssd gs://{{ .Values.bucketName }}/fio-output
+ gsutil -m cp -R /data/fio-output/{{ .Values.instanceId }}/* gs://{{ .Values.bucketName }}/fio-output/{{ .Values.instanceId }}/
  {{ end }}
 
  echo "fio job completed!"

@@ -22,6 +22,7 @@ bucketName: gke-dlio-test-data
 # scenario controls the kind of storage that is used for the load testing. local-ssd means directly on LSSD; gcsfuse-generic means on a gcsfuse mount with gcsfuse.mountOptions sent from the caller; gcsfuse-no-file-cache and gcsfuse-file-cache behave as their names suggest.
 scenario: local-ssd
 nodeType: n2-standard-96
+instanceId: ldap-yyyymmdd-hhmmss
 
 resourceLimits:
  cpu: 0