Skip to content

Commit

Permalink
For X86 and AMD machines, we can create a pip-based dlio installation (
Browse files Browse the repository at this point in the history
#66)

* added basic build script.

* refactored code and build setup

* refactored code build of dlio

* fixed pytest

* fixed pytest

* fixed installation script.

* Fixed comments and strings to use correct paths.

* Fixes for installation of dlio

1. Fixed setup to install post processor.
2. Fixed CI to not set PYTHONPATH

* fixed test dependencies

* fixed ci scripts to use executable created from the installer

* fixed test dependencies

* fixed path of config

* Fixed Readme to use new installation methodology

* fixed ppc env

* fixed ppc env

* fixed application

* Refactored code for better build

1. created setup.py
2. fix root directory to dlio_benchmark
3. renamed dlio_benchmark.py to main.py
4. renamed dlio_postprocessor.py to postprocessor.py
5. fixed documentation to use dlio_benchmark and dlio_postprocessor entry points.
  • Loading branch information
hariharan-devarajan authored Jun 20, 2023
1 parent fabdfd7 commit 9f5a8db
Show file tree
Hide file tree
Showing 85 changed files with 338 additions and 342 deletions.
42 changes: 9 additions & 33 deletions .github/workflows/python-package-conda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest ]
os: [ ubuntu-20.04 ]
profiler: [ DEFAULT, DLIO_PROFILER ]
gcc: [10]
name: ${{ matrix.os }}-${{ matrix.profiler }}-${{ matrix.gcc }}
Expand All @@ -35,23 +35,19 @@ jobs:
sudo apt-get install $CC $CXX libc6
sudo apt-get install mpich
python -m pip install --upgrade pip
pip install --upgrade --upgrade-strategy eager -r dev-requirements.txt
pip install .[test]
if [[ $DLIO_PROFILER == 'DLIO_PROFILER' ]]; then
sudo apt-get install libhwloc-dev
git clone https://github.com/hariharan-devarajan/dlio-profiler /home/runner/work/dlio_profiler
cd /home/runner/work/dlio_profiler
git submodule update --init --recursive
pushd external/GOTCHA
git apply ../gotcha_glibc_workaround.patch
popd
mkdir build
cd build
cmake ../
sudo make install -j
fi
- name: test_gen_data
run: |
touch __init__.py
export PYTHONPATH=./:$PYTHONPATH
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
RDMAV_FORK_SAFE=1 mpirun -np 2 pytest -k test_gen_data[png-tensorflow] -v
RDMAV_FORK_SAFE=1 mpirun -np 2 pytest -k test_gen_data[npz-tensorflow] -v
Expand All @@ -60,14 +56,10 @@ jobs:
RDMAV_FORK_SAFE=1 mpirun -np 2 pytest -k test_gen_data[hdf5-tensorflow] -v
- name: test_custom_storage_root_gen_data
run: |
touch __init__.py
export PYTHONPATH=./:$PYTHONPATH
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
RDMAV_FORK_SAFE=1 mpirun -np 2 pytest -k test_storage_root_gen_data -v
- name: test_train
run: |
touch __init__.py
export PYTHONPATH=./:$PYTHONPATH
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
RDMAV_FORK_SAFE=1 mpirun -np 2 pytest -k test_train[png-tensorflow-tensorflow]
RDMAV_FORK_SAFE=1 mpirun -np 2 pytest -k test_train[npz-tensorflow-tensorflow]
Expand All @@ -92,52 +84,36 @@ jobs:
RDMAV_FORK_SAFE=1 mpirun -np 2 pytest -k test_train[csv-pytorch-dali]
- name: test_custom_storage_root_train
run: |
touch __init__.py
export PYTHONPATH=./:$PYTHONPATH
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
RDMAV_FORK_SAFE=1 mpirun -np 2 pytest -k test_custom_storage_root_train -v
- name: test_checkpoint_epoch
run: |
touch __init__.py
export PYTHONPATH=./:$PYTHONPATH
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
RDMAV_FORK_SAFE=1 mpirun -np 2 pytest -k test_checkpoint_epoch -v
- name: test_checkpoint_step
run: |
touch __init__.py
export PYTHONPATH=./:$PYTHONPATH
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
RDMAV_FORK_SAFE=1 mpirun -np 2 pytest -k test_checkpoint_step -v
- name: test_eval
run: |
touch __init__.py
export PYTHONPATH=./:$PYTHONPATH
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
RDMAV_FORK_SAFE=1 mpirun -np 2 pytest -k test_eval -v
- name: test_multi_threads
run: |
touch __init__.py
export PYTHONPATH=./:$PYTHONPATH
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
RDMAV_FORK_SAFE=1 mpirun -np 2 pytest -k test_multi_threads -v
- name: test-tf-loader-tfrecord
run: |
touch __init__.py
export PYTHONPATH=./:$PYTHONPATH
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
RDMAV_FORK_SAFE=1 mpirun -np 2 python ./src/dlio_benchmark.py workload=resnet50 ++workload.dataset.num_files_train=64 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_samples_per_file=16
RDMAV_FORK_SAFE=1 mpirun -np 2 python ./src/dlio_benchmark.py workload=resnet50 ++workload.dataset.num_files_train=64 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_samples_per_file=16
RDMAV_FORK_SAFE=1 mpirun -np 2 dlio_benchmark workload=resnet50 ++workload.dataset.num_files_train=64 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_samples_per_file=16
RDMAV_FORK_SAFE=1 mpirun -np 2 dlio_benchmark workload=resnet50 ++workload.dataset.num_files_train=64 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_samples_per_file=16
- name: test-torch-loader-npz
run: |
touch __init__.py
export PYTHONPATH=./:$PYTHONPATH
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
RDMAV_FORK_SAFE=1 mpirun -np 2 python ./src/dlio_benchmark.py workload=unet3d ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
RDMAV_FORK_SAFE=1 mpirun -np 2 python ./src/dlio_benchmark.py workload=unet3d ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
RDMAV_FORK_SAFE=1 mpirun -np 2 dlio_benchmark workload=unet3d ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
RDMAV_FORK_SAFE=1 mpirun -np 2 dlio_benchmark workload=unet3d ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
- name: test-tf-loader-npz
run: |
touch __init__.py
export PYTHONPATH=./:$PYTHONPATH
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib
RDMAV_FORK_SAFE=1 mpirun -np 2 python ./src/dlio_benchmark.py workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
RDMAV_FORK_SAFE=1 mpirun -np 2 python ./src/dlio_benchmark.py workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
RDMAV_FORK_SAFE=1 mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=False ++workload.workflow.generate_data=True ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
RDMAV_FORK_SAFE=1 mpirun -np 2 dlio_benchmark workload=unet3d ++workload.framework=tensorflow ++workload.data_reader.data_loader=tensorflow ++workload.train.computation_time=0.05 ++workload.evaluation.eval_time=0.01 ++workload.train.epochs=2 ++workload.workflow.train=True ++workload.workflow.generate_data=False ++workload.dataset.num_files_train=16 ++workload.dataset.num_files_eval=16 ++workload.reader.read_threads=2 ++workload.dataset.record_length=4096 ++workload.dataset.record_length_stdev=0
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -156,3 +156,4 @@ dmypy.json
#Apple system files
.DS_Store
/.idea/
/venv-quartz/
4 changes: 1 addition & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,4 @@ RUN apt-get update && \
apt-get install -y mpich

RUN python -m pip install --upgrade pip
RUN pip install -r requirements.txt

ENV PYTHONPATH="${PYTHONPATH}:/workspace/dlio"
RUN pip install .
2 changes: 2 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
include requirements.txt
recursive-include configs *
25 changes: 9 additions & 16 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@ DLIO is an I/O benchmark for Deep Learning. DLIO is aimed at emulating the I/O b
```bash
git clone https://github.com/argonne-lcf/dlio_benchmark
cd dlio_benchmark/
pip install -r requirements.txt
export PYTHONPATH=$PWD/:$PYTHONPATH
python ./src/dlio_benchmark.py ++workload.workflow.generate_data=True
pip install .
dlio_benchmark ++workload.workflow.generate_data=True
```
Additionally, to generate the report `iostat` is needed and can be installed from the `sysstat` package using your package manager.

Expand All @@ -25,37 +24,31 @@ Additionally, to generate the report `iostat` is needed and can be installed fro
git clone https://github.com/argonne-lcf/dlio_benchmark
cd dlio_benchmark/
docker build -t dlio .
docker run -t dlio python ./src/dlio_benchmark.py ++workload.workflow.generate_data=True
docker run -t dlio dlio_benchmark ++workload.workflow.generate_data=True
```

You can also pull a prebuilt container from Docker Hub (it might not reflect the most recent changes to the code):
```bash
docker pull docker.io/zhenghh04/dlio:latest
docker run -t docker.io/zhenghh04/dlio:latest python ./src/dlio_benchmark.py ++workload.workflow.generate_data=True
docker run -t docker.io/zhenghh04/dlio:latest python ./dlio_benchmark/main.py ++workload.workflow.generate_data=True
```

One can also run interactively inside the container
```bash
docker run -t docker.io/zhenghh04/dlio:latest /bin/bash
root@30358dd47935:/workspace/dlio$ python ./src/dlio_benchmark.py ++workload.workflow.generate_data=True
root@30358dd47935:/workspace/dlio$ python ./dlio_benchmark/main.py ++workload.workflow.generate_data=True
```

## PowerPC
PowerPC requires installation through anaconda.
```bash
# Setup required channels
conda config --prepend channels conda-forge
conda config --prepend channels https://public.dhe.ibm.com/ibmdl/export/pub/software/server/ibm-ai/conda/

# create and activate environment
conda env create --prefix ./dlio_env_ppc --file environment-ppc.yaml --force
conda activate ./dlio_env_ppc

# install hydra dependency by source
mkdir external
cd external
git clone git@github.com:facebookresearch/hydra.git
cd hydra
# install other dependencies
python -m pip install .
```

Expand All @@ -72,20 +65,20 @@ One can specify the workload through the ```workload=``` option on the command l

First, generate the data
```bash
mpirun -np 8 python3 src/dlio_benchmark.py workload=unet3d ++workload.workflow.generate_data=True ++workload.workflow.train=False
mpirun -np 8 dlio_benchmark workload=unet3d ++workload.workflow.generate_data=True ++workload.workflow.train=False
```
If possible, one can flush the filesystem caches in order to properly capture device I/O
```bash
sudo sync && echo 3 | sudo tee /proc/sys/vm/drop_caches
```
Finally, run the benchmark with ```iostat``` profiling, listing the io devices you would like to trace.
```bash
mpirun -np 8 python3 src/dlio_benchmark.py workload=unet3d ++workload.workflow.profiling=True ++workload.profiling.profiler=iostat ++workload.profiling.iostat_devices=[sda,sdb]
mpirun -np 8 dlio_benchmark workload=unet3d ++workload.workflow.profiling=True ++workload.profiling.profiler=iostat ++workload.profiling.iostat_devices=[sda,sdb]
```

All the outputs will be stored in the ```hydra_log/unet3d/$DATE-$TIME``` folder. To post-process the data, one can run
```bash
python3 src/dlio_postprocessor.py --output-folder hydra_log/unet3d/$DATE-$TIME
dlio_postprocessor --output-folder hydra_log/unet3d/$DATE-$TIME
```
This will generate ```DLIO_$model_report.txt``` in the output folder.

Expand Down
1 change: 1 addition & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,4 @@ pytest-mpi
pytest-subtests
pytest-timeout
nvidia-dali-cuda110
psutil
File renamed without changes.
Empty file.
File renamed without changes.
Empty file.
File renamed without changes.
File renamed without changes.
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
'''


from src.computation.computation_handler import ComputationHandler
from dlio_benchmark.computation.computation_handler import ComputationHandler


class AsyncComputation(ComputationHandler):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
limitations under the License.
'''

from src.common.enumerations import ComputationType
from src.common.error_code import ErrorCodes
from src.computation.asynchronous_computation import AsyncComputation
from src.computation.no_computation import NoComputation
from src.computation.synchronous_computation import SyncComputation
from dlio_benchmark.common.enumerations import ComputationType
from dlio_benchmark.common.error_code import ErrorCodes
from dlio_benchmark.computation.asynchronous_computation import AsyncComputation
from dlio_benchmark.computation.no_computation import NoComputation
from dlio_benchmark.computation.synchronous_computation import SyncComputation


class ComputationFactory(object):
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
limitations under the License.
"""

from src.computation.computation_handler import ComputationHandler
from dlio_benchmark.computation.computation_handler import ComputationHandler


class NoComputation(ComputationHandler):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
limitations under the License.
"""

from src.computation.computation_handler import ComputationHandler
from dlio_benchmark.computation.computation_handler import ComputationHandler


class SyncComputation(ComputationHandler):
Expand Down
Empty file.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@ template: |-
DLIO - an IO benchmark for deep learning applications.
Running the benchmark: python src/dlio_benchmark.py workload=unet3d
Running the benchmark: dlio_benchmark workload=unet3d
One can select the workload configuration using "workload={WORKLOAD}".
The corresponding YAML file is ./configs/workload/{WORKLOAD}.yaml folder.
Available choices for $APP_CONFIG_GROUPS
One can override everything in the command line, for example:
python src/dlio_benchmark.py workload.framework=tensorflow
dlio_benchmark workload.framework=tensorflow
One can also create a custom YAML file for a specific workload.
An example of a YAML file is as follows.
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,16 @@
limitations under the License.
"""

from src.common.enumerations import Compression
from src.data_generator.data_generator import DataGenerator
from dlio_benchmark.common.enumerations import Compression
from dlio_benchmark.data_generator.data_generator import DataGenerator
import math
import os

import numpy as np
import csv

from shutil import copyfile
from src.utils.utility import progress
from dlio_benchmark.utils.utility import progress
import pandas as pd

"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@

from abc import ABC, abstractmethod

from src.utils.config import ConfigArguments
from src.storage.storage_factory import StorageFactory
from dlio_benchmark.utils.config import ConfigArguments
from dlio_benchmark.storage.storage_factory import StorageFactory
import math
from mpi4py import MPI
from shutil import copyfile
import numpy as np
import logging
from src.utils.utility import utcnow, add_padding
from dlio_benchmark.utils.utility import utcnow, add_padding


class DataGenerator(ABC):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
limitations under the License.
"""

from src.common.enumerations import FormatType
from src.common.error_code import ErrorCodes
from dlio_benchmark.common.enumerations import FormatType
from dlio_benchmark.common.error_code import ErrorCodes



Expand All @@ -27,22 +27,22 @@ def __init__(self):
@staticmethod
def get_generator(type):
if type == FormatType.TFRECORD:
from src.data_generator.tf_generator import TFRecordGenerator
from dlio_benchmark.data_generator.tf_generator import TFRecordGenerator
return TFRecordGenerator()
elif type == FormatType.HDF5:
from src.data_generator.hdf5_generator import HDF5Generator
from dlio_benchmark.data_generator.hdf5_generator import HDF5Generator
return HDF5Generator()
elif type == FormatType.CSV:
from src.data_generator.csv_generator import CSVGenerator
from dlio_benchmark.data_generator.csv_generator import CSVGenerator
return CSVGenerator()
elif type == FormatType.NPZ:
from src.data_generator.npz_generator import NPZGenerator
from dlio_benchmark.data_generator.npz_generator import NPZGenerator
return NPZGenerator()
elif type == FormatType.JPEG:
from src.data_generator.jpeg_generator import JPEGGenerator
from dlio_benchmark.data_generator.jpeg_generator import JPEGGenerator
return JPEGGenerator()
elif type == FormatType.PNG:
from src.data_generator.png_generator import PNGGenerator
from dlio_benchmark.data_generator.png_generator import PNGGenerator
return PNGGenerator()
else:
raise Exception(str(ErrorCodes.EC1001))
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@
import h5py
import numpy as np

from src.common.enumerations import Compression
from src.data_generator.data_generator import DataGenerator
from src.utils.utility import progress, Profile
from dlio_benchmark.common.enumerations import Compression
from dlio_benchmark.data_generator.data_generator import DataGenerator
from dlio_benchmark.utils.utility import progress, Profile
from shutil import copyfile

from src.common.constants import MODULE_DATA_GENERATOR
from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR

dlp = Profile(MODULE_DATA_GENERATOR)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,16 @@
limitations under the License.
"""

from src.common.enumerations import Compression
from src.data_generator.data_generator import DataGenerator
from dlio_benchmark.common.enumerations import Compression
from dlio_benchmark.data_generator.data_generator import DataGenerator

import logging
import numpy as np

from src.utils.utility import progress, utcnow, Profile
from dlio_benchmark.utils.utility import progress, utcnow, Profile
from shutil import copyfile
import PIL.Image as im
from src.common.constants import MODULE_DATA_GENERATOR
from dlio_benchmark.common.constants import MODULE_DATA_GENERATOR


dlp = Profile(MODULE_DATA_GENERATOR)
Expand Down
Loading

0 comments on commit 9f5a8db

Please sign in to comment.