-
Notifications
You must be signed in to change notification settings - Fork 29
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Co-authored-by: Jae-Won Chung <jwnchung@umich.edu>
- Loading branch information
1 parent
b662a1f
commit 611084c
Showing
73 changed files
with
6,462 additions
and
161 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,3 +10,4 @@ zeus.egg-info/ | |
.git/ | ||
|
||
**/data/ | ||
**/versions/*.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,4 +11,7 @@ dist/ | |
*.json | ||
**/.DS_Store | ||
.cache/ | ||
.env | ||
env/ | ||
.pytest_cache/ | ||
/envs |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Image used to run database migrations for the Zeus batch size optimizer (BSO).
# It installs Zeus from the build context together with its `migration` extra.
FROM python:3.9

WORKDIR /workspace

# COPY instead of ADD: the build context consists of plain local files, so
# ADD's extra behaviors (archive auto-extraction, URL fetching) are not wanted.
COPY . /workspace

# For sqlite
# RUN pip install --no-cache-dir aiosqlite

# For mysql
# asyncmy is the async MySQL driver named in the default
# `mysql+asyncmy://` database URL.
RUN pip install --no-cache-dir asyncmy
RUN pip install --no-cache-dir '.[migration]'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# Image that serves the Zeus batch size optimizer (BSO) server with uvicorn.
# It installs Zeus from the build context together with its `bso-server` extra.
FROM python:3.9

WORKDIR /workspace

# COPY instead of ADD: the build context consists of plain local files, so
# ADD's extra behaviors (archive auto-extraction, URL fetching) are not wanted.
COPY . /workspace

# For sqlite
# RUN pip install --no-cache-dir aiosqlite

# For mysql
# asyncmy is the async MySQL driver named in the default
# `mysql+asyncmy://` database URL.
RUN pip install --no-cache-dir asyncmy
RUN pip install --no-cache-dir '.[bso-server]'

# Documents the port uvicorn binds below; publishing is still done by the runtime.
EXPOSE 80

CMD ["uvicorn", "zeus.optimizer.batch_size.server.router:app", "--host", "0.0.0.0", "--port", "80"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
# Compose stack for the Zeus batch size optimizer (BSO) server:
#   db        -> MySQL database
#   migration -> one-shot Alembic job that creates/upgrades the schema
#   server    -> uvicorn server, started only after the migration succeeds
# Variables of the form ${ZEUS_BSO_*} are interpolated by Compose
# (e.g. from a `.env` file next to this compose file).
version: '3.9'
name: zeus_bso_server

services:
  server:
    image: bso-server
    build:
      context: ../
      dockerfile: ./docker/bso_server.Dockerfile
    container_name: bso
    restart: always
    environment:
      # Falls back to the bundled MySQL service when ZEUS_BSO_DATABASE_URL is unset.
      ZEUS_BSO_DATABASE_URL: ${ZEUS_BSO_DATABASE_URL-mysql+asyncmy://${ZEUS_BSO_DB_USER}:${ZEUS_BSO_DB_PASSWORD}@db:3306/Zeus}
      ZEUS_BSO_LOG_LEVEL: ${ZEUS_BSO_LOG_LEVEL}
      ZEUS_BSO_ECHO_SQL: ${ZEUS_BSO_ECHO_SQL}
    ports:
      # Map 80 to the container
      - "80:80"
    networks:
      - servernet
    depends_on:
      migration:
        # start running when migration is done.
        condition: service_completed_successfully
    labels:
      # labels for kubernetes
      kompose.service.type: nodeport
      # Pull image only when there is no image locally. Otherwise use that one.
      kompose.image-pull-policy: IfNotPresent
      # set the node port. Should be 30000-32767
      kompose.service.nodeport.port: ${ZEUS_BSO_SERVER_PORT-30100}

  db:
    # Pinned to a major version so an untagged `mysql` (i.e. `latest`) cannot
    # silently move the database to a new major release.
    image: mysql:8
    container_name: db
    restart: always
    environment:
      MYSQL_DATABASE: Zeus
      MYSQL_USER: ${ZEUS_BSO_DB_USER}
      MYSQL_ROOT_PASSWORD: ${ZEUS_BSO_ROOT_PASSWORD}
      MYSQL_PASSWORD: ${ZEUS_BSO_DB_PASSWORD}
    expose:
      # Opens 3306 on the container to server & migration
      - 3306
    volumes:
      - ./mysql_data:/var/lib/mysql
    networks:
      - servernet
    healthcheck:
      test: ["CMD", "mysqladmin", "ping", "-h", "localhost"]
      timeout: 3s
      retries: 10
      start_period: 2s
      start_interval: 1s

  migration:
    image: bso-migration
    build:
      context: ../
      dockerfile: ./docker/bso_migration.Dockerfile
    deploy:
      restart_policy:
        condition: on-failure
        max_attempts: 3
    depends_on:
      db:
        # wait until db is ready to accept connection
        condition: service_healthy
    # Generate revision and upgrade database. Change message of revision as you want
    command: >
      bash -c 'cd /workspace/zeus/optimizer/batch_size && alembic revision --autogenerate -m "Baseline: create tables" && alembic upgrade head'
    environment:
      ZEUS_BSO_DATABASE_URL: ${ZEUS_BSO_DATABASE_URL-mysql+asyncmy://${ZEUS_BSO_DB_USER}:${ZEUS_BSO_DB_PASSWORD}@db:3306/Zeus}
    networks:
      - servernet
    volumes:
      # mount version scripts we generated.
      # NOTE(review): relative host paths resolve from this compose file's
      # directory, not from the build context (../) - confirm this is the
      # intended location for the generated scripts.
      - ./zeus/optimizer/batch_size/migrations/versions:/workspace/zeus/optimizer/batch_size/migrations/versions
    labels:
      kompose.image-pull-policy: IfNotPresent

networks:
  servernet:
    driver: bridge
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,200 @@ | ||
# Batch Size Optimizer in Zeus | ||
|
||
## What is it? | ||
|
||
The batch size optimizer (BSO) chooses the batch size that minimizes the cost, where cost is defined as $cost = \eta \times \text{Energy consumption to accuracy} + (1-\eta) \times \text{Max power}\times \text{Time to accuracy}$. | |
|
||
## How does it work? | ||
|
||
The core of BSO is a multi-armed bandit (MAB) based on **recurrent** training. After each training run, we feed the resulting cost to the MAB, and after a certain number of training runs, the MAB converges to the best batch size. In addition to the MAB, we employ early stopping and pruning to handle stragglers. For more details, refer to the [paper](https://www.usenix.org/conference/nsdi23/presentation/you). | |
|
||
## Should I use this? | ||
|
||
The key of BSO is recurrent training. If you are training your model periodically or repeatedly, BSO can be a great choice to reduce energy or time consumption. | ||
|
||
## Limitations | ||
|
||
We currently don't support heterogeneous GPUs or different configurations. The number of GPUs, GPU models, and other configurations in JobSpec should be identical across recurrent training runs. If you run your training in a different environment each time, then it might not be desirable to use BSO. | |
|
||
## Sequence diagram of BSO | ||
|
||
```mermaid | ||
sequenceDiagram; | ||
participant BSO server | ||
participant BSO client | ||
loop Every recurrent training | ||
BSO client->>BSO server: Register the training job and ask for the batch size | ||
BSO server->>BSO client: Return the next batch size to use with a trial number | ||
loop Every epoch | ||
BSO client->>BSO server: At the end of each epoch, report the result | ||
BSO server->>BSO client: Compute the cost and tell the client if it should stop the training | ||
end | ||
BSO client->>BSO server: Report the end of the trial on exit | ||
end | ||
``` | ||
|
||
## Quick start (Server) | ||
|
||
1. Clone the repository | ||
|
||
```Shell | ||
git clone https://github.com/ml-energy/zeus.git | |
``` | ||
|
||
2. Create `.env` under `/docker`. An example of `.env` is provided below. | ||
|
||
By default, we use MySQL as the database. | |
|
||
```Shell | ||
ZEUS_BSO_DB_USER="me" | ||
ZEUS_BSO_DB_PASSWORD="secret" | ||
ZEUS_BSO_ROOT_PASSWORD="secret*" | ||
ZEUS_BSO_SERVER_PORT=8000 | ||
ZEUS_BSO_LOG_LEVEL="INFO" | ||
ZEUS_BSO_ECHO_SQL="True" | ||
``` | ||
|
||
If you want to use a different database, you need to add `ZEUS_BSO_DATABASE_URL` as an environment variable. See [Remark](#remark-about-the-server) for details. | |
Also, if you are running using docker-compose or Kubernetes, you need to change the image name under `db` in the docker-compose file. | ||
|
||
3. Running a server | ||
|
||
- Using docker-compose | ||
|
||
```Shell | ||
cd docker | ||
docker-compose -f ./docker-compose.yaml up | |
``` | ||
|
||
This will build images for each container: db, migration, and the server. Then, it will spin up those containers. | |
|
||
- Using Kubernetes. | ||
|
||
1. Build an image. | ||
|
||
```Shell | ||
# From the root directory | ||
docker build -f ./docker/bso_server.Dockerfile -t bso-server . | ||
docker build -f ./docker/bso_migration.Dockerfile -t bso-migration . | ||
``` | ||
|
||
2. Create Kubernetes yaml files using Kompose. Kompose is a tool that converts docker-compose files into Kubernetes files. For more information, see [Kompose references](#kompose-references). | |
|
||
```Shell | ||
cd docker | ||
docker-compose config > docker-compose-resolved.yaml && kompose convert -f docker-compose-resolved.yaml -o ./kube/ && rm docker-compose-resolved.yaml | ||
``` | ||
|
||
It first resolves env files using docker-compose, then creates Kubernetes yaml files under `docker/kube/` | ||
|
||
3. Run kubernetes. | ||
|
||
```Shell | ||
cd kube | ||
kubectl apply -f . | ||
``` | ||
|
||
- Using uvicorn. | ||
|
||
If you are using uvicorn to spin up the server, you need to create a database and perform the migration before starting the server. | |
|
||
1. Run the database of your choice. | ||
2. Set the environment variables in `.env` | ||
|
||
```Shell | ||
ZEUS_BSO_DATABASE_URL="mysql+asyncmy://me:secret@localhost:3306/Zeus" | |
ZEUS_BSO_LOG_LEVEL="INFO" | ||
ZEUS_BSO_ECHO_SQL="True" | ||
``` | ||
|
||
3. Run Alembic migration | ||
|
||
1. Install dependencies | ||
|
||
```Bash | ||
pip install '.[migration]' | ||
``` | ||
|
||
2. Create the migration script. This will create scripts under ./versions | ||
|
||
```Bash | ||
alembic revision --autogenerate -m "Baseline: create tables" | ||
``` | ||
|
||
3. Apply migration | ||
1. Online (apply it to database directly) | ||
|
||
```Bash | ||
alembic upgrade head | ||
``` | ||
|
||
2. Offline (generate sql) | ||
|
||
```Bash | ||
alembic upgrade head --sql | ||
``` | ||
|
||
4. Run the server using uvicorn. | ||
|
||
```Shell | ||
cd zeus/optimizer/batch_size/server | ||
uvicorn router:app --reload | ||
``` | ||
|
||
Now the server is good to go! | ||
|
||
### Remark about the server | ||
|
||
The Zeus Batch Size Optimizer server uses SQLAlchemy to support various types of databases. However, you need to install the corresponding async connection driver. | |
By default, we use MySQL. You can add the driver installation command to `bso_migration.Dockerfile` and `bso_server.Dockerfile`. Refer to those files for examples. | |
|
||
## Use BSO in your training script (Client) | ||
|
||
1. Install Zeus package. | ||
|
||
```Shell | ||
pip install zeus-ml[bso] | ||
``` | ||
|
||
2. Add [`BatchSizeOptimizer`][zeus.optimizer.batch_size.client.BatchSizeOptimizer] to your training script. | ||
|
||
```Python | ||
# Initialization | ||
bso = BatchSizeOptimizer( | ||
monitor=monitor, | ||
server_url="http://127.0.0.1:8000", | ||
job=JobParams( | ||
job_id_prefix="mnist-dev", | ||
default_batch_size=256, | ||
batch_sizes=[32, 64, 256, 512, 1024, 4096, 2048], | ||
max_epochs=100 | ||
), | ||
) | ||
# ... other codes | ||
# Get batch size to use from the server | ||
batch_size = bso.get_batch_size() | ||
# ... | ||
# beginning of the train | ||
bso.on_train_begin() | ||
# ... | ||
# After evaluation | ||
bso.on_evaluate(metric) | ||
``` | ||
|
||
### Remark about the client | ||
|
||
Training can fail if | ||
|
||
1. It failed to converge within configured max_epochs | ||
2. It exceeded the early stopping threshold which is configured by `beta_knob` in `JobSpec` | ||
|
||
In that case, the optimizer will raise `ZeusBSOTrainFailError`. This means that the chosen batch size was not useful, and the BSO server will not give this batch size again. However, the user ***should re-launch the job*** so that the BSO server can give another batch size. The server will learn which batch size is useful and will converge to the batch size that causes the least cost as you launch the job multiple times. | ||
|
||
## Kompose references | ||
|
||
Refer to [Kompose](https://kompose.io/) and [Kompose labels](https://github.com/kubernetes/kompose/blob/main/docs/user-guide.md) for more information. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.