Showing 7 changed files with 699 additions and 551 deletions.
`Project.toml` (new file):

```toml
[deps]
Augmentor = "02898b10-1f73-11ea-317c-6393d7073e15"
Boltz = "4544d5e4-abc5-4dea-817f-29e4c205d9c8"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Configurations = "5218b696-f38b-4ac9-8b61-a12ec717816d"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
FluxMPI = "acf642fa-ee0e-513a-b9d5-bcd150f7ec3b"
Formatting = "59287772-0a20-5a39-b81b-1366585eb4c0"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
Images = "916415d5-f1e6-5110-898d-aaa5f9f070e0"
JLSO = "9da8a3cd-07a3-59c0-a743-3fdc52c30d11"
JpegTurbo = "b835a17e-a41a-41e7-81f0-2f016b05efe0"
Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
SimpleConfig = "f2d95530-262a-480f-aff0-1c0431e662a7"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[compat]
Augmentor = "0.6"
Boltz = "0.1"
CUDA = "3"
Configurations = "0.17"
FLoops = "0.2"
FluxMPI = "0.6"
Formatting = "0.4"
Functors = "0.2, 0.3"
Images = "0.24, 0.25"
JLSO = "2"
JpegTurbo = "0.1"
Lux = "0.4"
MLUtils = "0.2.10"
NNlib = "0.8"
OneHotArrays = "0.1"
Optimisers = "0.2"
Setfield = "0.8.2"
SimpleConfig = "0.1"
Zygote = "0.6"
```
`README.md`:
# ImageNet Training using Lux

This implements training of popular model architectures, such as ResNet, AlexNet, and VGG,
on the ImageNet dataset.

## Requirements

* Install [Julia](https://julialang.org/)
* In the Julia REPL, instantiate the `Project.toml` in the parent directory (a sketch of
  this step follows this list)
* Download the ImageNet dataset from http://www.image-net.org/
  - Then, move and extract the training and validation images into labeled subfolders,
    using [this shell script](https://github.com/pytorch/examples/blob/main/imagenet/extract_ILSVRC.sh)
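For concreteness, instantiating the example's environment might look like the following
(a minimal sketch; the `examples/ImageNet` path assumes you are running from the Lux.jl
repository root):

```julia
using Pkg

# Activate the example's project environment and install the pinned dependencies.
Pkg.activate("examples/ImageNet")
Pkg.instantiate()
```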
## Training

To train a model, run `main.jl` with the necessary parameters. See the
[Boltz documentation](http://lux.csail.mit.edu/stable/lib/Boltz/) for the model
configuration.

```bash
julia --project=examples/ImageNet -t 4 examples/ImageNet/main.jl \
    --cfg.dataset.data_root=/home/avik-pal/data/ImageNet/ \
    --cfg.dataset.train_batchsize=256 --cfg.dataset.eval_batchsize=256 \
    --cfg.optimizer.learning_rate=0.5

julia --project=examples/ImageNet -t 4 examples/ImageNet/main.jl \
    --cfg.model.name=alexnet --cfg.model.arch=alexnet \
    --cfg.dataset.data_root=/home/avik-pal/data/ImageNet/ \
    --cfg.dataset.train_batchsize=256 --cfg.dataset.eval_batchsize=256 \
    --cfg.optimizer.learning_rate=0.01
```
## Distributed Data Parallel Training

Set up [MPI.jl](https://juliaparallel.org/MPI.jl/stable/usage/#CUDA-aware-MPI-support),
preferably with the system MPI. Set `FLUXMPI_DISABLE_CUDAMPI_SUPPORT=true` to disable
communication via CuArrays (note that this might lead to a very high communication
bottleneck).

!!! tip "Learning Rate"

    Remember to linearly scale the learning rate based on the number of processes you are
    using (see the sketch after the command below).

!!! note

    If using CUDA-aware MPI, you need to disable the default CUDA allocator with
    `export JULIA_CUDA_MEMORY_POOL=none`. This might slow down your code slightly, but it
    will prevent the sudden segfaults that occur without this setting.

```bash
mpiexecjl -np 4 julia --project=examples/ImageNet -t 4 examples/ImageNet/main.jl \
    --cfg.dataset.data_root=/home/avik-pal/data/ImageNet/ \
    --cfg.dataset.train_batchsize=256 --cfg.dataset.eval_batchsize=256 \
    --cfg.optimizer.learning_rate=0.5
```
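As an illustration of the linear-scaling rule, the per-process learning rate might be
computed as follows (a minimal sketch, assuming `FluxMPI.Init()` has been called and that
`total_workers()` is available, as provided by FluxMPI.jl):

```julia
using FluxMPI

FluxMPI.Init()

base_lr = 0.1f0
# Linear scaling rule: multiply the single-process learning rate by the
# number of participating processes.
scaled_lr = base_lr * total_workers()
```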
## Usage

```bash
usage: main.jl [--cfg.seed CFG.SEED] [--cfg.model.name CFG.MODEL.NAME]
               [--cfg.model.arch CFG.MODEL.ARCH]
               [--cfg.model.pretrained CFG.MODEL.PRETRAINED]
               [--cfg.optimizer.name CFG.OPTIMIZER.NAME]
               [--cfg.optimizer.learning_rate CFG.OPTIMIZER.LEARNING_RATE]
               [--cfg.optimizer.nesterov CFG.OPTIMIZER.NESTEROV]
               [--cfg.optimizer.momentum CFG.OPTIMIZER.MOMENTUM]
               [--cfg.optimizer.weight_decay CFG.OPTIMIZER.WEIGHT_DECAY]
               [--cfg.optimizer.scheduler.name CFG.OPTIMIZER.SCHEDULER.NAME]
               [--cfg.optimizer.scheduler.cycle_length CFG.OPTIMIZER.SCHEDULER.CYCLE_LENGTH]
               [--cfg.optimizer.scheduler.damp_factor CFG.OPTIMIZER.SCHEDULER.DAMP_FACTOR]
               [--cfg.optimizer.scheduler.lr_step CFG.OPTIMIZER.SCHEDULER.LR_STEP]
               [--cfg.optimizer.scheduler.lr_step_decay CFG.OPTIMIZER.SCHEDULER.LR_STEP_DECAY]
               [--cfg.train.total_steps CFG.TRAIN.TOTAL_STEPS]
               [--cfg.train.evaluate_every CFG.TRAIN.EVALUATE_EVERY]
               [--cfg.train.resume CFG.TRAIN.RESUME]
               [--cfg.train.evaluate CFG.TRAIN.EVALUATE]
               [--cfg.train.checkpoint_dir CFG.TRAIN.CHECKPOINT_DIR]
               [--cfg.train.log_dir CFG.TRAIN.LOG_DIR]
               [--cfg.train.expt_subdir CFG.TRAIN.EXPT_SUBDIR]
               [--cfg.train.expt_id CFG.TRAIN.EXPT_ID]
               [--cfg.train.print_frequency CFG.TRAIN.PRINT_FREQUENCY]
               [--cfg.dataset.data_root CFG.DATASET.DATA_ROOT]
               [--cfg.dataset.eval_batchsize CFG.DATASET.EVAL_BATCHSIZE]
               [--cfg.dataset.train_batchsize CFG.DATASET.TRAIN_BATCHSIZE]
               [-h]

optional arguments:
  --cfg.seed CFG.SEED   (type: Int64, default: 12345)
  --cfg.model.name CFG.MODEL.NAME
                        (default: "resnet")
  --cfg.model.arch CFG.MODEL.ARCH
                        (default: "resnet18")
  --cfg.model.pretrained CFG.MODEL.PRETRAINED
                        (type: Bool, default: false)
  --cfg.optimizer.name CFG.OPTIMIZER.NAME
                        (default: "adam")
  --cfg.optimizer.learning_rate CFG.OPTIMIZER.LEARNING_RATE
                        (type: Float32, default: 0.01)
  --cfg.optimizer.nesterov CFG.OPTIMIZER.NESTEROV
                        (type: Bool, default: false)
  --cfg.optimizer.momentum CFG.OPTIMIZER.MOMENTUM
                        (type: Float32, default: 0.0)
  --cfg.optimizer.weight_decay CFG.OPTIMIZER.WEIGHT_DECAY
                        (type: Float32, default: 0.0)
  --cfg.optimizer.scheduler.name CFG.OPTIMIZER.SCHEDULER.NAME
                        (default: "step")
  --cfg.optimizer.scheduler.cycle_length CFG.OPTIMIZER.SCHEDULER.CYCLE_LENGTH
                        (type: Int64, default: 50000)
  --cfg.optimizer.scheduler.damp_factor CFG.OPTIMIZER.SCHEDULER.DAMP_FACTOR
                        (type: Float32, default: 1.2)
  --cfg.optimizer.scheduler.lr_step CFG.OPTIMIZER.SCHEDULER.LR_STEP
                        (type: Vector{Int64}, default: [100000, 250000, 500000])
  --cfg.optimizer.scheduler.lr_step_decay CFG.OPTIMIZER.SCHEDULER.LR_STEP_DECAY
                        (type: Float32, default: 0.1)
  --cfg.train.total_steps CFG.TRAIN.TOTAL_STEPS
                        (type: Int64, default: 800000)
  --cfg.train.evaluate_every CFG.TRAIN.EVALUATE_EVERY
                        (type: Int64, default: 10000)
  --cfg.train.resume CFG.TRAIN.RESUME
                        (default: "")
  --cfg.train.evaluate CFG.TRAIN.EVALUATE
                        (type: Bool, default: false)
  --cfg.train.checkpoint_dir CFG.TRAIN.CHECKPOINT_DIR
                        (default: "checkpoints")
  --cfg.train.log_dir CFG.TRAIN.LOG_DIR
                        (default: "logs")
  --cfg.train.expt_subdir CFG.TRAIN.EXPT_SUBDIR
                        (default: "")
  --cfg.train.expt_id CFG.TRAIN.EXPT_ID
                        (default: "")
  --cfg.train.print_frequency CFG.TRAIN.PRINT_FREQUENCY
                        (type: Int64, default: 100)
  --cfg.dataset.data_root CFG.DATASET.DATA_ROOT
                        (default: "")
  --cfg.dataset.eval_batchsize CFG.DATASET.EVAL_BATCHSIZE
                        (type: Int64, default: 64)
  --cfg.dataset.train_batchsize CFG.DATASET.TRAIN_BATCHSIZE
                        (type: Int64, default: 64)
  -h, --help            show this help message and exit
```
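The `step` scheduler options above suggest a piecewise-constant decay driven by the
`lr_step` milestones. Here is a minimal sketch of such a step-decay rule (an illustration
of the listed defaults, not necessarily the example's actual implementation; `step_lr` is
a hypothetical helper):

```julia
# Step decay: multiply the base LR by lr_step_decay once per milestone passed.
function step_lr(base_lr::Float32, step::Int;
                 lr_step::Vector{Int}=[100000, 250000, 500000],
                 lr_step_decay::Float32=0.1f0)
    return base_lr * lr_step_decay^count(<=(step), lr_step)
end

step_lr(0.01f0, 150_000)  # one milestone passed -> ≈ 0.001f0
```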
Configuration options (new file in this commit; the nested sections are defined with
Configurations.jl's `@option` macro):

```julia
@option struct ModelConfig
    name::String = "resnet"
    arch::String = "resnet18"
    pretrained::Bool = false
end

@option struct SchedulerConfig
    name::String = "step"
    cycle_length::Int = 50000
    damp_factor::Float32 = 1.2f0
    lr_step::Vector{Int64} = [100000, 250000, 500000]
    lr_step_decay::Float32 = 0.1f0
end

@option struct OptimizerConfig
    name::String = "adam"
    learning_rate::Float32 = 0.01f0
    nesterov::Bool = false
    momentum::Float32 = 0.0f0
    weight_decay::Float32 = 0.0f0
    scheduler::SchedulerConfig = SchedulerConfig()
end

@option struct TrainConfig
    total_steps::Int = 800000
    evaluate_every::Int = 10000
    resume::String = ""
    evaluate::Bool = false
    checkpoint_dir::String = "checkpoints"
    log_dir::String = "logs"
    expt_subdir::String = ""
    expt_id::String = ""
    print_frequency::Int = 100
end

@option struct DatasetConfig
    data_root::String = ""
    eval_batchsize::Int = 64
    train_batchsize::Int = 64
end

@option struct ExperimentConfig
    seed::Int = 12345
    model::ModelConfig = ModelConfig()
    optimizer::OptimizerConfig = OptimizerConfig()
    train::TrainConfig = TrainConfig()
    dataset::DatasetConfig = DatasetConfig()
end
```
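To illustrate how these nested `@option` structs compose, here is a minimal sketch of
building a configuration programmatically and overriding nested fields with Setfield.jl,
assuming the definitions above are in scope (both Configurations and Setfield are in the
example's dependencies; how `main.jl` maps the `--cfg.*` flags onto this struct,
presumably via SimpleConfig, is not shown):

```julia
using Configurations, Setfield

# All fields have defaults, so the full nested config can be built directly.
cfg = ExperimentConfig()

# @option structs are immutable; Setfield updates nested fields functionally.
cfg = @set cfg.optimizer.learning_rate = 0.5f0
cfg = @set cfg.dataset.train_batchsize = 256

@assert cfg.optimizer.learning_rate == 0.5f0
```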