Update Imagenet example
avik-pal committed Sep 5, 2022
1 parent 2c9d6e2 commit 7958d72
Showing 7 changed files with 699 additions and 551 deletions.
45 changes: 45 additions & 0 deletions examples/ImageNet/Project.toml
@@ -0,0 +1,45 @@
[deps]
Augmentor = "02898b10-1f73-11ea-317c-6393d7073e15"
Boltz = "4544d5e4-abc5-4dea-817f-29e4c205d9c8"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Configurations = "5218b696-f38b-4ac9-8b61-a12ec717816d"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
FluxMPI = "acf642fa-ee0e-513a-b9d5-bcd150f7ec3b"
Formatting = "59287772-0a20-5a39-b81b-1366585eb4c0"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
Images = "916415d5-f1e6-5110-898d-aaa5f9f070e0"
JLSO = "9da8a3cd-07a3-59c0-a743-3fdc52c30d11"
JpegTurbo = "b835a17e-a41a-41e7-81f0-2f016b05efe0"
Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
OneHotArrays = "0b1bfda6-eb8a-41d2-88d8-f5af5cad476f"
Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
SimpleConfig = "f2d95530-262a-480f-aff0-1c0431e662a7"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[compat]
Augmentor = "0.6"
Boltz = "0.1"
CUDA = "3"
Configurations = "0.17"
FLoops = "0.2"
FluxMPI = "0.6"
Formatting = "0.4"
Functors = "0.2, 0.3"
Images = "0.24, 0.25"
JLSO = "2"
JpegTurbo = "0.1"
Lux = "0.4"
MLUtils = "0.2.10"
NNlib = "0.8"
OneHotArrays = "0.1"
Optimisers = "0.2"
Setfield = "0.8.2"
SimpleConfig = "0.1"
Zygote = "0.6"
168 changes: 114 additions & 54 deletions examples/ImageNet/README.md
@@ -1,82 +1,142 @@
# ImageNet Training using Lux

This implements training of popular model architectures, such as ResNet, AlexNet, and VGG on
the ImageNet dataset.

## Requirements

* Install [julia](https://julialang.org/)
* In the Julia REPL, instantiate the `Project.toml` in this example's directory (see the
  snippet after this list)
* Download the ImageNet dataset from http://www.image-net.org/
  - Then, move and extract the training and validation images to labeled subfolders, using
    [this shell script](https://github.com/pytorch/examples/blob/main/imagenet/extract_ILSVRC.sh)
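
A minimal sketch of the instantiation step, assuming your working directory is the repository
root (adjust the path for your checkout):

```julia
using Pkg

# Activate the example's environment (added in this commit) and install its pinned dependencies.
Pkg.activate("examples/ImageNet")
Pkg.instantiate()
```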

## Training

To train a model, run `main.jl` with the necessary parameters. See the
[Boltz documentation](http://lux.csail.mit.edu/stable/lib/Boltz/) for the available model
configurations.

```bash
julia --project=examples/ImageNet -t 4 examples/ImageNet/main.jl\
    --cfg.dataset.data_root=/home/avik-pal/data/ImageNet/\
    --cfg.dataset.train_batchsize=256 --cfg.dataset.eval_batchsize=256\
    --cfg.optimizer.learning_rate=0.5

julia --project=examples/ImageNet -t 4 examples/ImageNet/main.jl\
    --cfg.model.name=alexnet --cfg.model.arch=alexnet\
    --cfg.dataset.data_root=/home/avik-pal/data/ImageNet/\
    --cfg.dataset.train_batchsize=256 --cfg.dataset.eval_batchsize=256\
    --cfg.optimizer.learning_rate=0.01
```

## Distributed Data Parallel Training

Set up [MPI.jl](https://juliaparallel.org/MPI.jl/stable/usage/#CUDA-aware-MPI-support),
preferably with the system MPI. Set `FLUXMPI_DISABLE_CUDAMPI_SUPPORT=true` to disable
communication via CuArrays (note that this might lead to a very high communication
bottleneck).

!!! tip "Learning Rate"

    Remember to linearly scale the learning rate based on the number of processes you are
    using.

!!! note

    If using CUDA-aware MPI, you need to disable the default CUDA allocator with
    `export JULIA_CUDA_MEMORY_POOL=none`. This might slow down your code slightly but will
    prevent any sudden segfaults which occur without setting this parameter.
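
If you launch training from a Julia driver script rather than the shell, the same two settings
can be applied as environment variables. This is only a sketch, assuming both packages pick the
variables up when they initialize; the shell `export` shown above works just as well:

```julia
# Set these before `using CUDA` / `using FluxMPI` so they are visible at initialization.
ENV["JULIA_CUDA_MEMORY_POOL"] = "none"           # disable the default CUDA memory pool
ENV["FLUXMPI_DISABLE_CUDAMPI_SUPPORT"] = "true"  # disable communication via CuArrays
```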

```bash
mpiexecjl -np 4 julia --project=examples/ImageNet -t 4 examples/ImageNet/main.jl\
    --cfg.dataset.data_root=/home/avik-pal/data/ImageNet/\
    --cfg.dataset.train_batchsize=256 --cfg.dataset.eval_batchsize=256\
    --cfg.optimizer.learning_rate=0.5
```

## Usage

```bash
usage: main.jl [--cfg.seed CFG.SEED] [--cfg.model.name CFG.MODEL.NAME]
               [--cfg.model.arch CFG.MODEL.ARCH]
               [--cfg.model.pretrained CFG.MODEL.PRETRAINED]
               [--cfg.optimizer.name CFG.OPTIMIZER.NAME]
               [--cfg.optimizer.learning_rate CFG.OPTIMIZER.LEARNING_RATE]
               [--cfg.optimizer.nesterov CFG.OPTIMIZER.NESTEROV]
               [--cfg.optimizer.momentum CFG.OPTIMIZER.MOMENTUM]
               [--cfg.optimizer.weight_decay CFG.OPTIMIZER.WEIGHT_DECAY]
               [--cfg.optimizer.scheduler.name CFG.OPTIMIZER.SCHEDULER.NAME]
               [--cfg.optimizer.scheduler.cycle_length CFG.OPTIMIZER.SCHEDULER.CYCLE_LENGTH]
               [--cfg.optimizer.scheduler.damp_factor CFG.OPTIMIZER.SCHEDULER.DAMP_FACTOR]
               [--cfg.optimizer.scheduler.lr_step CFG.OPTIMIZER.SCHEDULER.LR_STEP]
               [--cfg.optimizer.scheduler.lr_step_decay CFG.OPTIMIZER.SCHEDULER.LR_STEP_DECAY]
               [--cfg.train.total_steps CFG.TRAIN.TOTAL_STEPS]
               [--cfg.train.evaluate_every CFG.TRAIN.EVALUATE_EVERY]
               [--cfg.train.resume CFG.TRAIN.RESUME]
               [--cfg.train.evaluate CFG.TRAIN.EVALUATE]
               [--cfg.train.checkpoint_dir CFG.TRAIN.CHECKPOINT_DIR]
               [--cfg.train.log_dir CFG.TRAIN.LOG_DIR]
               [--cfg.train.expt_subdir CFG.TRAIN.EXPT_SUBDIR]
               [--cfg.train.expt_id CFG.TRAIN.EXPT_ID]
               [--cfg.train.print_frequency CFG.TRAIN.PRINT_FREQUENCY]
               [--cfg.dataset.data_root CFG.DATASET.DATA_ROOT]
               [--cfg.dataset.eval_batchsize CFG.DATASET.EVAL_BATCHSIZE]
               [--cfg.dataset.train_batchsize CFG.DATASET.TRAIN_BATCHSIZE]
               [-h]

optional arguments:
  --cfg.seed CFG.SEED   (type: Int64, default: 12345)
  --cfg.model.name CFG.MODEL.NAME
                        (default: "resnet")
  --cfg.model.arch CFG.MODEL.ARCH
                        (default: "resnet18")
  --cfg.model.pretrained CFG.MODEL.PRETRAINED
                        (type: Bool, default: false)
  --cfg.optimizer.name CFG.OPTIMIZER.NAME
                        (default: "adam")
  --cfg.optimizer.learning_rate CFG.OPTIMIZER.LEARNING_RATE
                        (type: Float32, default: 0.01)
  --cfg.optimizer.nesterov CFG.OPTIMIZER.NESTEROV
                        (type: Bool, default: false)
  --cfg.optimizer.momentum CFG.OPTIMIZER.MOMENTUM
                        (type: Float32, default: 0.0)
  --cfg.optimizer.weight_decay CFG.OPTIMIZER.WEIGHT_DECAY
                        (type: Float32, default: 0.0)
  --cfg.optimizer.scheduler.name CFG.OPTIMIZER.SCHEDULER.NAME
                        (default: "step")
  --cfg.optimizer.scheduler.cycle_length CFG.OPTIMIZER.SCHEDULER.CYCLE_LENGTH
                        (type: Int64, default: 50000)
  --cfg.optimizer.scheduler.damp_factor CFG.OPTIMIZER.SCHEDULER.DAMP_FACTOR
                        (type: Float32, default: 1.2)
  --cfg.optimizer.scheduler.lr_step CFG.OPTIMIZER.SCHEDULER.LR_STEP
                        (type: Vector{Int64}, default: [100000, 250000, 500000])
  --cfg.optimizer.scheduler.lr_step_decay CFG.OPTIMIZER.SCHEDULER.LR_STEP_DECAY
                        (type: Float32, default: 0.1)
  --cfg.train.total_steps CFG.TRAIN.TOTAL_STEPS
                        (type: Int64, default: 800000)
  --cfg.train.evaluate_every CFG.TRAIN.EVALUATE_EVERY
                        (type: Int64, default: 10000)
  --cfg.train.resume CFG.TRAIN.RESUME
                        (default: "")
  --cfg.train.evaluate CFG.TRAIN.EVALUATE
                        (type: Bool, default: false)
  --cfg.train.checkpoint_dir CFG.TRAIN.CHECKPOINT_DIR
                        (default: "checkpoints")
  --cfg.train.log_dir CFG.TRAIN.LOG_DIR
                        (default: "logs")
  --cfg.train.expt_subdir CFG.TRAIN.EXPT_SUBDIR
                        (default: "")
  --cfg.train.expt_id CFG.TRAIN.EXPT_ID
                        (default: "")
  --cfg.train.print_frequency CFG.TRAIN.PRINT_FREQUENCY
                        (type: Int64, default: 100)
  --cfg.dataset.data_root CFG.DATASET.DATA_ROOT
                        (default: "")
  --cfg.dataset.eval_batchsize CFG.DATASET.EVAL_BATCHSIZE
                        (type: Int64, default: 64)
  --cfg.dataset.train_batchsize CFG.DATASET.TRAIN_BATCHSIZE
                        (type: Int64, default: 64)
  -h, --help            show this help message and exit
```
48 changes: 48 additions & 0 deletions examples/ImageNet/config.jl
@@ -0,0 +1,48 @@
@option struct ModelConfig
name::String = "resnet"
arch::String = "resnet18"
pretrained::Bool = false
end

@option struct SchedulerConfig
name::String = "step"
cycle_length::Int = 50000
damp_factor::Float32 = 1.2f0
lr_step::Vector{Int64} = [100000, 250000, 500000]
lr_step_decay::Float32 = 0.1f0
end

@option struct OptimizerConfig
name::String = "adam"
learning_rate::Float32 = 0.01f0
nesterov::Bool = false
momentum::Float32 = 0.0f0
weight_decay::Float32 = 0.0f0
scheduler::SchedulerConfig = SchedulerConfig()
end

@option struct TrainConfig
total_steps::Int = 800000
evaluate_every::Int = 10000
resume::String = ""
evaluate::Bool = false
checkpoint_dir::String = "checkpoints"
log_dir::String = "logs"
expt_subdir::String = ""
expt_id::String = ""
print_frequency::Int = 100
end

@option struct DatasetConfig
data_root::String = ""
eval_batchsize::Int = 64
train_batchsize::Int = 64
end

@option struct ExperimentConfig
seed::Int = 12345
model::ModelConfig = ModelConfig()
optimizer::OptimizerConfig = OptimizerConfig()
train::TrainConfig = TrainConfig()
dataset::DatasetConfig = DatasetConfig()
end
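
To see how these structs relate to the `--cfg.*` flags in the README, here is a small,
hypothetical sketch using the keyword constructors generated by `Configurations.@option`. The
real entry point is `main.jl` (not shown in this commit view), which builds the config from the
command line via SimpleConfig; the `include` path and the `step_lr` helper below are
illustrative assumptions, not part of the example:

```julia
using Configurations

include("examples/ImageNet/config.jl")  # load the @option structs above (path assumes repo root)

# Build a configuration programmatically; unspecified fields keep the defaults above.
cfg = ExperimentConfig(;
    seed=0,
    optimizer=OptimizerConfig(; name="sgd", learning_rate=0.1f0, momentum=0.9f0),
    dataset=DatasetConfig(; data_root="/path/to/ImageNet/", train_batchsize=256))

@assert cfg.optimizer.scheduler.name == "step"  # nested defaults are filled in automatically

# Illustrative helper: one way the scheduler fields could drive a step-decay learning rate.
step_lr(s, lr0, step) = lr0 * s.lr_step_decay^count(<=(step), s.lr_step)

step_lr(cfg.optimizer.scheduler, cfg.optimizer.learning_rate, 150_000)  # ≈ 0.01f0, one decay boundary crossed
```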