Simplify your start #16

Open · wants to merge 2 commits into master
Changes from 1 commit
79 changes: 79 additions & 0 deletions Dockerfile
@@ -0,0 +1,79 @@
# Approximately 10 min to build

FROM nvidia/cuda:10.2-cudnn7-devel
# Python
ARG python_version=3.7
ARG SSH_PASSWORD=password
ARG SSH_PUBLIC_KEY="ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCojldgho9VK4WaCbSjBAPr9i6daYdQ5s9uGpVuXLI6cAKtzT8G9AQg+wYZayYNthexuzmp5BwpyJT8QQTsUUgBuaocSAjZff8uFKNN9yVMVtT8RIYw/NVVkb97ZPx3ZxN2e7m6BlJyKNg8jKOw4qiUMCH70wYprjEKVUzEjJnM7Mq/BnJPYJr+DQG7IE9uGJwGiE7gHAatsECkcg+QcrMHpLwtha91VE/U13C5dSE072mAX50QnWSGZV2SGg+o8AJViwixJCNMZhld6thClmFezYJjsb9Uz1Hss6xatntxIjUmjL2Lyc/uWFiep+0/R5GPQ9Tbq929IpZ1DwbW5J0x rinatmullahmetov@Rinats-MacBook-Pro.local"


# https://docs.docker.com/engine/examples/running_ssh_service/
# Last is SSH login fix. Otherwise user is kicked off after login
RUN apt-get update && apt-get install -y openssh-server && \
mkdir /var/run/sshd && echo "root:$SSH_PASSWORD" | chpasswd && \
sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
sed -i 's/#PasswordAuthentication yes/PasswordAuthentication no/' /etc/ssh/sshd_config && \
echo "export VISIBLE=now" >> /etc/profile && \
mkdir /root/.ssh && chmod 700 /root/.ssh && \
echo "$SSH_PUBLIC_KEY" >> /root/.ssh/authorized_keys && \
chmod 644 /root/.ssh/authorized_keys

ENV NOTVISIBLE "in users profile"
ENV CONDA_DIR /opt/conda
ENV PATH $CONDA_DIR/bin:$PATH

# writing env variables to /etc/profile as mentioned here:
# https://docs.docker.com/engine/examples/running_ssh_service/#environment-variables
RUN echo "export CONDA_DIR=$CONDA_DIR" >> /etc/profile && \
echo "export PATH=$CONDA_DIR/bin:$PATH" >> /etc/profile && \
echo "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH" >> /etc/profile && \
echo "export LIBRARY_PATH=/usr/local/cuda/lib64:/lib/x86_64-linux-gnu:$LIBRARY_PATH" >> /etc/profile && \
echo "export CUDA_HOME=/usr/local/cuda" >> /etc/profile

# Install Miniconda
RUN mkdir -p $CONDA_DIR && \
apt-get update && \
apt-get install -y wget git vim htop zip libhdf5-dev g++ graphviz libgtk2.0-dev \
openmpi-bin nano cmake libopenblas-dev liblapack-dev libx11-dev && \
wget --quiet https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
/bin/bash /Miniconda3-latest-Linux-x86_64.sh -f -b -p $CONDA_DIR && \
ln /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/local/cuda/lib64/libcudnn.so && \
ln /usr/lib/x86_64-linux-gnu/libcudnn.so.7 /usr/local/cuda/lib64/libcudnn.so.7 && \
ln /usr/include/cudnn.h /usr/local/cuda/include/cudnn.h && \
rm Miniconda3-latest-Linux-x86_64.sh

# Install data science essentials
RUN conda config --set remote_read_timeout_secs 100000.0 && \
conda update openssl ca-certificates certifi && \
conda install -y python=${python_version} && \
pip install --upgrade pip --timeout=1000 && \
pip install --upgrade requests --timeout=1000 && \
conda install Pillow scikit-learn pandas matplotlib mkl nose pyyaml six && \
pip install opencv-contrib-python requests scipy tqdm --timeout=1000 && \
conda install pytorch torchvision cudatoolkit=10.2 -c pytorch && \
pip install pydantic graphviz hiddenlayer torchsummary --timeout=1000 && \
pip install albumentations --timeout=1000 && \
conda install -c anaconda jupyter && \
conda install -c conda-forge jupyterlab && \
pip install git+https://github.com/ipython-contrib/jupyter_contrib_nbextensions --timeout=1000 && \
jupyter contrib nbextension install && \
conda clean -yt

# Install NVIDIA Apex
RUN git clone https://github.com/NVIDIA/apex && cd apex && \
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ && \
cd .. && rm -r apex

ENV LD_LIBRARY_PATH /usr/local/cuda/lib64:/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
ENV LIBRARY_PATH /usr/local/cuda/lib64:/lib/x86_64-linux-gnu:$LIBRARY_PATH
ENV CUDA_HOME /usr/local/cuda

COPY requirements.txt /jumanji/requirements.txt
RUN pip install -r /jumanji/requirements.txt

COPY . /jumanji

EXPOSE 8888 6006 22
ENTRYPOINT ["/usr/sbin/sshd"]
CMD ["-D"]
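
For reference, a rough, untested sketch of how this image might be built and used. The image tag, host port mapping, and key path below are assumptions for illustration, not anything specified in the PR:

docker build -t jumanji --build-arg SSH_PUBLIC_KEY="$(cat ~/.ssh/id_rsa.pub)" .
docker run --gpus all -d -p 2222:22 -p 8888:8888 -p 6006:6006 jumanji
ssh -p 2222 root@localhost   # password login is disabled, so the baked-in public key is required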
87 changes: 45 additions & 42 deletions datasetGAN/train_interpreter.py
@@ -36,6 +36,7 @@
import os
device_ids = [0]
from PIL import Image
from tqdm import tqdm
import gc

import pickle
@@ -51,6 +52,7 @@
from torch.utils.data import Dataset, DataLoader
device = 'cuda' if torch.cuda.is_available() else 'cpu'
import cv2
from imageio import imwrite as imsave

class trainData(Dataset):

@@ -242,7 +244,7 @@ def generate_data(args, checkpoint_path, num_sample, start_step=0, vis=True):

print( "num_sample: ", num_sample)

for i in range(num_sample):
for i in tqdm(range(num_sample)):
if i % 100 == 0:
print("Genearte", i, "Out of:", num_sample)

@@ -316,9 +318,9 @@ def generate_data(args, checkpoint_path, num_sample, start_step=0, vis=True):

color_mask = 0.7 * colorize_mask(img_seg_final, palette) + 0.3 * img

scipy.misc.imsave(os.path.join(result_path, "vis_" + str(i) + '.jpg'),
imsave(os.path.join(result_path, "vis_" + str(i) + '.jpg'),
color_mask.astype(np.uint8))
scipy.misc.imsave(os.path.join(result_path, "vis_" + str(i) + '_image.jpg'),
imsave(os.path.join(result_path, "vis_" + str(i) + '_image.jpg'),
img.astype(np.uint8))
else:
seg_cache.append(img_seg_final)
@@ -424,7 +426,7 @@ def prepare_data(args, palette):


vis = np.concatenate(vis, 1)
scipy.misc.imsave(os.path.join(args['exp_dir'], "train_data.jpg"),
imsave(os.path.join(args['exp_dir'], "train_data.jpg"),
vis)

return all_feature_maps_train, all_mask_train, num_data
@@ -484,44 +486,45 @@ def main(args
best_loss = 10000000
stop_sign = 0
for epoch in range(100):
for X_batch, y_batch in train_loader:
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
y_batch = y_batch.type(torch.long)
y_batch = y_batch.type(torch.long)

optimizer.zero_grad()
y_pred = classifier(X_batch)
loss = criterion(y_pred, y_batch)
acc = multi_acc(y_pred, y_batch)

loss.backward()
optimizer.step()

iteration += 1
if iteration % 1000 == 0:
print('Epoch : ', str(epoch), 'iteration', iteration, 'loss', loss.item(), 'acc', acc)
gc.collect()


if iteration % 5000 == 0:
model_path = os.path.join(args['exp_dir'],
'model_20parts_iter' + str(iteration) + '_number_' + str(MODEL_NUMBER) + '.pth')
print('Save checkpoint, Epoch : ', str(epoch), ' Path: ', model_path)

torch.save({'model_state_dict': classifier.state_dict()},
model_path)

if epoch > 3:
if loss.item() < best_loss:
best_loss = loss.item()
break_count = 0
else:
break_count += 1

if break_count > 50:
stop_sign = 1
print("*************** Break, Total iters,", iteration, ", at epoch", str(epoch), "***************")
break
with tqdm(train_loader, unit="batch") as tepoch:
for X_batch, y_batch in tepoch:
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
y_batch = y_batch.type(torch.long)
y_batch = y_batch.type(torch.long)

optimizer.zero_grad()
y_pred = classifier(X_batch)
loss = criterion(y_pred, y_batch)
acc = multi_acc(y_pred, y_batch)

loss.backward()
optimizer.step()

iteration += 1
if iteration % 1000 == 0:
print('Epoch : ', str(epoch), 'iteration', iteration, 'loss', loss.item(), 'acc', acc)
gc.collect()


if iteration % 5000 == 0:
model_path = os.path.join(args['exp_dir'],
'model_20parts_iter' + str(iteration) + '_number_' + str(MODEL_NUMBER) + '.pth')
print('Save checkpoint, Epoch : ', str(epoch), ' Path: ', model_path)

torch.save({'model_state_dict': classifier.state_dict()},
model_path)

if epoch > 3:
if loss.item() < best_loss:
best_loss = loss.item()
break_count = 0
else:
break_count += 1

if break_count > 50:
stop_sign = 1
print("*************** Break, Total iters,", iteration, ", at epoch", str(epoch), "***************")
break

if stop_sign == 1:
break
5 changes: 5 additions & 0 deletions requirements.txt
@@ -1,2 +1,7 @@
torch==1.4.0
torchvision==0.5.0
Comment on lines 1 to 2 of requirements.txt

Hi there ✋🏼,

I have run the train_interpreter.py file from this repo using the latest torch and torchvision versions.
To do so, I replaced these two lines with:

-f https://download.pytorch.org/whl/torch_stable.html
torch==1.9.0+cu111
torchvision==0.10.0+cu111

Author


Hello, thank you.

I didn't test these torch/torchvision versions because

torch==1.4.0
torchvision==0.5.0

were added by the original author of this repo.

Also, you would need to update the Dockerfile to use a CUDA/cuDNN version suitable for torch 1.9.0 and torchvision 0.10.0.

Do you think the changes you proposed are needed?

@DRTorresRuiz DRTorresRuiz Jul 28, 2021


I am not really sure whether these changes affect other functionality of this repo, so I wouldn't include them directly.
However, from my point of view, I would try to update to the latest PyTorch versions to avoid future problems (just as the "imsave" function was deprecated). Moreover, using recent versions would mean greater support from the PyTorch community.

scipy==1.5.4
opencv-python==4.5.2.54
tqdm==4.61.1
Pillow==8.3.0
imageio==2.9.0
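
Following up on the thread above, a rough, untested sketch of what pinning the newer CUDA 11.1 builds suggested there could look like; the exact base-image tag is an assumption and would need to be checked against the available nvidia/cuda tags:

pip install --timeout=1000 -f https://download.pytorch.org/whl/torch_stable.html \
    torch==1.9.0+cu111 torchvision==0.10.0+cu111
# the Dockerfile base image would also need a matching CUDA 11.x tag, e.g.
# FROM nvidia/cuda:11.1.1-cudnn8-devel-ubuntu18.04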