iterative · iesahin · Jan 18, 2022 · Jan 19, 2022 · Jan 19, 2022 · Jan 19, 2022
diff --git a/example-cml/generate.bash b/example-cml/generate.bash
@@ -0,0 +1,156 @@
+#!/usr/bin/env bash
+
+set -veux
+
+HERE="$( cd "$(dirname "$0")" ; pwd -P )"
+export HERE
+PROJECT_NAME="example-cml"
+PROJECT_SUFFIX="$(git rev-parse --short HEAD)-$(date +%F-%H-%M-%S)"
+
+SEED_REPO="git@github.com:iterative/example_cml"
+
+export REPO_ROOT="${HERE}/build/${PROJECT_NAME}-${PROJECT_SUFFIX}"
+
+# Count the number of git tag calls in this repository
+NUM_TAGS=$(grep 'git tag' ${HERE}/generate-* | wc -l)
+# Start a bit more in the past
+TOTAL_TAGS=$(( NUM_TAGS + 10 ))
+
+export STEP_TIME=$(( RANDOM + 50000 ))
+export TAG_TIME=$(( $(date +%s) - ( TOTAL_TAGS * STEP_TIME ) ))
+
+export GIT_AUTHOR_NAME="Olivaw Owlet"
+export GIT_AUTHOR_EMAIL="64868532+iterative-olivaw@users.noreply.github.com"
+export GIT_COMMITTER_NAME="$GIT_AUTHOR_NAME"
+export GIT_COMMITTER_EMAIL="$GIT_AUTHOR_EMAIL"
+
+tag_tick() {
+  export TAG_TIME=$(( TAG_TIME + STEP_TIME ))
+  export GIT_AUTHOR_DATE=${TAG_TIME}
+  export GIT_COMMITTER_DATE=${TAG_TIME}
+}
+
+export -f tag_tick
+
+if [[ -d "$REPO_ROOT" ]]; then
+    echo "Repo $REPO_ROOT already exists, please remove it first."
+    exit 1
+fi
+
+BRANCH_MODIFY_SCRIPT="modify-branch.bash"
+
+mkdir -p "${REPO_ROOT}"
+pushd "${REPO_ROOT}"
+
+# git clone "${SEED_REPO}"
+# SEED_DIR=$(basename "${SEED_REPO}")
+
+hubs=(github)
+
+git_remote_from_hub() {
+    local hubname=$1
+    local repo_name=$2
+    case $hubname in
+        github ) echo "git@github.com:iterative/${repo_name}"
+            ;;
+        * ) echo "No support for $hubname yet"
+            exit 99
+            ;;
+    esac
+}
+
+for hub in ${hubs} ; do
+    mkdir -p ${REPO_ROOT}/${hub}
+    for source_dir in $(find ${HERE}/${hub}/ -maxdepth 1 -mindepth 1 -type d) ; do
+        repo_name=$(basename ${source_dir})
+        target_dir="${REPO_ROOT}/${hub}/${repo_name}"
+        git clone --depth=1 ${SEED_REPO} ${target_dir}
+        pushd ${target_dir}
+        # Delete git to reinit
+        rm -rf .git
+        git init --initial-branch=seed
+        git add .
+        git commit -m "Initial commit from files in ${SEED_REPO}"
+        git remote add origin "$(git_remote_from_hub $hub $repo_name)"
+        for branch_dir in $(find ${source_dir}  -maxdepth 1 -mindepth 1 -type d) ; do
+            branch_name=$(basename ${branch_dir})
+            git checkout -b ${branch_name}
+            cp -r ${branch_dir}/. ${target_dir}
+            if [[ -f  "${BRANCH_MODIFY_SCRIPT}" ]] ; then
+                chmod u+x "${BRANCH_MODIFY_SCRIPT}"
+                bash -c "${BRANCH_MODIFY_SCRIPT}"
+                # remove not to check in the script to the repository
+                rm -f "${BRANCH_MODIFY_SCRIPT}"
+            fi
+
+            git add .
+            git commit -m "Modifications for ${branch_name}"
+            # move this to push script
+            # git branch --set-upstream-to=origin/${branch_name}
+            git status -s
+            git checkout seed
+        done
+        popd
+    done
+done
+
+## TODO: Our push script should push all generated repositories and DVC elements
+
+PUSH_SCRIPT="${REPO_ROOT}/push-${PROJECT_NAME}.bash"
+
+cat > "${PUSH_SCRIPT}" <<EOF
+#!/usr/bin/env bash
+
+set -veux
+
+for hub in ${hubs} ; do
+    for source_dir in \$(find ${HERE}/${hub}/ -maxdepth 1 -mindepth 1 -type d) ; do
+        repo_name=\$(basename ${source_dir})
+        target_dir="${REPO_ROOT}/\${hub}/\${repo_name}"
+        pushd \${target_dir}
+        ## We are pushing the branches one by one to prevent "seed" branch to appear
+        for branch_dir in \$(find \${source_dir}  -maxdepth 1 -mindepth 1 -type d) ; do
+            branch_name=\$(basename \${branch_dir})
+            git checkout \${branch_name}
+            dvc push
+            git push --force -u origin
+        done
+        popd
+done
+EOF
+
+chmod u+x "${PUSH_SCRIPT}"
+
+# Leave the build root entered earlier with pushd.
+popd
+
+# Final summary for the operator. The list of push scripts comes from a quoted
+# glob expanded by the shell, so a REPO_ROOT containing spaces is handled and
+# no `ls` output has to be parsed.
+cat << EOF
+
+##################################
+### REPOSITORY GENERATION DONE ###
+##################################
+
+Repositories are in:
+
+${REPO_ROOT}
+
+Push scripts are written to:
+$(printf '%s\n' "${REPO_ROOT}"/*.bash)
+
+You may remove the generated repo with:
+
+$ rm -fR ${REPO_ROOT}
+EOF
+
+unset HERE
+unset PROJECT_NAME
+unset REPO_NAME
+unset REPO_ROOT
+unset STEP_TIME
+unset TAG_TIME
+unset GIT_AUTHOR_NAME
+unset GIT_AUTHOR_EMAIL
+unset GIT_AUTHOR_DATE
+unset GIT_COMMITTER_NAME
+unset GIT_COMMITTER_EMAIL
+unset GIT_COMMITTER_DATE
+unset tag_tick
diff --git a/...le-cml/github/cml-runner-base-case/dependabot-pip-pillow-8.3.2/.github/workflows/cml.yaml b/...le-cml/github/cml-runner-base-case/dependabot-pip-pillow-8.3.2/.github/workflows/cml.yaml
@@ -0,0 +1,49 @@
+# Two-job workflow triggered on every push: the first job starts a runner on
+# AWS with cml-runner, the second job trains the model on that runner and
+# sends report.md with cml-send-comment.
+name: train-my-model
+
+on: [push]
+
+jobs:
+  # Runs on a GitHub-hosted machine and launches the cloud runner.
+  deploy-runner:
+    runs-on: [ubuntu-latest]
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: iterative/setup-cml@v1
+
+      # The label passed to cml-runner here is the one the next job selects in
+      # its runs-on list.
+      - name: deploy
+        shell: bash
+        env:
+          repo_token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+        run: |
+          cml-runner \
+          --cloud aws \
+          --cloud-region us-west \
+          --cloud-type=t2.micro \
+          --labels=cml-runner
+
+  # Starts only after deploy-runner has finished; executes inside the
+  # dvcorg/cml container on the self-hosted runner labelled cml-runner.
+  run:
+    needs: deploy-runner
+    runs-on: [self-hosted,cml-runner]
+    container: docker://dvcorg/cml
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - uses: actions/setup-python@v2
+      with:
+        python-version: '3.6'
+
+    # Installs the pinned requirements, runs train.py, then assembles report.md
+    # from metrics.txt and confusion_matrix.png.
+    - name: cml
+      env:
+        repo_token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
+      run: |
+        python --version
+        pip install -r requirements.txt
+        python train.py
+
+        echo "## Report from your EC2 Instance" > report.md
+        cat metrics.txt >> report.md
+        cml-publish "confusion_matrix.png" --md >> report.md
+        cml-send-comment report.md
diff --git a/example-cml/github/cml-runner-base-case/dependabot-pip-pillow-8.3.2/README.md b/example-cml/github/cml-runner-base-case/dependabot-pip-pillow-8.3.2/README.md
@@ -0,0 +1,21 @@
+# Example `cml-runner` workflow
+
+This repository contains a sample project using [CML](https://github.com/iterative/cml) to provision and launch a small EC2 instance and run a machine learning workflow on the instance:
+- GitHub will deploy a runner machine and set up CML with the `setup-cml` GitHub Action
+- The workflow uses `cml-runner` to provision and launch a `t2.micro` instance on AWS EC2
+- The new `t2.micro` instance runs a workflow to pull a Docker container, install Python package requirements, and train a `scikit-learn` model.
+- CML returns a summary of the model accuracy and a confusion matrix as a comment in your Pull Request. 
+
+The key file enabling these actions is `.github/workflows/cml.yaml`.
+
+## Secrets and environment variables
+In this example, `.github/workflows/cml.yaml` contains three environment variables that are stored as repository secrets.
+
+| Secret  | Description  | 
+|---|---|
+|  PERSONAL_ACCESS_TOKEN | You must create a personal access token with repository and workflow permissions. |
+| AWS_ACCESS_KEY_ID  | AWS credential for accessing S3 storage  | 
+| AWS_SECRET_ACCESS_KEY | AWS credential for accessing S3 storage |
+
+The `cml-runner` function currently works with AWS and Azure cloud service providers. For Azure, you'll want to substitute the `AWS` secrets for Azure's credential variables. 
+
diff --git a/example-cml/github/cml-runner-base-case/dependabot-pip-pillow-8.3.2/requirements.txt b/example-cml/github/cml-runner-base-case/dependabot-pip-pillow-8.3.2/requirements.txt
@@ -0,0 +1,13 @@
+cycler==0.10.0
+joblib==1.0.1
+kiwisolver==1.3.1
+matplotlib==3.3.4
+numpy==1.19.4
+Pillow==8.3.2
+pyparsing==2.4.7
+python-dateutil==2.8.1
+scikit-learn==0.24.1
+scipy==1.5.4
+six==1.15.0
+sklearn==0.0
+threadpoolctl==2.1.0
diff --git a/example-cml/github/cml-runner-base-case/dependabot-pip-pillow-8.3.2/train.py b/example-cml/github/cml-runner-base-case/dependabot-pip-pillow-8.3.2/train.py
@@ -0,0 +1,25 @@
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import plot_confusion_matrix
+from sklearn.datasets import make_classification
+from sklearn.model_selection import train_test_split
+import matplotlib.pyplot as plt
+
+
+# Generate some data
+X,y = make_classification(1000)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=18)
+
+# Fit a model
+depth = 2
+clf = RandomForestClassifier(max_depth=depth)
+clf.fit(X_train,y_train)
+
+# Assess accuracy on held-out data and print the accuracy
+acc = clf.score(X_test, y_test)
+print(acc)
+with open("metrics.txt", 'w') as outfile:
+        outfile.write("Accuracy: " + str(acc) + "\n")
+
+# Plot it
+disp = plot_confusion_matrix(clf, X_test, y_test, normalize='true',cmap=plt.cm.Blues)
+plt.savefig('confusion_matrix.png')
diff --git a/example-cml/github/cml-runner-base-case/expeiment/.github/workflows/cml.yaml b/example-cml/github/cml-runner-base-case/expeiment/.github/workflows/cml.yaml
@@ -0,0 +1,45 @@
+# Two-job workflow triggered on every push: the first job starts a runner on
+# AWS with cml-runner, the second job trains the model on that runner and
+# sends report.md with cml-send-comment.
+name: Train-in-the-cloud
+
+on: [push]
+
+jobs:
+  # Runs on a GitHub-hosted machine and launches the cloud runner.
+  deploy-runner:
+    runs-on: [ubuntu-latest]
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: iterative/setup-cml@v1
+
+      # The label passed to cml-runner here is the one the next job selects in
+      # its runs-on list.
+      - name: deploy
+        shell: bash
+        env:
+          repo_token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+        run: |
+          cml-runner \
+          --cloud aws \
+          --cloud-region us-west \
+          --cloud-type=t2.micro \
+          --labels=cml-runner
+
+  # Starts only after deploy-runner has finished; executes inside the
+  # dvcorg/cml-py3 container on the self-hosted runner labelled cml-runner.
+  model-training:
+    needs: deploy-runner
+    runs-on: [self-hosted,cml-runner]
+    container: docker://dvcorg/cml-py3
+
+    steps:
+    - uses: actions/checkout@v2
+
+    # Installs the pinned requirements, runs train.py, then assembles report.md
+    # from metrics.txt and confusion_matrix.png.
+    - name: "Train my model"
+      env:
+        repo_token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
+      run: |
+        python --version
+        pip install -r requirements.txt
+        python train.py
+
+        echo "## Report from your EC2 Instance" > report.md
+        cat metrics.txt >> report.md
+        cml-publish "confusion_matrix.png" --md >> report.md
+        cml-send-comment report.md
diff --git a/example-cml/github/cml-runner-base-case/expeiment/README.md b/example-cml/github/cml-runner-base-case/expeiment/README.md
@@ -0,0 +1,21 @@
+# Example `cml-runner` workflow
+
+This repository contains a sample project using [CML](https://github.com/iterative/cml) to provision and launch a small EC2 instance and run a machine learning workflow on the instance:
+- GitHub will deploy a runner machine and set up CML with the `setup-cml` GitHub Action
+- The workflow uses `cml-runner` to provision and launch a `t2.micro` instance on AWS EC2
+- The new `t2.micro` instance runs a workflow to pull a Docker container, install Python package requirements, and train a `scikit-learn` model.
+- CML returns a summary of the model accuracy and a confusion matrix as a comment in your Pull Request. 
+
+The key file enabling these actions is `.github/workflows/cml.yaml`.
+
+## Secrets and environment variables
+In this example, `.github/workflows/cml.yaml` contains three environment variables that are stored as repository secrets.
+
+| Secret  | Description  | 
+|---|---|
+|  PERSONAL_ACCESS_TOKEN | You must create a personal access token with repository and workflow permissions. |
+| AWS_ACCESS_KEY_ID  | AWS credential for accessing S3 storage  | 
+| AWS_SECRET_ACCESS_KEY | AWS credential for accessing S3 storage |
+
+The `cml-runner` function currently works with AWS and Azure cloud service providers. For Azure, you'll want to substitute the `AWS` secrets for Azure's credential variables. 
+
diff --git a/example-cml/github/cml-runner-base-case/expeiment/requirements.txt b/example-cml/github/cml-runner-base-case/expeiment/requirements.txt
@@ -0,0 +1,13 @@
+cycler==0.10.0
+joblib==1.0.1
+kiwisolver==1.3.1
+matplotlib==3.3.4
+numpy==1.19.4
+Pillow==8.1.0
+pyparsing==2.4.7
+python-dateutil==2.8.1
+scikit-learn==0.24.1
+scipy==1.5.4
+six==1.15.0
+sklearn==0.0
+threadpoolctl==2.1.0
diff --git a/example-cml/github/cml-runner-base-case/expeiment/train.py b/example-cml/github/cml-runner-base-case/expeiment/train.py
@@ -0,0 +1,25 @@
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import plot_confusion_matrix
+from sklearn.datasets import make_classification
+from sklearn.model_selection import train_test_split
+import matplotlib.pyplot as plt
+
+
+# Generate some data
+X,y = make_classification(1000)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=18)
+
+# Fit a model
+depth = 5
+clf = RandomForestClassifier(max_depth=depth)
+clf.fit(X_train,y_train)
+
+# Assess accuracy on held-out data and print the accuracy
+acc = clf.score(X_test, y_test)
+print(acc)
+with open("metrics.txt", 'w') as outfile:
+        outfile.write("Accuracy: " + str(acc) + "\n")
+
+# Plot it
+disp = plot_confusion_matrix(clf, X_test, y_test, normalize='true',cmap=plt.cm.Blues)
+plt.savefig('confusion_matrix.png')