# Skip to content

# Add CPTAC studies from GDC (#2031) #312

# Add CPTAC studies from GDC (#2031)

# Add CPTAC studies from GDC (#2031) #312

# Workflow file for this run

name: Zip studies on push to master branch
on:
  # Trigger the workflow on push, but only for the master branch and only
  # when a file under one of the study roots listed in `paths` changed.
  push:
    branches:
      - master
    paths:
      - 'crdc/gdc/**'
      - 'public/**'
jobs:
  # Publishes a JSON list of all study names to S3 and, for every study
  # directory touched by the push, uploads a <study>.tar.gz archive.
  zip_studies:
    # Run only when the repository is cbioportal/datahub.
    if: github.repository == 'cbioportal/datahub'
    runs-on: ubuntu-latest
    steps:
      # Exposes the files changed by this push as `outputs.all`; the next
      # step consumes that value as a space-separated list.
      - id: files
        uses: jitterbit/get-changed-files@v1
      # Reduce the changed files to the unique study directories containing
      # them: public/<study> or crdc/gdc/<study>.
      - id: unique_studies
        env:
          # Passed through the environment instead of being pasted into the
          # script, so unusual file names cannot alter the shell code.
          CHANGED_FILES: ${{ steps.files.outputs.all }}
        run: |
          # printf (not echo) avoids a trailing newline ending up inside the
          # last record; awk de-duplicates via the keys of `seen` and prints
          # the study paths separated by spaces on a single line.
          echo "unique_study_paths_list=$(printf '%s' "${CHANGED_FILES}" | awk 'BEGIN { RS=" "; ORS=" " }
          {
            if (index($0, "public/") == 1) {
              split($0, parts, "/");
              seen["public/" parts[2]]++;
            }
            if (index($0, "crdc/gdc/") == 1) {
              split($0, parts, "/");
              seen["crdc/gdc/" parts[3]]++;
            }
          }
          END { for (studypath in seen) { print studypath } }')" >> "${GITHUB_OUTPUT}"
          exit 0
      - uses: actions/checkout@v2
      - run: |
          # detecting modified studies and archiving them
          # create a JSON list of all studies under 'crdc/gdc' and 'public'
          echo creating ${STUDY_LIST_FILENAME}
          echo "[" > ${STUDY_LIST_FILENAME}
          first_study=1
          # -mindepth/-maxdepth go before -type to avoid GNU find's warning
          # about global options placed after tests.
          find crdc/gdc -mindepth 1 -maxdepth 1 -type d | sort > ${STUDY_PATH_FILENAME}
          find public -mindepth 1 -maxdepth 1 -type d | sort >> ${STUDY_PATH_FILENAME}
          while read -r studypath ; do
            # every entry except the first is preceded by a comma separator
            if [ ${first_study} -ne 1 ] ; then
              echo "," >> ${STUDY_LIST_FILENAME}
            fi
            studyname=$(basename "$studypath")
            echo -n "\"${studyname}\"" >> ${STUDY_LIST_FILENAME}
            first_study=0
          done < ${STUDY_PATH_FILENAME}
          rm -f ${STUDY_PATH_FILENAME}
          echo "" >> ${STUDY_LIST_FILENAME}
          echo "]" >> ${STUDY_LIST_FILENAME}
          # copy study list file to S3
          echo copying ${STUDY_LIST_FILENAME} to S3
          aws s3 cp --acl public-read ${STUDY_LIST_FILENAME} s3://${AWS_S3_BUCKET}/
          rm -f ${STUDY_LIST_FILENAME}
          # first save .git
          tar -cvf ${SAVED_GIT_REPO_FILENAME} .git > /dev/null
          mkdir studies_to_upload
          # Deliberately unquoted: the list is split on whitespace.
          for study_path in ${UNIQUE_STUDY_PATHS}; do
            # Skip entries that are not directories in the checkout (for
            # example a removed study, or a file directly under a study root).
            if [ ! -d "${study_path}" ]; then
              continue
            fi
            changed_study=$(basename "${study_path}")
            study_parent=$(dirname "${study_path}")
            echo "Pulling study: ${study_path}."
            git lfs pull --include="${study_path}"
            echo "Compressing this study: ${study_path}."
            # -C makes the study name the top-level directory in the archive.
            # NOTE(review): archives are keyed by study name only, so a study
            # name present under both roots would share one S3 object.
            tar -czvf "studies_to_upload/${changed_study}.tar.gz" -C "${study_parent}" "${changed_study}"
            echo "Copy file to S3: studies_to_upload/${changed_study}.tar.gz"
            aws s3 cp --acl public-read "studies_to_upload/${changed_study}.tar.gz" s3://${AWS_S3_BUCKET}/
            echo "Remove this directory: ${study_path}."
            rm -rf "${study_path}"
            echo "Remove this file: studies_to_upload/${changed_study}.tar.gz."
            rm "studies_to_upload/${changed_study}.tar.gz"
            # Reset .git to the snapshot taken before any LFS pull
            # (presumably to reclaim the disk space used by pulled objects).
            echo "Remove .git"
            rm -rf .git
            # add original .git back
            tar -xf ${SAVED_GIT_REPO_FILENAME} > /dev/null
          done
        env:
          STUDY_PATH_FILENAME: study_paths.txt
          STUDY_LIST_FILENAME: study_list.json
          SAVED_GIT_REPO_FILENAME: save_git.tar
          # Study directories computed by the unique_studies step.
          UNIQUE_STUDY_PATHS: ${{ steps.unique_studies.outputs.unique_study_paths_list }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }}
          AWS_EC2_METADATA_DISABLED: "true"  # fixes aws cli error (code 255); needed since ubuntu-latest v2.262.1