Parallel Process Output URLs #4469

name: Parallel Process Output URLs

on:
  schedule:
    - cron: '*/30 * * * *' # Run every 30 minutes
  workflow_dispatch: # Allow manual triggering

jobs:
  prepare:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
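      # Pick 600 random URLs from output_urls.txt and split them into 20 chunks:
      # url_part_00 .. url_part_19 (~30 URLs each)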
      - name: Select and split URLs
        run: |
          shuf -n 600 output_urls.txt > selected_urls.txt
          split -n l/20 -d selected_urls.txt url_part_
      - name: Upload URL parts
        uses: actions/upload-artifact@v4
        with:
          name: url-parts
          path: url_part_*
      - name: Upload selected URLs
        uses: actions/upload-artifact@v4
        with:
          name: selected-urls
          path: selected_urls.txt

  archive-urls:
    needs: prepare
    runs-on: ubuntu-latest
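    # One matrix job per chunk; matrix.part is zero-padded below to match the url_part_NN suffixes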
    strategy:
      matrix:
        part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install ArchiveBox
        run: |
          pip install archivebox
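      # download-artifact@v4 with no `path` input extracts the url_part_* files into the workspace root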
      - name: Download URL part
        uses: actions/download-artifact@v4
        with:
          name: url-parts
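      # Disable every local extractor so ArchiveBox only submits each URL to archive.org (Wayback Machine)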
      - name: Initialize and configure ArchiveBox
        run: |
          mkdir archivebox_data
          cd archivebox_data
          archivebox init > /dev/null 2>&1
          archivebox config --set SAVE_ARCHIVE_DOT_ORG=True
          archivebox config --set SAVE_TITLE=False
          archivebox config --set SAVE_FAVICON=False
          archivebox config --set SAVE_WGET=False
          archivebox config --set SAVE_WARC=False
          archivebox config --set SAVE_PDF=False
          archivebox config --set SAVE_SCREENSHOT=False
          archivebox config --set SAVE_DOM=False
          archivebox config --set SAVE_SINGLEFILE=False
          archivebox config --set SAVE_READABILITY=False
          archivebox config --set SAVE_MERCURY=False
          archivebox config --set SAVE_GIT=False
          archivebox config --set SAVE_MEDIA=False
          archivebox config --set USE_COLOR=False
          archivebox config --set SHOW_PROGRESS=False
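      # The Wayback availability API returns "archived_snapshots": {} when a URL has no snapshot yet;
      # archive in that case, and also when the availability check itself does not return HTTP 200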
      - name: Check and Archive URLs
        run: |
          cd archivebox_data
          while IFS= read -r url; do
            response=$(curl -s "https://archive.org/wayback/available?url=$url")
            status=$(curl -s -o /dev/null -w "%{http_code}" "https://archive.org/wayback/available?url=$url")
            # Archive when no snapshot exists, or when the availability check failed
            if echo "$response" | grep -q '"archived_snapshots": *{}' || [ "$status" != "200" ]; then
              echo "Archiving: $url"
              archivebox add "$url" > /dev/null 2>&1
            else
              echo "URL already archived: $url"
            fi
          done < ../url_part_$(printf "%02d" ${{ matrix.part }})
      - name: Clean up
        run: |
          rm -rf archivebox_data

  update-output-file:
    needs: archive-urls
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Download selected URLs
        uses: actions/download-artifact@v4
        with:
          name: selected-urls
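      # grep -vFf treats each selected URL as a literal fixed string and drops any line of
      # output_urls.txt that matches one, so processed URLs are not picked again on later runs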
      - name: Remove archived URLs from output file
        run: |
          if [ ! -s selected_urls.txt ] || [ ! -s output_urls.txt ]; then
            echo "Either selected_urls.txt or output_urls.txt is empty. Skipping update."
          else
            grep -vFf selected_urls.txt output_urls.txt > temp_urls.txt || true
            if [ -s temp_urls.txt ]; then
              mv temp_urls.txt output_urls.txt
              echo "Updated output_urls.txt"
            else
              echo "All URLs processed. Clearing output_urls.txt"
              > output_urls.txt
            fi
          fi
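      # Commit and push only when output_urls.txt actually changed; if both diff checks are clean
      # there is nothing to commit and the step exits successfully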
      - name: Commit changes
        run: |
          git config --local user.email "action@github.com"
          git config --local user.name "GitHub Action"
          git fetch origin main
          git merge origin/main --no-edit
          git add output_urls.txt
          git diff --quiet && git diff --staged --quiet || (git commit -m "Remove archived URLs from output_urls.txt" && git push)

  cleanup:
    needs: update-output-file
    runs-on: ubuntu-latest
    steps:
      - name: Remove artifacts
        uses: geekyeggo/delete-artifact@v2
        with:
          name: |
            url-parts
            selected-urls