# Parallel Process Output URLs #4470
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: Parallel Process Output URLs

# Pipeline overview:
#   prepare            - pick up to 600 random URLs from output_urls.txt and
#                        split them into 20 line-based parts (url_part_00..19)
#   archive-urls       - 20 matrix jobs, one per part, archive URLs via ArchiveBox
#   update-output-file - remove the selected URLs from output_urls.txt and push
#   cleanup            - delete the artifacts passed between jobs

on:
  schedule:
    - cron: '*/30 * * * *'  # Run every 30 minutes
  workflow_dispatch:  # Allow manual triggering

# Runs are scheduled every 30 minutes and each one pushes to main; serialize
# them so two runs never rewrite output_urls.txt at the same time.
concurrency:
  group: parallel-process-output-urls
  cancel-in-progress: false

jobs:
  prepare:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Select and split URLs
        run: |
          shuf -n 600 output_urls.txt > selected_urls.txt
          # l/20 splits on line boundaries into 20 files; -d gives numeric
          # suffixes (url_part_00 .. url_part_19) matching the matrix below.
          split -n l/20 -d selected_urls.txt url_part_
      - name: Upload URL parts
        uses: actions/upload-artifact@v4
        with:
          name: url-parts
          path: url_part_*
      - name: Upload selected URLs
        uses: actions/upload-artifact@v4
        with:
          name: selected-urls
          path: selected_urls.txt

  archive-urls:
    needs: prepare
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # One entry per file produced by `split -n l/20` in the prepare job.
        part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install ArchiveBox
        run: |
          pip install archivebox
      - name: Download URL part
        uses: actions/download-artifact@v4
        with:
          name: url-parts
      - name: Initialize and configure ArchiveBox
        run: |
          mkdir archivebox_data
          cd archivebox_data
          archivebox init > /dev/null 2>&1
          # Only the archive.org submission is enabled; every other extractor
          # and the colored/progress output are switched off.
          for setting in \
            SAVE_ARCHIVE_DOT_ORG=True \
            SAVE_TITLE=False \
            SAVE_FAVICON=False \
            SAVE_WGET=False \
            SAVE_WARC=False \
            SAVE_PDF=False \
            SAVE_SCREENSHOT=False \
            SAVE_DOM=False \
            SAVE_SINGLEFILE=False \
            SAVE_READABILITY=False \
            SAVE_MERCURY=False \
            SAVE_GIT=False \
            SAVE_MEDIA=False \
            USE_COLOR=False \
            SHOW_PROGRESS=False
          do
            archivebox config --set "$setting"
          done
      - name: Check and Archive URLs
        env:
          PART: ${{ matrix.part }}
        run: |
          cd archivebox_data
          part_file="../url_part_$(printf '%02d' "$PART")"
          # `|| [ -n "$url" ]` keeps a final line that lacks a trailing newline.
          while IFS= read -r url || [ -n "$url" ]; do
            if [ -z "$url" ]; then
              continue
            fi
            # Single availability lookup. -G/--data-urlencode keeps '&', '#'
            # and similar characters in $url from corrupting the query string.
            # A failed lookup yields an empty response and falls through to
            # archiving.
            response=$(curl -sS --fail --max-time 30 -G \
              --data-urlencode "url=$url" \
              "https://archive.org/wayback/available" || true)
            # Parse the JSON instead of grepping for an exact substring: the
            # previous grep-based test depended on JSON whitespace and its
            # branches were inverted (unarchived URLs were reported as
            # "already archived").
            if printf '%s' "$response" \
              | jq -e '.archived_snapshots.closest.available == true' > /dev/null 2>&1; then
              echo "URL already archived: $url"
            else
              echo "Archiving: $url"
              archivebox add "$url" > /dev/null 2>&1
            fi
          done < "$part_file"
      - name: Clean up
        run: |
          rm -rf archivebox_data

  update-output-file:
    needs: archive-urls
    runs-on: ubuntu-latest
    permissions:
      contents: write  # required to push the updated output_urls.txt
    steps:
      - uses: actions/checkout@v4
      - name: Sync with latest main
        # Merge before touching output_urls.txt: merging with a modified
        # working copy aborts whenever upstream changed the same file.
        run: |
          git config --local user.email "action@github.com"
          git config --local user.name "GitHub Action"
          git fetch origin main
          git merge origin/main --no-edit
      - name: Download selected URLs
        uses: actions/download-artifact@v4
        with:
          name: selected-urls
      - name: Remove archived URLs from output file
        run: |
          if [ ! -s selected_urls.txt ] || [ ! -s output_urls.txt ]; then
            echo "Either selected_urls.txt or output_urls.txt is empty. Skipping update."
          else
            # -x matches whole lines only; without it a selected URL would also
            # remove every longer URL that merely contains it as a substring.
            # grep exits 1 when nothing is left, hence `|| true`.
            grep -vxFf selected_urls.txt output_urls.txt > temp_urls.txt || true
            if [ -s temp_urls.txt ]; then
              mv temp_urls.txt output_urls.txt
              echo "Updated output_urls.txt"
            else
              echo "All URLs processed. Clearing output_urls.txt"
              > output_urls.txt
              rm -f temp_urls.txt
            fi
          fi
      - name: Commit changes
        run: |
          git add output_urls.txt
          # Commit and push only when the staged file actually changed.
          if ! git diff --staged --quiet; then
            git commit -m "Remove archived URLs from output_urls.txt"
            git push
          fi

  cleanup:
    needs: update-output-file
    # Remove the hand-off artifacts even when an earlier job failed.
    if: ${{ always() }}
    runs-on: ubuntu-latest
    permissions:
      actions: write  # required to delete artifacts
    steps:
      - name: Remove artifacts
        # v5 instead of v2 because the artifacts are created by
        # actions/upload-artifact@v4.
        uses: geekyeggo/delete-artifact@v5
        with:
          name: |
            url-parts
            selected-urls
          # Artifacts may not exist if the prepare job failed.
          failOnError: false