# Captured from the GitHub Actions page "Continuous URL Archiving Hourly #38".
# Workflow identity and triggers.
name: Continuous URL Archiving Hourly

# Three ways to start a run:
#   - push:              only when the hourly URL list file itself changes
#   - schedule:          at minute 0 of every hour
#   - workflow_dispatch: manually
on:
  push:
    paths:
      - 'continuously_updated_urls_hourly.txt'
  schedule:
    - cron: '0 * * * *'  # Run hourly
  workflow_dispatch:  # Allow manual triggering
# Pipeline: `prepare` splits the URL list into 20 chunks and uploads them as
# one artifact; `archive-urls` fans out over the chunks (one matrix job per
# chunk) and submits every URL through ArchiveBox; `cleanup` deletes the
# shared artifact afterwards.
jobs:
  prepare:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Split URLs
        # `-n l/20` makes 20 chunks without splitting lines; `-d` selects
        # numeric suffixes, so the outputs are url_part_00 .. url_part_19.
        run: |
          split -n l/20 -d continuously_updated_urls_hourly.txt url_part_
      - name: Upload URL parts
        uses: actions/upload-artifact@v4
        with:
          name: url-parts
          path: url_part_*

  archive-urls:
    needs: prepare
    runs-on: ubuntu-latest
    strategy:
      # Without this, one failing chunk cancels every other in-progress chunk.
      fail-fast: false
      matrix:
        # One entry per chunk produced by the `split` call in `prepare`.
        part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
               10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install ArchiveBox
        run: |
          pip install archivebox
      - name: Download URL part
        # Downloads the whole `url-parts` artifact (all 20 chunks) into the
        # workspace; the archive step below reads only this job's chunk.
        uses: actions/download-artifact@v4
        with:
          name: url-parts
      - name: Initialize and configure ArchiveBox
        # Only SAVE_ARCHIVE_DOT_ORG is enabled; every other extractor set
        # here is switched off, and colour/progress output is disabled.
        run: |
          mkdir archivebox_data
          cd archivebox_data
          archivebox init > /dev/null 2>&1
          archivebox config --set SAVE_ARCHIVE_DOT_ORG=True
          archivebox config --set SAVE_TITLE=False
          archivebox config --set SAVE_FAVICON=False
          archivebox config --set SAVE_WGET=False
          archivebox config --set SAVE_WARC=False
          archivebox config --set SAVE_PDF=False
          archivebox config --set SAVE_SCREENSHOT=False
          archivebox config --set SAVE_DOM=False
          archivebox config --set SAVE_SINGLEFILE=False
          archivebox config --set SAVE_READABILITY=False
          archivebox config --set SAVE_MERCURY=False
          archivebox config --set SAVE_GIT=False
          archivebox config --set SAVE_MEDIA=False
          archivebox config --set USE_COLOR=False
          archivebox config --set SHOW_PROGRESS=False
      - name: Archive URLs
        env:
          # Passed through the environment instead of being interpolated
          # into the script text.
          PART: ${{ matrix.part }}
        run: |
          cd archivebox_data
          # Chunk files carry two-digit suffixes (url_part_00 .. url_part_19).
          part_file="../url_part_$(printf '%02d' "$PART")"
          # `|| [ -n "$url" ]` keeps a final line that lacks a trailing newline.
          while IFS= read -r url || [ -n "$url" ]; do
            # Skip blank lines rather than submitting an empty URL.
            [ -n "$url" ] || continue
            echo "Archiving: $url"
            # Run steps use `bash -e`: without the fallback, the first failing
            # URL would abort the remaining URLs in this chunk.
            archivebox add "$url" > /dev/null 2>&1 \
              || echo "::warning::Failed to archive: $url"
          done < "$part_file"
      - name: Clean up
        run: |
          rm -rf archivebox_data

  cleanup:
    needs: archive-urls
    # Run even when archive jobs failed so the artifact is still removed.
    if: always()
    runs-on: ubuntu-latest
    permissions:
      # Deleting an artifact needs write access to Actions.
      actions: write
    steps:
      - name: Remove artifacts
        # NOTE(review): bumped from @v2 because artifacts uploaded with
        # actions/upload-artifact@v4 need delete-artifact v4 or later —
        # confirm against the action's compatibility table.
        uses: geekyeggo/delete-artifact@v5
        with:
          name: url-parts