# Continuous URL Archiving Hourly #220
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: Continuous URL Archiving Hourly

# Triggers: a push that touches the URL list, an hourly schedule, or a manual
# run from the Actions tab.
on:
  push:
    paths:
      - 'continuously_updated_urls_hourly.txt'
  schedule:
    # Run hourly (minute 0 of every hour).
    - cron: '0 * * * *'
  # Allow manual triggering.
  workflow_dispatch:

jobs:
  # Split the URL list into 20 chunks and publish them as a single artifact
  # that the matrix job below downloads.
  prepare:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Split URLs
        run: |
          # -n l/20: 20 chunks without breaking lines; -d: numeric suffixes,
          # producing url_part_00 .. url_part_19.
          split -n l/20 -d continuously_updated_urls_hourly.txt url_part_
      - name: Upload URL parts
        uses: actions/upload-artifact@v4
        with:
          name: url-parts
          path: url_part_*

  # One matrix job per chunk; each job feeds its chunk to ArchiveBox one URL
  # at a time.
  archive-urls:
    needs: prepare
    runs-on: ubuntu-latest
    strategy:
      # Chunks are independent: a failure in one must not cancel the others.
      fail-fast: false
      matrix:
        # Must stay in sync with the chunk count passed to `split` above.
        part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.11
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install ArchiveBox
        run: |
          pip install archivebox
      - name: Download URL part
        uses: actions/download-artifact@v4
        with:
          name: url-parts
      - name: Initialize and configure ArchiveBox
        run: |
          mkdir archivebox_data
          cd archivebox_data
          archivebox init > /dev/null 2>&1
          # Only the archive.org submission is enabled; every other SAVE_*
          # option set here is switched off.
          archivebox config --set SAVE_ARCHIVE_DOT_ORG=True
          archivebox config --set SAVE_TITLE=False
          archivebox config --set SAVE_FAVICON=False
          archivebox config --set SAVE_WGET=False
          archivebox config --set SAVE_WARC=False
          archivebox config --set SAVE_PDF=False
          archivebox config --set SAVE_SCREENSHOT=False
          archivebox config --set SAVE_DOM=False
          archivebox config --set SAVE_SINGLEFILE=False
          archivebox config --set SAVE_READABILITY=False
          archivebox config --set SAVE_MERCURY=False
          archivebox config --set SAVE_GIT=False
          archivebox config --set SAVE_MEDIA=False
          archivebox config --set USE_COLOR=False
          archivebox config --set SHOW_PROGRESS=False
      - name: Archive URLs
        env:
          # Passed through the environment instead of being interpolated into
          # the script text.
          PART: ${{ matrix.part }}
        run: |
          cd archivebox_data
          part_file="../url_part_$(printf '%02d' "$PART")"
          failed=0
          # `|| [ -n "$url" ]` keeps a final line that has no trailing newline.
          while IFS= read -r url || [ -n "$url" ]; do
            # Skip blank lines.
            [ -n "$url" ] || continue
            echo "Archiving: $url"
            # The step runs under `bash -e`; testing the exit status in an
            # `if` lets the loop continue past a failing URL. stdin is
            # redirected so the child cannot consume the rest of the list.
            if ! archivebox add "$url" < /dev/null > /dev/null 2>&1; then
              echo "::warning::Failed to archive: $url"
              failed=$((failed + 1))
            fi
          done < "$part_file"
          echo "Finished part $PART; $failed URL(s) failed."
      - name: Clean up
        # Run even when an earlier step failed.
        if: always()
        run: |
          rm -rf archivebox_data

  # Delete the shared artifact once the matrix has finished.
  cleanup:
    needs: archive-urls
    # Run even if some matrix jobs failed, so the artifact is not left behind.
    if: always()
    runs-on: ubuntu-latest
    permissions:
      # Deleting artifacts needs write access to the Actions API.
      actions: write
    steps:
      - name: Remove artifacts
        # delete-artifact@v2 only handles artifacts from upload-artifact@v3 and
        # earlier; artifacts uploaded with upload-artifact@v4 need v4 or later.
        uses: geekyeggo/delete-artifact@v5
        with:
          name: url-parts
          # The artifact may not exist if `prepare` failed before uploading.
          failOnError: false