From 4e3b1a9477b0c1f3b4ee07d74f0d393b8fd53257 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Simon=20W=C3=B6rpel?=
Date: Tue, 20 Feb 2024 19:22:50 +0100
Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .github/workflows/docker.yml |  54 +++++++++++++++
 Dockerfile                   | 139 +++++++++++++++++++++++++++++++++++
 2 files changed, 193 insertions(+)
 create mode 100644 .github/workflows/docker.yml
 create mode 100644 Dockerfile

diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
new file mode 100644
index 0000000..02d8ca2
--- /dev/null
+++ b/.github/workflows/docker.yml
@@ -0,0 +1,54 @@
+# Builds the image from the repository's Dockerfile and pushes it to the
+# GitHub Container Registry (ghcr.io).
+name: Build ingest-file
+
+on:
+  workflow_dispatch: {}
+  push:
+    paths:
+      - Dockerfile
+      - .github/workflows/docker.yml
+
+# Listing any permission sets every unlisted scope to "none", so read access
+# to the repository contents (used by actions/checkout) is granted explicitly.
+permissions:
+  contents: read
+  packages: write
+
+jobs:
+  docker:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v2
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v4
+        with:
+          images: ghcr.io/investigativedata/ingest-file-base
+          tags: |
+            type=ref,event=branch
+            type=semver,pattern={{version}}
+            type=sha
+            type=raw,value=latest
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+        with:
+          install: true
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v2
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Build and push release
+        uses: docker/build-push-action@v3
+        with:
+          context: .
+          # platforms: linux/amd64,linux/arm64
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..1702095
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,139 @@
+# Base image for FollowTheMoney file ingestors.
+#
+# The Debian release is pinned to bookworm because the non-free apt source
+# added below hard-codes that suite; an unpinned "-slim" tag can move to a
+# newer release and end up mixing suites.
+FROM python:3.11-slim-bookworm
+# Persists into the image, so apt stays non-interactive in derived images too.
+ENV DEBIAN_FRONTEND=noninteractive
+
+LABEL org.opencontainers.image.title="Base image for FollowTheMoney File Ingestors"
+LABEL org.opencontainers.image.licenses="AGPL3"
+LABEL org.opencontainers.image.source="https://github.com/investigativedata/ingest-file-base"
+
+# Enable non-free archive for `unrar`.
+RUN echo "deb http://http.us.debian.org/debian bookworm non-free" >/etc/apt/sources.list.d/nonfree.list
+RUN apt-get -qq -y update \
+    && apt-get -qq -y install build-essential locales ca-certificates \
+    # curl downloads the type-prediction model in a later step
+    curl \
+    # python deps (mostly to install their dependencies)
+    python3-pip python3-dev python3-pil \
+    # tesseract
+    tesseract-ocr libtesseract-dev libleptonica-dev pkg-config \
+    # libraries
+    libxslt1-dev libpq-dev libldap2-dev libsasl2-dev \
+    zlib1g-dev libicu-dev libxml2-dev \
+    # package tools
+    unrar p7zip-full \
+    # audio & video metadata
+    libmediainfo-dev \
+    # image processing, djvu
+    imagemagick-common imagemagick mdbtools djvulibre-bin \
+    libtiff5-dev libjpeg-dev libfreetype6-dev libwebp-dev \
+    libtiff-tools ghostscript librsvg2-bin jbig2dec \
+    pst-utils \
+    ### tesseract
+    tesseract-ocr-eng \
+    tesseract-ocr-swa \
+    tesseract-ocr-swe \
+    # tesseract-ocr-tam \
+    # tesseract-ocr-tel \
+    tesseract-ocr-fil \
+    # tesseract-ocr-tha \
+    tesseract-ocr-tur \
+    tesseract-ocr-ukr \
+    # tesseract-ocr-vie \
+    tesseract-ocr-nld \
+    tesseract-ocr-nor \
+    tesseract-ocr-pol \
+    tesseract-ocr-por \
+    tesseract-ocr-ron \
+    tesseract-ocr-rus \
+    tesseract-ocr-slk \
+    tesseract-ocr-slv \
+    tesseract-ocr-spa \
+    # tesseract-ocr-spa_old \
+    tesseract-ocr-sqi \
+    tesseract-ocr-srp \
+    tesseract-ocr-ind \
+    tesseract-ocr-isl \
+    tesseract-ocr-ita \
+    # tesseract-ocr-ita_old \
+    # tesseract-ocr-jpn \
+    tesseract-ocr-kan \
+    tesseract-ocr-kat \
+    # tesseract-ocr-kor \
+    tesseract-ocr-khm \
+    tesseract-ocr-lav \
+    tesseract-ocr-lit \
+    # tesseract-ocr-mal \
+    tesseract-ocr-mkd \
+    tesseract-ocr-mya \
+    tesseract-ocr-mlt \
+    tesseract-ocr-msa \
+    tesseract-ocr-est \
+    # tesseract-ocr-eus \
+    tesseract-ocr-fin \
+    tesseract-ocr-fra \
+    tesseract-ocr-frk \
+    # tesseract-ocr-frm \
+    # tesseract-ocr-glg \
+    # tesseract-ocr-grc \
+    tesseract-ocr-heb \
+    tesseract-ocr-hin \
+    tesseract-ocr-hrv \
+    tesseract-ocr-hye \
+    tesseract-ocr-hun \
+    # tesseract-ocr-ben \
+    tesseract-ocr-bul \
+    tesseract-ocr-cat \
+    tesseract-ocr-ces \
+    tesseract-ocr-nep \
+    # tesseract-ocr-chi_sim \
+    # tesseract-ocr-chi_tra \
+    # tesseract-ocr-chr \
+    tesseract-ocr-dan \
+    tesseract-ocr-deu \
+    tesseract-ocr-ell \
+    # tesseract-ocr-enm \
+    # tesseract-ocr-epo \
+    # tesseract-ocr-equ \
+    tesseract-ocr-afr \
+    tesseract-ocr-ara \
+    tesseract-ocr-aze \
+    tesseract-ocr-bel \
+    tesseract-ocr-uzb \
+    ### pdf convert: libreoffice + a bunch of fonts
+    libreoffice fonts-opensymbol hyphen-fr hyphen-de \
+    hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-core fonts-dejavu-extra \
+    fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
+    fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \
+    fonts-tlwg-purisa \
+    ###
+    && apt-get -qq -y autoremove \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
+    && localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
+
+# Set up the locale and make sure the system uses unicode for the file system.
+ENV LANG='en_US.UTF-8' \
+    TZ='UTC' \
+    OMP_THREAD_LIMIT='1' \
+    OPENBLAS_NUM_THREADS='1'
+
+# NOTE(review): no USER instruction follows, so the image still runs as root;
+# presumably derived images switch to "app" themselves - confirm.
+RUN groupadd -g 1000 -r app \
+    && useradd -m -u 1000 -s /bin/false -g app app
+
+# Download the ftm-typepredict model
+# --fail aborts the build on an HTTP error instead of saving the error page
+# as the model file.
+RUN mkdir -p /models/ && \
+    curl --fail --silent --show-error --location \
+        -o "/models/model_type_prediction.ftz" \
+        "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"
+
+RUN pip3 install --no-cache-dir --prefer-binary --upgrade pip
+RUN pip3 install --no-cache-dir --prefer-binary --upgrade setuptools wheel