-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
5df0e55
commit 4e3b1a9
Showing
2 changed files
with
174 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
# Builds the ingest-file base image and publishes it to GitHub Container Registry.
name: Build ingest-file

# Runs on manual dispatch, or on a push that touches the Dockerfile or this workflow.
on:
  workflow_dispatch: {}
  push:
    paths:
      - Dockerfile
      - .github/workflows/docker.yml

# Declaring any scope here sets every unlisted scope to "none", so the
# checkout step needs an explicit read grant on repository contents.
permissions:
  contents: read
  packages: write

jobs:
  # Single job: compute image tags/labels, log in to GHCR, build and push.
  docker:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2
      - name: Docker meta
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ghcr.io/investigativedata/ingest-file-base
          # "latest" is restricted to the default branch so that a push on a
          # feature branch cannot overwrite the published latest image.
          tags: |
            type=ref,event=branch
            type=semver,pattern={{version}}
            type=sha
            type=raw,value=latest,enable={{is_default_branch}}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
        with:
          install: true
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Build and push release
        uses: docker/build-push-action@v3
        with:
          context: .
          # platforms: linux/amd64,linux/arm64
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          # Layer cache is stored in the GitHub Actions cache backend.
          cache-from: type=gha
          cache-to: type=gha,mode=max
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
# The Debian release is pinned to bookworm because the apt source added further
# down hard-codes the "bookworm" suite; a floating "-slim" tag could move to a
# different release and leave the image with mixed package archives.
FROM python:3.11-slim-bookworm
# Suppresses interactive debconf prompts during package installation.
# NOTE(review): as ENV this also persists into the runtime environment and into
# derived images -- confirm that is intended, otherwise switch to ARG.
ENV DEBIAN_FRONTEND=noninteractive

# OCI image metadata, written in key="value" form (the whitespace-separated
# legacy form is flagged by Docker build checks).
# NOTE(review): "AGPL3" is not an SPDX identifier; the value is left unchanged
# here -- confirm whether AGPL-3.0-only or AGPL-3.0-or-later is meant.
LABEL org.opencontainers.image.title="Base image for FollowTheMoney File Ingestors" \
      org.opencontainers.image.licenses="AGPL3" \
      org.opencontainers.image.source="https://github.com/investigativedata/ingest-file-base"

# Register Debian's non-free archive; it is needed for the `unrar` package.
RUN printf '%s\n' "deb http://http.us.debian.org/debian bookworm non-free" \
    > /etc/apt/sources.list.d/nonfree.list
# Install all OS-level dependencies in one layer, then remove apt caches and
# temp files in that same layer and generate the en_US.UTF-8 locale.
# `curl` is listed explicitly because a later build step in this file invokes
# it and it is not guaranteed to be present in a slim base image.
RUN apt-get -qq -y update \
    && apt-get -qq -y install build-essential locales ca-certificates \
    # download tool used by the model download step
    curl \
    # python deps (mostly to install their dependencies)
    python3-pip python3-dev python3-pil \
    # tesseract
    tesseract-ocr libtesseract-dev libleptonica-dev pkg-config \
    # libraries
    libxslt1-dev libpq-dev libldap2-dev libsasl2-dev \
    zlib1g-dev libicu-dev libxml2-dev \
    # package tools
    unrar p7zip-full \
    # audio & video metadata
    libmediainfo-dev \
    # image processing, djvu
    imagemagick-common imagemagick mdbtools djvulibre-bin \
    libtiff5-dev libjpeg-dev libfreetype6-dev libwebp-dev \
    libtiff-tools ghostscript librsvg2-bin jbig2dec \
    pst-utils \
    ### tesseract language data (commented-out entries are intentionally not installed)
    tesseract-ocr-eng \
    tesseract-ocr-swa \
    tesseract-ocr-swe \
    # tesseract-ocr-tam \
    # tesseract-ocr-tel \
    tesseract-ocr-fil \
    # tesseract-ocr-tha \
    tesseract-ocr-tur \
    tesseract-ocr-ukr \
    # tesseract-ocr-vie \
    tesseract-ocr-nld \
    tesseract-ocr-nor \
    tesseract-ocr-pol \
    tesseract-ocr-por \
    tesseract-ocr-ron \
    tesseract-ocr-rus \
    tesseract-ocr-slk \
    tesseract-ocr-slv \
    tesseract-ocr-spa \
    # tesseract-ocr-spa_old \
    tesseract-ocr-sqi \
    tesseract-ocr-srp \
    tesseract-ocr-ind \
    tesseract-ocr-isl \
    tesseract-ocr-ita \
    # tesseract-ocr-ita_old \
    # tesseract-ocr-jpn \
    tesseract-ocr-kan \
    tesseract-ocr-kat \
    # tesseract-ocr-kor \
    tesseract-ocr-khm \
    tesseract-ocr-lav \
    tesseract-ocr-lit \
    # tesseract-ocr-mal \
    tesseract-ocr-mkd \
    tesseract-ocr-mya \
    tesseract-ocr-mlt \
    tesseract-ocr-msa \
    tesseract-ocr-est \
    # tesseract-ocr-eus \
    tesseract-ocr-fin \
    tesseract-ocr-fra \
    tesseract-ocr-frk \
    # tesseract-ocr-frm \
    # tesseract-ocr-glg \
    # tesseract-ocr-grc \
    tesseract-ocr-heb \
    tesseract-ocr-hin \
    tesseract-ocr-hrv \
    tesseract-ocr-hye \
    tesseract-ocr-hun \
    # tesseract-ocr-ben \
    tesseract-ocr-bul \
    tesseract-ocr-cat \
    tesseract-ocr-ces \
    tesseract-ocr-nep \
    # tesseract-ocr-chi_sim \
    # tesseract-ocr-chi_tra \
    # tesseract-ocr-chr \
    tesseract-ocr-dan \
    tesseract-ocr-deu \
    tesseract-ocr-ell \
    # tesseract-ocr-enm \
    # tesseract-ocr-epo \
    # tesseract-ocr-equ \
    tesseract-ocr-afr \
    tesseract-ocr-ara \
    tesseract-ocr-aze \
    tesseract-ocr-bel \
    tesseract-ocr-uzb \
    ### pdf convert: libreoffice + a bunch of fonts
    libreoffice fonts-opensymbol hyphen-fr hyphen-de \
    hyphen-en-us hyphen-it hyphen-ru fonts-dejavu fonts-dejavu-core fonts-dejavu-extra \
    fonts-droid-fallback fonts-dustin fonts-f500 fonts-fanwood fonts-freefont-ttf \
    fonts-liberation fonts-lmodern fonts-lyx fonts-sil-gentium fonts-texgyre \
    fonts-tlwg-purisa \
    ###
    && apt-get -qq -y autoremove \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
    && localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

# Runtime environment: select the en_US.UTF-8 locale so the system uses
# unicode for the file system, set the timezone to UTC, and set both the
# OpenMP thread limit and the OpenBLAS thread count to 1.
ENV LANG="en_US.UTF-8" \
    TZ="UTC" \
    OMP_THREAD_LIMIT="1" \
    OPENBLAS_NUM_THREADS="1"

# Create the "app" system group and an "app" user (both with ID 1000); the
# user gets a home directory and /bin/false as its shell.
RUN groupadd --system --gid 1000 app \
    && useradd --create-home --uid 1000 --gid app --shell /bin/false app

# Download the ftm-typepredict model into /models.
# --fail aborts the build on an HTTP error instead of silently saving the
# server's error page as the model file; --location follows redirects.
# NOTE(review): the download is not checksum-verified -- consider pinning a
# sha256 for the model file.
RUN mkdir -p /models \
    && curl --fail --location --silent --show-error \
        --output "/models/model_type_prediction.ftz" \
        "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"

# Upgrade the Python packaging toolchain: pip first, then setuptools and wheel.
RUN pip3 install --upgrade --prefer-binary --no-cache-dir pip \
    && pip3 install --upgrade --prefer-binary --no-cache-dir setuptools wheel