Add specific DL configuration for the full docker image (#1117)
* Provide a DL-enabled configuration for the full grobid image
* add missing copyright and licence models in the configuration
lfoppiano committed Jun 12, 2024
1 parent 516926d commit 6afe157
Showing 3 changed files with 335 additions and 1 deletion.
4 changes: 4 additions & 0 deletions Dockerfile.delft
@@ -47,6 +47,10 @@ RUN rm -rf grobid-home/lib/lin-32
RUN rm -rf grobid-home/lib/win-*
RUN rm -rf grobid-home/lib/mac-64

# Setting DL-powered configuration
RUN rm grobid-home/config/grobid.yaml && \
    mv grobid-home/config/grobid-full.yaml grobid-home/config/grobid.yaml

RUN ./gradlew clean assemble --no-daemon --info --stacktrace

WORKDIR /opt/grobid
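With this change, the image built from Dockerfile.delft ships the DL-enabled configuration as its default grobid.yaml. A minimal sketch of how such an image could be built and started locally (the image tag is illustrative; the ports come from the server section of the configuration below):

# build the full (DeLFT-enabled) image from the repository root; the tag "grobid/grobid-full" is an assumption
docker build -t grobid/grobid-full -f Dockerfile.delft .

# run the service, exposing the application and admin ports configured in grobid.yaml
docker run --rm --init -p 8070:8070 -p 8071:8071 grobid/grobid-full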
330 changes: 330 additions & 0 deletions grobid-home/config/grobid-full.yaml
@@ -0,0 +1,330 @@
# This is the configuration file for the GROBID instance that uses the Deep Learning models.

grobid:
  # where all the Grobid resources are stored (models, lexicon, native libraries, etc.), normally no need to change
  grobidHome: "grobid-home"

  # path relative to the grobid-home path (e.g. tmp for grobid-home/tmp) or absolute path (/tmp)
  temp: "tmp"

  # normally nothing to change here, path relative to the grobid-home path (e.g. grobid-home/lib)
  nativelibrary: "lib"

  pdf:
    pdfalto:
      # path relative to the grobid-home path (e.g. grobid-home/pdfalto), you normally don't want to change this
      path: "pdfalto"
      # security for PDF parsing
      memoryLimitMb: 6096
      timeoutSec: 120

    # security relative to the PDF parsing result
    blocksMax: 200000
    tokensMax: 1000000

  consolidation:
    # define the bibliographical data consolidation service to be used, either "crossref" for the CrossRef REST API or
    # "glutton" for https://github.com/kermitt2/biblio-glutton
    #service: "crossref"
    service: "glutton"
    glutton:
      #url: "https://cloud.science-miner.com/glutton"
      url: "http://localhost:8080"
    crossref:
      mailto:
      # to use the CrossRef web API politely, you should normally indicate an email address here, e.g.
      #mailto: "toto@titi.tutu"
      token:
      # to use the Crossref Metadata Plus service (available by subscription)
      #token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere"

  proxy:
    # proxy to be used when making external calls to the consolidation service
    host:
    port:

  # CORS configuration for the GROBID web API service
  corsAllowedOrigins: "*"
  corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
  corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"

  # the actual implementation for language recognition to be used
  languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"

  # the actual implementation for optional sentence segmentation to be used (PragmaticSegmenter or OpenNLP)
  #sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"
  sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory"

  # maximum concurrency allowed to the GROBID server for processing parallel requests - change it according to your CPU/GPU capacities
  # for a production server running only GROBID, set the value slightly above the available number of threads of the server
  # to get the best performance and security
  concurrency: 10
  # when the pool is full, this is the maximum time (in seconds) a query waiting for the availability of a Grobid engine
  # will try to get one - normally never change it
  poolMaxWait: 1

  delft:
    # DeLFT global parameters
    # DeLFT installation path, used when Deep Learning architectures implement one of the sequence labeling models;
    # embeddings are usually compiled as lmdb under delft/data (this parameter is ignored if only feature-engineered CRF models are used)
    install: "../delft"
    pythonVirtualEnv:

  wapiti:
    # Wapiti global parameters
    # number of threads for training the Wapiti models (0 to use all available processors)
    nbThreads: 0

  models:
    # here we configure how each sequence labeling model is implemented
    # for feature-engineered CRF, use "wapiti"; possible training parameters are window, epsilon and nbMaxIterations
    # for Deep Learning, use "delft" and select the target DL architecture (see the DeLFT library); the training
    # parameters then depend on the selected DL architecture

    - name: "segmentation"
      # at this time, this must always be CRF Wapiti: the input sequence size is too large for a Deep Learning implementation
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # Wapiti training parameters, used at training time only
        epsilon: 0.0000001
        window: 50
        nbMaxIterations: 2000
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          max_sequence_length: 3000
          batch_size: 1
        training:
          # parameters used for training
          max_sequence_length: 3000
          batch_size: 10

- name: "fulltext"
# at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
engine: "wapiti"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.0001
window: 20
nbMaxIterations: 1500

- name: "header"
#engine: "wapiti"
engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.000001
window: 30
nbMaxIterations: 1500
delft:
# deep learning parameters
architecture: "BidLSTM_ChainCRF_FEATURES"
#transformer: "allenai/scibert_scivocab_cased"
useELMo: false
runtime:
# parameters used at runtime/prediction
#max_sequence_length: 510
max_sequence_length: 3000
batch_size: 1
training:
# parameters used for training
#max_sequence_length: 510
#batch_size: 6
max_sequence_length: 3000
batch_size: 9

- name: "reference-segmenter"
#engine: "wapiti"
engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 20
delft:
# deep learning parameters
architecture: "BidLSTM_ChainCRF_FEATURES"
useELMo: false
runtime:
# parameters used at runtime/prediction (for this model, use same max_sequence_length as training)
max_sequence_length: 3000
batch_size: 2
training:
# parameters used for training
max_sequence_length: 3000
batch_size: 10

- name: "name-header"
engine: "wapiti"
#engine: "delft"
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"

- name: "name-citation"
engine: "wapiti"
#engine: "delft"
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"

- name: "date"
engine: "wapiti"
#engine: "delft"
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"

- name: "figure"
engine: "wapiti"
#engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 20
delft:
# deep learning parameters
architecture: "BidLSTM_CRF"

- name: "table"
engine: "wapiti"
#engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 20
delft:
# deep learning parameters
architecture: "BidLSTM_CRF"

- name: "affiliation-address"
#engine: "wapiti"
engine: "delft"
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"

- name: "citation"
#engine: "wapiti"
engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 50
nbMaxIterations: 3000
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
#architecture: "BERT_CRF"
#transformer: "michiyasunaga/LinkBERT-base"
useELMo: false
runtime:
# parameters used at runtime/prediction
max_sequence_length: 500
batch_size: 30
training:
# parameters used for training
max_sequence_length: 500
batch_size: 50

- name: "patent-citation"
engine: "wapiti"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.0001
window: 20
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
#architecture: "BERT_CRF"
runtime:
# parameters used at runtime/prediction
max_sequence_length: 800
batch_size: 20
training:
# parameters used for training
max_sequence_length: 1000
batch_size: 40

- name: "funding-acknowledgement"
#engine: "wapiti"
engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 50
nbMaxIterations: 2000
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
#architecture: "BERT_CRF"
#transformer: "michiyasunaga/LinkBERT-base"
useELMo: false
runtime:
# parameters used at runtime/prediction
max_sequence_length: 800
batch_size: 20
training:
# parameters used for training
max_sequence_length: 500
batch_size: 40

- name: "copyright"
# at this time, we only have a DeLFT implementation,
# use "wapiti" if the deep learning library JNI is not available and model will then be ignored
# engine: "delft"
engine: "wapiti"
delft:
# deep learning parameters
architecture: "gru"
#architecture: "bert"
#transformer: "allenai/scibert_scivocab_cased"

- name: "license"
# at this time, for being active, it must be DeLFT, no other implementation is available
# use "wapiti" if the deep learning library JNI is not available and model will then be ignored
# engine: "delft"
engine: "wapiti"
delft:
# deep learning parameters
architecture: "gru"
#architecture: "bert"
#transformer: "allenai/scibert_scivocab_cased"

  # for **service only**: how to load the models
  # false -> models are loaded only when needed, which avoids keeping unused models in memory (CRF models only) but
  # significantly slows down the service on the first call
  # true -> all the models are loaded into memory at server startup (default); this slows down the server start
  # and unused models take some extra memory (CRF models only), but the server is immediately warm and ready
  modelPreload: true

server:
  type: custom
  applicationConnectors:
    - type: http
      port: 8070
  adminConnectors:
    - type: http
      port: 8071
  registerDefaultExceptionMappers: false
  # change the following to have all HTTP requests logged
  requestLog:
    appenders: []

# these logging settings apply to the Grobid service usage mode
logging:
  level: INFO
  loggers:
    org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
    org.glassfish.jersey.internal: "OFF"
    com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF"
  appenders:
    - type: console
      threshold: WARN
      timeZone: UTC
      # uncomment to have the logs in json format
      #layout:
      #  type: json
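
Once a container built with this configuration is running, the DL-backed models are exercised through the regular GROBID web API. A quick smoke test, assuming the service is reachable on localhost:8070 and using the standard /api/isalive and /api/processFulltextDocument endpoints (the PDF file name is illustrative):

# check that the service is up
curl http://localhost:8070/api/isalive

# process a sample PDF through the DL-backed header/citation/affiliation models
curl -F input=@sample.pdf http://localhost:8070/api/processFulltextDocument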
2 changes: 1 addition & 1 deletion grobid-home/config/resources-registry.json
@@ -9,7 +9,7 @@
"format": "vec",
"lang": "en",
"item": "word",
"url": "http://nlp.stanford.edu/data/wordvecs/glove.840B.300d.zip"
"url": "https://huggingface.co/stanfordnlp/glove/resolve/main/glove.840B.300d.zip"
},
{
"name": "fasttext-crawl",
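This registry entry is where DeLFT resolves the GloVe embeddings used by the BidLSTM architectures; the change only swaps the download URL from nlp.stanford.edu to the Hugging Face copy of the same glove.840B.300d archive. A quick way to confirm the new URL resolves, without involving GROBID (plain HTTP header check, following redirects):

# show only the response headers for the new embeddings URL
curl -sIL https://huggingface.co/stanfordnlp/glove/resolve/main/glove.840B.300d.zip | head -n 20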
