Add specific DL configuration for the full docker image (#1117)
* Provide a DL-enabled configuration for the full grobid image
* add missing copyright and licence models in the configuration
lfoppiano committed Jun 12, 2024
1 parent 516926d commit 6afe157
Showing 3 changed files with 335 additions and 1 deletion.
4 changes: 4 additions & 0 deletions Dockerfile.delft
@@ -47,6 +47,10 @@ RUN rm -rf grobid-home/lib/lin-32
RUN rm -rf grobid-home/lib/win-*
RUN rm -rf grobid-home/lib/mac-64

# Setting DL-powered configuration
RUN rm grobid-home/config/grobid.yaml && \
    mv grobid-home/config/grobid-full.yaml grobid-home/config/grobid.yaml

RUN ./gradlew clean assemble --no-daemon --info --stacktrace

WORKDIR /opt/grobid
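With this change, the image built from Dockerfile.delft ships the DL-enabled configuration as its default grobid.yaml. A minimal sketch of how such an image could be built and started locally (the image tag is illustrative; the ports come from the server section of the configuration below):

# build the full (DeLFT-enabled) image from the repository root; the tag "grobid/grobid-full" is an assumption
docker build -t grobid/grobid-full -f Dockerfile.delft .

# run the service, exposing the application and admin ports configured in grobid.yaml
docker run --rm --init -p 8070:8070 -p 8071:8071 grobid/grobid-full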
330 changes: 330 additions & 0 deletions grobid-home/config/grobid-full.yaml
@@ -0,0 +1,330 @@
# This is the configuration file for the GROBID instance that uses the Deep Learning models.

grobid:
  # where all the Grobid resources are stored (models, lexicon, native libraries, etc.), normally no need to change
  grobidHome: "grobid-home"

  # path relative to the grobid-home path (e.g. tmp for grobid-home/tmp) or absolute path (/tmp)
  temp: "tmp"

  # normally nothing to change here, path relative to the grobid-home path (e.g. grobid-home/lib)
  nativelibrary: "lib"

  pdf:
    pdfalto:
      # path relative to the grobid-home path (e.g. grobid-home/pdfalto), you normally don't want to change this
      path: "pdfalto"
      # security for PDF parsing
      memoryLimitMb: 6096
      timeoutSec: 120

    # security relative to the PDF parsing result
    blocksMax: 200000
    tokensMax: 1000000

  consolidation:
    # define the bibliographical data consolidation service to be used, either "crossref" for the CrossRef REST API or
    # "glutton" for https://github.com/kermitt2/biblio-glutton
    #service: "crossref"
    service: "glutton"
    glutton:
      #url: "https://cloud.science-miner.com/glutton"
      url: "http://localhost:8080"
    crossref:
      mailto:
      # to use the CrossRef web API politely, you should normally indicate an email address here, e.g.
      #mailto: "toto@titi.tutu"
      token:
      # to use the Crossref Metadata Plus service (available by subscription)
      #token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere"

  proxy:
    # proxy to be used when making external calls to the consolidation service
    host:
    port:

  # CORS configuration for the GROBID web API service
  corsAllowedOrigins: "*"
  corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
  corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"

  # the actual implementation for language recognition to be used
  languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"

  # the actual implementation for optional sentence segmentation to be used (PragmaticSegmenter or OpenNLP)
  #sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"
  sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory"

  # maximum concurrency allowed to the GROBID server for processing parallel requests - change it according to your CPU/GPU capacities
  # for a production server running only GROBID, set the value slightly above the available number of threads of the server
  # to get the best performance and security
  concurrency: 10
  # when the pool is full, this is the maximum time (in seconds) a query waiting for the availability of a Grobid engine
  # will try to get one - normally never change it
  poolMaxWait: 1

  delft:
    # DeLFT global parameters
    # DeLFT installation path, used when Deep Learning architectures implement one of the sequence labeling models;
    # embeddings are usually compiled as lmdb under delft/data (this parameter is ignored if only feature-engineered CRF models are used)
    install: "../delft"
    pythonVirtualEnv:

  wapiti:
    # Wapiti global parameters
    # number of threads for training the Wapiti models (0 to use all available processors)
    nbThreads: 0

  models:
    # here we configure how each sequence labeling model is implemented
    # for feature-engineered CRF, use "wapiti"; possible training parameters are window, epsilon and nbMaxIterations
    # for Deep Learning, use "delft" and select the target DL architecture (see the DeLFT library); the training
    # parameters then depend on the selected DL architecture

    - name: "segmentation"
      # at this time, this must always be CRF Wapiti: the input sequence size is too large for a Deep Learning implementation
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # Wapiti training parameters, used at training time only
        epsilon: 0.0000001
        window: 50
        nbMaxIterations: 2000
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          max_sequence_length: 3000
          batch_size: 1
        training:
          # parameters used for training
          max_sequence_length: 3000
          batch_size: 10

- name: "fulltext"
# at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation
engine: "wapiti"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.0001
window: 20
nbMaxIterations: 1500

- name: "header"
#engine: "wapiti"
engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.000001
window: 30
nbMaxIterations: 1500
delft:
# deep learning parameters
architecture: "BidLSTM_ChainCRF_FEATURES"
#transformer: "allenai/scibert_scivocab_cased"
useELMo: false
runtime:
# parameters used at runtime/prediction
#max_sequence_length: 510
max_sequence_length: 3000
batch_size: 1
training:
# parameters used for training
#max_sequence_length: 510
#batch_size: 6
max_sequence_length: 3000
batch_size: 9

- name: "reference-segmenter"
#engine: "wapiti"
engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 20
delft:
# deep learning parameters
architecture: "BidLSTM_ChainCRF_FEATURES"
useELMo: false
runtime:
# parameters used at runtime/prediction (for this model, use same max_sequence_length as training)
max_sequence_length: 3000
batch_size: 2
training:
# parameters used for training
max_sequence_length: 3000
batch_size: 10

- name: "name-header"
engine: "wapiti"
#engine: "delft"
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"

- name: "name-citation"
engine: "wapiti"
#engine: "delft"
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"

- name: "date"
engine: "wapiti"
#engine: "delft"
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"

- name: "figure"
engine: "wapiti"
#engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 20
delft:
# deep learning parameters
architecture: "BidLSTM_CRF"

- name: "table"
engine: "wapiti"
#engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 20
delft:
# deep learning parameters
architecture: "BidLSTM_CRF"

- name: "affiliation-address"
#engine: "wapiti"
engine: "delft"
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"

- name: "citation"
#engine: "wapiti"
engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 50
nbMaxIterations: 3000
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
#architecture: "BERT_CRF"
#transformer: "michiyasunaga/LinkBERT-base"
useELMo: false
runtime:
# parameters used at runtime/prediction
max_sequence_length: 500
batch_size: 30
training:
# parameters used for training
max_sequence_length: 500
batch_size: 50

- name: "patent-citation"
engine: "wapiti"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.0001
window: 20
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
#architecture: "BERT_CRF"
runtime:
# parameters used at runtime/prediction
max_sequence_length: 800
batch_size: 20
training:
# parameters used for training
max_sequence_length: 1000
batch_size: 40

- name: "funding-acknowledgement"
#engine: "wapiti"
engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 50
nbMaxIterations: 2000
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
#architecture: "BERT_CRF"
#transformer: "michiyasunaga/LinkBERT-base"
useELMo: false
runtime:
# parameters used at runtime/prediction
max_sequence_length: 800
batch_size: 20
training:
# parameters used for training
max_sequence_length: 500
batch_size: 40

- name: "copyright"
# at this time, we only have a DeLFT implementation,
# use "wapiti" if the deep learning library JNI is not available and model will then be ignored
# engine: "delft"
engine: "wapiti"
delft:
# deep learning parameters
architecture: "gru"
#architecture: "bert"
#transformer: "allenai/scibert_scivocab_cased"

- name: "license"
# at this time, for being active, it must be DeLFT, no other implementation is available
# use "wapiti" if the deep learning library JNI is not available and model will then be ignored
# engine: "delft"
engine: "wapiti"
delft:
# deep learning parameters
architecture: "gru"
#architecture: "bert"
#transformer: "allenai/scibert_scivocab_cased"

  # for **service only**: how to load the models
  # false -> models are loaded only when needed, which avoids keeping unused models in memory (CRF models only) but
  # significantly slows down the service on the first call
  # true -> all the models are loaded into memory at server startup (default); this slows down the server start
  # and unused models take some extra memory (CRF models only), but the server is immediately warm and ready
  modelPreload: true

server:
  type: custom
  applicationConnectors:
    - type: http
      port: 8070
  adminConnectors:
    - type: http
      port: 8071
  registerDefaultExceptionMappers: false
  # change the following to have all HTTP requests logged
  requestLog:
    appenders: []

# these logging settings apply to the Grobid service usage mode
logging:
  level: INFO
  loggers:
    org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
    org.glassfish.jersey.internal: "OFF"
    com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF"
  appenders:
    - type: console
      threshold: WARN
      timeZone: UTC
      # uncomment to have the logs in json format
      #layout:
      #  type: json
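
Once a container built with this configuration is running, the DL-backed models are exercised through the regular GROBID web API. A quick smoke test, assuming the service is reachable on localhost:8070 and using the standard /api/isalive and /api/processFulltextDocument endpoints (the PDF file name is illustrative):

# check that the service is up
curl http://localhost:8070/api/isalive

# process a sample PDF through the DL-backed header/citation/affiliation models
curl -F input=@sample.pdf http://localhost:8070/api/processFulltextDocument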
2 changes: 1 addition & 1 deletion grobid-home/config/resources-registry.json
@@ -9,7 +9,7 @@
"format": "vec",
"lang": "en",
"item": "word",
"url": "http://nlp.stanford.edu/data/wordvecs/glove.840B.300d.zip"
"url": "https://huggingface.co/stanfordnlp/glove/resolve/main/glove.840B.300d.zip"
},
{
"name": "fasttext-crawl",
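This registry entry is where DeLFT resolves the GloVe embeddings used by the BidLSTM architectures; the change only swaps the download URL from nlp.stanford.edu to the Hugging Face copy of the same glove.840B.300d archive. A quick way to confirm the new URL resolves, without involving GROBID (plain HTTP header check, following redirects):

# show only the response headers for the new embeddings URL
curl -sIL https://huggingface.co/stanfordnlp/glove/resolve/main/glove.840B.300d.zip | head -n 20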
