-
Notifications
You must be signed in to change notification settings - Fork 445
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add specific DL configuration for the full docker image (#1117)
* Provide a DL-enabled configuration for the full grobid image * add missing copyright and licence models in the configuration
- Loading branch information
Showing
3 changed files
with
335 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,330 @@ | ||
# this is the configuration file for the GROBID instance that uses the Deep Learning Models. | ||
|
||
grobid: | ||
# where all the Grobid resources are stored (models, lexicon, native libraries, etc.), normally no need to change | ||
grobidHome: "grobid-home" | ||
|
||
# path relative to the grobid-home path (e.g. tmp for grobid-home/tmp) or absolute path (/tmp) | ||
temp: "tmp" | ||
|
||
# normally nothing to change here, path relative to the grobid-home path (e.g. grobid-home/lib) | ||
nativelibrary: "lib" | ||
|
||
pdf: | ||
pdfalto: | ||
# path relative to the grobid-home path (e.g. grobid-home/pdfalto), you don't want to change this normally | ||
path: "pdfalto" | ||
# security for PDF parsing | ||
memoryLimitMb: 6096 | ||
timeoutSec: 120 | ||
|
||
# security relative to the PDF parsing result | ||
blocksMax: 200000 | ||
tokensMax: 1000000 | ||
|
||
consolidation: | ||
# define the bibliographical data consolidation service to be used, either "crossref" for CrossRef REST API or | ||
# "glutton" for https://github.com/kermitt2/biblio-glutton | ||
#service: "crossref" | ||
service: "glutton" | ||
glutton: | ||
#url: "https://cloud.science-miner.com/glutton" | ||
url: "http://localhost:8080" | ||
crossref: | ||
mailto: | ||
# to use the Crossref web API politely, you should normally indicate an email address here, e.g. | ||
#mailto: "toto@titi.tutu" | ||
token: | ||
# to use Crossref metadata plus service (available by subscription) | ||
#token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere" | ||
|
||
proxy: | ||
# proxy to be used when making external calls to the consolidation service | ||
host: | ||
port: | ||
|
||
# CORS configuration for the GROBID web API service | ||
corsAllowedOrigins: "*" | ||
corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD" | ||
corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin" | ||
|
||
# the actual implementation for language recognition to be used | ||
languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory" | ||
|
||
# the actual implementation for optional sentence segmentation to be used (PragmaticSegmenter or OpenNLP) | ||
#sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory" | ||
sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory" | ||
|
||
# maximum concurrency allowed to GROBID server for processing parallel requests - change it according to your CPU/GPU capacities | ||
# for a production server running only GROBID, set the value slightly above the available number of threads of the server | ||
# to get best performance and security | ||
concurrency: 10 | ||
# when the pool is full, for queries waiting for the availability of a Grobid engine, this is the maximum time to wait | ||
# while trying to get an engine (in seconds) - normally never change it | ||
poolMaxWait: 1 | ||
|
||
delft: | ||
# DeLFT global parameters | ||
# delft installation path if Deep Learning architectures are used to implement one of the sequence labeling models, | ||
# embeddings are usually compiled as lmdb under delft/data (this parameter is ignored if only feature-engineered CRF models are used) | ||
install: "../delft" | ||
pythonVirtualEnv: | ||
|
||
wapiti: | ||
# Wapiti global parameters | ||
# number of threads for training the wapiti models (0 to use all available processors) | ||
nbThreads: 0 | ||
|
||
models: | ||
# we configure here how each sequence labeling model should be implemented | ||
# for feature-engineered CRF, use "wapiti" and possible training parameters are window, epsilon and nbMaxIterations | ||
# for Deep Learning, use "delft" and select the target DL architecture (see DeLFT library), the training | ||
# parameters then depend on the selected DL architecture | ||
|
||
- name: "segmentation" | ||
# at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation | ||
engine: "wapiti" | ||
#engine: "delft" | ||
wapiti: | ||
# wapiti training parameters, they will be used at training time only | ||
epsilon: 0.0000001 | ||
window: 50 | ||
nbMaxIterations: 2000 | ||
delft: | ||
# deep learning parameters | ||
architecture: "BidLSTM_CRF_FEATURES" | ||
useELMo: false | ||
runtime: | ||
# parameters used at runtime/prediction | ||
max_sequence_length: 3000 | ||
batch_size: 1 | ||
training: | ||
# parameters used for training | ||
max_sequence_length: 3000 | ||
batch_size: 10 | ||
|
||
- name: "fulltext" | ||
# at this time, must always be CRF wapiti, the input sequence size is too large for a Deep Learning implementation | ||
engine: "wapiti" | ||
wapiti: | ||
# wapiti training parameters, they will be used at training time only | ||
epsilon: 0.0001 | ||
window: 20 | ||
nbMaxIterations: 1500 | ||
|
||
- name: "header" | ||
#engine: "wapiti" | ||
engine: "delft" | ||
wapiti: | ||
# wapiti training parameters, they will be used at training time only | ||
epsilon: 0.000001 | ||
window: 30 | ||
nbMaxIterations: 1500 | ||
delft: | ||
# deep learning parameters | ||
architecture: "BidLSTM_ChainCRF_FEATURES" | ||
#transformer: "allenai/scibert_scivocab_cased" | ||
useELMo: false | ||
runtime: | ||
# parameters used at runtime/prediction | ||
#max_sequence_length: 510 | ||
max_sequence_length: 3000 | ||
batch_size: 1 | ||
training: | ||
# parameters used for training | ||
#max_sequence_length: 510 | ||
#batch_size: 6 | ||
max_sequence_length: 3000 | ||
batch_size: 9 | ||
|
||
- name: "reference-segmenter" | ||
#engine: "wapiti" | ||
engine: "delft" | ||
wapiti: | ||
# wapiti training parameters, they will be used at training time only | ||
epsilon: 0.00001 | ||
window: 20 | ||
delft: | ||
# deep learning parameters | ||
architecture: "BidLSTM_ChainCRF_FEATURES" | ||
useELMo: false | ||
runtime: | ||
# parameters used at runtime/prediction (for this model, use same max_sequence_length as training) | ||
max_sequence_length: 3000 | ||
batch_size: 2 | ||
training: | ||
# parameters used for training | ||
max_sequence_length: 3000 | ||
batch_size: 10 | ||
|
||
- name: "name-header" | ||
engine: "wapiti" | ||
#engine: "delft" | ||
delft: | ||
# deep learning parameters | ||
architecture: "BidLSTM_CRF_FEATURES" | ||
|
||
- name: "name-citation" | ||
engine: "wapiti" | ||
#engine: "delft" | ||
delft: | ||
# deep learning parameters | ||
architecture: "BidLSTM_CRF_FEATURES" | ||
|
||
- name: "date" | ||
engine: "wapiti" | ||
#engine: "delft" | ||
delft: | ||
# deep learning parameters | ||
architecture: "BidLSTM_CRF_FEATURES" | ||
|
||
- name: "figure" | ||
engine: "wapiti" | ||
#engine: "delft" | ||
wapiti: | ||
# wapiti training parameters, they will be used at training time only | ||
epsilon: 0.00001 | ||
window: 20 | ||
delft: | ||
# deep learning parameters | ||
architecture: "BidLSTM_CRF" | ||
|
||
- name: "table" | ||
engine: "wapiti" | ||
#engine: "delft" | ||
wapiti: | ||
# wapiti training parameters, they will be used at training time only | ||
epsilon: 0.00001 | ||
window: 20 | ||
delft: | ||
# deep learning parameters | ||
architecture: "BidLSTM_CRF" | ||
|
||
- name: "affiliation-address" | ||
#engine: "wapiti" | ||
engine: "delft" | ||
delft: | ||
# deep learning parameters | ||
architecture: "BidLSTM_CRF_FEATURES" | ||
|
||
- name: "citation" | ||
#engine: "wapiti" | ||
engine: "delft" | ||
wapiti: | ||
# wapiti training parameters, they will be used at training time only | ||
epsilon: 0.00001 | ||
window: 50 | ||
nbMaxIterations: 3000 | ||
delft: | ||
# deep learning parameters | ||
architecture: "BidLSTM_CRF_FEATURES" | ||
#architecture: "BERT_CRF" | ||
#transformer: "michiyasunaga/LinkBERT-base" | ||
useELMo: false | ||
runtime: | ||
# parameters used at runtime/prediction | ||
max_sequence_length: 500 | ||
batch_size: 30 | ||
training: | ||
# parameters used for training | ||
max_sequence_length: 500 | ||
batch_size: 50 | ||
|
||
- name: "patent-citation" | ||
engine: "wapiti" | ||
wapiti: | ||
# wapiti training parameters, they will be used at training time only | ||
epsilon: 0.0001 | ||
window: 20 | ||
delft: | ||
# deep learning parameters | ||
architecture: "BidLSTM_CRF_FEATURES" | ||
#architecture: "BERT_CRF" | ||
runtime: | ||
# parameters used at runtime/prediction | ||
max_sequence_length: 800 | ||
batch_size: 20 | ||
training: | ||
# parameters used for training | ||
max_sequence_length: 1000 | ||
batch_size: 40 | ||
|
||
- name: "funding-acknowledgement" | ||
#engine: "wapiti" | ||
engine: "delft" | ||
wapiti: | ||
# wapiti training parameters, they will be used at training time only | ||
epsilon: 0.00001 | ||
window: 50 | ||
nbMaxIterations: 2000 | ||
delft: | ||
# deep learning parameters | ||
architecture: "BidLSTM_CRF_FEATURES" | ||
#architecture: "BERT_CRF" | ||
#transformer: "michiyasunaga/LinkBERT-base" | ||
useELMo: false | ||
runtime: | ||
# parameters used at runtime/prediction | ||
max_sequence_length: 800 | ||
batch_size: 20 | ||
training: | ||
# parameters used for training | ||
max_sequence_length: 500 | ||
batch_size: 40 | ||
|
||
- name: "copyright" | ||
# at this time, we only have a DeLFT implementation; | ||
# use "wapiti" if the deep learning library JNI is not available, and the model will then be ignored | ||
# engine: "delft" | ||
engine: "wapiti" | ||
delft: | ||
# deep learning parameters | ||
architecture: "gru" | ||
#architecture: "bert" | ||
#transformer: "allenai/scibert_scivocab_cased" | ||
|
||
- name: "license" | ||
# at this time, to be active, it must be DeLFT, no other implementation is available; | ||
# use "wapiti" if the deep learning library JNI is not available, and the model will then be ignored | ||
# engine: "delft" | ||
engine: "wapiti" | ||
delft: | ||
# deep learning parameters | ||
architecture: "gru" | ||
#architecture: "bert" | ||
#transformer: "allenai/scibert_scivocab_cased" | ||
|
||
# for **service only**: how to load the models, | ||
# false -> models are loaded when needed, which avoids keeping unused models in memory (only in case of CRF) but | ||
# significantly slows down the service at first call | ||
# true -> all the models are loaded into memory at server startup (default), which slows down the start of the service, | ||
# and unused models will take some more memory (only in case of CRF), but the server is immediately warm and ready | ||
modelPreload: true | ||
|
||
server: | ||
type: custom | ||
applicationConnectors: | ||
- type: http | ||
port: 8070 | ||
adminConnectors: | ||
- type: http | ||
port: 8071 | ||
registerDefaultExceptionMappers: false | ||
# change the following to have all HTTP requests logged | ||
requestLog: | ||
appenders: [] | ||
|
||
# these logging settings apply to the Grobid service usage mode | ||
logging: | ||
level: INFO | ||
loggers: | ||
org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF" | ||
org.glassfish.jersey.internal: "OFF" | ||
com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF" | ||
appenders: | ||
- type: console | ||
threshold: WARN | ||
timeZone: UTC | ||
# uncomment to have the logs in json format | ||
#layout: | ||
# type: json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters