diff --git a/.github/workflows/ci-build-manual.yml b/.github/workflows/ci-build-manual.yml new file mode 100644 index 0000000..0c7f53a --- /dev/null +++ b/.github/workflows/ci-build-manual.yml @@ -0,0 +1,42 @@ +name: Build and push a development version on docker + +on: + workflow_dispatch: + + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up JDK 17 + uses: actions/setup-java@v4 + with: + java-version: '17.0.10+7' + distribution: 'temurin' + cache: 'gradle' + - name: Build with Gradle + run: ./gradlew build -x test + + docker-build: + needs: [ build ] + runs-on: ubuntu-latest + + steps: + - name: Create more disk space + run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY" + - uses: actions/checkout@v4 + - name: Build and push + id: docker_build + uses: mr-smithers-excellent/docker-build-push@v6 + with: + dockerfile: Dockerfile.datastet + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + image: lfoppiano/datastet + registry: docker.io + pushImage: true + tags: latest-develop + - name: Image digest + run: echo ${{ steps.docker_build.outputs.digest }} \ No newline at end of file diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml new file mode 100644 index 0000000..cc0f192 --- /dev/null +++ b/.github/workflows/ci-build.yml @@ -0,0 +1,59 @@ +name: Build unstable + +on: [push] + +concurrency: + group: gradle +# cancel-in-progress: true + + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up JDK 17 + uses: actions/setup-java@v4 + with: + java-version: '17.0.10+7' + distribution: 'temurin' + cache: 'gradle' + - name: Build with Gradle + run: ./gradlew build -x test + + - name: Test with Gradle Jacoco and Coveralls + run: ./gradlew test jacocoTestReport coveralls --no-daemon + + - name: Coveralls GitHub Action + uses: 
coverallsapp/github-action@v2 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + format: jacoco + + docker-build: + needs: [ build ] + runs-on: ubuntu-latest + + steps: + - name: Create more disk space + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /opt/hostedtoolcache + - uses: actions/checkout@v4 + - name: Build and push + id: docker_build + uses: mr-smithers-excellent/docker-build-push@v6 + with: + dockerfile: Dockerfile.datastet + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + image: lfoppiano/datastet + registry: docker.io + pushImage: ${{ github.event_name != 'pull_request' }} + tags: latest-develop + - name: Image digest + run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/ci-integration-manual.yml b/.github/workflows/ci-integration-manual.yml new file mode 100644 index 0000000..c507534 --- /dev/null +++ b/.github/workflows/ci-integration-manual.yml @@ -0,0 +1,32 @@ +name: Run integration tests manually + +on: +# push: +# branches: +# - master + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout grobid home + uses: actions/checkout@v4 + with: + repository: kermitt2/grobid + path: ./grobid + - name: Checkout Datastet + uses: actions/checkout@v4 + with: + path: ./grobid/datastet + - name: Set up JDK 17 + uses: actions/setup-java@v4 + with: + java-version: '17.0.10+7' + distribution: 'temurin' + cache: 'gradle' + - name: Build and run integration tests + working-directory: ./grobid/datastet + run: ./gradlew copyModels integration --no-daemon + diff --git a/.github/workflows/ci-release.yml b/.github/workflows/ci-release.yml new file mode 100644 index 0000000..19e59d1 --- /dev/null +++ b/.github/workflows/ci-release.yml @@ -0,0 +1,74 @@ +name: Build release + +on: + workflow_dispatch: + push: + tags: + - 'v*' + +concurrency: + group: docker + 
cancel-in-progress: true + + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up JDK 17 + uses: actions/setup-java@v4 + with: + java-version: '17.0.10+7' + distribution: 'temurin' + cache: 'gradle' + - name: Build with Gradle + run: ./gradlew build -x test + + - name: Test with Gradle Jacoco and Coveralls + run: ./gradlew test jacocoTestReport coveralls --no-daemon + + - name: Coveralls GitHub Action + uses: coverallsapp/github-action@v2 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + format: jacoco + + + docker-build: + needs: [build] + runs-on: ubuntu-latest + + steps: + - name: Create more disk space + run: sudo rm -rf /usr/share/dotnet && sudo rm -rf /opt/ghc && sudo rm -rf "/usr/local/share/boost" && sudo rm -rf "$AGENT_TOOLSDIRECTORY" + - name: Set tags + id: set_tags + run: | + DOCKER_IMAGE=lfoppiano/datastet + VERSION="" + if [[ $GITHUB_REF == refs/tags/v* ]]; then + VERSION=${GITHUB_REF#refs/tags/v} + fi + if [[ $VERSION =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then + TAGS="${VERSION}" + else + TAGS="latest" + fi + echo "TAGS=${TAGS}" + echo "tags=${TAGS}" >> "$GITHUB_OUTPUT" # step output read below as steps.set_tags.outputs.tags (::set-output is deprecated) + - uses: actions/checkout@v4 + - name: Build and push + id: docker_build + uses: mr-smithers-excellent/docker-build-push@v6 + with: + dockerfile: Dockerfile.local + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + image: lfoppiano/datastet + registry: docker.io + pushImage: ${{ github.event_name != 'pull_request' }} + tags: ${{ steps.set_tags.outputs.tags }} + - name: Image digest + run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/Dockerfile.datastet b/Dockerfile.datastet index 7ec2ada..e28f647 100644 --- a/Dockerfile.datastet +++ b/Dockerfile.datastet @@ -6,138 +6,72 @@ FROM openjdk:17-jdk-slim as builder USER root -ARG GROBID_VERSION - RUN apt-get update && \ - apt-get -y --no-install-recommends install unzip - -WORKDIR /opt/grobid-source - -# gradle -COPY gradle/ ./gradle/ 
-COPY gradlew ./ -COPY gradle.properties ./ -COPY build.gradle ./ -COPY settings.gradle ./ - -# source -COPY grobid-home/ ./grobid-home/ -COPY grobid-core/ ./grobid-core/ -#COPY grobid-service/ ./grobid-service/ -COPY grobid-trainer/ ./grobid-trainer/ -COPY datastet/ ./datastet/ - -# cleaning unused native libraries before packaging -RUN rm -rf grobid-home/pdf2xml -RUN rm -rf grobid-home/pdfalto/lin-32 -RUN rm -rf grobid-home/pdfalto/mac-64 -RUN rm -rf grobid-home/pdfalto/win-* -RUN rm -rf grobid-home/lib/lin-32 -RUN rm -rf grobid-home/lib/win-* -RUN rm -rf grobid-home/lib/mac-64 - -RUN ./gradlew clean assemble install --no-daemon --info --stacktrace - -WORKDIR ./datastet/ -RUN ./gradlew clean install --no-daemon --info --stacktrace + apt-get -y --no-install-recommends install apt-utils libxml2 git unzip wget + +WORKDIR /opt/grobid +RUN mkdir -p datastet-source grobid-home/models +COPY src datastet-source/src +COPY settings.gradle datastet-source/ +COPY resources/config/config-docker.yml datastet-source/resources/config/config.yml +COPY resources/models datastet-source/resources/models +COPY resources/lexicon datastet-source/resources/lexicon +COPY build.gradle datastet-source/ +COPY gradle.properties datastet-source/ +COPY gradle datastet-source/gradle/ +COPY gradlew datastet-source/ +#COPY .git datastet-source/.git +#COPY localLibs datastet-source/localLibs + +# Preparing models +WORKDIR /opt/grobid/datastet-source +RUN rm -rf /opt/grobid/grobid-home/models/* +RUN ./gradlew clean assemble -x shadowJar --no-daemon --stacktrace --info +RUN ./gradlew installModels --no-daemon --info --stacktrace \ + && rm -f /opt/grobid/grobid-home/models/*.zip + +# Preparing distribution WORKDIR /opt/grobid -#RUN unzip -o /opt/grobid-source/grobid-service/build/distributions/grobid-service-*.zip && \ -# mv grobid-service* grobid-service -RUN unzip -o /opt/grobid-source/grobid-home/build/distributions/grobid-home-*.zip && \ - chmod -R 755 /opt/grobid/grobid-home/pdfalto -RUN rm -rf 
grobid-source +RUN unzip -o /opt/grobid/datastet-source/build/distributions/datastet-*.zip -d datastet_distribution \ + && mv datastet_distribution/datastet-* datastet \ + && rm -rf /opt/grobid/datastet-source/build + +# install Pub2TEI +WORKDIR /opt/ +RUN wget https://github.com/kermitt2/Pub2TEI/archive/refs/heads/master.zip && \ + unzip master.zip && \ + mv Pub2TEI-master Pub2TEI && \ + rm master.zip + # ------------------- # build runtime image # ------------------- -# use NVIDIA Container Toolkit to automatically recognize possible GPU drivers on the host machine -FROM tensorflow/tensorflow:2.7.0-gpu -CMD nvidia-smi +FROM lfoppiano/grobid:0.8.1-full as runtime # setting locale is likely useless but to be sure ENV LANG C.UTF-8 -# update NVIDIA Cuda key (following a key rotation in April 2022) -RUN apt-get install -y wget -RUN apt-key del 7fa2af80 -RUN rm /etc/apt/sources.list.d/cuda.list -RUN rm /etc/apt/sources.list.d/nvidia-ml.list -RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb -RUN dpkg -i cuda-keyring_1.0-1_all.deb - -# install JRE, python and other dependencies -RUN apt-get update && \ - apt-get -y --no-install-recommends install apt-utils build-essential gcc libxml2 libfontconfig unzip curl \ - openjdk-17-jre-headless openjdk-17-jdk ca-certificates-java \ - musl gfortran \ - python3 python3-pip python3-setuptools python3-dev \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -WORKDIR /opt/grobid - -COPY --from=builder /opt/grobid . 
- -RUN python3 -m pip install pip --upgrade - -# install DeLFT via pypi -RUN pip3 install requests delft==0.3.3 -# link the data directory to /data -# the current working directory will most likely be /opt/grobid -RUN mkdir -p /data \ - && ln -s /data /opt/grobid/data \ - && ln -s /data ./data - -# disable python warnings (and fix logging) -ENV PYTHONWARNINGS="ignore" - WORKDIR /opt/grobid +RUN rm -rf /opt/grobid/grobid-home/models/*-with_ELMo \ + && rm -rf /opt/grobid/grobid-service \ + && ln -sf datastet/resources/ resources -ENV JAVA_OPTS=-Xmx4g - -# install jep (and temporarily the matching JDK) -ENV JDK_URL=https://download.java.net/java/GA/jdk17.0.2/dfd4a8d0985749f896bed50d7138ee7f/8/GPL/openjdk-17.0.2_linux-x64_bin.tar.gz -RUN curl --fail --show-error --location -q ${JDK_URL} -o /tmp/openjdk.tar.gz -RUN mkdir /tmp/jdk-17 -RUN tar xvfz /tmp/openjdk.tar.gz --directory /tmp/jdk-17 --strip-components 1 --no-same-owner -RUN /tmp/jdk-17/bin/javac -version -RUN JAVA_HOME=/tmp/jdk-17 pip3 install jep==4.0.2 -RUN rm -f /tmp/openjdk.tar.gz -RUN rm -rf /tmp/jdk-17 -ENV LD_LIBRARY_PATH=/usr/local/lib/python3.8/dist-packages/jep:grobid-home/lib/lin-64:grobid-home/lib/lin-64/jep:${LD_LIBRARY_PATH} -# remove libjep.so because we are providing our own version in the virtual env above -RUN rm /opt/grobid/grobid-home/lib/lin-64/jep/libjep.so - -# preload embeddings, for GROBID all the RNN models use glove-840B (default for the script), ELMo is currently not loaded -# download GROBID fine-tuned models based on SciBERT if selected - -COPY --from=builder /opt/grobid-source/grobid-home/scripts/preload_embeddings.py . 
-# embeddings will be loaded when building and running tests - -RUN ln -s /opt/grobid /opt/delft +# the last command above is just a hack to make the lexicon loader working -COPY --from=builder /opt/grobid-source/datastet /opt/grobid/datastet -COPY --from=builder /root/.m2/repository/org /opt/grobid/datastet/lib/org +COPY --from=builder /opt/grobid/grobid-home/models ./grobid-home/models +COPY --from=builder /opt/grobid/datastet ./datastet/ +COPY --from=builder /opt/grobid/datastet-source/resources/config/config.yml ./datastet/resources/config/ +COPY --from=builder /opt/grobid/datastet-source/resources/lexicon/ ./datastet/resources/lexicon/ -# install Pub2TEI -WORKDIR /opt/ -RUN wget https://github.com/kermitt2/Pub2TEI/archive/refs/heads/master.zip -RUN unzip master.zip -RUN mv Pub2TEI-master Pub2TEI - -WORKDIR /opt/grobid/datastet - -RUN mkdir /opt/grobid/delft -RUN mkdir /opt/grobid/delft/delft -COPY --from=builder /opt/grobid-source/grobid-home/config/resources-registry.json /opt/grobid/delft/delft/resources-registry.json +COPY --from=builder /opt/grobid/datastet /opt/grobid/datastet +COPY --from=builder /opt/Pub2TEI /opt/Pub2TEI -WORKDIR /opt/grobid/datastet +VOLUME ["/opt/grobid/grobid-home/tmp"] -# trigger gradle wrapper install -RUN ./gradlew --version -RUN ./gradlew installModels && rm -rf resources/models && rm ../grobid-home/models/dataseer*.zip && rm ../grobid-home/models/context_*.zip +#WORKDIR /opt/grobid # install ELMo #RUN wget https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json @@ -147,16 +81,20 @@ RUN ./gradlew installModels && rm -rf resources/models && rm ../grobid-home/mode #RUN mv elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5 /opt/elmo/ # this will build and load embeddings on the image forever (only if required by the config) :) -WORKDIR /opt/grobid/datastet -#RUN ./gradlew clean build test -RUN ./gradlew clean assemble --no-daemon --stacktrace 
--info -x test +# LF: AFAIK this is not needed at the moment as all the models are running with bert, but might +# be a solution if we want to support the GRU version +# RUN python3 preload_embeddings.py --registry ./resources-registry.json --embedding word2vec + +ARG GROBID_VERSION +ENV GROBID_VERSION=${GROBID_VERSION:-latest} +ENV DATASTET_OPTS "-Djava.library.path=/opt/grobid/grobid-home/lib/lin-64:/usr/local/lib/python3.8/dist-packages/jep --add-opens java.base/java.lang=ALL-UNNAMED" + -#CMD ["./gradlew", "run"] -CMD ["sh", "-c", "java --add-opens java.base/java.lang=ALL-UNNAMED -jar build/libs/datastet-0.8.0-onejar.jar server resources/config/config.yml"] +CMD ["./datastet/bin/datastet", "server", "datastet/resources/config/config.yml"] LABEL \ authors="The contributors" \ org.label-schema.name="datastet" \ org.label-schema.description="Image with DataStet service" \ - org.label-schema.url="https://github.com/kermitt2/datastet" \ + org.label-schema.url="https://github.com/DataSeer/datastet" \ org.label-schema.version=${GROBID_VERSION} \ No newline at end of file diff --git a/build.gradle b/build.gradle index 15232c4..b1d6e18 100644 --- a/build.gradle +++ b/build.gradle @@ -7,19 +7,18 @@ buildscript { dependencies { classpath 'gradle.plugin.org.kt3k.gradle.plugin:coveralls-gradle-plugin:2.12.0' classpath "gradle.plugin.com.github.jengelman.gradle.plugins:shadow:7.0.0" + classpath group: 'org.yaml', name: 'snakeyaml', version: '1.19' } } plugins { id 'com.github.johnrengelman.shadow' version '7.0.0' - id "de.undercouch.download" version "4.1.1" + id "de.undercouch.download" version "5.6.0" + id "jacoco" + id 'distribution' + id 'application' } -apply plugin: 'jacoco' - -jacoco { - toolVersion = '0.8.8' -} apply plugin: 'java-library' apply plugin: 'base' @@ -30,8 +29,8 @@ version = '0.8.0' description = """datastet""" -sourceCompatibility = 1.11 -targetCompatibility = 1.11 +sourceCompatibility = 1.17 +targetCompatibility = 1.17 import 
org.apache.tools.ant.taskdefs.condition.Os @@ -67,12 +66,12 @@ dependencies { //Apache commons implementation group: 'commons-pool', name: 'commons-pool', version: '1.6' - implementation group: 'commons-io', name: 'commons-io', version: '2.5' + implementation group: 'commons-io', name: 'commons-io', version: '2.9.0' //implementation group: 'commons-logging', name: 'commons-logging', version: '1.2' - implementation group: 'org.apache.httpcomponents', name: 'httpclient', version: '4.5.3' + implementation group: 'org.apache.httpcomponents', name: 'httpclient', version: '4.5.14' implementation group: 'org.apache.httpcomponents', name: 'httpmime', version: '4.5.3' implementation group: 'org.apache.commons', name: 'commons-lang3', version: '3.6' - implementation group: 'org.apache.commons', name: 'commons-collections4', version: '4.1' + implementation group: 'org.apache.commons', name: 'commons-collections4', version: '4.4' implementation group: 'org.apache.commons', name: 'commons-csv', version: '1.5' implementation group: 'com.google.guava', name: 'guava', version: '28.2-jre' @@ -119,19 +118,21 @@ dependencies { implementation 'org.apache.opennlp:opennlp-tools:1.9.1' //Grobid - implementation group: 'org.grobid', name: 'grobid-core', version: '0.8.0' - implementation group: 'org.grobid', name: 'grobid-trainer', version: '0.8.0' + implementation group: 'org.grobid', name: 'grobid-core', version: '0.8.1' + implementation group: 'org.grobid', name: 'grobid-trainer', version: '0.8.1' //Tests testImplementation group: 'junit', name: 'junit', version: '4.12' testImplementation group: 'org.hamcrest', name: 'hamcrest-all', version: '1.3' + testImplementation 'org.powermock:powermock-module-junit4:2.0.9' + testImplementation 'org.powermock:powermock-api-easymock:2.0.9' } -configurations.all { +configurations.all { resolutionStrategy { force 'xml-apis:xml-apis:1.4.01' } - + exclude group: 'org.slf4j', module: "slf4j-log4j12" exclude group: 'org.slf4j', module: "slf4j-jdk14" 
exclude group: 'log4j', module: "log4j" @@ -143,14 +144,14 @@ configurations.all { def libraries = "" if (Os.isFamily(Os.FAMILY_MAC)) { if (Os.OS_ARCH.equals("aarch64")) { - libraries = "${file("../grobid-home/lib/mac_arm-64").absolutePath}" + libraries = "${file("../grobid-home/lib/mac_arm-64").absolutePath}" } else { libraries = "${file("../grobid-home/lib/mac-64").absolutePath}" } } else if (Os.isFamily(Os.FAMILY_UNIX)) { libraries = "${file("../grobid-home/lib/lin-64/jep").absolutePath}:" + - "${file("../grobid-home/lib/lin-64").absolutePath}:" -} else { + "${file("../grobid-home/lib/lin-64").absolutePath}:" +} else { throw new RuntimeException("Unsupported platform!") } @@ -161,10 +162,10 @@ test { exclude '**/**IntegrationTest**' if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) { - jvmArgs "--add-opens", "java.base/java.util.stream=ALL-UNNAMED", - "--add-opens", "java.base/java.io=ALL-UNNAMED" + jvmArgs "--add-opens", "java.base/java.util.stream=ALL-UNNAMED", + "--add-opens", "java.base/java.io=ALL-UNNAMED" } - systemProperty "java.library.path","${System.getProperty('java.library.path')}:" + libraries + systemProperty "java.library.path", "${System.getProperty('java.library.path')}:" + libraries } // return the default value if the property has not been specified in command line @@ -172,6 +173,17 @@ ext.getArg = { propName, defaultVal -> return project.hasProperty(propName) ? 
project.getProperty(propName) : defaultVal; } +task integration(type: Test) { + include '**' + maxHeapSize = "1024m" + + if (JavaVersion.current().compareTo(JavaVersion.VERSION_1_8) > 0) { + jvmArgs "--add-opens", "java.base/java.util.stream=ALL-UNNAMED", + "--add-opens", "java.base/java.io=ALL-UNNAMED" + } + systemProperty "java.library.path", "${System.getProperty('java.library.path')}:" + libraries +} + import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar apply plugin: 'java' @@ -190,116 +202,78 @@ publishing { task install(dependsOn: publishToMavenLocal) -task mainJar(type: ShadowJar) { - zip64 true - from sourceSets.main.output - - configurations = [project.configurations.runtimeClasspath] - - from { - project.configurations.implementation.collect { - it.isDirectory() ? [] : localLibs.contains(it.getName()) ? zipTree(it) : [] - } - } -} +//task mainJar(type: ShadowJar) { +// zip64 true +// from sourceSets.main.output +// +// configurations = [project.configurations.runtimeClasspath] +// +// from { +// project.configurations.implementation.collect { +// it.isDirectory() ? [] : localLibs.contains(it.getName()) ? zipTree(it) : [] +// } +// } +//} shadowJar { - classifier = 'onejar' + archiveClassifier = 'onejar' mergeServiceFiles() zip64 true manifest { attributes 'Main-Class': 'org.grobid.core.main.batch.DatastetMain' } - + from sourceSets.main.output configurations = [project.configurations.runtimeClasspath] - - from { - project.configurations.implementation.collect { - it.isDirectory() ? [] : localLibs.contains(it.getName()) ? 
zipTree(it) : [] - } - } } jar { - dependsOn mainJar +// dependsOn mainJar enabled true } -tasks.withType(Tar){ - duplicatesStrategy = DuplicatesStrategy.EXCLUDE -} - -tasks.withType(Zip){ - duplicatesStrategy = DuplicatesStrategy.EXCLUDE -} +distZip.enabled = true +distTar.enabled = false +shadowDistZip.enabled = false +shadowDistTar.enabled = false artifacts { - archives jar archives shadowJar } -task installModels(type: Copy) { - from "${rootDir}/resources/models/" - include "**" - into "${rootDir}/../grobid-home/models/" - - doLast { - download { - src "https://grobid.s3.amazonaws.com/dataseer-binary_bert-0.3.1.zip" - dest "${rootDir}/../grobid-home/models/" - overwrite false - } - ant.unzip(src: "${rootDir}/../grobid-home/models/dataseer-binary_bert-0.3.1.zip", dest: "${rootDir}/../grobid-home/models/") - - download { - src "https://grobid.s3.amazonaws.com/dataseer-first_bert-0.3.1.zip" - dest "${rootDir}/../grobid-home/models/" - overwrite false - } - ant.unzip(src: "${rootDir}/../grobid-home/models/dataseer-first_bert-0.3.1.zip", dest: "${rootDir}/../grobid-home/models/") - - download { - src "https://grobid.s3.amazonaws.com/dataseer-reuse_bert-0.3.1.zip" - dest "${rootDir}/../grobid-home/models/" - overwrite false - } - ant.unzip(src: "${rootDir}/../grobid-home/models/dataseer-reuse_bert-0.3.1.zip", dest: "${rootDir}/../grobid-home/models/") +def conf = rootProject.file("resources/config/config.yml").withInputStream { new org.yaml.snakeyaml.Yaml().load(it) } // resolved against the project dir, not the JVM working dir; stream is closed after parsing +def grobidHome = conf.grobidHome.replace("\$", "").replace('{', "").replace("GROBID_HOME:- ", "").replace("}", "") +if (grobidHome.startsWith("../")) { + grobidHome = "${rootProject.rootDir}/${grobidHome}" +} - download { - src "https://grobid.s3.amazonaws.com/datasets-BERT_CRF-0.3.2.zip" - dest "${rootDir}/../grobid-home/models/" - overwrite false - } - ant.unzip(src: "${rootDir}/../grobid-home/models/datasets-BERT_CRF-0.3.2.zip", dest: "${rootDir}/../grobid-home/models/") +def models = [ + 
'https://grobid.s3.amazonaws.com/dataseer-binary_bert-0.3.1.zip', + 'https://grobid.s3.amazonaws.com/dataseer-first_bert-0.3.1.zip', + 'https://grobid.s3.amazonaws.com/dataseer-reuse_bert-0.3.1.zip', + 'https://grobid.s3.amazonaws.com/datasets-BERT_CRF-0.3.2.zip', + 'https://grobid.s3.amazonaws.com/context_bert-0.3.2.zip', + 'https://grobid.s3.amazonaws.com/context_bert_used-0.3.2.zip', + 'https://grobid.s3.amazonaws.com/context_bert_creation-0.3.2.zip', + 'https://grobid.s3.amazonaws.com/context_bert_shared-0.3.2.zip' +] + +def installModels = tasks.register("installModels") + +models.eachWithIndex { model, index -> + def downloadTask = tasks.register("downloadModel_$index", Download) { + src(model) + dest "${grobidHome}/models/" + onlyIfNewer true + // overwrite true + } - download { - src "https://grobid.s3.amazonaws.com/context_bert-0.3.2.zip" - dest "${rootDir}/../grobid-home/models/" - overwrite false - } - ant.unzip(src: "${rootDir}/../grobid-home/models/context_bert-0.3.2.zip", dest: "${rootDir}/../grobid-home/models/") - - download { - src "https://grobid.s3.amazonaws.com/context_bert_used-0.3.2.zip" - dest "${rootDir}/../grobid-home/models/" - overwrite false - } - ant.unzip(src: "${rootDir}/../grobid-home/models/context_bert_used-0.3.2.zip", dest: "${rootDir}/../grobid-home/models/") - - download { - src "https://grobid.s3.amazonaws.com/context_bert_creation-0.3.2.zip" - dest "${rootDir}/../grobid-home/models/" - overwrite false - } - ant.unzip(src: "${rootDir}/../grobid-home/models/context_bert_creation-0.3.2.zip", dest: "${rootDir}/../grobid-home/models/") - - download { - src "https://grobid.s3.amazonaws.com/context_bert_shared-0.3.2.zip" - dest "${rootDir}/../grobid-home/models/" - overwrite false - } - ant.unzip(src: "${rootDir}/../grobid-home/models/context_bert_shared-0.3.2.zip", dest: "${rootDir}/../grobid-home/models/") + def unzipTask = tasks.register("unzipModel_$index", Copy) { + dependsOn downloadTask + from 
zipTree(downloadTask.get().outputs.files.first()) + into "${grobidHome}/models/" } + + installModels.get().dependsOn(unzipTask) } //tasks.withType(JavaCompile) { @@ -323,7 +297,7 @@ task(train_dataseer, dependsOn: 'classes', type: JavaExec, group: 'training') { } else { jvmArgs '-Xmx3072m' } - systemProperty "java.library.path","${System.getProperty('java.library.path')}:" + libraries + systemProperty "java.library.path", "${System.getProperty('java.library.path')}:" + libraries // jvmArgs '-Xms2g', '-Xmx8g' } @@ -345,7 +319,7 @@ task(eval_dataseer, dependsOn: 'classes', type: JavaExec, group: 'evaluation') { } else { jvmArgs '-Xmx3072m' } - systemProperty "java.library.path","${System.getProperty('java.library.path')}:" + libraries + systemProperty "java.library.path", "${System.getProperty('java.library.path')}:" + libraries // jvmArgs '-Xms2g', '-Xmx8g' } @@ -360,7 +334,7 @@ task(eval_dataseer_split, dependsOn: 'classes', type: JavaExec, group: 'evaluati } else { jvmArgs '-Xms2g', '-Xmx8g' } - systemProperty "java.library.path","${System.getProperty('java.library.path')}:" + libraries + systemProperty "java.library.path", "${System.getProperty('java.library.path')}:" + libraries } // Run like this: ./gradlew eval_dataseer_nfold -PgH=/path/grobid/home -Pt=10 @@ -373,7 +347,7 @@ task(eval_dataseer_nfold, dependsOn: 'classes', type: JavaExec, group: 'evaluati } else { jvmArgs '-Xms2g', '-Xmx8g' } - systemProperty "java.library.path","${System.getProperty('java.library.path')}:" + libraries + systemProperty "java.library.path", "${System.getProperty('java.library.path')}:" + libraries } // Run like this: ./gradlew annotated_corpus_generator_csv -Pfull=/path/input/fulltext -Ppdf=/path/input/pdf -Pcsv=/path/csv -Pxml=/output/directory @@ -386,7 +360,7 @@ task(annotated_corpus_generator_csv, dependsOn: 'classes', type: JavaExec, group } else { jvmArgs '-Xms2g', '-Xmx8g' } - systemProperty "java.library.path","${System.getProperty('java.library.path')}:" + libraries + 
systemProperty "java.library.path", "${System.getProperty('java.library.path')}:" + libraries } ////////// @@ -406,3 +380,15 @@ application { args = ['server', 'resources/config/config.yml'] } } + +jacocoTestReport { + reports { + xml.required = true // coveralls plugin depends on xml format report + html.required = true + } + dependsOn test // tests are required to run before generating the report +} + +coveralls { + jacocoReportPath 'build/reports/jacoco/test/jacocoTestReport.xml' +} diff --git a/gradle.properties b/gradle.properties new file mode 100644 index 0000000..dd48123 --- /dev/null +++ b/gradle.properties @@ -0,0 +1,3 @@ +org.gradle.caching=false +org.gradle.daemon=false +org.gradle.jvmargs= --add-exports=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-exports=jdk.unsupported/sun.misc=ALL-UNNAMED \ No newline at end of file diff --git a/resources/config/config.yml b/resources/config/config.yml index 917e307..cfb4524 100644 --- a/resources/config/config.yml +++ b/resources/config/config.yml @@ -19,7 +19,7 @@ entityFishingPort: 443 #entityFishingPort: 8090 # if true we use binary classifiers for the contexts, otherwise use a single multi-label classifier -# binary classifiers perform better, but havier to use +# binary classifiers perform better, but heavier to use useBinaryContextClassifiers: false # sequence labeling model (identify data-related sections) @@ -153,7 +153,7 @@ logging: com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF" appenders: - type: console - threshold: WARN + threshold: INFO timeZone: UTC # uncomment to have the logs in json format #layout: diff --git a/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java b/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java index 8fc06a8..b089324 100644 --- a/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java +++ 
b/src/main/java/org/grobid/core/analyzers/DatastetAnalyzer.java @@ -88,6 +88,11 @@ public List tokenizeWithLayoutToken(String text) { return result; } + @Override + public List retokenizeFromLayoutToken(List tokens) { + throw new UnsupportedOperationException("Method retokenizeFromLayoutToken not yet implemented"); + } + public List retokenize(List chunks) { List result = new ArrayList<>(); for (String chunk : chunks) { diff --git a/src/test/java/org/grobid/core/engines/DataseerClassifierTest.java b/src/test/java/org/grobid/core/engines/DataseerClassifierTest.java index 115b8d6..8940913 100644 --- a/src/test/java/org/grobid/core/engines/DataseerClassifierTest.java +++ b/src/test/java/org/grobid/core/engines/DataseerClassifierTest.java @@ -1,49 +1,40 @@ package org.grobid.core.engines; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import org.apache.commons.io.IOUtils; -import org.grobid.core.document.Document; -import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.utilities.GrobidProperties; -import org.grobid.core.utilities.DataseerConfiguration; import org.grobid.core.main.GrobidHomeFinder; -import org.grobid.core.utilities.GrobidConfig.ModelParameters; import org.grobid.core.main.LibraryLoader; +import org.grobid.core.utilities.DatastetConfiguration; +import org.grobid.core.utilities.GrobidConfig.ModelParameters; +import org.grobid.core.utilities.GrobidProperties; import org.junit.Before; import org.junit.BeforeClass; -import org.junit.Test; import org.junit.Ignore; +import org.junit.Test; import java.io.File; import java.nio.charset.StandardCharsets; -import java.util.List; -import java.util.Arrays; import java.util.ArrayList; - -import org.apache.commons.lang3.tuple.Pair; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; - -import static org.hamcrest.MatcherAssert.assertThat; -import static 
org.hamcrest.Matchers.hasSize; -import static org.junit.Assert.assertNotNull; +import java.util.Arrays; +import java.util.List; /** * @author Patrice */ @Ignore public class DataseerClassifierTest { - private static DataseerConfiguration configuration; + private static DatastetConfiguration configuration; @BeforeClass public static void setUpClass() throws Exception { - DataseerConfiguration dataseerConfiguration = null; + DatastetConfiguration dataseerConfiguration = null; try { ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); File yamlFile = new File("resources/config/dataseer-ml.yml").getAbsoluteFile(); yamlFile = new File(yamlFile.getAbsolutePath()); - dataseerConfiguration = mapper.readValue(yamlFile, DataseerConfiguration.class); + dataseerConfiguration = mapper.readValue(yamlFile, DatastetConfiguration.class); String pGrobidHome = dataseerConfiguration.getGrobidHome(); diff --git a/src/test/java/org/grobid/core/engines/DatasetParserTest.java b/src/test/java/org/grobid/core/engines/DatasetParserTest.java index 8864128..80d31c1 100644 --- a/src/test/java/org/grobid/core/engines/DatasetParserTest.java +++ b/src/test/java/org/grobid/core/engines/DatasetParserTest.java @@ -1,50 +1,41 @@ package org.grobid.core.engines; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import org.apache.commons.io.IOUtils; -import org.grobid.core.document.Document; import org.grobid.core.data.Dataset; -import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.utilities.GrobidProperties; -import org.grobid.core.utilities.DataseerConfiguration; import org.grobid.core.main.GrobidHomeFinder; -import org.grobid.core.utilities.GrobidConfig.ModelParameters; import org.grobid.core.main.LibraryLoader; +import org.grobid.core.utilities.DatastetConfiguration; +import org.grobid.core.utilities.GrobidConfig.ModelParameters; +import org.grobid.core.utilities.GrobidProperties; import org.junit.Before; import 
org.junit.BeforeClass; -import org.junit.Test; import org.junit.Ignore; +import org.junit.Test; import java.io.File; import java.nio.charset.StandardCharsets; -import java.util.List; -import java.util.Arrays; import java.util.ArrayList; - -import org.apache.commons.lang3.tuple.Pair; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.hasSize; -import static org.junit.Assert.assertNotNull; +import java.util.Arrays; +import java.util.List; /** * @author Patrice */ @Ignore public class DatasetParserTest { - private static DataseerConfiguration configuration; + private static DatastetConfiguration configuration; @BeforeClass public static void setUpClass() throws Exception { - DataseerConfiguration dataseerConfiguration = null; + DatastetConfiguration dataseerConfiguration = null; try { ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); File yamlFile = new File("resources/config/dataseer-ml.yml").getAbsoluteFile(); yamlFile = new File(yamlFile.getAbsolutePath()); - dataseerConfiguration = mapper.readValue(yamlFile, DataseerConfiguration.class); + dataseerConfiguration = mapper.readValue(yamlFile, DatastetConfiguration.class); String pGrobidHome = dataseerConfiguration.getGrobidHome(); diff --git a/src/test/java/org/grobid/core/lexicon/DatasetLexiconTest.java b/src/test/java/org/grobid/core/lexicon/DatasetLexiconTest.java index 7e5c736..9e788fb 100644 --- a/src/test/java/org/grobid/core/lexicon/DatasetLexiconTest.java +++ b/src/test/java/org/grobid/core/lexicon/DatasetLexiconTest.java @@ -1,48 +1,32 @@ package org.grobid.core.lexicon; -import org.apache.commons.io.IOUtils; -import org.grobid.core.analyzers.DataseerAnalyzer; -import org.grobid.core.data.Dataset; -import org.grobid.core.document.Document; -import org.grobid.core.factory.GrobidFactory; -import org.grobid.core.layout.LayoutToken; -import 
org.grobid.core.utilities.GrobidProperties; -import org.grobid.core.utilities.DataseerConfiguration; -import org.grobid.core.utilities.OffsetPosition; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; import org.grobid.core.main.GrobidHomeFinder; +import org.grobid.core.utilities.DatastetConfiguration; import org.grobid.core.utilities.GrobidConfig.ModelParameters; -import org.grobid.core.main.LibraryLoader; -import org.grobid.core.utilities.Pair; -import org.junit.Before; +import org.grobid.core.utilities.GrobidProperties; import org.junit.BeforeClass; import org.junit.Test; import java.io.File; -import java.nio.charset.StandardCharsets; -import java.util.List; -import java.util.ArrayList; import java.util.Arrays; import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.hasSize; import static org.hamcrest.Matchers.is; -import static org.junit.Assert.assertNotNull; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.dataformat.yaml.YAMLFactory; /** * @author Patrice */ public class DatasetLexiconTest { - private static DataseerLexicon dataseerLexicon; + private static DatastetLexicon dataseerLexicon; @BeforeClass public static void setUpClass() throws Exception { - DataseerConfiguration dataseerConfiguration = null; + DatastetConfiguration dataseerConfiguration = null; try { ObjectMapper mapper = new ObjectMapper(new YAMLFactory()); - dataseerConfiguration = mapper.readValue(new File("resources/config/dataseer-ml.yml").getAbsoluteFile(), DataseerConfiguration.class); + dataseerConfiguration = mapper.readValue(new File("resources/config/dataseer-ml.yml").getAbsoluteFile(), DatastetConfiguration.class); String pGrobidHome = dataseerConfiguration.getGrobidHome(); @@ -57,7 +41,7 @@ public static void setUpClass() throws Exception { } //LibraryLoader.load(); - dataseerLexicon = DataseerLexicon.getInstance(); + dataseerLexicon = 
DatastetLexicon.getInstance(); } catch (final Exception exp) { System.err.println("GROBID dataset initialisation failed: " + exp); @@ -71,9 +55,9 @@ public void testDatasetDOISuccess() throws Exception { String testStringDryad = "https://doi.org/10.5061/DRYAD.0SN63/7"; String testStringFigshare = "https://doi.org/10.6084/m9.figshare.10275182"; - boolean zenodoCheck = DataseerLexicon.getInstance().isDatasetDOI(testStringZenodo); - boolean dryadCheck = DataseerLexicon.getInstance().isDatasetDOI(testStringDryad); - boolean figshareCheck = DataseerLexicon.getInstance().isDatasetDOI(testStringFigshare); + boolean zenodoCheck = DatastetLexicon.getInstance().isDatasetDOI(testStringZenodo); + boolean dryadCheck = DatastetLexicon.getInstance().isDatasetDOI(testStringDryad); + boolean figshareCheck = DatastetLexicon.getInstance().isDatasetDOI(testStringFigshare); assertThat(zenodoCheck, is(true)); assertThat(dryadCheck, is(true)); @@ -86,9 +70,9 @@ public void testDatasetDOIFail() throws Exception { String testStringSecond = "https://doi.org/10.1371/journal.pone.0263302"; String testStringThird = "https://doi.org/10.1186/s13064-019-0127-z"; - boolean firstCheck = DataseerLexicon.getInstance().isDatasetDOI(testStringFirst); - boolean secondCheck = DataseerLexicon.getInstance().isDatasetDOI(testStringSecond); - boolean thirdCheck = DataseerLexicon.getInstance().isDatasetDOI(testStringThird); + boolean firstCheck = DatastetLexicon.getInstance().isDatasetDOI(testStringFirst); + boolean secondCheck = DatastetLexicon.getInstance().isDatasetDOI(testStringSecond); + boolean thirdCheck = DatastetLexicon.getInstance().isDatasetDOI(testStringThird); assertThat(firstCheck, is(false)); assertThat(secondCheck, is(false)); @@ -101,9 +85,9 @@ public void testDatasetUrlSuccess() throws Exception { String testStringGithub = "https://github.com/leonfodoulian/SARS_CoV_2_anosmia"; String testStringOsf = "https://osf.io/5r72u"; - boolean idCheck = 
DataseerLexicon.getInstance().isDatasetURL(testStringId); - boolean githubCheck = DataseerLexicon.getInstance().isDatasetURL(testStringGithub); - boolean osfCheck = DataseerLexicon.getInstance().isDatasetURL(testStringOsf); + boolean idCheck = DatastetLexicon.getInstance().isDatasetURL(testStringId); + boolean githubCheck = DatastetLexicon.getInstance().isDatasetURL(testStringGithub); + boolean osfCheck = DatastetLexicon.getInstance().isDatasetURL(testStringOsf); assertThat(idCheck, is(true)); assertThat(githubCheck, is(true)); @@ -116,9 +100,9 @@ public void testDatasetUrlFail() throws Exception { String testStringSecond = "https://nlp.johnsnowlabs.com/api/com/johnsnowlabs/nlp/annotators/LemmatizerModel.html"; String testStringThird = "https://stackoverflow.com/questions/11976393/get-github-username-by-id"; - boolean firstCheck = DataseerLexicon.getInstance().isDatasetURL(testStringFirst); - boolean secondCheck = DataseerLexicon.getInstance().isDatasetURL(testStringSecond); - boolean thirdCheck = DataseerLexicon.getInstance().isDatasetURL(testStringThird); + boolean firstCheck = DatastetLexicon.getInstance().isDatasetURL(testStringFirst); + boolean secondCheck = DatastetLexicon.getInstance().isDatasetURL(testStringSecond); + boolean thirdCheck = DatastetLexicon.getInstance().isDatasetURL(testStringThird); assertThat(firstCheck, is(false)); assertThat(secondCheck, is(false)); @@ -130,8 +114,8 @@ public void testLeadingStopwords() throws Exception { String testStringFirst = "and the dataset TOTO"; String testStringSecond = "and the dataset TOTO of"; - String firstCheck = DataseerLexicon.getInstance().removeLeadingEnglishStopwords(testStringFirst); - String secondCheck = DataseerLexicon.getInstance().removeLeadingEnglishStopwords(testStringSecond); + String firstCheck = DatastetLexicon.getInstance().removeLeadingEnglishStopwords(testStringFirst); + String secondCheck = DatastetLexicon.getInstance().removeLeadingEnglishStopwords(testStringSecond); 
assertThat(firstCheck, is("dataset TOTO")); assertThat(secondCheck, is("dataset TOTO of"));