diff --git a/chemrel/src/chemrel/functions/build.py b/chemrel/src/chemrel/functions/build.py deleted file mode 100644 index 42ba553..0000000 --- a/chemrel/src/chemrel/functions/build.py +++ /dev/null @@ -1,33 +0,0 @@ -from functools import partial -from pathlib import Path -from typing import Iterable, Callable -import spacy -from spacy.training import Example -from spacy.tokens import DocBin, Doc - -# make the factory work -from chemrel.functions.pipeline import custom_relation_extractor - - -# make the config work -from chemrel.functions.model import build_relation_model, build_classification_layer, build_instances, build_tensors - - -@spacy.registry.readers("Gold_ents_Corpus.v1") -def create_docbin_reader(file: Path) -> Callable[["Language"], Iterable[Example]]: - return partial(read_files, file) - - -def read_files(file: Path, nlp: "Language") -> Iterable[Example]: - """Custom reader that keeps the tokenization of the gold data, - and also adds the gold GGP annotations as we do not attempt to predict these.""" - doc_bin = DocBin().from_disk(file) - docs = doc_bin.get_docs(nlp.vocab) - for gold in docs: - pred = Doc( - nlp.vocab, - words=[t.text for t in gold], - spaces=[t.whitespace_ for t in gold], - ) - pred.ents = gold.ents - yield Example(pred, gold) diff --git a/chemrel/src/chemrel/functions/parser.py b/chemrel/src/chemrel/functions/parser.py index 1ff7606..59a44d8 100644 --- a/chemrel/src/chemrel/functions/parser.py +++ b/chemrel/src/chemrel/functions/parser.py @@ -26,7 +26,7 @@ def parse(json_loc: Path, train_file: Path, dev_file: Path, test_file: Path): """ # Creating a custom extension attribute called "rel" for the Doc class and initializing it to an empty dictionary - Doc.set_extension("rel", default={}) + Doc.set_extension("rel", default={}, force=True) # Creating a new Vocab object vocab = Vocab()