diff --git a/src/somajo/alignment.py b/src/somajo/alignment.py index ac6c83d..2243d77 100644 --- a/src/somajo/alignment.py +++ b/src/somajo/alignment.py @@ -145,6 +145,7 @@ def token_offsets(token_list, raw, position, xml_input, tokens): offsets = [(align_to_starts[s], align_to_ends[e - 1]) for s, e in offsets] if xml_input: offsets = [(align_to_entities[s][0], align_to_entities[e - 1][1]) for s, e in offsets] + offsets = [(s + position, e + position) for s, e in offsets] return offsets diff --git a/src/somajo/utils.py b/src/somajo/utils.py index 2d65db4..87f3040 100644 --- a/src/somajo/utils.py +++ b/src/somajo/utils.py @@ -21,14 +21,13 @@ def get_paragraphs_str(fh, paragraph_separator="empty_lines"): elif paragraph_separator == "empty_lines": paragraph = [] for line in fh: + paragraph.append(line) if line.strip() == "": if len(paragraph) > 0: paragraph_text = "".join(paragraph) yield paragraph_text, position paragraph = [] position += len(paragraph_text) - else: - paragraph.append(line) if len(paragraph) > 0: yield "".join(paragraph), position