Skip to content

Commit

Permalink
Keep track of position over chunks
Browse files (browse the repository at this point in the history)
  • Loading branch information
tsproisl committed Dec 22, 2023
1 parent 8b31634 commit 7531745
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/somajo/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def token_offsets(token_list, raw, position, xml_input, tokens):
offsets = [(align_to_starts[s], align_to_ends[e - 1]) for s, e in offsets]
if xml_input:
offsets = [(align_to_entities[s][0], align_to_entities[e - 1][1]) for s, e in offsets]
+ offsets = [(s + position, e + position) for s, e in offsets]
return offsets


Expand Down
3 changes: 1 addition & 2 deletions src/somajo/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,13 @@ def get_paragraphs_str(fh, paragraph_separator="empty_lines"):
elif paragraph_separator == "empty_lines":
paragraph = []
for line in fh:
+ paragraph.append(line)
if line.strip() == "":
if len(paragraph) > 0:
paragraph_text = "".join(paragraph)
yield paragraph_text, position
paragraph = []
position += len(paragraph_text)
- else:
- paragraph.append(line)
if len(paragraph) > 0:
yield "".join(paragraph), position

Expand Down

0 comments on commit 7531745

Please sign in to comment.