Skip to content

Commit

Permalink
Add alignment_mode argument to Span.char_span() (#12145)
Browse files Browse the repository at this point in the history
* Add alignment_mode argument to Span.char_span()

* Update website

* Update spacy/tokens/span.pyx

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

* Add test

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
  • Loading branch information
itssimon and adrianeboyd authored Jan 27, 2023
1 parent c68e6b8 commit 774c10f
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 11 deletions.
8 changes: 8 additions & 0 deletions spacy/tests/doc/test_span.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,14 @@ def test_spans_by_character(doc):
span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
)

# Span.char_span + alignment mode "contract"
span2 = doc[0:2].char_span(
span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
)
assert span1.start_char == span2.start_char
assert span1.end_char == span2.end_char
assert span2.label_ == "GPE"


def test_span_to_array(doc):
span = doc[1:-2]
Expand Down
1 change: 1 addition & 0 deletions spacy/tokens/span.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ class Span:
label: Union[int, str] = ...,
kb_id: Union[int, str] = ...,
vector: Optional[Floats1d] = ...,
alignment_mode: str = ...,
) -> Span: ...
@property
def conjuncts(self) -> Tuple[Token]: ...
Expand Down
11 changes: 8 additions & 3 deletions spacy/tokens/span.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,7 @@ cdef class Span:
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()

cpdef np.ndarray to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
Expand Down Expand Up @@ -639,7 +639,7 @@ cdef class Span:
else:
return self.doc[root]

def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0):
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict"):
"""Create a `Span` object from the slice `span.text[start : end]`.
start (int): The index of the first character of the span.
Expand All @@ -649,11 +649,16 @@ cdef class Span:
kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span.
alignment_mode (str): How character indices are aligned to token
boundaries. Options: "strict" (character indices must be aligned
with token boundaries), "contract" (span of all tokens completely
within the character span), "expand" (span of all tokens at least
partially covered by the character span). Defaults to "strict".
RETURNS (Span): The newly constructed object.
"""
start_idx += self.c.start_char
end_idx += self.c.start_char
return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode)

@property
def conjuncts(self):
Expand Down
17 changes: 9 additions & 8 deletions website/docs/api/span.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -186,14 +186,15 @@ the character indices don't map to a valid span.
> assert span.text == "New York"
> ```
| Name | Description |
| ----------- | ----------------------------------------------------------------------------------------- |
| `start` | The index of the first character of the span. ~~int~~ |
| `end` | The index of the last character after the span. ~~int~~ |
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
| Name | Description |
| ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `start` | The index of the first character of the span. ~~int~~ |
| `end` | The index of the last character after the span. ~~int~~ |
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| `kb_id` | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ |
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
## Span.similarity {id="similarity",tag="method",model="vectors"}
Expand Down

0 comments on commit 774c10f

Please sign in to comment.