-
Notifications
You must be signed in to change notification settings - Fork 84
/
Copy pathutils.py
81 lines (63 loc) · 2.33 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import pysbd
class Rule(object):
    """Hold a pattern together with the replacement for its matches.

    ``Text.apply`` in this module hands both attributes to ``re.sub``.

    Parameters
    ----------
    pattern
        pattern to search for
    replacement
        value substituted for every match of ``pattern``
    """

    def __init__(self, pattern, replacement):
        self.pattern, self.replacement = pattern, replacement

    def __repr__(self):  # pragma: no cover
        template = '<{} pattern="{}" and replacement="{}">'
        return template.format(
            type(self).__name__, self.pattern, self.replacement)
class Text(str):
    """A ``str`` subclass that can run regex substitution rules on itself.

    https://stackoverflow.com/questions/4698493/can-i-add-custom-methods-attributes-to-built-in-python-types

    Parameters
    ----------
    str : str
        string content

    Returns
    -------
    str
        the input unchanged when no rule pattern matches,
        otherwise the text with each match replaced
    """

    def apply(self, *rules):
        """Apply ``rules`` one after another and return the final text.

        Each rule must expose ``pattern`` and ``replacement`` attributes;
        they are passed to ``re.sub`` in the order the rules are given, and
        every rule operates on the output of the previous one. When no
        rules are given the instance itself is returned.
        """
        result = self
        for rule in rules:
            result = re.sub(rule.pattern, rule.replacement, result)
        return result
class TextSpan(object):
    """Sentence text together with its character offsets."""

    def __init__(self, sent, start, end):
        """
        Sentence text and its start & end character offsets within original text

        Parameters
        ----------
        sent : str
            Sentence text
        start : int
            start character offset of a sentence in original text
        end : int
            end character offset of a sentence in original text
        """
        self.sent = sent
        self.start = start
        self.end = end

    def __repr__(self):  # pragma: no cover
        return "{0}(sent={1}, start={2}, end={3})".format(
            self.__class__.__name__, repr(self.sent), self.start, self.end)

    def __eq__(self, other):
        """Return True when ``other`` is a TextSpan with equal fields.

        For any other type ``NotImplemented`` is returned so that Python
        falls back to its default handling and ``span == other`` evaluates
        to ``False`` instead of raising ``AttributeError`` or yielding
        ``None``.
        """
        # Check the type of `other`, not of `self`: every object is an
        # instance of `object`, so the reversed check let arbitrary objects
        # through to the attribute comparison below.
        if not isinstance(other, TextSpan):
            return NotImplemented
        return (self.sent, self.start, self.end) == (
            other.sent, other.start, other.end)
class PySBDFactory(object):
    """pysbd as a spacy component through entrypoints

    Parameters
    ----------
    nlp : object
        stored on the instance as ``nlp``; not otherwise used by this class
    language : str, optional
        language code forwarded to ``pysbd.Segmenter``, by default 'en'
    """

    def __init__(self, nlp, language='en'):
        self.nlp = nlp
        # char_span=True makes the segmenter return objects that carry a
        # ``start`` offset, which ``__call__`` relies on.
        self.seg = pysbd.Segmenter(language=language, clean=False,
                                   char_span=True)

    def __call__(self, doc):
        """Flag every token of ``doc`` that begins a pysbd sentence.

        Parameters
        ----------
        doc : object
            presumably a spaCy ``Doc`` (TODO confirm); the code only needs
            a ``text_with_ws`` attribute and iteration over tokens that
            expose ``idx`` and a writable ``is_sent_start``

        Returns
        -------
        object
            the same ``doc``, with ``is_sent_start`` set on each token
        """
        sents_char_spans = self.seg.segment(doc.text_with_ws)
        # A set gives constant-time membership tests inside the token loop.
        sent_start_offsets = {sent.start for sent in sents_char_spans}
        for token in doc:
            token.is_sent_start = token.idx in sent_start_offsets
        return doc