Skip to content

Commit

Permalink
Return source white space string (#95)
Browse files Browse the repository at this point in the history
* fix: return actual white space string

* test: white_space returns source string

* fix: surface length rather than raw length

* Add a few more tests and use escape characters

Escape characters make the source easier to read.

* Cache whitespace to avoid invalidation issues

* Remove commented code

* Add whitespace clobber test

---------

Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
  • Loading branch information
kino-ma and polm authored Nov 10, 2024
1 parent 38f679b commit bd3d34b
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 9 deletions.
24 changes: 16 additions & 8 deletions fugashi/fugashi.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ cdef class Node:
feature string, which is an untokenized CSV string."""
cdef const mecab_node_t* c_node
cdef str _surface
cdef str _ws
cdef object features
cdef object wrapper

Expand All @@ -60,10 +61,6 @@ cdef class Node:
@property
def surface(self):
    """Return the surface string stored on this node.

    The value of ``_surface`` is returned unchanged; nothing is decoded or
    computed here, so the result is ``None`` if nothing has been stored.
    """
    return self._surface

Expand Down Expand Up @@ -107,11 +104,13 @@ cdef class Node:

@property
def white_space(self):
    """Return the white space that precedes this token in the source text.

    The stored string is returned as-is, so tabs, newlines and mixed runs
    are preserved rather than being reported as half-width spaces of the
    same length. Returns an empty string when nothing has been stored.
    """
    if self._ws is None:
        return ''
    return self._ws

@white_space.setter
def white_space(self, ws):
    """Store ``ws`` as the white space that precedes this token."""
    self._ws = ws

cdef list pad_none(self, list fields):
try:
Expand Down Expand Up @@ -285,6 +284,15 @@ cdef class GenericTagger:
self._cache[shash] = sys.intern(surf.decode("utf-8"))
nn.surface = self._cache[shash]

# do the same for whitespace
nodelen = node.rlength - node.length
pnode = node.prev
ws = pnode.surface[pnode.length : pnode.length + nodelen]
wshash = hash(ws)
if wshash not in self._cache:
self._cache[wshash] = sys.intern(ws.decode("utf-8"))
nn.white_space = self._cache[wshash]

out.append(nn)

def nbest(self, text, num=10):
Expand Down
20 changes: 19 additions & 1 deletion fugashi/tests/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,16 @@
('稻村に行きました', ['0,2', '*', '0', '*', '*']),
)

# Each case is (text, expected white_space, index of the token whose
# white_space is checked).
WHITE_SPACE_TESTS = (
    ("これは 半角スペースです", " ", 2),
    ("これは\tタブ文字です", "\t", 2),
    ("これは\n改行文字です", "\n", 2),
    ("これは\n\t 複数種類の空白文字です", "\n\t ", 2),
    ("\tタブ文字で始まる文字列", "\t", 0),
)

@pytest.mark.parametrize('text,wakati', WAKATI_TESTS)
def test_wakati(text, wakati):
tagger = Tagger('-Owakati')
Expand Down Expand Up @@ -82,7 +92,15 @@ def test_accent(text, accent):
def test_clobber():
    """Results of an earlier parse must survive a later parse call."""
    # Check that memory isn't clobbered by repeated parse calls
    tagger = Tagger()
    nodes1 = tagger("a\tb c d")
    # The second result is intentionally unused; only the call matters.
    nodes2 = tagger("x y z !")

    assert "a b c d".split() == [nn.surface for nn in nodes1]
    assert ["", "\t", " ", " "] == [nn.white_space for nn in nodes1]

@pytest.mark.parametrize("text,space,idx", WHITE_SPACE_TESTS)
def test_white_space(text, space, idx):
    """The node at ``idx`` must report ``space`` as its white_space."""
    target = Tagger().parseToNodeList(text)[idx]
    assert target.white_space == space

0 comments on commit bd3d34b

Please sign in to comment.