Skip to content

Commit

Permalink
Fixes issue piskvorky#326.
Browse files Browse the repository at this point in the history
- Always compactify after Dictionary token filtering
- Add a test for Dictionary token filtering
- Add a basic test for Dictionary merging
  • Loading branch information
cscorley committed Apr 18, 2015
1 parent ef1680f commit 4863040
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 1 deletion.
2 changes: 1 addition & 1 deletion gensim/corpora/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,6 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):

# do the actual filtering, then rebuild dictionary to remove gaps in ids
self.filter_tokens(good_ids=good_ids)
self.compactify()
logger.info("resulting dictionary: %s" % self)


Expand All @@ -240,6 +239,7 @@ def filter_tokens(self, bad_ids=None, good_ids=None):
self.dfs = dict((tokenid, freq)
for tokenid, freq in iteritems(self.dfs)
if tokenid in good_ids)
self.compactify()


def compactify(self):
Expand Down
26 changes: 26 additions & 0 deletions gensim/test/test_corpora_dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,38 @@ def testBuild(self):
'system': 5, 'time': 6, 'trees': 9, 'user': 7}
self.assertEqual(d.token2id, expected)

def testMerge(self):
    """Merging dictionaries built from two corpus slices covers the full vocabulary."""
    whole = Dictionary(self.texts)
    head = Dictionary(self.texts[:3])
    tail = Dictionary(self.texts[3:])

    # fold the second slice's dictionary into the first one; `head` is
    # inspected afterwards, so it is the object expected to hold the result
    head.merge_with(tail)

    full_vocabulary = sorted(whole.token2id.keys())
    merged_vocabulary = sorted(head.token2id.keys())
    self.assertEqual(full_vocabulary, merged_vocabulary)

def testFilter(self):
    """filter_extremes leaves four tokens with compact ids, each with document frequency 3."""
    dictionary = Dictionary(self.texts)
    dictionary.filter_extremes(no_below=2, no_above=1.0, keep_n=4)

    # ids 0..3 are expected after filtering, each mapped to a document frequency of 3
    surviving_dfs = dict.fromkeys(range(4), 3)
    self.assertEqual(dictionary.dfs, surviving_dfs)

def testFilterTokens(self):
    """Filtering a token by id must leave the dictionary safe to extend.

    Regression test for issue #326: when ``filter_tokens`` left a gap in
    the id space, the next token added was assigned ``len(token2id)``,
    an id already owned by another token. Comparing only the token
    *keys* cannot detect that collision, so the ids themselves are
    checked as well: they must stay unique and contiguous both after
    filtering and after the removed word is added back.
    """
    self.maxDiff = 10000
    d = Dictionary(self.texts)

    removed_word = d[0]
    d.filter_tokens([0])

    expected = {'computer': 0, 'eps': 8, 'graph': 10, 'human': 1,
                'interface': 2, 'minors': 11, 'response': 3, 'survey': 4,
                'system': 5, 'time': 6, 'trees': 9, 'user': 7}
    del expected[removed_word]
    self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys()))
    # filtering must compactify: ids are exactly 0..n-1 with no gap left behind
    self.assertEqual(sorted(d.token2id.values()), list(range(len(expected))))

    expected[removed_word] = len(expected)
    d.add_documents([[removed_word]])
    self.assertEqual(sorted(d.token2id.keys()), sorted(expected.keys()))
    # the re-added word takes the next free id and collides with no existing one
    self.assertEqual(d.token2id[removed_word], expected[removed_word])
    self.assertEqual(sorted(d.token2id.values()), list(range(len(expected))))


def test_doc2bow(self):
d = Dictionary([["žluťoučký"], ["žluťoučký"]])

Expand Down

0 comments on commit 4863040

Please sign in to comment.