Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test and refactor WikiCorpus #1821

Merged
merged 9 commits into from
Jan 11, 2018
14 changes: 6 additions & 8 deletions gensim/corpora/wikicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Copyright (C) 2012 Lars Buitinck <larsmans@gmail.com>
# Copyright (C) 2018 Emmanouil Stergiadis <em.stergiadis@gmail.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


Expand Down Expand Up @@ -56,8 +57,8 @@
RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting
RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting
RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE) # categories
# Remove File and Image template
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE) # Remove File and Image template


# MediaWiki namespaces (https://www.mediawiki.org/wiki/Manual:Namespace) that
# ought to be ignored
Expand Down Expand Up @@ -332,19 +333,15 @@ def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), diction
self.token_min_len = token_min_len
self.token_max_len = token_max_len
self.lower = lower

if dictionary is None:
self.dictionary = Dictionary(self.get_texts())
else:
self.dictionary = dictionary
self.dictionary = dictionary or Dictionary(self.get_texts())

def get_texts(self):
"""
Iterate over the dump, returning text version of each article as a list
of tokens.

Only articles of sufficient length are returned (short articles & redirects
etc are ignored). This is control by `article_min_tokens` on the class instance.
etc are ignored). This is controlled by `article_min_tokens` on the class instance.

Note that this iterates over the **texts**; if you want vectors, just use
the standard corpus interface instead of this function::
Expand Down Expand Up @@ -380,6 +377,7 @@ def get_texts(self):
yield (tokens, (pageid, title))
else:
yield tokens

except KeyboardInterrupt:
logger.warn(
"user terminated iteration over Wikipedia corpus after %i documents with %i positions "
Expand Down
167 changes: 166 additions & 1 deletion gensim/test/test_corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import numpy as np

from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus,
ucicorpus, malletcorpus, textcorpus, indexedcorpus)
ucicorpus, malletcorpus, textcorpus, indexedcorpus, wikicorpus)
from gensim.interfaces import TransformedCorpus
from gensim.utils import to_unicode
from gensim.test.utils import datapath, get_tmpfile
Expand Down Expand Up @@ -400,6 +400,171 @@ def test_indexing(self):
pass


# Needed for test_custom_tokenizer in the TestWikiCorpus class.
# Cannot be nested due to serializing.
def custom_tokenizer(content, token_min_len=2, token_max_len=15, lower=True):
    """Split `content` on whitespace and keep only the acceptable tokens.

    A token is kept when its length lies within [`token_min_len`, `token_max_len`]
    and it does not start with an underscore. Kept tokens are converted with
    `to_unicode`, after lower-casing when `lower` is true.
    """
    kept = []
    for piece in content.split():
        # Same filter as the comprehension form: length window first, then the underscore check.
        if not (token_min_len <= len(piece) <= token_max_len) or piece.startswith('_'):
            continue
        if lower:
            kept.append(to_unicode(piece.lower()))
        else:
            kept.append(to_unicode(piece))
    return kept


class TestWikiCorpus(TestTextCorpus):
def setUp(self):
    """Configure the corpus class, file extension and fixture paths used by the tests."""
    extension = '.xml.bz2'
    self.corpus_class = wikicorpus.WikiCorpus
    self.file_extension = extension
    # Fixture name is 'testcorpus.' plus the extension without its leading dot.
    self.fname = datapath('testcorpus.' + extension.lstrip('.'))
    self.enwiki = datapath('enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2')

def test_default_preprocessing(self):
    """The first text yielded with default preprocessing equals the expected token list."""
    wiki = self.corpus_class(self.fname, article_min_tokens=0)
    texts = wiki.get_texts()
    self.assertEqual(['computer', 'human', 'interface'], next(texts))

def test_len(self):
    """Check how `article_min_tokens` changes the number of texts yielded by `get_texts`."""
    # When there is no min_token limit all 9 articles must be registered.
    corpus = self.corpus_class(self.fname, article_min_tokens=0)
    # Use unittest assertions rather than bare `assert`: they are not stripped
    # under `python -O` and they report both values on failure.
    self.assertEqual(len(list(corpus.get_texts())), 9)

    # With a huge min_token limit, all articles should be filtered out.
    corpus = self.corpus_class(self.fname, article_min_tokens=100000)
    self.assertEqual(len(list(corpus.get_texts())), 0)

def test_load_with_metadata(self):
    """With `metadata` enabled, each item carries the (id, title) of its article."""
    corpus = self.corpus_class(self.fname, article_min_tokens=0)
    corpus.metadata = True
    self.assertEqual(len(corpus), 9)

    docs = list(corpus)
    self.assertEqual(len(docs), 9)

    # Counting IDs from 1
    for article_no, (_, metadata) in enumerate(docs, start=1):
        self.assertEqual(metadata[0], str(article_no))
        self.assertEqual(metadata[1], 'Article%d' % article_no)

def test_load(self):
    """Iterating the corpus without a minimum article length yields nine documents."""
    wiki = self.corpus_class(self.fname, article_min_tokens=0)
    loaded = [doc for doc in wiki]
    # the deerwester corpus always has nine documents
    self.assertEqual(len(loaded), 9)

def test_first_element(self):
    """
    First two articles in this sample are
    1) anarchism
    2) autism
    """
    corpus = self.corpus_class(self.enwiki, processes=1)

    texts = corpus.get_texts()
    # assertIn names the missing token on failure, unlike assertTrue(x in y).
    self.assertIn(u'anarchism', next(texts))
    self.assertIn(u'autism', next(texts))

def test_unicode_element(self):
    """
    First unicode article in this sample is
    1) папа
    """
    bgwiki = datapath('bgwiki-latest-pages-articles-shortened.xml.bz2')
    corpus = self.corpus_class(bgwiki)
    texts = corpus.get_texts()
    # assertIn names the missing token on failure, unlike assertTrue(x in y).
    self.assertIn(u'папа', next(texts))

def test_custom_tokenizer(self):
    """
    Define a custom tokenizer function and use it.

    The corpus is built with `tokenizer_func=custom_tokenizer`, `lower=False`,
    `token_min_len=1` and `token_max_len=16`; the first text must then contain
    a capitalised token, a 16-character token, a 1-character token and a token
    containing punctuation.
    """
    wc = self.corpus_class(self.enwiki, processes=1, lemmatize=False, tokenizer_func=custom_tokenizer,
                           token_max_len=16, token_min_len=1, lower=False)
    row = wc.get_texts()
    list_tokens = next(row)
    # assertIn names the missing token on failure, unlike assertTrue(x in y).
    self.assertIn(u'Anarchism', list_tokens)
    self.assertIn(u'collectivization', list_tokens)
    self.assertIn(u'a', list_tokens)
    self.assertIn(u'i.e.', list_tokens)

def test_lower_case_set_true(self):
    """
    Set the parameter lower to True and check that the upper case 'Anarchism' token doesn't exist
    """
    corpus = self.corpus_class(self.enwiki, processes=1, lower=True, lemmatize=False)
    row = corpus.get_texts()
    list_tokens = next(row)
    # assertNotIn/assertIn name the offending token on failure.
    self.assertNotIn(u'Anarchism', list_tokens)
    self.assertIn(u'anarchism', list_tokens)

def test_lower_case_set_false(self):
    """
    Set the parameter lower to False and check that the upper case 'Anarchism' token exists
    """
    corpus = self.corpus_class(self.enwiki, processes=1, lower=False, lemmatize=False)
    row = corpus.get_texts()
    list_tokens = next(row)
    # Both casings are expected in the first article when case is preserved.
    self.assertIn(u'Anarchism', list_tokens)
    self.assertIn(u'anarchism', list_tokens)

def test_min_token_len_not_set(self):
    """
    Don't set the parameter token_min_len and check that 'a' as a token doesn't exist
    Default token_min_len=2
    """
    corpus = self.corpus_class(self.enwiki, processes=1, lemmatize=False)
    # assertNotIn names the offending token on failure.
    self.assertNotIn(u'a', next(corpus.get_texts()))

def test_min_token_len_set(self):
    """
    Set the parameter token_min_len to 1 and check that 'a' as a token exists
    """
    corpus = self.corpus_class(self.enwiki, processes=1, token_min_len=1, lemmatize=False)
    # assertIn names the missing token on failure.
    self.assertIn(u'a', next(corpus.get_texts()))

def test_max_token_len_not_set(self):
    """
    Don't set the parameter token_max_len and check that 'collectivization' as a token doesn't exist
    Default token_max_len=15
    """
    corpus = self.corpus_class(self.enwiki, processes=1, lemmatize=False)
    # 'collectivization' has 16 characters, one more than the default limit.
    self.assertNotIn(u'collectivization', next(corpus.get_texts()))

def test_max_token_len_set(self):
    """
    Set the parameter token_max_len to 16 and check that 'collectivization' as a token exists
    """
    corpus = self.corpus_class(self.enwiki, processes=1, token_max_len=16, lemmatize=False)
    # 'collectivization' has exactly 16 characters, so it now fits the limit.
    self.assertIn(u'collectivization', next(corpus.get_texts()))

# #TODO: sporadic failure to be investigated
# def test_get_texts_returns_generator_of_lists(self):
# corpus = self.corpus_class(self.enwiki)
# l = corpus.get_texts()
# self.assertEqual(type(l), types.GeneratorType)
# first = next(l)
# self.assertEqual(type(first), list)
# self.assertTrue(isinstance(first[0], bytes) or isinstance(first[0], str))

def test_sample_text(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably better to skip this test (rather than silently pass); what do you think, @steremma?

Copy link
Contributor Author

@steremma steremma Jan 11, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem is that this test overrides the one defined in TestTextCorpus. If we just skip it, the parent definition will be called and it will fail because plain text is not legit XML (nor is it compressed). In that sense, passing it silently practically ignores it. The same idea is followed in the tests:

    def test_save(self):
        pass

    def test_serialize(self):
        pass

    def test_serialize_compressed(self):
        pass

    def test_indexing(self):
        pass

of TestTextCorpus, for these 4 tests defined in the parent class CorpusTestCase. So I think it's better to keep it as it is.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The best variant would be to change the class hierarchy and interfaces, since we shouldn't inherit "useless" methods from the parent class (change from TestWikiCorpus -> TestTextCorpus to
TestTextCorpus -> BaseTestTextCorpus <- TestWikiCorpus), but right now this isn't really needed; let's stay with the current variant with "pass".

# Cannot instantiate WikiCorpus from lines
pass

def test_sample_text_length(self):
    # Intentional no-op: presumably shadows a same-named test inherited from
    # TestTextCorpus that would not apply here -- TODO confirm against the parent class.
    # Cannot instantiate WikiCorpus from lines
    pass

def test_sample_text_seed(self):
    # Intentional no-op: presumably shadows a same-named test inherited from
    # TestTextCorpus that would not apply here -- TODO confirm against the parent class.
    # Cannot instantiate WikiCorpus from lines
    pass

def test_empty_input(self):
    # Intentional no-op: presumably shadows a same-named test inherited from
    # TestTextCorpus that would not apply here -- TODO confirm against the parent class.
    # An empty file is not legit XML
    pass


class TestTextDirectoryCorpus(unittest.TestCase):

def write_one_level(self, *args):
Expand Down
Binary file added gensim/test/test_data/testcorpus.xml.bz2
Binary file not shown.
135 changes: 0 additions & 135 deletions gensim/test/test_wikicorpus.py

This file was deleted.