Skip to content

Commit

Permalink
Merge pull request #249 from sir-kokabi/resolve-conflict-in-pr-#120
Browse files Browse the repository at this point in the history
Resolve conflict in pr #120
  • Loading branch information
imani authored Mar 1, 2023
2 parents 3bab84a + 4aaf9af commit 69481c9
Showing 1 changed file with 19 additions and 0 deletions.
19 changes: 19 additions & 0 deletions hazm/PersianPlainTextReader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus.reader import StreamBackedCorpusView, read_blankline_block

from hazm import word_tokenize, sent_tokenize


class _CallableTokenizer:
    """Adapt a plain ``callable(text)`` to the ``.tokenize(text)`` interface."""

    def __init__(self, func):
        self._func = func

    def tokenize(self, text):
        """Return whatever the wrapped callable returns for *text*."""
        return self._func(text)


class PersianPlainTextReader(PlaintextCorpusReader):
    """
    Reader for Persian corpora that consist of plaintext documents.

    Paragraphs are assumed to be split using blank lines. Sentences and
    words are tokenized with hazm's ``sent_tokenize`` and ``word_tokenize``
    by default, or by custom tokenizers specified as parameters to the
    constructor.

    A tokenizer may be either an object exposing a ``tokenize(text)``
    method or a plain callable taking the text; plain callables are
    wrapped so that both forms work.
    """

    CorpusView = StreamBackedCorpusView

    def __init__(self, root, fileids, word_tokenizer=word_tokenize, sent_tokenizer=sent_tokenize,
                 para_block_reader=read_blankline_block, encoding='utf8'):
        """
        Construct a new Persian plaintext corpus reader.

        :param root: The root directory for the corpus.
        :param fileids: A list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: Tokenizer (object with ``tokenize`` or plain
            callable) used to split sentences or paragraphs into words.
        :param sent_tokenizer: Tokenizer (object with ``tokenize`` or plain
            callable) used to split paragraphs into sentences.
        :param para_block_reader: Block reader used to divide a document
            into paragraphs.
        :param encoding: Encoding used to read the corpus files.
        """
        # The base reader invokes ``.tokenize(...)`` on its tokenizers, so a
        # bare function (such as the hazm defaults) must be adapted first.
        super().__init__(
            root,
            fileids,
            self._as_tokenizer(word_tokenizer),
            self._as_tokenizer(sent_tokenizer),
            para_block_reader,
            encoding,
        )

    @staticmethod
    def _as_tokenizer(tokenizer):
        """Return *tokenizer* unchanged if usable as-is, else wrap the callable."""
        # ``None`` is passed through untouched so the base class can apply
        # whatever handling it has for a missing tokenizer.
        if tokenizer is None or hasattr(tokenizer, 'tokenize'):
            return tokenizer
        return _CallableTokenizer(tokenizer)

0 comments on commit 69481c9

Please sign in to comment.