Merge pull request #249 from sir-kokabi/resolve-conflict-in-pr-#120

Resolve conflict in pr #120
roshan-research · Mar 1, 2023 · 69481c9 · 69481c9
2 parents 3bab84a + 4aaf9af
commit 69481c9
Showing 1 changed file with 19 additions and 0 deletions.
diff --git a/hazm/PersianPlainTextReader.py b/hazm/PersianPlainTextReader.py
@@ -0,0 +1,19 @@
+from nltk.corpus import PlaintextCorpusReader
+from nltk.corpus.reader import StreamBackedCorpusView, read_blankline_block
+
+from hazm import word_tokenize, sent_tokenize
+
+
+class PersianPlainTextReader(PlaintextCorpusReader):
+    """
+    Reader for corpora that consist of plaintext documents.  Paragraphs
+    are assumed to be split using blank lines.  Sentences and words can
+    be tokenized using the default tokenizers, or by custom tokenizers
+    specified as parameters to the constructor.
+
+    """
+    CorpusView = StreamBackedCorpusView
+
+    def __init__(self, root, fileids, word_tokenizer=word_tokenize, sent_tokenizer=sent_tokenize,
+                 para_block_reader=read_blankline_block, encoding='utf8'):
+        super().__init__(root, fileids, word_tokenizer, sent_tokenizer, para_block_reader, encoding)