Some pep8 issues resolved. #135

Closed · wants to merge 4 commits into from
527 changes: 300 additions & 227 deletions data.py

Large diffs are not rendered by default.

112 changes: 70 additions & 42 deletions hazm/BijankhanReader.py
@@ -1,47 +1,75 @@
 # coding: utf-8

 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 from __future__ import unicode_literals
-import re, codecs
-from .Normalizer import *
+
+import codecs
+import re
+
+from .Normalizer import Normalizer
 from .PeykareReader import join_verb_parts

-default_pos_map = {'ADJ': 'ADJ', 'ADJ_CMPR': 'ADJ', 'ADJ_INO': 'ADJ', 'ADJ_ORD': 'ADJ', 'ADJ_SIM': 'ADJ', 'ADJ_SUP': 'ADJ', 'ADV': 'ADV', 'ADV_EXM': 'ADV', 'ADV_I': 'ADV', 'ADV_NEGG': 'ADV', 'ADV_NI': 'ADV', 'ADV_TIME': 'ADV', 'AR': 'AR', 'CON': 'CONJ', 'DEFAULT': 'DEFAULT', 'DELM': 'PUNC', 'DET': 'PREP', 'IF': 'IF', 'INT': 'INT', 'MORP': 'MORP', 'MQUA': 'MQUA', 'MS': 'MS', 'N_PL': 'N', 'N_SING': 'N', 'NN': 'NN', 'NP': 'NP', 'OH': 'OH', 'OHH': 'OHH', 'P': 'PREP', 'PP': 'PP', 'PRO': 'PR', 'PS': 'PS', 'QUA': 'QUA', 'SPEC': 'SPEC', 'V_AUX': 'V', 'V_IMP': 'V', 'V_PA': 'V', 'V_PRE': 'V', 'V_PRS': 'V', 'V_SUB': 'V'}
-
-
-class BijankhanReader():
-	"""
-	interfaces [Bijankhan Corpus](http://ece.ut.ac.ir/dbrg/bijankhan/Corpus/BijanKhan_Corpus_Processed.zip) that you must download and extract it.
-
-	>>> bijankhan = BijankhanReader(bijankhan_file='corpora/bijankhan.txt')
-	>>> next(bijankhan.sents())
-	[('اولین', 'ADJ'), ('سیاره', 'N'), ('خارج', 'ADJ'), ('از', 'PREP'), ('منظومه', 'N'), ('شمسی', 'ADJ'), ('دیده_شد', 'V'), ('.', 'PUNC')]
-	"""
-
-	def __init__(self, bijankhan_file, joined_verb_parts=True, pos_map=default_pos_map):
-		self._bijankhan_file = bijankhan_file
-		self._joined_verb_parts = joined_verb_parts
-		self._pos_map = pos_map
-		self._normalizer = Normalizer(punctuation_spacing=False)
-
-	def _sentences(self):
-		sentence = []
-		for line in codecs.open(self._bijankhan_file, encoding='utf-8'):
-			parts = re.split(' +', line.strip())
-			if len(parts) == 2:
-				word, tag = parts
-				if word not in ('#', '*'):
-					word = self._normalizer.normalize(word)
-					sentence.append((word if word else '_', tag))
-				if tag == 'DELM' and word in ('#', '*', '.', '؟', '!') :
-					if len(sentence):
-						yield sentence
-						sentence = []
-
-	def sents(self):
-		map_poses = lambda item: (item[0], self._pos_map.get(item[1], item[1]))
-
-		for sentence in self._sentences():
-			if self._joined_verb_parts:
-				sentence = join_verb_parts(sentence)
-
-			yield list(map(map_poses, sentence))
+default_pos_map = {'ADJ': 'ADJ', 'ADJ_CMPR': 'ADJ',
+                   'ADJ_INO': 'ADJ', 'ADJ_ORD': 'ADJ',
+                   'ADJ_SIM': 'ADJ', 'ADJ_SUP': 'ADJ',
+                   'ADV': 'ADV', 'ADV_EXM': 'ADV',
+                   'ADV_I': 'ADV', 'ADV_NEGG': 'ADV',
+                   'ADV_NI': 'ADV', 'ADV_TIME': 'ADV',
+                   'AR': 'AR', 'CON': 'CONJ',
+                   'DEFAULT': 'DEFAULT', 'DELM': 'PUNC',
+                   'DET': 'PREP', 'IF': 'IF', 'INT': 'INT',
+                   'MORP': 'MORP', 'MQUA': 'MQUA',
+                   'MS': 'MS', 'N_PL': 'N', 'N_SING': 'N',
+                   'NN': 'NN', 'NP': 'NP', 'OH': 'OH',
+                   'OHH': 'OHH', 'P': 'PREP', 'PP': 'PP',
+                   'PRO': 'PR', 'PS': 'PS', 'QUA': 'QUA',
+                   'SPEC': 'SPEC', 'V_AUX': 'V',
+                   'V_IMP': 'V', 'V_PA': 'V',
+                   'V_PRE': 'V', 'V_PRS': 'V', 'V_SUB': 'V'}
+
+
+class BijankhanReader:
+    """
+    Interfaces the [Bijankhan Corpus](
+    http://ece.ut.ac.ir/dbrg/bijankhan/Corpus/BijanKhan_Corpus_Processed.zip
+    ), which you must download and extract yourself.
+
+    >>> bijankhan = BijankhanReader(bijankhan_file='corpora/bijankhan.txt')
+    >>> next(bijankhan.sents())
+    [('اولین', 'ADJ'), ('سیاره', 'N'), ('خارج', 'ADJ'), ('از', 'PREP'), ('منظومه', 'N'), ('شمسی', 'ADJ'), ('دیده_شد', 'V'), ('.', 'PUNC')]
+    """
+
+    def __init__(self, bijankhan_file, joined_verb_parts=True,
+                 pos_map=default_pos_map):
+        self._bijankhan_file = bijankhan_file
+        self._joined_verb_parts = joined_verb_parts
+        self._pos_map = pos_map
+        self._normalizer = Normalizer(punctuation_spacing=False)
+
+    def _sentences(self):
+        sentence = []
+        for line in codecs.open(self._bijankhan_file, encoding='utf-8'):
+            parts = re.split(' +', line.strip())
+            if len(parts) == 2:
+                word, tag = parts
+                if word not in ('#', '*'):
+                    word = self._normalizer.normalize(word)
+                    sentence.append((word if word else '_', tag))
+                if tag == 'DELM' and word in ('#', '*', '.', '؟', '!'):
+                    if len(sentence):
+                        yield sentence
+                        sentence = []
+
+    def sents(self):
+        map_poses = lambda item: (item[0], self._pos_map.get(item[1], item[1]))
+
+        for sentence in self._sentences():
+            if self._joined_verb_parts:
+                sentence = join_verb_parts(sentence)
+
+            yield list(map(map_poses, sentence))
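
Note: aside from optionally joining multi-token verbs via join_verb_parts, sents() only renames each fine-grained Bijankhan tag through pos_map; unknown tags pass through unchanged. A minimal standalone sketch of that mapping (not part of this PR; the three-entry pos_map below is an illustrative excerpt of default_pos_map):

# Sketch: the per-token tag mapping that BijankhanReader.sents() applies.
pos_map = {'ADJ_SIM': 'ADJ', 'N_SING': 'N', 'DELM': 'PUNC'}

def map_pos(item, pos_map=pos_map):
    # Unknown tags fall through unchanged, mirroring
    # self._pos_map.get(item[1], item[1]) in sents().
    word, tag = item
    return (word, pos_map.get(tag, tag))

sentence = [('اولین', 'ADJ_SIM'), ('سیاره', 'N_SING'), ('.', 'DELM')]
print([map_pos(item) for item in sentence])
# [('اولین', 'ADJ'), ('سیاره', 'N'), ('.', 'PUNC')]
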
81 changes: 46 additions & 35 deletions hazm/Chunker.py
@@ -1,58 +1,69 @@
 # coding: utf-8

 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 from __future__ import unicode_literals
-from nltk.chunk import ChunkParserI, RegexpParser, tree2conlltags, conlltags2tree
+
+from nltk.chunk import ChunkParserI
+from nltk.chunk import RegexpParser
+from nltk.chunk import conlltags2tree
+from nltk.chunk import tree2conlltags
+
 from .SequenceTagger import IOBTagger


 def tree2brackets(tree):
-	str, tag = '', ''
-	for item in tree2conlltags(tree):
-		if item[2][0] in {'B', 'O'} and tag:
-			str += tag +'] '
-			tag = ''
+    str, tag = '', ''
+    for item in tree2conlltags(tree):
+        if item[2][0] in {'B', 'O'} and tag:
+            str += tag + '] '
+            tag = ''

-		if item[2][0] == 'B':
-			tag = item[2].split('-')[1]
-			str += '['
-		str += item[0] +' '
+        if item[2][0] == 'B':
+            tag = item[2].split('-')[1]
+            str += '['
+        str += item[0] + ' '

-	if tag:
-		str += tag +'] '
+    if tag:
+        str += tag + '] '

-	return str.strip()
+    return str.strip()


class Chunker(IOBTagger, ChunkParserI):
"""
>>> chunker = Chunker(model='resources/chunker.model')
>>> tree2brackets(chunker.parse([('نامه', 'Ne'), ('ایشان', 'PRO'), ('را', 'POSTP'), ('دریافت', 'N'), ('داشتم', 'V'), ('.', 'PUNC')]))
'[نامه ایشان NP] [را POSTP] [دریافت داشتم VP] .'
"""
"""
>>> chunker = Chunker(model='resources/chunker.model')
>>> tree2brackets(chunker.parse([('نامه', 'Ne'), ('ایشان', 'PRO'),
('را', 'POSTP'), ('دریافت', 'N'), ('داشتم', 'V'), ('.', 'PUNC')]))
'[نامه ایشان NP] [را POSTP] [دریافت داشتم VP] .'
"""

def train(self, trees):
super(Chunker, self).train(map(tree2conlltags, trees))
def train(self, trees):
super(Chunker, self).train(map(tree2conlltags, trees))

def parse(self, sentence):
return next(self.parse_sents([sentence]))
def parse(self, sentence):
return next(self.parse_sents([sentence]))

def parse_sents(self, sentences):
for conlltagged in super(Chunker, self).tag_sents(sentences):
yield conlltags2tree(conlltagged)
def parse_sents(self, sentences):
for conlltagged in super(Chunker, self).tag_sents(sentences):
yield conlltags2tree(conlltagged)

def evaluate(self, gold):
return ChunkParserI.evaluate(self, gold)
def evaluate(self, gold):
return ChunkParserI.evaluate(self, gold)


class RuleBasedChunker(RegexpParser):
"""
>>> chunker = RuleBasedChunker()
>>> tree2brackets(chunker.parse([('نامه', 'Ne'), ('۱۰', 'NUMe'), ('فوریه', 'Ne'), ('شما', 'PRO'), ('را', 'POSTP'), ('دریافت', 'N'), ('داشتم', 'V'), ('.', 'PUNC')]))
'[نامه ۱۰ فوریه شما NP] [را POSTP] [دریافت داشتم VP] .'
"""
"""
>>> chunker = RuleBasedChunker()
>>> tree2brackets(chunker.parse([('نامه', 'Ne'), ('۱۰', 'NUMe'),
('فوریه', 'Ne'), ('شما', 'PRO'), ('را', 'POSTP'), ('دریافت', 'N'),
('داشتم', 'V'), ('.', 'PUNC')]))
'[نامه ۱۰ فوریه شما NP] [را POSTP] [دریافت داشتم VP] .'
"""

def __init__(self):
grammar = r"""
def __init__(self):
grammar = r"""

             NP:
                 <P>{<N>}<V>
@@ -82,4 +93,4 @@ def __init__(self):

"""

-		super(RuleBasedChunker, self).__init__(grammar=grammar)
+        super(RuleBasedChunker, self).__init__(grammar=grammar)
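
Note: tree2brackets flattens any nltk chunk tree, so it can be exercised without a trained model by building a tree from (word, POS, IOB) triples. A minimal usage sketch (not part of this PR), assuming nltk is installed and hazm.Chunker is importable:

# Build a chunk tree from CoNLL-style IOB triples, then flatten it
# to the bracketed string format used in the docstrings above.
from nltk.chunk import conlltags2tree

from hazm.Chunker import tree2brackets

conll = [('نامه', 'Ne', 'B-NP'), ('ایشان', 'PRO', 'I-NP'),
         ('را', 'POSTP', 'B-POSTP'), ('دریافت', 'N', 'B-VP'),
         ('داشتم', 'V', 'I-VP'), ('.', 'PUNC', 'O')]
tree = conlltags2tree(conll)
print(tree2brackets(tree))
# Expected: [نامه ایشان NP] [را POSTP] [دریافت داشتم VP] .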