From a879678db01c651871991001c81d4d7422098a87 Mon Sep 17 00:00:00 2001 From: Amir Hadifar Date: Sat, 25 Aug 2018 14:04:19 +0430 Subject: [PATCH 1/4] white spaces issue fixed --- hazm/BijankhanReader.py | 84 +++--- hazm/Chunker.py | 70 ++--- hazm/DadeganReader.py | 475 +++++++++++++++++----------------- hazm/DependencyParser.py | 185 +++++++------- hazm/HamshahriReader.py | 118 +++++---- hazm/InformalNormalizer.py | 398 ++++++++++++++--------------- hazm/Lemmatizer.py | 197 +++++++------- hazm/Normalizer.py | 347 ++++++++++++------------- hazm/POSTagger.py | 45 ++-- hazm/PersicaReader.py | 69 ++--- hazm/PeykareReader.py | 147 +++++------ hazm/QuranCorpusReader.py | 102 ++++---- hazm/SentenceTokenizer.py | 22 +- hazm/SentiPersReader.py | 115 +++++---- hazm/SequenceTagger.py | 84 +++--- hazm/Stemmer.py | 57 +++-- hazm/TNewsReader.py | 99 ++++---- hazm/TokenSplitter.py | 58 +++-- hazm/TreebankReader.py | 508 +++++++++++++++++++------------------ hazm/VerbValencyReader.py | 41 +-- hazm/WikiExtractor.py | 212 ++++++++-------- hazm/WikipediaReader.py | 67 ++--- hazm/WordTokenizer.py | 131 +++++----- hazm/utils.py | 13 +- 24 files changed, 1878 insertions(+), 1766 deletions(-) diff --git a/hazm/BijankhanReader.py b/hazm/BijankhanReader.py index 8b55216c..e5512219 100644 --- a/hazm/BijankhanReader.py +++ b/hazm/BijankhanReader.py @@ -1,47 +1,55 @@ # coding: utf-8 from __future__ import unicode_literals -import re, codecs + +import codecs + from .Normalizer import * from .PeykareReader import join_verb_parts -default_pos_map = {'ADJ': 'ADJ', 'ADJ_CMPR': 'ADJ', 'ADJ_INO': 'ADJ', 'ADJ_ORD': 'ADJ', 'ADJ_SIM': 'ADJ', 'ADJ_SUP': 'ADJ', 'ADV': 'ADV', 'ADV_EXM': 'ADV', 'ADV_I': 'ADV', 'ADV_NEGG': 'ADV', 'ADV_NI': 'ADV', 'ADV_TIME': 'ADV', 'AR': 'AR', 'CON': 'CONJ', 'DEFAULT': 'DEFAULT', 'DELM': 'PUNC', 'DET': 'PREP', 'IF': 'IF', 'INT': 'INT', 'MORP': 'MORP', 'MQUA': 'MQUA', 'MS': 'MS', 'N_PL': 'N', 'N_SING': 'N', 'NN': 'NN', 'NP': 'NP', 'OH': 'OH', 'OHH': 'OHH', 'P': 'PREP', 'PP': 'PP', 'PRO': 'PR', 'PS': 'PS', 'QUA': 'QUA', 'SPEC': 'SPEC', 'V_AUX': 'V', 'V_IMP': 'V', 'V_PA': 'V', 'V_PRE': 'V', 'V_PRS': 'V', 'V_SUB': 'V'} +default_pos_map = {'ADJ': 'ADJ', 'ADJ_CMPR': 'ADJ', 'ADJ_INO': 'ADJ', 'ADJ_ORD': 'ADJ', 'ADJ_SIM': 'ADJ', + 'ADJ_SUP': 'ADJ', 'ADV': 'ADV', 'ADV_EXM': 'ADV', 'ADV_I': 'ADV', 'ADV_NEGG': 'ADV', 'ADV_NI': 'ADV', + 'ADV_TIME': 'ADV', 'AR': 'AR', 'CON': 'CONJ', 'DEFAULT': 'DEFAULT', 'DELM': 'PUNC', 'DET': 'PREP', + 'IF': 'IF', 'INT': 'INT', 'MORP': 'MORP', 'MQUA': 'MQUA', 'MS': 'MS', 'N_PL': 'N', 'N_SING': 'N', + 'NN': 'NN', 'NP': 'NP', 'OH': 'OH', 'OHH': 'OHH', 'P': 'PREP', 'PP': 'PP', 'PRO': 'PR', 'PS': 'PS', + 'QUA': 'QUA', 'SPEC': 'SPEC', 'V_AUX': 'V', 'V_IMP': 'V', 'V_PA': 'V', 'V_PRE': 'V', 'V_PRS': 'V', + 'V_SUB': 'V'} class BijankhanReader(): - """ - interfaces [Bijankhan Corpus](http://ece.ut.ac.ir/dbrg/bijankhan/Corpus/BijanKhan_Corpus_Processed.zip) that you must download and extract it. 
- - >>> bijankhan = BijankhanReader(bijankhan_file='corpora/bijankhan.txt') - >>> next(bijankhan.sents()) - [('اولین', 'ADJ'), ('سیاره', 'N'), ('خارج', 'ADJ'), ('از', 'PREP'), ('منظومه', 'N'), ('شمسی', 'ADJ'), ('دیده_شد', 'V'), ('.', 'PUNC')] - """ - - def __init__(self, bijankhan_file, joined_verb_parts=True, pos_map=default_pos_map): - self._bijankhan_file = bijankhan_file - self._joined_verb_parts = joined_verb_parts - self._pos_map = pos_map - self._normalizer = Normalizer(punctuation_spacing=False) - - def _sentences(self): - sentence = [] - for line in codecs.open(self._bijankhan_file, encoding='utf-8'): - parts = re.split(' +', line.strip()) - if len(parts) == 2: - word, tag = parts - if word not in ('#', '*'): - word = self._normalizer.normalize(word) - sentence.append((word if word else '_', tag)) - if tag == 'DELM' and word in ('#', '*', '.', '؟', '!') : - if len(sentence): - yield sentence - sentence = [] - - def sents(self): - map_poses = lambda item: (item[0], self._pos_map.get(item[1], item[1])) - - for sentence in self._sentences(): - if self._joined_verb_parts: - sentence = join_verb_parts(sentence) - - yield list(map(map_poses, sentence)) + """ + interfaces [Bijankhan Corpus](http://ece.ut.ac.ir/dbrg/bijankhan/Corpus/BijanKhan_Corpus_Processed.zip) that you must download and extract it. + + >>> bijankhan = BijankhanReader(bijankhan_file='corpora/bijankhan.txt') + >>> next(bijankhan.sents()) + [('اولین', 'ADJ'), ('سیاره', 'N'), ('خارج', 'ADJ'), ('از', 'PREP'), ('منظومه', 'N'), ('شمسی', 'ADJ'), ('دیده_شد', 'V'), ('.', 'PUNC')] + """ + + def __init__(self, bijankhan_file, joined_verb_parts=True, pos_map=default_pos_map): + self._bijankhan_file = bijankhan_file + self._joined_verb_parts = joined_verb_parts + self._pos_map = pos_map + self._normalizer = Normalizer(punctuation_spacing=False) + + def _sentences(self): + sentence = [] + for line in codecs.open(self._bijankhan_file, encoding='utf-8'): + parts = re.split(' +', line.strip()) + if len(parts) == 2: + word, tag = parts + if word not in ('#', '*'): + word = self._normalizer.normalize(word) + sentence.append((word if word else '_', tag)) + if tag == 'DELM' and word in ('#', '*', '.', '؟', '!'): + if len(sentence): + yield sentence + sentence = [] + + def sents(self): + map_poses = lambda item: (item[0], self._pos_map.get(item[1], item[1])) + + for sentence in self._sentences(): + if self._joined_verb_parts: + sentence = join_verb_parts(sentence) + + yield list(map(map_poses, sentence)) diff --git a/hazm/Chunker.py b/hazm/Chunker.py index 66c96c13..0b01ca61 100755 --- a/hazm/Chunker.py +++ b/hazm/Chunker.py @@ -1,58 +1,60 @@ # coding: utf-8 from __future__ import unicode_literals + from nltk.chunk import ChunkParserI, RegexpParser, tree2conlltags, conlltags2tree + from .SequenceTagger import IOBTagger def tree2brackets(tree): - str, tag = '', '' - for item in tree2conlltags(tree): - if item[2][0] in {'B', 'O'} and tag: - str += tag +'] ' - tag = '' + str, tag = '', '' + for item in tree2conlltags(tree): + if item[2][0] in {'B', 'O'} and tag: + str += tag + '] ' + tag = '' - if item[2][0] == 'B': - tag = item[2].split('-')[1] - str += '[' - str += item[0] +' ' + if item[2][0] == 'B': + tag = item[2].split('-')[1] + str += '[' + str += item[0] + ' ' - if tag: - str += tag +'] ' + if tag: + str += tag + '] ' - return str.strip() + return str.strip() class Chunker(IOBTagger, ChunkParserI): - """ - >>> chunker = Chunker(model='resources/chunker.model') - >>> tree2brackets(chunker.parse([('نامه', 'Ne'), ('ایشان', 'PRO'), ('را', 
'POSTP'), ('دریافت', 'N'), ('داشتم', 'V'), ('.', 'PUNC')])) - '[نامه ایشان NP] [را POSTP] [دریافت داشتم VP] .' - """ + """ + >>> chunker = Chunker(model='resources/chunker.model') + >>> tree2brackets(chunker.parse([('نامه', 'Ne'), ('ایشان', 'PRO'), ('را', 'POSTP'), ('دریافت', 'N'), ('داشتم', 'V'), ('.', 'PUNC')])) + '[نامه ایشان NP] [را POSTP] [دریافت داشتم VP] .' + """ - def train(self, trees): - super(Chunker, self).train(map(tree2conlltags, trees)) + def train(self, trees): + super(Chunker, self).train(map(tree2conlltags, trees)) - def parse(self, sentence): - return next(self.parse_sents([sentence])) + def parse(self, sentence): + return next(self.parse_sents([sentence])) - def parse_sents(self, sentences): - for conlltagged in super(Chunker, self).tag_sents(sentences): - yield conlltags2tree(conlltagged) + def parse_sents(self, sentences): + for conlltagged in super(Chunker, self).tag_sents(sentences): + yield conlltags2tree(conlltagged) - def evaluate(self, gold): - return ChunkParserI.evaluate(self, gold) + def evaluate(self, gold): + return ChunkParserI.evaluate(self, gold) class RuleBasedChunker(RegexpParser): - """ - >>> chunker = RuleBasedChunker() - >>> tree2brackets(chunker.parse([('نامه', 'Ne'), ('۱۰', 'NUMe'), ('فوریه', 'Ne'), ('شما', 'PRO'), ('را', 'POSTP'), ('دریافت', 'N'), ('داشتم', 'V'), ('.', 'PUNC')])) - '[نامه ۱۰ فوریه شما NP] [را POSTP] [دریافت داشتم VP] .' - """ + """ + >>> chunker = RuleBasedChunker() + >>> tree2brackets(chunker.parse([('نامه', 'Ne'), ('۱۰', 'NUMe'), ('فوریه', 'Ne'), ('شما', 'PRO'), ('را', 'POSTP'), ('دریافت', 'N'), ('داشتم', 'V'), ('.', 'PUNC')])) + '[نامه ۱۰ فوریه شما NP] [را POSTP] [دریافت داشتم VP] .' + """ - def __init__(self): - grammar = r""" + def __init__(self): + grammar = r""" NP:

{} @@ -82,4 +84,4 @@ def __init__(self): """ - super(RuleBasedChunker, self).__init__(grammar=grammar) + super(RuleBasedChunker, self).__init__(grammar=grammar) diff --git a/hazm/DadeganReader.py b/hazm/DadeganReader.py index 32715bf3..9f24696d 100755 --- a/hazm/DadeganReader.py +++ b/hazm/DadeganReader.py @@ -1,22 +1,25 @@ # coding: utf-8 from __future__ import unicode_literals + import codecs + from nltk.parse import DependencyGraph from nltk.tree import Tree def coarse_pos_e(tags): - """ - Coarse POS tags of Dadegan corpus: - N: Noun, V: Verb, ADJ: Adjective, ADV: Adverb, PR: Pronoun, PREP: Preposition, POSTP: Postposition, CONJ: Conjunction, PUNC: Punctuation, ADR: Address Term, IDEN: Title, PART: Particle, POSNUM: Post-noun Modifier, PREM: Pre-modifier, PRENUM: Pre-noun Numeral, PSUS: Pseudo-sentence, SUBR: Subordinating Clause + """ + Coarse POS tags of Dadegan corpus: + N: Noun, V: Verb, ADJ: Adjective, ADV: Adverb, PR: Pronoun, PREP: Preposition, POSTP: Postposition, CONJ: Conjunction, PUNC: Punctuation, ADR: Address Term, IDEN: Title, PART: Particle, POSNUM: Post-noun Modifier, PREM: Pre-modifier, PRENUM: Pre-noun Numeral, PSUS: Pseudo-sentence, SUBR: Subordinating Clause - >>> coarse_pos_e(['N', 'IANM']) - 'N' - """ + >>> coarse_pos_e(['N', 'IANM']) + 'N' + """ - map = {'N': 'N', 'V': 'V', 'ADJ': 'AJ', 'ADV': 'ADV', 'PR': 'PRO', 'PREM': 'DET', 'PREP': 'P', 'POSTP': 'POSTP', 'PRENUM': 'NUM', 'CONJ': 'CONJ', 'PUNC': 'PUNC', 'SUBR': 'CONJ'} - return map.get(tags[0], 'X') + ('e' if 'EZ' in tags else '') + map = {'N': 'N', 'V': 'V', 'ADJ': 'AJ', 'ADV': 'ADV', 'PR': 'PRO', 'PREM': 'DET', 'PREP': 'P', 'POSTP': 'POSTP', + 'PRENUM': 'NUM', 'CONJ': 'CONJ', 'PUNC': 'PUNC', 'SUBR': 'CONJ'} + return map.get(tags[0], 'X') + ('e' if 'EZ' in tags else '') word_nodes = lambda tree: sorted(tree.nodes.values(), key=lambda node: node['address'])[1:] @@ -24,229 +27,233 @@ def coarse_pos_e(tags): class DadeganReader(): - """ - interfaces [Persian Dependency Treebank](http://dadegan.ir/perdt/download) - - >>> dadegan = DadeganReader(conll_file='corpora/dadegan.conll') - >>> next(dadegan.sents()) - [('این', 'DET'), ('میهمانی', 'N'), ('به', 'P'), ('منظور', 'Ne'), ('آشنایی', 'Ne'), ('هم‌تیمی‌های', 'Ne'), ('او', 'PRO'), ('با', 'P'), ('غذاهای', 'Ne'), ('ایرانی', 'AJ'), ('ترتیب', 'N'), ('داده_شد', 'V'), ('.', 'PUNC')] - - >>> from hazm.Chunker import tree2brackets - >>> tree2brackets(next(dadegan.chunked_trees())) - '[این میهمانی NP] [به PP] [منظور آشنایی هم‌تیمی‌های او NP] [با PP] [غذاهای ایرانی NP] [ترتیب داده_شد VP] .' 
- """ - - def __init__(self, conll_file, pos_map=coarse_pos_e): - self._conll_file = conll_file - self._pos_map = pos_map if pos_map else lambda tags: ','.join(tags) - - def _sentences(self): - with codecs.open(self._conll_file, encoding='utf8') as conll_file: - text = conll_file.read() - - # refine text - text = text.replace('‌‌', '‌').replace('\t‌', '\t').replace('‌\t', '\t').replace('\t ', '\t').replace(' \t', '\t').replace( - '\r', '').replace('\u2029', '‌') - - for item in text.replace(' ', '_').split('\n\n'): - if item.strip(): - yield item - - def trees(self): - for sentence in self._sentences(): - tree = DependencyGraph(sentence) - - for node in word_nodes(tree): - node['mtag'] = [node['ctag'], node['tag']] - - if 'ezafe' in node['feats']: - node['mtag'].append('EZ') - - node['mtag'] = self._pos_map(node['mtag']) - - yield tree - - def sents(self): - for tree in self.trees(): - yield [(node['word'], node['mtag']) for node in word_nodes(tree)] - - def chunked_trees(self): - for tree in self.trees(): - chunks = [] - for node in word_nodes(tree): - n = node['address'] - item = (node['word'], node['mtag']) - appended = False - if node['ctag'] in {'PREP', 'POSTP'}: - for d in node_deps(node): - label = 'PP' - if node['ctag'] == 'POSTP': - label = 'POSTP' - if d == n - 1 and type(chunks[-1]) == Tree and chunks[-1].label() == label: - chunks[-1].append(item) - appended = True - if node['head'] == n - 1 and len(chunks) > 0 and type(chunks[-1]) == Tree and chunks[ - -1].label() == label: - chunks[-1].append(item) - appended = True - if not appended: - chunks.append(Tree(label, [item])) - elif node['ctag'] in {'PUNC', 'CONJ', 'SUBR', 'PART'}: - if item[0] in {"'", '"', '(', ')', '{', '}', '[', ']', '-', '#', '«', '»'} and len(chunks) > 0 and type(chunks[-1]) == Tree: - for l in chunks[-1].leaves(): - if l[1] == item[1]: - chunks[-1].append(item) - appended = True - break - if appended is not True: - chunks.append(item) - elif node['ctag'] in {'N', 'PREM', 'ADJ', 'PR', 'ADR', 'PRENUM', 'IDEN', 'POSNUM', 'SADV'}: - if node['rel'] in {'MOZ', 'NPOSTMOD'}: - if len(chunks) > 0: - if type(chunks[-1]) == Tree: - j = n - len(chunks[-1].leaves()) - chunks[-1].append(item) - else: - j = n - 1 - treeNode = Tree('NP', [chunks.pop(), item]) - chunks.append(treeNode) - while j > node['head']: - leaves = chunks.pop().leaves() - if len(chunks) < 1: - chunks.append(Tree('NP', leaves)) - j -= 1 - elif type(chunks[-1]) == Tree: - j -= len(chunks[-1]) - for l in leaves: - chunks[-1].append(l) - else: - leaves.insert(0, chunks.pop()) - chunks.append(Tree('NP', leaves)) - j -= 1 - continue - elif node['rel'] == 'POSDEP' and tree.nodes[node['head']]['rel'] in {'NCONJ', 'AJCONJ'}: - conj = tree.nodes[node['head']] - if tree.nodes[conj['head']]['rel'] in {'MOZ', 'NPOSTMOD', 'AJCONJ', 'POSDEP'}: - label = 'NP' - leaves = [item] - j = n - 1 - while j >= conj['head']: - if type(chunks[-1]) is Tree: - j -= len(chunks[-1].leaves()) - label = chunks[-1].label() - leaves = chunks.pop().leaves() + leaves - else: - leaves.insert(0, chunks.pop()) - j -= 1 - chunks.append(Tree(label, leaves)) - appended = True - elif node['head'] == n - 1 and len(chunks) > 0 and type(chunks[-1]) == Tree and not chunks[ - -1].label() == 'PP': - chunks[-1].append(item) - appended = True - elif node['rel'] == 'AJCONJ' and tree.nodes[node['head']]['rel'] in {'NPOSTMOD', 'AJCONJ'}: - np_nodes = [item] - label = 'ADJP' - i = n - node['head'] - while i > 0: - if type(chunks[-1]) == Tree: - label = chunks[-1].label() - leaves = chunks.pop().leaves() - 
i -= len(leaves) - np_nodes = leaves + np_nodes - else: - i -= 1 - np_nodes.insert(0, chunks.pop()) - chunks.append(Tree(label, np_nodes)) - appended = True - elif node['ctag'] == 'ADJ' and node['rel'] == 'POSDEP' and tree.nodes[node['head']]['ctag'] != 'CONJ': - np_nodes = [item] - i = n - node['head'] - while i > 0: - label = 'ADJP' - if type(chunks[-1]) == Tree: - label = chunks[-1].label() - leaves = chunks.pop().leaves() - i -= len(leaves) - np_nodes = leaves + np_nodes - else: - i -= 1 - np_nodes.insert(0, chunks.pop()) - chunks.append(Tree(label, np_nodes)) - appended = True - for d in node_deps(node): - if d == n - 1 and type(chunks[-1]) == Tree and chunks[ - -1].label() != 'PP' and appended is not True: - label = chunks[-1].label() - if node['rel'] == 'ADV': - label = 'ADVP' - elif label in {'ADJP', 'ADVP'}: - if node['ctag'] == 'N': - label = 'NP' - elif node['ctag'] == 'ADJ': - label = 'ADJP' - leaves = chunks.pop().leaves() - leaves.append(item) - chunks.append(Tree(label, leaves)) - appended = True - elif tree.nodes[d]['rel'] == 'NPREMOD' and appended is not True: - np_nodes = [item] - i = n - d - while i > 0: - if type(chunks[-1]) == Tree: - leaves = chunks.pop().leaves() - i -= len(leaves) - np_nodes = leaves + np_nodes - else: - i -= 1 - np_nodes.insert(0, chunks.pop()) - chunks.append(Tree('NP', np_nodes)) - appended = True - if not appended: - label = 'NP' - if node['ctag'] == 'ADJ': - label = 'ADJP' - elif node['rel'] == 'ADV': - label = 'ADVP' - chunks.append(Tree(label, [item])) - elif node['ctag'] in {'V'}: - appended = False - for d in node_deps(node): - if d == n - 1 and type(chunks[-1]) == Tree and tree.nodes[d]['rel'] in {'NVE', 'ENC'} and appended is not True: - leaves = chunks.pop().leaves() - leaves.append(item) - chunks.append(Tree('VP', leaves)) - appended = True - elif tree.nodes[d]['rel'] in {'VPRT', 'NVE'}: - vp_nodes = [item] - i = n - d - while i > 0: - if type(chunks[-1]) == Tree: - leaves = chunks.pop().leaves() - i -= len(leaves) - vp_nodes = leaves + vp_nodes - else: - i -= 1 - vp_nodes.insert(0, chunks.pop()) - chunks.append(Tree('VP', vp_nodes)) - appended = True - break - if not appended: - chunks.append(Tree('VP', [item])) - elif node['ctag'] in {'PSUS'}: - if node['rel'] == 'ADV': - chunks.append(Tree('ADVP', [item])) - else: - chunks.append(Tree('VP', [item])) - elif node['ctag'] in {'ADV', 'SADV'}: - appended = False - for d in node_deps(node): - if d == n - 1 and type(chunks[-1]) == Tree: - leaves = chunks.pop().leaves() - leaves.append(item) - chunks.append(Tree('ADVP', leaves)) - appended = True - if not appended: - chunks.append(Tree('ADVP', [item])) - - yield Tree('S', chunks) + """ + interfaces [Persian Dependency Treebank](http://dadegan.ir/perdt/download) + + >>> dadegan = DadeganReader(conll_file='corpora/dadegan.conll') + >>> next(dadegan.sents()) + [('این', 'DET'), ('میهمانی', 'N'), ('به', 'P'), ('منظور', 'Ne'), ('آشنایی', 'Ne'), ('هم‌تیمی‌های', 'Ne'), ('او', 'PRO'), ('با', 'P'), ('غذاهای', 'Ne'), ('ایرانی', 'AJ'), ('ترتیب', 'N'), ('داده_شد', 'V'), ('.', 'PUNC')] + + >>> from hazm.Chunker import tree2brackets + >>> tree2brackets(next(dadegan.chunked_trees())) + '[این میهمانی NP] [به PP] [منظور آشنایی هم‌تیمی‌های او NP] [با PP] [غذاهای ایرانی NP] [ترتیب داده_شد VP] .' 
+ """ + + def __init__(self, conll_file, pos_map=coarse_pos_e): + self._conll_file = conll_file + self._pos_map = pos_map if pos_map else lambda tags: ','.join(tags) + + def _sentences(self): + with codecs.open(self._conll_file, encoding='utf8') as conll_file: + text = conll_file.read() + + # refine text + text = text.replace('‌‌', '‌').replace('\t‌', '\t').replace('‌\t', '\t').replace('\t ', '\t').replace(' \t', + '\t').replace( + '\r', '').replace('\u2029', '‌') + + for item in text.replace(' ', '_').split('\n\n'): + if item.strip(): + yield item + + def trees(self): + for sentence in self._sentences(): + tree = DependencyGraph(sentence) + + for node in word_nodes(tree): + node['mtag'] = [node['ctag'], node['tag']] + + if 'ezafe' in node['feats']: + node['mtag'].append('EZ') + + node['mtag'] = self._pos_map(node['mtag']) + + yield tree + + def sents(self): + for tree in self.trees(): + yield [(node['word'], node['mtag']) for node in word_nodes(tree)] + + def chunked_trees(self): + for tree in self.trees(): + chunks = [] + for node in word_nodes(tree): + n = node['address'] + item = (node['word'], node['mtag']) + appended = False + if node['ctag'] in {'PREP', 'POSTP'}: + for d in node_deps(node): + label = 'PP' + if node['ctag'] == 'POSTP': + label = 'POSTP' + if d == n - 1 and type(chunks[-1]) == Tree and chunks[-1].label() == label: + chunks[-1].append(item) + appended = True + if node['head'] == n - 1 and len(chunks) > 0 and type(chunks[-1]) == Tree and chunks[ + -1].label() == label: + chunks[-1].append(item) + appended = True + if not appended: + chunks.append(Tree(label, [item])) + elif node['ctag'] in {'PUNC', 'CONJ', 'SUBR', 'PART'}: + if item[0] in {"'", '"', '(', ')', '{', '}', '[', ']', '-', '#', '«', '»'} and len( + chunks) > 0 and type(chunks[-1]) == Tree: + for l in chunks[-1].leaves(): + if l[1] == item[1]: + chunks[-1].append(item) + appended = True + break + if appended is not True: + chunks.append(item) + elif node['ctag'] in {'N', 'PREM', 'ADJ', 'PR', 'ADR', 'PRENUM', 'IDEN', 'POSNUM', 'SADV'}: + if node['rel'] in {'MOZ', 'NPOSTMOD'}: + if len(chunks) > 0: + if type(chunks[-1]) == Tree: + j = n - len(chunks[-1].leaves()) + chunks[-1].append(item) + else: + j = n - 1 + treeNode = Tree('NP', [chunks.pop(), item]) + chunks.append(treeNode) + while j > node['head']: + leaves = chunks.pop().leaves() + if len(chunks) < 1: + chunks.append(Tree('NP', leaves)) + j -= 1 + elif type(chunks[-1]) == Tree: + j -= len(chunks[-1]) + for l in leaves: + chunks[-1].append(l) + else: + leaves.insert(0, chunks.pop()) + chunks.append(Tree('NP', leaves)) + j -= 1 + continue + elif node['rel'] == 'POSDEP' and tree.nodes[node['head']]['rel'] in {'NCONJ', 'AJCONJ'}: + conj = tree.nodes[node['head']] + if tree.nodes[conj['head']]['rel'] in {'MOZ', 'NPOSTMOD', 'AJCONJ', 'POSDEP'}: + label = 'NP' + leaves = [item] + j = n - 1 + while j >= conj['head']: + if type(chunks[-1]) is Tree: + j -= len(chunks[-1].leaves()) + label = chunks[-1].label() + leaves = chunks.pop().leaves() + leaves + else: + leaves.insert(0, chunks.pop()) + j -= 1 + chunks.append(Tree(label, leaves)) + appended = True + elif node['head'] == n - 1 and len(chunks) > 0 and type(chunks[-1]) == Tree and not chunks[ + -1].label() == 'PP': + chunks[-1].append(item) + appended = True + elif node['rel'] == 'AJCONJ' and tree.nodes[node['head']]['rel'] in {'NPOSTMOD', 'AJCONJ'}: + np_nodes = [item] + label = 'ADJP' + i = n - node['head'] + while i > 0: + if type(chunks[-1]) == Tree: + label = chunks[-1].label() + leaves = 
chunks.pop().leaves() + i -= len(leaves) + np_nodes = leaves + np_nodes + else: + i -= 1 + np_nodes.insert(0, chunks.pop()) + chunks.append(Tree(label, np_nodes)) + appended = True + elif node['ctag'] == 'ADJ' and node['rel'] == 'POSDEP' and tree.nodes[node['head']][ + 'ctag'] != 'CONJ': + np_nodes = [item] + i = n - node['head'] + while i > 0: + label = 'ADJP' + if type(chunks[-1]) == Tree: + label = chunks[-1].label() + leaves = chunks.pop().leaves() + i -= len(leaves) + np_nodes = leaves + np_nodes + else: + i -= 1 + np_nodes.insert(0, chunks.pop()) + chunks.append(Tree(label, np_nodes)) + appended = True + for d in node_deps(node): + if d == n - 1 and type(chunks[-1]) == Tree and chunks[ + -1].label() != 'PP' and appended is not True: + label = chunks[-1].label() + if node['rel'] == 'ADV': + label = 'ADVP' + elif label in {'ADJP', 'ADVP'}: + if node['ctag'] == 'N': + label = 'NP' + elif node['ctag'] == 'ADJ': + label = 'ADJP' + leaves = chunks.pop().leaves() + leaves.append(item) + chunks.append(Tree(label, leaves)) + appended = True + elif tree.nodes[d]['rel'] == 'NPREMOD' and appended is not True: + np_nodes = [item] + i = n - d + while i > 0: + if type(chunks[-1]) == Tree: + leaves = chunks.pop().leaves() + i -= len(leaves) + np_nodes = leaves + np_nodes + else: + i -= 1 + np_nodes.insert(0, chunks.pop()) + chunks.append(Tree('NP', np_nodes)) + appended = True + if not appended: + label = 'NP' + if node['ctag'] == 'ADJ': + label = 'ADJP' + elif node['rel'] == 'ADV': + label = 'ADVP' + chunks.append(Tree(label, [item])) + elif node['ctag'] in {'V'}: + appended = False + for d in node_deps(node): + if d == n - 1 and type(chunks[-1]) == Tree and tree.nodes[d]['rel'] in {'NVE', + 'ENC'} and appended is not True: + leaves = chunks.pop().leaves() + leaves.append(item) + chunks.append(Tree('VP', leaves)) + appended = True + elif tree.nodes[d]['rel'] in {'VPRT', 'NVE'}: + vp_nodes = [item] + i = n - d + while i > 0: + if type(chunks[-1]) == Tree: + leaves = chunks.pop().leaves() + i -= len(leaves) + vp_nodes = leaves + vp_nodes + else: + i -= 1 + vp_nodes.insert(0, chunks.pop()) + chunks.append(Tree('VP', vp_nodes)) + appended = True + break + if not appended: + chunks.append(Tree('VP', [item])) + elif node['ctag'] in {'PSUS'}: + if node['rel'] == 'ADV': + chunks.append(Tree('ADVP', [item])) + else: + chunks.append(Tree('VP', [item])) + elif node['ctag'] in {'ADV', 'SADV'}: + appended = False + for d in node_deps(node): + if d == n - 1 and type(chunks[-1]) == Tree: + leaves = chunks.pop().leaves() + leaves.append(item) + chunks.append(Tree('ADVP', leaves)) + appended = True + if not appended: + chunks.append(Tree('ADVP', [item])) + + yield Tree('S', chunks) diff --git a/hazm/DependencyParser.py b/hazm/DependencyParser.py index a60d413f..7b1513b2 100644 --- a/hazm/DependencyParser.py +++ b/hazm/DependencyParser.py @@ -1,102 +1,113 @@ # coding: utf-8 from __future__ import print_function, unicode_literals -import os, codecs, tempfile + +import codecs +import os +import tempfile + from nltk.parse import DependencyGraph from nltk.parse.api import ParserI from nltk.parse.malt import MaltParser class MaltParser(MaltParser): - """ - interfaces [MaltParser](http://www.maltparser.org/) - """ - - def __init__(self, tagger, lemmatizer, working_dir='resources', model_file='langModel.mco'): - self.tagger = tagger - self.working_dir = working_dir - self.mco = model_file - self._malt_bin = os.path.join(working_dir, 'malt.jar') - self.lemmatize = lemmatizer.lemmatize if lemmatizer else lambda w, t: '_' - - 
def parse_sents(self, sentences, verbose=False): - tagged_sentences = self.tagger.tag_sents(sentences) - return self.parse_tagged_sents(tagged_sentences, verbose) - - def parse_tagged_sents(self, sentences, verbose=False): - input_file = tempfile.NamedTemporaryFile(prefix='malt_input.conll', dir=self.working_dir, delete=False) - output_file = tempfile.NamedTemporaryFile(prefix='malt_output.conll', dir=self.working_dir, delete=False) - - try: - for sentence in sentences: - for i, (word, tag) in enumerate(sentence, start=1): - word = word.strip() - if not word: - word = '_' - input_file.write(('\t'.join([str(i), word.replace(' ', '_'), self.lemmatize(word, tag).replace(' ', '_'), tag, tag, '_', '0', 'ROOT', '_', '_', '\n'])).encode('utf8')) - input_file.write('\n\n'.encode('utf8')) - input_file.close() - - cmd = ['java', '-jar', self._malt_bin, '-w', self.working_dir, '-c', self.mco, '-i', input_file.name, '-o', output_file.name, '-m', 'parse'] - if self._execute(cmd, verbose) != 0: - raise Exception("MaltParser parsing failed: %s" % (' '.join(cmd))) - - return (DependencyGraph(item) for item in codecs.open(output_file.name, encoding='utf8').read().split('\n\n') if item.strip()) - - finally: - input_file.close() - os.remove(input_file.name) - output_file.close() - os.remove(output_file.name) + """ + interfaces [MaltParser](http://www.maltparser.org/) + """ + + def __init__(self, tagger, lemmatizer, working_dir='resources', model_file='langModel.mco'): + self.tagger = tagger + self.working_dir = working_dir + self.mco = model_file + self._malt_bin = os.path.join(working_dir, 'malt.jar') + self.lemmatize = lemmatizer.lemmatize if lemmatizer else lambda w, t: '_' + + def parse_sents(self, sentences, verbose=False): + tagged_sentences = self.tagger.tag_sents(sentences) + return self.parse_tagged_sents(tagged_sentences, verbose) + + def parse_tagged_sents(self, sentences, verbose=False): + input_file = tempfile.NamedTemporaryFile(prefix='malt_input.conll', dir=self.working_dir, delete=False) + output_file = tempfile.NamedTemporaryFile(prefix='malt_output.conll', dir=self.working_dir, delete=False) + + try: + for sentence in sentences: + for i, (word, tag) in enumerate(sentence, start=1): + word = word.strip() + if not word: + word = '_' + input_file.write(('\t'.join( + [str(i), word.replace(' ', '_'), self.lemmatize(word, tag).replace(' ', '_'), tag, tag, '_', + '0', 'ROOT', '_', '_', '\n'])).encode('utf8')) + input_file.write('\n\n'.encode('utf8')) + input_file.close() + + cmd = ['java', '-jar', self._malt_bin, '-w', self.working_dir, '-c', self.mco, '-i', input_file.name, '-o', + output_file.name, '-m', 'parse'] + if self._execute(cmd, verbose) != 0: + raise Exception("MaltParser parsing failed: %s" % (' '.join(cmd))) + + return (DependencyGraph(item) for item in + codecs.open(output_file.name, encoding='utf8').read().split('\n\n') if item.strip()) + + finally: + input_file.close() + os.remove(input_file.name) + output_file.close() + os.remove(output_file.name) class TurboParser(ParserI): - """ - interfaces [TurboParser](http://www.ark.cs.cmu.edu/TurboParser/) which you must manually install - """ - - def __init__(self, tagger, lemmatizer, model_file): - self.tagger = tagger - self.lemmatize = lemmatizer.lemmatize if lemmatizer else lambda w, t: '_' - - import turboparser - self._pturboparser = turboparser.PTurboParser() - self.interface = self._pturboparser.create_parser() - self.interface.load_parser_model(model_file) - - def parse_sents(self, sentences): - tagged_sentences = 
self.tagger.tag_sents(sentences) - return self.tagged_parse_sents(tagged_sentences) - - def tagged_parse_sents(self, sentences): - input_file = tempfile.NamedTemporaryFile(prefix='turbo_input.conll', dir='resources', delete=False) - output_file = tempfile.NamedTemporaryFile(prefix='turbo_output.conll', dir='resources', delete=False) - - try: - for sentence in sentences: - for i, (word, tag) in enumerate(sentence, start=1): - word = word.strip() - if not word: - word = '_' - input_file.write(('\t'.join([str(i), word.replace(' ', '_'), self.lemmatize(word, tag).replace(' ', '_'), tag, tag, '_', '0', 'ROOT', '_', '_', '\n'])).encode('utf8')) - input_file.write('\n'.encode('utf8')) - input_file.close() - - self.interface.parse(input_file.name, output_file.name) - - return (DependencyGraph(item, cell_extractor=lambda cells: cells[1:8]) for item in codecs.open(output_file.name, encoding='utf8').read().split('\n\n') if item.strip()) - - finally: - input_file.close() - os.remove(input_file.name) - output_file.close() - os.remove(output_file.name) + """ + interfaces [TurboParser](http://www.ark.cs.cmu.edu/TurboParser/) which you must manually install + """ + + def __init__(self, tagger, lemmatizer, model_file): + self.tagger = tagger + self.lemmatize = lemmatizer.lemmatize if lemmatizer else lambda w, t: '_' + + import turboparser + self._pturboparser = turboparser.PTurboParser() + self.interface = self._pturboparser.create_parser() + self.interface.load_parser_model(model_file) + + def parse_sents(self, sentences): + tagged_sentences = self.tagger.tag_sents(sentences) + return self.tagged_parse_sents(tagged_sentences) + + def tagged_parse_sents(self, sentences): + input_file = tempfile.NamedTemporaryFile(prefix='turbo_input.conll', dir='resources', delete=False) + output_file = tempfile.NamedTemporaryFile(prefix='turbo_output.conll', dir='resources', delete=False) + + try: + for sentence in sentences: + for i, (word, tag) in enumerate(sentence, start=1): + word = word.strip() + if not word: + word = '_' + input_file.write(('\t'.join( + [str(i), word.replace(' ', '_'), self.lemmatize(word, tag).replace(' ', '_'), tag, tag, '_', + '0', 'ROOT', '_', '_', '\n'])).encode('utf8')) + input_file.write('\n'.encode('utf8')) + input_file.close() + + self.interface.parse(input_file.name, output_file.name) + + return (DependencyGraph(item, cell_extractor=lambda cells: cells[1:8]) for item in + codecs.open(output_file.name, encoding='utf8').read().split('\n\n') if item.strip()) + + finally: + input_file.close() + os.remove(input_file.name) + output_file.close() + os.remove(output_file.name) class DependencyParser(MaltParser): - """ - >>> from hazm import POSTagger, Lemmatizer - >>> parser = DependencyParser(tagger=POSTagger(model='resources/postagger.model'), lemmatizer=Lemmatizer()) - >>> parser.parse(['من', 'به', 'مدرسه', 'رفته بودم', '.']).tree().pprint() - (رفته_بودم من (به مدرسه) .) - """ + """ + >>> from hazm import POSTagger, Lemmatizer + >>> parser = DependencyParser(tagger=POSTagger(model='resources/postagger.model'), lemmatizer=Lemmatizer()) + >>> parser.parse(['من', 'به', 'مدرسه', 'رفته بودم', '.']).tree().pprint() + (رفته_بودم من (به مدرسه) .) 
+ """ diff --git a/hazm/HamshahriReader.py b/hazm/HamshahriReader.py index 4f79d27e..d640ac1d 100644 --- a/hazm/HamshahriReader.py +++ b/hazm/HamshahriReader.py @@ -1,60 +1,72 @@ # coding: utf-8 from __future__ import print_function -import os, sys, re + +import os +import re +import sys from xml.dom import minidom class HamshahriReader(): - """ - interfaces [Hamshahri Corpus](http://dbrg.ut.ac.ir/Hamshahri/download.html#version2) that you must download and extract it. - - >>> hamshahri = HamshahriReader(root='corpora/hamshahri') - >>> next(hamshahri.docs())['id'] - 'HAM2-750403-001' - """ - - def __init__(self, root): - self._root = root - self._invalids = set(['hamshahri.dtd', 'HAM2-960622.xml', 'HAM2-960630.xml', 'HAM2-960701.xml', 'HAM2-960709.xml', 'HAM2-960710.xml', 'HAM2-960711.xml', 'HAM2-960817.xml', 'HAM2-960818.xml', 'HAM2-960819.xml', 'HAM2-960820.xml', 'HAM2-961019.xml', 'HAM2-961112.xml', 'HAM2-961113.xml', 'HAM2-961114.xml', 'HAM2-970414.xml', 'HAM2-970415.xml', 'HAM2-970612.xml', 'HAM2-970614.xml', 'HAM2-970710.xml', 'HAM2-970712.xml', 'HAM2-970713.xml', 'HAM2-970717.xml', 'HAM2-970719.xml', 'HAM2-980317.xml', 'HAM2-040820.xml', 'HAM2-040824.xml', 'HAM2-040825.xml', 'HAM2-040901.xml', 'HAM2-040917.xml', 'HAM2-040918.xml', 'HAM2-040920.xml', 'HAM2-041025.xml', 'HAM2-041026.xml', 'HAM2-041027.xml', 'HAM2-041230.xml', 'HAM2-041231.xml', 'HAM2-050101.xml', 'HAM2-050102.xml', 'HAM2-050223.xml', 'HAM2-050224.xml', 'HAM2-050406.xml', 'HAM2-050407.xml', 'HAM2-050416.xml']) - self._paragraph_pattern = re.compile(r'(\n.{0,50})(?=\n)') - - def docs(self): - for root, dirs, files in os.walk(self._root): - for name in sorted(files): - if name in self._invalids: - continue - - try: - elements = minidom.parse(os.path.join(root, name)) - for element in elements.getElementsByTagName('DOC'): - doc = {} - doc['id'] = element.getElementsByTagName('DOCID')[0].childNodes[0].data - doc['issue'] = element.getElementsByTagName('ISSUE')[0].childNodes[0].data - - for cat in element.getElementsByTagName('CAT'): - doc['categories_'+ cat.attributes['xml:lang'].value] = cat.childNodes[0].data.split('.') - - for date in element.getElementsByTagName('DATE'): - if date.attributes['calender'].value == 'Persian': - doc['date'] = date.childNodes[0].data - - elm = element.getElementsByTagName('TITLE')[0] - doc['title'] = elm.childNodes[1].data if len(elm.childNodes) > 1 else '' - - doc['text'] = '' - for item in element.getElementsByTagName('TEXT')[0].childNodes: - if item.nodeType == 4: # CDATA - doc['text'] += item.data - - # refine text - doc['text'] = self._paragraph_pattern.sub(r'\1\n', doc['text']).replace('\no ', '\n') - - yield doc - - except Exception as e: - print('error in reading', name, e, file=sys.stderr) - - def texts(self): - for doc in self.docs(): - yield doc['text'] + """ + interfaces [Hamshahri Corpus](http://dbrg.ut.ac.ir/Hamshahri/download.html#version2) that you must download and extract it. 
+ + >>> hamshahri = HamshahriReader(root='corpora/hamshahri') + >>> next(hamshahri.docs())['id'] + 'HAM2-750403-001' + """ + + def __init__(self, root): + self._root = root + self._invalids = set( + ['hamshahri.dtd', 'HAM2-960622.xml', 'HAM2-960630.xml', 'HAM2-960701.xml', 'HAM2-960709.xml', + 'HAM2-960710.xml', 'HAM2-960711.xml', 'HAM2-960817.xml', 'HAM2-960818.xml', 'HAM2-960819.xml', + 'HAM2-960820.xml', 'HAM2-961019.xml', 'HAM2-961112.xml', 'HAM2-961113.xml', 'HAM2-961114.xml', + 'HAM2-970414.xml', 'HAM2-970415.xml', 'HAM2-970612.xml', 'HAM2-970614.xml', 'HAM2-970710.xml', + 'HAM2-970712.xml', 'HAM2-970713.xml', 'HAM2-970717.xml', 'HAM2-970719.xml', 'HAM2-980317.xml', + 'HAM2-040820.xml', 'HAM2-040824.xml', 'HAM2-040825.xml', 'HAM2-040901.xml', 'HAM2-040917.xml', + 'HAM2-040918.xml', 'HAM2-040920.xml', 'HAM2-041025.xml', 'HAM2-041026.xml', 'HAM2-041027.xml', + 'HAM2-041230.xml', 'HAM2-041231.xml', 'HAM2-050101.xml', 'HAM2-050102.xml', 'HAM2-050223.xml', + 'HAM2-050224.xml', 'HAM2-050406.xml', 'HAM2-050407.xml', 'HAM2-050416.xml']) + self._paragraph_pattern = re.compile(r'(\n.{0,50})(?=\n)') + + def docs(self): + for root, dirs, files in os.walk(self._root): + for name in sorted(files): + if name in self._invalids: + continue + + try: + elements = minidom.parse(os.path.join(root, name)) + for element in elements.getElementsByTagName('DOC'): + doc = {} + doc['id'] = element.getElementsByTagName('DOCID')[0].childNodes[0].data + doc['issue'] = element.getElementsByTagName('ISSUE')[0].childNodes[0].data + + for cat in element.getElementsByTagName('CAT'): + doc['categories_' + cat.attributes['xml:lang'].value] = cat.childNodes[0].data.split('.') + + for date in element.getElementsByTagName('DATE'): + if date.attributes['calender'].value == 'Persian': + doc['date'] = date.childNodes[0].data + + elm = element.getElementsByTagName('TITLE')[0] + doc['title'] = elm.childNodes[1].data if len(elm.childNodes) > 1 else '' + + doc['text'] = '' + for item in element.getElementsByTagName('TEXT')[0].childNodes: + if item.nodeType == 4: # CDATA + doc['text'] += item.data + + # refine text + doc['text'] = self._paragraph_pattern.sub(r'\1\n', doc['text']).replace('\no ', '\n') + + yield doc + + except Exception as e: + print('error in reading', name, e, file=sys.stderr) + + def texts(self): + for doc in self.docs(): + yield doc['text'] diff --git a/hazm/InformalNormalizer.py b/hazm/InformalNormalizer.py index 1dc0159e..3d7f7b51 100644 --- a/hazm/InformalNormalizer.py +++ b/hazm/InformalNormalizer.py @@ -1,213 +1,213 @@ # coding: utf-8 from __future__ import unicode_literals -import codecs -from .utils import informal_verbs, informal_words, NUMBERS -from .Normalizer import Normalizer + from .Lemmatizer import Lemmatizer +from .Normalizer import Normalizer +from .SentenceTokenizer import * from .Stemmer import Stemmer from .WordTokenizer import * -from .SentenceTokenizer import * +from .utils import informal_verbs, informal_words, NUMBERS class InformalNormalizer(Normalizer): - def __init__(self, verb_file=informal_verbs, word_file=informal_words, seperation_flag=False, **kargs): - self.seperation_flag = seperation_flag - self.lemmatizer = Lemmatizer() - self.ilemmatizer = InformalLemmatizer() - self.stemmer = Stemmer() - super(InformalNormalizer, self).__init__(**kargs) - - def informal_to_formal_conjucation(i, f, flag): - iv = self.informal_conjugations(i) - fv = self.lemmatizer.conjugations(f) - res = {} - if flag: - for i, j in zip(iv, fv[48:]): - res[i] = j - if '‌' in i: - res[i.replace('‌', '')] = j - 
res[i.replace('‌', ' ')] = j - if i.endswith('ین'): - res[i[:-1] + 'د'] = j - else: - for i, j in zip(iv[8:], fv[56:]): - res[i] = j - if '‌' in i: - res[i.replace('‌', '')] = j - res[i.replace('‌', ' ')] = j - if i.endswith('ین'): - res[i[:-1] + 'د'] = j - - return res - - with codecs.open(verb_file, encoding='utf8') as vf: - self.iverb_map = {} - for f, i, flag in map(lambda x: x.strip().split(' ', 2), vf): - self.iverb_map.update( - informal_to_formal_conjucation(i, f, flag) - ) - - with codecs.open(word_file, encoding='utf8') as wf: - self.iword_map = dict( - map(lambda x: x.strip().split(' ', 1), wf) - ) - - self.words = set() - if self.seperation_flag: - self.words.update(self.iword_map.keys()) - self.words.update(self.iword_map.values()) - self.words.update(self.iverb_map.keys()) - self.words.update(self.iverb_map.values()) - self.words.update(self.lemmatizer.words) - self.words.update(self.lemmatizer.verbs.keys()) - self.words.update(self.lemmatizer.verbs.values()) - - def split_token_words(self, token): - - def shekan(token): - res = [''] - for i in token: - res[-1] += i - if i in set(['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و'] + list(NUMBERS)): - res.append('') - while '' in res: - res.remove('') - return res - - def perm(lst): - if len(lst) > 1: - up = perm(lst[1:]) - else: - return [lst] - res = [] - for i in up: - res.append([lst[0]] + i) - res.append([lst[0] + i[0]] + i[1:]) - res.sort(key=len) - return res - - token = re.sub(r'(.)\1{2,}', r'\1', token) - ps = perm(shekan(token)) - for c in ps: - if set(map(lambda x: self.ilemmatizer.lemmatize(x), c)).issubset(self.words): - return ' '.join(c) - return token - - - def normalized_word(self, word): - """ - >>> normalizer = InformalNormalizer() - >>> normalizer.normalized_word('می‌رم') - ['می‌روم', 'می‌رم'] - >>> normalizer = InformalNormalizer(seperation_flag=True) - >>> normalizer.normalized_word('صداوسیماجمهوری') - ['صداوسیما جمهوری', 'صداوسیماجمهوری'] - """ - - options = [] - if word in self.lemmatizer.words or word in self.lemmatizer.verbs: - pass - - elif word in self.iverb_map: - options.append(self.iverb_map[word]) - - elif word in self.iword_map: - options.append(self.iword_map[word]) - - elif word[:-2] in self.ilemmatizer.verbs and word.endswith('ین'): - options.append(word[:-1] + 'د') - - elif word.endswith("ن") and word[:-1] in self.ilemmatizer.verbs: - options.append(word + 'د') - - elif word[:-1] in self.ilemmatizer.verbs and word.endswith('ه') and word[:-1] not in self.lemmatizer.words: - options.append(self.iword_map.get(word[:-1], word[:-1]) + 'د') - - elif word not in self.ilemmatizer.verbs and word.endswith('ه') and word[:-1] in self.ilemmatizer.words: - options.append(self.iword_map.get(word[:-1], word[:-1]) + ' است') - - elif word not in self.ilemmatizer.verbs and word.endswith('ون') and self.lemmatizer.lemmatize(word[:-2] + 'ان') in self.ilemmatizer.words: - options.append(word[:-2] + 'ان') - - elif self.seperation_flag: - options.append(self.split_token_words(word)) - - options.append(word) - return options - - def normalize(self, text): - - sent_tokenizer = SentenceTokenizer() - word_tokenizer = WordTokenizer() - text = super(InformalNormalizer, self).normalize(text) - sents = [word_tokenizer.tokenize(sentence) for sentence in sent_tokenizer.tokenize(text)] - - return [[self.normalized_word(word) for word in sent] for sent in sents] - - def informal_conjugations(self, verb): - ends = ['م', 'ی', '', 'یم', 'ین', 'ن'] - present_simples = [verb + end for end in ends] - if verb.endswith('ا'): - present_simples[2] = 
verb + 'د' - else: - present_simples[2] = verb + 'ه' - present_not_simples = ['ن' + item for item in present_simples] - present_imperfects = ['می‌' + item for item in present_simples] - present_not_imperfects = ['ن' + item for item in present_imperfects] - present_subjunctives = [ - item if item.startswith('ب') else 'ب' + item for item in present_simples] - present_not_subjunctives = ['ن' + item for item in present_simples] - return present_simples + present_not_simples + \ - present_imperfects + present_not_imperfects + \ - present_subjunctives + present_not_subjunctives + def __init__(self, verb_file=informal_verbs, word_file=informal_words, seperation_flag=False, **kargs): + self.seperation_flag = seperation_flag + self.lemmatizer = Lemmatizer() + self.ilemmatizer = InformalLemmatizer() + self.stemmer = Stemmer() + super(InformalNormalizer, self).__init__(**kargs) + + def informal_to_formal_conjucation(i, f, flag): + iv = self.informal_conjugations(i) + fv = self.lemmatizer.conjugations(f) + res = {} + if flag: + for i, j in zip(iv, fv[48:]): + res[i] = j + if '‌' in i: + res[i.replace('‌', '')] = j + res[i.replace('‌', ' ')] = j + if i.endswith('ین'): + res[i[:-1] + 'د'] = j + else: + for i, j in zip(iv[8:], fv[56:]): + res[i] = j + if '‌' in i: + res[i.replace('‌', '')] = j + res[i.replace('‌', ' ')] = j + if i.endswith('ین'): + res[i[:-1] + 'د'] = j + + return res + + with codecs.open(verb_file, encoding='utf8') as vf: + self.iverb_map = {} + for f, i, flag in map(lambda x: x.strip().split(' ', 2), vf): + self.iverb_map.update( + informal_to_formal_conjucation(i, f, flag) + ) + + with codecs.open(word_file, encoding='utf8') as wf: + self.iword_map = dict( + map(lambda x: x.strip().split(' ', 1), wf) + ) + + self.words = set() + if self.seperation_flag: + self.words.update(self.iword_map.keys()) + self.words.update(self.iword_map.values()) + self.words.update(self.iverb_map.keys()) + self.words.update(self.iverb_map.values()) + self.words.update(self.lemmatizer.words) + self.words.update(self.lemmatizer.verbs.keys()) + self.words.update(self.lemmatizer.verbs.values()) + + def split_token_words(self, token): + + def shekan(token): + res = [''] + for i in token: + res[-1] += i + if i in set(['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و'] + list(NUMBERS)): + res.append('') + while '' in res: + res.remove('') + return res + + def perm(lst): + if len(lst) > 1: + up = perm(lst[1:]) + else: + return [lst] + res = [] + for i in up: + res.append([lst[0]] + i) + res.append([lst[0] + i[0]] + i[1:]) + res.sort(key=len) + return res + + token = re.sub(r'(.)\1{2,}', r'\1', token) + ps = perm(shekan(token)) + for c in ps: + if set(map(lambda x: self.ilemmatizer.lemmatize(x), c)).issubset(self.words): + return ' '.join(c) + return token + + def normalized_word(self, word): + """ + >>> normalizer = InformalNormalizer() + >>> normalizer.normalized_word('می‌رم') + ['می‌روم', 'می‌رم'] + >>> normalizer = InformalNormalizer(seperation_flag=True) + >>> normalizer.normalized_word('صداوسیماجمهوری') + ['صداوسیما جمهوری', 'صداوسیماجمهوری'] + """ + + options = [] + if word in self.lemmatizer.words or word in self.lemmatizer.verbs: + pass + + elif word in self.iverb_map: + options.append(self.iverb_map[word]) + + elif word in self.iword_map: + options.append(self.iword_map[word]) + + elif word[:-2] in self.ilemmatizer.verbs and word.endswith('ین'): + options.append(word[:-1] + 'د') + + elif word.endswith("ن") and word[:-1] in self.ilemmatizer.verbs: + options.append(word + 'د') + + elif word[:-1] in self.ilemmatizer.verbs 
and word.endswith('ه') and word[:-1] not in self.lemmatizer.words: + options.append(self.iword_map.get(word[:-1], word[:-1]) + 'د') + + elif word not in self.ilemmatizer.verbs and word.endswith('ه') and word[:-1] in self.ilemmatizer.words: + options.append(self.iword_map.get(word[:-1], word[:-1]) + ' است') + + elif word not in self.ilemmatizer.verbs and word.endswith('ون') and self.lemmatizer.lemmatize( + word[:-2] + 'ان') in self.ilemmatizer.words: + options.append(word[:-2] + 'ان') + + elif self.seperation_flag: + options.append(self.split_token_words(word)) + + options.append(word) + return options + + def normalize(self, text): + + sent_tokenizer = SentenceTokenizer() + word_tokenizer = WordTokenizer() + text = super(InformalNormalizer, self).normalize(text) + sents = [word_tokenizer.tokenize(sentence) for sentence in sent_tokenizer.tokenize(text)] + + return [[self.normalized_word(word) for word in sent] for sent in sents] + + def informal_conjugations(self, verb): + ends = ['م', 'ی', '', 'یم', 'ین', 'ن'] + present_simples = [verb + end for end in ends] + if verb.endswith('ا'): + present_simples[2] = verb + 'د' + else: + present_simples[2] = verb + 'ه' + present_not_simples = ['ن' + item for item in present_simples] + present_imperfects = ['می‌' + item for item in present_simples] + present_not_imperfects = ['ن' + item for item in present_imperfects] + present_subjunctives = [ + item if item.startswith('ب') else 'ب' + item for item in present_simples] + present_not_subjunctives = ['ن' + item for item in present_simples] + return present_simples + present_not_simples + \ + present_imperfects + present_not_imperfects + \ + present_subjunctives + present_not_subjunctives class InformalLemmatizer(Lemmatizer): - def __init__(self, **kargs): - super(InformalLemmatizer, self).__init__(**kargs) - - temp = [] - self.words = set(self.words.keys()) - for word in self.words: - if word.endswith("ً"): - temp.append(word[:-1]) - - self.words.update(temp) - - temp = {} - for verb in self.verbs: - if verb.endswith("د"): - temp[verb[:-1] + 'ن'] = self.verbs[verb] - - self.verbs.update(temp) - - with codecs.open(informal_verbs, encoding='utf8') as vf: - for f, i, flag in map(lambda x: x.strip().split(' ', 2), vf): - self.verbs.update(dict( - map(lambda x: (x, f), self.iconjugations(i)) - )) - - with codecs.open(informal_words, encoding='utf8') as wf: - self.words.update( - map(lambda x: x.strip().split(' ', 1)[0], wf) - ) - - def iconjugations(self, verb): - ends = ['م', 'ی', '', 'یم', 'ین', 'ن'] - present_simples = [verb + end for end in ends] - if verb.endswith('ا'): - present_simples[2] = verb + 'د' - else: - present_simples[2] = verb + 'ه' - present_not_simples = ['ن' + item for item in present_simples] - present_imperfects = ['می‌' + item for item in present_simples] - present_not_imperfects = ['ن' + item for item in present_imperfects] - present_subjunctives = [ - item if item.startswith('ب') else 'ب' + item for item in present_simples] - present_not_subjunctives = ['ن' + item for item in present_simples] - return present_simples + present_not_simples + \ - present_imperfects + present_not_imperfects + \ - present_subjunctives + present_not_subjunctives + def __init__(self, **kargs): + super(InformalLemmatizer, self).__init__(**kargs) + + temp = [] + self.words = set(self.words.keys()) + for word in self.words: + if word.endswith("ً"): + temp.append(word[:-1]) + + self.words.update(temp) + + temp = {} + for verb in self.verbs: + if verb.endswith("د"): + temp[verb[:-1] + 'ن'] = self.verbs[verb] + + 
self.verbs.update(temp) + + with codecs.open(informal_verbs, encoding='utf8') as vf: + for f, i, flag in map(lambda x: x.strip().split(' ', 2), vf): + self.verbs.update(dict( + map(lambda x: (x, f), self.iconjugations(i)) + )) + + with codecs.open(informal_words, encoding='utf8') as wf: + self.words.update( + map(lambda x: x.strip().split(' ', 1)[0], wf) + ) + + def iconjugations(self, verb): + ends = ['م', 'ی', '', 'یم', 'ین', 'ن'] + present_simples = [verb + end for end in ends] + if verb.endswith('ا'): + present_simples[2] = verb + 'د' + else: + present_simples[2] = verb + 'ه' + present_not_simples = ['ن' + item for item in present_simples] + present_imperfects = ['می‌' + item for item in present_simples] + present_not_imperfects = ['ن' + item for item in present_imperfects] + present_subjunctives = [ + item if item.startswith('ب') else 'ب' + item for item in present_simples] + present_not_subjunctives = ['ن' + item for item in present_simples] + return present_simples + present_not_simples + \ + present_imperfects + present_not_imperfects + \ + present_subjunctives + present_not_subjunctives diff --git a/hazm/Lemmatizer.py b/hazm/Lemmatizer.py index 6359cefc..499c2ab6 100644 --- a/hazm/Lemmatizer.py +++ b/hazm/Lemmatizer.py @@ -1,104 +1,109 @@ # coding: utf-8 from __future__ import unicode_literals -from .utils import default_words, default_verbs + from .Stemmer import Stemmer from .WordTokenizer import WordTokenizer +from .utils import default_words, default_verbs class Lemmatizer(object): - """ - >>> lemmatizer = Lemmatizer() - >>> lemmatizer.lemmatize('کتاب‌ها') - 'کتاب' - >>> lemmatizer.lemmatize('آتشفشان') - 'آتشفشان' - >>> lemmatizer.lemmatize('می‌روم') - 'رفت#رو' - >>> lemmatizer.lemmatize('گفته_شده_است') - 'گفت#گو' - >>> lemmatizer.lemmatize('نچشیده_است') - 'چشید#چش' - >>> lemmatizer.lemmatize('مردم', pos='N') - 'مردم' - >>> lemmatizer.lemmatize('اجتماعی', pos='AJ') - 'اجتماعی' - """ - - def __init__(self, words_file=default_words, verbs_file=default_verbs, joined_verb_parts=True): - self.verbs = {} - self.stemmer = Stemmer() - - tokenizer = WordTokenizer(words_file=default_words, verbs_file=verbs_file) - self.words = tokenizer.words - - if verbs_file: - self.verbs['است'] = '#است' - for verb in tokenizer.verbs: - for tense in self.conjugations(verb): - self.verbs[tense] = verb - if joined_verb_parts: - for verb in tokenizer.verbs: - bon = verb.split('#')[0] - for after_verb in tokenizer.after_verbs: - self.verbs[bon + 'ه_' + after_verb] = verb - self.verbs['ن' + bon + 'ه_' + after_verb] = verb - for before_verb in tokenizer.before_verbs: - self.verbs[before_verb + '_' + bon] = verb - - def lemmatize(self, word, pos=''): - if not pos and word in self.words: - return word - - if (not pos or pos == 'V') and word in self.verbs: - return self.verbs[word] - - if pos.startswith('AJ') and word[-1] == 'ی': - return word - - if pos == 'PRO': - return word - - if word in self.words: - return word - - stem = self.stemmer.stem(word) - if stem and stem in self.words: - return stem - - return word - - def conjugations(self, verb): - """ - >>> lemmatizer = Lemmatizer() - >>> lemmatizer.conjugations('خورد#خور') - ['خوردم', 'خوردی', 'خورد', 'خوردیم', 'خوردید', 'خوردند', 'نخوردم', 'نخوردی', 'نخورد', 'نخوردیم', 'نخوردید', 'نخوردند', 'خورم', 'خوری', 'خورد', 'خوریم', 'خورید', 'خورند', 'نخورم', 'نخوری', 'نخورد', 'نخوریم', 'نخورید', 'نخورند', 'می‌خوردم', 'می‌خوردی', 'می‌خورد', 'می‌خوردیم', 'می‌خوردید', 'می‌خوردند', 'نمی‌خوردم', 'نمی‌خوردی', 'نمی‌خورد', 'نمی‌خوردیم', 'نمی‌خوردید', 'نمی‌خوردند', 
'خورده‌ام', 'خورده‌ای', 'خورده', 'خورده‌ایم', 'خورده‌اید', 'خورده‌اند', 'نخورده‌ام', 'نخورده‌ای', 'نخورده', 'نخورده‌ایم', 'نخورده‌اید', 'نخورده‌اند', 'خورم', 'خوری', 'خورد', 'خوریم', 'خورید', 'خورند', 'نخورم', 'نخوری', 'نخورد', 'نخوریم', 'نخورید', 'نخورند', 'می‌خورم', 'می‌خوری', 'می‌خورد', 'می‌خوریم', 'می‌خورید', 'می‌خورند', 'نمی‌خورم', 'نمی‌خوری', 'نمی‌خورد', 'نمی‌خوریم', 'نمی‌خورید', 'نمی‌خورند', 'بخورم', 'بخوری', 'بخورد', 'بخوریم', 'بخورید', 'بخورند', 'نخورم', 'نخوری', 'نخورد', 'نخوریم', 'نخورید', 'نخورند', 'بخور', 'نخور'] - >>> lemmatizer.conjugations('آورد#آور') - ['آوردم', 'آوردی', 'آورد', 'آوردیم', 'آوردید', 'آوردند', 'نیاوردم', 'نیاوردی', 'نیاورد', 'نیاوردیم', 'نیاوردید', 'نیاوردند', 'آورم', 'آوری', 'آورد', 'آوریم', 'آورید', 'آورند', 'نیاورم', 'نیاوری', 'نیاورد', 'نیاوریم', 'نیاورید', 'نیاورند', 'می‌آوردم', 'می‌آوردی', 'می‌آورد', 'می‌آوردیم', 'می‌آوردید', 'می‌آوردند', 'نمی‌آوردم', 'نمی‌آوردی', 'نمی‌آورد', 'نمی‌آوردیم', 'نمی‌آوردید', 'نمی‌آوردند', 'آورده‌ام', 'آورده‌ای', 'آورده', 'آورده‌ایم', 'آورده‌اید', 'آورده‌اند', 'نیاورده‌ام', 'نیاورده‌ای', 'نیاورده', 'نیاورده‌ایم', 'نیاورده‌اید', 'نیاورده‌اند', 'آورم', 'آوری', 'آورد', 'آوریم', 'آورید', 'آورند', 'نیاورم', 'نیاوری', 'نیاورد', 'نیاوریم', 'نیاورید', 'نیاورند', 'می‌آورم', 'می‌آوری', 'می‌آورد', 'می‌آوریم', 'می‌آورید', 'می‌آورند', 'نمی‌آورم', 'نمی‌آوری', 'نمی‌آورد', 'نمی‌آوریم', 'نمی‌آورید', 'نمی‌آورند', 'بیاورم', 'بیاوری', 'بیاورد', 'بیاوریم', 'بیاورید', 'بیاورند', 'نیاورم', 'نیاوری', 'نیاورد', 'نیاوریم', 'نیاورید', 'نیاورند', 'بیاور', 'نیاور'] - """ - - past, present = verb.split('#') - ends = ['م', 'ی', '', 'یم', 'ید', 'ند'] - - if verb == '#هست': - return ['هست' + end for end in ends] + ['نیست' + end for end in ends] - - past_simples = [past + end for end in ends] - past_imperfects = ['می‌' + item for item in past_simples] - ends = ['ه‌ام', 'ه‌ای', 'ه', 'ه‌ایم', 'ه‌اید', 'ه‌اند'] - past_narratives = [past + end for end in ends] - - imperatives = ['ب' + present, 'ن' + present] - - if present.endswith('ا') or present in ('آ', 'گو'): - present = present + 'ی' - - ends = ['م', 'ی', 'د', 'یم', 'ید', 'ند'] - present_simples = [present + end for end in ends] - present_imperfects = ['می‌' + item for item in present_simples] - present_subjunctives = [item if item.startswith('ب') else 'ب' + item for item in present_simples] - present_not_subjunctives = ['ن' + item for item in present_simples] - - with_nots = lambda items: items + list(map(lambda item: 'ن' + item, items)) - aa_refinement = lambda items: list(map(lambda item: item.replace('بآ', 'بیا').replace('نآ', 'نیا'), items)) if items[0].startswith('آ') else items - return aa_refinement(with_nots(past_simples) + with_nots(present_simples) + with_nots(past_imperfects) + with_nots(past_narratives) + with_nots(present_simples) + with_nots(present_imperfects) + present_subjunctives + present_not_subjunctives + imperatives) + """ + >>> lemmatizer = Lemmatizer() + >>> lemmatizer.lemmatize('کتاب‌ها') + 'کتاب' + >>> lemmatizer.lemmatize('آتشفشان') + 'آتشفشان' + >>> lemmatizer.lemmatize('می‌روم') + 'رفت#رو' + >>> lemmatizer.lemmatize('گفته_شده_است') + 'گفت#گو' + >>> lemmatizer.lemmatize('نچشیده_است') + 'چشید#چش' + >>> lemmatizer.lemmatize('مردم', pos='N') + 'مردم' + >>> lemmatizer.lemmatize('اجتماعی', pos='AJ') + 'اجتماعی' + """ + + def __init__(self, words_file=default_words, verbs_file=default_verbs, joined_verb_parts=True): + self.verbs = {} + self.stemmer = Stemmer() + + tokenizer = WordTokenizer(words_file=default_words, verbs_file=verbs_file) + self.words = tokenizer.words + + if 
verbs_file: + self.verbs['است'] = '#است' + for verb in tokenizer.verbs: + for tense in self.conjugations(verb): + self.verbs[tense] = verb + if joined_verb_parts: + for verb in tokenizer.verbs: + bon = verb.split('#')[0] + for after_verb in tokenizer.after_verbs: + self.verbs[bon + 'ه_' + after_verb] = verb + self.verbs['ن' + bon + 'ه_' + after_verb] = verb + for before_verb in tokenizer.before_verbs: + self.verbs[before_verb + '_' + bon] = verb + + def lemmatize(self, word, pos=''): + if not pos and word in self.words: + return word + + if (not pos or pos == 'V') and word in self.verbs: + return self.verbs[word] + + if pos.startswith('AJ') and word[-1] == 'ی': + return word + + if pos == 'PRO': + return word + + if word in self.words: + return word + + stem = self.stemmer.stem(word) + if stem and stem in self.words: + return stem + + return word + + def conjugations(self, verb): + """ + >>> lemmatizer = Lemmatizer() + >>> lemmatizer.conjugations('خورد#خور') + ['خوردم', 'خوردی', 'خورد', 'خوردیم', 'خوردید', 'خوردند', 'نخوردم', 'نخوردی', 'نخورد', 'نخوردیم', 'نخوردید', 'نخوردند', 'خورم', 'خوری', 'خورد', 'خوریم', 'خورید', 'خورند', 'نخورم', 'نخوری', 'نخورد', 'نخوریم', 'نخورید', 'نخورند', 'می‌خوردم', 'می‌خوردی', 'می‌خورد', 'می‌خوردیم', 'می‌خوردید', 'می‌خوردند', 'نمی‌خوردم', 'نمی‌خوردی', 'نمی‌خورد', 'نمی‌خوردیم', 'نمی‌خوردید', 'نمی‌خوردند', 'خورده‌ام', 'خورده‌ای', 'خورده', 'خورده‌ایم', 'خورده‌اید', 'خورده‌اند', 'نخورده‌ام', 'نخورده‌ای', 'نخورده', 'نخورده‌ایم', 'نخورده‌اید', 'نخورده‌اند', 'خورم', 'خوری', 'خورد', 'خوریم', 'خورید', 'خورند', 'نخورم', 'نخوری', 'نخورد', 'نخوریم', 'نخورید', 'نخورند', 'می‌خورم', 'می‌خوری', 'می‌خورد', 'می‌خوریم', 'می‌خورید', 'می‌خورند', 'نمی‌خورم', 'نمی‌خوری', 'نمی‌خورد', 'نمی‌خوریم', 'نمی‌خورید', 'نمی‌خورند', 'بخورم', 'بخوری', 'بخورد', 'بخوریم', 'بخورید', 'بخورند', 'نخورم', 'نخوری', 'نخورد', 'نخوریم', 'نخورید', 'نخورند', 'بخور', 'نخور'] + >>> lemmatizer.conjugations('آورد#آور') + ['آوردم', 'آوردی', 'آورد', 'آوردیم', 'آوردید', 'آوردند', 'نیاوردم', 'نیاوردی', 'نیاورد', 'نیاوردیم', 'نیاوردید', 'نیاوردند', 'آورم', 'آوری', 'آورد', 'آوریم', 'آورید', 'آورند', 'نیاورم', 'نیاوری', 'نیاورد', 'نیاوریم', 'نیاورید', 'نیاورند', 'می‌آوردم', 'می‌آوردی', 'می‌آورد', 'می‌آوردیم', 'می‌آوردید', 'می‌آوردند', 'نمی‌آوردم', 'نمی‌آوردی', 'نمی‌آورد', 'نمی‌آوردیم', 'نمی‌آوردید', 'نمی‌آوردند', 'آورده‌ام', 'آورده‌ای', 'آورده', 'آورده‌ایم', 'آورده‌اید', 'آورده‌اند', 'نیاورده‌ام', 'نیاورده‌ای', 'نیاورده', 'نیاورده‌ایم', 'نیاورده‌اید', 'نیاورده‌اند', 'آورم', 'آوری', 'آورد', 'آوریم', 'آورید', 'آورند', 'نیاورم', 'نیاوری', 'نیاورد', 'نیاوریم', 'نیاورید', 'نیاورند', 'می‌آورم', 'می‌آوری', 'می‌آورد', 'می‌آوریم', 'می‌آورید', 'می‌آورند', 'نمی‌آورم', 'نمی‌آوری', 'نمی‌آورد', 'نمی‌آوریم', 'نمی‌آورید', 'نمی‌آورند', 'بیاورم', 'بیاوری', 'بیاورد', 'بیاوریم', 'بیاورید', 'بیاورند', 'نیاورم', 'نیاوری', 'نیاورد', 'نیاوریم', 'نیاورید', 'نیاورند', 'بیاور', 'نیاور'] + """ + + past, present = verb.split('#') + ends = ['م', 'ی', '', 'یم', 'ید', 'ند'] + + if verb == '#هست': + return ['هست' + end for end in ends] + ['نیست' + end for end in ends] + + past_simples = [past + end for end in ends] + past_imperfects = ['می‌' + item for item in past_simples] + ends = ['ه‌ام', 'ه‌ای', 'ه', 'ه‌ایم', 'ه‌اید', 'ه‌اند'] + past_narratives = [past + end for end in ends] + + imperatives = ['ب' + present, 'ن' + present] + + if present.endswith('ا') or present in ('آ', 'گو'): + present = present + 'ی' + + ends = ['م', 'ی', 'د', 'یم', 'ید', 'ند'] + present_simples = [present + end for end in ends] + present_imperfects = ['می‌' + item for item in 
present_simples] + present_subjunctives = [item if item.startswith('ب') else 'ب' + item for item in present_simples] + present_not_subjunctives = ['ن' + item for item in present_simples] + + with_nots = lambda items: items + list(map(lambda item: 'ن' + item, items)) + aa_refinement = lambda items: list(map(lambda item: item.replace('بآ', 'بیا').replace('نآ', 'نیا'), items)) if \ + items[0].startswith('آ') else items + return aa_refinement( + with_nots(past_simples) + with_nots(present_simples) + with_nots(past_imperfects) + with_nots( + past_narratives) + with_nots(present_simples) + with_nots( + present_imperfects) + present_subjunctives + present_not_subjunctives + imperatives) diff --git a/hazm/Normalizer.py b/hazm/Normalizer.py index 804b4ff2..ee846e0f 100644 --- a/hazm/Normalizer.py +++ b/hazm/Normalizer.py @@ -1,7 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals + import re + from .Lemmatizer import Lemmatizer from .WordTokenizer import WordTokenizer from .utils import maketrans @@ -10,179 +12,184 @@ class Normalizer(object): - def __init__(self, remove_extra_spaces=True, persian_style=True, persian_numbers=True, remove_diacritics=True, affix_spacing=True, token_based=False, punctuation_spacing=True): - self._punctuation_spacing = punctuation_spacing - self._affix_spacing = affix_spacing - self._token_based = token_based - - translation_src, translation_dst = ' كي“”', ' کی""' - if persian_numbers: - translation_src += '0123456789%' - translation_dst += '۰۱۲۳۴۵۶۷۸۹٪' - self.translations = maketrans(translation_src, translation_dst) - - if self._token_based: - lemmatizer = Lemmatizer() - self.words = lemmatizer.words - self.verbs = lemmatizer.verbs - self.tokenizer = WordTokenizer(join_verb_parts=False) - self.suffixes = {'ی', 'ای', 'ها', 'های', 'تر', 'تری', 'ترین', 'گر', 'گری', 'ام', 'ات', 'اش'} - - self.character_refinement_patterns = [] - - if remove_extra_spaces: - self.character_refinement_patterns.extend([ - (r' +', ' '), # remove extra spaces - (r'\n\n+', '\n\n'), # remove extra newlines - (r'[ـ\r]', ''), # remove keshide, carriage returns - ]) - - if persian_style: - self.character_refinement_patterns.extend([ - ('"([^\n"]+)"', r'«\1»'), # replace quotation with gyoome - ('([\d+])\.([\d+])', r'\1٫\2'), # replace dot with momayez - (r' ?\.\.\.', ' …'), # replace 3 dots - ]) - - if remove_diacritics: - self.character_refinement_patterns.append( - ('[\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652]', ''), # remove FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SHADDA, SUKUN - ) - - self.character_refinement_patterns = compile_patterns(self.character_refinement_patterns) - - punc_after, punc_before = r'\.:!،؛؟»\]\)\}', r'«\[\(\{' - if punctuation_spacing: - self.punctuation_spacing_patterns = compile_patterns([ - ('" ([^\n"]+) "', r'"\1"'), # remove space before and after quotation - (' (['+ punc_after +'])', r'\1'), # remove space before - ('(['+ punc_before +']) ', r'\1'), # remove space after - ('(['+ punc_after[:3] +'])([^ \d'+ punc_after +'])', r'\1 \2'), # put space after . 
and : - ('(['+ punc_after[3:] +'])([^ '+ punc_after +'])', r'\1 \2'), # put space after - ('([^ '+ punc_before +'])(['+ punc_before +'])', r'\1 \2'), # put space before - ]) - - if affix_spacing: - self.affix_spacing_patterns = compile_patterns([ - (r'([^ ]ه) ی ', r'\1‌ی '), # fix ی space - (r'(^| )(ن?می) ', r'\1\2‌'), # put zwnj after می, نمی - (r'(?<=[^\n\d '+ punc_after + punc_before +']{2}) (تر(ین?)?|گری?|های?)(?=[ \n'+ punc_after + punc_before +']|$)', r'‌\1'), # put zwnj before تر, تری, ترین, گر, گری, ها, های - (r'([^ ]ه) (ا(م|یم|ش|ند|ی|ید|ت))(?=[ \n'+ punc_after +']|$)', r'\1‌\2'), # join ام, ایم, اش, اند, ای, اید, ات - ]) - - def normalize(self, text): - text = self.character_refinement(text) - if self._affix_spacing: - text = self.affix_spacing(text) - - if self._token_based: - tokens = self.tokenizer.tokenize(text.translate(self.translations)) - text = ' '.join(self.token_spacing(tokens)) - - if self._punctuation_spacing: - text = self.punctuation_spacing(text) - - return text - - def character_refinement(self, text): - """ - >>> normalizer = Normalizer() - >>> normalizer.character_refinement('اصلاح كاف و ياي عربي') - 'اصلاح کاف و یای عربی' - - >>> normalizer.character_refinement('عراق سال 2012 قراردادی به ارزش "4.2 میلیارد دلار" برای خرید تجهیزات نظامی با روسیه امضا کرد.') - 'عراق سال ۲۰۱۲ قراردادی به ارزش «۴٫۲ میلیارد دلار» برای خرید تجهیزات نظامی با روسیه امضا کرد.' - - >>> normalizer.character_refinement('رمــــان') - 'رمان' - - >>> normalizer.character_refinement('بُشقابِ مَن را بِگیر') - 'بشقاب من را بگیر' - """ - - text = text.translate(self.translations) - for pattern, repl in self.character_refinement_patterns: - text = pattern.sub(repl, text) - return text - - def punctuation_spacing(self, text): - """ - >>> normalizer = Normalizer() - >>> normalizer.punctuation_spacing('اصلاح ( پرانتزها ) در متن .') - 'اصلاح (پرانتزها) در متن.' - - >>> normalizer.punctuation_spacing('نسخه 0.5 در ساعت 22:00 تهران،1396') - 'نسخه 0.5 در ساعت 22:00 تهران، 1396' - """ - - for pattern, repl in self.punctuation_spacing_patterns: - text = pattern.sub(repl, text) - return text - - def affix_spacing(self, text): - """ - >>> normalizer = Normalizer() - >>> normalizer.affix_spacing('خانه ی پدری') - 'خانه‌ی پدری' - - >>> normalizer.affix_spacing('فاصله میان پیشوند ها و پسوند ها را اصلاح می کند.') - 'فاصله میان پیشوند‌ها و پسوند‌ها را اصلاح می‌کند.' 
- - >>> normalizer.affix_spacing('می روم') - 'می‌روم' - - >>> normalizer.affix_spacing('حرفه ای') - 'حرفه‌ای' - - >>> normalizer.affix_spacing('محبوب ترین ها') - 'محبوب‌ترین‌ها' - """ - - for pattern, repl in self.affix_spacing_patterns: - text = pattern.sub(repl, text) - return text - - def token_spacing(self, tokens): - """ - >>> normalizer = Normalizer(token_based=True) - >>> normalizer.token_spacing(['کتاب', 'ها']) - ['کتاب‌ها'] - - >>> normalizer.token_spacing(['او', 'می', 'رود']) - ['او', 'می‌رود'] - - >>> normalizer.token_spacing(['ماه', 'می', 'سال', 'جدید']) - ['ماه', 'می', 'سال', 'جدید'] - - >>> normalizer.token_spacing(['اخلال', 'گر']) - ['اخلال‌گر'] - - >>> normalizer.token_spacing(['پرداخت', 'شده', 'است']) - ['پرداخت', 'شده', 'است'] - - >>> normalizer.token_spacing(['زمین', 'لرزه', 'ای']) - ['زمین‌لرزه‌ای'] - """ + def __init__(self, remove_extra_spaces=True, persian_style=True, persian_numbers=True, remove_diacritics=True, + affix_spacing=True, token_based=False, punctuation_spacing=True): + self._punctuation_spacing = punctuation_spacing + self._affix_spacing = affix_spacing + self._token_based = token_based + + translation_src, translation_dst = ' كي“”', ' کی""' + if persian_numbers: + translation_src += '0123456789%' + translation_dst += '۰۱۲۳۴۵۶۷۸۹٪' + self.translations = maketrans(translation_src, translation_dst) + + if self._token_based: + lemmatizer = Lemmatizer() + self.words = lemmatizer.words + self.verbs = lemmatizer.verbs + self.tokenizer = WordTokenizer(join_verb_parts=False) + self.suffixes = {'ی', 'ای', 'ها', 'های', 'تر', 'تری', 'ترین', 'گر', 'گری', 'ام', 'ات', 'اش'} + + self.character_refinement_patterns = [] + + if remove_extra_spaces: + self.character_refinement_patterns.extend([ + (r' +', ' '), # remove extra spaces + (r'\n\n+', '\n\n'), # remove extra newlines + (r'[ـ\r]', ''), # remove keshide, carriage returns + ]) + + if persian_style: + self.character_refinement_patterns.extend([ + ('"([^\n"]+)"', r'«\1»'), # replace quotation with gyoome + ('([\d+])\.([\d+])', r'\1٫\2'), # replace dot with momayez + (r' ?\.\.\.', ' …'), # replace 3 dots + ]) + + if remove_diacritics: + self.character_refinement_patterns.append( + ('[\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652]', ''), + # remove FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SHADDA, SUKUN + ) + + self.character_refinement_patterns = compile_patterns(self.character_refinement_patterns) + + punc_after, punc_before = r'\.:!،؛؟»\]\)\}', r'«\[\(\{' + if punctuation_spacing: + self.punctuation_spacing_patterns = compile_patterns([ + ('" ([^\n"]+) "', r'"\1"'), # remove space before and after quotation + (' ([' + punc_after + '])', r'\1'), # remove space before + ('([' + punc_before + ']) ', r'\1'), # remove space after + ('([' + punc_after[:3] + '])([^ \d' + punc_after + '])', r'\1 \2'), # put space after . 
and : + ('([' + punc_after[3:] + '])([^ ' + punc_after + '])', r'\1 \2'), # put space after + ('([^ ' + punc_before + '])([' + punc_before + '])', r'\1 \2'), # put space before + ]) + + if affix_spacing: + self.affix_spacing_patterns = compile_patterns([ + (r'([^ ]ه) ی ', r'\1‌ی '), # fix ی space + (r'(^| )(ن?می) ', r'\1\2‌'), # put zwnj after می, نمی + ( + r'(?<=[^\n\d ' + punc_after + punc_before + ']{2}) (تر(ین?)?|گری?|های?)(?=[ \n' + punc_after + punc_before + ']|$)', + r'‌\1'), # put zwnj before تر, تری, ترین, گر, گری, ها, های + (r'([^ ]ه) (ا(م|یم|ش|ند|ی|ید|ت))(?=[ \n' + punc_after + ']|$)', r'\1‌\2'), + # join ام, ایم, اش, اند, ای, اید, ات + ]) + + def normalize(self, text): + text = self.character_refinement(text) + if self._affix_spacing: + text = self.affix_spacing(text) + + if self._token_based: + tokens = self.tokenizer.tokenize(text.translate(self.translations)) + text = ' '.join(self.token_spacing(tokens)) + + if self._punctuation_spacing: + text = self.punctuation_spacing(text) + + return text + + def character_refinement(self, text): + """ + >>> normalizer = Normalizer() + >>> normalizer.character_refinement('اصلاح كاف و ياي عربي') + 'اصلاح کاف و یای عربی' + + >>> normalizer.character_refinement('عراق سال 2012 قراردادی به ارزش "4.2 میلیارد دلار" برای خرید تجهیزات نظامی با روسیه امضا کرد.') + 'عراق سال ۲۰۱۲ قراردادی به ارزش «۴٫۲ میلیارد دلار» برای خرید تجهیزات نظامی با روسیه امضا کرد.' + + >>> normalizer.character_refinement('رمــــان') + 'رمان' + + >>> normalizer.character_refinement('بُشقابِ مَن را بِگیر') + 'بشقاب من را بگیر' + """ + + text = text.translate(self.translations) + for pattern, repl in self.character_refinement_patterns: + text = pattern.sub(repl, text) + return text + + def punctuation_spacing(self, text): + """ + >>> normalizer = Normalizer() + >>> normalizer.punctuation_spacing('اصلاح ( پرانتزها ) در متن .') + 'اصلاح (پرانتزها) در متن.' + + >>> normalizer.punctuation_spacing('نسخه 0.5 در ساعت 22:00 تهران،1396') + 'نسخه 0.5 در ساعت 22:00 تهران، 1396' + """ + + for pattern, repl in self.punctuation_spacing_patterns: + text = pattern.sub(repl, text) + return text + + def affix_spacing(self, text): + """ + >>> normalizer = Normalizer() + >>> normalizer.affix_spacing('خانه ی پدری') + 'خانه‌ی پدری' + + >>> normalizer.affix_spacing('فاصله میان پیشوند ها و پسوند ها را اصلاح می کند.') + 'فاصله میان پیشوند‌ها و پسوند‌ها را اصلاح می‌کند.' 
+ + >>> normalizer.affix_spacing('می روم') + 'می‌روم' + + >>> normalizer.affix_spacing('حرفه ای') + 'حرفه‌ای' + + >>> normalizer.affix_spacing('محبوب ترین ها') + 'محبوب‌ترین‌ها' + """ + + for pattern, repl in self.affix_spacing_patterns: + text = pattern.sub(repl, text) + return text + + def token_spacing(self, tokens): + """ + >>> normalizer = Normalizer(token_based=True) + >>> normalizer.token_spacing(['کتاب', 'ها']) + ['کتاب‌ها'] + + >>> normalizer.token_spacing(['او', 'می', 'رود']) + ['او', 'می‌رود'] + + >>> normalizer.token_spacing(['ماه', 'می', 'سال', 'جدید']) + ['ماه', 'می', 'سال', 'جدید'] + + >>> normalizer.token_spacing(['اخلال', 'گر']) + ['اخلال‌گر'] + + >>> normalizer.token_spacing(['پرداخت', 'شده', 'است']) + ['پرداخت', 'شده', 'است'] + + >>> normalizer.token_spacing(['زمین', 'لرزه', 'ای']) + ['زمین‌لرزه‌ای'] + """ - result = [] - for t, token in enumerate(tokens): - joined = False + result = [] + for t, token in enumerate(tokens): + joined = False - if result: - token_pair = result[-1]+'‌'+token - if token_pair in self.verbs or token_pair in self.words and self.words[token_pair][0] > 0: - joined = True + if result: + token_pair = result[-1] + '‌' + token + if token_pair in self.verbs or token_pair in self.words and self.words[token_pair][0] > 0: + joined = True - if t < len(tokens)-1 and token+'_'+tokens[t+1] in self.verbs: - joined = False + if t < len(tokens) - 1 and token + '_' + tokens[t + 1] in self.verbs: + joined = False - elif token in self.suffixes and result[-1] in self.words: - joined = True + elif token in self.suffixes and result[-1] in self.words: + joined = True - if joined: - result.pop() - result.append(token_pair) - else: - result.append(token) + if joined: + result.pop() + result.append(token_pair) + else: + result.append(token) - return result + return result diff --git a/hazm/POSTagger.py b/hazm/POSTagger.py index a2c99be2..c56e9844 100755 --- a/hazm/POSTagger.py +++ b/hazm/POSTagger.py @@ -1,32 +1,35 @@ # coding: utf-8 from __future__ import unicode_literals + from nltk.tag import stanford + from .SequenceTagger import SequenceTagger class POSTagger(SequenceTagger): - """ - >>> tagger = POSTagger(model='resources/postagger.model') - >>> tagger.tag(['من', 'به', 'مدرسه', 'رفته_بودم', '.']) - [('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')] - """ + """ + >>> tagger = POSTagger(model='resources/postagger.model') + >>> tagger.tag(['من', 'به', 'مدرسه', 'رفته_بودم', '.']) + [('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')] + """ class StanfordPOSTagger(stanford.StanfordPOSTagger): - """ - >>> tagger = StanfordPOSTagger(model_filename='resources/persian.tagger', path_to_jar='resources/stanford-postagger.jar') - >>> tagger.tag(['من', 'به', 'مدرسه', 'رفته_بودم', '.']) - [('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')] - """ - - def __init__(self, model_filename, path_to_jar, *args, **kwargs): - self._SEPARATOR = '/' - super(stanford.StanfordPOSTagger, self).__init__(model_filename=model_filename, path_to_jar=path_to_jar, *args, **kwargs) - - def tag(self, tokens): - return self.tag_sents([tokens])[0] - - def tag_sents(self, sentences): - refined = map(lambda s: [w.replace(' ', '_') for w in s], sentences) - return super(stanford.StanfordPOSTagger, self).tag_sents(refined) + """ + >>> tagger = StanfordPOSTagger(model_filename='resources/persian.tagger', path_to_jar='resources/stanford-postagger.jar') + >>> tagger.tag(['من', 'به', 'مدرسه', 'رفته_بودم', '.']) + [('من', 'PRO'), 
('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')] + """ + + def __init__(self, model_filename, path_to_jar, *args, **kwargs): + self._SEPARATOR = '/' + super(stanford.StanfordPOSTagger, self).__init__(model_filename=model_filename, path_to_jar=path_to_jar, *args, + **kwargs) + + def tag(self, tokens): + return self.tag_sents([tokens])[0] + + def tag_sents(self, sentences): + refined = map(lambda s: [w.replace(' ', '_') for w in s], sentences) + return super(stanford.StanfordPOSTagger, self).tag_sents(refined) diff --git a/hazm/PersicaReader.py b/hazm/PersicaReader.py index 21c2b5cb..ed3941df 100644 --- a/hazm/PersicaReader.py +++ b/hazm/PersicaReader.py @@ -1,41 +1,42 @@ # coding: utf-8 from __future__ import print_function + import codecs class PersicaReader(): - """ - interfaces [Persica Corpus](https://sourceforge.net/projects/persica/) - - >>> persica = PersicaReader('corpora/persica.csv') - >>> next(persica.docs())['id'] - 843656 - """ - - def __init__(self, csv_file): - self._csv_file = csv_file - - def docs(self): - lines = [] - for line in codecs.open(self._csv_file, encoding='utf-8-sig'): - line = line.strip() - if line: - if line.endswith(','): - lines.append(line[:-1]) - else: - lines.append(line) - yield { - 'id': int(lines[0]), - 'title': lines[1], - 'text': lines[2], - 'date': lines[3], - 'time': lines[4], - 'category': lines[5], - 'category2': lines[6], - } - lines = [] - - def texts(self): - for doc in self.docs(): - yield doc['text'] + """ + interfaces [Persica Corpus](https://sourceforge.net/projects/persica/) + + >>> persica = PersicaReader('corpora/persica.csv') + >>> next(persica.docs())['id'] + 843656 + """ + + def __init__(self, csv_file): + self._csv_file = csv_file + + def docs(self): + lines = [] + for line in codecs.open(self._csv_file, encoding='utf-8-sig'): + line = line.strip() + if line: + if line.endswith(','): + lines.append(line[:-1]) + else: + lines.append(line) + yield { + 'id': int(lines[0]), + 'title': lines[1], + 'text': lines[2], + 'date': lines[3], + 'time': lines[4], + 'category': lines[5], + 'category2': lines[6], + } + lines = [] + + def texts(self): + for doc in self.docs(): + yield doc['text'] diff --git a/hazm/PeykareReader.py b/hazm/PeykareReader.py index 9c0aeb95..0220a698 100644 --- a/hazm/PeykareReader.py +++ b/hazm/PeykareReader.py @@ -1,104 +1,109 @@ # coding: utf-8 from __future__ import unicode_literals -import os, codecs + +import codecs +import os + from .Normalizer import Normalizer from .WordTokenizer import WordTokenizer def coarse_pos_e(tags): - """ - Coarse POS tags of Peykare corpus: - N: Noun, V: Verb, AJ: Adjective, ADV: Adverb, PRO: Pronoun, DET: Determiner, P: Preposition, POSTP: Postposition, NUM: Number, CONJ: Conjunction, PUNC: Punctuation, RES: Residual, CL: Classifier, INT: Interjection + """ + Coarse POS tags of Peykare corpus: + N: Noun, V: Verb, AJ: Adjective, ADV: Adverb, PRO: Pronoun, DET: Determiner, P: Preposition, POSTP: Postposition, NUM: Number, CONJ: Conjunction, PUNC: Punctuation, RES: Residual, CL: Classifier, INT: Interjection - >>> coarse_pos_e(['N','COM','SING']) - 'N' - """ + >>> coarse_pos_e(['N','COM','SING']) + 'N' + """ - try: - return list(set(tags) & {'N', 'V', 'AJ', 'ADV', 'PRO', 'DET', 'P', 'POSTP', 'NUM', 'CONJ', 'PUNC', 'CL', 'INT', 'RES'})[0] + ('e' if 'EZ' in tags else '') - except: - return 'N' + try: + return list( + set(tags) & {'N', 'V', 'AJ', 'ADV', 'PRO', 'DET', 'P', 'POSTP', 'NUM', 'CONJ', 'PUNC', 'CL', 'INT', 'RES'})[ + 0] + ('e' if 'EZ' in tags else '') + except: + 
return 'N' def join_verb_parts(sentence): - """ - Join verb parts like Dadedgan corpus. + """ + Join verb parts like Dadedgan corpus. - >>> join_verb_parts([('اولین', 'AJ'), ('سیاره', 'Ne'), ('خارج', 'AJ'), ('از', 'P'), ('منظومه', 'Ne'), ('شمسی', 'AJ'), ('دیده', 'AJ'), ('شد', 'V'), ('.', 'PUNC')]) - [('اولین', 'AJ'), ('سیاره', 'Ne'), ('خارج', 'AJ'), ('از', 'P'), ('منظومه', 'Ne'), ('شمسی', 'AJ'), ('دیده_شد', 'V'), ('.', 'PUNC')] - """ + >>> join_verb_parts([('اولین', 'AJ'), ('سیاره', 'Ne'), ('خارج', 'AJ'), ('از', 'P'), ('منظومه', 'Ne'), ('شمسی', 'AJ'), ('دیده', 'AJ'), ('شد', 'V'), ('.', 'PUNC')]) + [('اولین', 'AJ'), ('سیاره', 'Ne'), ('خارج', 'AJ'), ('از', 'P'), ('منظومه', 'Ne'), ('شمسی', 'AJ'), ('دیده_شد', 'V'), ('.', 'PUNC')] + """ - if not hasattr(join_verb_parts, 'tokenizer'): - join_verb_parts.tokenizer = WordTokenizer() - before_verbs, after_verbs, verbe = join_verb_parts.tokenizer.before_verbs, join_verb_parts.tokenizer.after_verbs, join_verb_parts.tokenizer.verbe + if not hasattr(join_verb_parts, 'tokenizer'): + join_verb_parts.tokenizer = WordTokenizer() + before_verbs, after_verbs, verbe = join_verb_parts.tokenizer.before_verbs, join_verb_parts.tokenizer.after_verbs, join_verb_parts.tokenizer.verbe - result = [('', '')] - for word in reversed(sentence): - if word[0] in before_verbs or (result[-1][0] in after_verbs and word[0] in verbe): - result[-1] = (word[0] + '_' + result[-1][0], result[-1][1]) - else: - result.append(word) - return list(reversed(result[1:])) + result = [('', '')] + for word in reversed(sentence): + if word[0] in before_verbs or (result[-1][0] in after_verbs and word[0] in verbe): + result[-1] = (word[0] + '_' + result[-1][0], result[-1][1]) + else: + result.append(word) + return list(reversed(result[1:])) class PeykareReader(): - """ - Interfaces [Peykare Corpus](http://www.rcisp.com/?q=%D9%BE%DB%8C%DA%A9%D8%B1%D9%87%20%D9%85%D8%AA%D9%86%DB%8C%20%D8%B2%D8%A8%D8%A7%D9%86%20%D9%81%D8%A7%D8%B1%D8%B3%DB%8C) - Bijankhan, M., Sheykhzadegan, J., Bahrani, M., & Ghayoomi, M. (2011). Lessons from building a Persian written corpus: Peykare. Language Resources and Evaluation, 45, 143–164. + """ + Interfaces [Peykare Corpus](http://www.rcisp.com/?q=%D9%BE%DB%8C%DA%A9%D8%B1%D9%87%20%D9%85%D8%AA%D9%86%DB%8C%20%D8%B2%D8%A8%D8%A7%D9%86%20%D9%81%D8%A7%D8%B1%D8%B3%DB%8C) + Bijankhan, M., Sheykhzadegan, J., Bahrani, M., & Ghayoomi, M. (2011). Lessons from building a Persian written corpus: Peykare. Language Resources and Evaluation, 45, 143–164. 
- >>> peykare = PeykareReader(root='corpora/peykare') - >>> next(peykare.sents()) - [('دیرزمانی', 'N'), ('از', 'P'), ('راه‌اندازی', 'Ne'), ('شبکه‌ی', 'Ne'), ('خبر', 'Ne'), ('الجزیره', 'N'), ('نمی‌گذرد', 'V'), ('،', 'PUNC'), ('اما', 'CONJ'), ('این', 'DET'), ('شبکه‌ی', 'Ne'), ('خبری', 'AJe'), ('عربی', 'N'), ('بسیار', 'ADV'), ('سریع', 'ADV'), ('توانسته', 'V'), ('در', 'P'), ('میان', 'Ne'), ('شبکه‌های', 'Ne'), ('عظیم', 'AJe'), ('خبری', 'AJ'), ('و', 'CONJ'), ('بنگاه‌های', 'Ne'), ('چندرسانه‌ای', 'AJe'), ('دنیا', 'N'), ('خودی', 'N'), ('نشان', 'N'), ('دهد', 'V'), ('.', 'PUNC')] + >>> peykare = PeykareReader(root='corpora/peykare') + >>> next(peykare.sents()) + [('دیرزمانی', 'N'), ('از', 'P'), ('راه‌اندازی', 'Ne'), ('شبکه‌ی', 'Ne'), ('خبر', 'Ne'), ('الجزیره', 'N'), ('نمی‌گذرد', 'V'), ('،', 'PUNC'), ('اما', 'CONJ'), ('این', 'DET'), ('شبکه‌ی', 'Ne'), ('خبری', 'AJe'), ('عربی', 'N'), ('بسیار', 'ADV'), ('سریع', 'ADV'), ('توانسته', 'V'), ('در', 'P'), ('میان', 'Ne'), ('شبکه‌های', 'Ne'), ('عظیم', 'AJe'), ('خبری', 'AJ'), ('و', 'CONJ'), ('بنگاه‌های', 'Ne'), ('چندرسانه‌ای', 'AJe'), ('دنیا', 'N'), ('خودی', 'N'), ('نشان', 'N'), ('دهد', 'V'), ('.', 'PUNC')] - Reading Peykare sentences without mapping pos tags: + Reading Peykare sentences without mapping pos tags: - >>> peykare = PeykareReader(root='corpora/peykare', joined_verb_parts=False, pos_map=None) - >>> next(peykare.sents()) - [('دیرزمانی', 'N,COM,SING,TIME,YA'), ('از', 'P'), ('راه‌اندازی', 'N,COM,SING,EZ'), ('شبکه‌ی', 'N,COM,SING,EZ'), ('خبر', 'N,COM,SING,EZ'), ('الجزیره', 'N,PR,SING'), ('نمی‌گذرد', 'V,PRES,NEG,3'), ('،', 'PUNC'), ('اما', 'CONJ'), ('این', 'DET,DEMO'), ('شبکه‌ی', 'N,COM,SING,EZ'), ('خبری', 'AJ,SIM,EZ'), ('عربی', 'N,PR,SING'), ('بسیار', 'ADV,INTSF,SIM'), ('سریع', 'ADV,GENR,SIM'), ('توانسته', 'V,PASTP'), ('در', 'P'), ('میان', 'N,COM,SING,EZ'), ('شبکه‌های', 'N,COM,PL,EZ'), ('عظیم', 'AJ,SIM,EZ'), ('خبری', 'AJ,SIM'), ('و', 'CONJ'), ('بنگاه‌های', 'N,COM,PL,EZ'), ('چندرسانه‌ای', 'AJ,SIM,EZ'), ('دنیا', 'N,COM,SING'), ('خودی', 'N,COM,SING,YA'), ('نشان', 'N,COM,SING'), ('دهد', 'V,SUB,POS,3'), ('.', 'PUNC')] - """ + >>> peykare = PeykareReader(root='corpora/peykare', joined_verb_parts=False, pos_map=None) + >>> next(peykare.sents()) + [('دیرزمانی', 'N,COM,SING,TIME,YA'), ('از', 'P'), ('راه‌اندازی', 'N,COM,SING,EZ'), ('شبکه‌ی', 'N,COM,SING,EZ'), ('خبر', 'N,COM,SING,EZ'), ('الجزیره', 'N,PR,SING'), ('نمی‌گذرد', 'V,PRES,NEG,3'), ('،', 'PUNC'), ('اما', 'CONJ'), ('این', 'DET,DEMO'), ('شبکه‌ی', 'N,COM,SING,EZ'), ('خبری', 'AJ,SIM,EZ'), ('عربی', 'N,PR,SING'), ('بسیار', 'ADV,INTSF,SIM'), ('سریع', 'ADV,GENR,SIM'), ('توانسته', 'V,PASTP'), ('در', 'P'), ('میان', 'N,COM,SING,EZ'), ('شبکه‌های', 'N,COM,PL,EZ'), ('عظیم', 'AJ,SIM,EZ'), ('خبری', 'AJ,SIM'), ('و', 'CONJ'), ('بنگاه‌های', 'N,COM,PL,EZ'), ('چندرسانه‌ای', 'AJ,SIM,EZ'), ('دنیا', 'N,COM,SING'), ('خودی', 'N,COM,SING,YA'), ('نشان', 'N,COM,SING'), ('دهد', 'V,SUB,POS,3'), ('.', 'PUNC')] + """ - def __init__(self, root, joined_verb_parts=True, pos_map=coarse_pos_e): - self._root = root - self._pos_map = pos_map if pos_map else lambda tags: ','.join(tags) - self._joined_verb_parts = joined_verb_parts - self._normalizer = Normalizer(punctuation_spacing=False, affix_spacing=False) + def __init__(self, root, joined_verb_parts=True, pos_map=coarse_pos_e): + self._root = root + self._pos_map = pos_map if pos_map else lambda tags: ','.join(tags) + self._joined_verb_parts = joined_verb_parts + self._normalizer = Normalizer(punctuation_spacing=False, affix_spacing=False) - def docs(self): - """ extracts raw text of peykare 
document """ + def docs(self): + """ extracts raw text of peykare document """ - for root, dirs, files in os.walk(self._root): - for name in sorted(files): - with codecs.open(os.path.join(root, name), encoding='windows-1256') as peykare_file: - text = peykare_file.read() - if text: - yield text + for root, dirs, files in os.walk(self._root): + for name in sorted(files): + with codecs.open(os.path.join(root, name), encoding='windows-1256') as peykare_file: + text = peykare_file.read() + if text: + yield text - def doc_to_sents(self, document): - """ converts extracted document text to a list of (word, tag) pairs """ + def doc_to_sents(self, document): + """ converts extracted document text to a list of (word, tag) pairs """ - sentence = [] - for line in document.split('\r\n'): - if not line: - continue + sentence = [] + for line in document.split('\r\n'): + if not line: + continue - parts = line.split(' ') - tags, word = parts[3], self._normalizer.normalize('‌'.join(parts[4:])) + parts = line.split(' ') + tags, word = parts[3], self._normalizer.normalize('‌'.join(parts[4:])) - if word and word != '#': - sentence.append((word, tags)) + if word and word != '#': + sentence.append((word, tags)) - if parts[2] == 'PUNC' and word in {'#', '.', '؟', '!'}: - if len(sentence) > 1: - yield sentence - sentence = [] + if parts[2] == 'PUNC' and word in {'#', '.', '؟', '!'}: + if len(sentence) > 1: + yield sentence + sentence = [] - def sents(self): - map_pos = lambda item: (item[0], self._pos_map(item[1].split(','))) + def sents(self): + map_pos = lambda item: (item[0], self._pos_map(item[1].split(','))) - for document in self.docs(): - for sentence in self.doc_to_sents(document): - if self._joined_verb_parts: - sentence = join_verb_parts(sentence) + for document in self.docs(): + for sentence in self.doc_to_sents(document): + if self._joined_verb_parts: + sentence = join_verb_parts(sentence) - yield list(map(map_pos, sentence)) + yield list(map(map_pos, sentence)) diff --git a/hazm/QuranCorpusReader.py b/hazm/QuranCorpusReader.py index d8e6df80..50a7c3fe 100644 --- a/hazm/QuranCorpusReader.py +++ b/hazm/QuranCorpusReader.py @@ -1,58 +1,62 @@ # coding: utf8 from __future__ import unicode_literals + import codecs + from .utils import maketrans -buckwalter_transliteration = maketrans("'>&<}AbptvjHxd*rzs$SDTZEg_fqklmnhwYyFNKaui~o^#`{:@\"[;,.!-+%]", "\u0621\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652\u0653\u0654\u0670\u0671\u06dc\u06df\u06e0\u06e2\u06e3\u06e5\u06e6\u06e8\u06ea\u06eb\u06ec\u06ed") +buckwalter_transliteration = maketrans("'>&<}AbptvjHxd*rzs$SDTZEg_fqklmnhwYyFNKaui~o^#`{:@\"[;,.!-+%]", + "\u0621\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652\u0653\u0654\u0670\u0671\u06dc\u06df\u06e0\u06e2\u06e3\u06e5\u06e6\u06e8\u06ea\u06eb\u06ec\u06ed") class QuranCorpusReader(): - """ - interfaces [Quran Corpus](http://corpus.quran.com/download/) that you must download and extract it. 
- - >>> quran = QuranCorpusReader(quran_file='corpora/quranic-corpus-morphology.txt') - >>> print(next(quran.words())[1]) - بِسْمِ - """ - - def __init__(self, quran_file): - self._quran_file = quran_file - - def parts(self): - for line in codecs.open(self._quran_file): - if not line.startswith('('): - continue - parts = line.strip().split('\t') - - part = {'loc': eval(parts[0].replace(':', ',')), 'text': parts[1].translate(buckwalter_transliteration), 'tag': parts[2]} - - features = parts[3].split('|') - for feature in features: - if feature.startswith('LEM:'): - part['lem'] = feature[4:].translate(buckwalter_transliteration) - elif feature.startswith('ROOT:'): - part['root'] = feature[5:].translate(buckwalter_transliteration) - yield part - - def words(self): - - def word_item(location, parts): - text = ''.join([part['text'] for part in parts]) - tag = '-'.join([part['tag'] for part in parts]) - lem = '-'.join([part['lem'] for part in parts if 'lem' in part]) - root = '-'.join([part['root'] for part in parts if 'root' in part]) - return '.'.join(map(str, location)), text, lem, root, tag, parts - - last_location = (0, 0, 0, 0) - items = [] - for part in self.parts(): - if last_location[:3] == part['loc'][:3]: - items.append(part) - else: - if items: - yield word_item(last_location[:3], items) - items = [part] - last_location = part['loc'] - del part['loc'] - yield word_item(last_location[:3], items) + """ + interfaces [Quran Corpus](http://corpus.quran.com/download/) that you must download and extract it. + + >>> quran = QuranCorpusReader(quran_file='corpora/quranic-corpus-morphology.txt') + >>> print(next(quran.words())[1]) + بِسْمِ + """ + + def __init__(self, quran_file): + self._quran_file = quran_file + + def parts(self): + for line in codecs.open(self._quran_file): + if not line.startswith('('): + continue + parts = line.strip().split('\t') + + part = {'loc': eval(parts[0].replace(':', ',')), 'text': parts[1].translate(buckwalter_transliteration), + 'tag': parts[2]} + + features = parts[3].split('|') + for feature in features: + if feature.startswith('LEM:'): + part['lem'] = feature[4:].translate(buckwalter_transliteration) + elif feature.startswith('ROOT:'): + part['root'] = feature[5:].translate(buckwalter_transliteration) + yield part + + def words(self): + + def word_item(location, parts): + text = ''.join([part['text'] for part in parts]) + tag = '-'.join([part['tag'] for part in parts]) + lem = '-'.join([part['lem'] for part in parts if 'lem' in part]) + root = '-'.join([part['root'] for part in parts if 'root' in part]) + return '.'.join(map(str, location)), text, lem, root, tag, parts + + last_location = (0, 0, 0, 0) + items = [] + for part in self.parts(): + if last_location[:3] == part['loc'][:3]: + items.append(part) + else: + if items: + yield word_item(last_location[:3], items) + items = [part] + last_location = part['loc'] + del part['loc'] + yield word_item(last_location[:3], items) diff --git a/hazm/SentenceTokenizer.py b/hazm/SentenceTokenizer.py index cafbcffe..d3a7fb4e 100644 --- a/hazm/SentenceTokenizer.py +++ b/hazm/SentenceTokenizer.py @@ -1,20 +1,22 @@ # coding: utf-8 from __future__ import unicode_literals + import re + from nltk.tokenize.api import TokenizerI class SentenceTokenizer(TokenizerI): - """ - >>> tokenizer = SentenceTokenizer() - >>> tokenizer.tokenize('جدا کردن ساده است. تقریبا البته!') - ['جدا کردن ساده است.', 'تقریبا البته!'] - """ + """ + >>> tokenizer = SentenceTokenizer() + >>> tokenizer.tokenize('جدا کردن ساده است. 
تقریبا البته!') + ['جدا کردن ساده است.', 'تقریبا البته!'] + """ - def __init__(self): - self.pattern = re.compile(r'([!\.\?⸮؟]+)[ \n]+') + def __init__(self): + self.pattern = re.compile(r'([!\.\?⸮؟]+)[ \n]+') - def tokenize(self, text): - text = self.pattern.sub(r'\1\n\n', text) - return [sentence.replace('\n', ' ').strip() for sentence in text.split('\n\n') if sentence.strip()] + def tokenize(self, text): + text = self.pattern.sub(r'\1\n\n', text) + return [sentence.replace('\n', ' ').strip() for sentence in text.split('\n\n') if sentence.strip()] diff --git a/hazm/SentiPersReader.py b/hazm/SentiPersReader.py index 72068945..e7249d51 100644 --- a/hazm/SentiPersReader.py +++ b/hazm/SentiPersReader.py @@ -1,61 +1,68 @@ # coding: utf-8 from __future__ import unicode_literals, print_function -import os, sys, itertools + +import itertools +import os +import sys from xml.dom import minidom class SentiPersReader(): - """ - interfaces [SentiPers Corpus](http://nlp.guilan.ac.ir/Dataset.aspx) - - >>> sentipers = SentiPersReader(root='corpora/sentipers') - >>> next(sentipers.comments())[0][1] - 'بيشتر مناسب است براي کساني که به دنبال تنوع هستند و در همه چيز نو گرايي دارند .' - """ - - def __init__(self, root): - self._root = root - - def docs(self): - - def element_sentences(element): - for sentence in element.getElementsByTagName('Sentence'): - yield {'text': sentence.childNodes[0].data, 'id': sentence.getAttribute('ID'), 'value': int(sentence.getAttribute('Value')) if comment.getAttribute('Value') else None} - - for root, dirs, files in os.walk(self._root): - for filename in sorted(files): - try: - elements = minidom.parse(os.path.join(root, filename)) - - product = elements.getElementsByTagName('Product')[0] - doc = { - 'Title': product.getAttribute('Title'), - 'Type': product.getAttribute('Type'), - 'comments': [], - } - - for child in product.childNodes: - if child.nodeName in {'Voters', 'Performance', 'Capability', 'Production_Quality', 'Ergonomics', 'Purchase_Value'}: - value = child.getAttribute('Value') - doc[child.nodeName] = float(value) if '.' in value else int(value) - - for comment in itertools.chain(elements.getElementsByTagName('Opinion'), elements.getElementsByTagName('Criticism')): - doc['comments'].append({ - 'id': comment.getAttribute('ID'), - 'type': comment.nodeName, - 'author': comment.getAttribute('Holder').strip(), - 'value': int(comment.getAttribute('Value')) if comment.getAttribute('Value') else None, - 'sentences': list(element_sentences(comment)) - }) - - # todo: Accessories, Features, Review, Advantages, Tags, Keywords, Index - - yield doc - - except Exception as e: - print('error in reading', filename, e, file=sys.stderr) - - def comments(self): - for doc in self.docs(): - yield [[sentence['text'] for sentence in text] for text in [comment['sentences'] for comment in doc['comments']]] + """ + interfaces [SentiPers Corpus](http://nlp.guilan.ac.ir/Dataset.aspx) + + >>> sentipers = SentiPersReader(root='corpora/sentipers') + >>> next(sentipers.comments())[0][1] + 'بيشتر مناسب است براي کساني که به دنبال تنوع هستند و در همه چيز نو گرايي دارند .' 
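+
+    A rough usage sketch (not a doctest, and grounded only in the fields that
+    docs() itself builds): flattening the documents into (sentence text,
+    comment polarity) pairs, where 'value' may be None when missing:
+
+        labelled = []
+        for doc in sentipers.docs():
+            for comment in doc['comments']:
+                for sentence in comment['sentences']:
+                    labelled.append((sentence['text'], comment['value']))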
+ """ + + def __init__(self, root): + self._root = root + + def docs(self): + + def element_sentences(element): + for sentence in element.getElementsByTagName('Sentence'): + yield {'text': sentence.childNodes[0].data, 'id': sentence.getAttribute('ID'), + 'value': int(sentence.getAttribute('Value')) if comment.getAttribute('Value') else None} + + for root, dirs, files in os.walk(self._root): + for filename in sorted(files): + try: + elements = minidom.parse(os.path.join(root, filename)) + + product = elements.getElementsByTagName('Product')[0] + doc = { + 'Title': product.getAttribute('Title'), + 'Type': product.getAttribute('Type'), + 'comments': [], + } + + for child in product.childNodes: + if child.nodeName in {'Voters', 'Performance', 'Capability', 'Production_Quality', 'Ergonomics', + 'Purchase_Value'}: + value = child.getAttribute('Value') + doc[child.nodeName] = float(value) if '.' in value else int(value) + + for comment in itertools.chain(elements.getElementsByTagName('Opinion'), + elements.getElementsByTagName('Criticism')): + doc['comments'].append({ + 'id': comment.getAttribute('ID'), + 'type': comment.nodeName, + 'author': comment.getAttribute('Holder').strip(), + 'value': int(comment.getAttribute('Value')) if comment.getAttribute('Value') else None, + 'sentences': list(element_sentences(comment)) + }) + + # todo: Accessories, Features, Review, Advantages, Tags, Keywords, Index + + yield doc + + except Exception as e: + print('error in reading', filename, e, file=sys.stderr) + + def comments(self): + for doc in self.docs(): + yield [[sentence['text'] for sentence in text] for text in + [comment['sentences'] for comment in doc['comments']]] diff --git a/hazm/SequenceTagger.py b/hazm/SequenceTagger.py index 66e88268..c31b4016 100644 --- a/hazm/SequenceTagger.py +++ b/hazm/SequenceTagger.py @@ -1,57 +1,59 @@ # coding: utf-8 from __future__ import unicode_literals -from nltk.tag.api import TaggerI + from nltk.metrics import accuracy +from nltk.tag.api import TaggerI class SequenceTagger(TaggerI): - """ wrapper for [Wapiti](http://wapiti.limsi.fr) sequence tagger + """ wrapper for [Wapiti](http://wapiti.limsi.fr) sequence tagger - >>> tagger = SequenceTagger(patterns=['*', 'u:word-%x[0,0]']) - >>> tagger.train([[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]]) - >>> tagger.tag_sents([['من', 'به', 'مدرسه', 'رفته_بودم', '.']]) - [[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]] + >>> tagger = SequenceTagger(patterns=['*', 'u:word-%x[0,0]']) + >>> tagger.train([[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]]) + >>> tagger.tag_sents([['من', 'به', 'مدرسه', 'رفته_بودم', '.']]) + [[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]] - >>> tagger.save_model('resources/test.model') - >>> SequenceTagger(model='resources/test.model').tag_sents([['من', 'به', 'مدرسه', 'رفته_بودم', '.']]) - [[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]] - """ + >>> tagger.save_model('resources/test.model') + >>> SequenceTagger(model='resources/test.model').tag_sents([['من', 'به', 'مدرسه', 'رفته_بودم', '.']]) + [[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]] + """ - def __init__(self, patterns=[], **options): - from wapiti import Model - self.model = Model(patterns='\n'.join(patterns), **options) + def __init__(self, patterns=[], **options): + from wapiti import Model + self.model = Model(patterns='\n'.join(patterns), 
**options) - def train(self, sentences): - self.model.train(['\n'.join([' '.join(word) for word in sentence]) for sentence in sentences]) + def train(self, sentences): + self.model.train(['\n'.join([' '.join(word) for word in sentence]) for sentence in sentences]) - def save_model(self, filename): - self.model.save(filename) + def save_model(self, filename): + self.model.save(filename) - def tag_sents(self, sentences): - sentences = list(sentences) - lines = '\n\n'.join(['\n'.join(sentence) for sentence in sentences]).replace(' ', '_') - results = self.model.label_sequence(lines).decode('utf8') - tags = iter(results.strip().split('\n')) - return [[(word, next(tags)) for word in sentence] for sentence in sentences] + def tag_sents(self, sentences): + sentences = list(sentences) + lines = '\n\n'.join(['\n'.join(sentence) for sentence in sentences]).replace(' ', '_') + results = self.model.label_sequence(lines).decode('utf8') + tags = iter(results.strip().split('\n')) + return [[(word, next(tags)) for word in sentence] for sentence in sentences] class IOBTagger(SequenceTagger): - """ wrapper for [Wapiti](http://wapiti.limsi.fr) sequence tagger - - >>> tagger = IOBTagger(patterns=['*', 'U:word-%x[0,0]', 'U:word-%x[0,1]']) - >>> tagger.train([[('من', 'PRO', 'B-NP'), ('به', 'P', 'B-PP'), ('مدرسه', 'N', 'B-NP'), ('رفته_بودم', 'V', 'B-VP'), ('.', 'PUNC', 'O')]]) - >>> tagger.tag_sents([[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]]) - [[('من', 'PRO', 'B-NP'), ('به', 'P', 'B-PP'), ('مدرسه', 'N', 'B-NP'), ('رفته_بودم', 'V', 'B-VP'), ('.', 'PUNC', 'O')]] - """ - - def tag_sents(self, sentences): - sentences = list(sentences) - lines = '\n\n'.join(['\n'.join(['\t'.join(word) for word in sentence]) for sentence in sentences]).replace(' ', '_') - results = self.model.label_sequence(lines).decode('utf8') - tags = iter(results.strip().split('\n')) - return [[word + (next(tags),) for word in sentence] for sentence in sentences] - - def evaluate(self, gold): - tagged_sents = self.tag_sents(([word[:-1] for word in sentence] for sentence in gold)) - return accuracy(sum(gold, []), sum(tagged_sents, [])) + """ wrapper for [Wapiti](http://wapiti.limsi.fr) sequence tagger + + >>> tagger = IOBTagger(patterns=['*', 'U:word-%x[0,0]', 'U:word-%x[0,1]']) + >>> tagger.train([[('من', 'PRO', 'B-NP'), ('به', 'P', 'B-PP'), ('مدرسه', 'N', 'B-NP'), ('رفته_بودم', 'V', 'B-VP'), ('.', 'PUNC', 'O')]]) + >>> tagger.tag_sents([[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]]) + [[('من', 'PRO', 'B-NP'), ('به', 'P', 'B-PP'), ('مدرسه', 'N', 'B-NP'), ('رفته_بودم', 'V', 'B-VP'), ('.', 'PUNC', 'O')]] + """ + + def tag_sents(self, sentences): + sentences = list(sentences) + lines = '\n\n'.join(['\n'.join(['\t'.join(word) for word in sentence]) for sentence in sentences]).replace(' ', + '_') + results = self.model.label_sequence(lines).decode('utf8') + tags = iter(results.strip().split('\n')) + return [[word + (next(tags),) for word in sentence] for sentence in sentences] + + def evaluate(self, gold): + tagged_sents = self.tag_sents(([word[:-1] for word in sentence] for sentence in gold)) + return accuracy(sum(gold, []), sum(tagged_sents, [])) diff --git a/hazm/Stemmer.py b/hazm/Stemmer.py index 0b5304e4..f774422f 100644 --- a/hazm/Stemmer.py +++ b/hazm/Stemmer.py @@ -1,35 +1,36 @@ # coding: utf-8 from __future__ import unicode_literals + from nltk.stem.api import StemmerI class Stemmer(StemmerI): - """ - >>> stemmer = Stemmer() - >>> stemmer.stem('کتابی') - 'کتاب' - >>> 
stemmer.stem('کتاب‌ها') - 'کتاب' - >>> stemmer.stem('کتاب‌هایی') - 'کتاب' - >>> stemmer.stem('کتابهایشان') - 'کتاب' - >>> stemmer.stem('اندیشه‌اش') - 'اندیشه' - >>> stemmer.stem('خانۀ') - 'خانه' - """ - - def __init__(self): - self.ends = ['ات', 'ان', 'ترین', 'تر', 'م', 'ت', 'ش', 'یی', 'ی', 'ها', 'ٔ', '‌ا', '‌'] - - def stem(self, word): - for end in self.ends: - if word.endswith(end): - word = word[:-len(end)] - - if word.endswith('ۀ'): - word = word[:-1] + 'ه' - - return word + """ + >>> stemmer = Stemmer() + >>> stemmer.stem('کتابی') + 'کتاب' + >>> stemmer.stem('کتاب‌ها') + 'کتاب' + >>> stemmer.stem('کتاب‌هایی') + 'کتاب' + >>> stemmer.stem('کتابهایشان') + 'کتاب' + >>> stemmer.stem('اندیشه‌اش') + 'اندیشه' + >>> stemmer.stem('خانۀ') + 'خانه' + """ + + def __init__(self): + self.ends = ['ات', 'ان', 'ترین', 'تر', 'م', 'ت', 'ش', 'یی', 'ی', 'ها', 'ٔ', '‌ا', '‌'] + + def stem(self, word): + for end in self.ends: + if word.endswith(end): + word = word[:-len(end)] + + if word.endswith('ۀ'): + word = word[:-1] + 'ه' + + return word diff --git a/hazm/TNewsReader.py b/hazm/TNewsReader.py index e3b8ced9..70aca5ab 100644 --- a/hazm/TNewsReader.py +++ b/hazm/TNewsReader.py @@ -1,55 +1,58 @@ # coding: utf-8 from __future__ import print_function -import os, sys, re + +import os +import re +import sys from xml.dom import minidom class TNewsReader(): - """ - interfaces [TNews Corpus](http://datasets.tnews.ir/downloads/) that you must download and extract. - - >>> tnews = TNewsReader(root='corpora/tnews') - >>> next(tnews.docs())['id'] - '14092303482300013653' - """ - - def __init__(self, root): - self._root = root - self.cleaner = re.compile(r'<[^<>]+>') - - def docs(self): - def get_text(element): - raw_html = element.childNodes[0].data if element.childNodes else '' - cleaned_text = re.sub(self.cleaner, '', raw_html) - return cleaned_text - - for root, dirs, files in os.walk(self._root): - for name in sorted(files): - - try: - content = open(os.path.join(root, name)).read() - - # fix xml formating issue - content = re.sub(r'[  ]', '', content).replace('', '') + '' - - elements = minidom.parseString(content) - for element in elements.getElementsByTagName('NEWS'): - doc = {} - doc['id'] = get_text(element.getElementsByTagName('NEWSID')[0]) - doc['url'] = get_text(element.getElementsByTagName('URL')[0]) - doc['datetime'] = get_text(element.getElementsByTagName('UTCDATE')[0]) - doc['category'] = get_text(element.getElementsByTagName('CATEGORY')[0]) - doc['pre-title'] = get_text(element.getElementsByTagName('PRETITLE')[0]) - doc['title'] = get_text(element.getElementsByTagName('TITLE')[0]) - doc['post-title'] = get_text(element.getElementsByTagName('POSTTITLE')[0]) - doc['brief'] = get_text(element.getElementsByTagName('BRIEF')[0]) - doc['text'] = get_text(element.getElementsByTagName('DESCRIPTION')[0]) - yield doc - - except Exception as e: - print('error in reading', name, e, file=sys.stderr) - - def texts(self): - for doc in self.docs(): - yield doc['text'] + """ + interfaces [TNews Corpus](http://datasets.tnews.ir/downloads/) that you must download and extract. 
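+
+    A rough usage sketch (not a doctest): counting documents per category,
+    relying only on the 'category' field that docs() fills in, with the same
+    illustrative corpus path as the example that follows:
+
+        from collections import Counter
+        reader = TNewsReader(root='corpora/tnews')
+        counts = Counter(doc['category'] for doc in reader.docs())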
+ + >>> tnews = TNewsReader(root='corpora/tnews') + >>> next(tnews.docs())['id'] + '14092303482300013653' + """ + + def __init__(self, root): + self._root = root + self.cleaner = re.compile(r'<[^<>]+>') + + def docs(self): + def get_text(element): + raw_html = element.childNodes[0].data if element.childNodes else '' + cleaned_text = re.sub(self.cleaner, '', raw_html) + return cleaned_text + + for root, dirs, files in os.walk(self._root): + for name in sorted(files): + + try: + content = open(os.path.join(root, name)).read() + + # fix xml formating issue + content = re.sub(r'[  ]', '', content).replace('', '') + '' + + elements = minidom.parseString(content) + for element in elements.getElementsByTagName('NEWS'): + doc = {} + doc['id'] = get_text(element.getElementsByTagName('NEWSID')[0]) + doc['url'] = get_text(element.getElementsByTagName('URL')[0]) + doc['datetime'] = get_text(element.getElementsByTagName('UTCDATE')[0]) + doc['category'] = get_text(element.getElementsByTagName('CATEGORY')[0]) + doc['pre-title'] = get_text(element.getElementsByTagName('PRETITLE')[0]) + doc['title'] = get_text(element.getElementsByTagName('TITLE')[0]) + doc['post-title'] = get_text(element.getElementsByTagName('POSTTITLE')[0]) + doc['brief'] = get_text(element.getElementsByTagName('BRIEF')[0]) + doc['text'] = get_text(element.getElementsByTagName('DESCRIPTION')[0]) + yield doc + + except Exception as e: + print('error in reading', name, e, file=sys.stderr) + + def texts(self): + for doc in self.docs(): + yield doc['text'] diff --git a/hazm/TokenSplitter.py b/hazm/TokenSplitter.py index c2c0f466..4263daf7 100644 --- a/hazm/TokenSplitter.py +++ b/hazm/TokenSplitter.py @@ -1,35 +1,37 @@ # coding: utf-8 from __future__ import unicode_literals + from .Lemmatizer import Lemmatizer class TokenSplitter(): - def __init__(self): - self.lemmatizer = Lemmatizer() - self.lemmatize = self.lemmatizer.lemmatize - self.words = self.lemmatizer.words - - def split_token_words(self, token): - """ - >>> splitter = TokenSplitter() - >>> splitter.split_token_words('صداوسیماجمهوری') - [('صداوسیما', 'جمهوری')] - >>> splitter.split_token_words('صداو') - [('صد', 'او'), ('صدا', 'و')] - >>> splitter.split_token_words('شهرموشها') - [('شهر', 'موشها')] - >>> splitter.split_token_words('داستان‌سرا') - [('داستان', 'سرا'), ('داستان‌سرا',)] - >>> splitter.split_token_words('دستان‌سرا') - [('دستان', 'سرا')] - """ - - candidates = [] - if '‌' in token: - candidates.append(tuple(token.split('‌'))) - - splits = [(token[:s], token[s:]) for s in range(1, len(token)) if token[s-1] != '‌' and token[s] != '‌'] + [(token, )] - candidates.extend(list(filter(lambda tokens: set(map(self.lemmatize, tokens)).issubset(self.words), splits))) - - return candidates + def __init__(self): + self.lemmatizer = Lemmatizer() + self.lemmatize = self.lemmatizer.lemmatize + self.words = self.lemmatizer.words + + def split_token_words(self, token): + """ + >>> splitter = TokenSplitter() + >>> splitter.split_token_words('صداوسیماجمهوری') + [('صداوسیما', 'جمهوری')] + >>> splitter.split_token_words('صداو') + [('صد', 'او'), ('صدا', 'و')] + >>> splitter.split_token_words('شهرموشها') + [('شهر', 'موشها')] + >>> splitter.split_token_words('داستان‌سرا') + [('داستان', 'سرا'), ('داستان‌سرا',)] + >>> splitter.split_token_words('دستان‌سرا') + [('دستان', 'سرا')] + """ + + candidates = [] + if '‌' in token: + candidates.append(tuple(token.split('‌'))) + + splits = [(token[:s], token[s:]) for s in range(1, len(token)) if token[s - 1] != '‌' and token[s] != '‌'] + [ + (token,)] + 
candidates.extend(list(filter(lambda tokens: set(map(self.lemmatize, tokens)).issubset(self.words), splits))) + + return candidates diff --git a/hazm/TreebankReader.py b/hazm/TreebankReader.py index 32421a5f..cee5b7c3 100644 --- a/hazm/TreebankReader.py +++ b/hazm/TreebankReader.py @@ -1,258 +1,274 @@ # coding: utf-8 from __future__ import unicode_literals, print_function -import os, sys, re, codecs + +import codecs +import os +import re +import sys from xml.dom import minidom + from nltk.tree import Tree + from .WordTokenizer import WordTokenizer def coarse_pos_e(tags): - """ - Coarse POS tags of Treebank corpus: - N: Noun, V: Verb, A: Adjective, D: Adverb, Z: Pronoun, T: Determiner, E: Preposition, P: Postposition, U: Number, J: Conjunction, O: Punctuation, R: Residual, L: Classifier, I: Interjection - - >>> coarse_pos_e(['Nasp---', 'pers', 'prop']) - 'N' - """ - - map = {'N': 'N', 'V': 'V', 'A': 'AJ', 'D': 'ADV', 'Z': 'PRO', 'T': 'DET', 'E': 'P', 'P': 'POSTP', 'U': 'NUM', 'J': 'CONJ', 'O': 'PUNC', 'R': 'RES', 'L': 'CL', 'I': 'INT'} - try: - if tags[0][0] == 'C': - if 'pronominal' in tags: - tags[0] = 'Z' - elif 'verb' in tags: - tags[0] = 'V' - elif 'prep' in tags: - tags[0] = 'E' - elif 'adv' in tags: - tags[0] = 'D' - elif 'det' in tags: - tags[0] = 'T' - return map[tags[0][0]] + ('e' if 'ezafe' in tags else '') - except Exception: - return '' + """ + Coarse POS tags of Treebank corpus: + N: Noun, V: Verb, A: Adjective, D: Adverb, Z: Pronoun, T: Determiner, E: Preposition, P: Postposition, U: Number, J: Conjunction, O: Punctuation, R: Residual, L: Classifier, I: Interjection + + >>> coarse_pos_e(['Nasp---', 'pers', 'prop']) + 'N' + """ + + map = {'N': 'N', 'V': 'V', 'A': 'AJ', 'D': 'ADV', 'Z': 'PRO', 'T': 'DET', 'E': 'P', 'P': 'POSTP', 'U': 'NUM', + 'J': 'CONJ', 'O': 'PUNC', 'R': 'RES', 'L': 'CL', 'I': 'INT'} + try: + if tags[0][0] == 'C': + if 'pronominal' in tags: + tags[0] = 'Z' + elif 'verb' in tags: + tags[0] = 'V' + elif 'prep' in tags: + tags[0] = 'E' + elif 'adv' in tags: + tags[0] = 'D' + elif 'det' in tags: + tags[0] = 'T' + return map[tags[0][0]] + ('e' if 'ezafe' in tags else '') + except Exception: + return '' class TreebankReader(): - """ - interfaces [Per­si­an Tree­bank](http://hpsg.fu-berlin.de/~ghayoomi/PTB.html) - - >>> treebank = TreebankReader(root='corpora/treebank') - >>> print(next(treebank.trees())) - (S - (VPS - (NPC (N دنیای/Ne) (MN (N آدولف/N) (N بورن/N))) - (VPC - (NPC (N دنیای/Ne) (NPA (N اتفاقات/Ne) (ADJ رویایی/AJ))) - (V است/V))) - (PUNC ./PUNC)) - - >>> next(treebank.sents()) - [('دنیای', 'Ne'), ('آدولف', 'N'), ('بورن', 'N'), ('دنیای', 'Ne'), ('اتفاقات', 'Ne'), ('رویایی', 'AJ'), ('است', 'V'), ('.', 'PUNC')] - - >>> from .Chunker import tree2brackets - >>> tree2brackets(next(treebank.chunked_trees())) - '[دنیای آدولف بورن NP] [دنیای اتفاقات رویایی NP] [است VP] .' 
- """ - - def __init__(self, root, pos_map=coarse_pos_e, join_clitics=False, join_verb_parts=False): - self._root = root - self._pos_map = pos_map if pos_map else lambda tags: ','.join(tags) - self._join_clitics = join_clitics - self._join_verb_parts = join_verb_parts - self._tokenizer = WordTokenizer() - - def docs(self): - for root, dirs, files in os.walk(self._root): - for name in sorted(files): - try: - with codecs.open(os.path.join(root, name), encoding='utf8') as treebank_file: - raw = re.sub(r'\n *', '', treebank_file.read()) - yield minidom.parseString(raw.encode('utf8')) - except Exception as e: - print('error in reading', name, e, file=sys.stderr) - - def trees(self): - - def traverse(node): - def extract_tags(W): - pos = [W.getAttribute('lc') if W.getAttribute('lc') else None] - if W.getAttribute('clitic') in {'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'}: - pos.append(W.getAttribute('clitic')) - if W.getAttribute('ne_sort'): - pos.append(W.getAttribute('ne_sort')) - if W.getAttribute('n_type'): - pos.append(W.getAttribute('n_type')) - if W.getAttribute('ya_type'): - pos.append(W.getAttribute('ya_type')) - if W.getAttribute('ke_type'): - pos.append(W.getAttribute('ke_type')) - if W.getAttribute('type'): - pos.append(W.getAttribute('type')) - if W.getAttribute('kind'): - pos.append(W.getAttribute('kind')) - return pos - - def clitic_join(tree, clitic): - if type(tree[-1]) == Tree: - return clitic_join(tree[-1], clitic) - else: - if(clitic[0][0][0] == 'ا'): - clitic[0] = ('‌' + clitic[0][0], clitic[0][1]) - tree[-1]=(tree[-1][0] + clitic[0][0], clitic[0][1]) - tree.set_label('CLITICS') - return - - if not len(node.childNodes): - return - first = node.childNodes[0] - if first.tagName == 'w': - pos=extract_tags(first) - return Tree(node.tagName, [(first.childNodes[0].data.replace('می ', 'می‌'), self._pos_map(pos))]) - childs = node.childNodes[2:] if node.tagName == 'S' else node.childNodes - for child in childs: - if not len(child.childNodes): - childs.remove(child) - tree = Tree(node.tagName, map(traverse, childs)) - if self._join_clitics and len(tree) > 1 and type(tree[1]) == Tree and tree[1].label() == 'CLITIC' and tree[1][0][1] not in {'P', 'V'}: - clitic=tree[-1] - tree = Tree(tree.label(), [subtree for subtree in tree[0]]) - clitic_join(tree, clitic) - if self._join_verb_parts and len(tree) > 1 and type(tree[1]) == Tree and type(tree[0]) == Tree and tree[0].label() == 'AUX' and tree[0][0][0] in self._tokenizer.before_verbs: - tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1]) - tree.remove(tree[0]) - if self._join_verb_parts and len(tree.leaves()) > 1 and tree.leaves()[-1][0] in self._tokenizer.after_verbs and tree.leaves()[-2][0] in self._tokenizer.verbe : - tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1]) - path = tree.leaf_treeposition(len(tree.leaves())-2) - removingtree = tree - while len(path) > 2 : - removingtree = removingtree[path[0]] - path = path[1:] - removingtree.remove(Tree(tree.pos()[-2][1],[tree.pos()[-2][0]])) - if self._join_verb_parts and len(tree.leaves()) > 1 and tree.leaves()[-1][0] in self._tokenizer.after_verbs and tree.leaves()[-2][0] in self._tokenizer.verbe : - tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1]) - path = tree.leaf_treeposition(len(tree.leaves())-2) - removingtree = tree - while len(path) > 2 : - removingtree = removingtree[path[0]] - path = path[1:] - removingtree.remove(Tree(tree.pos()[-2][1],[tree.pos()[-2][0]])) - return tree - - for doc in self.docs(): - for 
S in doc.getElementsByTagName('S'): - yield traverse(S) - - def sents(self): - for tree in self.trees(): - yield tree.leaves() - - - def chunked_trees(self): - collapse = lambda node, label: Tree(label, [Tree(pos[1], [pos[0]]) for pos in node.pos()]) - - def traverse(node, parent, chunks): - label = node.label() - - if label.count('-nid') > 0: - label = label.replace('-nid', '') - if label.count('-nid') > 0: - label = label.replace('-nid', '') - if label.count('-DiscA') > 0: - label = label.replace('-DiscA', '') - - if label in {'CLITIC', 'CLITICS'}: - if node[0][1] == 'V': - label = 'V' - elif node[0][1] == 'P': - label = 'PREP' - elif node[0][1] == 'DET': - label = 'DET' - elif node[0][1] == 'ADV': - label = 'ADV' - elif node[0][1] == 'PRO': - label = 'PRON' - - if label in {'CONJ', 'PUNC'} and len(node) == 1: - chunks.append(node) - return - - if label == 'PPC' and len(node) == 1: - chunks.append(Tree('PP', [node[0]])) - return - - if label == 'PREP': - chunks.append(Tree('PP', [node])) - return - - if label == 'PostP': - chunks.append(Tree('POSTP', [node])) - return - - for leaf in node.pos(): - if leaf[1] in {'PUNC', 'CONJ', 'PREP', 'PostP'}: - for i in range(len(node)): - traverse(node[i], node, chunks) - return - - if label == 'NPA' and parent.label() in {'CPC', 'PPC'}: - chunks.append(collapse(node, 'NP')) - return - - if label == 'NPA' and len(node)>=1: - if node[0].label() == 'ADV': - chunks.append(collapse(node, 'NP')) - return - - if label in {'NPC', 'N', 'INFV', 'DPA', 'CLASS', 'DPC', 'DEM', 'INTJ', 'MN', 'PRON', 'DET', 'NUM', 'RES'}: - chunks.append(collapse(node, 'NP')) - return - - if label == 'NPA' and len(node) >= 2: - if node[0].label() == 'ADJ' and node[1].label() == 'NPC' or node[0].label() in {'N', 'PRON'} and node[1].label() in {'ADJ', 'ADJPA', 'N'} or node[0].label() == 'NUM' and node[1].label() in {'N', 'NPC', 'MN', 'NUM'} or node[0].label() in {'N', 'NPC', 'MN'} and node[1].label() == 'NUM' or node[0].label() == 'NPC' and node[1].label() == 'ADJ' or node[0].label() == 'NPA' and node[1].label() != 'NPC' or node[1].label() == 'NPA' and node[0].label() != 'NPC': - chunks.append(collapse(node, 'NP')) - return - - if label == 'DPC' and len(node) >= 2: - chunkable = True - for leaf in node[1].pos(): - if leaf[1] in {'PUNC', 'CONJ', 'PREP', 'PostP'}: - chunkable = False - if node[1].label() in {'N', 'NPA', 'NPC'} and chunkable: - chunks.append(collapse(node, 'NP')) - return - - if label == 'DPA' and len(node)>=2: - if node[1].label() == 'ADV': - chunks.append(collapse(node, 'ADVP')) - return - - if label in {'MV', 'V', 'AUX', 'PPARV'}: - chunks.append(Tree('VP', [node])) - return - - if label in {'ADJ', 'ADJPC', 'MADJ', 'ADVPA'}: - chunks.append(Tree('ADJP', [node])) - return - - if label in {'ADV', 'MADV', 'ADVPC'}: - chunks.append(Tree('ADVP', [node])) - return - - if type(node[0]) != Tree: - chunks.append(node) - return - - for i in range(len(node)): - traverse(node[i], node, chunks) - - for tree in self.trees(): - chunks = [] - traverse(tree, None, chunks) - for i in range(len(chunks)): - if chunks[i].label() in {'PUNC', 'CONJ'}: - chunks[i] = chunks[i][0] - else: - chunks[i] = Tree(chunks[i].label(), chunks[i].leaves()) - yield Tree('S', chunks) + """ + interfaces [Per­si­an Tree­bank](http://hpsg.fu-berlin.de/~ghayoomi/PTB.html) + + >>> treebank = TreebankReader(root='corpora/treebank') + >>> print(next(treebank.trees())) + (S + (VPS + (NPC (N دنیای/Ne) (MN (N آدولف/N) (N بورن/N))) + (VPC + (NPC (N دنیای/Ne) (NPA (N اتفاقات/Ne) (ADJ رویایی/AJ))) + (V است/V))) + 
(PUNC ./PUNC)) + + >>> next(treebank.sents()) + [('دنیای', 'Ne'), ('آدولف', 'N'), ('بورن', 'N'), ('دنیای', 'Ne'), ('اتفاقات', 'Ne'), ('رویایی', 'AJ'), ('است', 'V'), ('.', 'PUNC')] + + >>> from .Chunker import tree2brackets + >>> tree2brackets(next(treebank.chunked_trees())) + '[دنیای آدولف بورن NP] [دنیای اتفاقات رویایی NP] [است VP] .' + """ + + def __init__(self, root, pos_map=coarse_pos_e, join_clitics=False, join_verb_parts=False): + self._root = root + self._pos_map = pos_map if pos_map else lambda tags: ','.join(tags) + self._join_clitics = join_clitics + self._join_verb_parts = join_verb_parts + self._tokenizer = WordTokenizer() + + def docs(self): + for root, dirs, files in os.walk(self._root): + for name in sorted(files): + try: + with codecs.open(os.path.join(root, name), encoding='utf8') as treebank_file: + raw = re.sub(r'\n *', '', treebank_file.read()) + yield minidom.parseString(raw.encode('utf8')) + except Exception as e: + print('error in reading', name, e, file=sys.stderr) + + def trees(self): + + def traverse(node): + def extract_tags(W): + pos = [W.getAttribute('lc') if W.getAttribute('lc') else None] + if W.getAttribute('clitic') in {'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'}: + pos.append(W.getAttribute('clitic')) + if W.getAttribute('ne_sort'): + pos.append(W.getAttribute('ne_sort')) + if W.getAttribute('n_type'): + pos.append(W.getAttribute('n_type')) + if W.getAttribute('ya_type'): + pos.append(W.getAttribute('ya_type')) + if W.getAttribute('ke_type'): + pos.append(W.getAttribute('ke_type')) + if W.getAttribute('type'): + pos.append(W.getAttribute('type')) + if W.getAttribute('kind'): + pos.append(W.getAttribute('kind')) + return pos + + def clitic_join(tree, clitic): + if type(tree[-1]) == Tree: + return clitic_join(tree[-1], clitic) + else: + if (clitic[0][0][0] == 'ا'): + clitic[0] = ('‌' + clitic[0][0], clitic[0][1]) + tree[-1] = (tree[-1][0] + clitic[0][0], clitic[0][1]) + tree.set_label('CLITICS') + return + + if not len(node.childNodes): + return + first = node.childNodes[0] + if first.tagName == 'w': + pos = extract_tags(first) + return Tree(node.tagName, [(first.childNodes[0].data.replace('می ', 'می‌'), self._pos_map(pos))]) + childs = node.childNodes[2:] if node.tagName == 'S' else node.childNodes + for child in childs: + if not len(child.childNodes): + childs.remove(child) + tree = Tree(node.tagName, map(traverse, childs)) + if self._join_clitics and len(tree) > 1 and type(tree[1]) == Tree and tree[1].label() == 'CLITIC' and \ + tree[1][0][1] not in {'P', 'V'}: + clitic = tree[-1] + tree = Tree(tree.label(), [subtree for subtree in tree[0]]) + clitic_join(tree, clitic) + if self._join_verb_parts and len(tree) > 1 and type(tree[1]) == Tree and type(tree[0]) == Tree and tree[ + 0].label() == 'AUX' and tree[0][0][0] in self._tokenizer.before_verbs: + tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1]) + tree.remove(tree[0]) + if self._join_verb_parts and len(tree.leaves()) > 1 and tree.leaves()[-1][ + 0] in self._tokenizer.after_verbs and tree.leaves()[-2][0] in self._tokenizer.verbe: + tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1]) + path = tree.leaf_treeposition(len(tree.leaves()) - 2) + removingtree = tree + while len(path) > 2: + removingtree = removingtree[path[0]] + path = path[1:] + removingtree.remove(Tree(tree.pos()[-2][1], [tree.pos()[-2][0]])) + if self._join_verb_parts and len(tree.leaves()) > 1 and tree.leaves()[-1][ + 0] in self._tokenizer.after_verbs and tree.leaves()[-2][0] in 
self._tokenizer.verbe: + tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1]) + path = tree.leaf_treeposition(len(tree.leaves()) - 2) + removingtree = tree + while len(path) > 2: + removingtree = removingtree[path[0]] + path = path[1:] + removingtree.remove(Tree(tree.pos()[-2][1], [tree.pos()[-2][0]])) + return tree + + for doc in self.docs(): + for S in doc.getElementsByTagName('S'): + yield traverse(S) + + def sents(self): + for tree in self.trees(): + yield tree.leaves() + + def chunked_trees(self): + collapse = lambda node, label: Tree(label, [Tree(pos[1], [pos[0]]) for pos in node.pos()]) + + def traverse(node, parent, chunks): + label = node.label() + + if label.count('-nid') > 0: + label = label.replace('-nid', '') + if label.count('-nid') > 0: + label = label.replace('-nid', '') + if label.count('-DiscA') > 0: + label = label.replace('-DiscA', '') + + if label in {'CLITIC', 'CLITICS'}: + if node[0][1] == 'V': + label = 'V' + elif node[0][1] == 'P': + label = 'PREP' + elif node[0][1] == 'DET': + label = 'DET' + elif node[0][1] == 'ADV': + label = 'ADV' + elif node[0][1] == 'PRO': + label = 'PRON' + + if label in {'CONJ', 'PUNC'} and len(node) == 1: + chunks.append(node) + return + + if label == 'PPC' and len(node) == 1: + chunks.append(Tree('PP', [node[0]])) + return + + if label == 'PREP': + chunks.append(Tree('PP', [node])) + return + + if label == 'PostP': + chunks.append(Tree('POSTP', [node])) + return + + for leaf in node.pos(): + if leaf[1] in {'PUNC', 'CONJ', 'PREP', 'PostP'}: + for i in range(len(node)): + traverse(node[i], node, chunks) + return + + if label == 'NPA' and parent.label() in {'CPC', 'PPC'}: + chunks.append(collapse(node, 'NP')) + return + + if label == 'NPA' and len(node) >= 1: + if node[0].label() == 'ADV': + chunks.append(collapse(node, 'NP')) + return + + if label in {'NPC', 'N', 'INFV', 'DPA', 'CLASS', 'DPC', 'DEM', 'INTJ', 'MN', 'PRON', 'DET', 'NUM', 'RES'}: + chunks.append(collapse(node, 'NP')) + return + + if label == 'NPA' and len(node) >= 2: + if node[0].label() == 'ADJ' and node[1].label() == 'NPC' or node[0].label() in {'N', 'PRON'} and node[ + 1].label() in {'ADJ', 'ADJPA', 'N'} or node[0].label() == 'NUM' and node[1].label() in {'N', 'NPC', + 'MN', + 'NUM'} or \ + node[0].label() in {'N', 'NPC', 'MN'} and node[1].label() == 'NUM' or node[ + 0].label() == 'NPC' and node[1].label() == 'ADJ' or node[0].label() == 'NPA' and node[ + 1].label() != 'NPC' or node[1].label() == 'NPA' and node[0].label() != 'NPC': + chunks.append(collapse(node, 'NP')) + return + + if label == 'DPC' and len(node) >= 2: + chunkable = True + for leaf in node[1].pos(): + if leaf[1] in {'PUNC', 'CONJ', 'PREP', 'PostP'}: + chunkable = False + if node[1].label() in {'N', 'NPA', 'NPC'} and chunkable: + chunks.append(collapse(node, 'NP')) + return + + if label == 'DPA' and len(node) >= 2: + if node[1].label() == 'ADV': + chunks.append(collapse(node, 'ADVP')) + return + + if label in {'MV', 'V', 'AUX', 'PPARV'}: + chunks.append(Tree('VP', [node])) + return + + if label in {'ADJ', 'ADJPC', 'MADJ', 'ADVPA'}: + chunks.append(Tree('ADJP', [node])) + return + + if label in {'ADV', 'MADV', 'ADVPC'}: + chunks.append(Tree('ADVP', [node])) + return + + if type(node[0]) != Tree: + chunks.append(node) + return + + for i in range(len(node)): + traverse(node[i], node, chunks) + + for tree in self.trees(): + chunks = [] + traverse(tree, None, chunks) + for i in range(len(chunks)): + if chunks[i].label() in {'PUNC', 'CONJ'}: + chunks[i] = chunks[i][0] + else: + chunks[i] = 
Tree(chunks[i].label(), chunks[i].leaves()) + yield Tree('S', chunks) diff --git a/hazm/VerbValencyReader.py b/hazm/VerbValencyReader.py index e0d26462..2aa7196d 100644 --- a/hazm/VerbValencyReader.py +++ b/hazm/VerbValencyReader.py @@ -1,29 +1,30 @@ # coding: utf-8 from __future__ import unicode_literals + import codecs from collections import namedtuple - -Verb = namedtuple('Verb', ('past_light_verb', 'present_light_verb', 'prefix', 'nonverbal_element', 'preposition', 'valency')) +Verb = namedtuple('Verb', + ('past_light_verb', 'present_light_verb', 'prefix', 'nonverbal_element', 'preposition', 'valency')) class VerbValencyReader(): - """ - interfaces [Verb Valency Corpus](http://dadegan.ir/catalog/pervallex) - Mohammad Sadegh Rasooli, Amirsaeid Moloodi, Manouchehr Kouhestani, & Behrouz Minaei Bidgoli. (2011). A Syntactic Valency Lexicon for Persian Verbs: The First Steps towards Persian Dependency Treebank. in 5th Language & Technology Conference (LTC): Human Language Technologies as a Challenge for Computer Science and Linguistics (pp. 227–231). Poznań, Poland. - """ - - def __init__(self, valency_file='corpora/valency.txt'): - self._valency_file = valency_file - - def verbs(self): - with codecs.open(self._valency_file, encoding='utf-8') as valency_file: - for line in valency_file: - if 'بن ماضی' in line: - continue - - line = line.strip().replace('-\t', '\t') - parts = line.split('\t') - if len(parts) == 6: - yield Verb(*parts) + """ + interfaces [Verb Valency Corpus](http://dadegan.ir/catalog/pervallex) + Mohammad Sadegh Rasooli, Amirsaeid Moloodi, Manouchehr Kouhestani, & Behrouz Minaei Bidgoli. (2011). A Syntactic Valency Lexicon for Persian Verbs: The First Steps towards Persian Dependency Treebank. in 5th Language & Technology Conference (LTC): Human Language Technologies as a Challenge for Computer Science and Linguistics (pp. 227–231). Poznań, Poland. 
+ """ + + def __init__(self, valency_file='corpora/valency.txt'): + self._valency_file = valency_file + + def verbs(self): + with codecs.open(self._valency_file, encoding='utf-8') as valency_file: + for line in valency_file: + if 'بن ماضی' in line: + continue + + line = line.strip().replace('-\t', '\t') + parts = line.split('\t') + if len(parts) == 6: + yield Verb(*parts) diff --git a/hazm/WikiExtractor.py b/hazm/WikiExtractor.py index 56fe0dcb..9684c028 100755 --- a/hazm/WikiExtractor.py +++ b/hazm/WikiExtractor.py @@ -56,48 +56,51 @@ from __future__ import unicode_literals, division -import sys import argparse import bz2 -import codecs import cgi +import codecs import fileinput +import json import logging import os.path import re # TODO use regex when it will be standard +import sys import time -import json from io import StringIO from multiprocessing import Queue, Process, Value, cpu_count from timeit import default_timer - PY2 = sys.version_info[0] == 2 # Python 2.7 compatibiity if PY2: from urllib import quote from htmlentitydefs import name2codepoint from itertools import izip as zip, izip_longest as zip_longest + range = xrange # Use Python 3 equivalent - chr = unichr # Use Python 3 equivalent + chr = unichr # Use Python 3 equivalent text_type = unicode + class SimpleNamespace(object): - def __init__ (self, **kwargs): + def __init__(self, **kwargs): self.__dict__.update(kwargs) - def __repr__ (self): + + def __repr__(self): keys = sorted(self.__dict__) items = ("{}={!r}".format(k, self.__dict__[k]) for k in keys) return "{}({})".format(type(self).__name__, ", ".join(items)) - def __eq__ (self, other): + + def __eq__(self, other): return self.__dict__ == other.__dict__ else: from urllib.parse import quote from html.entities import name2codepoint from itertools import zip_longest from types import SimpleNamespace - text_type = str + text_type = str # =========================================================================== @@ -111,18 +114,18 @@ def __eq__ (self, other): ## # Defined in # We include as default Template, when loading external template file. - knownNamespaces = {'Template': 10}, + knownNamespaces={'Template': 10}, ## # The namespace used for template definitions # It is the name associated with namespace key=10 in the siteinfo header. - templateNamespace = '', - templatePrefix = '', + templateNamespace='', + templatePrefix='', ## # The namespace used for module definitions # It is the name associated with namespace key=828 in the siteinfo header. 
- moduleNamespace = '', + moduleNamespace='', ## # Recognize only these namespaces in links @@ -130,67 +133,67 @@ def __eq__ (self, other): # wiktionary: Wiki dictionary # wikt: shortcut for Wiktionary # - acceptedNamespaces = ['w', 'wiktionary', 'wikt'], + acceptedNamespaces=['w', 'wiktionary', 'wikt'], # This is obtained from - urlbase = '', + urlbase='', ## # Filter disambiguation pages - filter_disambig_pages = False, + filter_disambig_pages=False, ## # Drop tables from the article - keep_tables = False, + keep_tables=False, ## # Whether to preserve links in output - keepLinks = False, + keepLinks=False, ## # Whether to preserve section titles - keepSections = True, + keepSections=True, ## # Whether to preserve lists - keepLists = False, + keepLists=False, ## # Whether to output HTML instead of text - toHTML = False, + toHTML=False, ## # Whether to write json instead of the xml-like default output format - write_json = False, + write_json=False, ## # Whether to expand templates - expand_templates = True, + expand_templates=True, ## ## Whether to escape doc content - escape_doc = False, + escape_doc=False, ## # Print the wikipedia article revision - print_revision = False, + print_revision=False, ## # Minimum expanded text length required to print document - min_text_length = 0, + min_text_length=0, # Shared objects holding templates, redirects and cache - templates = {}, - redirects = {}, + templates={}, + redirects={}, # cache of parser templates # FIXME: sharing this with a Manager slows down. - templateCache = {}, + templateCache={}, # Elements to ignore/discard - ignored_tag_patterns = [], + ignored_tag_patterns=[], - discardElements = [ + discardElements=[ 'gallery', 'timeline', 'noinclude', 'pre', 'table', 'tr', 'td', 'th', 'caption', 'div', 'form', 'input', 'select', 'option', 'textarea', @@ -208,10 +211,11 @@ def __eq__ (self, other): # Regex for identifying disambig pages filter_disambig_page_pattern = re.compile("{{disambig(uation)?(\|[^}]*)?}}") + ## # page filtering logic -- remove templates, undesired xml namespaces, and disambiguation pages def keepPage(ns, page): - if ns != '0': # Aritcle + if ns != '0': # Aritcle return False # remove disambig pages if desired if options.filter_disambig_pages: @@ -333,7 +337,6 @@ def fixup(m): # The buggy template {{Template:T}} has a comment terminating with just "->" comment = re.compile(r'', re.DOTALL) - # Match ... nowiki = re.compile(r'.*?') @@ -343,16 +346,17 @@ def ignoreTag(tag): right = re.compile(r'' % tag, re.IGNORECASE) options.ignored_tag_patterns.append((left, right)) + # Match selfClosing HTML tags selfClosing_tag_patterns = [ re.compile(r'<\s*%s\b[^>]*/\s*>' % tag, re.DOTALL | re.IGNORECASE) for tag in selfClosingTags - ] +] # Match HTML placeholder tags placeholder_tag_patterns = [ (re.compile(r'<\s*%s(\s*| [^>]+?)>.*?<\s*/\s*%s\s*>' % (tag, tag), re.DOTALL | re.IGNORECASE), repl) for tag, repl in placeholder_tags.items() - ] +] # Match preformatted lines preformatted = re.compile(r'^ .*?$') @@ -399,7 +403,6 @@ def parse(cls, body): tpl.append(TemplateText(body[start:])) # leftover return tpl - def subst(self, params, extractor, depth=0): # We perform parameter substitutions recursively. 
# We also limit the maximum number of iterations to avoid too long or @@ -430,7 +433,6 @@ def __str__(self): class TemplateText(text_type): """Fixed text of template""" - def subst(self, params, extractor, depth): return self @@ -467,7 +469,6 @@ def __str__(self): else: return '{{{%s}}}' % self.name - def subst(self, params, extractor, depth): """ Substitute value for this argument from dict :param params: @@ -496,15 +497,12 @@ def __init__(self, title='', args=[], prev=None): self.prev = prev self.depth = prev.depth + 1 if prev else 0 - def push(self, title, args): return Frame(title, args, self) - def pop(self): return self.prev - def __str__(self): res = '' prev = self.prev @@ -514,14 +512,17 @@ def __str__(self): prev = prev.prev return '' + # ====================================================================== substWords = 'subst:|safesubst:' + class Extractor(object): """ An extraction task on a article. """ + def __init__(self, id, revid, title, lines): """ :param id: id of page. @@ -557,7 +558,7 @@ def write_output(self, out, text): # We don't use json.dump(data, out) because we want to be # able to encode the string if the output is sys.stdout out_str = json.dumps(json_data, ensure_ascii=False) - if out == sys.stdout: # option -a or -o - + if out == sys.stdout: # option -a or -o - out_str = out_str.encode('utf-8') out.write(out_str) out.write('\n') @@ -567,11 +568,11 @@ def write_output(self, out, text): else: header = '\n' % (self.id, url, self.title) footer = "\n\n" - if out == sys.stdout: # option -a or -o - + if out == sys.stdout: # option -a or -o - header = header.encode('utf-8') out.write(header) for line in text: - if out == sys.stdout: # option -a or -o - + if out == sys.stdout: # option -a or -o - line = line.encode('utf-8') out.write(line) out.write('\n') @@ -592,9 +593,9 @@ def extract(self, out): colon = self.title.find(':') if colon != -1: ns = self.title[:colon] - pagename = self.title[colon+1:] + pagename = self.title[colon + 1:] else: - ns = '' # Main + ns = '' # Main pagename = self.title self.magicWords['NAMESPACE'] = ns self.magicWords['NAMESPACENUMBER'] = options.knownNamespaces.get(ns, '0') @@ -603,7 +604,7 @@ def extract(self, out): slash = pagename.rfind('/') if slash != -1: self.magicWords['BASEPAGENAME'] = pagename[:slash] - self.magicWords['SUBPAGENAME'] = pagename[slash+1:] + self.magicWords['SUBPAGENAME'] = pagename[slash + 1:] else: self.magicWords['BASEPAGENAME'] = pagename self.magicWords['SUBPAGENAME'] = '' @@ -618,7 +619,7 @@ def extract(self, out): self.magicWords['CURRENTHOUR'] = time.strftime('%H') self.magicWords['CURRENTTIME'] = time.strftime('%H:%M:%S') text = self.text - self.text = '' # save memory + self.text = '' # save memory # # @see https://doc.wikimedia.org/mediawiki-core/master/php/classParser.html # This does the equivalent of internalParse(): @@ -644,7 +645,6 @@ def extract(self, out): logging.warn("Template errors in article '%s' (%s): title(%d) recursion(%d, %d, %d)", self.title, self.id, *errs) - def transform(self, wikitext): """ Transforms wiki markup. @@ -660,7 +660,6 @@ def transform(self, wikitext): res += self.transform1(wikitext[cur:]) return res - def transform1(self, text): """Transform text not containing """ if options.expand_templates: @@ -671,7 +670,6 @@ def transform1(self, text): # Drop transclusions (template, parser functions) return dropNested(text, r'{{', r'}}') - def wiki2text(self, text): # # final part of internalParse().) 
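
# --- Illustrative aside (not part of the patch): when template expansion is turned
# off, transform1() above falls back to dropNested(text, r'{{', r'}}') to strip
# transclusions. The standalone sketch below shows that idea in miniature, assuming
# well-formed (balanced) markup; the real dropNested() in WikiExtractor.py also
# copes with unbalanced delimiters.
import re

def strip_transclusions(text):
    """Remove nested {{...}} spans from text (simplified sketch)."""
    innermost = re.compile(r'{{[^{}]*}}')      # matches templates with no nested braces
    while True:
        text, count = innermost.subn('', text)
        if not count:                          # nothing left to strip
            return text

# strip_transclusions('a {{t|{{nested}}}} b')  ->  'a  b'
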
@@ -727,7 +725,6 @@ def wiki2text(self, text): text = res + unescape(text[cur:]) return text - def clean(self, text): """ Removes irrelevant parts from :param: text. @@ -793,7 +790,6 @@ def clean(self, text): text = cgi.escape(text) return text - # ---------------------------------------------------------------------- # Expand templates @@ -803,7 +799,6 @@ def clean(self, text): # check for template beginning reOpen = re.compile('(? %s', self.frame.depth, '', res) return res - def templateParams(self, parameters): """ Build a dictionary with positional or name key to expanded parameters. @@ -913,7 +907,6 @@ def templateParams(self, parameters): # logging.debug('%*stemplateParams> %s', self.frame.length, '', '|'.join(templateParams.values())) return templateParams - def expandTemplate(self, body): """Expands template invocation. :param body: the parts of a template. @@ -1248,7 +1241,7 @@ def findMatchingBraces(text, ldelim=0): break elif len(stack) == 1 and 0 < stack[0] < ldelim: # ambiguous {{{{{ }}} }} - #yield m1.start() + stack[0], end + # yield m1.start() + stack[0], end cur = end break elif brac == '[': # [[ @@ -1378,7 +1371,7 @@ def functionParams(args, vars): for var in vars: value = args.get(var) if value is None: - value = args.get(str(index)) # positional argument + value = args.get(str(index)) # positional argument if value is None: value = '' else: @@ -1390,9 +1383,9 @@ def functionParams(args, vars): def string_sub(args): params = functionParams(args, ('s', 'i', 'j')) s = params.get('s', '') - i = int(params.get('i', 1) or 1) # or handles case of '' value + i = int(params.get('i', 1) or 1) # or handles case of '' value j = int(params.get('j', -1) or -1) - if i > 0: i -= 1 # lua is 1-based + if i > 0: i -= 1 # lua is 1-based if j < 0: j += 1 if j == 0: j = len(s) return s[i:j] @@ -1401,9 +1394,9 @@ def string_sub(args): def string_sublength(args): params = functionParams(args, ('s', 'i', 'len')) s = params.get('s', '') - i = int(params.get('i', 1) or 1) - 1 # lua is 1-based + i = int(params.get('i', 1) or 1) - 1 # lua is 1-based len = int(params.get('len', 1) or 1) - return s[i:i+len] + return s[i:i + len] def string_len(args): @@ -1416,12 +1409,12 @@ def string_find(args): params = functionParams(args, ('source', 'target', 'start', 'plain')) source = params.get('source', '') pattern = params.get('target', '') - start = int('0'+params.get('start', 1)) - 1 # lua is 1-based - plain = int('0'+params.get('plain', 1)) + start = int('0' + params.get('start', 1)) - 1 # lua is 1-based + plain = int('0' + params.get('plain', 1)) if source == '' or pattern == '': return 0 if plain: - return source.find(pattern, start) + 1 # lua is 1-based + return source.find(pattern, start) + 1 # lua is 1-based else: return (re.compile(pattern).search(source, start) or -1) + 1 @@ -1431,7 +1424,7 @@ def string_pos(args): target = params.get('target', '') pos = int(params.get('pos', 1) or 1) if pos > 0: - pos -= 1 # The first character has an index value of 1 + pos -= 1 # The first character has an index value of 1 return target[pos] @@ -1490,6 +1483,7 @@ def toRoman(n, romanNumeralMap): ) return toRoman(num, smallRomans) + # ---------------------------------------------------------------------- modules = { @@ -1520,6 +1514,7 @@ def toRoman(n, romanNumeralMap): } } + # ---------------------------------------------------------------------- # variables @@ -1740,15 +1735,12 @@ def __call__(self, value1, value2): ROUND = Infix(lambda x, y: round(x, y)) -from math import floor, ceil, pi, e, trunc, exp, log as 
ln, sin, cos, tan, asin, acos, atan - - def sharp_expr(extr, expr): """Tries converting a lua expr into a Python expr.""" try: expr = extr.expand(expr) - expr = re.sub('(?])=', '==', expr) # negative lookbehind - expr = re.sub('mod', '%', expr) # no \b here + expr = re.sub('(?])=', '==', expr) # negative lookbehind + expr = re.sub('mod', '%', expr) # no \b here expr = re.sub('\bdiv\b', '/', expr) expr = re.sub('\bround\b', '|ROUND|', expr) return text_type(eval(expr)) @@ -1762,11 +1754,11 @@ def sharp_if(extr, testValue, valueIfTrue, valueIfFalse=None, *args): if testValue.strip(): # The {{#if:}} function is an if-then-else construct. # The applied condition is: "The condition string is non-empty". - valueIfTrue = extr.expand(valueIfTrue.strip()) # eval + valueIfTrue = extr.expand(valueIfTrue.strip()) # eval if valueIfTrue: return valueIfTrue elif valueIfFalse: - return extr.expand(valueIfFalse.strip()) # eval + return extr.expand(valueIfFalse.strip()) # eval return "" @@ -1863,19 +1855,19 @@ def sharp_invoke(module, function, args): '#ifexpr': lambda *args: '', # not supported - '#ifexist': lambda extr, title, ifex, ifnex: extr.expand(ifnex), # assuming title is not present + '#ifexist': lambda extr, title, ifex, ifnex: extr.expand(ifnex), # assuming title is not present '#rel2abs': lambda *args: '', # not supported '#switch': sharp_switch, - '#language': lambda *args: '', # not supported + '#language': lambda *args: '', # not supported - '#time': lambda *args: '', # not supported + '#time': lambda *args: '', # not supported - '#timel': lambda *args: '', # not supported + '#timel': lambda *args: '', # not supported - '#titleparts': lambda *args: '', # not supported + '#titleparts': lambda *args: '', # not supported # This function is used in some pages to construct links # http://meta.wikimedia.org/wiki/Help:URL @@ -1925,7 +1917,7 @@ def callParserFunction(functionName, args, extractor): break frame = frame.prev else: - params = [extractor.transform(p) for p in args[2:]] # evaluates them + params = [extractor.transform(p) for p in args[2:]] # evaluates them params = extractor.templateParams(params) ret = sharp_invoke(module, fun, params) logging.debug('%*s<#invoke %s %s %s', extractor.frame.depth, '', module, fun, ret) @@ -1956,6 +1948,7 @@ def callParserFunction(functionName, args, extractor): reNoinclude = re.compile(r'(?:.*?)', re.DOTALL) reIncludeonly = re.compile(r'|', re.DOTALL) + def define_template(title, page): """ Adds a template defined in the :param page:. 
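
# --- Illustrative aside (not part of the patch): sharp_if() and the parser-function
# table reformatted above implement MediaWiki's {{#if:cond|then|else}}, whose rule is
# "the condition is true iff the condition string is non-empty after stripping
# whitespace". A standalone sketch of that rule, leaving out the template expansion
# the real function performs on each branch:

def sharp_if_sketch(test_value, value_if_true, value_if_false=''):
    """Return the then-branch when test_value is non-blank, else the else-branch."""
    if test_value.strip():
        return value_if_true.strip()
    return value_if_false.strip()

assert sharp_if_sketch('x', 'yes', 'no') == 'yes'
assert sharp_if_sketch('   ', 'yes', 'no') == 'no'   # whitespace-only counts as empty
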
@@ -2015,8 +2008,8 @@ def dropNested(text, openDelim, closeDelim): openRE = re.compile(openDelim, re.IGNORECASE) closeRE = re.compile(closeDelim, re.IGNORECASE) # partition text in separate blocks { } { } - spans = [] # pairs (s, e) for each partition - nest = 0 # nesting level + spans = [] # pairs (s, e) for each partition + nest = 0 # nesting level start = openRE.search(text, 0) if not start: return text @@ -2024,8 +2017,8 @@ def dropNested(text, openDelim, closeDelim): next = start while end: next = openRE.search(text, next.end()) - if not next: # termination - while nest: # close all pending + if not next: # termination + while nest: # close all pending nest -= 1 end0 = closeRE.search(text, end.end()) if end0: @@ -2041,7 +2034,7 @@ def dropNested(text, openDelim, closeDelim): # try closing more last = end.end() end = closeRE.search(text, end.end()) - if not end: # unbalanced + if not end: # unbalanced if spans: span = (spans[0][0], last) else: @@ -2053,7 +2046,7 @@ def dropNested(text, openDelim, closeDelim): # advance start, find next close start = next end = closeRE.search(text, next.end()) - break # { } + break # { } if next != start: # { { } nest += 1 @@ -2069,7 +2062,7 @@ def dropSpans(spans, text): res = '' offset = 0 for s, e in spans: - if offset <= s: # handle nesting + if offset <= s: # handle nesting if offset < s: res += text[offset:s] offset = e @@ -2512,13 +2505,13 @@ def compact(text): :param text: convert to HTML. """ - page = [] # list of paragraph - headers = {} # Headers for unfilled sections + page = [] # list of paragraph + headers = {} # Headers for unfilled sections emptySection = False # empty sections are discarded - listLevel = [] # nesting of lists - listCount = [] # count of each list (it should be always in the same length of listLevel) + listLevel = [] # nesting of lists + listCount = [] # count of each list (it should be always in the same length of listLevel) for line in text.split('\n'): - if not line: # collapse empty lines + if not line: # collapse empty lines # if there is an opening list, close it if we see an empty line if len(listLevel): page.append(line) @@ -2535,11 +2528,11 @@ def compact(text): m = section.match(line) if m: title = m.group(2) - lev = len(m.group(1)) # header level + lev = len(m.group(1)) # header level if options.toHTML: page.append("%s" % (lev, title, lev)) if title and title[-1] not in '!?': - title += '.' # terminate sentence. + title += '.' # terminate sentence. headers[lev] = title # drop previous headers for i in list(headers.keys()): @@ -2566,7 +2559,7 @@ def compact(text): # c: current level char # n: next level char for c, n in zip_longest(listLevel, line, fillvalue=''): - if not n or n not in '*#;:': # shorter or different + if not n or n not in '*#;:': # shorter or different if c: if options.toHTML: page.append(listClose[c]) @@ -2718,6 +2711,7 @@ def open(self, filename): # 1 2 3 4 keyRE = re.compile(r'key="(\d*)"') + def load_templates(file, output_file=None): """ Load templates from :param file:. 
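
# --- Illustrative aside (not part of the patch): dropNested() above records the
# (start, end) offsets of the regions it wants to cut and hands them to dropSpans(),
# which keeps only the text outside those regions. A minimal re-implementation of
# that second step, assuming the spans are already sorted and non-overlapping:

def drop_spans_sketch(spans, text):
    """Return text with the given [start, end) character spans removed."""
    pieces, offset = [], 0
    for start, end in spans:
        if offset < start:
            pieces.append(text[offset:start])
        offset = end
    pieces.append(text[offset:])
    return ''.join(pieces)

# drop_spans_sketch([(5, 18)], 'keep {{Infobox X}} this')  ->  'keep  this'
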
@@ -2802,7 +2796,7 @@ def pages_from(input): elif tag == 'redirect': redirect = True elif tag == 'text': - if m.lastindex == 3 and line[m.start(3)-2] == '/': # self closing + if m.lastindex == 3 and line[m.start(3) - 2] == '/': # self closing # continue inText = True @@ -2944,15 +2938,15 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, delay = 0 if spool_length.value > max_spool_length: # reduce to 10% - while spool_length.value > max_spool_length/10: + while spool_length.value > max_spool_length / 10: time.sleep(10) delay += 10 if delay: logging.info('Delay %ds', delay) job = (id, revid, title, page, page_num) - jobs_queue.put(job) # goes to any available extract_process + jobs_queue.put(job) # goes to any available extract_process page_num += 1 - page = None # free memory + page = None # free memory input.close() @@ -2990,16 +2984,15 @@ def extract_process(opts, i, jobs_queue, output_queue): createLogger(options.quiet, options.debug) - out = StringIO() # memory buffer - + out = StringIO() # memory buffer while True: job = jobs_queue.get() # job is (id, title, page, page_num) if job: id, revid, title, page, page_num = job try: - e = Extractor(*job[:4]) # (id, revid, title, page) - page = None # free memory + e = Extractor(*job[:4]) # (id, revid, title, page) + page = None # free memory e.extract(out) text = out.getvalue() except: @@ -3015,7 +3008,9 @@ def extract_process(opts, i, jobs_queue, output_queue): out.close() -report_period = 10000 # progress report period +report_period = 10000 # progress report period + + def reduce_process(opts, output_queue, spool_length, out_file=None, file_size=0, file_compress=True): """Pull finished article text, write series of files (or stdout) @@ -3042,8 +3037,8 @@ def reduce_process(opts, output_queue, spool_length, interval_start = default_timer() # FIXME: use a heap - spool = {} # collected pages - next_page = 0 # sequence numbering of page + spool = {} # collected pages + next_page = 0 # sequence numbering of page while True: if next_page in spool: output.write(spool.pop(next_page).encode('utf-8')) @@ -3079,8 +3074,8 @@ def reduce_process(opts, output_queue, spool_length, # Minimum size of output files minFileSize = 200 * 1024 -def main(): +def main(): parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__) @@ -3097,7 +3092,6 @@ def main(): groupO.add_argument("--json", action="store_true", help="write output in json format instead of the default one") - groupP = parser.add_argument_group('Processing') groupP.add_argument("--html", action="store_true", help="produce HTML output, subsumes --links") @@ -3176,7 +3170,7 @@ def main(): 'abbr', 'b', 'big', 'blockquote', 'center', 'cite', 'em', 'font', 'h1', 'h2', 'h3', 'h4', 'hiero', 'i', 'kbd', 'p', 'plaintext', 's', 'span', 'strike', 'strong', - 'tt', 'u', 'var', 'a', 'br', 'small', 'div', 'td', + 'tt', 'u', 'var', 'a', 'br', 'small', 'div', 'td', 'nowiki', 'ref', 'sup' ] @@ -3228,6 +3222,7 @@ def main(): process_dump(input_file, args.templates, output_path, file_size, args.compress, args.processes) + def createLogger(quiet, debug): logger = logging.getLogger() if not quiet: @@ -3235,5 +3230,6 @@ def createLogger(quiet, debug): if debug: logger.setLevel(logging.DEBUG) + if __name__ == '__main__': main() diff --git a/hazm/WikipediaReader.py b/hazm/WikipediaReader.py index 635a73bd..a9dc1370 100644 --- a/hazm/WikipediaReader.py +++ b/hazm/WikipediaReader.py @@ -1,37 +1,42 @@ # coding: 
utf-8 from __future__ import unicode_literals, print_function -import os, re, subprocess + +import os +import re +import subprocess class WikipediaReader(): - """ - interfaces [Persian Wikipedia dump](http://download.wikimedia.org/fawiki/latest/fawiki-latest-pages-articles.xml.bz2) - """ - - def __init__(self, fawiki_dump, n_jobs=2): - self.fawiki_dump = fawiki_dump - self.wiki_extractor = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'WikiExtractor.py') - self.n_jobs = n_jobs - - def docs(self): - proc = subprocess.Popen(['python', self.wiki_extractor, '--no-templates', '--processes', str(self.n_jobs), '--output', '-', self.fawiki_dump], stdout=subprocess.PIPE) - doc_pattern = re.compile(r'') - - doc = [] - for line in iter(proc.stdout.readline, ''): - line = line.strip().decode('utf8') - if line: - doc.append(line) - - if line == '': - del doc[1] - id, url, title = doc_pattern.match(doc[0]).groups() - html = '\n'.join(doc[1:-1]) - - yield {'id': id, 'url': url, 'title': title, 'html': html, 'text': html} - doc = [] - - def texts(self): - for doc in self.docs(): - yield doc['text'] + """ + interfaces [Persian Wikipedia dump](http://download.wikimedia.org/fawiki/latest/fawiki-latest-pages-articles.xml.bz2) + """ + + def __init__(self, fawiki_dump, n_jobs=2): + self.fawiki_dump = fawiki_dump + self.wiki_extractor = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'WikiExtractor.py') + self.n_jobs = n_jobs + + def docs(self): + proc = subprocess.Popen( + ['python', self.wiki_extractor, '--no-templates', '--processes', str(self.n_jobs), '--output', '-', + self.fawiki_dump], stdout=subprocess.PIPE) + doc_pattern = re.compile(r'') + + doc = [] + for line in iter(proc.stdout.readline, ''): + line = line.strip().decode('utf8') + if line: + doc.append(line) + + if line == '': + del doc[1] + id, url, title = doc_pattern.match(doc[0]).groups() + html = '\n'.join(doc[1:-1]) + + yield {'id': id, 'url': url, 'title': title, 'html': html, 'text': html} + doc = [] + + def texts(self): + for doc in self.docs(): + yield doc['text'] diff --git a/hazm/WordTokenizer.py b/hazm/WordTokenizer.py index 18e9a206..bd9d7ec4 100644 --- a/hazm/WordTokenizer.py +++ b/hazm/WordTokenizer.py @@ -1,76 +1,87 @@ # coding: utf-8 from __future__ import unicode_literals -import re, codecs -from .utils import words_list, default_words, default_verbs + +import codecs +import re + from nltk.tokenize.api import TokenizerI +from .utils import words_list, default_words, default_verbs + class WordTokenizer(TokenizerI): - """ - >>> tokenizer = WordTokenizer() - >>> tokenizer.tokenize('این جمله (خیلی) پیچیده نیست!!!') - ['این', 'جمله', '(', 'خیلی', ')', 'پیچیده', 'نیست', '!!!'] + """ + >>> tokenizer = WordTokenizer() + >>> tokenizer.tokenize('این جمله (خیلی) پیچیده نیست!!!') + ['این', 'جمله', '(', 'خیلی', ')', 'پیچیده', 'نیست', '!!!'] - >>> tokenizer.tokenize('نسخه 0.5 در ساعت 22:00 تهران،1396') - ['نسخه', '0.5', 'در', 'ساعت', '22:00', 'تهران', '،', '1396'] - """ + >>> tokenizer.tokenize('نسخه 0.5 در ساعت 22:00 تهران،1396') + ['نسخه', '0.5', 'در', 'ساعت', '22:00', 'تهران', '،', '1396'] + """ - def __init__(self, words_file=default_words, verbs_file=default_verbs, join_verb_parts=True): - self._join_verb_parts = join_verb_parts - self.pattern = re.compile(r'([؟!\?]+|[\d\.:]+|[:\.،؛»\]\)\}"«\[\(\{])') + def __init__(self, words_file=default_words, verbs_file=default_verbs, join_verb_parts=True): + self._join_verb_parts = join_verb_parts + self.pattern = re.compile(r'([؟!\?]+|[\d\.:]+|[:\.،؛»\]\)\}"«\[\(\{])') - 
self.words = {item[0]: (item[1], item[2]) for item in words_list(default_words)} + self.words = {item[0]: (item[1], item[2]) for item in words_list(default_words)} - if join_verb_parts: - self.after_verbs = set([ - 'ام', 'ای', 'است', 'ایم', 'اید', 'اند', 'بودم', 'بودی', 'بود', 'بودیم', 'بودید', 'بودند', 'باشم', 'باشی', 'باشد', 'باشیم', 'باشید', 'باشند', - 'شده_ام', 'شده_ای', 'شده_است', 'شده_ایم', 'شده_اید', 'شده_اند', 'شده_بودم', 'شده_بودی', 'شده_بود', 'شده_بودیم', 'شده_بودید', 'شده_بودند', 'شده_باشم', 'شده_باشی', 'شده_باشد', 'شده_باشیم', 'شده_باشید', 'شده_باشند', - 'نشده_ام', 'نشده_ای', 'نشده_است', 'نشده_ایم', 'نشده_اید', 'نشده_اند', 'نشده_بودم', 'نشده_بودی', 'نشده_بود', 'نشده_بودیم', 'نشده_بودید', 'نشده_بودند', 'نشده_باشم', 'نشده_باشی', 'نشده_باشد', 'نشده_باشیم', 'نشده_باشید', 'نشده_باشند', - 'شوم', 'شوی', 'شود', 'شویم', 'شوید', 'شوند', 'شدم', 'شدی', 'شد', 'شدیم', 'شدید', 'شدند', - 'نشوم', 'نشوی', 'نشود', 'نشویم', 'نشوید', 'نشوند', 'نشدم', 'نشدی', 'نشد', 'نشدیم', 'نشدید', 'نشدند', - 'می‌شوم', 'می‌شوی', 'می‌شود', 'می‌شویم', 'می‌شوید', 'می‌شوند', 'می‌شدم', 'می‌شدی', 'می‌شد', 'می‌شدیم', 'می‌شدید', 'می‌شدند', - 'نمی‌شوم', 'نمی‌شوی', 'نمی‌شود', 'نمی‌شویم', 'نمی‌شوید', 'نمی‌شوند', 'نمی‌شدم', 'نمی‌شدی', 'نمی‌شد', 'نمی‌شدیم', 'نمی‌شدید', 'نمی‌شدند', - 'خواهم_شد', 'خواهی_شد', 'خواهد_شد', 'خواهیم_شد', 'خواهید_شد', 'خواهند_شد', - 'نخواهم_شد', 'نخواهی_شد', 'نخواهد_شد', 'نخواهیم_شد', 'نخواهید_شد', 'نخواهند_شد', - ]) + if join_verb_parts: + self.after_verbs = set([ + 'ام', 'ای', 'است', 'ایم', 'اید', 'اند', 'بودم', 'بودی', 'بود', 'بودیم', 'بودید', 'بودند', 'باشم', + 'باشی', 'باشد', 'باشیم', 'باشید', 'باشند', + 'شده_ام', 'شده_ای', 'شده_است', 'شده_ایم', 'شده_اید', 'شده_اند', 'شده_بودم', 'شده_بودی', 'شده_بود', + 'شده_بودیم', 'شده_بودید', 'شده_بودند', 'شده_باشم', 'شده_باشی', 'شده_باشد', 'شده_باشیم', 'شده_باشید', + 'شده_باشند', + 'نشده_ام', 'نشده_ای', 'نشده_است', 'نشده_ایم', 'نشده_اید', 'نشده_اند', 'نشده_بودم', 'نشده_بودی', + 'نشده_بود', 'نشده_بودیم', 'نشده_بودید', 'نشده_بودند', 'نشده_باشم', 'نشده_باشی', 'نشده_باشد', + 'نشده_باشیم', 'نشده_باشید', 'نشده_باشند', + 'شوم', 'شوی', 'شود', 'شویم', 'شوید', 'شوند', 'شدم', 'شدی', 'شد', 'شدیم', 'شدید', 'شدند', + 'نشوم', 'نشوی', 'نشود', 'نشویم', 'نشوید', 'نشوند', 'نشدم', 'نشدی', 'نشد', 'نشدیم', 'نشدید', 'نشدند', + 'می‌شوم', 'می‌شوی', 'می‌شود', 'می‌شویم', 'می‌شوید', 'می‌شوند', 'می‌شدم', 'می‌شدی', 'می‌شد', 'می‌شدیم', + 'می‌شدید', 'می‌شدند', + 'نمی‌شوم', 'نمی‌شوی', 'نمی‌شود', 'نمی‌شویم', 'نمی‌شوید', 'نمی‌شوند', 'نمی‌شدم', 'نمی‌شدی', 'نمی‌شد', + 'نمی‌شدیم', 'نمی‌شدید', 'نمی‌شدند', + 'خواهم_شد', 'خواهی_شد', 'خواهد_شد', 'خواهیم_شد', 'خواهید_شد', 'خواهند_شد', + 'نخواهم_شد', 'نخواهی_شد', 'نخواهد_شد', 'نخواهیم_شد', 'نخواهید_شد', 'نخواهند_شد', + ]) - self.before_verbs = set([ - 'خواهم', 'خواهی', 'خواهد', 'خواهیم', 'خواهید', 'خواهند', - 'نخواهم', 'نخواهی', 'نخواهد', 'نخواهیم', 'نخواهید', 'نخواهند' - ]) + self.before_verbs = set([ + 'خواهم', 'خواهی', 'خواهد', 'خواهیم', 'خواهید', 'خواهند', + 'نخواهم', 'نخواهی', 'نخواهد', 'نخواهیم', 'نخواهید', 'نخواهند' + ]) - with codecs.open(verbs_file, encoding='utf8') as verbs_file: - self.verbs = list(reversed([verb.strip() for verb in verbs_file if verb])) - self.bons = set([verb.split('#')[0] for verb in self.verbs]) - self.verbe = set([bon +'ه' for bon in self.bons] + ['ن'+ bon +'ه' for bon in self.bons]) + with codecs.open(verbs_file, encoding='utf8') as verbs_file: + self.verbs = list(reversed([verb.strip() for verb in verbs_file if verb])) + self.bons = set([verb.split('#')[0] for verb in self.verbs]) + self.verbe = set([bon + 'ه' for bon 
in self.bons] + ['ن' + bon + 'ه' for bon in self.bons]) - def tokenize(self, text): - text = self.pattern.sub(r' \1 ', text.replace('\n', ' ').replace('\t', ' ')) - tokens = [word for word in text.split(' ') if word] - if self._join_verb_parts: - tokens = self.join_verb_parts(tokens) - return tokens + def tokenize(self, text): + text = self.pattern.sub(r' \1 ', text.replace('\n', ' ').replace('\t', ' ')) + tokens = [word for word in text.split(' ') if word] + if self._join_verb_parts: + tokens = self.join_verb_parts(tokens) + return tokens - def join_verb_parts(self, tokens): - """ - >>> tokenizer = WordTokenizer() - >>> tokenizer.join_verb_parts(['خواهد', 'رفت']) - ['خواهد_رفت'] - >>> tokenizer.join_verb_parts(['رفته', 'است']) - ['رفته_است'] - >>> tokenizer.join_verb_parts(['گفته', 'شده', 'است']) - ['گفته_شده_است'] - >>> tokenizer.join_verb_parts(['گفته', 'خواهد', 'شد']) - ['گفته_خواهد_شد'] - >>> tokenizer.join_verb_parts(['خسته', 'شدید']) - ['خسته', 'شدید'] - """ + def join_verb_parts(self, tokens): + """ + >>> tokenizer = WordTokenizer() + >>> tokenizer.join_verb_parts(['خواهد', 'رفت']) + ['خواهد_رفت'] + >>> tokenizer.join_verb_parts(['رفته', 'است']) + ['رفته_است'] + >>> tokenizer.join_verb_parts(['گفته', 'شده', 'است']) + ['گفته_شده_است'] + >>> tokenizer.join_verb_parts(['گفته', 'خواهد', 'شد']) + ['گفته_خواهد_شد'] + >>> tokenizer.join_verb_parts(['خسته', 'شدید']) + ['خسته', 'شدید'] + """ - result = [''] - for token in reversed(tokens): - if token in self.before_verbs or (result[-1] in self.after_verbs and token in self.verbe): - result[-1] = token +'_'+ result[-1] - else: - result.append(token) - return list(reversed(result[1:])) + result = [''] + for token in reversed(tokens): + if token in self.before_verbs or (result[-1] in self.after_verbs and token in self.verbe): + result[-1] = token + '_' + result[-1] + else: + result.append(token) + return list(reversed(result[1:])) diff --git a/hazm/utils.py b/hazm/utils.py index de1179d8..65e58c61 100644 --- a/hazm/utils.py +++ b/hazm/utils.py @@ -1,6 +1,7 @@ # coding: utf-8 -import sys, codecs +import codecs +import sys from os import path PY2 = sys.version_info[0] == 2 @@ -18,11 +19,11 @@ def words_list(words_file=default_words): - with codecs.open(words_file, encoding='utf-8') as words_file: - items = [line.strip().split('\t') for line in words_file] - return [(item[0], int(item[1]), tuple(item[2].split(','))) for item in items if len(item) == 3] + with codecs.open(words_file, encoding='utf-8') as words_file: + items = [line.strip().split('\t') for line in words_file] + return [(item[0], int(item[1]), tuple(item[2].split(','))) for item in items if len(item) == 3] def stopwords_list(stopwords_file=default_stopwords): - with codecs.open(stopwords_file, encoding='utf8') as stopwords_file: - return list(map(lambda w: w.strip(), stopwords_file)) + with codecs.open(stopwords_file, encoding='utf8') as stopwords_file: + return list(map(lambda w: w.strip(), stopwords_file)) From fe11efdeb77c70ccc83858b63de46cabc3140374 Mon Sep 17 00:00:00 2001 From: Amir Hadifar Date: Sat, 25 Aug 2018 14:21:20 +0430 Subject: [PATCH 2/4] __feature__ added to all files --- hazm/BijankhanReader.py | 3 +++ hazm/Chunker.py | 3 +++ hazm/DadeganReader.py | 4 ++++ hazm/DependencyParser.py | 5 ++++- hazm/HamshahriReader.py | 3 +++ hazm/InformalNormalizer.py | 3 +++ hazm/Lemmatizer.py | 3 +++ hazm/Normalizer.py | 3 +++ hazm/POSTagger.py | 3 +++ hazm/PersicaReader.py | 3 +++ hazm/PeykareReader.py | 3 +++ hazm/QuranCorpusReader.py | 3 +++ hazm/SentenceTokenizer.py | 3 +++ 
hazm/SentiPersReader.py | 7 +++++-- hazm/SequenceTagger.py | 3 +++ hazm/Stemmer.py | 3 +++ hazm/TNewsReader.py | 8 ++++++-- hazm/TokenSplitter.py | 3 +++ hazm/TreebankReader.py | 5 ++++- hazm/VerbValencyReader.py | 3 +++ hazm/WikiExtractor.py | 5 ++++- hazm/WikipediaReader.py | 35 +++++++++++++++++++++++++++-------- hazm/WordTokenizer.py | 3 +++ hazm/utils.py | 5 +++++ 24 files changed, 107 insertions(+), 15 deletions(-) diff --git a/hazm/BijankhanReader.py b/hazm/BijankhanReader.py index e5512219..13b440da 100644 --- a/hazm/BijankhanReader.py +++ b/hazm/BijankhanReader.py @@ -1,5 +1,8 @@ # coding: utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function from __future__ import unicode_literals import codecs diff --git a/hazm/Chunker.py b/hazm/Chunker.py index 0b01ca61..3fc2f0c6 100755 --- a/hazm/Chunker.py +++ b/hazm/Chunker.py @@ -1,5 +1,8 @@ # coding: utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function from __future__ import unicode_literals from nltk.chunk import ChunkParserI, RegexpParser, tree2conlltags, conlltags2tree diff --git a/hazm/DadeganReader.py b/hazm/DadeganReader.py index 9f24696d..8f6be81f 100755 --- a/hazm/DadeganReader.py +++ b/hazm/DadeganReader.py @@ -1,5 +1,9 @@ # coding: utf-8 +from __future__ import unicode_literals +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function from __future__ import unicode_literals import codecs diff --git a/hazm/DependencyParser.py b/hazm/DependencyParser.py index 7b1513b2..17ef1c34 100644 --- a/hazm/DependencyParser.py +++ b/hazm/DependencyParser.py @@ -1,6 +1,9 @@ # coding: utf-8 -from __future__ import print_function, unicode_literals +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals import codecs import os diff --git a/hazm/HamshahriReader.py b/hazm/HamshahriReader.py index d640ac1d..c474735f 100644 --- a/hazm/HamshahriReader.py +++ b/hazm/HamshahriReader.py @@ -1,6 +1,9 @@ # coding: utf-8 +from __future__ import absolute_import +from __future__ import division from __future__ import print_function +from __future__ import unicode_literals import os import re diff --git a/hazm/InformalNormalizer.py b/hazm/InformalNormalizer.py index 3d7f7b51..0b4e0121 100644 --- a/hazm/InformalNormalizer.py +++ b/hazm/InformalNormalizer.py @@ -1,5 +1,8 @@ # coding: utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function from __future__ import unicode_literals from .Lemmatizer import Lemmatizer diff --git a/hazm/Lemmatizer.py b/hazm/Lemmatizer.py index 499c2ab6..f6647873 100644 --- a/hazm/Lemmatizer.py +++ b/hazm/Lemmatizer.py @@ -1,5 +1,8 @@ # coding: utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function from __future__ import unicode_literals from .Stemmer import Stemmer diff --git a/hazm/Normalizer.py b/hazm/Normalizer.py index ee846e0f..3f83686d 100644 --- a/hazm/Normalizer.py +++ b/hazm/Normalizer.py @@ -1,5 +1,8 @@ # coding: utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function from __future__ import unicode_literals import re diff --git a/hazm/POSTagger.py b/hazm/POSTagger.py index c56e9844..33954362 100755 --- a/hazm/POSTagger.py +++ b/hazm/POSTagger.py @@ -1,5 +1,8 @@ # 
coding: utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function from __future__ import unicode_literals from nltk.tag import stanford diff --git a/hazm/PersicaReader.py b/hazm/PersicaReader.py index ed3941df..f5933073 100644 --- a/hazm/PersicaReader.py +++ b/hazm/PersicaReader.py @@ -1,6 +1,9 @@ # coding: utf-8 +from __future__ import absolute_import +from __future__ import division from __future__ import print_function +from __future__ import unicode_literals import codecs diff --git a/hazm/PeykareReader.py b/hazm/PeykareReader.py index 0220a698..16c61484 100644 --- a/hazm/PeykareReader.py +++ b/hazm/PeykareReader.py @@ -1,5 +1,8 @@ # coding: utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function from __future__ import unicode_literals import codecs diff --git a/hazm/QuranCorpusReader.py b/hazm/QuranCorpusReader.py index 50a7c3fe..3f07d03f 100644 --- a/hazm/QuranCorpusReader.py +++ b/hazm/QuranCorpusReader.py @@ -1,5 +1,8 @@ # coding: utf8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function from __future__ import unicode_literals import codecs diff --git a/hazm/SentenceTokenizer.py b/hazm/SentenceTokenizer.py index d3a7fb4e..b5d5b41b 100644 --- a/hazm/SentenceTokenizer.py +++ b/hazm/SentenceTokenizer.py @@ -1,5 +1,8 @@ # coding: utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function from __future__ import unicode_literals import re diff --git a/hazm/SentiPersReader.py b/hazm/SentiPersReader.py index e7249d51..321000cf 100644 --- a/hazm/SentiPersReader.py +++ b/hazm/SentiPersReader.py @@ -1,6 +1,9 @@ # coding: utf-8 -from __future__ import unicode_literals, print_function +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals import itertools import os @@ -8,7 +11,7 @@ from xml.dom import minidom -class SentiPersReader(): +class SentiPersReader: """ interfaces [SentiPers Corpus](http://nlp.guilan.ac.ir/Dataset.aspx) diff --git a/hazm/SequenceTagger.py b/hazm/SequenceTagger.py index c31b4016..001931d0 100644 --- a/hazm/SequenceTagger.py +++ b/hazm/SequenceTagger.py @@ -1,5 +1,8 @@ # coding: utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function from __future__ import unicode_literals from nltk.metrics import accuracy diff --git a/hazm/Stemmer.py b/hazm/Stemmer.py index f774422f..f2d8fdc1 100644 --- a/hazm/Stemmer.py +++ b/hazm/Stemmer.py @@ -1,5 +1,8 @@ # coding: utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function from __future__ import unicode_literals from nltk.stem.api import StemmerI diff --git a/hazm/TNewsReader.py b/hazm/TNewsReader.py index 70aca5ab..1460d2da 100644 --- a/hazm/TNewsReader.py +++ b/hazm/TNewsReader.py @@ -1,6 +1,9 @@ # coding: utf-8 +from __future__ import absolute_import +from __future__ import division from __future__ import print_function +from __future__ import unicode_literals import os import re @@ -8,9 +11,10 @@ from xml.dom import minidom -class TNewsReader(): +class TNewsReader: """ - interfaces [TNews Corpus](http://datasets.tnews.ir/downloads/) that you must download and extract. + interfaces [TNews Corpus](http://datasets.tnews.ir/downloads/) + that you must download and extract. 
>>> tnews = TNewsReader(root='corpora/tnews') >>> next(tnews.docs())['id'] diff --git a/hazm/TokenSplitter.py b/hazm/TokenSplitter.py index 4263daf7..9fa5757e 100644 --- a/hazm/TokenSplitter.py +++ b/hazm/TokenSplitter.py @@ -1,5 +1,8 @@ # coding: utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function from __future__ import unicode_literals from .Lemmatizer import Lemmatizer diff --git a/hazm/TreebankReader.py b/hazm/TreebankReader.py index cee5b7c3..836e7e08 100644 --- a/hazm/TreebankReader.py +++ b/hazm/TreebankReader.py @@ -1,6 +1,9 @@ # coding: utf-8 -from __future__ import unicode_literals, print_function +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals import codecs import os diff --git a/hazm/VerbValencyReader.py b/hazm/VerbValencyReader.py index 2aa7196d..010ecdb0 100644 --- a/hazm/VerbValencyReader.py +++ b/hazm/VerbValencyReader.py @@ -1,5 +1,8 @@ # coding: utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function from __future__ import unicode_literals import codecs diff --git a/hazm/WikiExtractor.py b/hazm/WikiExtractor.py index 9684c028..62eca670 100755 --- a/hazm/WikiExtractor.py +++ b/hazm/WikiExtractor.py @@ -54,7 +54,10 @@ """ -from __future__ import unicode_literals, division +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals import argparse import bz2 diff --git a/hazm/WikipediaReader.py b/hazm/WikipediaReader.py index a9dc1370..b1819c93 100644 --- a/hazm/WikipediaReader.py +++ b/hazm/WikipediaReader.py @@ -1,6 +1,9 @@ # coding: utf-8 -from __future__ import unicode_literals, print_function +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals import os import re @@ -9,19 +12,31 @@ class WikipediaReader(): """ - interfaces [Persian Wikipedia dump](http://download.wikimedia.org/fawiki/latest/fawiki-latest-pages-articles.xml.bz2) + interfaces [Persian Wikipedia dump] + (http://download.wikimedia.org/fawiki/latest/fawiki-latest-pages-articles.xml.bz2) """ def __init__(self, fawiki_dump, n_jobs=2): self.fawiki_dump = fawiki_dump - self.wiki_extractor = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'WikiExtractor.py') + abs_path = os.path.abspath(os.path.dirname(__file__)) + self.wiki_extractor = os.path.join(abs_path, 'WikiExtractor.py') self.n_jobs = n_jobs def docs(self): proc = subprocess.Popen( - ['python', self.wiki_extractor, '--no-templates', '--processes', str(self.n_jobs), '--output', '-', - self.fawiki_dump], stdout=subprocess.PIPE) - doc_pattern = re.compile(r'') + ['python', + self.wiki_extractor, + '--no-templates', + '--processes', + str(self.n_jobs), + '--output', '-', + self.fawiki_dump], + stdout=subprocess.PIPE + ) + + doc_pattern = re.compile(r'') doc = [] for line in iter(proc.stdout.readline, ''): @@ -31,10 +46,14 @@ def docs(self): if line == '': del doc[1] - id, url, title = doc_pattern.match(doc[0]).groups() + idx, url, title = doc_pattern.match(doc[0]).groups() html = '\n'.join(doc[1:-1]) - yield {'id': id, 'url': url, 'title': title, 'html': html, 'text': html} + yield {'id': idx, + 'url': url, + 'title': title, + 'html': html, + 'text': html} doc = [] def texts(self): diff --git a/hazm/WordTokenizer.py 
b/hazm/WordTokenizer.py index bd9d7ec4..4827429b 100644 --- a/hazm/WordTokenizer.py +++ b/hazm/WordTokenizer.py @@ -1,5 +1,8 @@ # coding: utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function from __future__ import unicode_literals import codecs diff --git a/hazm/utils.py b/hazm/utils.py index 65e58c61..f75fe73c 100644 --- a/hazm/utils.py +++ b/hazm/utils.py @@ -1,5 +1,10 @@ # coding: utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + import codecs import sys from os import path From 912430d5e661328fceb5e272b3e62f58428025b9 Mon Sep 17 00:00:00 2001 From: Amir Hadifar Date: Sat, 25 Aug 2018 14:59:06 +0430 Subject: [PATCH 3/4] pep8 compilance --- data.py | 527 +++++++++++++++++++++---------------- hazm/BijankhanReader.py | 40 ++- hazm/Chunker.py | 12 +- hazm/DadeganReader.py | 94 +++++-- hazm/DependencyParser.py | 48 ++-- hazm/HamshahriReader.py | 59 +++-- hazm/InformalNormalizer.py | 30 ++- hazm/Lemmatizer.py | 24 +- hazm/Normalizer.py | 48 ++-- hazm/POSTagger.py | 5 +- hazm/PeykareReader.py | 39 ++- hazm/QuranCorpusReader.py | 27 +- hazm/SentenceTokenizer.py | 3 +- hazm/SentiPersReader.py | 27 +- hazm/SequenceTagger.py | 25 +- hazm/Stemmer.py | 14 +- hazm/TNewsReader.py | 32 ++- hazm/TokenSplitter.py | 10 +- hazm/TreebankReader.py | 97 ++++--- hazm/VerbValencyReader.py | 10 +- hazm/WikiExtractor.py | 139 ++++++---- hazm/WikipediaReader.py | 2 +- hazm/WordTokenizer.py | 59 +++-- hazm/utils.py | 5 +- 24 files changed, 880 insertions(+), 496 deletions(-) diff --git a/data.py b/data.py index 472f6309..4cd8ed74 100755 --- a/data.py +++ b/data.py @@ -1,247 +1,320 @@ # coding: utf-8 from __future__ import print_function, unicode_literals -import codecs, subprocess, random + +import codecs +import random +import subprocess from collections import Counter from itertools import islice + from nltk.tag import untag from sklearn.model_selection import train_test_split + from hazm import * from hazm.Chunker import tree2brackets from hazm.PeykareReader import coarse_pos_e as peykare_coarse_pos_e -from hazm.DadeganReader import coarse_pos_e as dadegan_coarse_pos_e - - -def create_words_file(dic_file='resources/persian.dic', output='hazm/data/words.dat'): - """ prepares list of persian word words from [Virastyar](https://sourceforge.net/projects/virastyar/) dic file. 
""" - - dic_words = [line.strip().replace(', ', ',').split('\t') for line in codecs.open(dic_file, encoding='utf-8') if len(line.strip().split('\t')) == 3] - dic_words = filter(lambda item: not item[2].startswith('V') and 'NEG' not in item[2], dic_words) - dic_words = ['\t'.join(item) for item in sorted(dic_words, key=lambda item: item[0])] - print(*dic_words, sep='\n', file=codecs.open(output, 'w', 'utf-8')) - print(output, 'created') - - -def evaluate_lemmatizer(conll_file='resources/train.conll', peykare_root='corpora/peykare'): - lemmatizer = Lemmatizer() - - errors = [] - with codecs.open('resources/lemmatizer_errors.txt', 'w', 'utf8') as output: - dadegan = DadeganReader(conll_file) - for tree in dadegan.trees(): - for node in tree.nodelist[1:]: - word, lemma, pos = node['word'], node['lemma'], node['mtag'] - if lemmatizer.lemmatize(word, pos) != lemma: - errors.append((word, lemma, pos, lemmatizer.lemmatize(word, pos))) - print(len(errors), 'errors', file=output) - counter = Counter(errors) - for item, count in sorted(counter.items(), key=lambda t: t[1], reverse=True): - print(count, *item, file=output) - - missed = [] - with codecs.open('resources/lemmatizer_missed.txt', 'w', 'utf8') as output: - peykare = PeykareReader(peykare_root) - for sentence in peykare.sents(): - for word in sentence: - if word[1] == 'V': - if word[0] == lemmatizer.lemmatize(word[0]): - missed.append(word[0]) - print(len(missed), 'missed', file=output) - counter = Counter(missed) - for item, count in sorted(counter.items(), key=lambda t: t[1], reverse=True): - print(count, item, file=output) -def evaluate_normalizer(tnews_root='corpora/tnews'): - - tnews = TNewsReader(root=tnews_root) - normalizer = Normalizer(persian_style=False, persian_numbers=False, remove_diacritics=False, token_based=False, affix_spacing=True) - token_normalizer = Normalizer(persian_style=False, persian_numbers=False, remove_diacritics=False, token_based=True, affix_spacing=False) +def create_words_file(dic_file='resources/persian.dic', + output='hazm/data/words.dat'): + """ prepares list of persian word words from [Virastyar](https://sourceforge.net/projects/virastyar/) dic file. 
""" + + dic_words = [line.strip().replace(', ', ',').split('\t') for line in + codecs.open(dic_file, encoding='utf-8') if + len(line.strip().split('\t')) == 3] + dic_words = filter( + lambda item: not item[2].startswith('V') and 'NEG' not in item[2], + dic_words) + dic_words = ['\t'.join(item) for item in + sorted(dic_words, key=lambda item: item[0])] + print(*dic_words, sep='\n', file=codecs.open(output, 'w', 'utf-8')) + print(output, 'created') + + +def evaluate_lemmatizer(conll_file='resources/train.conll', + peykare_root='corpora/peykare'): + lemmatizer = Lemmatizer() + + errors = [] + with codecs.open('resources/lemmatizer_errors.txt', 'w', 'utf8') as output: + dadegan = DadeganReader(conll_file) + for tree in dadegan.trees(): + for node in tree.nodelist[1:]: + word, lemma, pos = node['word'], node['lemma'], node['mtag'] + if lemmatizer.lemmatize(word, pos) != lemma: + errors.append( + (word, lemma, pos, lemmatizer.lemmatize(word, pos))) + print(len(errors), 'errors', file=output) + counter = Counter(errors) + for item, count in sorted(counter.items(), key=lambda t: t[1], + reverse=True): + print(count, *item, file=output) + + missed = [] + with codecs.open('resources/lemmatizer_missed.txt', 'w', 'utf8') as output: + peykare = PeykareReader(peykare_root) + for sentence in peykare.sents(): + for word in sentence: + if word[1] == 'V': + if word[0] == lemmatizer.lemmatize(word[0]): + missed.append(word[0]) + print(len(missed), 'missed', file=output) + counter = Counter(missed) + for item, count in sorted(counter.items(), key=lambda t: t[1], + reverse=True): + print(count, item, file=output) - with codecs.open('resources/normalized.txt', 'w', 'utf8') as output1, codecs.open('resources/normalized_token_based.txt', 'w', 'utf8') as output2: - random.seed(0) - for text in tnews.texts(): - if random.randint(0, 100) != 0: - continue - for sentence in sent_tokenize(text): - print(normalizer.normalize(sentence), '\n', file=output1) - print(token_normalizer.normalize(sentence), '\n', file=output2) +def evaluate_normalizer(tnews_root='corpora/tnews'): + tnews = TNewsReader(root=tnews_root) + normalizer = Normalizer(persian_style=False, persian_numbers=False, + remove_diacritics=False, token_based=False, + affix_spacing=True) + token_normalizer = Normalizer(persian_style=False, persian_numbers=False, + remove_diacritics=False, token_based=True, + affix_spacing=False) + + with codecs.open('resources/normalized.txt', 'w', + 'utf8') as output1, codecs.open( + 'resources/normalized_token_based.txt', 'w', 'utf8') as output2: + random.seed(0) + for text in tnews.texts(): + if random.randint(0, 100) != 0: + continue + + for sentence in sent_tokenize(text): + print(normalizer.normalize(sentence), '\n', file=output1) + print(token_normalizer.normalize(sentence), '\n', file=output2) def evaluate_informal_normalizer(sentipars_root='corpora/sentipers'): - sentipers = SentiPersReader(root=sentipars_root) - normalizer = Normalizer() - informal_normalizer = InformalNormalizer() - - output = codecs.open('resources/normalized.txt', 'w', 'utf8') - for comments in sentipers.comments(): - for comment in comments: - for sentence in comment: - print(normalizer.normalize(sentence), file=output) - sents = informal_normalizer.normalize(sentence) - sents = [[word[0] for word in sent] for sent in sents] - sents = [' '.join(sent) for sent in sents] - text = '\n'.join(sents) - text = normalizer.normalize(text) - print(text, file=output) - print(file=output) + sentipers = SentiPersReader(root=sentipars_root) + normalizer = 
Normalizer() + informal_normalizer = InformalNormalizer() + + output = codecs.open('resources/normalized.txt', 'w', 'utf8') + for comments in sentipers.comments(): + for comment in comments: + for sentence in comment: + print(normalizer.normalize(sentence), file=output) + sents = informal_normalizer.normalize(sentence) + sents = [[word[0] for word in sent] for sent in sents] + sents = [' '.join(sent) for sent in sents] + text = '\n'.join(sents) + text = normalizer.normalize(text) + print(text, file=output) + print(file=output) def evaluate_chunker(treebank_root='corpora/treebank'): - treebank = TreebankReader(treebank_root, join_clitics=True, join_verb_parts=True) - chunker = Chunker() - chunked_trees = list(treebank.chunked_trees()) - - print(chunker.evaluate(chunked_trees)) - - output = codecs.open('resources/chunker_errors.txt', 'w', 'utf8') - for sentence, gold in zip(treebank.sents(), chunked_trees): - chunked = chunker.parse(sentence) - if chunked != gold: - print(tree2brackets(chunked), file=output) - print(tree2brackets(gold), file=output) - print(file=output) - - -def train_postagger(peykare_root='corpora/peykare', model_file='resources/postagger.model', test_size=.1, sents_limit=None, pos_map=peykare_coarse_pos_e): - - tagger = POSTagger(type='crf', algo='rprop', compact=True, patterns=[ - '*', - - 'u:wll=%x[-2,0]', - 'u:wl=%x[-1,0]', - 'u:w=%x[0,0]', - 'u:wr=%x[1,0]', - 'u:wrr=%x[2,0]', - - # 'u:w2l=%x[-1,0]/%x[0,0]', - # 'u:w2r=%x[0,0]/%x[1,0]', - - '*:p1=%m[0,0,"^.?"]', - '*:p2=%m[0,0,"^.?.?"]', - '*:p3=%m[0,0,"^.?.?.?"]', - - '*:s1=%m[0,0,".?$"]', - '*:s2=%m[0,0,".?.?$"]', - '*:s3=%m[0,0,".?.?.?$"]', - - '*:p?l=%t[-1,0,"\p"]', - '*:p?=%t[0,0,"\p"]', - '*:p?r=%t[1,0,"\p"]', - '*:p?a=%t[0,0,"^\p*$"]', - - '*:n?l=%t[-1,0,"\d"]', - '*:n?=%t[0,0,"\d"]', - '*:n?r=%t[1,0,"\d"]', - '*:n?a=%t[0,0,"^\d*$"]', - ]) - - peykare = PeykareReader(peykare_root, pos_map=pos_map) - train_sents, test_sents = train_test_split(list(islice(peykare.sents(), sents_limit)), test_size=test_size, random_state=0) - - tagger.train(train_sents) - tagger.save_model(model_file) - - print(tagger.evaluate(test_sents)) - - -def train_chunker(train_file='corpora/train.conll', dev_file='corpora/dev.conll', test_file='corpora/test.conll', model_file='resources/chunker.model'): - - tagger = POSTagger(model='resources/postagger.model') - chunker = Chunker(type='crf', algo='l-bfgs', compact=True, patterns=[ - '*', - - 'u:wll=%x[-2,0]', - 'u:wl=%x[-1,0]', - 'u:w=%x[0,0]', - 'u:wr=%x[1,0]', - 'u:wrr=%x[2,0]', - - '*:tll=%x[-2,1]', - '*:tl=%x[-1,1]', - '*:t=%x[0,1]', - '*:tr=%x[1,1]', - '*:trr=%x[2,1]', - ]) - - def retag_trees(trees, sents): - for tree, sentence in zip(trees, tagger.tag_sents(map(untag, sents))): - for (n, word) in zip(tree.treepositions('leaves'), sentence): - tree[n] = word - - train, test = DadeganReader(train_file), DadeganReader(test_file) - train_trees = list(train.chunked_trees()) - retag_trees(train_trees, train.sents()) - chunker.train(train_trees) - chunker.save_model(model_file) - - test_trees = list(test.chunked_trees()) - retag_trees(test_trees, test.sents()) - print(chunker.evaluate(test_trees)) - - -def train_maltparser(train_file='corpora/train.conll', dev_file='corpora/dev.conll', test_file='corpora/test.conll', model_file='langModel.mco', path_to_jar='resources/malt.jar', options_file='resources/malt-options.xml', features_file='resources/malt-features.xml', memory_min='-Xms7g', memory_max='-Xmx8g'): - - lemmatizer, tagger = Lemmatizer(), POSTagger(model='resources/postagger.model') - 
- train, test = DadeganReader(train_file), DadeganReader(test_file) - train_data = train_file +'.data' - with codecs.open(train_data, 'w', 'utf8') as output: - for tree, sentence in zip(train.trees(), tagger.tag_sents(map(untag, train.sents()))): - for i, (node, word) in enumerate(zip(list(tree.nodes.values())[1:], sentence), start=1): - node['mtag'] = word[1] - node['lemma'] = lemmatizer.lemmatize(node['word'], node['mtag']) - print(i, node['word'].replace(' ', '_'), node['lemma'].replace(' ', '_'), node['mtag'], node['mtag'], '_', node['head'], node['rel'], '_', '_', sep='\t', file=output) - print(file=output) - - subprocess.Popen(['java', memory_min, memory_max, '-jar', path_to_jar, '-w', 'resources', '-c', model_file, '-i', train_data, '-f', options_file, '-F', features_file, '-m', 'learn']).wait() - - # evaluation - parser = MaltParser(tagger=tagger, lemmatizer=lemmatizer, model_file=model_file) - parsed_trees = parser.parse_sents(map(untag, test.sents())) - - test_data, test_results = test_file +'.data', test_file +'.results' - print('\n'.join([tree.to_conll(10) for tree in test.trees()]).strip(), file=codecs.open(test_data, 'w', 'utf8')) - print('\n'.join([tree.to_conll(10) for tree in parsed_trees]).strip(), file=codecs.open(test_results, 'w', 'utf8')) - subprocess.Popen(['java', '-jar', 'resources/MaltEval.jar', '-g', test_data, '-s', test_results]).wait() - - -def train_turboparser(train_file='corpora/train.conll', dev_file='corpora/dev.conll', test_file='corpora/test.conll', model_file='resources/turboparser.model'): - - lemmatizer, tagger = Lemmatizer(), POSTagger(model='resources/postagger.model') - - train, test = DadeganReader(train_file), DadeganReader(test_file) - train_data = train_file +'.data' - with codecs.open(train_data, 'w', 'utf8') as output: - for tree, sentence in zip(train.trees(), tagger.tag_sents(map(untag, train.sents()))): - for i, (node, word) in enumerate(zip(list(tree.nodes.values())[1:], sentence), start=1): - node['mtag'] = word[1] - node['lemma'] = lemmatizer.lemmatize(node['word'], node['mtag']) - print(i, node['word'].replace(' ', '_'), node['lemma'].replace(' ', '_'), node['mtag'], node['mtag'], '_', node['head'], node['rel'], '_', '_', sep='\t', file=output) - print(file=output) - - subprocess.Popen(['./resources/TurboParser', '--train', '--file_train='+train_data, '--file_model='+model_file, '--logtostderr']).wait() - - # evaluation - parser = TurboParser(tagger=tagger, lemmatizer=lemmatizer, model_file=model_file) - parsed_trees = parser.parse_sents(map(untag, test.sents())) - - test_data, test_results = test_file +'.data', test_file +'.results' - print('\n'.join([tree.to_conll(10) for tree in test.trees()]).strip(), file=codecs.open(test_data, 'w', 'utf8')) - print('\n'.join([tree.to_conll(10) for tree in parsed_trees]).strip(), file=codecs.open(test_results, 'w', 'utf8')) - subprocess.Popen(['java', '-jar', 'resources/MaltEval.jar', '-g', test_data, '-s', test_results, '--pattern', '0.####', '--Metric', 'LAS;UAS']).wait() - - -def train_stanford_postagger(peykare_root='corpora/peykare', path_to_model='resources/persian.tagger', path_to_jar='resources/stanford-postagger.jar', properties_file='resources/stanford-postagger.props', memory_min='-Xms1g', memory_max='-Xmx6g', test_size=.1, pos_map=peykare_coarse_pos_e): - peykare = PeykareReader(peykare_root, pos_map=pos_map) - train_file = 'resources/tagger_train_data.txt' - train, test = train_test_split(list(peykare.sents()), test_size=test_size, random_state=0) - - output = codecs.open(train_file, 
'w', 'utf8') - for sentence in train: - print(*(map(lambda w: '/'.join(w).replace(' ', '_'), sentence)), file=output) - subprocess.Popen(['java', memory_min, memory_max, '-classpath', path_to_jar, 'edu.stanford.nlp.tagger.maxent.MaxentTagger', '-prop', properties_file, '-model', path_to_model, '-trainFile', train_file, '-tagSeparator', '/', '-search', 'owlqn2']).wait() - - tagger = StanfordPOSTagger(path_to_jar=path_to_jar, path_to_model=path_to_model) - print(tagger.evaluate(test)) + treebank = TreebankReader(treebank_root, join_clitics=True, + join_verb_parts=True) + chunker = Chunker() + chunked_trees = list(treebank.chunked_trees()) + + print(chunker.evaluate(chunked_trees)) + + output = codecs.open('resources/chunker_errors.txt', 'w', 'utf8') + for sentence, gold in zip(treebank.sents(), chunked_trees): + chunked = chunker.parse(sentence) + if chunked != gold: + print(tree2brackets(chunked), file=output) + print(tree2brackets(gold), file=output) + print(file=output) + + +def train_postagger(peykare_root='corpora/peykare', + model_file='resources/postagger.model', test_size=.1, + sents_limit=None, pos_map=peykare_coarse_pos_e): + tagger = POSTagger(type='crf', algo='rprop', compact=True, patterns=[ + '*', + + 'u:wll=%x[-2,0]', + 'u:wl=%x[-1,0]', + 'u:w=%x[0,0]', + 'u:wr=%x[1,0]', + 'u:wrr=%x[2,0]', + + # 'u:w2l=%x[-1,0]/%x[0,0]', + # 'u:w2r=%x[0,0]/%x[1,0]', + + '*:p1=%m[0,0,"^.?"]', + '*:p2=%m[0,0,"^.?.?"]', + '*:p3=%m[0,0,"^.?.?.?"]', + + '*:s1=%m[0,0,".?$"]', + '*:s2=%m[0,0,".?.?$"]', + '*:s3=%m[0,0,".?.?.?$"]', + + '*:p?l=%t[-1,0,"\p"]', + '*:p?=%t[0,0,"\p"]', + '*:p?r=%t[1,0,"\p"]', + '*:p?a=%t[0,0,"^\p*$"]', + + '*:n?l=%t[-1,0,"\d"]', + '*:n?=%t[0,0,"\d"]', + '*:n?r=%t[1,0,"\d"]', + '*:n?a=%t[0,0,"^\d*$"]', + ]) + + peykare = PeykareReader(peykare_root, pos_map=pos_map) + train_sents, test_sents = train_test_split( + list(islice(peykare.sents(), sents_limit)), test_size=test_size, + random_state=0) + + tagger.train(train_sents) + tagger.save_model(model_file) + + print(tagger.evaluate(test_sents)) + + +def train_chunker(train_file='corpora/train.conll', + dev_file='corpora/dev.conll', test_file='corpora/test.conll', + model_file='resources/chunker.model'): + tagger = POSTagger(model='resources/postagger.model') + chunker = Chunker(type='crf', algo='l-bfgs', compact=True, patterns=[ + '*', + + 'u:wll=%x[-2,0]', + 'u:wl=%x[-1,0]', + 'u:w=%x[0,0]', + 'u:wr=%x[1,0]', + 'u:wrr=%x[2,0]', + + '*:tll=%x[-2,1]', + '*:tl=%x[-1,1]', + '*:t=%x[0,1]', + '*:tr=%x[1,1]', + '*:trr=%x[2,1]', + ]) + + def retag_trees(trees, sents): + for tree, sentence in zip(trees, tagger.tag_sents(map(untag, sents))): + for (n, word) in zip(tree.treepositions('leaves'), sentence): + tree[n] = word + + train, test = DadeganReader(train_file), DadeganReader(test_file) + train_trees = list(train.chunked_trees()) + retag_trees(train_trees, train.sents()) + chunker.train(train_trees) + chunker.save_model(model_file) + + test_trees = list(test.chunked_trees()) + retag_trees(test_trees, test.sents()) + print(chunker.evaluate(test_trees)) + + +def train_maltparser(train_file='corpora/train.conll', + dev_file='corpora/dev.conll', + test_file='corpora/test.conll', + model_file='langModel.mco', + path_to_jar='resources/malt.jar', + options_file='resources/malt-options.xml', + features_file='resources/malt-features.xml', + memory_min='-Xms7g', memory_max='-Xmx8g'): + lemmatizer, tagger = Lemmatizer(), POSTagger( + model='resources/postagger.model') + + train, test = DadeganReader(train_file), DadeganReader(test_file) + 
train_data = train_file + '.data' + with codecs.open(train_data, 'w', 'utf8') as output: + for tree, sentence in zip(train.trees(), + tagger.tag_sents(map(untag, train.sents()))): + for i, (node, word) in enumerate( + zip(list(tree.nodes.values())[1:], sentence), start=1): + node['mtag'] = word[1] + node['lemma'] = lemmatizer.lemmatize(node['word'], + node['mtag']) + print(i, node['word'].replace(' ', '_'), + node['lemma'].replace(' ', '_'), node['mtag'], + node['mtag'], '_', node['head'], node['rel'], '_', '_', + sep='\t', file=output) + print(file=output) + + subprocess.Popen( + ['java', memory_min, memory_max, '-jar', path_to_jar, '-w', + 'resources', '-c', model_file, '-i', train_data, '-f', options_file, + '-F', features_file, '-m', 'learn']).wait() + + # evaluation + parser = MaltParser(tagger=tagger, lemmatizer=lemmatizer, + model_file=model_file) + parsed_trees = parser.parse_sents(map(untag, test.sents())) + + test_data, test_results = test_file + '.data', test_file + '.results' + print('\n'.join([tree.to_conll(10) for tree in test.trees()]).strip(), + file=codecs.open(test_data, 'w', 'utf8')) + print('\n'.join([tree.to_conll(10) for tree in parsed_trees]).strip(), + file=codecs.open(test_results, 'w', 'utf8')) + subprocess.Popen( + ['java', '-jar', 'resources/MaltEval.jar', '-g', test_data, '-s', + test_results]).wait() + + +def train_turboparser(train_file='corpora/train.conll', + dev_file='corpora/dev.conll', + test_file='corpora/test.conll', + model_file='resources/turboparser.model'): + lemmatizer, tagger = Lemmatizer(), POSTagger( + model='resources/postagger.model') + + train, test = DadeganReader(train_file), DadeganReader(test_file) + train_data = train_file + '.data' + with codecs.open(train_data, 'w', 'utf8') as output: + for tree, sentence in zip(train.trees(), + tagger.tag_sents(map(untag, train.sents()))): + for i, (node, word) in enumerate( + zip(list(tree.nodes.values())[1:], sentence), start=1): + node['mtag'] = word[1] + node['lemma'] = lemmatizer.lemmatize(node['word'], + node['mtag']) + print(i, node['word'].replace(' ', '_'), + node['lemma'].replace(' ', '_'), node['mtag'], + node['mtag'], '_', node['head'], node['rel'], '_', '_', + sep='\t', file=output) + print(file=output) + + subprocess.Popen( + ['./resources/TurboParser', '--train', '--file_train=' + train_data, + '--file_model=' + model_file, '--logtostderr']).wait() + + # evaluation + parser = TurboParser(tagger=tagger, lemmatizer=lemmatizer, + model_file=model_file) + parsed_trees = parser.parse_sents(map(untag, test.sents())) + + test_data, test_results = test_file + '.data', test_file + '.results' + print('\n'.join([tree.to_conll(10) for tree in test.trees()]).strip(), + file=codecs.open(test_data, 'w', 'utf8')) + print('\n'.join([tree.to_conll(10) for tree in parsed_trees]).strip(), + file=codecs.open(test_results, 'w', 'utf8')) + subprocess.Popen( + ['java', '-jar', 'resources/MaltEval.jar', '-g', test_data, '-s', + test_results, '--pattern', '0.####', '--Metric', 'LAS;UAS']).wait() + + +def train_stanford_postagger(peykare_root='corpora/peykare', + path_to_model='resources/persian.tagger', + path_to_jar='resources/stanford-postagger.jar', + properties_file='resources/stanford-postagger.props', + memory_min='-Xms1g', memory_max='-Xmx6g', + test_size=.1, pos_map=peykare_coarse_pos_e): + peykare = PeykareReader(peykare_root, pos_map=pos_map) + train_file = 'resources/tagger_train_data.txt' + train, test = train_test_split(list(peykare.sents()), test_size=test_size, + random_state=0) + + output = 
codecs.open(train_file, 'w', 'utf8') + for sentence in train: + print(*(map(lambda w: '/'.join(w).replace(' ', '_'), sentence)), + file=output) + subprocess.Popen( + ['java', memory_min, memory_max, '-classpath', path_to_jar, + 'edu.stanford.nlp.tagger.maxent.MaxentTagger', '-prop', + properties_file, '-model', path_to_model, '-trainFile', train_file, + '-tagSeparator', '/', '-search', 'owlqn2']).wait() + + tagger = StanfordPOSTagger(path_to_jar=path_to_jar, + path_to_model=path_to_model) + print(tagger.evaluate(test)) diff --git a/hazm/BijankhanReader.py b/hazm/BijankhanReader.py index 13b440da..61a2285a 100644 --- a/hazm/BijankhanReader.py +++ b/hazm/BijankhanReader.py @@ -10,25 +10,41 @@ from .Normalizer import * from .PeykareReader import join_verb_parts -default_pos_map = {'ADJ': 'ADJ', 'ADJ_CMPR': 'ADJ', 'ADJ_INO': 'ADJ', 'ADJ_ORD': 'ADJ', 'ADJ_SIM': 'ADJ', - 'ADJ_SUP': 'ADJ', 'ADV': 'ADV', 'ADV_EXM': 'ADV', 'ADV_I': 'ADV', 'ADV_NEGG': 'ADV', 'ADV_NI': 'ADV', - 'ADV_TIME': 'ADV', 'AR': 'AR', 'CON': 'CONJ', 'DEFAULT': 'DEFAULT', 'DELM': 'PUNC', 'DET': 'PREP', - 'IF': 'IF', 'INT': 'INT', 'MORP': 'MORP', 'MQUA': 'MQUA', 'MS': 'MS', 'N_PL': 'N', 'N_SING': 'N', - 'NN': 'NN', 'NP': 'NP', 'OH': 'OH', 'OHH': 'OHH', 'P': 'PREP', 'PP': 'PP', 'PRO': 'PR', 'PS': 'PS', - 'QUA': 'QUA', 'SPEC': 'SPEC', 'V_AUX': 'V', 'V_IMP': 'V', 'V_PA': 'V', 'V_PRE': 'V', 'V_PRS': 'V', - 'V_SUB': 'V'} +default_pos_map = {'ADJ': 'ADJ', 'ADJ_CMPR': 'ADJ', + 'ADJ_INO': 'ADJ', 'ADJ_ORD': 'ADJ', + 'ADJ_SIM': 'ADJ', 'ADJ_SUP': 'ADJ', + 'ADV': 'ADV', 'ADV_EXM': 'ADV', + 'ADV_I': 'ADV', 'ADV_NEGG': 'ADV', + 'ADV_NI': 'ADV', 'ADV_TIME': 'ADV', + 'AR': 'AR', 'CON': 'CONJ', + 'DEFAULT': 'DEFAULT', 'DELM': 'PUNC', + 'DET': 'PREP', 'IF': 'IF', 'INT': 'INT', + 'MORP': 'MORP', 'MQUA': 'MQUA', + 'MS': 'MS', 'N_PL': 'N', 'N_SING': 'N', + 'NN': 'NN', 'NP': 'NP', 'OH': 'OH', + 'OHH': 'OHH', 'P': 'PREP', 'PP': 'PP', + 'PRO': 'PR', 'PS': 'PS', 'QUA': 'QUA', + 'SPEC': 'SPEC', 'V_AUX': 'V', + 'V_IMP': 'V', 'V_PA': 'V', + 'V_PRE': 'V', 'V_PRS': 'V', 'V_SUB': 'V'} -class BijankhanReader(): +class BijankhanReader: """ - interfaces [Bijankhan Corpus](http://ece.ut.ac.ir/dbrg/bijankhan/Corpus/BijanKhan_Corpus_Processed.zip) that you must download and extract it. + interfaces [Bijankhan Corpus]( + http://ece.ut.ac.ir/dbrg/bijankhan/Corpus/BijanKhan_Corpus_Processed.zip + ) that you must download and extract it. 
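+
+    the default pos_map collapses the fine-grained Bijankhan tags into a
+    coarser tag set; a quick check against the mapping defined above
+    (illustration only):
+
+    >>> default_pos_map['N_PL'], default_pos_map['V_PA']
+    ('N', 'V')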
>>> bijankhan = BijankhanReader(bijankhan_file='corpora/bijankhan.txt') - >>> next(bijankhan.sents()) - [('اولین', 'ADJ'), ('سیاره', 'N'), ('خارج', 'ADJ'), ('از', 'PREP'), ('منظومه', 'N'), ('شمسی', 'ADJ'), ('دیده_شد', 'V'), ('.', 'PUNC')] + >>> next(bijankhan.sents()) [('اولین', 'ADJ'), ('سیاره', 'N'), ('خارج', + 'ADJ'), ('از', 'PREP'), ('منظومه', 'N'), ('شمسی', 'ADJ'), ('دیده_شد', + 'V'), ('.', 'PUNC')] + """ - def __init__(self, bijankhan_file, joined_verb_parts=True, pos_map=default_pos_map): + def __init__(self, bijankhan_file, joined_verb_parts=True, + pos_map=default_pos_map): + self._bijankhan_file = bijankhan_file self._joined_verb_parts = joined_verb_parts self._pos_map = pos_map diff --git a/hazm/Chunker.py b/hazm/Chunker.py index 3fc2f0c6..e52d3a9c 100755 --- a/hazm/Chunker.py +++ b/hazm/Chunker.py @@ -5,7 +5,10 @@ from __future__ import print_function from __future__ import unicode_literals -from nltk.chunk import ChunkParserI, RegexpParser, tree2conlltags, conlltags2tree +from nltk.chunk import ChunkParserI +from nltk.chunk import RegexpParser +from nltk.chunk import conlltags2tree +from nltk.chunk import tree2conlltags from .SequenceTagger import IOBTagger @@ -31,7 +34,8 @@ def tree2brackets(tree): class Chunker(IOBTagger, ChunkParserI): """ >>> chunker = Chunker(model='resources/chunker.model') - >>> tree2brackets(chunker.parse([('نامه', 'Ne'), ('ایشان', 'PRO'), ('را', 'POSTP'), ('دریافت', 'N'), ('داشتم', 'V'), ('.', 'PUNC')])) + >>> tree2brackets(chunker.parse([('نامه', 'Ne'), ('ایشان', 'PRO'), + ('را', 'POSTP'), ('دریافت', 'N'), ('داشتم', 'V'), ('.', 'PUNC')])) '[نامه ایشان NP] [را POSTP] [دریافت داشتم VP] .' """ @@ -52,7 +56,9 @@ def evaluate(self, gold): class RuleBasedChunker(RegexpParser): """ >>> chunker = RuleBasedChunker() - >>> tree2brackets(chunker.parse([('نامه', 'Ne'), ('۱۰', 'NUMe'), ('فوریه', 'Ne'), ('شما', 'PRO'), ('را', 'POSTP'), ('دریافت', 'N'), ('داشتم', 'V'), ('.', 'PUNC')])) + >>> tree2brackets(chunker.parse([('نامه', 'Ne'), ('۱۰', 'NUMe'), + ('فوریه', 'Ne'), ('شما', 'PRO'), ('را', 'POSTP'), ('دریافت', 'N'), + ('داشتم', 'V'), ('.', 'PUNC')])) '[نامه ۱۰ فوریه شما NP] [را POSTP] [دریافت داشتم VP] .' 
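+
+    parse() itself returns an nltk Tree; tree2brackets above is only a
+    rendering convenience (hypothetical tagged input, output skipped):
+
+    >>> chunker.parse([('او', 'PRO'), ('رفت', 'V'), ('.', 'PUNC')])  # doctest: +SKIP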
""" diff --git a/hazm/DadeganReader.py b/hazm/DadeganReader.py index 8f6be81f..03633a01 100755 --- a/hazm/DadeganReader.py +++ b/hazm/DadeganReader.py @@ -1,6 +1,5 @@ # coding: utf-8 -from __future__ import unicode_literals from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -14,33 +13,44 @@ def coarse_pos_e(tags): """ - Coarse POS tags of Dadegan corpus: - N: Noun, V: Verb, ADJ: Adjective, ADV: Adverb, PR: Pronoun, PREP: Preposition, POSTP: Postposition, CONJ: Conjunction, PUNC: Punctuation, ADR: Address Term, IDEN: Title, PART: Particle, POSNUM: Post-noun Modifier, PREM: Pre-modifier, PRENUM: Pre-noun Numeral, PSUS: Pseudo-sentence, SUBR: Subordinating Clause + Coarse POS tags of Dadegan corpus: N: Noun, V: Verb, ADJ: Adjective, + ADV: Adverb, PR: Pronoun, PREP: Preposition, POSTP: Postposition, + CONJ: Conjunction, PUNC: Punctuation, ADR: Address Term, IDEN: Title, + PART: Particle, POSNUM: Post-noun Modifier, PREM: Pre-modifier, PRENUM: + Pre-noun Numeral, PSUS: Pseudo-sentence, SUBR: Subordinating Clause >>> coarse_pos_e(['N', 'IANM']) 'N' """ - map = {'N': 'N', 'V': 'V', 'ADJ': 'AJ', 'ADV': 'ADV', 'PR': 'PRO', 'PREM': 'DET', 'PREP': 'P', 'POSTP': 'POSTP', - 'PRENUM': 'NUM', 'CONJ': 'CONJ', 'PUNC': 'PUNC', 'SUBR': 'CONJ'} + map = {'N': 'N', 'V': 'V', 'ADJ': 'AJ', + 'ADV': 'ADV', 'PR': 'PRO', + 'PREM': 'DET', 'PREP': 'P', + 'POSTP': 'POSTP', 'PRENUM': 'NUM', + 'CONJ': 'CONJ', 'PUNC': 'PUNC', 'SUBR': 'CONJ'} + return map.get(tags[0], 'X') + ('e' if 'EZ' in tags else '') -word_nodes = lambda tree: sorted(tree.nodes.values(), key=lambda node: node['address'])[1:] +word_nodes = lambda tree: sorted(tree.nodes.values(), + key=lambda node: node['address'])[1:] node_deps = lambda node: sum(node['deps'].values(), []) -class DadeganReader(): +class DadeganReader: """ interfaces [Persian Dependency Treebank](http://dadegan.ir/perdt/download) >>> dadegan = DadeganReader(conll_file='corpora/dadegan.conll') >>> next(dadegan.sents()) - [('این', 'DET'), ('میهمانی', 'N'), ('به', 'P'), ('منظور', 'Ne'), ('آشنایی', 'Ne'), ('هم‌تیمی‌های', 'Ne'), ('او', 'PRO'), ('با', 'P'), ('غذاهای', 'Ne'), ('ایرانی', 'AJ'), ('ترتیب', 'N'), ('داده_شد', 'V'), ('.', 'PUNC')] + [('این', 'DET'), ('میهمانی', 'N'), ('به', 'P'), ('منظور', 'Ne'), ('آشنایی', 'Ne') + , ('هم‌تیمی‌های', 'Ne'), ('او', 'PRO'), ('با', 'P'), ('غذاهای', 'Ne') + , ('ایرانی', 'AJ'), ('ترتیب', 'N'), ('داده_شد', 'V'), ('.', 'PUNC')] >>> from hazm.Chunker import tree2brackets >>> tree2brackets(next(dadegan.chunked_trees())) - '[این میهمانی NP] [به PP] [منظور آشنایی هم‌تیمی‌های او NP] [با PP] [غذاهای ایرانی NP] [ترتیب داده_شد VP] .' + '[این میهمانی NP] [به PP] [منظور آشنایی هم‌تیمی‌های او NP] + [با PP] [غذاهای ایرانی NP] [ترتیب داده_شد VP] .' 
""" def __init__(self, conll_file, pos_map=coarse_pos_e): @@ -52,9 +62,13 @@ def _sentences(self): text = conll_file.read() # refine text - text = text.replace('‌‌', '‌').replace('\t‌', '\t').replace('‌\t', '\t').replace('\t ', '\t').replace(' \t', - '\t').replace( - '\r', '').replace('\u2029', '‌') + text = text.replace('‌‌', '‌') + text = text.replace('\t‌', '\t') + text = text.replace('‌\t', '\t') + text = text.replace('\t ', '\t') + text = text.replace(' \t', '\t') + text = text.replace('\r', '') + text = text.replace('\u2029', '‌') for item in text.replace(' ', '_').split('\n\n'): if item.strip(): @@ -90,18 +104,23 @@ def chunked_trees(self): label = 'PP' if node['ctag'] == 'POSTP': label = 'POSTP' - if d == n - 1 and type(chunks[-1]) == Tree and chunks[-1].label() == label: + if d == n - 1 and type(chunks[-1]) == Tree and chunks[ + -1].label() == label: chunks[-1].append(item) appended = True - if node['head'] == n - 1 and len(chunks) > 0 and type(chunks[-1]) == Tree and chunks[ + if node['head'] == n - 1 and len(chunks) > 0 and type( + chunks[-1]) == Tree and chunks[ -1].label() == label: chunks[-1].append(item) appended = True if not appended: chunks.append(Tree(label, [item])) elif node['ctag'] in {'PUNC', 'CONJ', 'SUBR', 'PART'}: - if item[0] in {"'", '"', '(', ')', '{', '}', '[', ']', '-', '#', '«', '»'} and len( - chunks) > 0 and type(chunks[-1]) == Tree: + if item[0] in {"'", '"', '(', ')', '{', '}', '[', ']', '-', + '#', '«', '»'} and \ + len(chunks) > 0 and \ + type(chunks[-1]) == Tree: + for l in chunks[-1].leaves(): if l[1] == item[1]: chunks[-1].append(item) @@ -109,7 +128,8 @@ def chunked_trees(self): break if appended is not True: chunks.append(item) - elif node['ctag'] in {'N', 'PREM', 'ADJ', 'PR', 'ADR', 'PRENUM', 'IDEN', 'POSNUM', 'SADV'}: + elif node['ctag'] in {'N', 'PREM', 'ADJ', 'PR', 'ADR', + 'PRENUM', 'IDEN', 'POSNUM', 'SADV'}: if node['rel'] in {'MOZ', 'NPOSTMOD'}: if len(chunks) > 0: if type(chunks[-1]) == Tree: @@ -133,9 +153,13 @@ def chunked_trees(self): chunks.append(Tree('NP', leaves)) j -= 1 continue - elif node['rel'] == 'POSDEP' and tree.nodes[node['head']]['rel'] in {'NCONJ', 'AJCONJ'}: + elif node['rel'] == 'POSDEP' and tree.nodes[node['head']][ + 'rel'] in {'NCONJ', 'AJCONJ'}: conj = tree.nodes[node['head']] - if tree.nodes[conj['head']]['rel'] in {'MOZ', 'NPOSTMOD', 'AJCONJ', 'POSDEP'}: + if tree.nodes[conj['head']]['rel'] in {'MOZ', + 'NPOSTMOD', + 'AJCONJ', + 'POSDEP'}: label = 'NP' leaves = [item] j = n - 1 @@ -149,11 +173,15 @@ def chunked_trees(self): j -= 1 chunks.append(Tree(label, leaves)) appended = True - elif node['head'] == n - 1 and len(chunks) > 0 and type(chunks[-1]) == Tree and not chunks[ - -1].label() == 'PP': + elif node['head'] == n - 1 and \ + len(chunks) > 0 and \ + type(chunks[-1]) == Tree and not \ + chunks[-1].label() == 'PP': + chunks[-1].append(item) appended = True - elif node['rel'] == 'AJCONJ' and tree.nodes[node['head']]['rel'] in {'NPOSTMOD', 'AJCONJ'}: + elif node['rel'] == 'AJCONJ' and tree.nodes[node['head']][ + 'rel'] in {'NPOSTMOD', 'AJCONJ'}: np_nodes = [item] label = 'ADJP' i = n - node['head'] @@ -168,8 +196,10 @@ def chunked_trees(self): np_nodes.insert(0, chunks.pop()) chunks.append(Tree(label, np_nodes)) appended = True - elif node['ctag'] == 'ADJ' and node['rel'] == 'POSDEP' and tree.nodes[node['head']][ - 'ctag'] != 'CONJ': + elif node['ctag'] == 'ADJ' and \ + node['rel'] == 'POSDEP' and \ + tree.nodes[node['head']][ + 'ctag'] != 'CONJ': np_nodes = [item] i = n - node['head'] while i > 0: @@ -185,8 
+215,10 @@ def chunked_trees(self): chunks.append(Tree(label, np_nodes)) appended = True for d in node_deps(node): - if d == n - 1 and type(chunks[-1]) == Tree and chunks[ - -1].label() != 'PP' and appended is not True: + if d == n - 1 and \ + type(chunks[-1]) == Tree and \ + chunks[-1].label() != 'PP' and \ + appended is not True: label = chunks[-1].label() if node['rel'] == 'ADV': label = 'ADVP' @@ -199,7 +231,8 @@ def chunked_trees(self): leaves.append(item) chunks.append(Tree(label, leaves)) appended = True - elif tree.nodes[d]['rel'] == 'NPREMOD' and appended is not True: + elif tree.nodes[d][ + 'rel'] == 'NPREMOD' and appended is not True: np_nodes = [item] i = n - d while i > 0: @@ -222,8 +255,11 @@ def chunked_trees(self): elif node['ctag'] in {'V'}: appended = False for d in node_deps(node): - if d == n - 1 and type(chunks[-1]) == Tree and tree.nodes[d]['rel'] in {'NVE', - 'ENC'} and appended is not True: + if d == n - 1 and \ + type(chunks[-1]) == Tree and \ + tree.nodes[d]['rel'] in {'NVE', 'ENC'} and \ + appended is not True: + leaves = chunks.pop().leaves() leaves.append(item) chunks.append(Tree('VP', leaves)) diff --git a/hazm/DependencyParser.py b/hazm/DependencyParser.py index 17ef1c34..86684bf8 100644 --- a/hazm/DependencyParser.py +++ b/hazm/DependencyParser.py @@ -19,20 +19,23 @@ class MaltParser(MaltParser): interfaces [MaltParser](http://www.maltparser.org/) """ - def __init__(self, tagger, lemmatizer, working_dir='resources', model_file='langModel.mco'): + def __init__(self, tagger, lemmatizer, working_dir='resources', + model_file='langModel.mco'): self.tagger = tagger self.working_dir = working_dir self.mco = model_file self._malt_bin = os.path.join(working_dir, 'malt.jar') - self.lemmatize = lemmatizer.lemmatize if lemmatizer else lambda w, t: '_' + self.lemmatize = lemmatizer.lemmatize if lemmatizer else lambda w,t: '_' def parse_sents(self, sentences, verbose=False): tagged_sentences = self.tagger.tag_sents(sentences) return self.parse_tagged_sents(tagged_sentences, verbose) def parse_tagged_sents(self, sentences, verbose=False): - input_file = tempfile.NamedTemporaryFile(prefix='malt_input.conll', dir=self.working_dir, delete=False) - output_file = tempfile.NamedTemporaryFile(prefix='malt_output.conll', dir=self.working_dir, delete=False) + input_file = tempfile.NamedTemporaryFile( + prefix='malt_input.conll', dir=self.working_dir, delete=False) + output_file = tempfile.NamedTemporaryFile( + prefix='malt_output.conll', dir=self.working_dir, delete=False) try: for sentence in sentences: @@ -41,18 +44,24 @@ def parse_tagged_sents(self, sentences, verbose=False): if not word: word = '_' input_file.write(('\t'.join( - [str(i), word.replace(' ', '_'), self.lemmatize(word, tag).replace(' ', '_'), tag, tag, '_', + [str(i), word.replace(' ', '_'), + self.lemmatize(word, tag).replace(' ', '_'), tag, tag, + '_', '0', 'ROOT', '_', '_', '\n'])).encode('utf8')) input_file.write('\n\n'.encode('utf8')) input_file.close() - cmd = ['java', '-jar', self._malt_bin, '-w', self.working_dir, '-c', self.mco, '-i', input_file.name, '-o', + cmd = ['java', '-jar', self._malt_bin, '-w', self.working_dir, + '-c', self.mco, '-i', input_file.name, '-o', output_file.name, '-m', 'parse'] if self._execute(cmd, verbose) != 0: - raise Exception("MaltParser parsing failed: %s" % (' '.join(cmd))) + raise Exception( + "MaltParser parsing failed: %s" % (' '.join(cmd))) return (DependencyGraph(item) for item in - codecs.open(output_file.name, encoding='utf8').read().split('\n\n') if item.strip()) + 
codecs.open(output_file.name, + encoding='utf8').read().split('\n\n') if + item.strip()) finally: input_file.close() @@ -63,12 +72,13 @@ def parse_tagged_sents(self, sentences, verbose=False): class TurboParser(ParserI): """ - interfaces [TurboParser](http://www.ark.cs.cmu.edu/TurboParser/) which you must manually install + interfaces [TurboParser](http://www.ark.cs.cmu.edu/TurboParser/) + which you must manually install """ def __init__(self, tagger, lemmatizer, model_file): self.tagger = tagger - self.lemmatize = lemmatizer.lemmatize if lemmatizer else lambda w, t: '_' + self.lemmatize = lemmatizer.lemmatize if lemmatizer else lambda w,t: '_' import turboparser self._pturboparser = turboparser.PTurboParser() @@ -80,8 +90,11 @@ def parse_sents(self, sentences): return self.tagged_parse_sents(tagged_sentences) def tagged_parse_sents(self, sentences): - input_file = tempfile.NamedTemporaryFile(prefix='turbo_input.conll', dir='resources', delete=False) - output_file = tempfile.NamedTemporaryFile(prefix='turbo_output.conll', dir='resources', delete=False) + input_file = tempfile.NamedTemporaryFile(prefix='turbo_input.conll', + dir='resources', delete=False) + output_file = tempfile.NamedTemporaryFile(prefix='turbo_output.conll', + dir='resources', + delete=False) try: for sentence in sentences: @@ -90,15 +103,20 @@ def tagged_parse_sents(self, sentences): if not word: word = '_' input_file.write(('\t'.join( - [str(i), word.replace(' ', '_'), self.lemmatize(word, tag).replace(' ', '_'), tag, tag, '_', + [str(i), word.replace(' ', '_'), + self.lemmatize(word, tag).replace(' ', '_'), tag, tag, + '_', '0', 'ROOT', '_', '_', '\n'])).encode('utf8')) input_file.write('\n'.encode('utf8')) input_file.close() self.interface.parse(input_file.name, output_file.name) - return (DependencyGraph(item, cell_extractor=lambda cells: cells[1:8]) for item in - codecs.open(output_file.name, encoding='utf8').read().split('\n\n') if item.strip()) + return ( + DependencyGraph(item, cell_extractor=lambda cells: cells[1:8]) for + item in + codecs.open(output_file.name, encoding='utf8').read().split('\n\n') + if item.strip()) finally: input_file.close() diff --git a/hazm/HamshahriReader.py b/hazm/HamshahriReader.py index c474735f..d9b39f15 100644 --- a/hazm/HamshahriReader.py +++ b/hazm/HamshahriReader.py @@ -13,7 +13,9 @@ class HamshahriReader(): """ - interfaces [Hamshahri Corpus](http://dbrg.ut.ac.ir/Hamshahri/download.html#version2) that you must download and extract it. + interfaces [Hamshahri Corpus] + (http://dbrg.ut.ac.ir/Hamshahri/download.html#version2) + that you must download and extract it. 
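+
+    each item yielded by docs() is a plain dict that typically carries 'id',
+    'issue', 'date', 'title', 'text' and per-language 'categories_*' keys;
+    a self-contained sketch (hypothetical root, output skipped):
+
+    >>> reader = HamshahriReader(root='corpora/hamshahri')  # doctest: +SKIP
+    >>> sorted(next(reader.docs()).keys())  # doctest: +SKIP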
>>> hamshahri = HamshahriReader(root='corpora/hamshahri') >>> next(hamshahri.docs())['id'] @@ -22,16 +24,28 @@ class HamshahriReader(): def __init__(self, root): self._root = root - self._invalids = set( - ['hamshahri.dtd', 'HAM2-960622.xml', 'HAM2-960630.xml', 'HAM2-960701.xml', 'HAM2-960709.xml', - 'HAM2-960710.xml', 'HAM2-960711.xml', 'HAM2-960817.xml', 'HAM2-960818.xml', 'HAM2-960819.xml', - 'HAM2-960820.xml', 'HAM2-961019.xml', 'HAM2-961112.xml', 'HAM2-961113.xml', 'HAM2-961114.xml', - 'HAM2-970414.xml', 'HAM2-970415.xml', 'HAM2-970612.xml', 'HAM2-970614.xml', 'HAM2-970710.xml', - 'HAM2-970712.xml', 'HAM2-970713.xml', 'HAM2-970717.xml', 'HAM2-970719.xml', 'HAM2-980317.xml', - 'HAM2-040820.xml', 'HAM2-040824.xml', 'HAM2-040825.xml', 'HAM2-040901.xml', 'HAM2-040917.xml', - 'HAM2-040918.xml', 'HAM2-040920.xml', 'HAM2-041025.xml', 'HAM2-041026.xml', 'HAM2-041027.xml', - 'HAM2-041230.xml', 'HAM2-041231.xml', 'HAM2-050101.xml', 'HAM2-050102.xml', 'HAM2-050223.xml', - 'HAM2-050224.xml', 'HAM2-050406.xml', 'HAM2-050407.xml', 'HAM2-050416.xml']) + self._invalids = {'hamshahri.dtd', 'HAM2-960622.xml', + 'HAM2-960630.xml', 'HAM2-960701.xml', + 'HAM2-960709.xml', 'HAM2-960710.xml', + 'HAM2-960711.xml', 'HAM2-960817.xml', + 'HAM2-960818.xml', 'HAM2-960819.xml', + 'HAM2-960820.xml', 'HAM2-961019.xml', + 'HAM2-961112.xml', 'HAM2-961113.xml', + 'HAM2-961114.xml', 'HAM2-970414.xml', + 'HAM2-970415.xml', 'HAM2-970612.xml', + 'HAM2-970614.xml', 'HAM2-970710.xml', + 'HAM2-970712.xml', 'HAM2-970713.xml', + 'HAM2-970717.xml', 'HAM2-970719.xml', + 'HAM2-980317.xml', 'HAM2-040820.xml', + 'HAM2-040824.xml', 'HAM2-040825.xml', + 'HAM2-040901.xml', 'HAM2-040917.xml', + 'HAM2-040918.xml', 'HAM2-040920.xml', + 'HAM2-041025.xml', 'HAM2-041026.xml', + 'HAM2-041027.xml', 'HAM2-041230.xml', + 'HAM2-041231.xml', 'HAM2-050101.xml', + 'HAM2-050102.xml', 'HAM2-050223.xml', + 'HAM2-050224.xml', 'HAM2-050406.xml', + 'HAM2-050407.xml', 'HAM2-050416.xml'} self._paragraph_pattern = re.compile(r'(\n.{0,50})(?=\n)') def docs(self): @@ -44,26 +58,37 @@ def docs(self): elements = minidom.parse(os.path.join(root, name)) for element in elements.getElementsByTagName('DOC'): doc = {} - doc['id'] = element.getElementsByTagName('DOCID')[0].childNodes[0].data - doc['issue'] = element.getElementsByTagName('ISSUE')[0].childNodes[0].data + doc['id'] = \ + element.getElementsByTagName('DOCID')[ + 0].childNodes[ + 0].data + doc['issue'] = \ + element.getElementsByTagName('ISSUE')[ + 0].childNodes[ + 0].data for cat in element.getElementsByTagName('CAT'): - doc['categories_' + cat.attributes['xml:lang'].value] = cat.childNodes[0].data.split('.') + doc['categories_' + cat.attributes[ + 'xml:lang'].value] = cat.childNodes[ + 0].data.split('.') for date in element.getElementsByTagName('DATE'): if date.attributes['calender'].value == 'Persian': doc['date'] = date.childNodes[0].data elm = element.getElementsByTagName('TITLE')[0] - doc['title'] = elm.childNodes[1].data if len(elm.childNodes) > 1 else '' + doc['title'] = elm.childNodes[1].data if len( + elm.childNodes) > 1 else '' doc['text'] = '' - for item in element.getElementsByTagName('TEXT')[0].childNodes: + for item in element.getElementsByTagName('TEXT')[ + 0].childNodes: if item.nodeType == 4: # CDATA doc['text'] += item.data # refine text - doc['text'] = self._paragraph_pattern.sub(r'\1\n', doc['text']).replace('\no ', '\n') + doc['text'] = self._paragraph_pattern.sub(r'\1\n', doc[ + 'text']).replace('\no ', '\n') yield doc diff --git a/hazm/InformalNormalizer.py 
b/hazm/InformalNormalizer.py index 0b4e0121..b84ca977 100644 --- a/hazm/InformalNormalizer.py +++ b/hazm/InformalNormalizer.py @@ -15,7 +15,8 @@ class InformalNormalizer(Normalizer): - def __init__(self, verb_file=informal_verbs, word_file=informal_words, seperation_flag=False, **kargs): + def __init__(self, verb_file=informal_verbs, word_file=informal_words, + seperation_flag=False, **kargs): self.seperation_flag = seperation_flag self.lemmatizer = Lemmatizer() self.ilemmatizer = InformalLemmatizer() @@ -73,7 +74,8 @@ def shekan(token): res = [''] for i in token: res[-1] += i - if i in set(['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و'] + list(NUMBERS)): + if i in set( + ['ا', 'د', 'ذ', 'ر', 'ز', 'ژ', 'و'] + list(NUMBERS)): res.append('') while '' in res: res.remove('') @@ -94,7 +96,8 @@ def perm(lst): token = re.sub(r'(.)\1{2,}', r'\1', token) ps = perm(shekan(token)) for c in ps: - if set(map(lambda x: self.ilemmatizer.lemmatize(x), c)).issubset(self.words): + if set(map(lambda x: self.ilemmatizer.lemmatize(x), c)).issubset( + self.words): return ' '.join(c) return token @@ -124,13 +127,16 @@ def normalized_word(self, word): elif word.endswith("ن") and word[:-1] in self.ilemmatizer.verbs: options.append(word + 'د') - elif word[:-1] in self.ilemmatizer.verbs and word.endswith('ه') and word[:-1] not in self.lemmatizer.words: + elif word[:-1] in self.ilemmatizer.verbs and word.endswith( + 'ه') and word[:-1] not in self.lemmatizer.words: options.append(self.iword_map.get(word[:-1], word[:-1]) + 'د') - elif word not in self.ilemmatizer.verbs and word.endswith('ه') and word[:-1] in self.ilemmatizer.words: + elif word not in self.ilemmatizer.verbs and word.endswith( + 'ه') and word[:-1] in self.ilemmatizer.words: options.append(self.iword_map.get(word[:-1], word[:-1]) + ' است') - elif word not in self.ilemmatizer.verbs and word.endswith('ون') and self.lemmatizer.lemmatize( + elif word not in self.ilemmatizer.verbs and word.endswith( + 'ون') and self.lemmatizer.lemmatize( word[:-2] + 'ان') in self.ilemmatizer.words: options.append(word[:-2] + 'ان') @@ -145,9 +151,11 @@ def normalize(self, text): sent_tokenizer = SentenceTokenizer() word_tokenizer = WordTokenizer() text = super(InformalNormalizer, self).normalize(text) - sents = [word_tokenizer.tokenize(sentence) for sentence in sent_tokenizer.tokenize(text)] + sents = [word_tokenizer.tokenize(sentence) for sentence in + sent_tokenizer.tokenize(text)] - return [[self.normalized_word(word) for word in sent] for sent in sents] + return [[self.normalized_word(word) for word in sent] for sent in + sents] def informal_conjugations(self, verb): ends = ['م', 'ی', '', 'یم', 'ین', 'ن'] @@ -160,7 +168,8 @@ def informal_conjugations(self, verb): present_imperfects = ['می‌' + item for item in present_simples] present_not_imperfects = ['ن' + item for item in present_imperfects] present_subjunctives = [ - item if item.startswith('ب') else 'ب' + item for item in present_simples] + item if item.startswith('ب') else 'ب' + item for item in + present_simples] present_not_subjunctives = ['ن' + item for item in present_simples] return present_simples + present_not_simples + \ present_imperfects + present_not_imperfects + \ @@ -209,7 +218,8 @@ def iconjugations(self, verb): present_imperfects = ['می‌' + item for item in present_simples] present_not_imperfects = ['ن' + item for item in present_imperfects] present_subjunctives = [ - item if item.startswith('ب') else 'ب' + item for item in present_simples] + item if item.startswith('ب') else 'ب' + item for item in + 
present_simples] present_not_subjunctives = ['ن' + item for item in present_simples] return present_simples + present_not_simples + \ present_imperfects + present_not_imperfects + \ diff --git a/hazm/Lemmatizer.py b/hazm/Lemmatizer.py index f6647873..a074dfb2 100644 --- a/hazm/Lemmatizer.py +++ b/hazm/Lemmatizer.py @@ -29,11 +29,13 @@ class Lemmatizer(object): 'اجتماعی' """ - def __init__(self, words_file=default_words, verbs_file=default_verbs, joined_verb_parts=True): + def __init__(self, words_file=default_words, verbs_file=default_verbs, + joined_verb_parts=True): self.verbs = {} self.stemmer = Stemmer() - tokenizer = WordTokenizer(words_file=default_words, verbs_file=verbs_file) + tokenizer = WordTokenizer(words_file=default_words, + verbs_file=verbs_file) self.words = tokenizer.words if verbs_file: @@ -85,7 +87,8 @@ def conjugations(self, verb): ends = ['م', 'ی', '', 'یم', 'ید', 'ند'] if verb == '#هست': - return ['هست' + end for end in ends] + ['نیست' + end for end in ends] + return ['هست' + end for end in ends] + ['نیست' + end for end in + ends] past_simples = [past + end for end in ends] past_imperfects = ['می‌' + item for item in past_simples] @@ -100,13 +103,18 @@ def conjugations(self, verb): ends = ['م', 'ی', 'د', 'یم', 'ید', 'ند'] present_simples = [present + end for end in ends] present_imperfects = ['می‌' + item for item in present_simples] - present_subjunctives = [item if item.startswith('ب') else 'ب' + item for item in present_simples] + present_subjunctives = [item if item.startswith('ب') else 'ب' + item + for item in present_simples] present_not_subjunctives = ['ن' + item for item in present_simples] - with_nots = lambda items: items + list(map(lambda item: 'ن' + item, items)) - aa_refinement = lambda items: list(map(lambda item: item.replace('بآ', 'بیا').replace('نآ', 'نیا'), items)) if \ - items[0].startswith('آ') else items + with_nots = lambda items: items + list( + map(lambda item: 'ن' + item, items)) + aa_refinement = lambda items: list( + map(lambda item: item.replace('بآ', 'بیا').replace('نآ', 'نیا'), + items)) if \ + items[0].startswith('آ') else items return aa_refinement( - with_nots(past_simples) + with_nots(present_simples) + with_nots(past_imperfects) + with_nots( + with_nots(past_simples) + with_nots(present_simples) + with_nots( + past_imperfects) + with_nots( past_narratives) + with_nots(present_simples) + with_nots( present_imperfects) + present_subjunctives + present_not_subjunctives + imperatives) diff --git a/hazm/Normalizer.py b/hazm/Normalizer.py index 3f83686d..fc08db0d 100644 --- a/hazm/Normalizer.py +++ b/hazm/Normalizer.py @@ -11,12 +11,16 @@ from .WordTokenizer import WordTokenizer from .utils import maketrans -compile_patterns = lambda patterns: [(re.compile(pattern), repl) for pattern, repl in patterns] +compile_patterns = lambda patterns: [(re.compile(pattern), repl) for + pattern, repl in patterns] class Normalizer(object): - def __init__(self, remove_extra_spaces=True, persian_style=True, persian_numbers=True, remove_diacritics=True, - affix_spacing=True, token_based=False, punctuation_spacing=True): + def __init__(self, remove_extra_spaces=True, persian_style=True, + persian_numbers=True, remove_diacritics=True, + affix_spacing=True, token_based=False, + punctuation_spacing=True): + self._punctuation_spacing = punctuation_spacing self._affix_spacing = affix_spacing self._token_based = token_based @@ -32,7 +36,8 @@ def __init__(self, remove_extra_spaces=True, persian_style=True, persian_numbers self.words = lemmatizer.words self.verbs = 
lemmatizer.verbs self.tokenizer = WordTokenizer(join_verb_parts=False) - self.suffixes = {'ی', 'ای', 'ها', 'های', 'تر', 'تری', 'ترین', 'گر', 'گری', 'ام', 'ات', 'اش'} + self.suffixes = {'ی', 'ای', 'ها', 'های', 'تر', 'تری', 'ترین', 'گر', + 'گری', 'ام', 'ات', 'اش'} self.character_refinement_patterns = [] @@ -53,20 +58,26 @@ def __init__(self, remove_extra_spaces=True, persian_style=True, persian_numbers if remove_diacritics: self.character_refinement_patterns.append( ('[\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652]', ''), - # remove FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SHADDA, SUKUN + # remove FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, + # SHADDA, SUKUN ) - self.character_refinement_patterns = compile_patterns(self.character_refinement_patterns) + self.character_refinement_patterns = compile_patterns( + self.character_refinement_patterns) punc_after, punc_before = r'\.:!،؛؟»\]\)\}', r'«\[\(\{' if punctuation_spacing: self.punctuation_spacing_patterns = compile_patterns([ - ('" ([^\n"]+) "', r'"\1"'), # remove space before and after quotation + ('" ([^\n"]+) "', r'"\1"'), + # remove space before and after quotation (' ([' + punc_after + '])', r'\1'), # remove space before ('([' + punc_before + ']) ', r'\1'), # remove space after - ('([' + punc_after[:3] + '])([^ \d' + punc_after + '])', r'\1 \2'), # put space after . and : - ('([' + punc_after[3:] + '])([^ ' + punc_after + '])', r'\1 \2'), # put space after - ('([^ ' + punc_before + '])([' + punc_before + '])', r'\1 \2'), # put space before + ('([' + punc_after[:3] + '])([^ \d' + punc_after + '])', + r'\1 \2'), # put space after . and : + ('([' + punc_after[3:] + '])([^ ' + punc_after + '])', + r'\1 \2'), # put space after + ('([^ ' + punc_before + '])([' + punc_before + '])', r'\1 \2'), + # put space before ]) if affix_spacing: @@ -74,9 +85,14 @@ def __init__(self, remove_extra_spaces=True, persian_style=True, persian_numbers (r'([^ ]ه) ی ', r'\1‌ی '), # fix ی space (r'(^| )(ن?می) ', r'\1\2‌'), # put zwnj after می, نمی ( - r'(?<=[^\n\d ' + punc_after + punc_before + ']{2}) (تر(ین?)?|گری?|های?)(?=[ \n' + punc_after + punc_before + ']|$)', - r'‌\1'), # put zwnj before تر, تری, ترین, گر, گری, ها, های - (r'([^ ]ه) (ا(م|یم|ش|ند|ی|ید|ت))(?=[ \n' + punc_after + ']|$)', r'\1‌\2'), + r'(?<=[^\n\d ' + + punc_after + + punc_before + + ']{2}) (تر(ین?)?|گری?|های?)(?=[ \n' + + punc_after + punc_before + ']|$)', + r'‌\1'), # put zwnj before تر, تری, ترین, گر, گری, ها, های + (r'([^ ]ه) (ا(م|یم|ش|ند|ی|ید|ت))(?=[ \n' + punc_after + ']|$)', + r'\1‌\2'), # join ام, ایم, اش, اند, ای, اید, ات ]) @@ -180,10 +196,12 @@ def token_spacing(self, tokens): if result: token_pair = result[-1] + '‌' + token - if token_pair in self.verbs or token_pair in self.words and self.words[token_pair][0] > 0: + if token_pair in self.verbs or token_pair in self.words and \ + self.words[token_pair][0] > 0: joined = True - if t < len(tokens) - 1 and token + '_' + tokens[t + 1] in self.verbs: + if t < len(tokens) - 1 and token + '_' + tokens[ + t + 1] in self.verbs: joined = False elif token in self.suffixes and result[-1] in self.words: diff --git a/hazm/POSTagger.py b/hazm/POSTagger.py index 33954362..a884f156 100755 --- a/hazm/POSTagger.py +++ b/hazm/POSTagger.py @@ -27,8 +27,9 @@ class StanfordPOSTagger(stanford.StanfordPOSTagger): def __init__(self, model_filename, path_to_jar, *args, **kwargs): self._SEPARATOR = '/' - super(stanford.StanfordPOSTagger, self).__init__(model_filename=model_filename, path_to_jar=path_to_jar, *args, - **kwargs) + 
super(stanford.StanfordPOSTagger, self).__init__( + model_filename=model_filename, path_to_jar=path_to_jar, *args, + **kwargs) def tag(self, tokens): return self.tag_sents([tokens])[0] diff --git a/hazm/PeykareReader.py b/hazm/PeykareReader.py index 16c61484..c6e1c40f 100644 --- a/hazm/PeykareReader.py +++ b/hazm/PeykareReader.py @@ -14,8 +14,10 @@ def coarse_pos_e(tags): """ - Coarse POS tags of Peykare corpus: - N: Noun, V: Verb, AJ: Adjective, ADV: Adverb, PRO: Pronoun, DET: Determiner, P: Preposition, POSTP: Postposition, NUM: Number, CONJ: Conjunction, PUNC: Punctuation, RES: Residual, CL: Classifier, INT: Interjection + Coarse POS tags of Peykare corpus: N: Noun, V: Verb, AJ: Adjective, + ADV: Adverb, PRO: Pronoun, DET: Determiner, P: Preposition, POSTP: + Postposition, NUM: Number, CONJ: Conjunction, PUNC: Punctuation, + RES: Residual, CL: Classifier, INT: Interjection >>> coarse_pos_e(['N','COM','SING']) 'N' @@ -23,8 +25,13 @@ def coarse_pos_e(tags): try: return list( - set(tags) & {'N', 'V', 'AJ', 'ADV', 'PRO', 'DET', 'P', 'POSTP', 'NUM', 'CONJ', 'PUNC', 'CL', 'INT', 'RES'})[ - 0] + ('e' if 'EZ' in tags else '') + set(tags) & {'N', 'V', + 'AJ', 'ADV', + 'PRO', 'DET', + 'P', 'POSTP', + 'NUM', 'CONJ', + 'PUNC', 'CL', + 'INT', 'RES'})[0] + ('e' if 'EZ' in tags else '') except: return 'N' @@ -39,11 +46,14 @@ def join_verb_parts(sentence): if not hasattr(join_verb_parts, 'tokenizer'): join_verb_parts.tokenizer = WordTokenizer() - before_verbs, after_verbs, verbe = join_verb_parts.tokenizer.before_verbs, join_verb_parts.tokenizer.after_verbs, join_verb_parts.tokenizer.verbe + before_verbs = join_verb_parts.tokenizer.before_verbs + after_verbs = join_verb_parts.tokenizer.after_verbs + verbe = join_verb_parts.tokenizer.verbe result = [('', '')] for word in reversed(sentence): - if word[0] in before_verbs or (result[-1][0] in after_verbs and word[0] in verbe): + if word[0] in before_verbs or ( + result[-1][0] in after_verbs and word[0] in verbe): result[-1] = (word[0] + '_' + result[-1][0], result[-1][1]) else: result.append(word) @@ -52,8 +62,12 @@ def join_verb_parts(sentence): class PeykareReader(): """ - Interfaces [Peykare Corpus](http://www.rcisp.com/?q=%D9%BE%DB%8C%DA%A9%D8%B1%D9%87%20%D9%85%D8%AA%D9%86%DB%8C%20%D8%B2%D8%A8%D8%A7%D9%86%20%D9%81%D8%A7%D8%B1%D8%B3%DB%8C) - Bijankhan, M., Sheykhzadegan, J., Bahrani, M., & Ghayoomi, M. (2011). Lessons from building a Persian written corpus: Peykare. Language Resources and Evaluation, 45, 143–164. + Interfaces [Peykare Corpus]( + http://www.rcisp.com/?q=%D9%BE%DB%8C%DA%A9%D8%B1%D9%87%20%D9%85%D8%AA%D9 + %86%DB%8C%20%D8%B2%D8%A8%D8%A7%D9%86%20%D9%81%D8%A7%D8%B1%D8%B3%DB%8C) + Bijankhan, M., Sheykhzadegan, J., Bahrani, M., & Ghayoomi, M. (2011). + Lessons from building a Persian written corpus: Peykare. Language + Resources and Evaluation, 45, 143–164. 
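+
+    passing pos_map=None skips the coarse mapping and keeps the original
+    Peykare tags joined with commas (construction sketch, output skipped):
+
+    >>> raw = PeykareReader(root='corpora/peykare', pos_map=None)  # doctest: +SKIP
+    >>> next(raw.sents())  # doctest: +SKIP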
>>> peykare = PeykareReader(root='corpora/peykare') >>> next(peykare.sents()) @@ -70,14 +84,16 @@ def __init__(self, root, joined_verb_parts=True, pos_map=coarse_pos_e): self._root = root self._pos_map = pos_map if pos_map else lambda tags: ','.join(tags) self._joined_verb_parts = joined_verb_parts - self._normalizer = Normalizer(punctuation_spacing=False, affix_spacing=False) + self._normalizer = Normalizer(punctuation_spacing=False, + affix_spacing=False) def docs(self): """ extracts raw text of peykare document """ for root, dirs, files in os.walk(self._root): for name in sorted(files): - with codecs.open(os.path.join(root, name), encoding='windows-1256') as peykare_file: + with codecs.open(os.path.join(root, name), + encoding='windows-1256') as peykare_file: text = peykare_file.read() if text: yield text @@ -91,7 +107,8 @@ def doc_to_sents(self, document): continue parts = line.split(' ') - tags, word = parts[3], self._normalizer.normalize('‌'.join(parts[4:])) + tags, word = parts[3], self._normalizer.normalize( + '‌'.join(parts[4:])) if word and word != '#': sentence.append((word, tags)) diff --git a/hazm/QuranCorpusReader.py b/hazm/QuranCorpusReader.py index 3f07d03f..b6f5050d 100644 --- a/hazm/QuranCorpusReader.py +++ b/hazm/QuranCorpusReader.py @@ -9,13 +9,25 @@ from .utils import maketrans -buckwalter_transliteration = maketrans("'>&<}AbptvjHxd*rzs$SDTZEg_fqklmnhwYyFNKaui~o^#`{:@\"[;,.!-+%]", - "\u0621\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064a\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652\u0653\u0654\u0670\u0671\u06dc\u06df\u06e0\u06e2\u06e3\u06e5\u06e6\u06e8\u06ea\u06eb\u06ec\u06ed") +buckwalter_transliteration = maketrans("'>&<}AbptvjHxd" + "*rzs$SDTZEg_fqklmnhwYyFNKaui~o^#`{" + ":@\"[;,.!-+%]", + "\u0621\u0623\u0624\u0625\u0626\u0627" + "\u0628\u0629\u062a\u062b\u062c\u062d" + "\u062e\u062f\u0630\u0631\u0632\u0633" + "\u0634\u0635\u0636\u0637\u0638\u0639" + "\u063a\u0640\u0641\u0642\u0643\u0644" + "\u0645\u0646\u0647\u0648\u0649\u064a" + "\u064b\u064c\u064d\u064e\u064f\u0650" + "\u0651\u0652\u0653\u0654\u0670\u0671" + "\u06dc\u06df\u06e0\u06e2\u06e3\u06e5" + "\u06e6\u06e8\u06ea\u06eb\u06ec\u06ed") class QuranCorpusReader(): """ - interfaces [Quran Corpus](http://corpus.quran.com/download/) that you must download and extract it. + interfaces [Quran Corpus](http://corpus.quran.com/download/) + that you must download and extract it. 
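+
+    parts() yields one morphological segment at a time as a dict with 'loc',
+    'text', 'tag' and, when present, 'lem'/'root' keys; the Arabic fields are
+    converted from Buckwalter transliteration (sketch, output skipped):
+
+    >>> quran_file = 'corpora/quranic-corpus-morphology.txt'
+    >>> reader = QuranCorpusReader(quran_file=quran_file)  # doctest: +SKIP
+    >>> next(reader.parts())['tag']  # doctest: +SKIP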
>>> quran = QuranCorpusReader(quran_file='corpora/quranic-corpus-morphology.txt') >>> print(next(quran.words())[1]) @@ -31,15 +43,18 @@ def parts(self): continue parts = line.strip().split('\t') - part = {'loc': eval(parts[0].replace(':', ',')), 'text': parts[1].translate(buckwalter_transliteration), + part = {'loc': eval(parts[0].replace(':', ',')), + 'text': parts[1].translate(buckwalter_transliteration), 'tag': parts[2]} features = parts[3].split('|') for feature in features: if feature.startswith('LEM:'): - part['lem'] = feature[4:].translate(buckwalter_transliteration) + part['lem'] = feature[4:].translate( + buckwalter_transliteration) elif feature.startswith('ROOT:'): - part['root'] = feature[5:].translate(buckwalter_transliteration) + part['root'] = feature[5:].translate( + buckwalter_transliteration) yield part def words(self): diff --git a/hazm/SentenceTokenizer.py b/hazm/SentenceTokenizer.py index b5d5b41b..ea540e9a 100644 --- a/hazm/SentenceTokenizer.py +++ b/hazm/SentenceTokenizer.py @@ -22,4 +22,5 @@ def __init__(self): def tokenize(self, text): text = self.pattern.sub(r'\1\n\n', text) - return [sentence.replace('\n', ' ').strip() for sentence in text.split('\n\n') if sentence.strip()] + return [sentence.replace('\n', ' ').strip() + for sentence in text.split('\n\n') if sentence.strip()] diff --git a/hazm/SentiPersReader.py b/hazm/SentiPersReader.py index 321000cf..49c208d9 100644 --- a/hazm/SentiPersReader.py +++ b/hazm/SentiPersReader.py @@ -27,8 +27,10 @@ def docs(self): def element_sentences(element): for sentence in element.getElementsByTagName('Sentence'): - yield {'text': sentence.childNodes[0].data, 'id': sentence.getAttribute('ID'), - 'value': int(sentence.getAttribute('Value')) if comment.getAttribute('Value') else None} + yield {'text': sentence.childNodes[0].data, + 'id': sentence.getAttribute('ID'), + 'value': int(sentence.getAttribute('Value')) + if comment.getAttribute('Value') else None} for root, dirs, files in os.walk(self._root): for filename in sorted(files): @@ -43,22 +45,29 @@ def element_sentences(element): } for child in product.childNodes: - if child.nodeName in {'Voters', 'Performance', 'Capability', 'Production_Quality', 'Ergonomics', + if child.nodeName in {'Voters', 'Performance', + 'Capability', + 'Production_Quality', + 'Ergonomics', 'Purchase_Value'}: value = child.getAttribute('Value') - doc[child.nodeName] = float(value) if '.' in value else int(value) + doc[child.nodeName] = float( + value) if '.' 
in value else int(value) - for comment in itertools.chain(elements.getElementsByTagName('Opinion'), - elements.getElementsByTagName('Criticism')): + for comment in itertools.chain( + elements.getElementsByTagName('Opinion'), + elements.getElementsByTagName('Criticism')): doc['comments'].append({ 'id': comment.getAttribute('ID'), 'type': comment.nodeName, 'author': comment.getAttribute('Holder').strip(), - 'value': int(comment.getAttribute('Value')) if comment.getAttribute('Value') else None, + 'value': int(comment.getAttribute( + 'Value')) if comment.getAttribute( + 'Value') else None, 'sentences': list(element_sentences(comment)) }) - - # todo: Accessories, Features, Review, Advantages, Tags, Keywords, Index + # todo: Accessories, Features, Review, + # todo: Advantages, Tags, Keywords, Index yield doc diff --git a/hazm/SequenceTagger.py b/hazm/SequenceTagger.py index 001931d0..90415730 100644 --- a/hazm/SequenceTagger.py +++ b/hazm/SequenceTagger.py @@ -27,17 +27,21 @@ def __init__(self, patterns=[], **options): self.model = Model(patterns='\n'.join(patterns), **options) def train(self, sentences): - self.model.train(['\n'.join([' '.join(word) for word in sentence]) for sentence in sentences]) + sequences = ['\n'.join([' '.join(word) for word in sentence]) + for sentence in sentences] + self.model.train(sequences) def save_model(self, filename): self.model.save(filename) def tag_sents(self, sentences): sentences = list(sentences) - lines = '\n\n'.join(['\n'.join(sentence) for sentence in sentences]).replace(' ', '_') + lines = '\n\n'.join( + ['\n'.join(sentence) for sentence in sentences]).replace(' ', '_') results = self.model.label_sequence(lines).decode('utf8') tags = iter(results.strip().split('\n')) - return [[(word, next(tags)) for word in sentence] for sentence in sentences] + return [[(word, next(tags)) for word in sentence] for sentence in + sentences] class IOBTagger(SequenceTagger): @@ -46,17 +50,22 @@ class IOBTagger(SequenceTagger): >>> tagger = IOBTagger(patterns=['*', 'U:word-%x[0,0]', 'U:word-%x[0,1]']) >>> tagger.train([[('من', 'PRO', 'B-NP'), ('به', 'P', 'B-PP'), ('مدرسه', 'N', 'B-NP'), ('رفته_بودم', 'V', 'B-VP'), ('.', 'PUNC', 'O')]]) >>> tagger.tag_sents([[('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]]) - [[('من', 'PRO', 'B-NP'), ('به', 'P', 'B-PP'), ('مدرسه', 'N', 'B-NP'), ('رفته_بودم', 'V', 'B-VP'), ('.', 'PUNC', 'O')]] + [[('من', 'PRO', 'B-NP'), ('به', 'P', 'B-PP'), ('مدرسه', 'N', 'B-NP') + , ('رفته_بودم', 'V', 'B-VP'), ('.', 'PUNC', 'O')]] """ def tag_sents(self, sentences): sentences = list(sentences) - lines = '\n\n'.join(['\n'.join(['\t'.join(word) for word in sentence]) for sentence in sentences]).replace(' ', - '_') + lines = '\n\n'.join( + ['\n'.join(['\t'.join(word) for word in sentence]) for sentence in + sentences]).replace(' ', + '_') results = self.model.label_sequence(lines).decode('utf8') tags = iter(results.strip().split('\n')) - return [[word + (next(tags),) for word in sentence] for sentence in sentences] + return [[word + (next(tags),) for word in sentence] for sentence in + sentences] def evaluate(self, gold): - tagged_sents = self.tag_sents(([word[:-1] for word in sentence] for sentence in gold)) + tagged_sents = self.tag_sents( + ([word[:-1] for word in sentence] for sentence in gold)) return accuracy(sum(gold, []), sum(tagged_sents, [])) diff --git a/hazm/Stemmer.py b/hazm/Stemmer.py index f2d8fdc1..9de77d55 100644 --- a/hazm/Stemmer.py +++ b/hazm/Stemmer.py @@ -26,7 +26,19 @@ class Stemmer(StemmerI): """ def 
__init__(self): - self.ends = ['ات', 'ان', 'ترین', 'تر', 'م', 'ت', 'ش', 'یی', 'ی', 'ها', 'ٔ', '‌ا', '‌'] + self.ends = ['ات', + 'ان', + 'ترین', + 'تر', + 'م', + 'ت', + 'ش', + 'یی', + 'ی', + 'ها', + 'ٔ', + '‌ا', + '‌'] def stem(self, word): for end in self.ends: diff --git a/hazm/TNewsReader.py b/hazm/TNewsReader.py index 1460d2da..e47cfb1f 100644 --- a/hazm/TNewsReader.py +++ b/hazm/TNewsReader.py @@ -37,21 +37,31 @@ def get_text(element): try: content = open(os.path.join(root, name)).read() - # fix xml formating issue - content = re.sub(r'[  ]', '', content).replace('', '') + '' + # fix xml formatting issue + content = re.sub(r'[  ]', '', content) + content = content.replace('', '') + '' elements = minidom.parseString(content) for element in elements.getElementsByTagName('NEWS'): doc = {} - doc['id'] = get_text(element.getElementsByTagName('NEWSID')[0]) - doc['url'] = get_text(element.getElementsByTagName('URL')[0]) - doc['datetime'] = get_text(element.getElementsByTagName('UTCDATE')[0]) - doc['category'] = get_text(element.getElementsByTagName('CATEGORY')[0]) - doc['pre-title'] = get_text(element.getElementsByTagName('PRETITLE')[0]) - doc['title'] = get_text(element.getElementsByTagName('TITLE')[0]) - doc['post-title'] = get_text(element.getElementsByTagName('POSTTITLE')[0]) - doc['brief'] = get_text(element.getElementsByTagName('BRIEF')[0]) - doc['text'] = get_text(element.getElementsByTagName('DESCRIPTION')[0]) + doc['id'] = get_text( + element.getElementsByTagName('NEWSID')[0]) + doc['url'] = get_text( + element.getElementsByTagName('URL')[0]) + doc['datetime'] = get_text( + element.getElementsByTagName('UTCDATE')[0]) + doc['category'] = get_text( + element.getElementsByTagName('CATEGORY')[0]) + doc['pre-title'] = get_text( + element.getElementsByTagName('PRETITLE')[0]) + doc['title'] = get_text( + element.getElementsByTagName('TITLE')[0]) + doc['post-title'] = get_text( + element.getElementsByTagName('POSTTITLE')[0]) + doc['brief'] = get_text( + element.getElementsByTagName('BRIEF')[0]) + doc['text'] = get_text( + element.getElementsByTagName('DESCRIPTION')[0]) yield doc except Exception as e: diff --git a/hazm/TokenSplitter.py b/hazm/TokenSplitter.py index 9fa5757e..df3a9c35 100644 --- a/hazm/TokenSplitter.py +++ b/hazm/TokenSplitter.py @@ -8,7 +8,7 @@ from .Lemmatizer import Lemmatizer -class TokenSplitter(): +class TokenSplitter: def __init__(self): self.lemmatizer = Lemmatizer() self.lemmatize = self.lemmatizer.lemmatize @@ -33,8 +33,10 @@ def split_token_words(self, token): if '‌' in token: candidates.append(tuple(token.split('‌'))) - splits = [(token[:s], token[s:]) for s in range(1, len(token)) if token[s - 1] != '‌' and token[s] != '‌'] + [ - (token,)] - candidates.extend(list(filter(lambda tokens: set(map(self.lemmatize, tokens)).issubset(self.words), splits))) + splits = [(token[:s], token[s:]) for s in range(1, len(token)) if + token[s - 1] != '‌' and token[s] != '‌'] + [(token,)] + candidates.extend(list(filter( + lambda tokens: set(map(self.lemmatize, tokens)).issubset( + self.words), splits))) return candidates diff --git a/hazm/TreebankReader.py b/hazm/TreebankReader.py index 836e7e08..b2057dcd 100644 --- a/hazm/TreebankReader.py +++ b/hazm/TreebankReader.py @@ -18,15 +18,20 @@ def coarse_pos_e(tags): """ - Coarse POS tags of Treebank corpus: - N: Noun, V: Verb, A: Adjective, D: Adverb, Z: Pronoun, T: Determiner, E: Preposition, P: Postposition, U: Number, J: Conjunction, O: Punctuation, R: Residual, L: Classifier, I: Interjection + Coarse POS tags of Treebank corpus: 
N: Noun, V: Verb, A: Adjective, + D: Adverb, Z: Pronoun, T: Determiner, E: Preposition, P: Postposition, + U: Number, J: Conjunction, O: Punctuation, R: Residual, L: Classifier, + I: Interjection >>> coarse_pos_e(['Nasp---', 'pers', 'prop']) 'N' """ - map = {'N': 'N', 'V': 'V', 'A': 'AJ', 'D': 'ADV', 'Z': 'PRO', 'T': 'DET', 'E': 'P', 'P': 'POSTP', 'U': 'NUM', - 'J': 'CONJ', 'O': 'PUNC', 'R': 'RES', 'L': 'CL', 'I': 'INT'} + pos_map = {'N': 'N', 'V': 'V', 'A': 'AJ', + 'D': 'ADV', 'Z': 'PRO', 'T': 'DET', + 'E': 'P', 'P': 'POSTP', 'U': 'NUM', + 'J': 'CONJ', 'O': 'PUNC', 'R': 'RES', + 'L': 'CL', 'I': 'INT'} try: if tags[0][0] == 'C': if 'pronominal' in tags: @@ -39,12 +44,12 @@ def coarse_pos_e(tags): tags[0] = 'D' elif 'det' in tags: tags[0] = 'T' - return map[tags[0][0]] + ('e' if 'ezafe' in tags else '') + return pos_map[tags[0][0]] + ('e' if 'ezafe' in tags else '') except Exception: return '' -class TreebankReader(): +class TreebankReader: """ interfaces [Per­si­an Tree­bank](http://hpsg.fu-berlin.de/~ghayoomi/PTB.html) @@ -59,14 +64,16 @@ class TreebankReader(): (PUNC ./PUNC)) >>> next(treebank.sents()) - [('دنیای', 'Ne'), ('آدولف', 'N'), ('بورن', 'N'), ('دنیای', 'Ne'), ('اتفاقات', 'Ne'), ('رویایی', 'AJ'), ('است', 'V'), ('.', 'PUNC')] + [('دنیای', 'Ne'), ('آدولف', 'N'), ('بورن', 'N'), ('دنیای', 'Ne'), + ('اتفاقات', 'Ne'), ('رویایی', 'AJ'), ('است', 'V'), ('.', 'PUNC')] >>> from .Chunker import tree2brackets >>> tree2brackets(next(treebank.chunked_trees())) '[دنیای آدولف بورن NP] [دنیای اتفاقات رویایی NP] [است VP] .' """ - def __init__(self, root, pos_map=coarse_pos_e, join_clitics=False, join_verb_parts=False): + def __init__(self, root, pos_map=coarse_pos_e, join_clitics=False, + join_verb_parts=False): self._root = root self._pos_map = pos_map if pos_map else lambda tags: ','.join(tags) self._join_clitics = join_clitics @@ -77,7 +84,8 @@ def docs(self): for root, dirs, files in os.walk(self._root): for name in sorted(files): try: - with codecs.open(os.path.join(root, name), encoding='utf8') as treebank_file: + with codecs.open(os.path.join(root, name), + encoding='utf8') as treebank_file: raw = re.sub(r'\n *', '', treebank_file.read()) yield minidom.parseString(raw.encode('utf8')) except Exception as e: @@ -88,7 +96,8 @@ def trees(self): def traverse(node): def extract_tags(W): pos = [W.getAttribute('lc') if W.getAttribute('lc') else None] - if W.getAttribute('clitic') in {'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'}: + if W.getAttribute('clitic') in {'ezafe', 'pronominal', 'verb', + 'prep', 'adv', 'det'}: pos.append(W.getAttribute('clitic')) if W.getAttribute('ne_sort'): pos.append(W.getAttribute('ne_sort')) @@ -119,39 +128,53 @@ def clitic_join(tree, clitic): first = node.childNodes[0] if first.tagName == 'w': pos = extract_tags(first) - return Tree(node.tagName, [(first.childNodes[0].data.replace('می ', 'می‌'), self._pos_map(pos))]) - childs = node.childNodes[2:] if node.tagName == 'S' else node.childNodes + return Tree(node.tagName, [(first.childNodes[0].data.replace( + 'می ', 'می‌'), self._pos_map(pos))]) + childs = node.childNodes[ + 2:] if node.tagName == 'S' else node.childNodes for child in childs: if not len(child.childNodes): childs.remove(child) tree = Tree(node.tagName, map(traverse, childs)) - if self._join_clitics and len(tree) > 1 and type(tree[1]) == Tree and tree[1].label() == 'CLITIC' and \ + if self._join_clitics and len(tree) > 1 and type( + tree[1]) == Tree and tree[1].label() == 'CLITIC' and \ tree[1][0][1] not in {'P', 'V'}: clitic = tree[-1] tree = 
Tree(tree.label(), [subtree for subtree in tree[0]]) clitic_join(tree, clitic) - if self._join_verb_parts and len(tree) > 1 and type(tree[1]) == Tree and type(tree[0]) == Tree and tree[ - 0].label() == 'AUX' and tree[0][0][0] in self._tokenizer.before_verbs: - tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1]) + if self._join_verb_parts and len(tree) > 1 and type( + tree[1]) == Tree and type(tree[0]) == Tree and tree[ + 0].label() == 'AUX' and tree[0][0][ + 0] in self._tokenizer.before_verbs: + tree[1][0] = ( + tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1]) tree.remove(tree[0]) - if self._join_verb_parts and len(tree.leaves()) > 1 and tree.leaves()[-1][ - 0] in self._tokenizer.after_verbs and tree.leaves()[-2][0] in self._tokenizer.verbe: - tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1]) + if self._join_verb_parts and len(tree.leaves()) > 1 and \ + tree.leaves()[-1][ + 0] in self._tokenizer.after_verbs and \ + tree.leaves()[-2][0] in self._tokenizer.verbe: + tree[1][0] = ( + tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1]) path = tree.leaf_treeposition(len(tree.leaves()) - 2) removingtree = tree while len(path) > 2: removingtree = removingtree[path[0]] path = path[1:] - removingtree.remove(Tree(tree.pos()[-2][1], [tree.pos()[-2][0]])) - if self._join_verb_parts and len(tree.leaves()) > 1 and tree.leaves()[-1][ - 0] in self._tokenizer.after_verbs and tree.leaves()[-2][0] in self._tokenizer.verbe: - tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1]) + removingtree.remove( + Tree(tree.pos()[-2][1], [tree.pos()[-2][0]])) + if self._join_verb_parts and len(tree.leaves()) > 1 and \ + tree.leaves()[-1][ + 0] in self._tokenizer.after_verbs and \ + tree.leaves()[-2][0] in self._tokenizer.verbe: + tree[1][0] = ( + tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1]) path = tree.leaf_treeposition(len(tree.leaves()) - 2) removingtree = tree while len(path) > 2: removingtree = removingtree[path[0]] path = path[1:] - removingtree.remove(Tree(tree.pos()[-2][1], [tree.pos()[-2][0]])) + removingtree.remove( + Tree(tree.pos()[-2][1], [tree.pos()[-2][0]])) return tree for doc in self.docs(): @@ -163,7 +186,9 @@ def sents(self): yield tree.leaves() def chunked_trees(self): - collapse = lambda node, label: Tree(label, [Tree(pos[1], [pos[0]]) for pos in node.pos()]) + collapse = lambda node, label: Tree(label, + [Tree(pos[1], [pos[0]]) for pos in + node.pos()]) def traverse(node, parent, chunks): label = node.label() @@ -218,18 +243,24 @@ def traverse(node, parent, chunks): chunks.append(collapse(node, 'NP')) return - if label in {'NPC', 'N', 'INFV', 'DPA', 'CLASS', 'DPC', 'DEM', 'INTJ', 'MN', 'PRON', 'DET', 'NUM', 'RES'}: + if label in {'NPC', 'N', 'INFV', 'DPA', 'CLASS', 'DPC', 'DEM', + 'INTJ', 'MN', 'PRON', 'DET', 'NUM', 'RES'}: chunks.append(collapse(node, 'NP')) return if label == 'NPA' and len(node) >= 2: - if node[0].label() == 'ADJ' and node[1].label() == 'NPC' or node[0].label() in {'N', 'PRON'} and node[ - 1].label() in {'ADJ', 'ADJPA', 'N'} or node[0].label() == 'NUM' and node[1].label() in {'N', 'NPC', - 'MN', - 'NUM'} or \ - node[0].label() in {'N', 'NPC', 'MN'} and node[1].label() == 'NUM' or node[ - 0].label() == 'NPC' and node[1].label() == 'ADJ' or node[0].label() == 'NPA' and node[ - 1].label() != 'NPC' or node[1].label() == 'NPA' and node[0].label() != 'NPC': + if node[0].label() == 'ADJ' and node[1].label() == 'NPC' or \ + node[0].label() in {'N', 'PRON'} and node[ + 1].label() in {'ADJ', 'ADJPA', 
'N'} or node[ + 0].label() == 'NUM' and node[1].label() in {'N', 'NPC', + 'MN', + 'NUM'} or \ + node[0].label() in {'N', 'NPC', 'MN'} and node[ + 1].label() == 'NUM' or node[ + 0].label() == 'NPC' and node[1].label() == 'ADJ' or node[ + 0].label() == 'NPA' and node[ + 1].label() != 'NPC' or node[1].label() == 'NPA' and node[ + 0].label() != 'NPC': chunks.append(collapse(node, 'NP')) return diff --git a/hazm/VerbValencyReader.py b/hazm/VerbValencyReader.py index 010ecdb0..77c55110 100644 --- a/hazm/VerbValencyReader.py +++ b/hazm/VerbValencyReader.py @@ -9,13 +9,19 @@ from collections import namedtuple Verb = namedtuple('Verb', - ('past_light_verb', 'present_light_verb', 'prefix', 'nonverbal_element', 'preposition', 'valency')) + ('past_light_verb', 'present_light_verb', 'prefix', + 'nonverbal_element', 'preposition', 'valency')) class VerbValencyReader(): """ interfaces [Verb Valency Corpus](http://dadegan.ir/catalog/pervallex) - Mohammad Sadegh Rasooli, Amirsaeid Moloodi, Manouchehr Kouhestani, & Behrouz Minaei Bidgoli. (2011). A Syntactic Valency Lexicon for Persian Verbs: The First Steps towards Persian Dependency Treebank. in 5th Language & Technology Conference (LTC): Human Language Technologies as a Challenge for Computer Science and Linguistics (pp. 227–231). Poznań, Poland. + Mohammad Sadegh Rasooli, Amirsaeid Moloodi, Manouchehr Kouhestani, & + Behrouz Minaei Bidgoli. (2011). A Syntactic Valency Lexicon for + Persian Verbs: The First Steps towards Persian Dependency Treebank. in 5th + Language & Technology Conference (LTC): Human Language Technologies as + a Challenge for Computer Science and Linguistics (pp. 227–231). + Poznań, Poland. """ def __init__(self, valency_file='corpora/valency.txt'): diff --git a/hazm/WikiExtractor.py b/hazm/WikiExtractor.py index 62eca670..0552460d 100755 --- a/hazm/WikiExtractor.py +++ b/hazm/WikiExtractor.py @@ -345,19 +345,22 @@ def fixup(m): def ignoreTag(tag): - left = re.compile(r'<%s\b.*?>' % tag, re.IGNORECASE | re.DOTALL) # both and + left = re.compile(r'<%s\b.*?>' % tag, + re.IGNORECASE | re.DOTALL) # both and right = re.compile(r'' % tag, re.IGNORECASE) options.ignored_tag_patterns.append((left, right)) # Match selfClosing HTML tags selfClosing_tag_patterns = [ - re.compile(r'<\s*%s\b[^>]*/\s*>' % tag, re.DOTALL | re.IGNORECASE) for tag in selfClosingTags + re.compile(r'<\s*%s\b[^>]*/\s*>' % tag, re.DOTALL | re.IGNORECASE) + for tag in selfClosingTags ] # Match HTML placeholder tags placeholder_tag_patterns = [ - (re.compile(r'<\s*%s(\s*| [^>]+?)>.*?<\s*/\s*%s\s*>' % (tag, tag), re.DOTALL | re.IGNORECASE), + (re.compile(r'<\s*%s(\s*| [^>]+?)>.*?<\s*/\s*%s\s*>' % (tag, tag), + re.DOTALL | re.IGNORECASE), repl) for tag, repl in placeholder_tags.items() ] @@ -484,7 +487,8 @@ def subst(self, params, extractor, depth): paramName = extractor.transform(paramName) res = '' if paramName in params: - res = params[paramName] # use parameter value specified in template invocation + res = params[ + paramName] # use parameter value specified in template invocation elif self.default: # use the default value defaultValue = self.default.subst(params, extractor, depth + 1) res = extractor.transform(defaultValue) @@ -567,9 +571,11 @@ def write_output(self, out, text): out.write('\n') else: if options.print_revision: - header = '\n' % (self.id, self.revid, url, self.title) + header = '\n' % ( + self.id, self.revid, url, self.title) else: - header = '\n' % (self.id, url, self.title) + header = '\n' % ( + self.id, url, self.title) footer = "\n\n" if out == 
sys.stdout: # option -a or -o - header = header.encode('utf-8') @@ -601,7 +607,8 @@ def extract(self, out): ns = '' # Main pagename = self.title self.magicWords['NAMESPACE'] = ns - self.magicWords['NAMESPACENUMBER'] = options.knownNamespaces.get(ns, '0') + self.magicWords['NAMESPACENUMBER'] = options.knownNamespaces.get(ns, + '0') self.magicWords['PAGENAME'] = pagename self.magicWords['FULLPAGENAME'] = self.title slash = pagename.rfind('/') @@ -645,8 +652,9 @@ def extract(self, out): self.recursion_exceeded_2_errs, self.recursion_exceeded_3_errs) if any(errs): - logging.warn("Template errors in article '%s' (%s): title(%d) recursion(%d, %d, %d)", - self.title, self.id, *errs) + logging.warn( + "Template errors in article '%s' (%s): title(%d) recursion(%d, %d, %d)", + self.title, self.id, *errs) def transform(self, wikitext): """ @@ -657,7 +665,8 @@ def transform(self, wikitext): res = '' cur = 0 for m in nowiki.finditer(wikitext, cur): - res += self.transform1(wikitext[cur:m.start()]) + wikitext[m.start():m.end()] + res += self.transform1(wikitext[cur:m.start()]) + wikitext[ + m.start():m.end()] cur = m.end() # leftover res += self.transform1(wikitext[cur:]) @@ -756,7 +765,8 @@ def clean(self, text): # Drop discarded elements for tag in options.discardElements: - text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, r'<\s*/\s*%s>' % tag) + text = dropNested(text, r'<\s*%s\b[^>/]*>' % tag, + r'<\s*/\s*%s>' % tag) if not options.toHTML: # Turn into text what is left (&nbsp;) and @@ -766,7 +776,8 @@ def clean(self, text): for pattern, placeholder in placeholder_tag_patterns: index = 1 for match in pattern.finditer(text): - text = text.replace(match.group(), '%s_%d' % (placeholder, index)) + text = text.replace(match.group(), + '%s_%d' % (placeholder, index)) index += 1 text = text.replace('<<', '«').replace('>>', '»') @@ -779,14 +790,17 @@ def clean(self, text): text = dots.sub('...', text) text = re.sub(' (,:\.\)\]»)', r'\1', text) text = re.sub('(\[\(«) ', r'\1', text) - text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U) # lines with only punctuations + text = re.sub(r'\n\W+?\n', '\n', text, + flags=re.U) # lines with only punctuations text = text.replace(',,', ',').replace(',.', '.') if options.keep_tables: # the following regular expressions are used to remove the wikiml chartacters around table strucutures # yet keep the content. The order here is imporant so we remove certain markup like {| and then # then the future html attributes such as 'style'. Finally we drop the remaining '|-' that delimits cells. 
text = re.sub(r'!(?:\s)?style=\"[a-z]+:(?:\d+)%;\"', r'', text) - text = re.sub(r'!(?:\s)?style="[a-z]+:(?:\d+)%;[a-z]+:(?:#)?(?:[0-9a-z]+)?"', r'', text) + text = re.sub( + r'!(?:\s)?style="[a-z]+:(?:\d+)%;[a-z]+:(?:#)?(?:[0-9a-z]+)?"', + r'', text) text = text.replace('|-', '') text = text.replace('|', '') if options.toHTML: @@ -997,7 +1011,8 @@ def expandTemplate(self, body): colon = title.find(':') if colon > 1: funct = title[:colon] - parts[0] = title[colon + 1:].strip() # side-effect (parts[0] not used later) + parts[0] = title[ + colon + 1:].strip() # side-effect (parts[0] not used later) # arguments after first are not evaluated ret = callParserFunction(funct, parts, self) logging.debug('%*s]*\s+)*?class="(?:[^"\s>]*\s+)*?error(?:\s[^">]*)?"', test): + if re.match( + '<(?:strong|span|p|div)\s(?:[^\s>]*\s+)*?class="(?:[^"\s>]*\s+)*?error(?:\s[^">]*)?"', + test): return extr.expand(then.strip()) elif Else is None: return test.strip() @@ -1858,7 +1878,8 @@ def sharp_invoke(module, function, args): '#ifexpr': lambda *args: '', # not supported - '#ifexist': lambda extr, title, ifex, ifnex: extr.expand(ifnex), # assuming title is not present + '#ifexist': lambda extr, title, ifex, ifnex: extr.expand(ifnex), +# assuming title is not present '#rel2abs': lambda *args: '', # not supported @@ -1904,7 +1925,8 @@ def callParserFunction(functionName, args, extractor): functionName = functionName.lower() if functionName == '#invoke': module, fun = args[0].strip(), args[1].strip() - logging.debug('%*s#invoke %s %s %s', extractor.frame.depth, '', module, fun, args[2:]) + logging.debug('%*s#invoke %s %s %s', extractor.frame.depth, '', + module, fun, args[2:]) # special handling of frame if len(args) == 2: # find parameters in frame whose title is the one of the original @@ -1920,10 +1942,12 @@ def callParserFunction(functionName, args, extractor): break frame = frame.prev else: - params = [extractor.transform(p) for p in args[2:]] # evaluates them + params = [extractor.transform(p) for p in + args[2:]] # evaluates them params = extractor.templateParams(params) ret = sharp_invoke(module, fun, params) - logging.debug('%*s<#invoke %s %s %s', extractor.frame.depth, '', module, fun, ret) + logging.debug('%*s<#invoke %s %s %s', extractor.frame.depth, '', + module, fun, ret) return ret if functionName in parserFunctions: # branching functions use the extractor to selectively evaluate args @@ -2394,7 +2418,8 @@ def makeInternalLink(title, label): if colon == 0: # drop also :File: colon2 = title.find(':', colon + 1) - if colon2 > 1 and title[colon + 1:colon2] not in options.acceptedNamespaces: + if colon2 > 1 and title[ + colon + 1:colon2] not in options.acceptedNamespaces: return '' if options.keepLinks: return '%s' % (quote(title.encode('utf-8')), label) @@ -2492,7 +2517,8 @@ def makeExternalImage(url, alt=''): # match tail after wikilink tailRE = re.compile('\w+') -syntaxhighlight = re.compile('<syntaxhighlight .*?>(.*?)</syntaxhighlight>', re.DOTALL) +syntaxhighlight = re.compile( + '<syntaxhighlight .*?>(.*?)</syntaxhighlight>', re.DOTALL) # skip level 1, it is page name level section = re.compile(r'(==+)\s*(.*?)\s*\1') @@ -2597,7 +2623,8 @@ def compact(text): # use item count for #-lines listCount[i - 1] += 1 bullet = '%d. 
' % listCount[i - 1] if n == '#' else '- ' - page.append('{0:{1}s}'.format(bullet, len(listLevel)) + line) + page.append( + '{0:{1}s}'.format(bullet, len(listLevel)) + line) elif options.toHTML: page.append(listItem[n] % line) elif len(listLevel): @@ -2665,7 +2692,8 @@ def __next__(self): def _dirname(self): char1 = self.dir_index % 26 char2 = self.dir_index // 26 % 26 - return os.path.join(self.path_name, '%c%c' % (ord('A') + char2, ord('A') + char1)) + return os.path.join(self.path_name, + '%c%c' % (ord('A') + char2, ord('A') + char1)) def _filepath(self): return '%s/wiki_%02d' % (self._dirname(), self.file_index) @@ -2757,7 +2785,8 @@ def load_templates(file, output_file=None): logging.info("Preprocessed %d pages", page_count) if output_file: output.close() - logging.info("Saved %d templates to '%s'", len(options.templates), output_file) + logging.info("Saved %d templates to '%s'", len(options.templates), + output_file) def pages_from(input): @@ -2799,7 +2828,8 @@ def pages_from(input): elif tag == 'redirect': redirect = True elif tag == 'text': - if m.lastindex == 3 and line[m.start(3) - 2] == '/': # self closing + if m.lastindex == 3 and line[ + m.start(3) - 2] == '/': # self closing # continue inText = True @@ -2838,7 +2868,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, if input_file == '-': input = sys.stdin else: - input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed) + input = fileinput.FileInput(input_file, + openhook=fileinput.hook_compressed) # collect siteinfo for line in input: @@ -2874,7 +2905,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, template_load_start = default_timer() if template_file: if os.path.exists(template_file): - logging.info("Loading template definitions from: %s", template_file) + logging.info("Loading template definitions from: %s", + template_file) # can't use with here: file = fileinput.FileInput(template_file, openhook=fileinput.hook_compressed) @@ -2883,13 +2915,18 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, else: if input_file == '-': # can't scan then reset stdin; must error w/ suggestion to specify template_file - raise ValueError("to use templates with stdin dump, must supply explicit template-file") - logging.info("Preprocessing '%s' to collect template definitions: this may take some time.", input_file) + raise ValueError( + "to use templates with stdin dump, must supply explicit template-file") + logging.info( + "Preprocessing '%s' to collect template definitions: this may take some time.", + input_file) load_templates(input, template_file) input.close() - input = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed) + input = fileinput.FileInput(input_file, + openhook=fileinput.hook_compressed) template_load_elapsed = default_timer() - template_load_start - logging.info("Loaded %d templates in %.1fs", len(options.templates), template_load_elapsed) + logging.info("Loaded %d templates in %.1fs", len(options.templates), + template_load_elapsed) # process pages logging.info("Starting page extraction from %s.", input_file) @@ -2967,8 +3004,9 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, extract_duration = default_timer() - extract_start extract_rate = page_num / extract_duration - logging.info("Finished %d-process extraction of %d articles in %.1fs (%.1f art/s)", - process_count, page_num, extract_duration, extract_rate) + logging.info( + "Finished %d-process 
extraction of %d articles in %.1fs (%.1f art/s)", + process_count, page_num, extract_duration, extract_rate) # ---------------------------------------------------------------------- @@ -3036,7 +3074,8 @@ def reduce_process(opts, output_queue, spool_length, else: output = sys.stdout if PY2 else sys.stdout.buffer if file_compress: - logging.warn("writing to stdout, so no output compression (use an external tool)") + logging.warn( + "writing to stdout, so no output compression (use an external tool)") interval_start = default_timer() # FIXME: use a heap @@ -3050,7 +3089,8 @@ def reduce_process(opts, output_queue, spool_length, spool_length.value = len(spool) # progress report if next_page % report_period == 0: - interval_rate = report_period / (default_timer() - interval_start) + interval_rate = report_period / ( + default_timer() - interval_start) logging.info("Extracted %d articles (%.1f art/s)", next_page, interval_rate) interval_start = default_timer() @@ -3110,17 +3150,23 @@ def main(): help="use or create file containing templates") groupP.add_argument("--no-templates", action="store_false", help="Do not expand templates") - groupP.add_argument("-r", "--revision", action="store_true", default=options.print_revision, + groupP.add_argument("-r", "--revision", action="store_true", + default=options.print_revision, help="Include the document revision id (default=%(default)s)") - groupP.add_argument("--min_text_length", type=int, default=options.min_text_length, + groupP.add_argument("--min_text_length", type=int, + default=options.min_text_length, help="Minimum expanded text length required to write document (default=%(default)s)") - groupP.add_argument("--filter_disambig_pages", action="store_true", default=options.filter_disambig_pages, + groupP.add_argument("--filter_disambig_pages", action="store_true", + default=options.filter_disambig_pages, help="Remove pages from output that contain disabmiguation markup (default=%(default)s)") - groupP.add_argument("-it", "--ignored_tags", default="", metavar="abbr,b,big", + groupP.add_argument("-it", "--ignored_tags", default="", + metavar="abbr,b,big", help="comma separated list of tags that will be dropped, keeping their content") - groupP.add_argument("-de", "--discard_elements", default="", metavar="gallery,timeline,noinclude", + groupP.add_argument("-de", "--discard_elements", default="", + metavar="gallery,timeline,noinclude", help="comma separated list of elements that will be removed from the article text") - groupP.add_argument("--keep_tables", action="store_true", default=options.keep_tables, + groupP.add_argument("--keep_tables", action="store_true", + default=options.keep_tables, help="Preserve tables in the output article text (default=%(default)s)") default_process_count = max(1, cpu_count() - 1) parser.add_argument("--processes", type=int, default=default_process_count, @@ -3207,7 +3253,8 @@ def main(): with open(args.templates) as file: load_templates(file) - file = fileinput.FileInput(input_file, openhook=fileinput.hook_compressed) + file = fileinput.FileInput(input_file, + openhook=fileinput.hook_compressed) for page_data in pages_from(file): id, revid, title, ns, page = page_data Extractor(id, revid, title, page).extract(sys.stdout) diff --git a/hazm/WikipediaReader.py b/hazm/WikipediaReader.py index b1819c93..98e88d3e 100644 --- a/hazm/WikipediaReader.py +++ b/hazm/WikipediaReader.py @@ -10,7 +10,7 @@ import subprocess -class WikipediaReader(): +class WikipediaReader: """ interfaces [Persian Wikipedia dump] 
(http://download.wikimedia.org/fawiki/latest/fawiki-latest-pages-articles.xml.bz2) diff --git a/hazm/WordTokenizer.py b/hazm/WordTokenizer.py index 4827429b..12a3d3e5 100644 --- a/hazm/WordTokenizer.py +++ b/hazm/WordTokenizer.py @@ -23,36 +23,47 @@ class WordTokenizer(TokenizerI): ['نسخه', '0.5', 'در', 'ساعت', '22:00', 'تهران', '،', '1396'] """ - def __init__(self, words_file=default_words, verbs_file=default_verbs, join_verb_parts=True): + def __init__(self, words_file=default_words, verbs_file=default_verbs, + join_verb_parts=True): + self._join_verb_parts = join_verb_parts self.pattern = re.compile(r'([؟!\?]+|[\d\.:]+|[:\.،؛»\]\)\}"«\[\(\{])') self.words = {item[0]: (item[1], item[2]) for item in words_list(default_words)} if join_verb_parts: - self.after_verbs = set([ - 'ام', 'ای', 'است', 'ایم', 'اید', 'اند', 'بودم', 'بودی', 'بود', 'بودیم', 'بودید', 'بودند', 'باشم', - 'باشی', 'باشد', 'باشیم', 'باشید', 'باشند', - 'شده_ام', 'شده_ای', 'شده_است', 'شده_ایم', 'شده_اید', 'شده_اند', 'شده_بودم', 'شده_بودی', 'شده_بود', - 'شده_بودیم', 'شده_بودید', 'شده_بودند', 'شده_باشم', 'شده_باشی', 'شده_باشد', 'شده_باشیم', 'شده_باشید', - 'شده_باشند', - 'نشده_ام', 'نشده_ای', 'نشده_است', 'نشده_ایم', 'نشده_اید', 'نشده_اند', 'نشده_بودم', 'نشده_بودی', - 'نشده_بود', 'نشده_بودیم', 'نشده_بودید', 'نشده_بودند', 'نشده_باشم', 'نشده_باشی', 'نشده_باشد', - 'نشده_باشیم', 'نشده_باشید', 'نشده_باشند', - 'شوم', 'شوی', 'شود', 'شویم', 'شوید', 'شوند', 'شدم', 'شدی', 'شد', 'شدیم', 'شدید', 'شدند', - 'نشوم', 'نشوی', 'نشود', 'نشویم', 'نشوید', 'نشوند', 'نشدم', 'نشدی', 'نشد', 'نشدیم', 'نشدید', 'نشدند', - 'می‌شوم', 'می‌شوی', 'می‌شود', 'می‌شویم', 'می‌شوید', 'می‌شوند', 'می‌شدم', 'می‌شدی', 'می‌شد', 'می‌شدیم', - 'می‌شدید', 'می‌شدند', - 'نمی‌شوم', 'نمی‌شوی', 'نمی‌شود', 'نمی‌شویم', 'نمی‌شوید', 'نمی‌شوند', 'نمی‌شدم', 'نمی‌شدی', 'نمی‌شد', - 'نمی‌شدیم', 'نمی‌شدید', 'نمی‌شدند', - 'خواهم_شد', 'خواهی_شد', 'خواهد_شد', 'خواهیم_شد', 'خواهید_شد', 'خواهند_شد', - 'نخواهم_شد', 'نخواهی_شد', 'نخواهد_شد', 'نخواهیم_شد', 'نخواهید_شد', 'نخواهند_شد', - ]) - - self.before_verbs = set([ - 'خواهم', 'خواهی', 'خواهد', 'خواهیم', 'خواهید', 'خواهند', - 'نخواهم', 'نخواهی', 'نخواهد', 'نخواهیم', 'نخواهید', 'نخواهند' - ]) + self.after_verbs = {'ام', 'ای', 'است', 'ایم', 'اید', 'اند', 'بودم', + 'بودی', 'بود', 'بودیم', 'بودید', 'بودند', + 'باشم', 'باشی', 'باشد', 'باشیم', 'باشید', + 'باشند', 'شده_ام', 'شده_ای', 'شده_است', + 'شده_ایم', 'شده_اید', 'شده_اند', 'شده_بودم', + 'شده_بودی', 'شده_بود', 'شده_بودیم', + 'شده_بودید', 'شده_بودند', 'شده_باشم', + 'شده_باشی', 'شده_باشد', 'شده_باشیم', + 'شده_باشید', 'شده_باشند', 'نشده_ام', 'نشده_ای', + 'نشده_است', 'نشده_ایم', 'نشده_اید', 'نشده_اند', + 'نشده_بودم', 'نشده_بودی', 'نشده_بود', + 'نشده_بودیم', 'نشده_بودید', 'نشده_بودند', + 'نشده_باشم', 'نشده_باشی', 'نشده_باشد', + 'نشده_باشیم', 'نشده_باشید', 'نشده_باشند', + 'شوم', 'شوی', 'شود', 'شویم', 'شوید', 'شوند', + 'شدم', 'شدی', 'شد', 'شدیم', 'شدید', 'شدند', + 'نشوم', 'نشوی', 'نشود', 'نشویم', 'نشوید', + 'نشوند', 'نشدم', 'نشدی', 'نشد', 'نشدیم', + 'نشدید', 'نشدند', 'می‌شوم', 'می‌شوی', 'می‌شود', + 'می‌شویم', 'می‌شوید', 'می‌شوند', 'می‌شدم', + 'می‌شدی', 'می‌شد', 'می‌شدیم', 'می‌شدید', + 'می‌شدند', 'نمی‌شوم', 'نمی‌شوی', 'نمی‌شود', + 'نمی‌شویم', 'نمی‌شوید', 'نمی‌شوند', 'نمی‌شدم', + 'نمی‌شدی', 'نمی‌شد', 'نمی‌شدیم', 'نمی‌شدید', + 'نمی‌شدند', 'خواهم_شد', 'خواهی_شد', 'خواهد_شد', + 'خواهیم_شد', 'خواهید_شد', 'خواهند_شد', + 'نخواهم_شد', 'نخواهی_شد', 'نخواهد_شد', + 'نخواهیم_شد', 'نخواهید_شد', 'نخواهند_شد'} + + self.before_verbs = {'خواهم', 'خواهی', 'خواهد', 'خواهیم', 'خواهید', + 'خواهند', 'نخواهم', 'نخواهی', 'نخواهد', 
+ 'نخواهیم', 'نخواهید', 'نخواهند'} with codecs.open(verbs_file, encoding='utf8') as verbs_file: self.verbs = list(reversed([verb.strip() for verb in verbs_file if verb])) diff --git a/hazm/utils.py b/hazm/utils.py index f75fe73c..bdfde90c 100644 --- a/hazm/utils.py +++ b/hazm/utils.py @@ -26,7 +26,10 @@ def words_list(words_file=default_words): with codecs.open(words_file, encoding='utf-8') as words_file: items = [line.strip().split('\t') for line in words_file] - return [(item[0], int(item[1]), tuple(item[2].split(','))) for item in items if len(item) == 3] + return [ + (item[0], int(item[1]), tuple(item[2].split(','))) + for item in items if len(item) == 3 + ] def stopwords_list(stopwords_file=default_stopwords): From e50b7f3f2deb1d29962be722a5b3f7c89edd088c Mon Sep 17 00:00:00 2001 From: Amir Hadifar Date: Sat, 25 Aug 2018 15:30:06 +0430 Subject: [PATCH 4/4] more on pep8 compilance --- hazm/BijankhanReader.py | 3 +- hazm/HamshahriReader.py | 3 +- hazm/InformalNormalizer.py | 57 +++++++++--------- hazm/Lemmatizer.py | 4 +- hazm/Normalizer.py | 17 +++--- hazm/TNewsReader.py | 3 +- hazm/TreebankReader.py | 30 +++++----- hazm/WikiExtractor.py | 48 +++++++++------ hazm/WordTokenizer.py | 16 +++-- hazm/__init__.py | 49 ++++++++-------- setup.py | 40 ++++++------- tests.py | 116 +++++++++++++++++++------------------ 12 files changed, 207 insertions(+), 179 deletions(-) diff --git a/hazm/BijankhanReader.py b/hazm/BijankhanReader.py index 61a2285a..08ccf268 100644 --- a/hazm/BijankhanReader.py +++ b/hazm/BijankhanReader.py @@ -6,8 +6,9 @@ from __future__ import unicode_literals import codecs +import re -from .Normalizer import * +from .Normalizer import Normalizer from .PeykareReader import join_verb_parts default_pos_map = {'ADJ': 'ADJ', 'ADJ_CMPR': 'ADJ', diff --git a/hazm/HamshahriReader.py b/hazm/HamshahriReader.py index d9b39f15..74fcacf1 100644 --- a/hazm/HamshahriReader.py +++ b/hazm/HamshahriReader.py @@ -81,8 +81,7 @@ def docs(self): elm.childNodes) > 1 else '' doc['text'] = '' - for item in element.getElementsByTagName('TEXT')[ - 0].childNodes: + for item in element.getElementsByTagName('TEXT')[0].childNodes: if item.nodeType == 4: # CDATA doc['text'] += item.data diff --git a/hazm/InformalNormalizer.py b/hazm/InformalNormalizer.py index b84ca977..eae9d13a 100644 --- a/hazm/InformalNormalizer.py +++ b/hazm/InformalNormalizer.py @@ -5,11 +5,14 @@ from __future__ import print_function from __future__ import unicode_literals +import codecs +import re + from .Lemmatizer import Lemmatizer from .Normalizer import Normalizer -from .SentenceTokenizer import * +from .SentenceTokenizer import SentenceTokenizer from .Stemmer import Stemmer -from .WordTokenizer import * +from .WordTokenizer import WordTokenizer from .utils import informal_verbs, informal_words, NUMBERS @@ -23,34 +26,11 @@ def __init__(self, verb_file=informal_verbs, word_file=informal_words, self.stemmer = Stemmer() super(InformalNormalizer, self).__init__(**kargs) - def informal_to_formal_conjucation(i, f, flag): - iv = self.informal_conjugations(i) - fv = self.lemmatizer.conjugations(f) - res = {} - if flag: - for i, j in zip(iv, fv[48:]): - res[i] = j - if '‌' in i: - res[i.replace('‌', '')] = j - res[i.replace('‌', ' ')] = j - if i.endswith('ین'): - res[i[:-1] + 'د'] = j - else: - for i, j in zip(iv[8:], fv[56:]): - res[i] = j - if '‌' in i: - res[i.replace('‌', '')] = j - res[i.replace('‌', ' ')] = j - if i.endswith('ین'): - res[i[:-1] + 'د'] = j - - return res - with codecs.open(verb_file, encoding='utf8') as vf: 
self.iverb_map = {} for f, i, flag in map(lambda x: x.strip().split(' ', 2), vf): self.iverb_map.update( - informal_to_formal_conjucation(i, f, flag) + self.informal_to_formal_conjucation(i, f, flag) ) with codecs.open(word_file, encoding='utf8') as wf: @@ -68,6 +48,29 @@ def informal_to_formal_conjucation(i, f, flag): self.words.update(self.lemmatizer.verbs.keys()) self.words.update(self.lemmatizer.verbs.values()) + def informal_to_formal_conjucation(self, i, f, flag): + iv = self.informal_conjugations(i) + fv = self.lemmatizer.conjugations(f) + res = {} + if flag: + for i, j in zip(iv, fv[48:]): + res[i] = j + if '‌' in i: + res[i.replace('‌', '')] = j + res[i.replace('‌', ' ')] = j + if i.endswith('ین'): + res[i[:-1] + 'د'] = j + else: + for i, j in zip(iv[8:], fv[56:]): + res[i] = j + if '‌' in i: + res[i.replace('‌', '')] = j + res[i.replace('‌', ' ')] = j + if i.endswith('ین'): + res[i[:-1] + 'د'] = j + + return res + def split_token_words(self, token): def shekan(token): @@ -137,7 +140,7 @@ def normalized_word(self, word): elif word not in self.ilemmatizer.verbs and word.endswith( 'ون') and self.lemmatizer.lemmatize( - word[:-2] + 'ان') in self.ilemmatizer.words: + word[:-2] + 'ان') in self.ilemmatizer.words: options.append(word[:-2] + 'ان') elif self.seperation_flag: diff --git a/hazm/Lemmatizer.py b/hazm/Lemmatizer.py index a074dfb2..44ac3e9d 100644 --- a/hazm/Lemmatizer.py +++ b/hazm/Lemmatizer.py @@ -87,8 +87,8 @@ def conjugations(self, verb): ends = ['م', 'ی', '', 'یم', 'ید', 'ند'] if verb == '#هست': - return ['هست' + end for end in ends] + ['نیست' + end for end in - ends] + return ['هست' + end for end in ends] + \ + ['نیست' + end for end in ends] past_simples = [past + end for end in ends] past_imperfects = ['می‌' + item for item in past_simples] diff --git a/hazm/Normalizer.py b/hazm/Normalizer.py index fc08db0d..0530d30d 100644 --- a/hazm/Normalizer.py +++ b/hazm/Normalizer.py @@ -11,9 +11,6 @@ from .WordTokenizer import WordTokenizer from .utils import maketrans -compile_patterns = lambda patterns: [(re.compile(pattern), repl) for - pattern, repl in patterns] - class Normalizer(object): def __init__(self, remove_extra_spaces=True, persian_style=True, @@ -62,12 +59,12 @@ def __init__(self, remove_extra_spaces=True, persian_style=True, # SHADDA, SUKUN ) - self.character_refinement_patterns = compile_patterns( + self.character_refinement_patterns = self.compile_patterns( self.character_refinement_patterns) punc_after, punc_before = r'\.:!،؛؟»\]\)\}', r'«\[\(\{' if punctuation_spacing: - self.punctuation_spacing_patterns = compile_patterns([ + self.punctuation_spacing_patterns = self.compile_patterns([ ('" ([^\n"]+) "', r'"\1"'), # remove space before and after quotation (' ([' + punc_after + '])', r'\1'), # remove space before @@ -81,7 +78,7 @@ def __init__(self, remove_extra_spaces=True, persian_style=True, ]) if affix_spacing: - self.affix_spacing_patterns = compile_patterns([ + self.affix_spacing_patterns = self.compile_patterns([ (r'([^ ]ه) ی ', r'\1‌ی '), # fix ی space (r'(^| )(ن?می) ', r'\1\2‌'), # put zwnj after می, نمی ( @@ -200,8 +197,8 @@ def token_spacing(self, tokens): self.words[token_pair][0] > 0: joined = True - if t < len(tokens) - 1 and token + '_' + tokens[ - t + 1] in self.verbs: + if t < len(tokens) - 1 and token + '_' + tokens[t + 1] in \ + self.verbs: joined = False elif token in self.suffixes and result[-1] in self.words: @@ -214,3 +211,7 @@ def token_spacing(self, tokens): result.append(token) return result + + def compile_patterns(self, patterns): + 
return [(re.compile(pattern), repl) for + pattern, repl in patterns] diff --git a/hazm/TNewsReader.py b/hazm/TNewsReader.py index e47cfb1f..eca2e014 100644 --- a/hazm/TNewsReader.py +++ b/hazm/TNewsReader.py @@ -38,7 +38,8 @@ def get_text(element): content = open(os.path.join(root, name)).read() # fix xml formatting issue - content = re.sub(r'[  ]', '', content) + content = re.sub(r'[  ]', '', + content) content = content.replace('', '') + '' elements = minidom.parseString(content) diff --git a/hazm/TreebankReader.py b/hazm/TreebankReader.py index b2057dcd..9616d36a 100644 --- a/hazm/TreebankReader.py +++ b/hazm/TreebankReader.py @@ -117,7 +117,7 @@ def clitic_join(tree, clitic): if type(tree[-1]) == Tree: return clitic_join(tree[-1], clitic) else: - if (clitic[0][0][0] == 'ا'): + if clitic[0][0][0] == 'ا': clitic[0] = ('‌' + clitic[0][0], clitic[0][1]) tree[-1] = (tree[-1][0] + clitic[0][0], clitic[0][1]) tree.set_label('CLITICS') @@ -147,14 +147,15 @@ def clitic_join(tree, clitic): 0].label() == 'AUX' and tree[0][0][ 0] in self._tokenizer.before_verbs: tree[1][0] = ( - tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1]) + tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1]) tree.remove(tree[0]) if self._join_verb_parts and len(tree.leaves()) > 1 and \ tree.leaves()[-1][ 0] in self._tokenizer.after_verbs and \ tree.leaves()[-2][0] in self._tokenizer.verbe: tree[1][0] = ( - tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1]) + tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], + tree[1][0][1]) path = tree.leaf_treeposition(len(tree.leaves()) - 2) removingtree = tree while len(path) > 2: @@ -167,7 +168,8 @@ def clitic_join(tree, clitic): 0] in self._tokenizer.after_verbs and \ tree.leaves()[-2][0] in self._tokenizer.verbe: tree[1][0] = ( - tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1]) + tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], + tree[1][0][1]) path = tree.leaf_treeposition(len(tree.leaves()) - 2) removingtree = tree while len(path) > 2: @@ -185,10 +187,12 @@ def sents(self): for tree in self.trees(): yield tree.leaves() + def collapse(self, node, label): + return Tree(label, + [Tree(pos[1], [pos[0]]) for pos in + node.pos()]) + def chunked_trees(self): - collapse = lambda node, label: Tree(label, - [Tree(pos[1], [pos[0]]) for pos in - node.pos()]) def traverse(node, parent, chunks): label = node.label() @@ -235,17 +239,17 @@ def traverse(node, parent, chunks): return if label == 'NPA' and parent.label() in {'CPC', 'PPC'}: - chunks.append(collapse(node, 'NP')) + chunks.append(self.collapse(node, 'NP')) return if label == 'NPA' and len(node) >= 1: if node[0].label() == 'ADV': - chunks.append(collapse(node, 'NP')) + chunks.append(self.collapse(node, 'NP')) return if label in {'NPC', 'N', 'INFV', 'DPA', 'CLASS', 'DPC', 'DEM', 'INTJ', 'MN', 'PRON', 'DET', 'NUM', 'RES'}: - chunks.append(collapse(node, 'NP')) + chunks.append(self.collapse(node, 'NP')) return if label == 'NPA' and len(node) >= 2: @@ -261,7 +265,7 @@ def traverse(node, parent, chunks): 0].label() == 'NPA' and node[ 1].label() != 'NPC' or node[1].label() == 'NPA' and node[ 0].label() != 'NPC': - chunks.append(collapse(node, 'NP')) + chunks.append(self.collapse(node, 'NP')) return if label == 'DPC' and len(node) >= 2: @@ -270,12 +274,12 @@ def traverse(node, parent, chunks): if leaf[1] in {'PUNC', 'CONJ', 'PREP', 'PostP'}: chunkable = False if node[1].label() in {'N', 'NPA', 'NPC'} and chunkable: - chunks.append(collapse(node, 'NP')) + chunks.append(self.collapse(node, 'NP')) return if label == 'DPA' and 
len(node) >= 2: if node[1].label() == 'ADV': - chunks.append(collapse(node, 'ADVP')) + chunks.append(self.collapse(node, 'ADVP')) return if label in {'MV', 'V', 'AUX', 'PPARV'}: diff --git a/hazm/WikiExtractor.py b/hazm/WikiExtractor.py index 0552460d..a7a0326e 100755 --- a/hazm/WikiExtractor.py +++ b/hazm/WikiExtractor.py @@ -27,10 +27,10 @@ # under the terms of the GNU General Public License, version 3, # as published by the Free Software Foundation. # -# Tanl is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License at for more details. +# Tanl is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License at +# for more details. # # ============================================================================= @@ -110,7 +110,8 @@ def __eq__(self, other): # Program version version = '2.75' -## PARAMS #################################################################### +# PARAMS +###################################################################### options = SimpleNamespace( @@ -174,7 +175,7 @@ def __eq__(self, other): expand_templates=True, ## - ## Whether to escape doc content + # Whether to escape doc content escape_doc=False, ## @@ -514,7 +515,8 @@ def __str__(self): res = '' prev = self.prev while prev: - if res: res += ', ' + if res: + res += ', ' res += '(%s, %s)' % (prev.title, prev.args) prev = prev.prev return '' @@ -572,10 +574,10 @@ def write_output(self, out, text): else: if options.print_revision: header = '\n' % ( - self.id, self.revid, url, self.title) + self.id, self.revid, url, self.title) else: header = '\n' % ( - self.id, url, self.title) + self.id, url, self.title) footer = "\n\n" if out == sys.stdout: # option -a or -o - header = header.encode('utf-8') @@ -1315,7 +1317,7 @@ def findBalanced(text, openDelim=['[['], closeDelim=[']]']): stack.append(delim) nextPat = afterPat[delim] else: - opening = stack.pop() + # opening = stack.pop() # assert opening == openDelim[closeDelim.index(next.group(0))] if stack: nextPat = afterPat[stack[-1]] @@ -1405,9 +1407,13 @@ def string_sub(args): s = params.get('s', '') i = int(params.get('i', 1) or 1) # or handles case of '' value j = int(params.get('j', -1) or -1) - if i > 0: i -= 1 # lua is 1-based - if j < 0: j += 1 - if j == 0: j = len(s) + if i > 0: + i -= 1 # lua is 1-based + if j < 0: + j += 1 + if j == 0: + j = len(s) + return s[i:j] @@ -1879,7 +1885,7 @@ def sharp_invoke(module, function, args): '#ifexpr': lambda *args: '', # not supported '#ifexist': lambda extr, title, ifex, ifnex: extr.expand(ifnex), -# assuming title is not present + # assuming title is not present '#rel2abs': lambda *args: '', # not supported @@ -1984,7 +1990,8 @@ def define_template(title, page): # title = normalizeTitle(title) # sanity check (empty template, e.g. 
Template:Crude Oil Prices)) - if not page: return + if not page: + return # check for redirects m = re.match('#REDIRECT.*?\[\[([^\]]*)]]', page[0], re.IGNORECASE) @@ -2658,7 +2665,8 @@ def compact(text): def handle_unicode(entity): numeric_code = int(entity[2:-1]) - if numeric_code >= 0x10000: return '' + if numeric_code >= 0x10000: + return '' return chr(numeric_code) @@ -2805,7 +2813,8 @@ def pages_from(input): redirect = False title = None for line in input: - if not isinstance(line, text_type): line = line.decode('utf-8') + if not isinstance(line, text_type): + line = line.decode('utf-8') if '<' not in line: # faster than doing re.search() if inText: page.append(line) @@ -2874,7 +2883,8 @@ def process_dump(input_file, template_file, out_file, file_size, file_compress, # collect siteinfo for line in input: # When an input file is .bz2 or .gz, line can be a bytes even in Python 3. - if not isinstance(line, text_type): line = line.decode('utf-8') + if not isinstance(line, text_type): + line = line.decode('utf-8') m = tagRE.search(line) if not m: continue @@ -3090,7 +3100,7 @@ def reduce_process(opts, output_queue, spool_length, # progress report if next_page % report_period == 0: interval_rate = report_period / ( - default_timer() - interval_start) + default_timer() - interval_start) logging.info("Extracted %d articles (%.1f art/s)", next_page, interval_rate) interval_start = default_timer() diff --git a/hazm/WordTokenizer.py b/hazm/WordTokenizer.py index 12a3d3e5..f18a41e0 100644 --- a/hazm/WordTokenizer.py +++ b/hazm/WordTokenizer.py @@ -29,7 +29,8 @@ def __init__(self, words_file=default_words, verbs_file=default_verbs, self._join_verb_parts = join_verb_parts self.pattern = re.compile(r'([؟!\?]+|[\d\.:]+|[:\.،؛»\]\)\}"«\[\(\{])') - self.words = {item[0]: (item[1], item[2]) for item in words_list(default_words)} + self.words = {item[0]: (item[1], item[2]) + for item in words_list(default_words)} if join_verb_parts: self.after_verbs = {'ام', 'ای', 'است', 'ایم', 'اید', 'اند', 'بودم', @@ -66,12 +67,16 @@ def __init__(self, words_file=default_words, verbs_file=default_verbs, 'نخواهیم', 'نخواهید', 'نخواهند'} with codecs.open(verbs_file, encoding='utf8') as verbs_file: - self.verbs = list(reversed([verb.strip() for verb in verbs_file if verb])) + self.verbs = list( + reversed([verb.strip() for verb in verbs_file if verb])) self.bons = set([verb.split('#')[0] for verb in self.verbs]) - self.verbe = set([bon + 'ه' for bon in self.bons] + ['ن' + bon + 'ه' for bon in self.bons]) + self.verbe = set( + [bon + 'ه' for bon in self.bons] + + ['ن' + bon + 'ه' for bon in self.bons]) def tokenize(self, text): - text = self.pattern.sub(r' \1 ', text.replace('\n', ' ').replace('\t', ' ')) + text = self.pattern.sub(r' \1 ', + text.replace('\n', ' ').replace('\t', ' ')) tokens = [word for word in text.split(' ') if word] if self._join_verb_parts: tokens = self.join_verb_parts(tokens) @@ -94,7 +99,8 @@ def join_verb_parts(self, tokens): result = [''] for token in reversed(tokens): - if token in self.before_verbs or (result[-1] in self.after_verbs and token in self.verbe): + if token in self.before_verbs or ( + result[-1] in self.after_verbs and token in self.verbe): result[-1] = token + '_' + result[-1] else: result.append(token) diff --git a/hazm/__init__.py b/hazm/__init__.py index 6090f240..ea189291 100644 --- a/hazm/__init__.py +++ b/hazm/__init__.py @@ -1,38 +1,35 @@ - -from .WordTokenizer import WordTokenizer -from .SentenceTokenizer import SentenceTokenizer -from .TokenSplitter import TokenSplitter 
-from .HamshahriReader import HamshahriReader -from .PersicaReader import PersicaReader from .BijankhanReader import BijankhanReader -from .PeykareReader import PeykareReader -from .VerbValencyReader import VerbValencyReader +from .Chunker import Chunker, RuleBasedChunker, tree2brackets from .DadeganReader import DadeganReader -from .TreebankReader import TreebankReader -from .WikipediaReader import WikipediaReader -from .SentiPersReader import SentiPersReader -from .QuranCorpusReader import QuranCorpusReader -from .TNewsReader import TNewsReader -from .Normalizer import Normalizer +from .DependencyParser import DependencyParser, MaltParser, TurboParser +from .HamshahriReader import HamshahriReader from .InformalNormalizer import InformalNormalizer, InformalLemmatizer -from .Stemmer import Stemmer from .Lemmatizer import Lemmatizer -from .SequenceTagger import SequenceTagger, IOBTagger +from .Normalizer import Normalizer from .POSTagger import POSTagger, StanfordPOSTagger -from .Chunker import Chunker, RuleBasedChunker, tree2brackets -from .DependencyParser import DependencyParser, MaltParser, TurboParser - - +from .PersicaReader import PersicaReader +from .PeykareReader import PeykareReader +from .QuranCorpusReader import QuranCorpusReader +from .SentenceTokenizer import SentenceTokenizer +from .SentiPersReader import SentiPersReader +from .SequenceTagger import SequenceTagger, IOBTagger +from .Stemmer import Stemmer +from .TNewsReader import TNewsReader +from .TokenSplitter import TokenSplitter +from .TreebankReader import TreebankReader +from .VerbValencyReader import VerbValencyReader +from .WikipediaReader import WikipediaReader +from .WordTokenizer import WordTokenizer from .utils import words_list, stopwords_list def sent_tokenize(text): - if not hasattr(sent_tokenize, 'tokenizer'): - sent_tokenize.tokenizer = SentenceTokenizer() - return sent_tokenize.tokenizer.tokenize(text) + if not hasattr(sent_tokenize, 'tokenizer'): + sent_tokenize.tokenizer = SentenceTokenizer() + return sent_tokenize.tokenizer.tokenize(text) def word_tokenize(sentence): - if not hasattr(word_tokenize, 'tokenizer'): - word_tokenize.tokenizer = WordTokenizer() - return word_tokenize.tokenizer.tokenize(sentence) + if not hasattr(word_tokenize, 'tokenizer'): + word_tokenize.tokenizer = WordTokenizer() + return word_tokenize.tokenizer.tokenize(sentence) diff --git a/setup.py b/setup.py index 154732af..3d4c14ff 100755 --- a/setup.py +++ b/setup.py @@ -1,24 +1,24 @@ - from setuptools import setup setup( - name='hazm', - version='0.5.3', - description='Python library for digesting Persian text.', - author='Alireza Nourian', - author_email='az.nourian@gmail.com', - url='http://www.sobhe.ir/hazm/', - packages=['hazm'], - package_data={'hazm': ['data/*.dat']}, - classifiers=[ - 'Topic :: Text Processing', - 'Natural Language :: Persian', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'License :: OSI Approved :: MIT License', - ], - install_requires=['nltk==3.2.2', 'libwapiti>=0.2.1;platform_system!="Windows"'], - extras_require={'wapiti': ['libwapiti>=0.2.1']}, + name='hazm', + version='0.5.3', + description='Python library for digesting Persian text.', + author='Alireza Nourian', + author_email='az.nourian@gmail.com', + url='http://www.sobhe.ir/hazm/', + packages=['hazm'], + package_data={'hazm': ['data/*.dat']}, + classifiers=[ + 'Topic :: Text Processing', + 'Natural Language :: Persian', + 
'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'License :: OSI Approved :: MIT License', + ], + install_requires=['nltk==3.2.2', + 'libwapiti>=0.2.1;platform_system!="Windows"'], + extras_require={'wapiti': ['libwapiti>=0.2.1']}, ) diff --git a/tests.py b/tests.py index bae3d9bf..b3b91c78 100644 --- a/tests.py +++ b/tests.py @@ -1,72 +1,78 @@ # coding: utf-8 from __future__ import unicode_literals -import sys, inspect, doctest, unittest + +import doctest +import inspect +import sys +import unittest + from hazm import * modules = { - 'persica': PersicaReader, - 'hamshahri': HamshahriReader, - 'bijankhan': BijankhanReader, - 'peykare': PeykareReader, - 'dadegan': DadeganReader, - 'valency': VerbValencyReader, - 'treebank': TreebankReader, - 'sentipers': SentiPersReader, - 'tnews': TNewsReader, - 'quran': QuranCorpusReader, - 'sentence_tokenizer': SentenceTokenizer, - 'word_tokenizer': WordTokenizer, - 'splitter': TokenSplitter, - 'normalizer': Normalizer, - 'stemmer': Stemmer, - 'lemmatizer': Lemmatizer, - 'tagger': SequenceTagger, - 'postagger': POSTagger, - 'chunker': Chunker, - 'parser': DependencyParser, - 'informal_normalizer': InformalNormalizer + 'persica': PersicaReader, + 'hamshahri': HamshahriReader, + 'bijankhan': BijankhanReader, + 'peykare': PeykareReader, + 'dadegan': DadeganReader, + 'valency': VerbValencyReader, + 'treebank': TreebankReader, + 'sentipers': SentiPersReader, + 'tnews': TNewsReader, + 'quran': QuranCorpusReader, + 'sentence_tokenizer': SentenceTokenizer, + 'word_tokenizer': WordTokenizer, + 'splitter': TokenSplitter, + 'normalizer': Normalizer, + 'stemmer': Stemmer, + 'lemmatizer': Lemmatizer, + 'tagger': SequenceTagger, + 'postagger': POSTagger, + 'chunker': Chunker, + 'parser': DependencyParser, + 'informal_normalizer': InformalNormalizer } class UnicodeOutputChecker(doctest.OutputChecker): - def check_output(self, want, got, optionflags): - try: - want, got = eval(want), eval(got) - except: - pass + def check_output(self, want, got, optionflags): + try: + want, got = eval(want), eval(got) + except: + pass - try: - got = got.decode('unicode-escape') - want = want.replace('آ', 'ا') # decode issue - except: - pass + try: + got = got.decode('unicode-escape') + want = want.replace('آ', 'ا') # decode issue + except: + pass - if type(want) == unicode: - want = want.replace('٫', '.') # eval issue + if type(want) == unicode: + want = want.replace('٫', '.') # eval issue - return want == got + return want == got if __name__ == '__main__': - # test all modules if no one specified - all_modules = len(sys.argv) < 2 - - suites = [] - checker = UnicodeOutputChecker() if utils.PY2 else None - for name, object in modules.items(): - if all_modules or name in sys.argv: - suites.append(doctest.DocTestSuite(inspect.getmodule(object), checker=checker)) - - if not utils.PY2 and all_modules: - suites.append(doctest.DocFileSuite('README.md')) - - failure = False - runner = unittest.TextTestRunner(verbosity=2) - for suite in suites: - if not runner.run(suite).wasSuccessful(): - failure = True - - if failure: - exit(1) + # test all modules if no one specified + all_modules = len(sys.argv) < 2 + + suites = [] + checker = UnicodeOutputChecker() if utils.PY2 else None + for name, object in modules.items(): + if all_modules or name in sys.argv: + suites.append(doctest.DocTestSuite(inspect.getmodule(object), + checker=checker)) + + if not utils.PY2 and all_modules: + 
        suites.append(doctest.DocFileSuite('README.md'))
+
+    failure = False
+    runner = unittest.TextTestRunner(verbosity=2)
+    for suite in suites:
+        if not runner.run(suite).wasSuccessful():
+            failure = True
+
+    if failure:
+        exit(1)
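
Since this series is intended as a pure whitespace / PEP8 cleanup, the quickest review is a behaviour check against master using only the public names that hazm/__init__.py re-exports after this change. The sketch below is a reviewer aid, not part of the patch; it assumes hazm is installed from this branch with its bundled data resources, and the Persian sample sentence is purely illustrative.

# coding: utf-8
# Reviewer smoke check (not part of the patch): the names imported here are
# the ones re-exported by hazm/__init__.py in this series, so running the
# same script on master and on this branch should print identical output.
from __future__ import print_function, unicode_literals

from hazm import Lemmatizer, Normalizer, Stemmer, sent_tokenize, word_tokenize

normalizer = Normalizer()
stemmer = Stemmer()
lemmatizer = Lemmatizer()

# Illustrative sample text; any Persian input exercises the same code paths.
text = 'اصلاح نویسه‌ها و استفاده از نیم‌فاصله پردازش را آسان می‌کند'

for sentence in sent_tokenize(normalizer.normalize(text)):
    tokens = word_tokenize(sentence)                  # verb parts joined by default
    print(tokens)
    print([stemmer.stem(token) for token in tokens])          # suffix stripping
    print([lemmatizer.lemmatize(token) for token in tokens])  # dictionary + verb lemmas

Capturing the output of this script on master and on this branch and diffing the two runs should come back empty if the reformatting is truly behaviour-neutral.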