Some pep8 issues resolved. #135

Closed · wants to merge 4 commits into from
527 changes: 300 additions & 227 deletions data.py

Large diffs are not rendered by default.

112 changes: 70 additions & 42 deletions hazm/BijankhanReader.py
@@ -1,47 +1,75 @@
 # coding: utf-8

 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 from __future__ import unicode_literals
-import re, codecs
-from .Normalizer import *
+
+import codecs
+import re
+
+from .Normalizer import Normalizer
 from .PeykareReader import join_verb_parts

-default_pos_map = {'ADJ': 'ADJ', 'ADJ_CMPR': 'ADJ', 'ADJ_INO': 'ADJ', 'ADJ_ORD': 'ADJ', 'ADJ_SIM': 'ADJ', 'ADJ_SUP': 'ADJ', 'ADV': 'ADV', 'ADV_EXM': 'ADV', 'ADV_I': 'ADV', 'ADV_NEGG': 'ADV', 'ADV_NI': 'ADV', 'ADV_TIME': 'ADV', 'AR': 'AR', 'CON': 'CONJ', 'DEFAULT': 'DEFAULT', 'DELM': 'PUNC', 'DET': 'PREP', 'IF': 'IF', 'INT': 'INT', 'MORP': 'MORP', 'MQUA': 'MQUA', 'MS': 'MS', 'N_PL': 'N', 'N_SING': 'N', 'NN': 'NN', 'NP': 'NP', 'OH': 'OH', 'OHH': 'OHH', 'P': 'PREP', 'PP': 'PP', 'PRO': 'PR', 'PS': 'PS', 'QUA': 'QUA', 'SPEC': 'SPEC', 'V_AUX': 'V', 'V_IMP': 'V', 'V_PA': 'V', 'V_PRE': 'V', 'V_PRS': 'V', 'V_SUB': 'V'}
-
-
-class BijankhanReader():
-	"""
-	interfaces [Bijankhan Corpus](http://ece.ut.ac.ir/dbrg/bijankhan/Corpus/BijanKhan_Corpus_Processed.zip) that you must download and extract it.
-
-	>>> bijankhan = BijankhanReader(bijankhan_file='corpora/bijankhan.txt')
-	>>> next(bijankhan.sents())
-	[('اولین', 'ADJ'), ('سیاره', 'N'), ('خارج', 'ADJ'), ('از', 'PREP'), ('منظومه', 'N'), ('شمسی', 'ADJ'), ('دیده_شد', 'V'), ('.', 'PUNC')]
-	"""
-
-	def __init__(self, bijankhan_file, joined_verb_parts=True, pos_map=default_pos_map):
-		self._bijankhan_file = bijankhan_file
-		self._joined_verb_parts = joined_verb_parts
-		self._pos_map = pos_map
-		self._normalizer = Normalizer(punctuation_spacing=False)
-
-	def _sentences(self):
-		sentence = []
-		for line in codecs.open(self._bijankhan_file, encoding='utf-8'):
-			parts = re.split(' +', line.strip())
-			if len(parts) == 2:
-				word, tag = parts
-				if word not in ('#', '*'):
-					word = self._normalizer.normalize(word)
-					sentence.append((word if word else '_', tag))
-				if tag == 'DELM' and word in ('#', '*', '.', '؟', '!') :
-					if len(sentence):
-						yield sentence
-						sentence = []
-
-	def sents(self):
-		map_poses = lambda item: (item[0], self._pos_map.get(item[1], item[1]))
-
-		for sentence in self._sentences():
-			if self._joined_verb_parts:
-				sentence = join_verb_parts(sentence)
-
-			yield list(map(map_poses, sentence))
+default_pos_map = {'ADJ': 'ADJ', 'ADJ_CMPR': 'ADJ',
+                   'ADJ_INO': 'ADJ', 'ADJ_ORD': 'ADJ',
+                   'ADJ_SIM': 'ADJ', 'ADJ_SUP': 'ADJ',
+                   'ADV': 'ADV', 'ADV_EXM': 'ADV',
+                   'ADV_I': 'ADV', 'ADV_NEGG': 'ADV',
+                   'ADV_NI': 'ADV', 'ADV_TIME': 'ADV',
+                   'AR': 'AR', 'CON': 'CONJ',
+                   'DEFAULT': 'DEFAULT', 'DELM': 'PUNC',
+                   'DET': 'PREP', 'IF': 'IF', 'INT': 'INT',
+                   'MORP': 'MORP', 'MQUA': 'MQUA',
+                   'MS': 'MS', 'N_PL': 'N', 'N_SING': 'N',
+                   'NN': 'NN', 'NP': 'NP', 'OH': 'OH',
+                   'OHH': 'OHH', 'P': 'PREP', 'PP': 'PP',
+                   'PRO': 'PR', 'PS': 'PS', 'QUA': 'QUA',
+                   'SPEC': 'SPEC', 'V_AUX': 'V',
+                   'V_IMP': 'V', 'V_PA': 'V',
+                   'V_PRE': 'V', 'V_PRS': 'V', 'V_SUB': 'V'}
+
+
+class BijankhanReader:
+    """
+    Interfaces the [Bijankhan Corpus](
+    http://ece.ut.ac.ir/dbrg/bijankhan/Corpus/BijanKhan_Corpus_Processed.zip
+    ), which you must download and extract yourself.
+
+    >>> bijankhan = BijankhanReader(bijankhan_file='corpora/bijankhan.txt')
+    >>> next(bijankhan.sents())
+    [('اولین', 'ADJ'), ('سیاره', 'N'), ('خارج', 'ADJ'), ('از', 'PREP'), ('منظومه', 'N'), ('شمسی', 'ADJ'), ('دیده_شد', 'V'), ('.', 'PUNC')]
+    """
+
+    def __init__(self, bijankhan_file, joined_verb_parts=True,
+                 pos_map=default_pos_map):
+        self._bijankhan_file = bijankhan_file
+        self._joined_verb_parts = joined_verb_parts
+        self._pos_map = pos_map
+        self._normalizer = Normalizer(punctuation_spacing=False)
+
+    def _sentences(self):
+        sentence = []
+        for line in codecs.open(self._bijankhan_file, encoding='utf-8'):
+            parts = re.split(' +', line.strip())
+            if len(parts) == 2:
+                word, tag = parts
+                if word not in ('#', '*'):
+                    word = self._normalizer.normalize(word)
+                    sentence.append((word if word else '_', tag))
+                if tag == 'DELM' and word in ('#', '*', '.', '؟', '!'):
+                    if len(sentence):
+                        yield sentence
+                        sentence = []
+
+    def sents(self):
+        map_poses = lambda item: (item[0], self._pos_map.get(item[1], item[1]))
+
+        for sentence in self._sentences():
+            if self._joined_verb_parts:
+                sentence = join_verb_parts(sentence)
+
+            yield list(map(map_poses, sentence))
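
Note: aside from optionally joining multi-token verbs via join_verb_parts, sents() only renames each fine-grained Bijankhan tag through pos_map; unknown tags pass through unchanged. A minimal standalone sketch of that mapping (not part of this PR; the three-entry pos_map below is an illustrative excerpt of default_pos_map):

# Sketch: the per-token tag mapping that BijankhanReader.sents() applies.
pos_map = {'ADJ_SIM': 'ADJ', 'N_SING': 'N', 'DELM': 'PUNC'}

def map_pos(item, pos_map=pos_map):
    # Unknown tags fall through unchanged, mirroring
    # self._pos_map.get(item[1], item[1]) in sents().
    word, tag = item
    return (word, pos_map.get(tag, tag))

sentence = [('اولین', 'ADJ_SIM'), ('سیاره', 'N_SING'), ('.', 'DELM')]
print([map_pos(item) for item in sentence])
# [('اولین', 'ADJ'), ('سیاره', 'N'), ('.', 'PUNC')]
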
81 changes: 46 additions & 35 deletions hazm/Chunker.py
@@ -1,58 +1,69 @@
 # coding: utf-8

 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 from __future__ import unicode_literals
-from nltk.chunk import ChunkParserI, RegexpParser, tree2conlltags, conlltags2tree
+
+from nltk.chunk import ChunkParserI
+from nltk.chunk import RegexpParser
+from nltk.chunk import conlltags2tree
+from nltk.chunk import tree2conlltags
+
 from .SequenceTagger import IOBTagger


 def tree2brackets(tree):
-	str, tag = '', ''
-	for item in tree2conlltags(tree):
-		if item[2][0] in {'B', 'O'} and tag:
-			str += tag +'] '
-			tag = ''
+    str, tag = '', ''
+    for item in tree2conlltags(tree):
+        if item[2][0] in {'B', 'O'} and tag:
+            str += tag + '] '
+            tag = ''

-		if item[2][0] == 'B':
-			tag = item[2].split('-')[1]
-			str += '['
-		str += item[0] +' '
+        if item[2][0] == 'B':
+            tag = item[2].split('-')[1]
+            str += '['
+        str += item[0] + ' '

-	if tag:
-		str += tag +'] '
+    if tag:
+        str += tag + '] '

-	return str.strip()
+    return str.strip()


class Chunker(IOBTagger, ChunkParserI):
"""
>>> chunker = Chunker(model='resources/chunker.model')
>>> tree2brackets(chunker.parse([('نامه', 'Ne'), ('ایشان', 'PRO'), ('را', 'POSTP'), ('دریافت', 'N'), ('داشتم', 'V'), ('.', 'PUNC')]))
'[نامه ایشان NP] [را POSTP] [دریافت داشتم VP] .'
"""
"""
>>> chunker = Chunker(model='resources/chunker.model')
>>> tree2brackets(chunker.parse([('نامه', 'Ne'), ('ایشان', 'PRO'),
('را', 'POSTP'), ('دریافت', 'N'), ('داشتم', 'V'), ('.', 'PUNC')]))
'[نامه ایشان NP] [را POSTP] [دریافت داشتم VP] .'
"""

def train(self, trees):
super(Chunker, self).train(map(tree2conlltags, trees))
def train(self, trees):
super(Chunker, self).train(map(tree2conlltags, trees))

def parse(self, sentence):
return next(self.parse_sents([sentence]))
def parse(self, sentence):
return next(self.parse_sents([sentence]))

def parse_sents(self, sentences):
for conlltagged in super(Chunker, self).tag_sents(sentences):
yield conlltags2tree(conlltagged)
def parse_sents(self, sentences):
for conlltagged in super(Chunker, self).tag_sents(sentences):
yield conlltags2tree(conlltagged)

def evaluate(self, gold):
return ChunkParserI.evaluate(self, gold)
def evaluate(self, gold):
return ChunkParserI.evaluate(self, gold)


class RuleBasedChunker(RegexpParser):
"""
>>> chunker = RuleBasedChunker()
>>> tree2brackets(chunker.parse([('نامه', 'Ne'), ('۱۰', 'NUMe'), ('فوریه', 'Ne'), ('شما', 'PRO'), ('را', 'POSTP'), ('دریافت', 'N'), ('داشتم', 'V'), ('.', 'PUNC')]))
'[نامه ۱۰ فوریه شما NP] [را POSTP] [دریافت داشتم VP] .'
"""
"""
>>> chunker = RuleBasedChunker()
>>> tree2brackets(chunker.parse([('نامه', 'Ne'), ('۱۰', 'NUMe'),
('فوریه', 'Ne'), ('شما', 'PRO'), ('را', 'POSTP'), ('دریافت', 'N'),
('داشتم', 'V'), ('.', 'PUNC')]))
'[نامه ۱۰ فوریه شما NP] [را POSTP] [دریافت داشتم VP] .'
"""

def __init__(self):
grammar = r"""
def __init__(self):
grammar = r"""

             NP:
                 <P>{<N>}<V>
@@ -82,4 +93,4 @@ def __init__(self):

"""

-		super(RuleBasedChunker, self).__init__(grammar=grammar)
+        super(RuleBasedChunker, self).__init__(grammar=grammar)
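
Note: tree2brackets flattens any nltk chunk tree, so it can be exercised without a trained model by building a tree from (word, POS, IOB) triples. A minimal usage sketch (not part of this PR), assuming nltk is installed and hazm.Chunker is importable:

# Build a chunk tree from CoNLL-style IOB triples, then flatten it
# to the bracketed string format used in the docstrings above.
from nltk.chunk import conlltags2tree

from hazm.Chunker import tree2brackets

conll = [('نامه', 'Ne', 'B-NP'), ('ایشان', 'PRO', 'I-NP'),
         ('را', 'POSTP', 'B-POSTP'), ('دریافت', 'N', 'B-VP'),
         ('داشتم', 'V', 'I-VP'), ('.', 'PUNC', 'O')]
tree = conlltags2tree(conll)
print(tree2brackets(tree))
# Expected: [نامه ایشان NP] [را POSTP] [دریافت داشتم VP] .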