Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Degarbayan interface #176

Merged
merged 4 commits into from
Feb 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions corpora/degarbayan/CorpusPair.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
<?xml version="1.0"?>
<PairCorpus>
<Pair>
<PairId>
10001
</PairId>
<NewsSource1>
asrirannews
</NewsSource1>
<NewsSource2>
aftabnewsnews
</NewsSource2>
<NewsId1>
458034
</NewsId1>
<NewsId2>
356246
</NewsId2>
<Sentence1>
24 نفر نهایی تیم ملی بدون تغییری خاص معرفی شد
</Sentence1>
<Sentence2>
کی روش 24 بازیکن را به تیم ملی فوتبال دعوت کرد
</Sentence2>
<MethodType>
title
</MethodType>
<judge>
1
</judge>
</Pair>
<Pair>
<PairId>
10003
</PairId>
<NewsSource1>
asrirannews
</NewsSource1>
<NewsSource2>
aftabnewsnews
</NewsSource2>
<NewsId1>
458038
</NewsId1>
<NewsId2>
356247
</NewsId2>
<Sentence1>
سقوط هواپیمای مسافربری در جنوب روسیه / مرگ همه 61 سرنشین (+فیلم)
</Sentence1>
<Sentence2>
سقوط هواپیمای مسافربری در روسیه با 61 کشته
</Sentence2>
<MethodType>
title
</MethodType>
<judge>
0
</judge>
</Pair>
<Pair>
<PairId>
10018
</PairId>
<NewsSource1>
asrirannews
</NewsSource1>
<NewsSource2>
iribnews
</NewsSource2>
<NewsId1>
458069
</NewsId1>
<NewsId2>
1071139
</NewsId2>
<Sentence1>
پیروزی والیبال نشسته بانوان ایران مقابل کانادا
</Sentence1>
<Sentence2>
نخستین پیروزی بانوان ایران رقم خورد
</Sentence2>
<MethodType>
title
</MethodType>
<judge>
-1
</judge>
</Pair>
</PairCorpus>
70 changes: 70 additions & 0 deletions hazm/DegarbayanReader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# coding: utf-8

from __future__ import unicode_literals, print_function
import os
import sys
from xml.dom import minidom

class DegarbayanReader():
    """
    Interface to the [Degarbayan Corpus](https://www.peykaregan.ir/dataset/%D9%BE%DB%8C%DA%A9%D8%B1%D9%87-%D8%AF%DA%AF%D8%B1%D8%A8%DB%8C%D8%A7%D9%86),
    a corpus of Persian sentence pairs annotated with a paraphrase judgement.

    Reference: Maanijou, R., and Mirroshandel, S. A. (1396 SH). Degarbayan:
    developing a Persian text corpus of equivalent sentences and phrases with
    the help of crowdsourcing. Computing Science and Information Technology,
    15 (1), 22-30.

    >>> degarbayan = DegarbayanReader(root='corpora/degarbayan')
    >>> next(degarbayan.pairs())
    ('24 نفر نهایی تیم ملی بدون تغییری خاص معرفی شد', 'کی روش 24 بازیکن را به تیم ملی فوتبال دعوت کرد', 'Paraphrase')
    """

    def __init__(self, root, corpus_file='CorpusPair.xml', judge_type='three_class', version=1.0):
        """
        :param root: Path to corpus folder.
        :param corpus_file: Name of corpus pair file. Defaults to 'CorpusPair.xml'
        :param judge_type: How to return judge value. Can be either 'two_class' or
            'three_class' for two or three class value returns. Any other value
            falls back to 'three_class'. Defaults to 'three_class'
        :param version: Corpus version. Defaults to version 1.0. Accepted for
            interface compatibility; it is not used by the reader.
        :type root: str
        :type corpus_file: str
        :type judge_type: str
        :type version: float
        """
        self._root = root
        self._corpus_file = corpus_file
        # Unknown judge types are not rejected; they silently select the
        # three-class labelling.
        if judge_type in ('three_class', 'two_class'):
            self._judge_type = judge_type
        else:
            self._judge_type = 'three_class'

    def _judge_number_to_text(self, judge):
        """Map the raw ``<judge>`` text to a label.

        '1' is always 'Paraphrase'. '0' is 'SemiParaphrase' in three-class
        mode and is merged into 'Paraphrase' in two-class mode. Every other
        value (e.g. '-1') is 'NotParaphrase'.
        """
        if judge == '1':
            return 'Paraphrase'
        if judge == '0':
            return 'Paraphrase' if self._judge_type == 'two_class' else 'SemiParaphrase'
        return 'NotParaphrase'

    @staticmethod
    def _field_text(pair_element, tag):
        """Return the stripped text of the first ``tag`` element under ``pair_element``.

        Raises IndexError when the tag is missing or has no child node.
        """
        return pair_element.getElementsByTagName(tag)[0].childNodes[0].data.strip()

    def docs(self):
        """Yield one dict per ``<Pair>`` element of the corpus file.

        Each dict has the keys 'id', 'news_source1', 'news_source2',
        'news_id1', 'news_id2', 'sentence1', 'sentence2', 'method_type' and
        'judge'; all values are stripped strings, and 'judge' is already
        converted to its text label.

        This is a generator, so nothing happens until the first item is
        requested.

        :raises FileNotFoundError: if the corpus file does not exist.
        """
        filename = os.path.join(self._root, self._corpus_file)
        if not os.path.exists(filename):
            # There is no caught exception in this branch, so only the path
            # can be reported before raising.
            print('error in reading file', filename, file=sys.stderr)
            raise FileNotFoundError('error in reading file', filename)

        try:
            document = minidom.parse(filename)
            for element in document.getElementsByTagName('Pair'):
                yield {
                    'id': self._field_text(element, 'PairId'),
                    'news_source1': self._field_text(element, 'NewsSource1'),
                    'news_source2': self._field_text(element, 'NewsSource2'),
                    'news_id1': self._field_text(element, 'NewsId1'),
                    'news_id2': self._field_text(element, 'NewsId2'),
                    'sentence1': self._field_text(element, 'Sentence1'),
                    'sentence2': self._field_text(element, 'Sentence2'),
                    'method_type': self._field_text(element, 'MethodType'),
                    'judge': self._judge_number_to_text(self._field_text(element, 'judge')),
                }
        except Exception as e:
            # Best effort: a malformed file or pair is reported on stderr and
            # ends the iteration instead of propagating to the caller.
            print('error in reading', filename, e, file=sys.stderr)

    def pairs(self):
        """Yield ``(sentence1, sentence2, judge)`` tuples, one per corpus pair."""
        for pair in self.docs():
            yield pair['sentence1'], pair['sentence2'], pair['judge']
1 change: 1 addition & 0 deletions hazm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .TreebankReader import TreebankReader
from .WikipediaReader import WikipediaReader
from .SentiPersReader import SentiPersReader
from .DegarbayanReader import DegarbayanReader
from .QuranCorpusReader import QuranCorpusReader
from .TNewsReader import TNewsReader
from .Normalizer import Normalizer
Expand Down
1 change: 1 addition & 0 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
'valency': VerbValencyReader,
'treebank': TreebankReader,
'sentipers': SentiPersReader,
'degarbayan': DegarbayanReader,
'tnews': TNewsReader,
'quran': QuranCorpusReader,
'sentence_tokenizer': SentenceTokenizer,
Expand Down