Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

kowikitext korpus 추가 #101

Merged
merged 5 commits into from
Oct 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Korpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from .korpus_korean_petitions import KoreanPetitionsKorpus
from .korpus_kornli import KorNLIKorpus
from .korpus_korsts import KorSTSKorpus
from .korpus_kowiki import KowikiTextKorpus
from .korpus_namuwiki import NamuwikiTextKorpus
from .korpus_naverchangwon_ner import NaverChangwonNERKorpus
from .korpus_nsmc import NSMCKorpus
Expand Down
91 changes: 91 additions & 0 deletions Korpora/korpus_kowiki.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import os
from .korpora import Korpus, SentencePairKorpusData
from .utils import fetch, default_korpora_path, load_wikitext


# Release that hosts the kowikitext archives (one zip per split).
_KOWIKI_RELEASE_URL = 'https://github.com/lovit/kowikitext/releases/download/kowikitext.20200920.v2'

# One fetch entry per split, in the order train, test, dev. Each entry holds the
# remote archive url, the archive destination relative to the korpora root
# directory, and the fetch method.
KOWIKI_FETCH_INFORMATION = [
    {
        'url': f'{_KOWIKI_RELEASE_URL}/kowikitext_20200920.{split}.zip',
        'destination': f'kowiki/kowikitext_20200920.{split}.zip',
        'method': 'download & unzip',
    }
    for split in ('train', 'test', 'dev')
]

# Human-readable corpus description; KowikiTextKorpus passes it to
# Korpus.__init__. The Korean body says: a wikitext-format text file built from
# the Korean Wikipedia dump, split per wiki page into train (99%), dev (0.5%)
# and test (0.5%) for training and evaluation.
# NOTE(review): "바탕을 제작한" looks like a typo for "바탕으로 제작한" — confirm
# with the author before changing this user-facing string.
description = """ Author : Hyunjoong Kim lovit@github
Repository : https://github.com/lovit/kowikitext
References :

한국어 위키피디아의 덤프 데이터를 바탕을 제작한 wikitext 형식의 텍스트 파일입니다.
학습 및 평가를 위하여 위키페이지 별로 train (99%), dev (0.5%), test (0.5%) 로 나뉘어져있습니다.
"""

# License notice; KowikiTextKorpus passes it to Korpus.__init__ together with
# `description`.
# NOTE(review): this module-level name shadows the `license` helper that the
# interactive interpreter provides; it is kept because the class refers to it
# by this name.
license = " CC-BY-SA 3.0 which kowiki dump dataset is licensed"


class KowikiTextKorpus(Korpus):
    """Korpus for kowikitext, a wikitext-format corpus of Korean Wikipedia.

    Constructing an instance fetches every archive listed in
    ``KOWIKI_FETCH_INFORMATION`` and then loads the ``dev`` and ``test``
    splits. The ``train`` split is loaded only after an interactive
    confirmation on stdin.

    Attributes set by ``__init__``:
        train: ``SentencePairKorpusData`` when the user answers ``y``/``yes``
            or an integer; otherwise a message string naming the local
            directory that holds the downloaded files.
        dev: ``SentencePairKorpusData`` built from the dev split.
        test: ``SentencePairKorpusData`` built from the test split.

    Args:
        root_dir: Directory under which the corpus is stored. Falls back to
            ``default_korpora_path`` when ``None``.
        force_download: Forwarded unchanged to ``fetch_kowikitext``.

    Raises:
        ValueError: If a fetch entry is neither a train, dev nor test archive.
    """

    def __init__(self, root_dir=None, force_download=False):
        super().__init__(description, license)

        if root_dir is None:
            root_dir = default_korpora_path
        fetch_kowikitext(root_dir, force_download)

        base_dir = os.path.abspath(root_dir)
        for information in KOWIKI_FETCH_INFORMATION:
            destination = information['destination']
            # The text is read from the archive path without its '.zip' suffix.
            local_path = os.path.join(base_dir, os.path.splitext(destination)[0])

            if 'train' in destination:
                # Strip surrounding whitespace so that answers such as 'yes '
                # are not silently treated as a refusal.
                response = input(
                    'kowikiText.train text file is large (1.6G).\n'
                    'If you want to load text in your memory, please insert `yes`\n'
                    'If the `INPUT` is integer, it loads only first `INPUT` sentences\n'
                ).strip().lower()
                if response in ('y', 'yes'):
                    texts, titles = self.load(local_path)
                    self.train = SentencePairKorpusData('kowikiText.train', texts, titles)
                elif response.isdigit():
                    texts, titles = self.load(local_path, num_lines=int(response))
                    self.train = SentencePairKorpusData('kowikiText.train', texts, titles)
                else:
                    dirname = os.path.join(base_dir, 'kowiki')
                    self.train = f'kowikitext corpus is downloaded. Open local directory {dirname}'
                    print('Continue to load `dev` and `test`')
                # Whatever the answer, the train entry is finished here; it
                # must not fall through to the dev/test handling below.
                continue

            texts, titles = self.load(local_path)
            if 'dev' in destination:
                self.dev = SentencePairKorpusData('kowikiText.dev', texts, titles)
            elif 'test' in destination:
                self.test = SentencePairKorpusData('kowikiText.test', texts, titles)
            else:
                raise ValueError('Check local files')

    def load(self, path, num_lines=-1):
        """Load a wikitext file and split every article into text and title.

        Args:
            path: Path of the wikitext file, forwarded to ``load_wikitext``.
            num_lines: Forwarded unchanged to ``load_wikitext``.

        Returns:
            A ``(texts, titles)`` pair of equally long tuples. Both tuples
            are empty when ``load_wikitext`` yields no articles.
        """
        def split_title_text(wikitext):
            # The first line is the title. The second line is skipped —
            # presumably the blank separator after the title; confirm against
            # the kowikitext format. Remaining lines are stripped and blank
            # ones dropped.
            lines = wikitext.split('\n')
            stripped = (line.strip() for line in lines[2:])
            return lines[0], '\n'.join(line for line in stripped if line)

        pairs = [split_title_text(wikitext) for wikitext in load_wikitext(path, num_lines)]
        if not pairs:
            # zip(*[]) yields nothing, so the two-name unpacking below would
            # raise ValueError; return empty results instead.
            return (), ()
        titles, texts = zip(*pairs)
        # Texts come first in the result even though titles are parsed first.
        return texts, titles


def fetch_kowikitext(root_dir, force_download):
    """Fetch every kowikitext archive listed in ``KOWIKI_FETCH_INFORMATION``.

    Args:
        root_dir: Directory under which each entry's relative ``destination``
            is resolved.
        force_download: Forwarded unchanged to ``fetch``.
    """
    for entry in KOWIKI_FETCH_INFORMATION:
        source_url = entry['url']
        archive_path = os.path.join(os.path.abspath(root_dir), entry['destination'])
        fetch(source_url, archive_path, 'kowikitext', force_download, entry['method'])
4 changes: 4 additions & 0 deletions Korpora/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .korpus_korean_petitions import KoreanPetitionsKorpus, fetch_korean_petitions
from .korpus_kornli import KorNLIKorpus, fetch_kornli
from .korpus_korsts import KorSTSKorpus, fetch_korsts
from .korpus_kowiki import KowikiTextKorpus, fetch_kowikitext
from .korpus_namuwiki import NamuwikiTextKorpus, fetch_namuwikitext
from .korpus_naverchangwon_ner import NaverChangwonNERKorpus, fetch_naverchangwon_ner
from .korpus_nsmc import NSMCKorpus, fetch_nsmc
Expand Down Expand Up @@ -62,6 +63,7 @@ def corpus_list(cls):
'korean_petitions': KoreanPetitionsKorpus,
'kornli': KorNLIKorpus,
'korsts': KorSTSKorpus,
'kowikitext': KowikiTextKorpus,
'namuwikitext': NamuwikiTextKorpus,
'naver_changwon_ner': NaverChangwonNERKorpus,
'nsmc': NSMCKorpus,
Expand All @@ -75,6 +77,7 @@ def corpus_list(cls):
'korean_petitions': "lovit@github 님이 만드신 2017.08 ~ 2019.03 청와대 청원데이터",
'kornli': "KakaoBrain 에서 제공하는 Natural Language Inference (NLI) 데이터",
'korsts': "KakaoBrain 에서 제공하는 Semantic Textual Similarity (STS) 데이터",
'kowikitext': "lovit@github 님이 만드신 wikitext 형식의 한국어 위키피디아 데이터",
'namuwikitext': "lovit@github 님이 만드신 wikitext 형식의 나무위키 데이터",
'naver_changwon_ner': "네이버 + 창원대 NER shared task data",
'nsmc': "e9t@github 님이 만드신 Naver sentiment movie corpus v1.0",
Expand All @@ -88,6 +91,7 @@ def corpus_list(cls):
'korean_petitions': fetch_korean_petitions,
'kornli': fetch_kornli,
'korsts': fetch_korsts,
'kowikitext': fetch_kowikitext,
'namuwikitext': fetch_namuwikitext,
'naver_changwon_ner': fetch_naverchangwon_ner,
'nsmc': fetch_nsmc,
Expand Down
35 changes: 35 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ pip install Korpora
|korean_petitions|청와대 국민 청원|https://github.com/lovit/petitions_archive|
|kornli|Korean NLI|https://github.com/kakaobrain/KorNLUDatasets|
|korsts|Korean STS|https://github.com/kakaobrain/KorNLUDatasets|
|kowikitext|한국어 위키피디아 텍스트|https://github.com/lovit/kowikitext|
|namuwikitext|나무위키 텍스트|https://github.com/lovit/namuwikitext|
|naver_changwon_ner|네이버 x 창원대 개체명 인식 데이터셋|https://github.com/naver/nlp-challenge/tree/master/missions/ner|
|nsmc|NAVER Sentiment Movie Corpus|https://github.com/e9t/nsmc|
Expand All @@ -56,6 +57,7 @@ Korpora.corpus_list()
'korean_petitions': 'lovit@github 님이 만드신 2017.08 ~ 2019.03 청와대 청원데이터',
'kornli': 'KakaoBrain 에서 제공하는 Natural Language Inference (NLI) 데이터',
'korsts': 'KakaoBrain 에서 제공하는 Semantic Textual Similarity (STS) 데이터',
'kowikitext': 'lovit@github 님이 만드신 wikitext 형식의 한국어 위키피디아 데이터',
'namuwikitext': 'lovit@github 님이 만드신 wikitext 형식의 나무위키 데이터',
'naver_changwon_ner': '네이버 + 창원대 NER shared task data',
'nsmc': 'e9t@github 님이 만드신 Naver sentiment movie corpus v1.0',
Expand Down Expand Up @@ -288,6 +290,39 @@ korsts.test[0]
|기타|데이터 관련 추가 정보|


### kowikitext
- author: lovit@github
- repository: https://github.com/lovit/kowikitext
- size:
  - train : 26794425 lines (877754 articles, 1.7G)
  - dev : 130419 lines (4433 articles, 7.7M)
  - test : 134340 lines (4434 articles, 8.4M)
- example
```python
from Korpora import Korpora, KowikiTextKorpus

kowiki = KowikiTextKorpus() # or
kowiki = Korpora.load('kowikitext')

kowiki.train[0]
# SentencePair(text='외교부장\n외교부장', pair=' = 분류:중화인민공화국의 외교부장 =')
kowiki.train[0].text
# '외교부장\n외교부장'
kowiki.train[0].pair
# ' = 분류:중화인민공화국의 외교부장 ='
kowiki.dev[0]
# SentencePair(text='스폴리아텔레(, )는 이탈리아의 후식으로서 ...', pair=' = 스폴리아텔레 =')
kowiki.test[0]
# SentencePair(text='기타', pair=' = 분류:러시아의 기타 연주자 =')
```
- data structure

|속성명|내용|
|---|---|
|text|섹션 본문|
|pair|섹션 타이틀|


### 나무위키텍스트
- author: lovit@github
- repository: https://github.com/lovit/namuwikitext
Expand Down