From 47c80b9d995157cc0c5b14d71c9c4424ccebe895 Mon Sep 17 00:00:00 2001 From: lovit Date: Sun, 4 Oct 2020 04:37:11 +0900 Subject: [PATCH 1/5] Implement kowikitext korpus and loader (#96) --- Korpora/__init__.py | 1 + Korpora/korpus_kowiki.py | 91 ++++++++++++++++++++++++++++++++++++++++ Korpora/loader.py | 4 ++ 3 files changed, 96 insertions(+) create mode 100644 Korpora/korpus_kowiki.py diff --git a/Korpora/__init__.py b/Korpora/__init__.py index 7e1298b..d57fd48 100644 --- a/Korpora/__init__.py +++ b/Korpora/__init__.py @@ -12,6 +12,7 @@ from .korpus_korean_petitions import KoreanPetitionsKorpus from .korpus_kornli import KorNLIKorpus from .korpus_korsts import KorSTSKorpus +from .korpus_kowiki import KowikiTextKorpus from .korpus_namuwiki import NamuwikiTextKorpus from .korpus_naverchangwon_ner import NaverChangwonNERKorpus from .korpus_nsmc import NSMCKorpus diff --git a/Korpora/korpus_kowiki.py b/Korpora/korpus_kowiki.py new file mode 100644 index 0000000..f1634f6 --- /dev/null +++ b/Korpora/korpus_kowiki.py @@ -0,0 +1,91 @@ +import os +from .korpora import Korpus, SentencePairKorpusData +from .utils import fetch, default_korpora_path, load_wikitext + + +KOWIKI_FETCH_INFORMATION = [ + { + 'url': 'https://github.com/lovit/kowikitext/releases/download/20200920.v1/kowikitext_20200920.train.zip', + 'destination': 'kowiki/kowikitext_20200920.train.zip', + 'method': 'download & unzip' + }, + { + 'url': 'https://github.com/lovit/kowikitext/releases/download/20200920.v1/kowikitext_20200920.test.zip', + 'destination': 'kowiki/kowikitext_20200920.test.zip', + 'method': 'download & unzip' + }, + { + 'url': 'https://github.com/lovit/kowikitext/releases/download/20200920.v1/kowikitext_20200920.dev.zip', + 'destination': 'kowiki/kowikitext_20200920.dev.zip', + 'method': 'download & unzip' + } +] + +description = """ Author : Hyunjoong Kim lovit@github + Repository : https://github.com/lovit/kowikitext + References : + + 한국어 위키피디아의 덤프 데이터를 바탕을 제작한 wikitext 형식의 텍스트 파일입니다. + 학습 및 평가를 위하여 위키페이지 별로 train (99%), dev (0.5%), test (0.5%) 로 나뉘어져있습니다. +""" + +license = " CC-BY-SA 3.0 which kowiki dump dataset is licensed" + + +class KowikiTextKorpus(Korpus): + def __init__(self, root_dir=None, force_download=False): + super().__init__(description, license) + + if root_dir is None: + root_dir = default_korpora_path + fetch_kowikitext(root_dir, force_download) + + for information in KOWIKI_FETCH_INFORMATION: + destination = information['destination'] + local_path = os.path.join(os.path.abspath(root_dir), destination[:-4]) + + if 'train' in destination: + response = input( + 'kowikiText.train text file is large (1.6G).\n' + 'If you want to load text in your memory, please insert `yes`\n' + 'If the `INPUT` is integer, it loads only first `INPUT` sentences\n').lower() + if (len(response) == 1 and response == 'y') or (response == 'yes'): + texts, titles = self.load(local_path) + self.train = SentencePairKorpusData('kowikiText.train', texts, titles) + elif response.isdigit(): + texts, titles = self.load(local_path, num_lines=int(response)) + self.train = SentencePairKorpusData('kowikiText.train', texts, titles) + else: + dirname = os.path.abspath(f'{root_dir}/kowiki') + self.train = f'kowikitext corpus is downloaded. Open local directory {dirname}' + print('Continue to load `dev` and `test`') + continue + + texts, titles = self.load(local_path) + if 'dev' in destination: + self.dev = SentencePairKorpusData('kowikiText.dev', texts, titles) + elif 'test' in destination: + self.test = SentencePairKorpusData('kowikiText.test', texts, titles) + else: + raise ValueError(f'Check local files') + + def load(self, path, num_lines=-1): + def split_title_text(wikitext): + lines = wikitext.split('\n') + title = lines[0] + text = '\n'.join([line.strip() for line in lines[2:] if line.strip()]) + return title, text + + wikitexts = load_wikitext(path, num_lines) + wikitexts = [split_title_text(wikitext) for wikitext in wikitexts] + titles, texts = zip(*wikitexts) + # swap position + return texts, titles + + +def fetch_kowikitext(root_dir, force_download): + for information in KOWIKI_FETCH_INFORMATION: + url = information['url'] + destination = information['destination'] + local_path = os.path.join(os.path.abspath(root_dir), destination) + fetch(url, local_path, 'kowikitext', force_download, information['method']) diff --git a/Korpora/loader.py b/Korpora/loader.py index 93f796d..f514a3c 100644 --- a/Korpora/loader.py +++ b/Korpora/loader.py @@ -4,6 +4,7 @@ from .korpus_korean_petitions import KoreanPetitionsKorpus, fetch_korean_petitions from .korpus_kornli import KorNLIKorpus, fetch_kornli from .korpus_korsts import KorSTSKorpus, fetch_korsts +from .korpus_kowiki import KowikiTextKorpus, fetch_kowikitext from .korpus_namuwiki import NamuwikiTextKorpus, fetch_namuwikitext from .korpus_naverchangwon_ner import NaverChangwonNERKorpus, fetch_naverchangwon_ner from .korpus_nsmc import NSMCKorpus, fetch_nsmc @@ -62,6 +63,7 @@ def corpus_list(cls): 'korean_petitions': KoreanPetitionsKorpus, 'kornli': KorNLIKorpus, 'korsts': KorSTSKorpus, + 'kowikitext': KowikiTextKorpus, 'namuwikitext': NamuwikiTextKorpus, 'naver_changwon_ner': NaverChangwonNERKorpus, 'nsmc': NSMCKorpus, @@ -75,6 +77,7 @@ def corpus_list(cls): 'korean_petitions': "lovit@github 님이 만드신 2017.08 ~ 2019.03 청와대 청원데이터", 'kornli': "KakaoBrain 에서 제공하는 Natural Language Inference (NLI) 데이터", 'korsts': "KakaoBrain 에서 제공하는 Semantic Textual Similarity (STS) 데이터", + 'kowikitext': "lovit@github 님이 만드신 wikitext 형식의 한국어 위키피디아 데이터", 'namuwikitext': "lovit@github 님이 만드신 wikitext 형식의 나무위키 데이터", 'naver_changwon_ner': "네이버 + 창원대 NER shared task data", 'nsmc': "e9t@github 님이 만드신 Naver sentiment movie corpus v1.0", @@ -88,6 +91,7 @@ def corpus_list(cls): 'korean_petitions': fetch_korean_petitions, 'kornli': fetch_kornli, 'korsts': fetch_korsts, + 'kowikitext': fetch_kowikitext, 'namuwikitext': fetch_namuwikitext, 'naver_changwon_ner': fetch_naverchangwon_ner, 'nsmc': fetch_nsmc, From 25c50150db15c274fcb3feb75491ce1de21eb7ce Mon Sep 17 00:00:00 2001 From: lovit Date: Sun, 4 Oct 2020 05:07:06 +0900 Subject: [PATCH 2/5] Update kowiki usage (#96) --- README.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/README.md b/README.md index bd2b297..91284f2 100644 --- a/README.md +++ b/README.md @@ -288,6 +288,39 @@ korsts.test[0] |기타|데이터 관련 추가 정보| +### kowikitext +- author: lovit@github +- repository: https://github.com/lovit/kowikitext +- size: + - train : 26827837 lines (877754 articles, 1.7G) + - dev : 130568 lines (4433 articles, 7.7M) + - test : 134478 lines (4433 articles, 8.4M) +- example +```python +from Korpora import Korpora, KowikiTextKorpus + +kowiki = KowikiTextKorpus() # or +kowiki = Korpora.load('kowikitext') + +kowiki.train[0] +# SentencePair(text='외교부장\n외교부장', pair=' = 분류:중화인민공화국의 외교부장 =') +kowiki.train[0].text +# '외교부장\n외교부장' +kowiki.train[0].pair +# = 분류:중화인민공화국의 외교부장 = +kowiki.dev[0] +# SentencePair(text='thumb|right|180px|스포이아텔레\n스폴리아텔레(, )는 이탈리아의 ...', pair=' = 스폴리아텔레 =') +kowiki.test[0] +# SentencePair(text='기타', pair=' = 분류:러시아의 기타 연주자 =') +``` +- data structure + +|속성명|내용| +|---|---| +|text|섹션 본문| +|pair|섹션 타이틀| + + ### 나무위키텍스트 - author: lovit@github - repository: https://github.com/lovit/namuwikitext From 2403da7812dd9b8a4d09556d09e4ebf67951f4f7 Mon Sep 17 00:00:00 2001 From: lovit Date: Sun, 4 Oct 2020 06:55:19 +0900 Subject: [PATCH 3/5] Update kowikitext.20200920.v2 (#96) --- Korpora/korpus_kowiki.py | 6 +++--- README.md | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Korpora/korpus_kowiki.py b/Korpora/korpus_kowiki.py index f1634f6..d47b212 100644 --- a/Korpora/korpus_kowiki.py +++ b/Korpora/korpus_kowiki.py @@ -5,17 +5,17 @@ KOWIKI_FETCH_INFORMATION = [ { - 'url': 'https://github.com/lovit/kowikitext/releases/download/20200920.v1/kowikitext_20200920.train.zip', + 'url': 'https://github.com/lovit/kowikitext/releases/download/kowikitext.20200920.v2/kowikitext_20200920.train.zip', 'destination': 'kowiki/kowikitext_20200920.train.zip', 'method': 'download & unzip' }, { - 'url': 'https://github.com/lovit/kowikitext/releases/download/20200920.v1/kowikitext_20200920.test.zip', + 'url': 'https://github.com/lovit/kowikitext/releases/download/kowikitext.20200920.v2/kowikitext_20200920.test.zip', 'destination': 'kowiki/kowikitext_20200920.test.zip', 'method': 'download & unzip' }, { - 'url': 'https://github.com/lovit/kowikitext/releases/download/20200920.v1/kowikitext_20200920.dev.zip', + 'url': 'https://github.com/lovit/kowikitext/releases/download/kowikitext.20200920.v2/kowikitext_20200920.dev.zip', 'destination': 'kowiki/kowikitext_20200920.dev.zip', 'method': 'download & unzip' } diff --git a/README.md b/README.md index 91284f2..f144bea 100644 --- a/README.md +++ b/README.md @@ -292,9 +292,9 @@ korsts.test[0] - author: lovit@github - repository: https://github.com/lovit/kowikitext - size: - - train : 26827837 lines (877754 articles, 1.7G) - - dev : 130568 lines (4433 articles, 7.7M) - - test : 134478 lines (4433 articles, 8.4M) + - train : 26794425 lines (877754 articles, 1.7G) + - dev : 130419 lines (4433 articles, 7.7M) + - test : 134340 lines (4434 articles, 8.4M) - example ```python from Korpora import Korpora, KowikiTextKorpus From bcfbd66a917c65c2c381bacdb29b3e6243be7e0b Mon Sep 17 00:00:00 2001 From: lovit Date: Sun, 4 Oct 2020 07:00:42 +0900 Subject: [PATCH 4/5] Update kowiki example (#96) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f144bea..eb257b1 100644 --- a/README.md +++ b/README.md @@ -309,7 +309,7 @@ kowiki.train[0].text kowiki.train[0].pair # = 분류:중화인민공화국의 외교부장 = kowiki.dev[0] -# SentencePair(text='thumb|right|180px|스포이아텔레\n스폴리아텔레(, )는 이탈리아의 ...', pair=' = 스폴리아텔레 =') +# SentencePair(text='스폴리아텔레(, )는 이탈리아의 후식으로서 ...', pair=' = 스폴리아텔레 =') kowiki.test[0] # SentencePair(text='기타', pair=' = 분류:러시아의 기타 연주자 =') ``` From b1e883e2c02ab1335fbdedee0236299e07f4a4be Mon Sep 17 00:00:00 2001 From: lovit Date: Sun, 4 Oct 2020 07:03:05 +0900 Subject: [PATCH 5/5] Update kowikitext corpus description (#96) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index eb257b1..b98f755 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ pip install Korpora |korean_petitions|청와대 국민 청원|https://github.com/lovit/petitions_archive| |kornli|Korean NLI|https://github.com/kakaobrain/KorNLUDatasets| |korsts|Korean STS|https://github.com/kakaobrain/KorNLUDatasets| +|kowikitext|한국어 위키피디아 텍스트|https://github.com/lovit/kowikitext| |namuwikitext|나무위키 텍스트|https://github.com/lovit/namuwikitext| |naver_changwon_ner|네이버 x 창원대 개체명 인식 데이터셋|https://github.com/naver/nlp-challenge/tree/master/missions/ner| |nsmc|NAVER Sentiment Movie Corpus|https://github.com/e9t/nsmc| @@ -56,6 +57,7 @@ Korpora.corpus_list() 'korean_petitions': 'lovit@github 님이 만드신 2017.08 ~ 2019.03 청와대 청원데이터', 'kornli': 'KakaoBrain 에서 제공하는 Natural Language Inference (NLI) 데이터', 'korsts': 'KakaoBrain 에서 제공하는 Semantic Textual Similarity (STS) 데이터', + 'kowikitext': 'lovit@github 님이 만드신 wikitext 형식의 한국어 위키피디아 데이터', 'namuwikitext': 'lovit@github 님이 만드신 wikitext 형식의 나무위키 데이터', 'naver_changwon_ner': '네이버 + 창원대 NER shared task data', 'nsmc': 'e9t@github 님이 만드신 Naver sentiment movie corpus v1.0',