diff --git a/Korpora/__init__.py b/Korpora/__init__.py
index 7e1298b..d57fd48 100644
--- a/Korpora/__init__.py
+++ b/Korpora/__init__.py
@@ -12,6 +12,7 @@
 from .korpus_korean_petitions import KoreanPetitionsKorpus
 from .korpus_kornli import KorNLIKorpus
 from .korpus_korsts import KorSTSKorpus
+from .korpus_kowiki import KowikiTextKorpus
 from .korpus_namuwiki import NamuwikiTextKorpus
 from .korpus_naverchangwon_ner import NaverChangwonNERKorpus
 from .korpus_nsmc import NSMCKorpus
diff --git a/Korpora/korpus_kowiki.py b/Korpora/korpus_kowiki.py
new file mode 100644
index 0000000..d47b212
--- /dev/null
+++ b/Korpora/korpus_kowiki.py
@@ -0,0 +1,91 @@
+import os
+from .korpora import Korpus, SentencePairKorpusData
+from .utils import fetch, default_korpora_path, load_wikitext
+
+
+KOWIKI_FETCH_INFORMATION = [
+    {
+        'url': 'https://github.com/lovit/kowikitext/releases/download/kowikitext.20200920.v2/kowikitext_20200920.train.zip',
+        'destination': 'kowiki/kowikitext_20200920.train.zip',
+        'method': 'download & unzip'
+    },
+    {
+        'url': 'https://github.com/lovit/kowikitext/releases/download/kowikitext.20200920.v2/kowikitext_20200920.test.zip',
+        'destination': 'kowiki/kowikitext_20200920.test.zip',
+        'method': 'download & unzip'
+    },
+    {
+        'url': 'https://github.com/lovit/kowikitext/releases/download/kowikitext.20200920.v2/kowikitext_20200920.dev.zip',
+        'destination': 'kowiki/kowikitext_20200920.dev.zip',
+        'method': 'download & unzip'
+    }
+]
+
+description = """    Author : Hyunjoong Kim lovit@github
+    Repository : https://github.com/lovit/kowikitext
+    References :
+
+    A wikitext-format text file built from the Korean Wikipedia dump data.
+    For training and evaluation it is split per wiki page into train (99%), dev (0.5%), and test (0.5%).
+"""
+
+license = "    CC-BY-SA 3.0, under which the kowiki dump dataset is licensed"
+
+
+class KowikiTextKorpus(Korpus):
+    def __init__(self, root_dir=None, force_download=False):
+        super().__init__(description, license)
+
+        if root_dir is None:
+            root_dir = default_korpora_path
+        fetch_kowikitext(root_dir, force_download)
+
+        for information in KOWIKI_FETCH_INFORMATION:
+            destination = information['destination']
+            local_path = os.path.join(os.path.abspath(root_dir), destination[:-4])
+
+            if 'train' in destination:
+                response = input(
+                    'kowikiText.train text file is large (1.7G).\n'
+                    'If you want to load the text into memory, insert `yes`.\n'
+                    'If you insert an integer `N`, only the first `N` lines are loaded.\n').lower()
+                if response in ('y', 'yes'):
+                    texts, titles = self.load(local_path)
+                    self.train = SentencePairKorpusData('kowikiText.train', texts, titles)
+                elif response.isdigit():
+                    texts, titles = self.load(local_path, num_lines=int(response))
+                    self.train = SentencePairKorpusData('kowikiText.train', texts, titles)
+                else:
+                    dirname = os.path.abspath(f'{root_dir}/kowiki')
+                    self.train = f'kowikitext corpus is downloaded. Open local directory {dirname}'
+                    print('Continue to load `dev` and `test`')
+                continue
+
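+            # `dev` (7.7M) and `test` (8.4M) are small, so they are always
+            # loaded into memory without prompting.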
+            texts, titles = self.load(local_path)
+            if 'dev' in destination:
+                self.dev = SentencePairKorpusData('kowikiText.dev', texts, titles)
+            elif 'test' in destination:
+                self.test = SentencePairKorpusData('kowikiText.test', texts, titles)
+            else:
+                raise ValueError('Check local files')
+
+    def load(self, path, num_lines=-1):
+        def split_title_text(wikitext):
+            lines = wikitext.split('\n')
+            title = lines[0]
+            text = '\n'.join([line.strip() for line in lines[2:] if line.strip()])
+            return title, text
+
+        wikitexts = load_wikitext(path, num_lines)
+        wikitexts = [split_title_text(wikitext) for wikitext in wikitexts]
+        titles, texts = zip(*wikitexts)
+        # swap position: return (texts, titles) so that `text` holds the
+        # section body and `pair` holds the section title
+        return texts, titles
+
+
+def fetch_kowikitext(root_dir, force_download):
+    for information in KOWIKI_FETCH_INFORMATION:
+        url = information['url']
+        destination = information['destination']
+        local_path = os.path.join(os.path.abspath(root_dir), destination)
+        fetch(url, local_path, 'kowikitext', force_download, information['method'])
diff --git a/Korpora/loader.py b/Korpora/loader.py
index 93f796d..f514a3c 100644
--- a/Korpora/loader.py
+++ b/Korpora/loader.py
@@ -4,6 +4,7 @@
 from .korpus_korean_petitions import KoreanPetitionsKorpus, fetch_korean_petitions
 from .korpus_kornli import KorNLIKorpus, fetch_kornli
 from .korpus_korsts import KorSTSKorpus, fetch_korsts
+from .korpus_kowiki import KowikiTextKorpus, fetch_kowikitext
 from .korpus_namuwiki import NamuwikiTextKorpus, fetch_namuwikitext
 from .korpus_naverchangwon_ner import NaverChangwonNERKorpus, fetch_naverchangwon_ner
 from .korpus_nsmc import NSMCKorpus, fetch_nsmc
@@ -62,6 +63,7 @@ def corpus_list(cls):
     'korean_petitions': KoreanPetitionsKorpus,
     'kornli': KorNLIKorpus,
     'korsts': KorSTSKorpus,
+    'kowikitext': KowikiTextKorpus,
     'namuwikitext': NamuwikiTextKorpus,
     'naver_changwon_ner': NaverChangwonNERKorpus,
     'nsmc': NSMCKorpus,
@@ -75,6 +77,7 @@ def corpus_list(cls):
     'korean_petitions': "lovit@github 님이 만드신 2017.08 ~ 2019.03 청와대 청원데이터",
     'kornli': "KakaoBrain 에서 제공하는 Natural Language Inference (NLI) 데이터",
     'korsts': "KakaoBrain 에서 제공하는 Semantic Textual Similarity (STS) 데이터",
+    'kowikitext': "lovit@github 님이 만드신 wikitext 형식의 한국어 위키피디아 데이터",
     'namuwikitext': "lovit@github 님이 만드신 wikitext 형식의 나무위키 데이터",
     'naver_changwon_ner': "네이버 + 창원대 NER shared task data",
     'nsmc': "e9t@github 님이 만드신 Naver sentiment movie corpus v1.0",
@@ -88,6 +91,7 @@ def corpus_list(cls):
     'korean_petitions': fetch_korean_petitions,
     'kornli': fetch_kornli,
     'korsts': fetch_korsts,
+    'kowikitext': fetch_kowikitext,
     'namuwikitext': fetch_namuwikitext,
     'naver_changwon_ner': fetch_naverchangwon_ner,
     'nsmc': fetch_nsmc,
diff --git a/README.md b/README.md
index bd2b297..b98f755 100644
--- a/README.md
+++ b/README.md
@@ -36,6 +36,7 @@ pip install Korpora
 |korean_petitions|청와대 국민 청원|https://github.com/lovit/petitions_archive|
 |kornli|Korean NLI|https://github.com/kakaobrain/KorNLUDatasets|
 |korsts|Korean STS|https://github.com/kakaobrain/KorNLUDatasets|
+|kowikitext|한국어 위키피디아 텍스트|https://github.com/lovit/kowikitext|
 |namuwikitext|나무위키 텍스트|https://github.com/lovit/namuwikitext|
 |naver_changwon_ner|네이버 x 창원대 개체명 인식 데이터셋|https://github.com/naver/nlp-challenge/tree/master/missions/ner|
 |nsmc|NAVER Sentiment Movie Corpus|https://github.com/e9t/nsmc|
@@ -56,6 +57,7 @@ Korpora.corpus_list()
     'korean_petitions': 'lovit@github 님이 만드신 2017.08 ~ 2019.03 청와대 청원데이터',
     'kornli': 'KakaoBrain 에서 제공하는 Natural Language Inference (NLI) 데이터',
     'korsts': 'KakaoBrain 에서 제공하는 Semantic Textual Similarity (STS) 데이터',
+    'kowikitext': 'lovit@github 님이 만드신 wikitext 형식의 한국어 위키피디아 데이터',
     'namuwikitext': 'lovit@github 님이 만드신 wikitext 형식의 나무위키 데이터',
     'naver_changwon_ner': '네이버 + 창원대 NER shared task data',
     'nsmc': 'e9t@github 님이 만드신 Naver sentiment movie corpus v1.0',
@@ -288,6 +290,39 @@
 |기타|데이터 관련 추가 정보|
 
 
+### kowikitext
+- author: lovit@github
+- repository: https://github.com/lovit/kowikitext
+- size:
+  - train : 26794425 lines (877754 articles, 1.7G)
+  - dev : 130419 lines (4433 articles, 7.7M)
+  - test : 134340 lines (4434 articles, 8.4M)
+- example
+```python
+from Korpora import Korpora, KowikiTextKorpus
+
+kowiki = KowikiTextKorpus()  # or
+kowiki = Korpora.load('kowikitext')
+
+kowiki.train[0]
+# SentencePair(text='외교부장\n외교부장', pair=' = 분류:중화인민공화국의 외교부장 =')
+kowiki.train[0].text
+# '외교부장\n외교부장'
+kowiki.train[0].pair
+# ' = 분류:중화인민공화국의 외교부장 ='
+kowiki.dev[0]
+# SentencePair(text='스폴리아텔레(, )는 이탈리아의 후식으로서 ...', pair=' = 스폴리아텔레 =')
+kowiki.test[0]
+# SentencePair(text='기타', pair=' = 분류:러시아의 기타 연주자 =')
+```
+- data structure
+
+|속성명|내용|
+|---|---|
+|text|섹션 본문|
+|pair|섹션 타이틀|
+
+
 ### 나무위키텍스트
 - author: lovit@github
 - repository: https://github.com/lovit/namuwikitext
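For reviewers who want to poke at the data without the interactive `input()` prompt, here is a minimal non-interactive sketch built only from the helpers this diff adds (`fetch_kowikitext`, `load_wikitext`, `default_korpora_path`). The dev-split path follows the `destination[:-4]` convention used in `KowikiTextKorpus.__init__`; whether `load_wikitext`'s second argument caps raw lines or whole pages should be checked against `Korpora/utils.py`.

```python
import os

from Korpora.korpus_kowiki import fetch_kowikitext
from Korpora.utils import default_korpora_path, load_wikitext

# download & unzip kowikitext into the default Korpora root
# (force_download=False keeps already-fetched files)
root_dir = default_korpora_path
fetch_kowikitext(root_dir, force_download=False)

# 'kowiki/kowikitext_20200920.dev.zip'[:-4] is the unzipped text file
dev_path = os.path.join(os.path.abspath(root_dir), 'kowiki/kowikitext_20200920.dev')

# read a limited amount of text; each item is one wiki page whose first
# line is the title and whose body starts on the third line
wikitexts = load_wikitext(dev_path, 1000)
for wikitext in wikitexts[:3]:
    lines = wikitext.split('\n')
    print(lines[0])    # section title, e.g. ' = 스폴리아텔레 ='
    print(lines[2:5])  # first lines of the section body
```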