Skip to content
This repository has been archived by the owner on Nov 3, 2023. It is now read-only.

LCCC Dataset #4325

Merged
merged 2 commits into from Mar 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions parlai/tasks/lccc/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
50 changes: 50 additions & 0 deletions parlai/tasks/lccc/agents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from typing import Optional
from parlai.core.params import ParlaiParser
from parlai.core.opt import Opt
import copy
import os
from .build import build

from parlai.core.teachers import ConversationTeacher


def _path(opt):
    """
    Return the path of the LCCC data file for the requested datatype.

    Calls ``build(opt)`` first so the data is prepared before the path is
    handed out.

    :param opt: options mapping; ``opt['datatype']`` and ``opt['datapath']``
        are read here.
    :return: ``<datapath>/LCCC/<split>.json`` where ``<split>`` is the part of
        ``opt['datatype']`` before the first ``':'``.
    """
    # Make sure the dataset has been prepared before resolving the file.
    build(opt)

    # Only the leading component of the datatype selects the file,
    # e.g. 'train:ordered' -> 'train'.
    split, _, _ = opt['datatype'].partition(':')
    return os.path.join(opt['datapath'], 'LCCC', f'{split}.json')


class LCCCTeacher(ConversationTeacher):
    """
    Teacher for the LCCC dataset, built on top of ``ConversationTeacher``.

    The data file is resolved through ``_path`` and handed to the parent class
    via the ``conversationteacher_datafile`` option.
    """

    @classmethod
    def add_cmdline_args(
        cls, parser: ParlaiParser, partial_opt: Optional[Opt] = None
    ) -> ParlaiParser:
        """
        Register the parent's arguments plus the LCCC-specific ones.

        :param parser: parser to extend.
        :param partial_opt: partially parsed options, forwarded to the parent.
        :return: the same ``parser`` that was passed in.
        """
        super().add_cmdline_args(parser, partial_opt)
        group = parser.add_argument_group('LCCC Task Arguments')
        speaker_choices = ['firstspeaker', 'secondspeaker', 'both']
        group.add_argument(
            '--label-turns',
            type=str,
            default='secondspeaker',
            choices=speaker_choices,
            help='which speaker to use as label',
        )
        return parser

    def __init__(self, opt, shared=None):
        """
        Set up the teacher with the LCCC data file for ``opt['datatype']``.

        :param opt: options mapping; it is deep-copied so the caller's object
            is not modified when the datafile entry is added.
        :param shared: forwarded unchanged to the parent constructor.
        """
        local_opt = copy.deepcopy(opt)
        # Point the parent class at the file for the requested split.
        local_opt['conversationteacher_datafile'] = _path(local_opt)
        super().__init__(local_opt, shared)


class DefaultTeacher(LCCCTeacher):
    """
    Alias of ``LCCCTeacher`` that adds no behaviour of its own.
    """
62 changes: 62 additions & 0 deletions parlai/tasks/lccc/build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from parlai.core.build_data import DownloadableFile
from parlai.utils.io import PathManager
import parlai.core.build_data as build_data
import os
import json

# Files fetched by build(); only the first entry is downloaded there.
RESOURCES = [
    DownloadableFile(
        # Download URL for the archive.
        'https://cloud.tsinghua.edu.cn/f/f131a4d259184566a29c/?dl=1',
        # Presumably the local file name the download is saved under — confirm
        # against DownloadableFile.
        'LCCC.zip',
        # Presumably the expected checksum of the archive (64 hex characters,
        # looks like SHA-256) — confirm against DownloadableFile.
        'f5203511cd8d6a608008af0aa290aa516d983abc16aa510471e3c4ee6bca7886',
    ),
]


def build(opt):
    """
    Download and prepare the LCCC data under ``<datapath>/LCCC`` if needed.

    Nothing is done when the directory is already marked as built for the
    current version. Otherwise the directory is (re)created, the first entry
    of ``RESOURCES`` is downloaded into it, the files are converted with
    ``_create_parlai_format`` and the directory is marked as done.

    :param opt: options mapping; only ``opt['datapath']`` is read here.
    """
    data_dir = os.path.join(opt['datapath'], 'LCCC')
    version = None

    # Already prepared for this version: nothing to do.
    if build_data.built(data_dir, version_string=version):
        return

    if build_data.built(data_dir):
        # An older version exists, so remove these outdated files.
        # NOTE(review): with version = None this check looks equivalent to the
        # one above and so would never be true here — confirm against
        # build_data.built.
        build_data.remove_dir(data_dir)
    build_data.make_dir(data_dir)

    # Download the data.
    archive = RESOURCES[0]
    archive.download_file(data_dir)
    # Format it for use with ConversationTeacher.
    _create_parlai_format(data_dir)
    # Mark the data as built.
    build_data.mark_done(data_dir, version_string=version)


def _create_parlai_format(dpath: str):
    """
    Rewrite the downloaded LCCC files into the format read by ConversationTeacher.

    For each of the ``train``, ``valid`` and ``test`` splits,
    ``LCCC-base_<split>.json`` is loaded as JSON and ``<split>.json`` is
    written with one JSON object per line, one line per episode. Speakers
    alternate between ``partner1`` and ``partner2`` starting with
    ``partner1``, and every space character is removed from each utterance.
    The source file is deleted once its split has been written.

    :param dpath: directory holding the downloaded files; the converted files
        are written to the same directory.
    """
    for split in ('train', 'valid', 'test'):
        raw_name = 'LCCC-base_' + split
        source_file = os.path.join(dpath, f'{raw_name}.json')
        target_file = os.path.join(dpath, f'{split}.json')

        with PathManager.open(source_file, 'r', encoding='utf8') as src:
            episodes = json.load(src)

        with PathManager.open(target_file, 'w', encoding='utf8') as dst:
            for utterances in episodes:
                # Even positions belong to partner1, odd ones to partner2.
                turns = [
                    {
                        'id': 'partner{}'.format(position % 2 + 1),
                        'text': utterance.replace(' ', ''),
                    }
                    for position, utterance in enumerate(utterances)
                ]
                # ensure_ascii=False keeps non-ASCII text readable rather than
                # escaping it as \uXXXX sequences.
                record = json.dumps({'dialog': [turns]}, ensure_ascii=False)
                dst.write(record + '\n')

        # The raw file is no longer needed once the converted one exists.
        os.remove(source_file)
11 changes: 11 additions & 0 deletions parlai/tasks/lccc/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from parlai.utils.testing import AutoTeacherTest # noqa: F401


class TestDefaultTeacher(AutoTeacherTest):
    """
    Teacher test for the ``lccc`` task.

    All test logic comes from ``AutoTeacherTest``; this class only selects the
    task. Presumably the results are compared against the ``lccc_*.yml``
    fixtures stored next to this file — confirm against AutoTeacherTest.
    """

    task = 'lccc'
28 changes: 28 additions & 0 deletions parlai/tasks/lccc/test/lccc_test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
acts:
- - episode_done: true
eval_labels:
- 去相机家里吃……
id: lccc
text: 我饿了。
- - episode_done: true
eval_labels:
- 你过来我们什么关系
id: lccc
text: 网络大实话里说的是也许你能在网络里找到你想要的友情但永远不会找到你想要的爱情
- - episode_done: true
eval_labels:
- 我不挑食
id: lccc
text: 老铁家好吃贾三不好吃
- - episode_done: true
eval_labels:
- 死鱼皮真会安慰人那不是翘臀是肥肉不!是赘肉!
id: lccc
text: 你有翘臀啊!!!!你的脸还不够小啊?????
- - episode_done: false
eval_labels:
- 哈哈哈快到南方来
id: lccc
text: 好羡慕原来你们那真的可以光腿
num_episodes: 10000
num_examples: 12943
28 changes: 28 additions & 0 deletions parlai/tasks/lccc/test/lccc_train.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
acts:
- - episode_done: true
id: lccc
labels:
- 道歉!!再有时间找你去
text: 你去那儿竟然不喊我生气了,快点给我道歉
- - episode_done: true
id: lccc
labels:
- SEED早上刚被禁用还有一个月的VIP路线呢禁了之后才买的另一个买了一年结果用了一下午就挂了现在用了个极速网速差的很
text: 我用SEED.24小时签到一次可以用4小时,对于我这种每天晚上逛一下的感觉不错
- - episode_done: true
id: lccc
labels:
- 干完这一票我的会员等级就要升了!
text: 咬咬牙这回要全入了!
- - episode_done: true
id: lccc
labels:
- 代表了哪里的普通人?
text: 记得还…我们普通人要搬三天砖才赚来的20元
- - episode_done: true
id: lccc
labels:
- 好得差不多啦
text: 早点好起来啊。生日快乐
num_episodes: 6820506
num_examples: 8869637
28 changes: 28 additions & 0 deletions parlai/tasks/lccc/test/lccc_valid.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
acts:
- - episode_done: true
eval_labels:
- 那个饭凉了吧唧的怎么吃啊摔
id: lccc
text: 啊我好爱虾仁蛋黄酱金枪鱼蛋黄酱
- - episode_done: true
eval_labels:
- 看了下全文,那女的考试当天就表明身体不舒服了,考试不是她预约是教练自己安排的,教练还让她考试不就是教练的错吗?而且她住院花了31万,赔30万不过分吧
id: lccc
text: 考试撞墙关驾校屁事?你怎么不顺便把考场施工单位也告了?
- - episode_done: true
eval_labels:
- 好好好,偶遇我大闺蜜
id: lccc
text: 我先去穿衣服,准备走了
- - episode_done: true
eval_labels:
- 保存一下
id: lccc
text: 期待小猎豹的表现
- - episode_done: true
eval_labels:
- 给你一个么么哒
id: lccc
text: 为了你们新年有惊喜,我也是用心良苦了
num_episodes: 20000
num_examples: 25558
13 changes: 13 additions & 0 deletions parlai/tasks/task_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -1549,4 +1549,17 @@
"website": "https://github.com/HLTCHKUST/Xpersona",
},
},
{
"id": "LCCC",
"display_name": "LCCC",
"task": "lccc",
"tags": ["ChitChat"],
"description": (
"Large-scale cleaned Chinese conversation dataset."
),
"links": {
"arXiv": "https://arxiv.org/pdf/2008.03946",
"website": "https://github.com/thu-coai/CDial-GPT",
},
},
]