facebookresearch · stephenroller · Mar 3, 2022 · Jan 26, 2022 · Mar 3, 2022
diff --git a/parlai/tasks/lccc/__init__.py b/parlai/tasks/lccc/__init__.py
@@ -0,0 +1,5 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/parlai/tasks/lccc/agents.py b/parlai/tasks/lccc/agents.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Optional
+from parlai.core.params import ParlaiParser
+from parlai.core.opt import Opt
+import copy
+import os
+from .build import build
+
+from parlai.core.teachers import ConversationTeacher
+
+
+def _path(opt):
+    """Return the LCCC json file for the fold named in opt['datatype']."""
+    # Make sure the corpus has been downloaded and converted first.
+    build(opt)
+    # Anything after the first ':' in the datatype is dropped.
+    fold, _, _ = opt['datatype'].partition(':')
+    return os.path.join(opt['datapath'], 'LCCC', f'{fold}.json')
+
+
+class LCCCTeacher(ConversationTeacher):
+    """Serves the LCCC file matching opt['datatype']; adds --label-turns."""
+
+    @classmethod
+    def add_cmdline_args(
+        cls, parser: ParlaiParser, partial_opt: Optional[Opt] = None
+    ) -> ParlaiParser:
+        super().add_cmdline_args(parser, partial_opt)
+        parser.add_argument_group('LCCC Task Arguments').add_argument(
+            '--label-turns',
+            default='secondspeaker',
+            choices=['firstspeaker', 'secondspeaker', 'both'],
+            help='which speaker to use as label',
+            type=str,
+        )
+        return parser
+
+    def __init__(self, opt, shared=None):
+        local_opt = copy.deepcopy(opt)  # leave the caller's opt unmodified
+        local_opt['conversationteacher_datafile'] = _path(local_opt)
+        super().__init__(local_opt, shared)
+
+
+class DefaultTeacher(LCCCTeacher):
+    """LCCCTeacher, unchanged, exposed under the name DefaultTeacher."""
diff --git a/parlai/tasks/lccc/build.py b/parlai/tasks/lccc/build.py
@@ -0,0 +1,62 @@
+from parlai.core.build_data import DownloadableFile
+from parlai.utils.io import PathManager
+import parlai.core.build_data as build_data
+import os
+import json
+
+RESOURCES = [  # downloadable archives; build() only fetches the first entry
+    DownloadableFile(
+        'https://cloud.tsinghua.edu.cn/f/f131a4d259184566a29c/?dl=1',
+        'LCCC.zip',
+        'f5203511cd8d6a608008af0aa290aa516d983abc16aa510471e3c4ee6bca7886',
+    ),
+]
+
+
+def build(opt):
+    """Download LCCC into opt['datapath']/LCCC and convert it if not built."""
+    target_dir = os.path.join(opt['datapath'], 'LCCC')
+    data_version = None
+    if build_data.built(target_dir, version_string=data_version):
+        return  # this version is already marked as built
+    if build_data.built(target_dir):
+        # Built under a different version string: remove the outdated files.
+        build_data.remove_dir(target_dir)
+    build_data.make_dir(target_dir)
+    # Fetch the first (and only) resource into the target directory.
+    RESOURCES[0].download_file(target_dir)
+    # Rewrite each split in the layout ConversationTeacher reads.
+    _create_parlai_format(target_dir)
+    # Mark the directory as built for this version string.
+    build_data.mark_done(target_dir, version_string=data_version)
+
+
+def _create_parlai_format(dpath: str):
+    """
+    Convert the LCCC-base_* json files in dpath to ConversationTeacher files.
+
+    Every input file holds a list of episodes, each a list of utterance
+    strings. One json line is written per episode to <datatype>.json; the
+    speakers alternate partner1, partner2, ... and spaces are stripped from
+    the text. The input file is deleted after it has been converted.
+    """
+    for split in ('train', 'valid', 'test'):
+        source = os.path.join(dpath, f'LCCC-base_{split}.json')
+        target = os.path.join(dpath, f'{split}.json')
+        with PathManager.open(source, 'r', encoding='utf8') as f_in:
+            episodes = json.load(f_in)
+        with PathManager.open(target, 'w', encoding='utf8') as f_out:
+            for utterances in episodes:
+                turns = [
+                    {
+                        # even positions are partner1, odd ones partner2
+                        'id': 'partner{}'.format(index % 2 + 1),
+                        'text': utterance.replace(' ', ''),
+                    }
+                    for index, utterance in enumerate(utterances)
+                ]
+                # all turns go into a single inner list of 'dialog'
+                line = json.dumps({'dialog': [turns]}, ensure_ascii=False)
+                print(line, file=f_out)
+        # the raw input is removed once its converted copy has been written
+        os.remove(source)
diff --git a/parlai/tasks/lccc/test.py b/parlai/tasks/lccc/test.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from parlai.utils.testing import AutoTeacherTest  # noqa: F401
+
+
+class TestDefaultTeacher(AutoTeacherTest):
+    task = 'lccc'  # name of the task this AutoTeacherTest subclass covers
diff --git a/parlai/tasks/lccc/test/lccc_test.yml b/parlai/tasks/lccc/test/lccc_test.yml
@@ -0,0 +1,28 @@
+acts:
+- - episode_done: true
+    eval_labels:
+    - 去相机家里吃……
+    id: lccc
+    text: 我饿了。
+- - episode_done: true
+    eval_labels:
+    - 你过来我们什么关系
+    id: lccc
+    text: 网络大实话里说的是也许你能在网络里找到你想要的友情但永远不会找到你想要的爱情
+- - episode_done: true
+    eval_labels:
+    - 我不挑食
+    id: lccc
+    text: 老铁家好吃贾三不好吃
+- - episode_done: true
+    eval_labels:
+    - 死鱼皮真会安慰人那不是翘臀是肥肉不！是赘肉！
+    id: lccc
+    text: 你有翘臀啊！！！！你的脸还不够小啊？？？？？
+- - episode_done: false
+    eval_labels:
+    - 哈哈哈快到南方来
+    id: lccc
+    text: 好羡慕原来你们那真的可以光腿
+num_episodes: 10000
+num_examples: 12943
diff --git a/parlai/tasks/lccc/test/lccc_train.yml b/parlai/tasks/lccc/test/lccc_train.yml
@@ -0,0 +1,28 @@
+acts:
+- - episode_done: true
+    id: lccc
+    labels:
+    - 道歉！！再有时间找你去
+    text: 你去那儿竟然不喊我生气了，快点给我道歉
+- - episode_done: true
+    id: lccc
+    labels:
+    - SEED早上刚被禁用还有一个月的VIP路线呢禁了之后才买的另一个买了一年结果用了一下午就挂了现在用了个极速网速差的很
+    text: 我用SEED.24小时签到一次可以用4小时，对于我这种每天晚上逛一下的感觉不错
+- - episode_done: true
+    id: lccc
+    labels:
+    - 干完这一票我的会员等级就要升了！
+    text: 咬咬牙这回要全入了！
+- - episode_done: true
+    id: lccc
+    labels:
+    - 代表了哪里的普通人？
+    text: 记得还…我们普通人要搬三天砖才赚来的20元
+- - episode_done: true
+    id: lccc
+    labels:
+    - 好得差不多啦
+    text: 早点好起来啊。生日快乐
+num_episodes: 6820506
+num_examples: 8869637
diff --git a/parlai/tasks/lccc/test/lccc_valid.yml b/parlai/tasks/lccc/test/lccc_valid.yml
@@ -0,0 +1,28 @@
+acts:
+- - episode_done: true
+    eval_labels:
+    - 那个饭凉了吧唧的怎么吃啊摔
+    id: lccc
+    text: 啊我好爱虾仁蛋黄酱金枪鱼蛋黄酱
+- - episode_done: true
+    eval_labels:
+    - 看了下全文，那女的考试当天就表明身体不舒服了，考试不是她预约是教练自己安排的，教练还让她考试不就是教练的错吗？而且她住院花了31万，赔30万不过分吧
+    id: lccc
+    text: 考试撞墙关驾校屁事？你怎么不顺便把考场施工单位也告了？
+- - episode_done: true
+    eval_labels:
+    - 好好好，偶遇我大闺蜜
+    id: lccc
+    text: 我先去穿衣服，准备走了
+- - episode_done: true
+    eval_labels:
+    - 保存一下
+    id: lccc
+    text: 期待小猎豹的表现
+- - episode_done: true
+    eval_labels:
+    - 给你一个么么哒
+    id: lccc
+    text: 为了你们新年有惊喜，我也是用心良苦了
+num_episodes: 20000
+num_examples: 25558
diff --git a/parlai/tasks/task_list.py b/parlai/tasks/task_list.py
@@ -1549,4 +1549,17 @@
             "website": "https://github.com/HLTCHKUST/Xpersona",
         },
     },
+    {
+        "id": "LCCC",
+        "display_name": "LCCC",
+        "task": "lccc",
+        "tags": ["ChitChat"],
+        "description": (
+            "Large-scale cleaned Chinese conversation dataset."
+        ),
+        "links": {
+            "arXiv": "https://arxiv.org/pdf/2008.03946",
+            "website": "https://github.com/thu-coai/CDial-GPT",
+        },
+    },
 ]