Skip to content
This repository has been archived by the owner on Nov 3, 2023. It is now read-only.

Commit

Permalink
XPersona Dataset (#4314)
Browse files Browse the repository at this point in the history
  • Loading branch information
Zheng0428 authored Jan 25, 2022
1 parent a8bf30e commit f451c6d
Show file tree
Hide file tree
Showing 29 changed files with 1,335 additions and 0 deletions.
13 changes: 13 additions & 0 deletions parlai/tasks/task_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -1536,4 +1536,17 @@
),
"links": {"arXiv": "https://arxiv.org/abs/2110.07518"},
},
{
"id": "XPersona",
"display_name": "XPersona",
"task": "xpersona",
"tags": ["ChitChat"],
"description": (
"XPersona is an extension of ConvAI2 with six more languages: Chinese, French, Indonesian, Italian, Korean, and Japanese."
),
"links": {
"arXiv": "https://arxiv.org/pdf/2003.07568.pdf",
"website": "https://github.com/HLTCHKUST/Xpersona",
},
},
]
5 changes: 5 additions & 0 deletions parlai/tasks/xpersona/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
70 changes: 70 additions & 0 deletions parlai/tasks/xpersona/agents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import copy
import os
from .build import build
from parlai.core.teachers import FbDeprecatedDialogTeacher

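'''This dataset is available in seven languages: English (En), Chinese (Zh),
French (Fr), Korean (Ko), Indonesian (Id), Japanese (Jp) and Italian (It).
To use the dataset in a specific language, pass its code in the task flag:
--task xpersona:{LANGUAGE}
where {LANGUAGE} is one of the codes listed above.
The default language is English.
'''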


def _path(opt):
    """
    Build the dataset if needed and return the data file for this task.

    The language is the second ``:``-separated field of ``opt['task']``
    (``xpersona:Zh`` -> ``Zh``) and defaults to ``En`` when the task has no
    such field. The split is the first ``:``-separated field of
    ``opt['datatype']``.

    :param opt: options dict; reads ``task``, ``datatype`` and ``datapath``.
    :return: ``<datapath>/XPersona/<Language>_<split>.txt``.
    """
    # build the data if it does not exist
    build(opt)

    # set up path to data (specific to each dataset)
    dt = opt['datatype'].split(':')[0]
    task_fields = opt['task'].split(':')
    language = task_fields[1] if len(task_fields) > 1 else 'En'
    # The converted files are named with a capitalised language prefix
    # ('En_', 'Zh_', ...). Upper-case the first letter so that a task written
    # as `xpersona:zh` resolves to 'Zh_<split>.txt' instead of a missing
    # 'zh_<split>.txt'. Already-capitalised codes are left unchanged.
    # NOTE(review): this relies on the task loader resolving `zh` to
    # ZhTeacher the same way -- confirm against the loader documentation.
    language = language[:1].upper() + language[1:]
    return os.path.join(opt['datapath'], 'XPersona', language + '_' + dt + '.txt')


class XPersonaTeacher(FbDeprecatedDialogTeacher):
    """
    Teacher that serves the XPersona text file selected by ``_path``.

    The incoming options are deep-copied before ``datafile`` is filled in, so
    the caller's ``opt`` is not modified.
    """

    def __init__(self, opt, shared=None):
        teacher_opt = copy.deepcopy(opt)
        teacher_opt['datafile'] = _path(teacher_opt)
        super().__init__(teacher_opt, shared)


class DefaultTeacher(XPersonaTeacher):
    """
    Default teacher for the task; adds nothing to ``XPersonaTeacher``.

    When the task string carries no language field, ``_path`` falls back to
    the ``En`` files.
    """


class EnTeacher(XPersonaTeacher):
    """
    Teacher for ``--task xpersona:En`` (English).

    Adds nothing to ``XPersonaTeacher``: the data file is chosen from the task
    string by ``_path``, not from this class.
    """


class ItTeacher(XPersonaTeacher):
    """
    Teacher for ``--task xpersona:It`` (Italian).

    Adds nothing to ``XPersonaTeacher``: the data file is chosen from the task
    string by ``_path``, not from this class.
    """


class ZhTeacher(XPersonaTeacher):
    """
    Teacher for ``--task xpersona:Zh`` (Chinese).

    Adds nothing to ``XPersonaTeacher``: the data file is chosen from the task
    string by ``_path``, not from this class.
    """


class IdTeacher(XPersonaTeacher):
    """
    Teacher for ``--task xpersona:Id`` (Indonesian).

    Adds nothing to ``XPersonaTeacher``: the data file is chosen from the task
    string by ``_path``, not from this class.
    """


class FrTeacher(XPersonaTeacher):
    """
    Teacher for ``--task xpersona:Fr`` (French).

    Adds nothing to ``XPersonaTeacher``: the data file is chosen from the task
    string by ``_path``, not from this class.
    """


class JpTeacher(XPersonaTeacher):
    """
    Teacher for ``--task xpersona:Jp`` (Japanese).

    Adds nothing to ``XPersonaTeacher``: the data file is chosen from the task
    string by ``_path``, not from this class.
    """


class KoTeacher(XPersonaTeacher):
    """
    Teacher for ``--task xpersona:Ko`` (Korean).

    Adds nothing to ``XPersonaTeacher``: the data file is chosen from the task
    string by ``_path``, not from this class.
    """
193 changes: 193 additions & 0 deletions parlai/tasks/xpersona/build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Download and build the data if it does not exist.

import parlai.core.build_data as build_data
import os
from parlai.core.build_data import DownloadableFile
import json
from parlai.utils.io import PathManager

# One row per remote file, listed in download order:
# (source URL, local file name, expected hash of the download).
# The local `*_tmp.json` names are the ones `_create_parlai_format` reads.
# NOTE(review): the 64-hex-digit hashes look like SHA-256 digests -- confirm
# against DownloadableFile.
_RESOURCE_SPECS = (
    # ---- En ----
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/En_persona_test.json',
        'En_test_tmp.json',
        '8baa09a8064a22967544f501821aa114393a59339c0559da8afa160966ba87c9',
    ),
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/En_persona_train.json',
        'En_train_tmp.json',
        'e23112bba7320f798b07afb4c5acc3edad2a2ccb7df5cc46f141a0c79ff4665c',
    ),
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/En_persona_valid.json',
        'En_valid_tmp.json',
        '08ed3d41c5b0681c2d125a5312b43d926a8a5aa1d10a5df655d17f4c56dab635',
    ),
    # ---- Fr ----
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Fr_persona_train_corrected.json',
        'Fr_train_tmp.json',
        '40e66e91aa6360eeda642c2c674f03c52854626eaf35da50084d26ff42f61292',
    ),
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Fr_persona_split_test_human_annotated.json',
        'Fr_test_tmp.json',
        '0783fcf01bdf4c27ec28120a9b23bf4f0248e97ae476e03cd7e759cb2667ab23',
    ),
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Fr_persona_split_valid_human_annotated.json',
        'Fr_valid_tmp.json',
        '8ad86b05aabfadedba7863828b1cc4fdff0926ebf476121268089ac7ed9af149',
    ),
    # ---- Id ----
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Id_persona_train_corrected.json',
        'Id_train_tmp.json',
        'fee1a9769fe707fd09401c33bdf3b3cd4f7b5fd100998577f3f179c42423bc4f',
    ),
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Id_persona_split_test_human_annotated.json',
        'Id_test_tmp.json',
        'e0bcd3c02f318f4381c42798a2ce6e0a10237b6998793076fbf97f633a9f2563',
    ),
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Id_persona_split_valid_human_annotated.json',
        'Id_valid_tmp.json',
        '8f70cab662f082ae3ee2abce9e9cac619ebc632a2820d93414a59005dbf75d7e',
    ),
    # ---- It ----
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/It_persona_train_corrected.json',
        'It_train_tmp.json',
        '9636893050ad16dc2daabfe8bde6979c9c780c5da9d0790f5668e198aac18b8f',
    ),
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/It_persona_split_test_human_annotated.json',
        'It_test_tmp.json',
        '690103031791fd5c6763176a074cdb65e249adc784e2d577110c2d9430a02a87',
    ),
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/It_persona_split_valid_human_annotated.json',
        'It_valid_tmp.json',
        '720dc91d3f9bc6a56ac229c6800cde63c71677a3db8d0ace7a59a7f94d89df3d',
    ),
    # ---- Jp ----
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Jp_persona_train_corrected.json',
        'Jp_train_tmp.json',
        '808ee29c79303e1300b38aceee79111ffe1fd2a2facb09a0956615a61d840738',
    ),
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Jp_persona_split_test_human_annotated.json',
        'Jp_test_tmp.json',
        '682fed16148517097437942088d225bd728cb7b41aa390559681ae73e5e6848f',
    ),
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Jp_persona_split_valid_human_annotated.json',
        'Jp_valid_tmp.json',
        'a86bb811364d100bc77ddc8038265df1e01bbc3be095e56c6ec8f179e6365d75',
    ),
    # ---- Ko ----
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Ko_persona_train_corrected.json',
        'Ko_train_tmp.json',
        '105d6a08d02e76f1d006edb1819b96a8b5fa8d94b3ed278936bcf171368809b7',
    ),
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Ko_persona_split_test_human_annotated.json',
        'Ko_test_tmp.json',
        'f7ac6bd2aec7014a28d34bb34dceed653b389ce25d21ca770e77142578dc70a6',
    ),
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Ko_persona_split_valid_human_annotated.json',
        'Ko_valid_tmp.json',
        '188470f863f639946bc8248a9f6aa1e589b41ec61792b88b81e7a95c72deeae0',
    ),
    # ---- Zh ----
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Zh_persona_train_corrected.json',
        'Zh_train_tmp.json',
        'e07899fa91edd127ec77502bd604693c40e264b60225976e2ac6ed145d080323',
    ),
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Zh_persona_split_test_human_annotated.json',
        'Zh_test_tmp.json',
        '0767a4a27c765277792597502f57ea8bb80bf7be94613b0d833107f66a7d3512',
    ),
    (
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Zh_persona_split_valid_human_annotated.json',
        'Zh_valid_tmp.json',
        'cfa90117d73fe294a1b776b2d1c7b53711bfcd724f5833204726c910dad5482d',
    ),
)

# Every file is a plain (non-archived) JSON download, hence zipped=False.
RESOURCES = [
    DownloadableFile(url, file_name, hashcode, zipped=False)
    for url, file_name, hashcode in _RESOURCE_SPECS
]


def build(opt):
    """
    Download and convert the XPersona data unless it is already built.

    The data lives in ``<opt['datapath']>/XPersona``. When that directory is
    not marked as built for the current version, any previously built copy is
    removed, every entry of ``RESOURCES`` is downloaded into it, the JSON
    files are converted by ``_create_parlai_format`` and the directory is
    marked as done.

    :param opt: options dict; only ``datapath`` is read.
    """
    version = None
    dpath = os.path.join(opt['datapath'], 'XPersona')

    # Nothing to do when this version is already in place.
    if build_data.built(dpath, version_string=version):
        return

    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # A build made under another version exists; start from scratch.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch all files first, then convert them in one pass.
    for resource in RESOURCES:
        resource.download_file(dpath)
    _create_parlai_format(dpath)

    # Record success so later calls skip the work above.
    build_data.mark_done(dpath, version_string=version)


def _create_parlai_format(dpath: str):
    """
    Convert every downloaded ``*_tmp.json`` file into a numbered text file.

    For each language prefix and split, ``<prefix><split>_tmp.json`` is read
    from ``dpath``, written out as ``<prefix><split>.txt`` and then deleted.
    Each JSON record becomes one numbered group of lines: its ``persona``
    entries as ``N your persona:<text>``, followed by its ``dialogue`` pairs
    as ``N <first>`` and ``<second>`` separated by a tab. ``N`` restarts at 1
    for every record.

    :param dpath: directory holding the downloaded JSON files.
    """
    prefixes = ['En_', 'Zh_', 'Fr_', 'Ko_', 'Id_', 'Jp_', 'It_']
    splits = ['train', 'valid', 'test']

    for prefix in prefixes:
        for split in splits:
            stem = prefix + split
            json_path = os.path.join(dpath, f'{stem}_tmp.json')
            text_path = os.path.join(dpath, f'{stem}.txt')

            with PathManager.open(json_path, 'r', encoding='utf8') as source:
                episodes = json.load(source)

            with PathManager.open(text_path, 'w', encoding='utf8') as sink:
                for episode in episodes:
                    persona_lines = episode['persona']
                    exchanges = episode['dialogue']

                    # Persona lines take numbers 1..k; the dialogue pairs
                    # continue from k + 1.
                    persona_count = 0
                    for persona_count, persona in enumerate(persona_lines, start=1):
                        sink.write(
                            str(persona_count) + ' your persona:' + persona + '\n'
                        )
                    for number, (first, second) in enumerate(
                        exchanges, start=persona_count + 1
                    ):
                        sink.write(f"{number} {first}\t{second}\n")

            # The temporary JSON is not needed once its text file is written.
            os.remove(json_path)
39 changes: 39 additions & 0 deletions parlai/tasks/xpersona/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from parlai.utils.testing import AutoTeacherTest # noqa: F401


class TestDefaultTeacher(AutoTeacherTest):
    """
    Run the inherited ``AutoTeacherTest`` checks for the bare ``xpersona`` task.

    NOTE(review): the checks themselves live in ``AutoTeacherTest`` and are
    presumably driven by the ``task`` attribute -- confirm there.
    """

    task = 'xpersona'


class TestEnTeacher(AutoTeacherTest):
    """
    Run the inherited ``AutoTeacherTest`` checks for ``xpersona:En``.

    NOTE(review): the checks themselves live in ``AutoTeacherTest`` and are
    presumably driven by the ``task`` attribute -- confirm there.
    """

    task = 'xpersona:En'


class TestZhTeacher(AutoTeacherTest):
    """
    Run the inherited ``AutoTeacherTest`` checks for ``xpersona:Zh``.

    NOTE(review): the checks themselves live in ``AutoTeacherTest`` and are
    presumably driven by the ``task`` attribute -- confirm there.
    """

    task = 'xpersona:Zh'


class TestFrTeacher(AutoTeacherTest):
    """
    Run the inherited ``AutoTeacherTest`` checks for ``xpersona:Fr``.

    NOTE(review): the checks themselves live in ``AutoTeacherTest`` and are
    presumably driven by the ``task`` attribute -- confirm there.
    """

    task = 'xpersona:Fr'


class TestIdTeacher(AutoTeacherTest):
    """
    Run the inherited ``AutoTeacherTest`` checks for ``xpersona:Id``.

    NOTE(review): the checks themselves live in ``AutoTeacherTest`` and are
    presumably driven by the ``task`` attribute -- confirm there.
    """

    task = 'xpersona:Id'


class TestItTeacher(AutoTeacherTest):
    """
    Run the inherited ``AutoTeacherTest`` checks for ``xpersona:It``.

    NOTE(review): the checks themselves live in ``AutoTeacherTest`` and are
    presumably driven by the ``task`` attribute -- confirm there.
    """

    task = 'xpersona:It'


class TestKoTeacher(AutoTeacherTest):
    """
    Run the inherited ``AutoTeacherTest`` checks for ``xpersona:Ko``.

    NOTE(review): the checks themselves live in ``AutoTeacherTest`` and are
    presumably driven by the ``task`` attribute -- confirm there.
    """

    task = 'xpersona:Ko'


class TestJpTeacher(AutoTeacherTest):
    """
    Run the inherited ``AutoTeacherTest`` checks for ``xpersona:Jp``.

    NOTE(review): the checks themselves live in ``AutoTeacherTest`` and are
    presumably driven by the ``task`` attribute -- confirm there.
    """

    task = 'xpersona:Jp'
42 changes: 42 additions & 0 deletions parlai/tasks/xpersona/test/xpersona_En_test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
acts:
- - episode_done: false
eval_labels:
- i am good , i just got off work and tired , i have two jobs .
id: xpersona:En
reward: 0
text: 'your persona:i read twenty books a year.
your persona:i''m a stunt double as my second job.
your persona:i only eat kosher.
your persona:i was raised in a single parent household.
hello what are doing today ?'
- - episode_done: false
eval_labels:
- i rather read , i've read about 20 books this year .
id: xpersona:En
reward: 0
text: i just got done watching a horror movie
- - episode_done: false
eval_labels:
- but a good movie is always good .
id: xpersona:En
reward: 0
text: wow ! i do love a good horror movie . loving this cooler weather
- - episode_done: false
eval_labels:
- i work in the movies as well .
id: xpersona:En
reward: 0
text: yes ! my son is in junior high and i just started letting him watch them
too
- - episode_done: false
eval_labels:
- yes it is neat , i stunt double , it is so much fun and hard work .
id: xpersona:En
reward: 0
text: neat ! ! i used to work in the human services field
num_episodes: 1000
num_examples: 7801
Loading

0 comments on commit f451c6d

Please sign in to comment.