Skip to content
This repository has been archived by the owner on Nov 3, 2023. It is now read-only.

XPersona Dataset #4314

Merged
merged 5 commits into from
Jan 25, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions parlai/tasks/task_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -1536,4 +1536,17 @@
),
"links": {"arXiv": "https://arxiv.org/abs/2110.07518"},
},
{
"id": "XPersona",
"display_name": "XPersona",
"task": "xpersona",
"tags": ["ChitChat"],
"description": (
"XPersona is an extension of ConvAI2 with six more languages: Chinese, French, Indonesian, Italian, Korean, and Japanese."
),
"links": {
"arXiv": "https://arxiv.org/pdf/2003.07568.pdf",
"website": "https://github.com/HLTCHKUST/Xpersona",
},
},
]
5 changes: 5 additions & 0 deletions parlai/tasks/xpersona/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
70 changes: 70 additions & 0 deletions parlai/tasks/xpersona/agents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import copy
import os
from .build import build
from parlai.core.teachers import FbDeprecatedDialogTeacher

'''This dataset is available in seven different languages.
To use the dataset in the specified language, use the task flag to specify it.
--task xpersona:{LANGUAGE}
The default language is English.
'''


def _path(opt):
    """
    Return the path of the ParlAI-format text file for the requested split.

    Triggers ``build(opt)`` first so the data exists, then assembles
    ``<datapath>/XPersona/<Language>_<split>.txt``.

    :param opt:
        options dict; reads ``datapath``, ``datatype`` and ``task``. Only the
        part of ``datatype`` before the first ``:`` is used as the split name.
        The language code is the second ``:``-separated field of ``task``
        (e.g. ``xpersona:Fr``); when it is missing or empty, ``En`` is used.

    :return:
        path to the text file for that language and split.
    """
    # build the data if it does not exist
    build(opt)

    # set up path to data (specific to each dataset)
    dt = opt['datatype'].split(':')[0]
    task_parts = opt['task'].split(':')
    language = task_parts[1] if len(task_parts) > 1 and task_parts[1] else 'En'
    # The builder writes files with capitalised prefixes ('En_', 'Zh_', ...),
    # so normalise the case to let 'xpersona:en' resolve to the same file as
    # 'xpersona:En'.
    language = language.capitalize()
    return os.path.join(opt['datapath'], 'XPersona', language + '_' + dt + '.txt')


class XPersonaTeacher(FbDeprecatedDialogTeacher):
    """
    Teacher that reads XPersona dialogues from the generated text files.

    The data file is resolved by ``_path`` from ``opt['datatype']`` and the
    language suffix of ``opt['task']``.
    """

    def __init__(self, opt, shared=None):
        # Work on a private copy so that adding 'datafile' does not modify the
        # caller's options.
        teacher_opt = copy.deepcopy(opt)
        teacher_opt['datafile'] = _path(teacher_opt)
        super().__init__(teacher_opt, shared)


class DefaultTeacher(XPersonaTeacher):
    """
    Teacher for a task string without a language suffix (``xpersona``).

    Adds nothing to XPersonaTeacher; ``_path`` falls back to ``En`` when the
    task string carries no language.
    """


class EnTeacher(XPersonaTeacher):
    """
    English teacher, intended for ``--task xpersona:En``.

    Adds nothing to XPersonaTeacher; the language is read from the task string.
    """


class ItTeacher(XPersonaTeacher):
    """
    Italian teacher, intended for ``--task xpersona:It``.

    Adds nothing to XPersonaTeacher; the language is read from the task string.
    """


class ZhTeacher(XPersonaTeacher):
    """
    Chinese teacher, intended for ``--task xpersona:Zh``.

    Adds nothing to XPersonaTeacher; the language is read from the task string.
    """


class IdTeacher(XPersonaTeacher):
    """
    Indonesian teacher, intended for ``--task xpersona:Id``.

    Adds nothing to XPersonaTeacher; the language is read from the task string.
    """


class FrTeacher(XPersonaTeacher):
    """
    French teacher, intended for ``--task xpersona:Fr``.

    Adds nothing to XPersonaTeacher; the language is read from the task string.
    """


class JpTeacher(XPersonaTeacher):
    """
    Japanese teacher, intended for ``--task xpersona:Jp``.

    Adds nothing to XPersonaTeacher; the language is read from the task string.
    """


class KoTeacher(XPersonaTeacher):
    """
    Korean teacher, intended for ``--task xpersona:Ko``.

    Adds nothing to XPersonaTeacher; the language is read from the task string.
    """
200 changes: 200 additions & 0 deletions parlai/tasks/xpersona/build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Download and build the data if it does not exist.

import parlai.core.build_data as build_data
import os
from parlai.core.build_data import DownloadableFile
import json
from parlai.utils.io import PathManager

# Raw JSON files fetched from the HLTCHKUST/Xpersona GitHub repository, three
# per language (train / valid / test).
#
# Each entry gives: the source URL, the temporary local file name, and a
# 64-character hex digest (presumably the SHA-256 checksum of the download --
# confirm against DownloadableFile). The files are not archives, hence
# zipped=False.
#
# The local names follow '<Lang>_<split>_tmp.json'; _create_parlai_format
# relies on that pattern to find, convert and then delete them.
#
# English uses the plain '<Lang>_persona_<split>.json' files; every other
# language uses the '_train_corrected' training file and the
# '_split_<split>_human_annotated' valid/test files.
RESOURCES = [
    # ---- English (En) ----
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/En_persona_test.json',
        'En_test_tmp.json',
        '8baa09a8064a22967544f501821aa114393a59339c0559da8afa160966ba87c9',
        zipped=False,
    ),
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/En_persona_train.json',
        'En_train_tmp.json',
        'e23112bba7320f798b07afb4c5acc3edad2a2ccb7df5cc46f141a0c79ff4665c',
        zipped=False,
    ),
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/En_persona_valid.json',
        'En_valid_tmp.json',
        '08ed3d41c5b0681c2d125a5312b43d926a8a5aa1d10a5df655d17f4c56dab635',
        zipped=False,
    ),
    # ---- French (Fr) ----
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Fr_persona_train_corrected.json',
        'Fr_train_tmp.json',
        '40e66e91aa6360eeda642c2c674f03c52854626eaf35da50084d26ff42f61292',
        zipped=False,
    ),
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Fr_persona_split_test_human_annotated.json',
        'Fr_test_tmp.json',
        '0783fcf01bdf4c27ec28120a9b23bf4f0248e97ae476e03cd7e759cb2667ab23',
        zipped=False,
    ),
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Fr_persona_split_valid_human_annotated.json',
        'Fr_valid_tmp.json',
        '8ad86b05aabfadedba7863828b1cc4fdff0926ebf476121268089ac7ed9af149',
        zipped=False,
    ),
    # ---- Indonesian (Id) ----
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Id_persona_train_corrected.json',
        'Id_train_tmp.json',
        'fee1a9769fe707fd09401c33bdf3b3cd4f7b5fd100998577f3f179c42423bc4f',
        zipped=False,
    ),
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Id_persona_split_test_human_annotated.json',
        'Id_test_tmp.json',
        'e0bcd3c02f318f4381c42798a2ce6e0a10237b6998793076fbf97f633a9f2563',
        zipped=False,
    ),
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Id_persona_split_valid_human_annotated.json',
        'Id_valid_tmp.json',
        '8f70cab662f082ae3ee2abce9e9cac619ebc632a2820d93414a59005dbf75d7e',
        zipped=False,
    ),
    # ---- Italian (It) ----
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/It_persona_train_corrected.json',
        'It_train_tmp.json',
        '9636893050ad16dc2daabfe8bde6979c9c780c5da9d0790f5668e198aac18b8f',
        zipped=False,
    ),
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/It_persona_split_test_human_annotated.json',
        'It_test_tmp.json',
        '690103031791fd5c6763176a074cdb65e249adc784e2d577110c2d9430a02a87',
        zipped=False,
    ),
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/It_persona_split_valid_human_annotated.json',
        'It_valid_tmp.json',
        '720dc91d3f9bc6a56ac229c6800cde63c71677a3db8d0ace7a59a7f94d89df3d',
        zipped=False,
    ),
    # ---- Japanese (Jp) ----
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Jp_persona_train_corrected.json',
        'Jp_train_tmp.json',
        '808ee29c79303e1300b38aceee79111ffe1fd2a2facb09a0956615a61d840738',
        zipped=False,
    ),
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Jp_persona_split_test_human_annotated.json',
        'Jp_test_tmp.json',
        '682fed16148517097437942088d225bd728cb7b41aa390559681ae73e5e6848f',
        zipped=False,
    ),
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Jp_persona_split_valid_human_annotated.json',
        'Jp_valid_tmp.json',
        'a86bb811364d100bc77ddc8038265df1e01bbc3be095e56c6ec8f179e6365d75',
        zipped=False,
    ),
    # ---- Korean (Ko) ----
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Ko_persona_train_corrected.json',
        'Ko_train_tmp.json',
        '105d6a08d02e76f1d006edb1819b96a8b5fa8d94b3ed278936bcf171368809b7',
        zipped=False,
    ),
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Ko_persona_split_test_human_annotated.json',
        'Ko_test_tmp.json',
        'f7ac6bd2aec7014a28d34bb34dceed653b389ce25d21ca770e77142578dc70a6',
        zipped=False,
    ),
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Ko_persona_split_valid_human_annotated.json',
        'Ko_valid_tmp.json',
        '188470f863f639946bc8248a9f6aa1e589b41ec61792b88b81e7a95c72deeae0',
        zipped=False,
    ),
    # ---- Chinese (Zh) ----
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Zh_persona_train_corrected.json',
        'Zh_train_tmp.json',
        'e07899fa91edd127ec77502bd604693c40e264b60225976e2ac6ed145d080323',
        zipped=False,
    ),
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Zh_persona_split_test_human_annotated.json',
        'Zh_test_tmp.json',
        '0767a4a27c765277792597502f57ea8bb80bf7be94613b0d833107f66a7d3512',
        zipped=False,
    ),
    DownloadableFile(
        'https://raw.githubusercontent.com/HLTCHKUST/Xpersona/master/dataset/Zh_persona_split_valid_human_annotated.json',
        'Zh_valid_tmp.json',
        'cfa90117d73fe294a1b776b2d1c7b53711bfcd724f5833204726c910dad5482d',
        zipped=False,
    ),
]


def build(opt):
    """
    Download the XPersona files and convert them, unless already built.

    Works inside ``<datapath>/XPersona``. When ``build_data.built`` reports
    the directory as built for the current version, nothing happens.
    Otherwise the directory is (re)created, every entry of ``RESOURCES`` is
    downloaded into it, the JSON files are converted to text, and the
    directory is marked as done.

    :param opt:
        options dict; only ``opt['datapath']`` is read here.
    """
    version = None
    dpath = os.path.join(opt['datapath'], 'XPersona')

    # Nothing to do when this version of the data is already in place.
    if build_data.built(dpath, version_string=version):
        return

    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Download the data, then convert once all files are present.
    for resource in RESOURCES:
        resource.download_file(dpath)
    _create_parlai_format(dpath)

    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)


def _create_parlai_format(dpath: str):
    """
    Convert every downloaded ``*_tmp.json`` file into a dialog text file.

    For each language prefix and each of train/valid/test, reads
    ``<Lang>_<split>_tmp.json`` from ``dpath``, writes ``<Lang>_<split>.txt``
    into the same directory, and then deletes the JSON file.

    Each JSON record is read through its ``persona`` entry (iterated as
    strings) and its ``dialogue`` entry (iterated as two-item turns). Per
    record, the output holds one ``<n> your persona:<text>`` line per persona
    sentence followed by one ``<n> <first>\\t<second>`` line per turn. The
    line number ``<n>`` restarts at 1 for every record.

    :param dpath:
        directory containing the downloaded JSON files; the text files are
        written there as well.
    """
    datatypes = ['train', 'valid', 'test']
    languages = ['En_', 'Zh_', 'Fr_', 'Ko_', 'Id_', 'Jp_', 'It_']
    for language in languages:
        for datatype in datatypes:
            stem = language + datatype
            load_path = os.path.join(dpath, f'{stem}_tmp.json')
            save_path = os.path.join(dpath, f'{stem}.txt')

            with PathManager.open(load_path, 'r', encoding='utf8') as f_read:
                data = json.load(f_read)

            with PathManager.open(save_path, 'w', encoding='utf8') as f_write:
                for content in data:
                    # Read both fields before writing anything for this record.
                    personas = content['persona']
                    dialogs = content['dialogue']

                    line_num = 0
                    for persona in personas:
                        line_num += 1
                        f_write.write(
                            ''.join((str(line_num), ' your persona:', persona, '\n'))
                        )
                    for utterance_a, utterance_b in dialogs:
                        line_num += 1
                        f_write.write(
                            ''.join(
                                (str(line_num), ' ', utterance_a, '\t', utterance_b, '\n')
                            )
                        )

            # The temporary JSON is no longer needed once the text file exists.
            os.remove(load_path)
39 changes: 39 additions & 0 deletions parlai/tasks/xpersona/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from parlai.utils.testing import AutoTeacherTest # noqa: F401


class TestDefaultTeacher(AutoTeacherTest):
    """Run the AutoTeacherTest checks on the task without a language suffix."""

    task = 'xpersona'


class TestEnTeacher(AutoTeacherTest):
    """Run the AutoTeacherTest checks on the English (En) task."""

    task = 'xpersona:En'


class TestZhTeacher(AutoTeacherTest):
    """Run the AutoTeacherTest checks on the Chinese (Zh) task."""

    task = 'xpersona:Zh'


class TestFrTeacher(AutoTeacherTest):
    """Run the AutoTeacherTest checks on the French (Fr) task."""

    task = 'xpersona:Fr'


class TestIdTeacher(AutoTeacherTest):
    """Run the AutoTeacherTest checks on the Indonesian (Id) task."""

    task = 'xpersona:Id'


class TestItTeacher(AutoTeacherTest):
    """Run the AutoTeacherTest checks on the Italian (It) task."""

    task = 'xpersona:It'


class TestKoTeacher(AutoTeacherTest):
    """Run the AutoTeacherTest checks on the Korean (Ko) task."""

    task = 'xpersona:Ko'


class TestJpTeacher(AutoTeacherTest):
    """Run the AutoTeacherTest checks on the Japanese (Jp) task."""

    task = 'xpersona:Jp'
42 changes: 42 additions & 0 deletions parlai/tasks/xpersona/test/xpersona_En_test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
acts:
- - episode_done: false
eval_labels:
- i am good , i just got off work and tired , i have two jobs .
id: xpersona:En
reward: 0
text: 'your persona:i read twenty books a year.
your persona:i''m a stunt double as my second job.
your persona:i only eat kosher.
your persona:i was raised in a single parent household.
hello what are doing today ?'
- - episode_done: false
eval_labels:
- i rather read , i've read about 20 books this year .
id: xpersona:En
reward: 0
text: i just got done watching a horror movie
- - episode_done: false
eval_labels:
- but a good movie is always good .
id: xpersona:En
reward: 0
text: wow ! i do love a good horror movie . loving this cooler weather
- - episode_done: false
eval_labels:
- i work in the movies as well .
id: xpersona:En
reward: 0
text: yes ! my son is in junior high and i just started letting him watch them
too
- - episode_done: false
eval_labels:
- yes it is neat , i stunt double , it is so much fun and hard work .
id: xpersona:En
reward: 0
text: neat ! ! i used to work in the human services field
num_episodes: 1000
num_examples: 7801
Loading