Skip to content
This repository has been archived by the owner on Nov 3, 2023. It is now read-only.

Commit

Permalink
[TOD][Datasets][Easy] Taskmaster(1) in Conversations format (#4189)
Browse files Browse the repository at this point in the history
  • Loading branch information
moyapchen authored Dec 23, 2021
1 parent ecf3e8a commit ce92103
Show file tree
Hide file tree
Showing 9 changed files with 546 additions and 3 deletions.
246 changes: 244 additions & 2 deletions parlai/tasks/taskmaster/agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,258 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""
Taskmaster-1 implementation for ParlAI.
Note that conversations are provided both in the "TOD" format and in the legacy
formats that predate it.
"""

from typing import Optional
from parlai.core.params import ParlaiParser
import os
import pandas as pd
from parlai.core.opt import Opt
from parlai.core.teachers import FixedDialogTeacher
import parlai.core.tod.tod_core as tod
from typing import Optional
from parlai.utils.data import DatatypeHelper
from parlai.utils.io import PathManager

import parlai.tasks.taskmaster.build as build_
import parlai.core.tod.tod_agents as tod_agents

# Following is for legacy format
from parlai.core.teachers import FixedDialogTeacher
from . import tm_utils
import json


################### TOD Conversation format

# Placeholder user utterance used when the assistant speaks first in a conversation.
SILENCE_TOKEN = "__SILENCE__"

# Taskmaster-1 ontology, inlined here because copying it is faster than parsing
# a json file at load time. Each entry is keyed by a short domain name and
# carries the API "id", its "vertical", and the "required" / "optional" slots.
# Insertion order is significant: consumers iterate this mapping in order.
ONTOLOGY = {
    "uber": {
        "id": "uber_lyft",
        "vertical": "ride_booking",
        "required": [
            "location.from",
            "location.to",
            "type.ride",
            "num.people",
        ],
        "optional": [
            "price.estimate",
            "duration.estimate",
            "time.pickup",
            "time.dropoff",
        ],
    },
    "movie": {
        "id": "movie_ticket",
        "vertical": "ticket_booking",
        "required": [
            "name.movie",
            "name.theater",
            "num.tickets",
            "time.start",
            "location.theater",
            "price.ticket",
        ],
        "optional": [
            "type.screening",
            "time.end",
            "time.duration",
        ],
    },
    "restaurant": {
        "id": "restaurant_reservation",
        "vertical": "reservation",
        "required": [
            "name.restaurant",
            "name.reservation",
            "num.guests",
            "time.reservation",
        ],
        "optional": [
            "type.seating",
            "location.restaurant",
        ],
    },
    "coffee": {
        "id": "coffee_ordering",
        "vertical": "coffee_order",
        "required": [
            "location.store",
            "name.drink",
            "size.drink",
        ],
        "optional": [
            "num.drink",
            "type.milk",
            "preference",
        ],
    },
    "pizza": {
        "id": "pizza_ordering",
        "vertical": "pizza_order",
        "required": [
            "name.store",
            "name.pizza",
            "size.pizza",
        ],
        "optional": [
            "type.topping",
            "type.crust",
            "preference",
            "location.store",
        ],
    },
    "auto": {
        "id": "auto_repair",
        "vertical": "appointment",
        "required": [
            "name.store",
            "name.customer",
            "date.appt",
            "time.appt",
        ],
        "optional": [
            "reason.appt",
            "name.vehicle",
            "year.vehicle",
            "location.store",
        ],
    },
}


class Taskmaster1Parser(tod_agents.TodStructuredDataParser):
    """
    Abstract data loader for Taskmaster-1 in the TOD conversation format.

    Reads ``self-dialogs.json`` and ``woz-dialogs.json`` from the
    ``taskmaster-1`` data directory, splits them deterministically into folds,
    and converts each conversation into a ``tod.TodStructuredEpisode``.
    """

    @classmethod
    def add_cmdline_args(
        cls, parser: ParlaiParser, partial_opt: Optional[Opt] = None
    ) -> ParlaiParser:
        """
        Defer to the parent class; no arguments are added here.
        """
        parser = super().add_cmdline_args(parser, partial_opt)
        return parser

    def __init__(self, opt: Opt, shared=None):
        """
        Resolve the fold and data path, build the data if needed, then init the parent.

        Note that ``opt["datafile"]`` is overwritten with the fold name.
        """
        self.fold = DatatypeHelper.fold(opt["datatype"])
        opt["datafile"] = self.fold
        self.dpath = os.path.join(opt["datapath"], "taskmaster-1")
        if shared is None:
            # Only the original (non-shared) instance triggers the build step.
            build_.build(opt)
        super().__init__(opt, shared)

    def _load_data(self, fold):
        """
        Load both dialogue files and return ``(rows_for_fold, ONTOLOGY)``.

        The concatenated rows are shuffled with a fixed seed and cut into
        tenths: the first eight tenths are "train", the ninth is "valid" and
        everything after that (including any remainder) is "test". Any other
        ``fold`` value returns the full shuffled frame.
        """
        frames = []
        for filename in ("self-dialogs.json", "woz-dialogs.json"):
            with PathManager.open(os.path.join(self.dpath, filename)) as f:
                frames.append(pd.read_json(f))
        chunks = pd.concat(frames, axis=0)
        # deterministic shuffle data for splits
        chunks = chunks.sample(frac=1.0, random_state=42)
        split_size = len(chunks) // 10
        if fold == "train":
            chunks = chunks[: split_size * 8]
        elif fold == "valid":
            chunks = chunks[split_size * 8 : split_size * 9]
        elif fold == "test":
            chunks = chunks[split_size * 9 :]
        return chunks, ONTOLOGY

    def _parse_segment_to_slots(self, segment_list):
        """
        Convert annotated segments into a flat ``{slot_name: slot_value}`` dict.

        Only the first annotation of each segment is used. Its name is split at
        the first "."; the prefix is stored under ``tod.STANDARD_API_NAME_SLOT``
        and the remainder becomes the slot name. When several segments are
        given, the API name of the last prefixed segment wins.

        Segments without annotations are skipped. A name without a "." is kept
        whole as the slot name and leaves the API name untouched (a plain
        ``str.find`` split would chop the last character off in that case).
        """
        result = {}
        for segment in segment_list:
            annotations = segment.get("annotations")
            if not annotations:
                # Nothing to name the slot with; skip instead of raising.
                continue
            full_name = annotations[0]["name"]
            slot_value = segment["text"]
            api_name, separator, slot_name = full_name.partition(".")
            if not separator:
                # No API prefix present: keep the name intact.
                result[full_name] = slot_value
                continue
            result[slot_name] = slot_value
            result[tod.STANDARD_API_NAME_SLOT] = api_name
        return result

    def _get_utterance_and_slots_for_speaker(self, speaker, utterances, idx):
        """
        Consume the run of consecutive utterances by ``speaker`` starting at ``idx``.

        Returns ``(next_idx, text, slots)`` where ``text`` is the consumed
        utterances joined by newlines and ``slots`` merges the slots of all of
        them. If the utterance at ``idx`` belongs to someone else, ``idx`` is
        returned unchanged with an empty string and an empty dict.
        """
        utts = []
        slots = {}
        while idx < len(utterances):
            here = utterances[idx]
            if here["speaker"] != speaker:
                break
            utts.append(here["text"])
            slots.update(self._parse_segment_to_slots(here.get("segments", [])))
            idx += 1
        return idx, "\n".join(utts), slots

    def _parse_to_api_schema(self, raw):
        """
        Convert the raw ontology into per-domain API schema dicts.

        NOTE: Format of ontology in this is different from TM2 + TM3. Need to
        figure out which is relevant for the domain.
        """
        return {
            key: {
                tod.STANDARD_API_NAME_SLOT: val["id"],
                tod.STANDARD_REQUIRED_KEY: val.get("required", []),
                tod.STANDARD_OPTIONAL_KEY: val.get("optional", []),
            }
            for key, val in raw.items()
        }

    def _get_turns_from_parsed(self, user_utt, api_calls, api_resps, sys_utt):
        """
        Wrap one user/system exchange into a single-element list of rounds.
        """
        return [
            tod.TodStructuredRound(
                user_utt=user_utt,
                api_call_machine=api_calls,
                api_resp_machine=api_resps,
                sys_utt=sys_utt,
            )
        ]

    def setup_episodes(self, fold):
        """
        Parses into TodStructuredEpisode.

        Each conversation becomes one episode. User slots are used as the API
        call and assistant slots as the API response of each round. The API
        schemas attached to an episode are those whose ontology key occurs in
        the row's ``instruction_id``.
        """
        chunks, api_schema_raw = self._load_data(fold)
        api_schemas_machine = self._parse_to_api_schema(api_schema_raw)
        episodes = []
        for _, row in chunks.iterrows():
            utterances = row["utterances"]
            if not all(
                x.get("speaker") in ("ASSISTANT", "USER") for x in utterances
            ):
                # An utterance with any other (or missing) speaker is consumed
                # by neither call in the loop below, so `idx` would never
                # advance. Skip such conversations entirely.
                continue
            idx = 0
            rounds = []
            goal_calls = []
            if len(utterances) > 0 and utterances[0]["speaker"] == "ASSISTANT":
                # Assistant opens the conversation: pair it with a silent user turn.
                idx, sys_utt, _ = self._get_utterance_and_slots_for_speaker(
                    "ASSISTANT", utterances, idx
                )
                rounds.extend(
                    self._get_turns_from_parsed(SILENCE_TOKEN, {}, {}, sys_utt)
                )

            while idx < len(utterances):
                idx, user_utt, user_slots = self._get_utterance_and_slots_for_speaker(
                    "USER", utterances, idx
                )
                idx, sys_utt, system_slots = self._get_utterance_and_slots_for_speaker(
                    "ASSISTANT", utterances, idx
                )
                # The annotations in this dataset don't make sense as api
                # responses but... we'll just roll.
                rounds.extend(
                    self._get_turns_from_parsed(
                        user_utt, user_slots, system_slots, sys_utt
                    )
                )

            instruction_id = row["instruction_id"]
            apis = [
                schema
                for candidate_api, schema in api_schemas_machine.items()
                if candidate_api in instruction_id
            ]
            episodes.append(
                tod.TodStructuredEpisode(
                    api_schemas_machine=apis,
                    goal_calls_machine=goal_calls,
                    rounds=rounds,
                    delex=self.opt.get("delex", False),
                )
            )
        return episodes

    def get_id_task_prefix(self):
        """
        Return the prefix used for teacher ids of this task.
        """
        return "Taskmaster1"

    def _label_fold(self, chunks):
        """
        Apply ``self._h`` to every ``conversation_id`` in ``chunks``.

        NOTE(review): ``_h`` is not defined in this class and nothing in this
        class calls ``_label_fold``; presumably a leftover or provided by a
        base class -- confirm before relying on it.
        """
        return chunks.conversation_id.apply(self._h)


class SystemTeacher(Taskmaster1Parser, tod_agents.TodSystemTeacher):
    """
    Taskmaster-1 teacher combining ``Taskmaster1Parser`` with ``TodSystemTeacher``.

    Adds no behavior of its own.
    """

    pass


class UserSimulatorTeacher(Taskmaster1Parser, tod_agents.TodUserSimulatorTeacher):
    """
    Taskmaster-1 teacher combining ``Taskmaster1Parser`` with
    ``TodUserSimulatorTeacher``.

    Adds no behavior of its own.
    """

    pass


############ Legacy defined teachers


class SelfDialogueTeacher(FixedDialogTeacher):
"""
Teacher for written two-person dialogues with labels being responses for the
Expand Down
2 changes: 1 addition & 1 deletion parlai/tasks/taskmaster/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def build(opt):
# get path to data directory
dpath = os.path.join(opt['datapath'], 'taskmaster-1')
# define version if any
version = "1.01"
version = "1.02"

# check if data had been previously built
if not build_data.built(dpath, version_string=version):
Expand Down
8 changes: 8 additions & 0 deletions parlai/tasks/taskmaster/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,11 @@ class TestWozDialogueTeacher(AutoTeacherTest):

class TestSelfDialogueSegmentTeacher(AutoTeacherTest):
    """
    AutoTeacherTest case for the ``taskmaster:self_dialogue_segment`` task.
    """

    task = "taskmaster:self_dialogue_segment"


class TestSystemTeacher(AutoTeacherTest):
    """
    AutoTeacherTest case for the ``taskmaster:SystemTeacher`` task.
    """

    task = "taskmaster:SystemTeacher"


class TestUserSimulatorTeacher(AutoTeacherTest):
    """
    AutoTeacherTest case for the ``taskmaster:UserSimulatorTeacher`` task.
    """

    task = "taskmaster:UserSimulatorTeacher"
45 changes: 45 additions & 0 deletions parlai/tasks/taskmaster/test/taskmaster_SystemTeacher_test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
acts:
- - domain: ''
episode_done: false
eval_labels:
- 'APIS: '
id: Taskmaster1_SystemTeacher
slots: {}
text: 'APIS: '
type: 'APIS: '
- - domain: ''
episode_done: false
eval_labels:
- 'APICALL: '
id: Taskmaster1_SystemTeacher
slots: {}
text: 'USER: __SILENCE__'
type: 'APICALL: '
- - domain: ''
episode_done: false
eval_labels:
- 'SYSTEM: hey there, how can i help you?'
id: Taskmaster1_SystemTeacher
slots: {}
text: 'APIRESP: '
type: 'SYSTEM: '
- - domain: ''
episode_done: false
eval_labels:
- 'APICALL: '
id: Taskmaster1_SystemTeacher
slots: {}
text: 'USER: Hi. I want to ride Uber in car.'
type: 'APICALL: '
- - domain: ''
episode_done: false
eval_labels:
- 'SYSTEM: sure, where are you heading to?
where is your destination?'
id: Taskmaster1_SystemTeacher
slots: {}
text: 'APIRESP: '
type: 'SYSTEM: '
num_episodes: 1326
num_examples: 33198
49 changes: 49 additions & 0 deletions parlai/tasks/taskmaster/test/taskmaster_SystemTeacher_train.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
acts:
- - domain: ''
episode_done: false
id: Taskmaster1_SystemTeacher
labels:
- 'APIS: '
slots: {}
text: 'APIS: '
type: 'APIS: '
- - domain: ''
episode_done: false
id: Taskmaster1_SystemTeacher
labels:
- 'APICALL: '
slots: {}
text: 'USER: __SILENCE__'
type: 'APICALL: '
- - domain: ''
episode_done: false
id: Taskmaster1_SystemTeacher
labels:
- 'SYSTEM: hi, how can i assist you?'
slots: {}
text: 'APIRESP: '
type: 'SYSTEM: '
- - domain: ''
episode_done: false
id: Taskmaster1_SystemTeacher
labels:
- 'APICALL: api_name = uber_lyft ; location.from.accept = the Wichita Dwight D.
Eisenhower National Airport ; location.to.accept = the Wichita State University
campus'
slots:
api_name: uber_lyft
location.from.accept: the Wichita Dwight D. Eisenhower National Airport
location.to.accept: the Wichita State University campus
text: 'USER: Hi, I need an Uber pickup from the Wichita Dwight D. Eisenhower National
Airport to the Wichita State University campus.'
type: 'APICALL: '
- - domain: ''
episode_done: false
id: Taskmaster1_SystemTeacher
labels:
- 'SYSTEM: what kind of ride would you like?'
slots: {}
text: 'APIRESP: '
type: 'SYSTEM: '
num_episodes: 10555
num_examples: 262021
Loading

0 comments on commit ce92103

Please sign in to comment.