Skip to content
This repository has been archived by the owner on Nov 3, 2023. It is now read-only.

[task] Fix Cornell Movies #3627

Merged
merged 3 commits into from
Apr 30, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion parlai/core/torch_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -1687,7 +1687,7 @@ def batchify(self, obs_batch, sort=False):
)
if any('label_truncated_length' in ex for ex in exs):
label_truncated_lengths = torch.LongTensor(
[ex.get('label_truncated_length') for ex in exs]
[ex.get('label_truncated_length', 0) for ex in exs]
)
field = 'labels' if labels_avail else 'eval_labels'

Expand Down
130 changes: 46 additions & 84 deletions parlai/tasks/cornell_movie/agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,109 +4,71 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from parlai.core.teachers import FbDeprecatedDialogTeacher
from parlai.core.teachers import DialogTeacher
from .build import build
from parlai.utils.data import DatatypeHelper

import copy
import os
import codecs


def _path(opt, filtered):
# Build the data if it doesn't exist.
build(opt)
dt = opt['datatype'].split(':')[0]
return os.path.join(opt['datapath'], 'CornellMovie', dt + filtered + '.txt')
def _path(opt, *additions):
return os.path.join(opt['datapath'], 'CornellMovie', *additions)
spencerp marked this conversation as resolved.
Show resolved Hide resolved


class DefaultTeacher(FbDeprecatedDialogTeacher):
class DefaultTeacher(DialogTeacher):
DOUBLE = False

def __init__(self, opt, shared=None):
opt = copy.deepcopy(opt)
opt['datafile'] = _path(opt, '')
opt['cands_datafile'] = opt['datafile']
self.fold = DatatypeHelper.fold(opt['datatype'])
opt['datafile'] = _path(opt, self.fold + '.txt')
super().__init__(opt, shared)

def num_examples(self):
if self.fold == 'train':
return 133125
elif self.fold == 'valid':
return 16759
elif self.fold == 'test':
return 16611
def setup_data(self, datafile):
lines_file = _path(self.opt, 'cornell movie-dialogs corpus', 'movie_lines.txt')
convo_file = _path(
self.opt, 'cornell movie-dialogs corpus', 'movie_conversations.txt'
)

lines = {}

codecs.register_error('strict', codecs.ignore_errors)
with codecs.open(lines_file, 'r') as f:
for line in f:
l = line.split(' +++$+++ ')
lines[l[0]] = ' '.join(l[4:]).strip('\n').replace('\t', ' ')

cnt = 0
with codecs.open(convo_file, 'r') as f:
for cnt, line in enumerate(f, 1):
l = line.split(' ')
convo = ' '.join(l[6:]).strip('\n').strip('[').strip(']')
c = convo.replace("'", '').replace(' ', '').split(',')

texts = [lines[l] for l in c]

if (cnt % 10 == 0) and self.fold != 'test':
continue
elif (cnt % 10 == 1) and self.fold != 'valid':
continue
elif (cnt % 10 > 1) and self.fold != 'train':
continue

for i, (prompt, response) in enumerate(zip(texts[::2], texts[1::2])):
yield {'text': prompt, 'label': response}, i == 0

def num_episodes(self):
if self.fold == 'train':
return 66478
elif self.fold == 'valid':
return 8310
elif self.fold == 'test':
return 8309
if self.DOUBLE:
for i, (prompt, response) in enumerate(
zip(texts[1::2], texts[2::2])
):
yield {'text': prompt, 'label': response}, i == 0


class DoubleTeacher(DefaultTeacher):
"""
This version creates text-label pairs from the perspective of both speakers.
"""

def num_examples(self):
if self.fold == 'train':
return 176975
elif self.fold == 'valid':
return 22349
elif self.fold == 'test':
return 22013

def num_episodes(self):
if self.fold == 'train':
return 102401
elif self.fold == 'valid':
return 12806
elif self.fold == 'test':
return 12790

def _rebuild(self, entries):
new_list = []
if len(entries) > 0:
# add all ( y_t => x_(t+1) ) pairs
new_list.extend(
[
(entries[i][1][0], [entries[i + 1][0]])
for i in range(len(entries) - 1)
]
)
return new_list

def _is_valid(self, entry):
if entry[0] == '' or entry[1] is None:
return False
return True

def setup_data(self, path):
"""
Adds additional perspectives. For example, in the conversation:

x1 y1
x2 y2
x3

Creates the additional dialog:

y1 x2
y2 x3
"""
# this shows conversations in both directions
alternate = []
for entry, new in super().setup_data(path):
if new:
for i, e in enumerate(self._rebuild(alternate)):
if self._is_valid(e):
yield e, i == 0
alternate.clear()
alternate.append(entry)
if self._is_valid(entry):
yield entry, new
if alternate:
for i, e in enumerate(self._rebuild(alternate)):
if self._is_valid(e):
yield e, i == 0
DOUBLE = True
53 changes: 1 addition & 52 deletions parlai/tasks/cornell_movie/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,55 +20,11 @@
]


def create_fb_format(lines_file, convo_file, outpath):
print('[building fbformat]')
with PathManager.open(
os.path.join(outpath, 'train.txt'), 'w'
) as ftrain, PathManager.open(
os.path.join(outpath, 'valid.txt'), 'w'
) as fvalid, PathManager.open(
os.path.join(outpath, 'test.txt'), 'w'
) as ftest:
lines = {}

codecs.register_error('strict', codecs.ignore_errors)
with codecs.open(lines_file, 'r') as f:
for line in f:
l = line.split(' +++$+++ ')
lines[l[0]] = ' '.join(l[4:]).strip('\n').replace('\t', ' ')

cnt = 0
with codecs.open(convo_file, 'r') as f:
for line in f:
l = line.split(' ')
convo = ' '.join(l[6:]).strip('\n').strip('[').strip(']')
c = convo.replace("'", '').replace(' ', '').split(',')

# forward conversation
s = ''
index = 0
for i in range(0, len(c), 2):
index += 1
s += str(index) + ' ' + lines[c[i]]
if len(c) > i + 1:
s += '\t' + lines[c[i + 1]]
s += '\n'

cnt = cnt + 1
handle = ftrain
if (cnt % 10) == 0:
handle = ftest
if (cnt % 10) == 1:
handle = fvalid
handle.write(s + '\n')


def build(opt):
dpath = os.path.join(opt['datapath'], 'CornellMovie')
version = None
version = 'v1.01'

if not build_data.built(dpath, version_string=version):
print('[building data: ' + dpath + ']')
if build_data.built(dpath):
# An older version exists, so remove these outdated files.
build_data.remove_dir(dpath)
Expand All @@ -78,12 +34,5 @@ def build(opt):
for downloadable_file in RESOURCES:
downloadable_file.download_file(dpath)

dpext = os.path.join(dpath, 'cornell movie-dialogs corpus')
create_fb_format(
os.path.join(dpext, 'movie_lines.txt'),
os.path.join(dpext, 'movie_conversations.txt'),
dpath,
)

# Mark the data as built.
build_data.mark_done(dpath, version_string=version)
128 changes: 2 additions & 126 deletions parlai/tasks/cornell_movie/test/cornell_movie_double_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,152 +3,28 @@ acts:
eval_labels:
- You're sweet.
id: cornell_movie:double
label_candidates:
- ' And he agreed?'
- ' The only one who understands what this me-...'
- '"...unsure whether or not Enemy Action..."'
- '"A Reason to Love."'
- '"A people is a detour of nature to get 6 or 7 great men - Yes, and then to
get around them..." Nietzsche said that.'
- '"Cora" is my part. You''ve got to tell Lloyd it''s for me.'
- '"Debbie Does Dallas"... Hell, it''s in Russian. I can''t read it...'
- '"Deserve" don''t mean shit, Little Bill.'
- '"Deutchland, Deutchland ... "'
- '"Do chickens give milk?"'
- '[Your brother wants to talk to you.]'
- '`Cause you know I can do other stuff. I mean, if you wanted me to talk or...'
- and HERBERT exchange a glance. HONORA smiles at Juliet.
- frowns at Henry.
- how much weight have you lost?
- kissed a lotta tadpoles. Listen, I been thinking about your problem. I'm not
the guy to sponsor you. It would be unethical. But, there is something I could
do for you. Putt-putt golf.
- ooohhhhhh no.
- xxxxxx
- yeah, I had a bad night.
- yeah.
reward: 0
text: You have my word. As a gentleman
- - episode_done: false
eval_labels:
- What crap?
id: cornell_movie:double
label_candidates:
- ' And he agreed?'
- ' The only one who understands what this me-...'
- '"...unsure whether or not Enemy Action..."'
- '"A Reason to Love."'
- '"A people is a detour of nature to get 6 or 7 great men - Yes, and then to
get around them..." Nietzsche said that.'
- '"Cora" is my part. You''ve got to tell Lloyd it''s for me.'
- '"Debbie Does Dallas"... Hell, it''s in Russian. I can''t read it...'
- '"Deserve" don''t mean shit, Little Bill.'
- '"Deutchland, Deutchland ... "'
- '"Do chickens give milk?"'
- '[Your brother wants to talk to you.]'
- '`Cause you know I can do other stuff. I mean, if you wanted me to talk or...'
- and HERBERT exchange a glance. HONORA smiles at Juliet.
- frowns at Henry.
- how much weight have you lost?
- kissed a lotta tadpoles. Listen, I been thinking about your problem. I'm not
the guy to sponsor you. It would be unethical. But, there is something I could
do for you. Putt-putt golf.
- ooohhhhhh no.
- xxxxxx
- yeah, I had a bad night.
- yeah.
reward: 0
text: do you listen to this crap?
- - episode_done: true
eval_labels:
- Thank God! If I had to hear one more story about your coiffure...
id: cornell_movie:double
label_candidates:
- ' And he agreed?'
- ' The only one who understands what this me-...'
- '"...unsure whether or not Enemy Action..."'
- '"A Reason to Love."'
- '"A people is a detour of nature to get 6 or 7 great men - Yes, and then to
get around them..." Nietzsche said that.'
- '"Cora" is my part. You''ve got to tell Lloyd it''s for me.'
- '"Debbie Does Dallas"... Hell, it''s in Russian. I can''t read it...'
- '"Deserve" don''t mean shit, Little Bill.'
- '"Deutchland, Deutchland ... "'
- '"Do chickens give milk?"'
- '[Your brother wants to talk to you.]'
- '`Cause you know I can do other stuff. I mean, if you wanted me to talk or...'
- and HERBERT exchange a glance. HONORA smiles at Juliet.
- frowns at Henry.
- how much weight have you lost?
- kissed a lotta tadpoles. Listen, I been thinking about your problem. I'm not
the guy to sponsor you. It would be unethical. But, there is something I could
do for you. Putt-putt golf.
- ooohhhhhh no.
- xxxxxx
- yeah, I had a bad night.
- yeah.
reward: 0
text: Me. This endless ...blonde babble. I'm like, boring myself.
- - episode_done: true
eval_labels:
- Me. This endless ...blonde babble. I'm like, boring myself.
id: cornell_movie:double
label_candidates:
- ' And he agreed?'
- ' The only one who understands what this me-...'
- '"...unsure whether or not Enemy Action..."'
- '"A Reason to Love."'
- '"A people is a detour of nature to get 6 or 7 great men - Yes, and then to
get around them..." Nietzsche said that.'
- '"Cora" is my part. You''ve got to tell Lloyd it''s for me.'
- '"Debbie Does Dallas"... Hell, it''s in Russian. I can''t read it...'
- '"Deserve" don''t mean shit, Little Bill.'
- '"Deutchland, Deutchland ... "'
- '"Do chickens give milk?"'
- '[Your brother wants to talk to you.]'
- '`Cause you know I can do other stuff. I mean, if you wanted me to talk or...'
- and HERBERT exchange a glance. HONORA smiles at Juliet.
- frowns at Henry.
- how much weight have you lost?
- kissed a lotta tadpoles. Listen, I been thinking about your problem. I'm not
the guy to sponsor you. It would be unethical. But, there is something I could
do for you. Putt-putt golf.
- ooohhhhhh no.
- xxxxxx
- yeah, I had a bad night.
- yeah.
text: What crap?
- - episode_done: true
eval_labels:
- Sometimes I wonder if the guys we're supposed to want to go out with are the
ones we actually want to go out with, you know?
id: cornell_movie:double
label_candidates:
- ' And he agreed?'
- ' The only one who understands what this me-...'
- '"...unsure whether or not Enemy Action..."'
- '"A Reason to Love."'
- '"A people is a detour of nature to get 6 or 7 great men - Yes, and then to
get around them..." Nietzsche said that.'
- '"Cora" is my part. You''ve got to tell Lloyd it''s for me.'
- '"Debbie Does Dallas"... Hell, it''s in Russian. I can''t read it...'
- '"Deserve" don''t mean shit, Little Bill.'
- '"Deutchland, Deutchland ... "'
- '"Do chickens give milk?"'
- '[Your brother wants to talk to you.]'
- '`Cause you know I can do other stuff. I mean, if you wanted me to talk or...'
- and HERBERT exchange a glance. HONORA smiles at Juliet.
- frowns at Henry.
- how much weight have you lost?
- kissed a lotta tadpoles. Listen, I been thinking about your problem. I'm not
the guy to sponsor you. It would be unethical. But, there is something I could
do for you. Putt-putt golf.
- ooohhhhhh no.
- xxxxxx
- yeah, I had a bad night.
- yeah.
reward: 0
text: Bianca, I don't think the highlights of dating Joey Dorsey are going to
include door-opening and coat-holding.
num_episodes: 12790
num_examples: 22013
num_episodes: 8309
num_examples: 13725
Loading