
[CMU_DoG] Download data from source #3615

Merged
merged 2 commits on Apr 23, 2021

Changes from 1 commit
147 changes: 142 additions & 5 deletions parlai/tasks/cmu_dog/build.py
@@ -3,22 +3,32 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import json
import os
import random
import shutil
from tqdm import tqdm

import parlai.core.build_data as build_data
from parlai.core.build_data import DownloadableFile
from parlai.utils.io import PathManager
from parlai.utils.logging import logger


RESOURCES = [
DownloadableFile(
        'https://github.com/festvox/datasets-CMU_DoG/archive/618a14f27546165859305649aa84e6ac8710bb63.zip',
        'cmu_dog.zip',
        'f8ba8820cf86ee1c196b237b0cde80edba940e4ddea28c582830f6d098b3c769',
)
]

UNZIPPED_PARENT_DIR = 'datasets-CMU_DoG-618a14f27546165859305649aa84e6ac8710bb63'
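
For reference, the SHA-256 recorded above can be re-checked against a manually downloaded archive using only the standard library (a sketch; the local filename is an assumption):

import hashlib

def sha256_of(path: str) -> str:
    # Stream in 1 MiB chunks so large archives are not read into memory at once.
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):
            h.update(chunk)
    return h.hexdigest()

# sha256_of('cmu_dog.zip') should equal the checksum in RESOURCES.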


def build(opt):
dpath = os.path.join(opt['datapath'], 'cmu_dog')
    version = '1.2'
if not build_data.built(dpath, version):
print('[building data: ' + dpath + ']')
if build_data.built(dpath):
@@ -30,4 +40,131 @@ def build(opt):
for downloadable_file in RESOURCES:
downloadable_file.download_file(dpath)

move_unzipped_files_up(dpath)
consolidate_wiki_data(dpath)
consolidate_convos(dpath)
build_deduped_split(dpath)
split_into_seen_unseen(dpath)

build_data.mark_done(dpath, version)
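
A minimal sketch of invoking this build directly, assuming a standard ParlAI install; build() only reads opt['datapath'] in the code shown here, so a plain dict is enough for illustration (teachers normally pass a full opt):

from parlai.tasks.cmu_dog.build import build

build({'datapath': '/tmp/parlai_data'})  # datapath is an assumption
# Afterwards, /tmp/parlai_data/cmu_dog should contain wiki_data.json and the
# consolidated conversations/ splits.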


def move_unzipped_files_up(dpath: str):
    # The GitHub archive unzips into a commit-named wrapper directory; move its
    # contents up into dpath and remove the empty wrapper.
    unzipped_path = os.path.join(dpath, UNZIPPED_PARENT_DIR)
    for f in os.listdir(unzipped_path):
        shutil.move(os.path.join(unzipped_path, f), dpath)
    shutil.rmtree(unzipped_path)
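
One detail the loop above relies on: when the destination of shutil.move is an existing directory, the source is moved inside it rather than renamed over it. A self-contained illustration:

import os
import shutil
import tempfile

src_dir = tempfile.mkdtemp()
dst_dir = tempfile.mkdtemp()
open(os.path.join(src_dir, 'x.txt'), 'w').close()
shutil.move(os.path.join(src_dir, 'x.txt'), dst_dir)
assert os.path.exists(os.path.join(dst_dir, 'x.txt'))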


def consolidate_wiki_data(dpath: str):
all_articles = {}
src_dir = os.path.join(dpath, 'WikiData')
for f_name in tqdm(os.listdir(src_dir)):
with open(os.path.join(src_dir, f_name)) as f:
wiki_page = json.load(f)
idx = wiki_page['wikiDocumentIdx']
all_articles[idx] = wiki_page
dest_path = os.path.join(dpath, 'wiki_data.json')
with open(dest_path, 'w') as d:
json.dump(all_articles, d, indent=2)
shutil.rmtree(src_dir)
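
For reference, a sketch of reading the consolidated articles back. One subtlety: json.dump stringifies the integer wikiDocumentIdx keys, so lookups must use string keys (the path below is an assumption):

import json
import os

datapath = '/tmp/parlai_data/cmu_dog'  # assumed location
with open(os.path.join(datapath, 'wiki_data.json')) as f:
    articles = json.load(f)
movie_one = articles['1']  # JSON object keys are always strings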


def consolidate_convos(dpath: str):
os.makedirs(os.path.join(dpath, 'conversations'), exist_ok=True)
for split in ['train', 'valid', 'test']:
consolidate_convo_split(dpath, split)


def consolidate_convo_split(dpath: str, split: str):
all_convos = {}
src_dir = os.path.join(dpath, 'Conversations', split)
for f_name in tqdm(os.listdir(src_dir)):
with open(os.path.join(src_dir, f_name)) as f:
convo = json.load(f)
cid = f_name.split('.')[0]
all_convos[cid] = convo
dest_path = os.path.join(dpath, 'conversations', f"{split}.json")
with open(dest_path, 'w') as dest:
json.dump(all_convos, dest, indent=2)
shutil.rmtree(src_dir)


def build_deduped_split(dpath: str):
"""
    The original CMU-DoG release has 110 conversation ids that appear in more
    than one of the train/valid/test splits.

    Get rid of the duplication.
"""
cdir = os.path.join(dpath, "conversations")
data = {}
for fold in ["test", "valid", "train"]:
fpath = os.path.join(cdir, f"{fold}.json")
with PathManager.open(fpath) as f:
data[fold] = json.load(f)

train_len = len(data["train"])
valid_len = len(data["valid"])
test_len = len(data["test"])
logger.info(
f"Converation count with duplicates: train-{train_len}, valid-{valid_len}, test-{test_len}"
)

train_valid = set(data["train"].keys()) & set(data["valid"].keys())
train_test = set(data["train"].keys()) & set(data["test"].keys())
valid_test = set(data["valid"].keys()) & set(data["test"].keys())

    # Resolve each duplicate by keeping it in only one fold:
    # valid wins over train and test; test wins over train.
    for key in train_valid:
        data["train"].pop(key)
    for key in train_test:
        data["train"].pop(key)
    for key in valid_test:
        data["test"].pop(key)

train_len = len(data["train"])
valid_len = len(data["valid"])
test_len = len(data["test"])
logger.info(
f"Converation count without duplicates: train-{train_len}, valid-{valid_len}, test-{test_len}"
)

for fold in ["test", "valid", "train"]:
fpath = os.path.join(cdir, f"{fold}_deduped.json")
with PathManager.open(fpath, "w+") as f:
json.dump(data[fold], f, indent=2)
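
A quick sanity check one could run over the files written above (a sketch; the directory is an assumption): after deduping, the three folds should be pairwise disjoint on conversation ids.

import json
import os

cdir = '/tmp/parlai_data/cmu_dog/conversations'  # assumed datapath
ids = {}
for fold in ['train', 'valid', 'test']:
    with open(os.path.join(cdir, f'{fold}_deduped.json')) as f:
        ids[fold] = set(json.load(f))  # iterating a dict yields its keys
assert not ids['train'] & ids['valid']
assert not ids['train'] & ids['test']
assert not ids['valid'] & ids['test']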


def split_into_seen_unseen(dpath: str):
"""
Following WoW, we have overlap in train, valid, and test seen but none in test
valid. Do an 80:10:5:5 split between train, valid, test_seen, test_unseen or as
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: "none in test unseen"

close to it.

~205 documents for test_unseen to do this, and movies 1 and 3 have 90 and 117
movies, respectively, which is about that
"""
random.seed(42)
cdir = os.path.join(dpath, "conversations")
new = {"train": {}, "valid": {}, "test_seen": {}, "test_unseen": {}}
for fold in ["test", "valid", "train"]:
with PathManager.open(os.path.join(cdir, f"{fold}_deduped.json")) as f:
data = json.load(f)
for k, v in data.items():
if v["wikiDocumentIdx"] == 1 or v["wikiDocumentIdx"] == 3:
new["test_unseen"][k] = v
else:
rand = random.randint(1, 95)
if rand <= 80:
new["train"][k] = v
elif rand <= 90:
new["valid"][k] = v
else:
new["test_seen"][k] = v

for fold in new:
with PathManager.open(
os.path.join(cdir, f"{fold}_split_seen_unseen.json"), "w+"
) as f:
json.dump(new[fold], f, indent=2)
c_cnt = len(new[fold])
logger.info(f"Seen/unseen {fold} conversation count: {c_cnt}")
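
As a back-of-envelope check on the ratios above (not part of the build): conversations for movies 1 and 3 are held out deterministically as test_unseen, roughly 5% of the data, and the remaining pool is split by randint(1, 95):

p_train = 80 / 95      # ~0.842 of the seen pool, ~80% overall
p_valid = 10 / 95      # ~0.105 of the seen pool, ~10% overall
p_test_seen = 5 / 95   # ~0.053 of the seen pool, ~5% overall
# Together with the ~5% unseen hold-out, this approximates the 80:10:5:5 target.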
54 changes: 0 additions & 54 deletions parlai/tasks/cmu_dog/consolidate.py

This file was deleted.

58 changes: 0 additions & 58 deletions parlai/tasks/cmu_dog/split_by_movie.py

This file was deleted.