diff --git a/.circleci/cached_datasets_list.txt b/.circleci/cached_datasets_list.txt new file mode 100644 index 0000000000..c345588868 --- /dev/null +++ b/.circleci/cached_datasets_list.txt @@ -0,0 +1,21 @@ +IMDB +AG_NEWS +SogouNews +DBpedia +YelpReviewPolarity +YelpReviewFull +YahooAnswers +AmazonReviewPolarity +AmazonReviewFull +UDPOS +CoNLL2000Chunking +Multi30k +IWSLT2016 +IWSLT2017 +WMT14 +WikiText2 +WikiText103 +PennTreebank +SQuAD1 +SQuAD2 +EnWik9 \ No newline at end of file diff --git a/.circleci/config.yml b/.circleci/config.yml index 32f99b26ff..b2c61d69f6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -44,7 +44,9 @@ commands: steps: - run: name: Generate CCI cache key - command: echo "$(date "+%D")" > .cachekey + command: | + echo "$(date "+%D")" > .cachekey + cat cached_datasets_list.txt >> .cachekey - persist_to_workspace: root: . paths: diff --git a/.circleci/config.yml.in b/.circleci/config.yml.in index 3830d7f881..3c997d777c 100644 --- a/.circleci/config.yml.in +++ b/.circleci/config.yml.in @@ -44,7 +44,9 @@ commands: steps: - run: name: Generate CCI cache key - command: echo "$(date "+%D")" > .cachekey + command: | + echo "$(date "+%D")" > .cachekey + cat cached_datasets_list.txt >> .cachekey - persist_to_workspace: root: . 
paths: diff --git a/test/common/cache_utils.py b/test/common/cache_utils.py index 404dbd0d24..16c62e7e35 100644 --- a/test/common/cache_utils.py +++ b/test/common/cache_utils.py @@ -9,11 +9,14 @@ def check_cache_status(): assert os.path.exists(CACHE_STATUS_FILE), "Cache status file does not exists" with open(CACHE_STATUS_FILE, 'r') as f: + missing_datasets = [] cache_status = json.load(f) for dataset_name in cache_status: for split in cache_status[dataset_name]: if cache_status[dataset_name][split]['status'] == "fail": - raise FileNotFoundError("Failing all raw dataset unit tests as cache is missing atleast one raw dataset") + missing_datasets.append(dataset_name + '_' + split) + if missing_datasets: + raise FileNotFoundError("Failing all raw dataset unit tests as cache is missing {} datasets".format(missing_datasets)) def generate_data_cache(): @@ -30,7 +33,7 @@ def generate_data_cache(): if dataset_name not in cache_status: cache_status[dataset_name] = {} try: - if dataset_name == "Multi30k" or dataset_name == 'WMT14': + if dataset_name == 'WMT14': _ = torchtext.experimental.datasets.raw.DATASETS[dataset_name](split=split) else: _ = torchtext.datasets.DATASETS[dataset_name](split=split)