Skip to content

Commit

Permalink
Fix #117: allow automatic download of videos packaged in tar format
Browse files Browse the repository at this point in the history
  • Loading branch information
teowu committed Jun 16, 2024
1 parent 98b3955 commit 5bf59ed
Showing 1 changed file with 5 additions and 11 deletions.
16 changes: 5 additions & 11 deletions lmms_eval/api/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -776,7 +776,6 @@ def _download_from_youtube(path):
if accelerator.is_main_process:
force_download = dataset_kwargs.get("force_download", False)
force_unzip = dataset_kwargs.get("force_unzip", False)
print(force_download)
cache_path = snapshot_download(repo_id=self.DATASET_PATH, repo_type="dataset", force_download=force_download, etag_timeout=60)
zip_files = glob(os.path.join(cache_path, "**/*.zip"), recursive=True)
tar_files = glob(os.path.join(cache_path, "**/*.tar*"), recursive=True)
Expand All @@ -797,15 +796,11 @@ def untar_video_data(tar_file):


def concat_tar_parts(tar_parts, output_tar):
    """Join split tar archive parts into a single tar file.

    The parts are appended in sorted (lexicographic) path order, so the
    part suffixes must already sort correctly as strings.

    Args:
        tar_parts: Paths of the split archive parts to join.
        output_tar: Path of the combined archive; it is opened in ``'wb'``
            mode, so any existing file at that path is overwritten.

    Raises:
        OSError: If a part cannot be read or the output cannot be written.
            Errors are deliberately not swallowed, so a failed join is not
            followed by an attempt to extract a truncated archive.
    """
    import shutil

    from tqdm import tqdm

    with open(output_tar, 'wb') as out_tar:
        for part in tqdm(sorted(tar_parts)):
            with open(part, 'rb') as part_file:
                # Stream in chunks rather than part_file.read(): video
                # archive parts can be far larger than available memory.
                shutil.copyfileobj(part_file, out_tar)
    eval_logger.info(f"Concatenated parts {tar_parts} into {output_tar}")

# Unzip zip files if needed
Expand All @@ -824,7 +819,6 @@ def concat_tar_parts(tar_parts, output_tar):
tar_parts_dict[base_name] = []
tar_parts_dict[base_name].append(tar_file)

print(tar_parts_dict)

# Concatenate and untar split parts
for base_name, parts in tar_parts_dict.items():
Expand Down

0 comments on commit 5bf59ed

Please sign in to comment.