fix #117, allow auto download with tar format videos · EvolvingLMMs-Lab/lmms-eval@a056f11 (original) (raw)
`@@ -776,8 +776,10 @@ def _download_from_youtube(path):
`
776
776
`if accelerator.is_main_process:
`
777
777
`force_download = dataset_kwargs.get("force_download", False)
`
778
778
`force_unzip = dataset_kwargs.get("force_unzip", False)
`
``
779
`+
print(force_download)
`
779
780
`cache_path = snapshot_download(repo_id=self.DATASET_PATH, repo_type="dataset", force_download=force_download, etag_timeout=60)
`
780
781
`zip_files = glob(os.path.join(cache_path, "**/*.zip"), recursive=True)
`
``
782
`+
tar_files = glob(os.path.join(cache_path, "**/.tar"), recursive=True)
`
781
783
``
782
784
`def unzip_video_data(zip_file):
`
783
785
`import zipfile
`
`@@ -786,10 +788,57 @@ def unzip_video_data(zip_file):
`
786
788
`zip_ref.extractall(cache_dir)
`
787
789
`eval_logger.info(f"Extracted all files from {zip_file} to {cache_dir}")
`
788
790
``
``
791
`+
def untar_video_data(tar_file):
`
``
792
`+
import tarfile
`
``
793
`+
with tarfile.open(tar_file, "r") as tar_ref:
`
``
794
`+
tar_ref.extractall(cache_dir)
`
``
795
`+
eval_logger.info(f"Extracted all files from {tar_file} to {cache_dir}")
`
``
796
+
``
797
+
``
798
+
``
799
`+
def concat_tar_parts(tar_parts, output_tar):
`
``
800
`+
print("This is the output file:", output_tar, "from:", tar_parts)
`
``
801
`+
try:
`
``
802
`+
with open(output_tar, 'wb') as out_tar:
`
``
803
`+
from tqdm import tqdm
`
``
804
`+
for part in tqdm(sorted(tar_parts)):
`
``
805
`+
with open(part, 'rb') as part_file:
`
``
806
`+
out_tar.write(part_file.read())
`
``
807
`+
except Exception as ex:
`
``
808
`+
print("Error!!!", ex)
`
``
809
`+
eval_logger.info(f"Concatenated parts {tar_parts} into {output_tar}")
`
``
810
+
``
811
`+
Unzip zip files if needed
`
789
812
`if force_unzip or (not os.path.exists(cache_dir) and len(zip_files) > 0):
`
790
813
`for zip_file in zip_files:
`
791
814
`unzip_video_data(zip_file)
`
792
815
``
``
816
`+
Concatenate and extract tar files if needed
`
``
817
`+
if force_unzip or (not os.path.exists(cache_dir) and len(tar_files) > 0):
`
``
818
`+
tar_parts_dict = {}
`
``
819
+
``
820
`+
Group tar parts together
`
``
821
`+
for tar_file in tar_files:
`
``
822
`+
base_name = tar_file.split('.tar')[0]
`
``
823
`+
if base_name not in tar_parts_dict:
`
``
824
`+
tar_parts_dict[base_name] = []
`
``
825
`+
tar_parts_dict[base_name].append(tar_file)
`
``
826
+
``
827
`+
print(tar_parts_dict)
`
``
828
+
``
829
`+
Concatenate and untar split parts
`
``
830
`+
for base_name, parts in tar_parts_dict.items():
`
``
831
`+
eval_logger.info(f"Extracting following tar files: {parts}")
`
``
832
`+
output_tar = base_name + ".tar"
`
``
833
`+
if not os.path.exists(output_tar):
`
``
834
`+
eval_logger.info(f"Start concatenating tar files")
`
``
835
+
``
836
`+
concat_tar_parts(parts, output_tar)
`
``
837
`+
eval_logger.info(f"Finish concatenating tar files")
`
``
838
+
``
839
`+
if not os.path.exists(os.path.join(cache_dir, os.path.basename(base_name))):
`
``
840
`+
untar_video_data(output_tar)
`
``
841
+
793
842
`accelerator.wait_for_everyone()
`
794
843
`dataset_kwargs.pop("cache_dir")
`
795
844
`dataset_kwargs.pop("video")
`