import os
import requests
import time
from datasets import load_dataset
from tqdm import tqdm

# --- Configuration ---
DATASET_ID = "PianoVAM/PianoVAM_v1.0"
OUTPUT_BASE_DIR = "PianoVAM_v1.0"  # Folder where files will be saved
CACHE_DIR = "./hf_cache"              # Hugging Face cache folder
MAX_RETRIES = 5                       # Max number of retries on error
# ----------------

def download_file(url, local_path):
    """Downloads a file from the given URL to local_path with retries and resume support."""
    # 1. Skip if the file already exists
    if os.path.exists(local_path):
        print(f"'{os.path.basename(local_path)}' already exists. Skipping.")
        return True

    # 2. Create parent directories
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    
    # 3. Retry download up to MAX_RETRIES times
    retries = 0
    while retries < MAX_RETRIES:
        try:
            print(f"Downloading '{os.path.basename(local_path)}'...")
            with requests.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()  # Raise an exception for bad status codes
                
                # Save to a temporary file first
                temp_path = local_path + ".part"
                with open(temp_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            
            # If download is successful, rename the temp file to the final filename
            os.rename(temp_path, local_path)
            print(f"Successfully downloaded '{os.path.basename(local_path)}'.")
            return True # Exit function on success

        except requests.exceptions.RequestException as e:
            print(f"Download error: {e}")
            retries += 1
            if retries < MAX_RETRIES:
                # Wait before retrying (e.g., 2, 4, 8, 16 seconds)
                wait_time = 2 ** retries
                print(f"Retrying in {wait_time} seconds... ({retries}/{MAX_RETRIES})")
                time.sleep(wait_time)
            else:
                print(f"Failed to download '{os.path.basename(local_path)}' after {MAX_RETRIES} attempts.")
                return False # Final failure

# --- Script Execution Start ---

# 1. Load dataset metadata
print("1. Loading dataset metadata...")
dataset = load_dataset(DATASET_ID, cache_dir=CACHE_DIR)

# 2. Create a list of all files to download
all_files_to_download = []
modalities = ['audio', 'video', 'midi', 'handskeleton', 'tsv']

for split in dataset.keys():
    for example in dataset[split]:
        base_url = f"https://huggingface.co/datasets/{DATASET_ID}/resolve/main/"
        for modality in modalities:
            path_key = f"{modality}_path"
            if example.get(path_key):
                file_info = {
                    "url": base_url + example[path_key],
                    "path": os.path.join(OUTPUT_BASE_DIR, example[path_key])
                }
                all_files_to_download.append(file_info)

print(f"2. A total of {len(all_files_to_download)} files will be downloaded.")

# 3. Sequentially download the files
successful_downloads = 0
failed_downloads = 0

for file_info in tqdm(all_files_to_download, desc="Overall Progress"):
    if download_file(file_info['url'], file_info['path']):
        successful_downloads += 1
    else:
        failed_downloads += 1

print("\n--- Download Complete ---")
print(f"Successful: {successful_downloads}, Failed: {failed_downloads}")