citypulse/backend/test/Machine_Learning/fetch_datasets.py
Commit 46dea3304f by Cursor Agent: "Refactor: Integrate backend API and normalize data"
This commit integrates the backend API for fetching and updating report data. It also includes a normalization function to handle data consistency between the API and local storage.

Co-authored-by: anthonymuncher <anthonymuncher@gmail.com>
2025-09-26 10:27:39 +00:00


import os
import zipfile
import shutil
import random
import json
from pathlib import Path
# ---------- CONFIG ----------
BASE_DIR = Path("dataset")
DOWNLOAD_DIR = Path("downloads")
# "drainage" is included so its folders exist, even though its images must be
# added manually (see the warnings printed in main)
CLASSES = ["pothole", "streetlight", "garbage", "signage", "drainage"]
TRAIN_SPLIT = 0.8  # 80% train, 20% val

os.makedirs(BASE_DIR, exist_ok=True)
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Create folder structure
for split in ["train", "val"]:
    for cls in CLASSES:
        os.makedirs(BASE_DIR / split / cls, exist_ok=True)
# ---------- AUTHENTICATION ----------
def setup_kaggle_api():
    """Load kaggle.json and set environment variables"""
    kaggle_path = Path("kaggle.json")  # put kaggle.json in the same folder as this script
    if not kaggle_path.exists():
        raise FileNotFoundError("❌ kaggle.json not found! Download it from https://www.kaggle.com/settings")
    with open(kaggle_path, "r") as f:
        creds = json.load(f)
    os.environ["KAGGLE_USERNAME"] = creds["username"]
    os.environ["KAGGLE_KEY"] = creds["key"]
    print("✅ Kaggle API credentials loaded.")
# ---------- HELPERS ----------
def unzip_and_move(zip_path, class_name):
    """Unzip dataset and put images into dataset/train/ & val/ folders"""
    extract_path = Path("tmp_extract")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

    # Collect images (search recursively; archives nest their folders differently)
    all_images = (
        list(extract_path.rglob("*.jpg"))
        + list(extract_path.rglob("*.png"))
        + list(extract_path.rglob("*.jpeg"))
    )
    random.shuffle(all_images)

    # Train/Val split
    split_idx = int(len(all_images) * TRAIN_SPLIT)
    train_files = all_images[:split_idx]
    val_files = all_images[split_idx:]

    for img in train_files:
        target = BASE_DIR / "train" / class_name / img.name
        shutil.move(str(img), target)
    for img in val_files:
        target = BASE_DIR / "val" / class_name / img.name
        shutil.move(str(img), target)

    shutil.rmtree(extract_path)
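
# Hypothetical helper (a sketch, not wired into unzip_and_move above): two
# garbage archives are merged into the same class folder in main, so files
# with identical names would silently overwrite each other under shutil.move.
# A rename-on-collision move avoids that.
def safe_move(src, target_dir):
    """Move src into target_dir, renaming it if a same-named file already exists."""
    target = target_dir / src.name
    counter = 1
    while target.exists():
        target = target_dir / f"{src.stem}_{counter}{src.suffix}"
        counter += 1
    shutil.move(str(src), target)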
def kaggle_download(dataset_slug, out_zip):
    """Download Kaggle dataset into downloads/ folder"""
    os.system(f'kaggle datasets download -d {dataset_slug} -p {DOWNLOAD_DIR} -o')
    return DOWNLOAD_DIR / out_zip
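
# A hedged alternative sketch: os.system above swallows failures (a bad slug or
# expired credentials just prints an error and the script keeps going), while
# subprocess.run with check=True raises instead. Illustrative only; not called
# in main.
def kaggle_download_checked(dataset_slug, out_zip):
    """Like kaggle_download, but raises CalledProcessError if the CLI fails."""
    import subprocess  # local import keeps this sketch self-contained
    subprocess.run(
        ["kaggle", "datasets", "download", "-d", dataset_slug,
         "-p", str(DOWNLOAD_DIR), "-o"],
        check=True,
    )
    return DOWNLOAD_DIR / out_zip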
# ---------- MAIN ----------
if __name__ == "__main__":
    setup_kaggle_api()

    # Pothole dataset
    pothole_zip = kaggle_download("andrewmvd/pothole-detection", "pothole-detection.zip")
    unzip_and_move(pothole_zip, "pothole")

    # Garbage dataset
    garbage_zip = kaggle_download("dataclusterlabs/domestic-trash-garbage-dataset", "domestic-trash-garbage-dataset.zip")
    unzip_and_move(garbage_zip, "garbage")

    # TrashNet (alternative garbage dataset, merged into the same "garbage" class)
    trashnet_zip = kaggle_download("techsash/waste-classification-data", "waste-classification-data.zip")
    unzip_and_move(trashnet_zip, "garbage")

    # Signage dataset (all sign classes are combined into one "signage" class)
    signage_zip = kaggle_download("ahemateja19bec1025/traffic-sign-dataset-classification", "traffic-sign-dataset-classification.zip")
    unzip_and_move(signage_zip, "signage")

    # Drainage dataset (⚠️ still missing)
    print("⚠️ No Kaggle dataset found for drainage. Please add manually to dataset/train/drainage & val/drainage.")

    # Streetlight dataset (⚠️ still missing)
    print("⚠️ No Kaggle dataset found for streetlights. Please add manually to dataset/train/streetlight & val/streetlight.")

    print("✅ All datasets downloaded, cleaned, and organized into 'dataset/'")