citypulse/backend/test/Machine_Learning/fetch_datasets.py
Commit 46dea3304f by Cursor Agent: "Refactor: Integrate backend API and normalize data"
This commit integrates the backend API for fetching and updating report data. It also includes a normalization function to handle data consistency between the API and local storage.

Co-authored-by: anthonymuncher <anthonymuncher@gmail.com>
2025-09-26 10:27:39 +00:00


import os
import zipfile
import shutil
import random
import json
from pathlib import Path
# ---------- CONFIG ----------
BASE_DIR = Path("dataset")
DOWNLOAD_DIR = Path("downloads")
# "drainage" is included so its folders exist, even though its images must be
# added manually (see the warnings printed in main)
CLASSES = ["pothole", "streetlight", "garbage", "signage", "drainage"]
TRAIN_SPLIT = 0.8  # 80% train, 20% val

os.makedirs(BASE_DIR, exist_ok=True)
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Create folder structure
for split in ["train", "val"]:
    for cls in CLASSES:
        os.makedirs(BASE_DIR / split / cls, exist_ok=True)
# ---------- AUTHENTICATION ----------
def setup_kaggle_api():
    """Load kaggle.json and set environment variables"""
    kaggle_path = Path("kaggle.json")  # put kaggle.json in the same folder as this script
    if not kaggle_path.exists():
        raise FileNotFoundError("❌ kaggle.json not found! Download it from https://www.kaggle.com/settings")
    with open(kaggle_path, "r") as f:
        creds = json.load(f)
    os.environ["KAGGLE_USERNAME"] = creds["username"]
    os.environ["KAGGLE_KEY"] = creds["key"]
    print("✅ Kaggle API credentials loaded.")
# ---------- HELPERS ----------
def unzip_and_move(zip_path, class_name):
    """Unzip dataset and put images into dataset/train/ & val/ folders"""
    extract_path = Path("tmp_extract")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

    # Collect images (search recursively; archives nest their folders differently)
    all_images = (
        list(extract_path.rglob("*.jpg"))
        + list(extract_path.rglob("*.png"))
        + list(extract_path.rglob("*.jpeg"))
    )
    random.shuffle(all_images)

    # Train/Val split
    split_idx = int(len(all_images) * TRAIN_SPLIT)
    train_files = all_images[:split_idx]
    val_files = all_images[split_idx:]

    for img in train_files:
        target = BASE_DIR / "train" / class_name / img.name
        shutil.move(str(img), target)
    for img in val_files:
        target = BASE_DIR / "val" / class_name / img.name
        shutil.move(str(img), target)

    shutil.rmtree(extract_path)
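
# Hypothetical helper (a sketch, not wired into unzip_and_move above): two
# garbage archives are merged into the same class folder in main, so files
# with identical names would silently overwrite each other under shutil.move.
# A rename-on-collision move avoids that.
def safe_move(src, target_dir):
    """Move src into target_dir, renaming it if a same-named file already exists."""
    target = target_dir / src.name
    counter = 1
    while target.exists():
        target = target_dir / f"{src.stem}_{counter}{src.suffix}"
        counter += 1
    shutil.move(str(src), target)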
def kaggle_download(dataset_slug, out_zip):
    """Download Kaggle dataset into downloads/ folder"""
    os.system(f'kaggle datasets download -d {dataset_slug} -p {DOWNLOAD_DIR} -o')
    return DOWNLOAD_DIR / out_zip
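
# A hedged alternative sketch: os.system above swallows failures (a bad slug or
# expired credentials just prints an error and the script keeps going), while
# subprocess.run with check=True raises instead. Illustrative only; not called
# in main.
def kaggle_download_checked(dataset_slug, out_zip):
    """Like kaggle_download, but raises CalledProcessError if the CLI fails."""
    import subprocess  # local import keeps this sketch self-contained
    subprocess.run(
        ["kaggle", "datasets", "download", "-d", dataset_slug,
         "-p", str(DOWNLOAD_DIR), "-o"],
        check=True,
    )
    return DOWNLOAD_DIR / out_zip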
# ---------- MAIN ----------
if __name__ == "__main__":
    setup_kaggle_api()

    # Pothole dataset
    pothole_zip = kaggle_download("andrewmvd/pothole-detection", "pothole-detection.zip")
    unzip_and_move(pothole_zip, "pothole")

    # Garbage dataset
    garbage_zip = kaggle_download("dataclusterlabs/domestic-trash-garbage-dataset", "domestic-trash-garbage-dataset.zip")
    unzip_and_move(garbage_zip, "garbage")

    # TrashNet (alternative garbage dataset, merged into the same "garbage" class)
    trashnet_zip = kaggle_download("techsash/waste-classification-data", "waste-classification-data.zip")
    unzip_and_move(trashnet_zip, "garbage")

    # Signage dataset (all sign classes are combined into one "signage" class)
    signage_zip = kaggle_download("ahemateja19bec1025/traffic-sign-dataset-classification", "traffic-sign-dataset-classification.zip")
    unzip_and_move(signage_zip, "signage")

    # Drainage dataset (⚠️ still missing)
    print("⚠️ No Kaggle dataset found for drainage. Please add manually to dataset/train/drainage & val/drainage.")

    # Streetlight dataset (⚠️ still missing)
    print("⚠️ No Kaggle dataset found for streetlights. Please add manually to dataset/train/streetlight & val/streetlight.")

    print("✅ All datasets downloaded, cleaned, and organized into 'dataset/'")