Refactor: Integrate backend API and normalize data
This commit integrates the backend API for fetching and updating report data. It also includes a normalization function to handle data consistency between the API and local storage.

Co-authored-by: anthonymuncher <anthonymuncher@gmail.com>
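The normalization function mentioned above is not part of the file shown in this diff, so the following is only an illustrative sketch of the idea: coerce whatever shape the API returns (and whatever was previously stored locally) onto one canonical record shape so the two sides stay consistent. Field names such as id, title, status, and created_at are assumptions for demonstration, and the real implementation may live elsewhere in the codebase:

# Hypothetical sketch only -- not the function added in this commit.
# Field names and fallbacks are illustrative assumptions.
def normalize_report(raw: dict) -> dict:
    """Map an API payload or a locally stored record onto one canonical shape."""
    return {
        "id": str(raw.get("id") or raw.get("reportId") or ""),
        "title": (raw.get("title") or "").strip(),
        "status": (raw.get("status") or "pending").lower(),
        "created_at": raw.get("created_at") or raw.get("createdAt"),
    }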
backend/test/Machine_Learning/fetch_datasets.py (new file, 92 lines)
@@ -0,0 +1,92 @@
import os
import zipfile
import shutil
import random
import json
from pathlib import Path

# ---------- CONFIG ----------
BASE_DIR = Path("dataset")
DOWNLOAD_DIR = Path("downloads")
CLASSES = ["pothole", "streetlight", "garbage", "signage"]
TRAIN_SPLIT = 0.8  # 80% train, 20% val

os.makedirs(BASE_DIR, exist_ok=True)
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Create folder structure
for split in ["train", "val"]:
    for cls in CLASSES:
        os.makedirs(BASE_DIR / split / cls, exist_ok=True)

# ---------- AUTHENTICATION ----------
def setup_kaggle_api():
    """Load kaggle.json and set environment variables"""
    kaggle_path = Path("kaggle.json")  # put kaggle.json in the same folder as this script
    if not kaggle_path.exists():
        raise FileNotFoundError("❌ kaggle.json not found! Download it from https://www.kaggle.com/settings")

    with open(kaggle_path, "r") as f:
        creds = json.load(f)

    os.environ["KAGGLE_USERNAME"] = creds["username"]
    os.environ["KAGGLE_KEY"] = creds["key"]
    print("✅ Kaggle API credentials loaded.")

# ---------- HELPERS ----------
def unzip_and_move(zip_path, class_name):
    """Unzip dataset and put images into dataset/train/ & val/ folders"""
    extract_path = Path("tmp_extract")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

    # Collect images
    all_images = list(extract_path.rglob("*.jpg")) + list(extract_path.rglob("*.png")) + list(extract_path.rglob("*.jpeg"))
    random.shuffle(all_images)

    # Train/Val split
    split_idx = int(len(all_images) * TRAIN_SPLIT)
    train_files = all_images[:split_idx]
    val_files = all_images[split_idx:]

    for img in train_files:
        target = BASE_DIR / "train" / class_name / img.name
        shutil.move(str(img), target)

    for img in val_files:
        target = BASE_DIR / "val" / class_name / img.name
        shutil.move(str(img), target)

    shutil.rmtree(extract_path)

def kaggle_download(dataset_slug, out_zip):
    """Download Kaggle dataset into downloads/ folder"""
    os.system(f'kaggle datasets download -d {dataset_slug} -p {DOWNLOAD_DIR} -o')
    return DOWNLOAD_DIR / out_zip

# ---------- MAIN ----------
if __name__ == "__main__":
    setup_kaggle_api()

    # Pothole dataset
    pothole_zip = kaggle_download("andrewmvd/pothole-detection", "pothole-detection.zip")
    unzip_and_move(pothole_zip, "pothole")

    # Garbage dataset
    garbage_zip = kaggle_download("dataclusterlabs/domestic-trash-garbage-dataset", "domestic-trash-garbage-dataset.zip")
    unzip_and_move(garbage_zip, "garbage")

    # TrashNet (alternative garbage dataset)
    trashnet_zip = kaggle_download("techsash/waste-classification-data", "waste-classification-data.zip")
    unzip_and_move(trashnet_zip, "garbage")

    # Signage dataset
    signage_zip = kaggle_download("ahemateja19bec1025/traffic-sign-dataset-classification", "traffic-sign-dataset-classification.zip")
    unzip_and_move(signage_zip, "signage")  # Combine all sign classes into one

    # Drainage dataset (⚠️ still missing)
    print("⚠️ No Kaggle dataset found for drainage. Please add manually to dataset/train/drainage & val/drainage.")
    # Streetlight dataset (⚠️ still missing)
    print("⚠️ No Kaggle dataset found for streetlights. Please add manually to dataset/train/streetlight & val/streetlight.")

    print("✅ All datasets downloaded, cleaned, and organized into 'dataset/'")
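Once the script has run, a quick sanity check is to count how many images actually landed in each split/class folder. The snippet below is a minimal check written against the same dataset/ layout the script creates (it is not part of the commit itself):

from pathlib import Path

BASE_DIR = Path("dataset")

# Count images per split/class to confirm the 80/20 split populated every folder.
for split in ("train", "val"):
    for cls_dir in sorted((BASE_DIR / split).iterdir()):
        if cls_dir.is_dir():
            n = sum(1 for p in cls_dir.iterdir() if p.suffix.lower() in {".jpg", ".jpeg", ".png"})
            print(f"{split}/{cls_dir.name}: {n} images")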