Files
citypulse/backend/test/Machine_Learning/street_light_scrapping.py
Cursor Agent 46dea3304f Refactor: Integrate backend API and normalize data
This commit integrates the backend API for fetching and updating report data. It also includes a normalization function to handle data consistency between the API and local storage.

Co-authored-by: anthonymuncher <anthonymuncher@gmail.com>
2025-09-26 10:27:39 +00:00

63 lines
2.1 KiB
Python

import os
import zipfile
import shutil
import random
from pathlib import Path
import requests
# ---------- CONFIG ----------
# Root of the train/val dataset tree produced by this script.
BASE_DIR = Path("dataset")
# Where downloaded archives are stored.
DOWNLOAD_DIR = Path("downloads")
# Single class label; used as the leaf folder name under train/ and val/.
CLASS_NAME = "streetlight"
# Fraction of images assigned to train; the remainder goes to val.
TRAIN_SPLIT = 0.8

# Create the output folders up front (at import time) so later file writes
# and moves have a destination to land in.
(BASE_DIR / "train" / CLASS_NAME).mkdir(parents=True, exist_ok=True)
(BASE_DIR / "val" / CLASS_NAME).mkdir(parents=True, exist_ok=True)
DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
def download_from_github(url: str, out_path: Path, timeout: float = 30.0) -> bool:
    """Download ``url`` to ``out_path``, streaming the body in 8 KiB chunks.

    Args:
        url: Address of the file to fetch.
        out_path: Destination file; replaced only after a complete download.
        timeout: Seconds to wait for the server (connect / between reads)
            before giving up.

    Returns:
        True when the file was fully written to ``out_path``; False when the
        server answered with a non-200 status or the request failed.
    """
    print(f"⬇️ Trying download: {url}")
    # Write to a sibling ".part" file first so an interrupted transfer never
    # leaves a truncated archive at out_path.
    partial_path = Path(f"{out_path}.part")
    try:
        # The context manager releases the streamed connection even when we
        # return early or an error occurs mid-transfer.
        with requests.get(url, stream=True, timeout=timeout) as resp:
            if resp.status_code != 200:
                print(f"❌ Download failed: status code {resp.status_code}")
                return False
            with open(partial_path, "wb") as f:
                for chunk in resp.iter_content(8192):
                    f.write(chunk)
    except requests.RequestException as exc:
        # Network-level failures (DNS, timeout, dropped connection) are
        # reported the same way as a bad status: message + False.
        print(f"❌ Download failed: {exc}")
        if partial_path.exists():
            partial_path.unlink()
        return False
    partial_path.replace(out_path)
    print(f"✅ Downloaded to {out_path}")
    return True
def _unique_destination(dest_dir: Path, name: str) -> Path:
    """Return a path inside ``dest_dir`` for ``name`` that does not exist yet."""
    candidate = dest_dir / name
    stem, suffix = Path(name).stem, Path(name).suffix
    counter = 1
    while candidate.exists():
        candidate = dest_dir / f"{stem}_{counter}{suffix}"
        counter += 1
    return candidate


def unzip_and_split(
    zip_path: Path,
    class_name: str,
    base_dir: "Path | None" = None,
    train_split: "float | None" = None,
) -> None:
    """Extract an image archive and split its images into train/val folders.

    The archive is unpacked into a scratch ``tmp_extract`` directory, every
    ``.jpg`` / ``.jpeg`` / ``.png`` file (any letter case, any depth) is
    collected, shuffled, and moved to ``<base_dir>/train/<class_name>`` or
    ``<base_dir>/val/<class_name>``. The scratch directory is always removed
    afterwards, including when no images are found or an error occurs.

    Args:
        zip_path: Archive to extract.
        class_name: Leaf folder name used under ``train`` and ``val``.
        base_dir: Dataset root; defaults to the module-level ``BASE_DIR``.
        train_split: Fraction of images sent to ``train``; defaults to the
            module-level ``TRAIN_SPLIT``.
    """
    base_dir = BASE_DIR if base_dir is None else Path(base_dir)
    train_split = TRAIN_SPLIT if train_split is None else train_split
    image_suffixes = {".jpg", ".jpeg", ".png"}
    extract_path = Path("tmp_extract")
    try:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_path)
        # Compare lower-cased suffixes so files such as "IMG_01.JPG" are not
        # skipped; sorting first makes the shuffle reproducible under a seed.
        all_images = sorted(
            p
            for p in extract_path.rglob("*")
            if p.is_file() and p.suffix.lower() in image_suffixes
        )
        if not all_images:
            print("⚠️ No images in extracted folder.")
            return
        random.shuffle(all_images)
        split_idx = int(len(all_images) * train_split)
        train = all_images[:split_idx]
        val = all_images[split_idx:]
        for subset, images in (("train", train), ("val", val)):
            dest_dir = base_dir / subset / class_name
            # The module only pre-creates folders for its own CLASS_NAME.
            dest_dir.mkdir(parents=True, exist_ok=True)
            for img in images:
                # Archives often reuse file names across sub-folders; pick a
                # free name instead of silently overwriting an earlier image.
                target = _unique_destination(dest_dir, img.name)
                shutil.move(str(img), str(target))
    finally:
        shutil.rmtree(extract_path, ignore_errors=True)
    print(f"{class_name} split: {len(train)} train / {len(val)} val")
if __name__ == "__main__":
    # Source archive: the GitHub repo referenced by the paper.
    archive_url = "https://github.com/Team16Project/Street-Light-Dataset/archive/refs/heads/main.zip"
    archive_path = DOWNLOAD_DIR / "streetlight_dataset.zip"

    # Only unpack and split when the download reported success.
    if not download_from_github(archive_url, archive_path):
        print("⚠️ Could not download streetlight dataset. You may need to find alternative source.")
    else:
        unzip_and_split(archive_path, CLASS_NAME)