# citypulse/backend/test/Machine_Learning/street_light_scrapping.py

import os
import zipfile
import shutil
import random
from pathlib import Path
import requests

# ---------- CONFIG ----------
# Root of the generated dataset tree (<BASE_DIR>/<subset>/<CLASS_NAME>/).
BASE_DIR = Path("dataset")
# Where downloaded archives are stored.
DOWNLOAD_DIR = Path("downloads")
# Name of the class sub-folder created under each subset.
CLASS_NAME = "streetlight"
# Fraction of images assigned to the training subset; the rest go to validation.
TRAIN_SPLIT = 0.8

# Create the output folders at import time so later steps can write into them.
(BASE_DIR / "train" / CLASS_NAME).mkdir(parents=True, exist_ok=True)
(BASE_DIR / "val" / CLASS_NAME).mkdir(parents=True, exist_ok=True)
DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)

def download_from_github(url: str, out_path: Path, timeout: float = 30.0) -> bool:
    """Download *url* to *out_path*, streaming the response body to disk.

    Args:
        url: Address of the file to fetch.
        out_path: Destination file; it is only written once the whole body
            has been received.
        timeout: Seconds to wait for the server to connect / send data before
            giving up (passed straight to ``requests.get``).

    Returns:
        True when the file was downloaded, False when the server answered with
        a non-200 status or the request failed (timeout, connection error,
        interrupted transfer).

    Raises:
        OSError: If the destination cannot be written.
    """
    print(f"⬇️ Trying download: {url}")
    out_path = Path(out_path)
    # Stream into a sibling ".part" file so an interrupted transfer never
    # leaves a truncated archive at out_path.
    part_path = out_path.with_name(out_path.name + ".part")
    try:
        # The context manager releases the connection even on early return.
        with requests.get(url, stream=True, timeout=timeout) as resp:
            if resp.status_code != 200:
                print(f"❌ Download failed: status code {resp.status_code}")
                return False
            with open(part_path, "wb") as f:
                for chunk in resp.iter_content(8192):
                    f.write(chunk)
        part_path.replace(out_path)
    except requests.RequestException as exc:
        print(f"❌ Download failed: {exc}")
        return False
    finally:
        # No-op after a successful replace; removes the partial file otherwise.
        if part_path.exists():
            part_path.unlink()
    print(f"✅ Downloaded to {out_path}")
    return True

def _unique_name(name: str, taken: set) -> str:
    """Return *name*, or a numbered variant of it, not yet present in *taken*.

    Names are compared case-insensitively so the result is also safe on
    case-insensitive filesystems. The chosen name is recorded in *taken*.
    """
    original = Path(name)
    candidate = name
    counter = 1
    while candidate.casefold() in taken:
        candidate = f"{original.stem}_{counter}{original.suffix}"
        counter += 1
    taken.add(candidate.casefold())
    return candidate


def unzip_and_split(zip_path: Path, class_name: str):
    """Extract an image archive and split its images into train/val folders.

    The archive is unpacked into a scratch ``tmp_extract`` directory, every
    ``.jpg``/``.jpeg``/``.png`` file found (any letter case, any depth) is
    shuffled, and the first ``TRAIN_SPLIT`` fraction is moved to
    ``BASE_DIR/train/<class_name>`` with the remainder going to
    ``BASE_DIR/val/<class_name>``. The scratch directory is always removed.

    Args:
        zip_path: Path of the zip archive to extract.
        class_name: Name of the class sub-folder inside each subset.

    Raises:
        zipfile.BadZipFile: If *zip_path* is not a valid zip archive.
    """
    extract_path = Path("tmp_extract")
    image_suffixes = {".jpg", ".jpeg", ".png"}

    # Drop leftovers of an interrupted run so they are not mixed into the split.
    shutil.rmtree(extract_path, ignore_errors=True)
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)

        # Compare suffixes case-insensitively so files such as "IMG_1.JPG" are
        # kept; sort for a stable pre-shuffle order across filesystems.
        all_images = sorted(
            p for p in extract_path.rglob("*")
            if p.is_file() and p.suffix.lower() in image_suffixes
        )
        if not all_images:
            print("⚠️ No images in extracted folder.")
            return

        random.shuffle(all_images)
        split_idx = int(len(all_images) * TRAIN_SPLIT)
        train = all_images[:split_idx]
        val = all_images[split_idx:]

        # Archives often repeat file names across sub-folders; flattening them
        # into one directory would otherwise overwrite earlier images.
        taken = set()
        for subset, images in (("train", train), ("val", val)):
            dest_dir = BASE_DIR / subset / class_name
            dest_dir.mkdir(parents=True, exist_ok=True)
            for img in images:
                shutil.move(str(img), str(dest_dir / _unique_name(img.name, taken)))
    finally:
        # Clean up the scratch directory on every exit path.
        shutil.rmtree(extract_path, ignore_errors=True)

    print(f"✅ {class_name} split: {len(train)} train / {len(val)} val")

if __name__ == "__main__":
    # Script entry point: fetch the dataset archive from the GitHub repo
    # referenced in the paper, then split it into train/val folders.
    archive_url = "https://github.com/Team16Project/Street-Light-Dataset/archive/refs/heads/main.zip"
    archive_path = DOWNLOAD_DIR / "streetlight_dataset.zip"

    if not download_from_github(archive_url, archive_path):
        print("⚠️ Could not download streetlight dataset. You may need to find alternative source.")
    else:
        unzip_and_split(archive_path, CLASS_NAME)