# house-elo-ranking/backend/app/routers/images.py

"""Image proxy – scrape Funda listing pages for photo URLs."""

import logging
import re
import time
import urllib.request
from typing import Optional

from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy import text
from sqlalchemy.orm import Session

from app.config import settings
from app.database import get_db

# Router on which this module's endpoints are registered.
router = APIRouter()

# Simple in-memory cache: global_id → (timestamp, image_urls).
# The timestamp is the time.time() value captured at the start of the request
# that populated the entry. Entries are overwritten on refresh; nothing in this
# module removes them.
_cache: dict[str, tuple[float, list[str]]] = {}
_CACHE_TTL = 3600  # seconds (1 hour) a cached entry is served before re-scraping


def _scrape_images(url: str) -> list[str]:
    """Fetch a Funda listing page and extract image URLs."""
    req = urllib.request.Request(
        url,
        headers={
            "User-Agent": (
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml",
        },
    )
    try:
        resp = urllib.request.urlopen(req, timeout=10)
        html = resp.read().decode("utf-8", errors="replace")
    except Exception:
        return []

    images: list[str] = []
    seen_bases: set[str] = set()

    # Pattern 1: valentina_media images (main property photos)
    for match in re.finditer(
        r"https://cloud\.funda\.nl/valentina_media/(\d+/\d+/\d+)(?:\.jpg|_\d+x\d+\.jpg)",
        html,
    ):
        base = match.group(1)
        if base not in seen_bases:
            seen_bases.add(base)
            images.append(
                f"https://cloud.funda.nl/valentina_media/{base}.jpg?options=width=720"
            )

    # Pattern 2: listing-management images (newer uploads)
    for match in re.finditer(
        r"https://cloud\.funda\.nl/listing-management/([0-9a-f-]{36})",
        html,
    ):
        uuid = match.group(1)
        if uuid not in seen_bases:
            seen_bases.add(uuid)
            images.append(
                f"https://cloud.funda.nl/listing-management/{uuid}?options=width=720"
            )

    return images


# _scrape_images returns an empty list both for "page has no photos" and for
# "fetch failed", so empty results are kept only briefly to let transient
# failures recover without waiting the full _CACHE_TTL.
_EMPTY_CACHE_TTL = 300  # seconds (5 minutes)


@router.get("/listings/{global_id}/images")
def get_listing_images(
    global_id: str,
    db: Session = Depends(get_db),
) -> dict[str, list[str]]:
    """Return image URLs for a listing, scraped from its Funda page.

    Results are served from the in-memory cache while fresh: non-empty
    results for ``_CACHE_TTL`` seconds, empty results for the shorter
    ``_EMPTY_CACHE_TTL``.

    Args:
        global_id: Identifier matched against the ``global_id`` column of the
            configured listings table.
        db: Database session provided by the ``get_db`` dependency.

    Returns:
        A dict of the form ``{"images": [url, ...]}``; the list may be empty.

    Raises:
        HTTPException: 404 when no row matches ``global_id`` or the row has
            no URL.
    """
    now = time.time()

    # Serve from cache when the entry is still fresh.
    entry = _cache.get(global_id)
    if entry is not None:
        cached_at, cached_images = entry
        ttl = _CACHE_TTL if cached_images else _EMPTY_CACHE_TTL
        if now - cached_at < ttl:
            return {"images": cached_images}

    # Look up the listing URL. Schema and table names come from application
    # settings; the user-supplied id is passed as a bound parameter.
    row = db.execute(
        text(
            f"SELECT url FROM {settings.LISTINGS_SCHEMA}.{settings.LISTINGS_TABLE} "
            f"WHERE global_id = :gid"
        ),
        {"gid": global_id},
    ).first()

    if not row or not row.url:
        raise HTTPException(status_code=404, detail="Listing not found")

    images = _scrape_images(row.url)
    _cache[global_id] = (now, images)

    return {"images": images}