feat: implement images in listings comparison

2026-03-06 12:53:06 +00:00
parent e1a67da3ce
commit da5a455c36
7 changed files with 223 additions and 6 deletions
--- a/backend/app/routers/images.py
+++ b/backend/app/routers/images.py
@@ -0,0 +1,98 @@
+"""Image proxy – scrape Funda listing pages for photo URLs."""
+
+import logging
+import re
+import time
+import urllib.request
+from typing import Optional
+
+from fastapi import APIRouter, Depends, HTTPException
+from sqlalchemy import text
+from sqlalchemy.orm import Session
+
+from app.config import settings
+from app.database import get_db
+
+# Router that the image route(s) in this module are registered on.
+router = APIRouter()
+
+# Simple in-memory cache: global_id → (timestamp, image_urls)
+# The timestamp is a time.time() reading taken while handling the request
+# that stored the entry. The dict is module-level state, so it lives only in
+# the memory of the current process.
+_cache: dict[str, tuple[float, list[str]]] = {}
+# Maximum age, in seconds, at which a cached entry is still served.
+_CACHE_TTL = 3600  # 1 hour
+
+
+_logger = logging.getLogger(__name__)
+
+# Compiled once at import time; both patterns run against every scraped page.
+# valentina_media: main property photos, referenced either as "<key>.jpg" or
+# as a sized variant "<key>_<w>x<h>.jpg". Group 1 is the size-independent key.
+_VALENTINA_MEDIA_RE = re.compile(
+    r"https://cloud\.funda\.nl/valentina_media/(\d+/\d+/\d+)(?:\.jpg|_\d+x\d+\.jpg)"
+)
+# listing-management: newer uploads, identified by a 36-character id.
+_LISTING_MANAGEMENT_RE = re.compile(
+    r"https://cloud\.funda\.nl/listing-management/([0-9a-f-]{36})"
+)
+
+
+def _extract_image_urls(html: str) -> list[str]:
+    """Return the de-duplicated 720px-wide image URLs referenced in *html*."""
+    # dict.fromkeys keeps first-seen order while dropping repeats (the same
+    # photo usually appears several times in different sizes).
+    photo_keys = dict.fromkeys(
+        m.group(1) for m in _VALENTINA_MEDIA_RE.finditer(html)
+    )
+    upload_ids = dict.fromkeys(
+        m.group(1) for m in _LISTING_MANAGEMENT_RE.finditer(html)
+    )
+    return [
+        *(
+            f"https://cloud.funda.nl/valentina_media/{key}.jpg?options=width=720"
+            for key in photo_keys
+        ),
+        *(
+            f"https://cloud.funda.nl/listing-management/{image_id}?options=width=720"
+            for image_id in upload_ids
+        ),
+    ]
+
+
+def _scrape_images(url: str) -> list[str]:
+    """Fetch a Funda listing page and extract image URLs.
+
+    This is a best-effort helper: it never raises. Any problem (unsupported
+    URL, network error, timeout, HTTP error) is logged and reported as an
+    empty list.
+
+    Args:
+        url: Absolute http(s) URL of the listing page.
+
+    Returns:
+        Image URLs in page order, valentina_media photos first, then
+        listing-management uploads; ``[]`` when nothing was found or the page
+        could not be fetched.
+    """
+    # urlopen would also accept file:, ftp: and data: URLs, and Request()
+    # raises ValueError for scheme-less input. Only plain web URLs are valid.
+    if not url.lower().startswith(("http://", "https://")):
+        _logger.warning("Refusing to fetch non-HTTP(S) listing URL %r", url)
+        return []
+
+    try:
+        req = urllib.request.Request(
+            url,
+            headers={
+                "User-Agent": (
+                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+                    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+                ),
+                "Accept": "text/html,application/xhtml+xml",
+            },
+        )
+        # The context manager closes the connection even if read() fails.
+        with urllib.request.urlopen(req, timeout=10) as resp:
+            html = resp.read().decode("utf-8", errors="replace")
+    except Exception:
+        # Deliberately broad: scraping is optional and must not break callers.
+        _logger.warning("Failed to fetch listing page %s", url, exc_info=True)
+        return []
+
+    return _extract_image_urls(html)
+
+
+# The scraper reports a failed fetch as an empty list, so an empty result may
+# only mean a transient upstream error. Retry those much sooner than real hits.
+_EMPTY_CACHE_TTL = 60  # 1 minute
+
+
+@router.get("/listings/{global_id}/images")
+def get_listing_images(
+    global_id: str,
+    db: Session = Depends(get_db),
+) -> dict[str, list[str]]:
+    """Return image URLs for a listing, scraped from its Funda page.
+
+    Results are cached in memory per ``global_id``. Non-empty results are
+    served from cache for ``_CACHE_TTL`` seconds; empty results only for
+    ``_EMPTY_CACHE_TTL`` seconds so that a temporary scrape failure does not
+    hide a listing's photos for a full hour.
+
+    Args:
+        global_id: Identifier of the listing, matched against the
+            ``global_id`` column of the listings table.
+        db: Database session injected by FastAPI.
+
+    Returns:
+        ``{"images": [...]}`` with the image URLs (possibly empty).
+
+    Raises:
+        HTTPException: 404 when no row matches or the row has no URL.
+    """
+    now = time.time()
+
+    # Single lookup instead of "in" + index.
+    entry = _cache.get(global_id)
+    if entry is not None:
+        ts, cached = entry
+        ttl = _CACHE_TTL if cached else _EMPTY_CACHE_TTL
+        if now - ts < ttl:
+            return {"images": cached}
+
+    # Schema and table names come from application settings (identifiers
+    # cannot be bound as parameters); the caller-supplied id is bound.
+    row = db.execute(
+        text(
+            f"SELECT url FROM {settings.LISTINGS_SCHEMA}.{settings.LISTINGS_TABLE} "
+            f"WHERE global_id = :gid"
+        ),
+        {"gid": global_id},
+    ).first()
+
+    if not row or not row.url:
+        raise HTTPException(status_code=404, detail="Listing not found")
+
+    images = _scrape_images(row.url)
+    _cache[global_id] = (now, images)
+
+    return {"images": images}