fix: read images from raw Funda data; rank only sample listings

2026-03-06 14:06:45 +00:00
parent da5a455c36
commit 8de47410ce
2 changed files with 17 additions and 77 deletions
--- a/backend/app/routers/images.py
+++ b/backend/app/routers/images.py
@@ -1,9 +1,4 @@
-"""Image proxy – scrape Funda listing pages for photo URLs."""
-
-import re
-import time
-import urllib.request
-from typing import Optional
+"""Image endpoints – retrieve photo URLs from the raw Funda data."""

 from fastapi import APIRouter, Depends, HTTPException
 from sqlalchemy import text
@@ -14,85 +9,23 @@ from app.database import get_db

 router = APIRouter()

-# Simple in-memory cache: global_id → (timestamp, image_urls)
-_cache: dict[str, tuple[float, list[str]]] = {}
-_CACHE_TTL = 3600  # 1 hour
-
-
-def _scrape_images(url: str) -> list[str]:
-    """Fetch a Funda listing page and extract image URLs."""
-    req = urllib.request.Request(
-        url,
-        headers={
-            "User-Agent": (
-                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
-                "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
-            ),
-            "Accept": "text/html,application/xhtml+xml",
-        },
-    )
-    try:
-        resp = urllib.request.urlopen(req, timeout=10)
-        html = resp.read().decode("utf-8", errors="replace")
-    except Exception:
-        return []
-
-    images: list[str] = []
-    seen_bases: set[str] = set()
-
-    # Pattern 1: valentina_media images (main property photos)
-    for match in re.finditer(
-        r"https://cloud\.funda\.nl/valentina_media/(\d+/\d+/\d+)(?:\.jpg|_\d+x\d+\.jpg)",
-        html,
-    ):
-        base = match.group(1)
-        if base not in seen_bases:
-            seen_bases.add(base)
-            images.append(
-                f"https://cloud.funda.nl/valentina_media/{base}.jpg?options=width=720"
-            )
-
-    # Pattern 2: listing-management images (newer uploads)
-    for match in re.finditer(
-        r"https://cloud\.funda\.nl/listing-management/([0-9a-f-]{36})",
-        html,
-    ):
-        uuid = match.group(1)
-        if uuid not in seen_bases:
-            seen_bases.add(uuid)
-            images.append(
-                f"https://cloud.funda.nl/listing-management/{uuid}?options=width=720"
-            )
-
-    return images
-

@router.get("/listings/{global_id}/images")
 def get_listing_images(
    global_id: str,
    db: Session = Depends(get_db),
 ) -> dict[str, list[str]]:
-    """Return image URLs for a listing, scraped from its Funda page."""
-    # Check cache
-    now = time.time()
-    if global_id in _cache:
-        ts, cached = _cache[global_id]
-        if now - ts < _CACHE_TTL:
-            return {"images": cached}
-
-    # Look up listing URL
+    """Return image URLs for a listing from the raw Funda JSON data."""
    row = db.execute(
        text(
-            f"SELECT url FROM {settings.LISTINGS_SCHEMA}.{settings.LISTINGS_TABLE} "
-            f"WHERE global_id = :gid"
+            "SELECT raw_json->'photo_urls' AS photo_urls "
+            "FROM raw_funda.listing_details "
+            "WHERE global_id = :gid"
        ),
        {"gid": global_id},
    ).first()

-    if not row or not row.url:
-        raise HTTPException(status_code=404, detail="Listing not found")
+    if not row or not row.photo_urls:
+        return {"images": []}

-    images = _scrape_images(row.url)
-    _cache[global_id] = (now, images)
-
-    return {"images": images}
+    return {"images": list(row.photo_urls)}
--- a/backend/app/routers/rankings.py
+++ b/backend/app/routers/rankings.py
@@ -4,6 +4,7 @@ from fastapi import APIRouter, Depends
 from sqlalchemy import text
 from sqlalchemy.orm import Session

+from app.config import settings
 from app.database import get_db
 from app.queries import LISTING_SELECT, row_to_listing
 from app.schemas import RankingResponse
@@ -18,8 +19,14 @@ def get_rankings(
    offset: int = 0,
    db: Session = Depends(get_db),
 ):
-    """Return listings ranked by ELO rating (highest first)."""
-    query = LISTING_SELECT
+    """Return listings ranked by ELO rating (highest first).
+
+    Only listings in the stable sample (elo.sample_listings) are shown.
+    """
+    query = LISTING_SELECT + f"""
+        INNER JOIN {settings.ELO_SCHEMA}.sample_listings s
+            ON l.global_id = s.global_id
+    """
    params: dict = {"limit": limit, "offset": offset}

    if status and status != "all":