fix: image retrieval and ranking bugs

2026-03-06 14:06:45 +00:00
parent da5a455c36
commit 8de47410ce
2 changed files with 17 additions and 77 deletions
--- a/backend/app/routers/images.py
+++ b/backend/app/routers/images.py
@@ -1,9 +1,4 @@
-"""Image proxy – scrape Funda listing pages for photo URLs."""
+"""Image endpoints – retrieve photo URLs from the raw Funda data."""
 import re
 import time
 import urllib.request
 from typing import Optional
 from fastapi import APIRouter, Depends, HTTPException
 from sqlalchemy import text
@@ -14,85 +9,23 @@ from app.database import get_db
 # Router on which this module's image endpoint is registered.
 router = APIRouter()
 # Simple in-memory cache: global_id → (timestamp, image_urls).
 # The timestamp is a time.time() value; lookups compare its age to _CACHE_TTL.
 # NOTE(review): no eviction is visible in this module — stale entries appear
 # to stay in the dict until overwritten; confirm whether growth is bounded.
 _cache: dict[str, tuple[float, list[str]]] = {}
 _CACHE_TTL = 3600  # Cache entry lifetime in seconds (1 hour)
 def _scrape_images(url: str) -> list[str]:
    """Fetch a Funda listing page and extract de-duplicated image URLs.

    Args:
        url: Address of the listing page to download.

    Returns:
        Image URLs in order of first appearance, each rewritten to the
        ``?options=width=720`` form: ``valentina_media`` photos first,
        followed by ``listing-management`` photos.  An empty list is
        returned when the page cannot be fetched for any reason.
    """
    try:
        # Building the Request stays inside the ``try``: it raises ValueError
        # for a URL without a scheme, which must also yield an empty result
        # instead of escaping to the caller.
        req = urllib.request.Request(
            url,
            headers={
                "User-Agent": (
                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                ),
                "Accept": "text/html,application/xhtml+xml",
            },
        )
        # The context manager closes the response (and its socket) even if
        # reading or decoding fails.
        with urllib.request.urlopen(req, timeout=10) as resp:
            html = resp.read().decode("utf-8", errors="replace")
    except Exception:
        # Deliberate best-effort: any fetch failure means "no images".
        return []

    images: list[str] = []
    # Identifiers already emitted; shared by both patterns so that each photo
    # appears once regardless of how many size variants the page references.
    seen_bases: set[str] = set()

    # Pattern 1: valentina_media images (main property photos).  The capture
    # group is the numeric path without the optional "_<w>x<h>" size suffix.
    for match in re.finditer(
        r"https://cloud\.funda\.nl/valentina_media/(\d+/\d+/\d+)(?:\.jpg|_\d+x\d+\.jpg)",
        html,
    ):
        base = match.group(1)
        if base not in seen_bases:
            seen_bases.add(base)
            images.append(
                f"https://cloud.funda.nl/valentina_media/{base}.jpg?options=width=720"
            )

    # Pattern 2: listing-management images (newer uploads), keyed by a
    # 36-character hexadecimal/dash identifier.
    for match in re.finditer(
        r"https://cloud\.funda\.nl/listing-management/([0-9a-f-]{36})",
        html,
    ):
        uuid = match.group(1)
        if uuid not in seen_bases:
            seen_bases.add(uuid)
            images.append(
                f"https://cloud.funda.nl/listing-management/{uuid}?options=width=720"
            )
    return images
@router.get("/listings/{global_id}/images")
 def get_listing_images(
    global_id: str,
    db: Session = Depends(get_db),
 ) -> dict[str, list[str]]:
-    """Return image URLs for a listing, scraped from its Funda page."""
+    """Return image URLs for a listing from the raw Funda JSON data."""
    # Serve from the in-memory cache while the entry is younger than _CACHE_TTL
    now = time.time()
    if global_id in _cache:
        ts, cached = _cache[global_id]
        if now - ts < _CACHE_TTL:
            return {"images": cached}
    # Look up the listing row (before this change: its page URL; after this
    # change: the photo_urls value stored in raw_json)
    row = db.execute(
        text(
-            f"SELECT url FROM {settings.LISTINGS_SCHEMA}.{settings.LISTINGS_TABLE} "
+            "SELECT raw_json->'photo_urls' AS photo_urls "
-            f"WHERE global_id = :gid"
+            "FROM raw_funda.listing_details "
            "WHERE global_id = :gid"
        ),
        {"gid": global_id},
    ).first()
-    if not row or not row.url:
+    if not row or not row.photo_urls:
-        raise HTTPException(status_code=404, detail="Listing not found")
+        return {"images": []}
-    images = _scrape_images(row.url)
+    return {"images": list(row.photo_urls)}
    # NOTE(review): as shown, the two lines below follow an unconditional
    # return on the added side, so they look unreachable, and `images` is no
    # longer assigned (the cache would never be populated) — confirm whether
    # they were meant to be removed or the result should be cached first.
    _cache[global_id] = (now, images)
    return {"images": images}
--- a/backend/app/routers/rankings.py
+++ b/backend/app/routers/rankings.py
@@ -4,6 +4,7 @@ from fastapi import APIRouter, Depends
 from sqlalchemy import text
 from sqlalchemy.orm import Session
 from app.config import settings
 from app.database import get_db
 from app.queries import LISTING_SELECT, row_to_listing
 from app.schemas import RankingResponse
@@ -18,8 +19,14 @@ def get_rankings(
    offset: int = 0,
    db: Session = Depends(get_db),
 ):
-    """Return listings ranked by ELO rating (highest first)."""
+    """Return listings ranked by ELO rating (highest first).
-    query = LISTING_SELECT
+
    Only listings in the stable sample (elo.sample_listings) are shown.
    """
    query = LISTING_SELECT + f"""
        INNER JOIN {settings.ELO_SCHEMA}.sample_listings s
            ON l.global_id = s.global_id
    """
    params: dict = {"limit": limit, "offset": offset}
    if status and status != "all":