diff --git a/backend/app/routers/images.py b/backend/app/routers/images.py
index af4791a..1a77b88 100644
--- a/backend/app/routers/images.py
+++ b/backend/app/routers/images.py
@@ -1,9 +1,4 @@
-"""Image proxy – scrape Funda listing pages for photo URLs."""
-
-import re
-import time
-import urllib.request
-from typing import Optional
+"""Image endpoints – retrieve photo URLs from the raw Funda data."""
 
 from fastapi import APIRouter, Depends, HTTPException
 from sqlalchemy import text
@@ -14,85 +9,31 @@ from app.database import get_db
 
 router = APIRouter()
 
-# Simple in-memory cache: global_id → (timestamp, image_urls)
-_cache: dict[str, tuple[float, list[str]]] = {}
-_CACHE_TTL = 3600  # 1 hour
-
-
-def _scrape_images(url: str) -> list[str]:
-    """Fetch a Funda listing page and extract image URLs."""
-    req = urllib.request.Request(
-        url,
-        headers={
-            "User-Agent": (
-                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
-                "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
-            ),
-            "Accept": "text/html,application/xhtml+xml",
-        },
-    )
-    try:
-        resp = urllib.request.urlopen(req, timeout=10)
-        html = resp.read().decode("utf-8", errors="replace")
-    except Exception:
-        return []
-
-    images: list[str] = []
-    seen_bases: set[str] = set()
-
-    # Pattern 1: valentina_media images (main property photos)
-    for match in re.finditer(
-        r"https://cloud\.funda\.nl/valentina_media/(\d+/\d+/\d+)(?:\.jpg|_\d+x\d+\.jpg)",
-        html,
-    ):
-        base = match.group(1)
-        if base not in seen_bases:
-            seen_bases.add(base)
-            images.append(
-                f"https://cloud.funda.nl/valentina_media/{base}.jpg?options=width=720"
-            )
-
-    # Pattern 2: listing-management images (newer uploads)
-    for match in re.finditer(
-        r"https://cloud\.funda\.nl/listing-management/([0-9a-f-]{36})",
-        html,
-    ):
-        uuid = match.group(1)
-        if uuid not in seen_bases:
-            seen_bases.add(uuid)
-            images.append(
-                f"https://cloud.funda.nl/listing-management/{uuid}?options=width=720"
-            )
-
-    return images
-
 
 @router.get("/listings/{global_id}/images")
 def get_listing_images(
     global_id: str,
     db: Session = Depends(get_db),
 ) -> dict[str, list[str]]:
-    """Return image URLs for a listing, scraped from its Funda page."""
-    # Check cache
-    now = time.time()
-    if global_id in _cache:
-        ts, cached = _cache[global_id]
-        if now - ts < _CACHE_TTL:
-            return {"images": cached}
-
-    # Look up listing URL
+    """Return image URLs for a listing from the raw Funda JSON data.
+
+    Responds with ``{"images": []}`` when the listing has no row in
+    ``raw_funda.listing_details`` or no usable ``photo_urls`` array.
+    """
+    # NOTE(review): table name is hard-coded; the replaced query used settings.*
     row = db.execute(
         text(
-            f"SELECT url FROM {settings.LISTINGS_SCHEMA}.{settings.LISTINGS_TABLE} "
-            f"WHERE global_id = :gid"
+            "SELECT raw_json->'photo_urls' AS photo_urls "
+            "FROM raw_funda.listing_details "
+            "WHERE global_id = :gid"
         ),
         {"gid": global_id},
     ).first()
 
-    if not row or not row.url:
-        raise HTTPException(status_code=404, detail="Listing not found")
+    photo_urls = row.photo_urls if row else None
+    # Anything but a decoded JSON array (missing row, NULL, scalar, object)
+    # means "no photos"; list() on a str or dict would yield characters or keys.
+    if not isinstance(photo_urls, list):
+        return {"images": []}
 
-    images = _scrape_images(row.url)
-    _cache[global_id] = (now, images)
-
-    return {"images": images}
+    return {"images": photo_urls}
diff --git a/backend/app/routers/rankings.py b/backend/app/routers/rankings.py
index e7bb727..75572f6 100644
--- a/backend/app/routers/rankings.py
+++ b/backend/app/routers/rankings.py
@@ -4,6 +4,7 @@ from fastapi import APIRouter, Depends
 from sqlalchemy import text
 from sqlalchemy.orm import Session
 
+from app.config import settings
 from app.database import get_db
 from app.queries import LISTING_SELECT, row_to_listing
 from app.schemas import RankingResponse
@@ -18,8 +19,14 @@ def get_rankings(
     offset: int = 0,
     db: Session = Depends(get_db),
 ):
-    """Return listings ranked by ELO rating (highest first)."""
-    query = LISTING_SELECT
+    """Return listings ranked by ELO rating (highest first).
+
+    Only listings in the stable sample (elo.sample_listings) are shown.
+    """
+    query = LISTING_SELECT + f"""
+        INNER JOIN {settings.ELO_SCHEMA}.sample_listings s
+            ON l.global_id = s.global_id
+    """
     params: dict = {"limit": limit, "offset": offset}
 
     if status and status != "all":