fix: image and ranking bugfix

This commit is contained in:
Stijnvandenbroek
2026-03-06 14:06:45 +00:00
parent da5a455c36
commit 8de47410ce
2 changed files with 17 additions and 77 deletions

View File

@@ -1,9 +1,4 @@
"""Image proxy scrape Funda listing pages for photo URLs.""" """Image endpoints retrieve photo URLs from the raw Funda data."""
import re
import time
import urllib.request
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy import text from sqlalchemy import text
@@ -14,85 +9,23 @@ from app.database import get_db
router = APIRouter() router = APIRouter()
# Per-process memo of scrape results so repeated requests for the same
# listing do not re-fetch its page. Keys are listing global_ids; each value
# is a (timestamp, image_urls) pair, where the timestamp comes from
# time.time() at the moment the entry was stored.
_CACHE_TTL = 60 * 60  # seconds an entry stays fresh (one hour)
_cache: dict[str, tuple[float, list[str]]] = {}
# Main property photos: ".../valentina_media/<n>/<n>/<n>.jpg" or a sized
# variant ".../<n>/<n>/<n>_<w>x<h>.jpg". Group 1 is the size-independent base,
# so every size variant of one photo collapses to a single entry.
_VALENTINA_MEDIA_RE = re.compile(
    r"https://cloud\.funda\.nl/valentina_media/(\d+/\d+/\d+)(?:\.jpg|_\d+x\d+\.jpg)"
)
# Newer uploads: ".../listing-management/<36 chars of hex digits and dashes>".
_LISTING_MANAGEMENT_RE = re.compile(
    r"https://cloud\.funda\.nl/listing-management/([0-9a-f-]{36})"
)


def _scrape_images(url: str) -> list[str]:
    """Fetch a Funda listing page and extract image URLs.

    The page is downloaded with a browser-like User-Agent and scanned for two
    kinds of ``cloud.funda.nl`` image links. Each distinct image is returned
    once, rewritten to a 720px-wide variant, with all ``valentina_media``
    photos first and ``listing-management`` photos after them, each group in
    order of first appearance in the page.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        The de-duplicated image URLs, or an empty list when the page cannot
        be fetched (malformed URL, network/HTTP error, timeout) or contains
        no recognised image links. This function is best-effort and does not
        raise for fetch failures.
    """
    # Local import: only needed for the except clause below.
    from http.client import HTTPException

    try:
        # Request() itself raises ValueError for a malformed URL, so it must
        # sit inside the try to keep the "return [] on failure" contract.
        req = urllib.request.Request(
            url,
            headers={
                "User-Agent": (
                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                ),
                "Accept": "text/html,application/xhtml+xml",
            },
        )
        # The context manager closes the response (and its socket) even if
        # read() fails part-way through.
        with urllib.request.urlopen(req, timeout=10) as resp:
            html = resp.read().decode("utf-8", errors="replace")
    except (OSError, ValueError, HTTPException):
        # OSError covers URLError/HTTPError, timeouts and socket/TLS errors;
        # HTTPException covers malformed or truncated HTTP responses.
        return []

    images: list[str] = []
    # Shared across both patterns, exactly one entry per distinct image key.
    seen_keys: set[str] = set()
    sources = (
        (
            _VALENTINA_MEDIA_RE,
            "https://cloud.funda.nl/valentina_media/{}.jpg?options=width=720",
        ),
        (
            _LISTING_MANAGEMENT_RE,
            "https://cloud.funda.nl/listing-management/{}?options=width=720",
        ),
    )
    for pattern, template in sources:
        for match in pattern.finditer(html):
            key = match.group(1)
            if key in seen_keys:
                continue
            seen_keys.add(key)
            images.append(template.format(key))

    return images
@router.get("/listings/{global_id}/images") @router.get("/listings/{global_id}/images")
def get_listing_images( def get_listing_images(
global_id: str, global_id: str,
db: Session = Depends(get_db), db: Session = Depends(get_db),
) -> dict[str, list[str]]: ) -> dict[str, list[str]]:
"""Return image URLs for a listing, scraped from its Funda page.""" """Return image URLs for a listing from the raw Funda JSON data."""
# Check cache
now = time.time()
if global_id in _cache:
ts, cached = _cache[global_id]
if now - ts < _CACHE_TTL:
return {"images": cached}
# Look up listing URL
row = db.execute( row = db.execute(
text( text(
f"SELECT url FROM {settings.LISTINGS_SCHEMA}.{settings.LISTINGS_TABLE} " "SELECT raw_json->'photo_urls' AS photo_urls "
f"WHERE global_id = :gid" "FROM raw_funda.listing_details "
"WHERE global_id = :gid"
), ),
{"gid": global_id}, {"gid": global_id},
).first() ).first()
if not row or not row.url: if not row or not row.photo_urls:
raise HTTPException(status_code=404, detail="Listing not found") return {"images": []}
images = _scrape_images(row.url) return {"images": list(row.photo_urls)}
_cache[global_id] = (now, images)
return {"images": images}

View File

@@ -4,6 +4,7 @@ from fastapi import APIRouter, Depends
from sqlalchemy import text from sqlalchemy import text
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from app.config import settings
from app.database import get_db from app.database import get_db
from app.queries import LISTING_SELECT, row_to_listing from app.queries import LISTING_SELECT, row_to_listing
from app.schemas import RankingResponse from app.schemas import RankingResponse
@@ -18,8 +19,14 @@ def get_rankings(
offset: int = 0, offset: int = 0,
db: Session = Depends(get_db), db: Session = Depends(get_db),
): ):
"""Return listings ranked by ELO rating (highest first).""" """Return listings ranked by ELO rating (highest first).
query = LISTING_SELECT
Only listings in the stable sample (elo.sample_listings) are shown.
"""
query = LISTING_SELECT + f"""
INNER JOIN {settings.ELO_SCHEMA}.sample_listings s
ON l.global_id = s.global_id
"""
params: dict = {"limit": limit, "offset": offset} params: dict = {"limit": limit, "offset": offset}
if status and status != "all": if status and status != "all":