fix: image and ranking bugfix
This commit is contained in:
@@ -1,9 +1,4 @@
|
|||||||
"""Image proxy – scrape Funda listing pages for photo URLs."""
|
"""Image endpoints – retrieve photo URLs from the raw Funda data."""
|
||||||
|
|
||||||
import re
|
|
||||||
import time
|
|
||||||
import urllib.request
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, HTTPException
|
from fastapi import APIRouter, Depends, HTTPException
|
||||||
from sqlalchemy import text
|
from sqlalchemy import text
|
||||||
@@ -14,85 +9,23 @@ from app.database import get_db
|
|||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
|
|
||||||
# Simple in-memory cache: global_id → (timestamp, image_urls)
|
|
||||||
_cache: dict[str, tuple[float, list[str]]] = {}
|
|
||||||
_CACHE_TTL = 3600 # 1 hour
|
|
||||||
|
|
||||||
|
|
||||||
# URL patterns for the two Funda CDN families found in listing HTML.
# Compiled once at import time instead of on every call.
# Pattern 1: valentina_media images (main property photos); the optional
# "_<w>x<h>" size suffix is dropped so every size variant maps to one photo.
_VALENTINA_MEDIA_RE = re.compile(
    r"https://cloud\.funda\.nl/valentina_media/(\d+/\d+/\d+)(?:\.jpg|_\d+x\d+\.jpg)"
)
# Pattern 2: listing-management images (newer uploads), keyed by a
# 36-character hex-and-dash identifier.
_LISTING_MANAGEMENT_RE = re.compile(
    r"https://cloud\.funda\.nl/listing-management/([0-9a-f-]{36})"
)


def _scrape_images(url: str) -> list[str]:
    """Fetch a Funda listing page and extract image URLs.

    Args:
        url: Address of the listing page to download.

    Returns:
        De-duplicated image URLs in order of first appearance: all
        ``valentina_media`` photos first, then all ``listing-management``
        uploads, each normalised to ``?options=width=720``. An empty list
        is returned when the page cannot be fetched or the URL is malformed.
    """
    # Function-scope import keeps this block self-contained; http.client
    # errors (truncated reads, bad status lines) are not OSError subclasses.
    import http.client

    try:
        # Request() itself raises ValueError for a malformed URL, so it is
        # built inside the try to keep the "best effort -> []" contract.
        req = urllib.request.Request(
            url,
            headers={
                "User-Agent": (
                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                ),
                "Accept": "text/html,application/xhtml+xml",
            },
        )
        # The context manager closes the response (and its socket) even if
        # read() fails; previously the response was never closed.
        with urllib.request.urlopen(req, timeout=10) as resp:
            html = resp.read().decode("utf-8", errors="replace")
    except (OSError, ValueError, http.client.HTTPException):
        # URLError/HTTPError/timeouts are OSError subclasses.
        return []

    # dict.fromkeys() gives order-preserving de-duplication. The two key
    # spaces cannot collide (photo paths contain "/", upload ids do not),
    # so de-duplicating each family separately matches a single shared set.
    photo_paths = dict.fromkeys(
        m.group(1) for m in _VALENTINA_MEDIA_RE.finditer(html)
    )
    upload_ids = dict.fromkeys(
        m.group(1) for m in _LISTING_MANAGEMENT_RE.finditer(html)
    )

    images = [
        f"https://cloud.funda.nl/valentina_media/{path}.jpg?options=width=720"
        for path in photo_paths
    ]
    images.extend(
        f"https://cloud.funda.nl/listing-management/{upload_id}?options=width=720"
        for upload_id in upload_ids
    )
    return images
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/listings/{global_id}/images")
|
@router.get("/listings/{global_id}/images")
|
||||||
def get_listing_images(
|
def get_listing_images(
|
||||||
global_id: str,
|
global_id: str,
|
||||||
db: Session = Depends(get_db),
|
db: Session = Depends(get_db),
|
||||||
) -> dict[str, list[str]]:
|
) -> dict[str, list[str]]:
|
||||||
"""Return image URLs for a listing, scraped from its Funda page."""
|
"""Return image URLs for a listing from the raw Funda JSON data."""
|
||||||
# Check cache
|
|
||||||
now = time.time()
|
|
||||||
if global_id in _cache:
|
|
||||||
ts, cached = _cache[global_id]
|
|
||||||
if now - ts < _CACHE_TTL:
|
|
||||||
return {"images": cached}
|
|
||||||
|
|
||||||
# Look up listing URL
|
|
||||||
row = db.execute(
|
row = db.execute(
|
||||||
text(
|
text(
|
||||||
f"SELECT url FROM {settings.LISTINGS_SCHEMA}.{settings.LISTINGS_TABLE} "
|
"SELECT raw_json->'photo_urls' AS photo_urls "
|
||||||
f"WHERE global_id = :gid"
|
"FROM raw_funda.listing_details "
|
||||||
|
"WHERE global_id = :gid"
|
||||||
),
|
),
|
||||||
{"gid": global_id},
|
{"gid": global_id},
|
||||||
).first()
|
).first()
|
||||||
|
|
||||||
if not row or not row.url:
|
if not row or not row.photo_urls:
|
||||||
raise HTTPException(status_code=404, detail="Listing not found")
|
return {"images": []}
|
||||||
|
|
||||||
images = _scrape_images(row.url)
|
return {"images": list(row.photo_urls)}
|
||||||
_cache[global_id] = (now, images)
|
|
||||||
|
|
||||||
return {"images": images}
|
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ from fastapi import APIRouter, Depends
|
|||||||
from sqlalchemy import text
|
from sqlalchemy import text
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.config import settings
|
||||||
from app.database import get_db
|
from app.database import get_db
|
||||||
from app.queries import LISTING_SELECT, row_to_listing
|
from app.queries import LISTING_SELECT, row_to_listing
|
||||||
from app.schemas import RankingResponse
|
from app.schemas import RankingResponse
|
||||||
@@ -18,8 +19,14 @@ def get_rankings(
|
|||||||
offset: int = 0,
|
offset: int = 0,
|
||||||
db: Session = Depends(get_db),
|
db: Session = Depends(get_db),
|
||||||
):
|
):
|
||||||
"""Return listings ranked by ELO rating (highest first)."""
|
"""Return listings ranked by ELO rating (highest first).
|
||||||
query = LISTING_SELECT
|
|
||||||
|
Only listings in the stable sample (elo.sample_listings) are shown.
|
||||||
|
"""
|
||||||
|
query = LISTING_SELECT + f"""
|
||||||
|
INNER JOIN {settings.ELO_SCHEMA}.sample_listings s
|
||||||
|
ON l.global_id = s.global_id
|
||||||
|
"""
|
||||||
params: dict = {"limit": limit, "offset": offset}
|
params: dict = {"limit": limit, "offset": offset}
|
||||||
|
|
||||||
if status and status != "all":
|
if status and status != "all":
|
||||||
|
|||||||
Reference in New Issue
Block a user