99 lines
2.8 KiB
Python
99 lines
2.8 KiB
Python
"""Image proxy – scrape Funda listing pages for photo URLs."""
|
||
|
||
import logging
import re
import time
import urllib.request
from typing import Optional

from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy import text
from sqlalchemy.orm import Session

from app.config import settings
from app.database import get_db
|
||
|
||
# Router on which this module's image endpoint is registered.
router = APIRouter()

# In-process cache keyed by listing global_id; each value is a
# (fetch timestamp, image URL list) pair.
_cache: dict[str, tuple[float, list[str]]] = dict()

# Age in seconds after which a cache entry is considered stale (one hour).
_CACHE_TTL = 60 * 60
|
||
|
||
|
||
def _scrape_images(url: str) -> list[str]:
    """Fetch a Funda listing page and extract its photo URLs.

    The page is requested with a browser-like ``User-Agent`` header and the
    returned HTML is scanned with two regular expressions, one for each
    ``cloud.funda.nl`` image path (``valentina_media`` and
    ``listing-management``). Every distinct image is returned once, rebuilt
    as a canonical URL with ``?options=width=720`` appended. All
    ``valentina_media`` matches come first, followed by the
    ``listing-management`` matches, each group in order of first appearance.

    This function is best-effort: it never raises.

    Args:
        url: Absolute URL of the listing page to download.

    Returns:
        The de-duplicated image URLs, or an empty list when the page cannot
        be fetched (malformed URL, network error, timeout, HTTP error) or
        contains no recognised image links.
    """
    try:
        # Request() itself raises ValueError for a URL without a scheme, so
        # it has to live inside the ``try`` to honour the "never raises"
        # contract.
        req = urllib.request.Request(
            url,
            headers={
                "User-Agent": (
                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
                ),
                "Accept": "text/html,application/xhtml+xml",
            },
        )
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(req, timeout=10) as resp:
            html = resp.read().decode("utf-8", errors="replace")
    except Exception as exc:
        # Deliberately broad: any fetch failure degrades to "no images"
        # rather than failing the caller, but it is no longer silent.
        logging.getLogger(__name__).warning(
            "Could not fetch listing page %s: %s", url, exc
        )
        return []

    images: list[str] = []
    # Identifiers already emitted. The two patterns share one set; their
    # captures cannot collide (digit paths contain "/", the 36-character
    # IDs do not).
    seen_bases: set[str] = set()

    # Pattern 1: valentina_media images (main property photos). Both the
    # plain ".jpg" form and the "_<W>x<H>.jpg" form map to the same base.
    for match in re.finditer(
        r"https://cloud\.funda\.nl/valentina_media/(\d+/\d+/\d+)(?:\.jpg|_\d+x\d+\.jpg)",
        html,
    ):
        base = match.group(1)
        if base not in seen_bases:
            seen_bases.add(base)
            images.append(
                f"https://cloud.funda.nl/valentina_media/{base}.jpg?options=width=720"
            )

    # Pattern 2: listing-management images (newer uploads), identified by a
    # 36-character hex-and-dash ID.
    for match in re.finditer(
        r"https://cloud\.funda\.nl/listing-management/([0-9a-f-]{36})",
        html,
    ):
        image_id = match.group(1)
        if image_id not in seen_bases:
            seen_bases.add(image_id)
            images.append(
                f"https://cloud.funda.nl/listing-management/{image_id}?options=width=720"
            )

    return images
|
||
|
||
|
||
@router.get("/listings/{global_id}/images")
def get_listing_images(
    global_id: str,
    db: Session = Depends(get_db),
) -> dict[str, list[str]]:
    """Return image URLs for a listing, scraped from its Funda page.

    Results are served from the module-level ``_cache`` while they are
    younger than ``_CACHE_TTL`` seconds. On a miss, the listing's URL is
    read from the configured listings table, the page is scraped with
    ``_scrape_images`` and the result is cached.

    Args:
        global_id: Identifier of the listing, taken from the request path.
        db: Database session supplied by the ``get_db`` dependency.

    Returns:
        ``{"images": [...]}`` with the image URLs found (possibly empty).

    Raises:
        HTTPException: 404 when no row matches ``global_id`` or the row has
            no URL.
    """
    now = time.time()

    # Single lookup instead of "in" followed by indexing, so a concurrent
    # eviction between the two steps cannot raise KeyError.
    entry = _cache.get(global_id)
    if entry is not None:
        cached_at, cached_images = entry
        if now - cached_at < _CACHE_TTL:
            return {"images": cached_images}

    # Schema and table names cannot be bound as SQL parameters, so they are
    # interpolated from application settings; the user-supplied id is bound
    # as :gid.
    row = db.execute(
        text(
            f"SELECT url FROM {settings.LISTINGS_SCHEMA}.{settings.LISTINGS_TABLE} "
            f"WHERE global_id = :gid"
        ),
        {"gid": global_id},
    ).first()

    if not row or not row.url:
        raise HTTPException(status_code=404, detail="Listing not found")

    images = _scrape_images(row.url)

    # Evict expired entries before inserting so the cache does not grow
    # without bound. Iterate over a snapshot because the dict is mutated.
    for key, (stored_at, _) in list(_cache.items()):
        if now - stored_at >= _CACHE_TTL:
            _cache.pop(key, None)

    # NOTE(review): _scrape_images returns [] on a fetch failure as well as
    # for a page without photos, so a transient failure is cached for the
    # full TTL — confirm whether that is intended.
    _cache[global_id] = (now, images)

    return {"images": images}
|