feat: add MLflow and LightGBM model
This commit is contained in:
@@ -5,8 +5,8 @@ WORKDIR /app
|
||||
|
||||
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
|
||||
|
||||
# Install system dependencies (git is required by elementary/dbt deps)
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/*
|
||||
# Install system dependencies (git is required by elementary/dbt deps, libgomp1 for LightGBM)
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends git libgomp1 && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install dependencies before copying full source (layer caching)
|
||||
COPY pyproject.toml uv.lock* ./
|
||||
|
||||
5
data_platform/assets/ml/__init__.py
Normal file
5
data_platform/assets/ml/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Machine-learning assets."""
|
||||
|
||||
from data_platform.assets.ml.elo_model import elo_prediction_model
|
||||
|
||||
__all__ = ["elo_prediction_model"]
|
||||
264
data_platform/assets/ml/elo_model.py
Normal file
264
data_platform/assets/ml/elo_model.py
Normal file
@@ -0,0 +1,264 @@
|
||||
"""LightGBM model to predict ELO ratings for Funda listings."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import mlflow
|
||||
import mlflow.lightgbm
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from dagster import (
|
||||
AssetExecutionContext,
|
||||
Config,
|
||||
MaterializeResult,
|
||||
MetadataValue,
|
||||
asset,
|
||||
)
|
||||
from lightgbm import LGBMRegressor
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
from data_platform.helpers import render_sql
|
||||
from data_platform.resources import MLflowResource, PostgresResource
|
||||
|
||||
# Directory holding the Jinja-templated SQL used by this asset.
_SQL_DIR = Path(__file__).parent / "sql"

# Energy label → ordinal int (higher = better)
ENERGY_LABEL_MAP: dict[str | None, int] = {
    "A5": 10,
    "A4": 9,
    "A3": 8,
    "A2": 7,
    "A1": 6,
    "A": 5,
    "B": 4,
    "C": 3,
    "D": 2,
    "E": 1,
    "F": 0,
    "G": -1,
}

# Keep only this many of the most recent MLflow runs per experiment;
# older runs are deleted by _cleanup_old_runs after each training run.
_MAX_RETAINED_RUNS = 3

# Numeric columns read directly from the listings query.
NUMERIC_FEATURES = [
    "current_price",
    "living_area",
    "plot_area",
    "bedrooms",
    "rooms",
    "construction_year",
    "latitude",
    "longitude",
    "photo_count",
    "views",
    "saves",
    "price_per_sqm",
]
# Boolean amenity flags; cast to 0/1 ints during preprocessing.
BOOL_FEATURES = [
    "has_garden",
    "has_balcony",
    "has_solar_panels",
    "has_heat_pump",
    "has_roof_terrace",
    "is_energy_efficient",
    "is_monument",
]
# Columns computed in _preprocess rather than read from the database.
DERIVED_FEATURES = [
    "energy_label_num",
]

# Canonical feature column order used for both training and the logged
# input example — keep consumers in sync with this ordering.
ALL_FEATURES = NUMERIC_FEATURES + BOOL_FEATURES + DERIVED_FEATURES
|
||||
|
||||
|
||||
class EloModelConfig(Config):
    """Training hyper-parameters and options."""

    # Fraction of rows held out for evaluation.
    test_size: float = 0.2
    # Seed shared by the train/test split and LightGBM for reproducibility.
    random_state: int = 42
    # Minimum pairwise comparisons a listing needs to enter the training set
    # (bound into the SQL query as :min_comparisons).
    min_comparisons: int = 5
    # LightGBM hyper-parameters (passed straight to LGBMRegressor).
    n_estimators: int = 200
    learning_rate: float = 0.05
    max_depth: int = 6
    num_leaves: int = 31
    # Name of the MLflow experiment the run is logged under.
    mlflow_experiment: str = "elo-rating-prediction"
|
||||
|
||||
|
||||
def _preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Convert raw columns to model-ready numeric features.

    Mutates *df* in place and returns it: adds ``energy_label_num``,
    casts boolean flags to 0/1 ints, and fills numeric gaps with each
    column's median (0 when the column is entirely missing).
    """
    # Normalise the energy label text, map to its ordinal, and default
    # unknown/missing labels to -2 (one below the worst known label).
    labels = df["energy_label"].str.strip().str.upper()
    df["energy_label_num"] = labels.map(ENERGY_LABEL_MAP).fillna(-2).astype(int)

    # Boolean flags: treat missing as False, then encode as 0/1.
    for flag in BOOL_FEATURES:
        df[flag] = df[flag].fillna(False).astype(int)

    # Numeric columns: coerce bad values to NaN, then median-impute.
    for name in NUMERIC_FEATURES:
        values = pd.to_numeric(df[name], errors="coerce")
        fallback = values.median()
        df[name] = values.fillna(fallback if pd.notna(fallback) else 0)

    return df
|
||||
|
||||
|
||||
@asset(
    deps=["elo_ratings", "funda_listings"],
    group_name="ml",
    kinds={"python", "mlflow", "lightgbm"},
    tags={"manual": "true"},  # materialised on demand, not by a schedule
    description=(
        "Train a LightGBM regressor to predict normalised ELO rating from "
        "listing features. Logs the model, parameters and metrics to MLflow."
    ),
)
def elo_prediction_model(
    context: AssetExecutionContext,
    config: EloModelConfig,
    postgres: PostgresResource,
    mlflow_resource: MLflowResource,
) -> MaterializeResult:
    """Train a LightGBM ELO-rating regressor and log it to MLflow.

    Loads listings joined to their ELO ratings from Postgres, engineers
    features via ``_preprocess``, fits on a train/test split, logs
    params/metrics/model to a new MLflow run, prunes runs beyond the
    retention limit, and returns evaluation metrics as Dagster metadata.

    Raises:
        ValueError: if fewer than 10 sufficiently-compared listings exist.
    """
    # Fetch training data
    engine = postgres.get_engine()
    query = render_sql(_SQL_DIR, "select_training_data.sql")
    # NOTE(review): the SQL uses a named :min_comparisons bind parameter;
    # whether pd.read_sql forwards it correctly depends on the
    # pandas/SQLAlchemy/driver combination — verify binding against this engine.
    df = pd.read_sql(
        query,
        engine,
        params={"min_comparisons": config.min_comparisons},
    )
    context.log.info(f"Loaded {len(df)} listings with ELO ratings.")

    # Guard: a split/fit on fewer than 10 rows would be meaningless.
    if len(df) < 10:
        raise ValueError(
            f"Not enough rated listings ({len(df)}). "
            "Need at least 10 rows with sufficient comparisons."
        )

    # Preprocess and normalise ELO target: centre on the conventional 1500
    # baseline and scale by 100 so the regression target is roughly unit-scale.
    df = _preprocess(df)
    df["elo_norm"] = (df["elo_rating"] - 1500) / 100

    X = df[ALL_FEATURES].copy()
    y = df["elo_norm"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=config.test_size, random_state=config.random_state
    )
    context.log.info(f"Train set: {len(X_train)} rows, test set: {len(X_test)} rows.")

    # Train model — point MLflow at the configured tracking server first.
    mlflow.set_tracking_uri(mlflow_resource.get_tracking_uri())
    mlflow.set_experiment(config.mlflow_experiment)

    with mlflow.start_run() as run:
        model = LGBMRegressor(
            n_estimators=config.n_estimators,
            learning_rate=config.learning_rate,
            max_depth=config.max_depth,
            num_leaves=config.num_leaves,
            random_state=config.random_state,
            verbosity=-1,  # silence per-iteration LightGBM logging
        )
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="rmse",
        )

        # Evaluate on the held-out split.
        y_pred = model.predict(X_test)
        rmse = float(np.sqrt(np.mean((y_test - y_pred) ** 2)))
        mae = float(np.mean(np.abs(y_test - y_pred)))
        # R² computed manually: 1 - SS_res / SS_tot.
        r2 = float(
            1 - np.sum((y_test - y_pred) ** 2) / np.sum((y_test - y_test.mean()) ** 2)
        )

        context.log.info(f"RMSE: {rmse:.4f} MAE: {mae:.4f} R²: {r2:.4f}")

        # Log params, metrics and model to MLflow
        mlflow.log_params(
            {
                "n_estimators": config.n_estimators,
                "learning_rate": config.learning_rate,
                "max_depth": config.max_depth,
                "num_leaves": config.num_leaves,
                "test_size": config.test_size,
                "min_comparisons": config.min_comparisons,
                "train_rows": len(X_train),
                "test_rows": len(X_test),
                "features": ", ".join(ALL_FEATURES),
            }
        )
        mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})

        # Per-feature importances as individual metrics so they are
        # comparable across runs in the MLflow UI.
        importances = dict(
            zip(ALL_FEATURES, model.feature_importances_.tolist(), strict=False)
        )
        for feat, imp in importances.items():
            mlflow.log_metric(f"importance_{feat}", imp)

        mlflow.lightgbm.log_model(
            model,
            artifact_path="elo_lgbm_model",
            input_example=X_test.iloc[:1],  # lets MLflow infer the model signature
        )

        run_id = run.info.run_id

    context.log.info(
        f"MLflow run {run_id} logged to experiment '{config.mlflow_experiment}'."
    )

    # Delete old runs beyond retention limit
    _cleanup_old_runs(config.mlflow_experiment, context)

    # Build feature importance table for Dagster metadata
    imp_sorted = sorted(importances.items(), key=lambda x: x[1], reverse=True)
    imp_md = "| Feature | Importance |\n|---|---|\n"
    imp_md += "\n".join(f"| {f} | {v} |" for f, v in imp_sorted)

    return MaterializeResult(
        metadata={
            "mlflow_run_id": MetadataValue.text(run_id),
            "mlflow_experiment": MetadataValue.text(config.mlflow_experiment),
            "train_rows": len(X_train),
            "test_rows": len(X_test),
            "rmse": MetadataValue.float(rmse),
            "mae": MetadataValue.float(mae),
            "r2": MetadataValue.float(r2),
            "feature_importances": MetadataValue.md(imp_md),
        }
    )
|
||||
|
||||
|
||||
def _cleanup_old_runs(
    experiment_name: str,
    context: AssetExecutionContext,
    keep: int = _MAX_RETAINED_RUNS,
) -> None:
    """Delete oldest MLflow runs, keeping only the most recent *keep*."""
    client = mlflow.tracking.MlflowClient()
    experiment = client.get_experiment_by_name(experiment_name)
    # Nothing to prune if the experiment was never created.
    if experiment is None:
        return

    # Newest first, so everything past index `keep` is stale.
    all_runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=["start_time DESC"],
    )
    stale_runs = all_runs[keep:]
    if not stale_runs:
        return

    for stale in stale_runs:
        context.log.info(f"Deleting old MLflow run {stale.info.run_id}")
        client.delete_run(stale.info.run_id)

    context.log.info(
        f"Retained {keep} runs, deleted {len(stale_runs)} old run(s) "
        f"from experiment '{experiment_name}'."
    )
|
||||
26
data_platform/assets/ml/sql/select_training_data.sql
Normal file
26
data_platform/assets/ml/sql/select_training_data.sql
Normal file
@@ -0,0 +1,26 @@
|
||||
-- Training rows for the ELO prediction model: one row per listing that has
-- an ELO rating backed by at least :min_comparisons pairwise comparisons.
select
    fl.global_id,
    fl.current_price,
    fl.living_area,
    fl.plot_area,
    fl.bedrooms,
    fl.rooms,
    fl.construction_year,
    fl.latitude,
    fl.longitude,
    fl.energy_label,
    fl.has_garden,
    fl.has_balcony,
    fl.has_solar_panels,
    fl.has_heat_pump,
    fl.has_roof_terrace,
    fl.is_energy_efficient,
    fl.is_monument,
    fl.photo_count,
    fl.views,
    fl.saves,
    fl.price_per_sqm,
    er.elo_rating
from marts.funda_listings as fl
inner join elo.ratings as er on fl.global_id = er.global_id
-- Fixed: ">=: min_comparisons" split the bind parameter; the colon must be
-- attached to the parameter name (":min_comparisons") for named binding.
where er.comparison_count >= :min_comparisons
|
||||
@@ -12,13 +12,14 @@ from data_platform.assets.ingestion.funda import (
|
||||
raw_funda_price_history,
|
||||
raw_funda_search_results,
|
||||
)
|
||||
from data_platform.assets.ml import elo_prediction_model
|
||||
from data_platform.helpers import apply_automation
|
||||
from data_platform.jobs import (
|
||||
elementary_refresh_job,
|
||||
funda_ingestion_job,
|
||||
funda_raw_quality_job,
|
||||
)
|
||||
from data_platform.resources import FundaResource, PostgresResource
|
||||
from data_platform.resources import FundaResource, MLflowResource, PostgresResource
|
||||
from data_platform.schedules import (
|
||||
elementary_refresh_schedule,
|
||||
funda_ingestion_schedule,
|
||||
@@ -34,6 +35,7 @@ defs = Definitions(
|
||||
raw_funda_price_history,
|
||||
elo_ratings,
|
||||
elo_comparisons,
|
||||
elo_prediction_model,
|
||||
]
|
||||
),
|
||||
jobs=[funda_ingestion_job, funda_raw_quality_job, elementary_refresh_job],
|
||||
@@ -53,5 +55,6 @@ defs = Definitions(
|
||||
"dbt": DbtCliResource(project_dir=str(DBT_PROJECT_DIR)),
|
||||
"funda": FundaResource(),
|
||||
"postgres": PostgresResource(),
|
||||
"mlflow_resource": MLflowResource(),
|
||||
},
|
||||
)
|
||||
|
||||
@@ -37,3 +37,12 @@ class PostgresResource(ConfigurableResource):
|
||||
engine = self.get_engine()
|
||||
with engine.begin() as conn:
|
||||
conn.execute(text(statement), rows)
|
||||
|
||||
|
||||
class MLflowResource(ConfigurableResource):
|
||||
"""MLflow experiment tracking resource."""
|
||||
|
||||
tracking_uri: str = EnvVar("MLFLOW_TRACKING_URI")
|
||||
|
||||
def get_tracking_uri(self) -> str:
|
||||
return self.tracking_uri
|
||||
|
||||
@@ -77,6 +77,10 @@ models:
|
||||
- name: energy_label
|
||||
description: Dutch energy performance label (A–G).
|
||||
data_type: text
|
||||
data_tests:
|
||||
- accepted_values:
|
||||
values: ["A5", "A4", "A3", "A2", "A1", "A", "B", "C", "D", "E", "F", "G"]
|
||||
where: "energy_label is not null"
|
||||
- name: living_area
|
||||
description: Interior floor area in m².
|
||||
data_type: integer
|
||||
|
||||
@@ -89,7 +89,7 @@ models:
|
||||
data_type: text
|
||||
data_tests:
|
||||
- accepted_values:
|
||||
values: ["A4", "A3", "A2", "A1", "A", "B", "C", "D", "E", "F", "G"]
|
||||
values: ["A5", "A4", "A3", "A2", "A1", "A", "B", "C", "D", "E", "F", "G"]
|
||||
where: "energy_label is not null"
|
||||
- name: living_area
|
||||
description: Interior floor area in m².
|
||||
|
||||
@@ -3,6 +3,7 @@ x-dagster: &dagster-common
|
||||
env_file: .env
|
||||
environment:
|
||||
DAGSTER_HOME: /app/dagster_home
|
||||
MLFLOW_TRACKING_URI: http://mlflow:5000
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
@@ -95,6 +96,22 @@ services:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
|
||||
# MLflow experiment tracking server
|
||||
mlflow:
|
||||
image: python:3.12-slim
|
||||
container_name: mlflow
|
||||
restart: unless-stopped
|
||||
env_file: .env
|
||||
entrypoint: ["sh", "/mlflow/start.sh"]
|
||||
volumes:
|
||||
- mlflow-artifacts:/mlflow/artifacts
|
||||
- ./mlflow/start.sh:/mlflow/start.sh:ro
|
||||
ports:
|
||||
- "5000:5000"
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
|
||||
# Elementary data observability report
|
||||
elementary-web:
|
||||
image: nginx:alpine
|
||||
@@ -111,3 +128,4 @@ volumes:
|
||||
dbt-target:
|
||||
pgadmin-data:
|
||||
elementary-reports:
|
||||
mlflow-artifacts:
|
||||
|
||||
35
mlflow/start.sh
Executable file
35
mlflow/start.sh
Executable file
@@ -0,0 +1,35 @@
|
||||
#!/bin/sh
# Entrypoint for the MLflow tracking-server container: install MLflow at
# container start, make sure its Postgres database exists, then exec the
# server in the foreground.
# Requires POSTGRES_USER / POSTGRES_PASSWORD / POSTGRES_DB in the environment
# (they are shell-interpolated into the inline Python source below).
set -e

pip install --quiet mlflow psycopg2-binary

# Ensure the mlflow database exists before starting the server.
# docker-entrypoint-initdb.d scripts only run on first init, so this
# handles existing postgres volumes where the database was never created.
# (AUTOCOMMIT is required because CREATE DATABASE cannot run in a transaction.)
python -c "
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

conn = psycopg2.connect(
    host='postgres',
    port=5432,
    user='${POSTGRES_USER}',
    password='${POSTGRES_PASSWORD}',
    dbname='${POSTGRES_DB}',
)
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
cur = conn.cursor()
cur.execute(\"SELECT 1 FROM pg_database WHERE datname = 'mlflow'\")
if not cur.fetchone():
    cur.execute('CREATE DATABASE mlflow')
    print('Created mlflow database')
else:
    print('mlflow database already exists')
conn.close()
"

# exec so mlflow becomes PID 1 and receives container stop signals directly.
exec mlflow server \
    --host=0.0.0.0 \
    --port=5000 \
    --backend-store-uri="postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/mlflow" \
    --default-artifact-root=/mlflow/artifacts
|
||||
@@ -12,6 +12,10 @@ dependencies = [
|
||||
"pyfunda",
|
||||
"jinja2",
|
||||
"elementary-data[postgres]",
|
||||
"mlflow",
|
||||
"lightgbm",
|
||||
"scikit-learn",
|
||||
"pandas",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
|
||||
Reference in New Issue
Block a user