"""Infer ELO scores for new listings using the best trained model."""

from pathlib import Path

import mlflow
import pandas as pd
from dagster import (
    AssetExecutionContext,
    AssetKey,
    Config,
    MaterializeResult,
    MetadataValue,
    asset,
)
from sqlalchemy import text

from data_platform.assets.ml.elo_model import (
    ALL_FEATURES,
    _preprocess,
)
from data_platform.helpers import render_sql
from data_platform.resources import MLflowResource, PostgresResource

# Directory holding the SQL templates rendered by this asset.
_SQL_DIR = Path(__file__).parent / "sql"

# Constants that undo the target normalisation applied during training
# (prediction is `elo_norm * scale + mean`).
# NOTE(review): these must stay in sync with the normalisation used when
# training elo_prediction_model — TODO confirm against elo_model.
_ELO_SCALE = 100
_ELO_MEAN = 1500


class EloInferenceConfig(Config):
    """Configuration for ELO inference.

    Selects the model to load: the run in ``mlflow_experiment`` with the
    best value of ``metric``, where "best" means smallest when
    ``ascending`` is True (e.g. RMSE) and largest otherwise.
    """

    # MLflow experiment to search for trained runs.
    mlflow_experiment: str = "elo-rating-prediction"
    # Metric key used to rank runs.
    metric: str = "rmse"
    # True when a lower metric value is better.
    ascending: bool = True


def _best_run(experiment_name: str, metric: str, ascending: bool):
    """Return the MLflow run with the best metric value.

    Args:
        experiment_name: Name of the MLflow experiment to search.
        metric: Metric key to order runs by (e.g. ``"rmse"``).
        ascending: If True, the run with the smallest metric wins.

    Returns:
        The single best ``mlflow.entities.Run``.

    Raises:
        ValueError: If the experiment does not exist or has no runs.
    """
    client = mlflow.tracking.MlflowClient()
    experiment = client.get_experiment_by_name(experiment_name)
    if experiment is None:
        raise ValueError(
            f"MLflow experiment '{experiment_name}' does not exist. "
            "Train the elo_prediction_model asset first."
        )
    order = "ASC" if ascending else "DESC"
    runs = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=[f"metrics.{metric} {order}"],
        max_results=1,
    )
    if not runs:
        raise ValueError(
            f"No runs found in experiment '{experiment_name}'. "
            "Train the elo_prediction_model asset first."
        )
    return runs[0]


def _ensure_tables(engine) -> None:
    """Create the target schema and predictions table if they do not exist."""
    with engine.begin() as conn:
        conn.execute(text(render_sql(_SQL_DIR, "ensure_elo_schema.sql")))
        conn.execute(text(render_sql(_SQL_DIR, "ensure_predictions_table.sql")))


def _load_best_model(
    context: AssetExecutionContext,
    config: EloInferenceConfig,
    mlflow_resource: MLflowResource,
):
    """Load the best LightGBM model from MLflow.

    Returns:
        A ``(model, run_id)`` tuple for the best run per *config*.
    """
    mlflow.set_tracking_uri(mlflow_resource.get_tracking_uri())
    best_run = _best_run(config.mlflow_experiment, config.metric, config.ascending)
    run_id = best_run.info.run_id
    model_uri = f"runs:/{run_id}/elo_lgbm_model"
    context.log.info(
        f"Loading model from run {run_id} "
        f"({config.metric}={best_run.data.metrics.get(config.metric, '?')})."
    )
    model = mlflow.lightgbm.load_model(model_uri)
    return model, run_id


def _write_predictions(engine, df: pd.DataFrame, run_id: str) -> int:
    """Upsert one prediction row per listing; return the row count."""
    rows = [
        {
            "global_id": row.global_id,
            "predicted_elo": float(row.predicted_elo),
            "mlflow_run_id": run_id,
        }
        for row in df.itertuples()
    ]
    upsert = render_sql(_SQL_DIR, "upsert_prediction.sql")
    with engine.begin() as conn:
        # executemany-style bulk upsert: one statement, list of params.
        conn.execute(text(upsert), rows)
    return len(rows)


@asset(
    deps=["elo_prediction_model", AssetKey(["marts", "funda_listings"])],
    group_name="ml",
    kinds={"python", "mlflow"},
    tags={"manual": "true"},
    description=(
        "Load the best ELO prediction model from MLflow and infer scores "
        "for all listings that have not been scored yet."
    ),
)
def elo_inference(
    context: AssetExecutionContext,
    config: EloInferenceConfig,
    postgres: PostgresResource,
    mlflow_resource: MLflowResource,
) -> MaterializeResult:
    """Score every not-yet-scored listing with the best trained model.

    Steps: ensure the target table exists, fetch unscored listings,
    load the best MLflow run's model, predict, and upsert the results.
    """
    engine = postgres.get_engine()

    # Ensure target table exists
    _ensure_tables(engine)

    # Fetch unscored listings
    query = render_sql(_SQL_DIR, "select_unscored_listings.sql")
    df = pd.read_sql(text(query), engine)
    context.log.info(f"Found {len(df)} unscored listings.")

    if df.empty:
        return MaterializeResult(
            metadata={
                "scored": 0,
                "status": MetadataValue.text("No new listings to score."),
            }
        )

    # Load best model
    model, run_id = _load_best_model(context, config, mlflow_resource)

    # Preprocess features identically to training
    df = _preprocess(df)
    X = df[ALL_FEATURES].copy()

    # Predict normalised ELO and convert back to original scale
    elo_norm = model.predict(X)
    df["predicted_elo"] = elo_norm * _ELO_SCALE + _ELO_MEAN

    # Write predictions
    scored = _write_predictions(engine, df, run_id)
    context.log.info(f"Wrote {scored} predictions (run {run_id}).")

    # Sample std of a single prediction is NaN; report 0.0 rather than
    # emitting a NaN metadata value.
    std = df["predicted_elo"].std()
    return MaterializeResult(
        metadata={
            "scored": scored,
            "mlflow_run_id": MetadataValue.text(run_id),
            "predicted_elo_mean": MetadataValue.float(
                float(df["predicted_elo"].mean())
            ),
            "predicted_elo_std": MetadataValue.float(
                0.0 if pd.isna(std) else float(std)
            ),
        }
    )