daily parsing of raw weather

This commit is contained in:
2025-08-04 16:27:36 +02:00
parent 1bea1d2bb8
commit b3121b4f32
2 changed files with 47 additions and 1 deletions

View File

@@ -1,10 +1,14 @@
from collections.abc import Iterator
from datetime import datetime, timezone from datetime import datetime, timezone
from functools import partial from functools import partial
from pathlib import Path
from typing import Any from typing import Any
import polars as pl
import requests_cache import requests_cache
from config import APP from config import APP
from partitions import ( from partitions import (
daily_partitions_def,
latitude_partitions_def, latitude_partitions_def,
location_partitions_def, location_partitions_def,
longitude_partitions_def, longitude_partitions_def,
@@ -120,7 +124,7 @@ def raw_weather_batch_longitude() -> None:
io_manager_key="json_io_manager", io_manager_key="json_io_manager",
partitions_def=latitude_partitions_def, partitions_def=latitude_partitions_def,
) )
def raw_weather_batch_latitude(context: dg.AssetExecutionContext): def raw_weather_batch_latitude(context: dg.AssetExecutionContext) -> None:
ic(context.resources._asdict().keys()) # contains json_io_manager ic(context.resources._asdict().keys()) # contains json_io_manager
ic(context.partition_key) ic(context.partition_key)
@@ -164,3 +168,40 @@ def raw_weather_batch_latitude(context: dg.AssetExecutionContext):
}, },
) )
) )
@asset(
name="parsed",
deps=[raw_weather],
io_manager_key="polars_parquet_io_manager",
partitions_def=daily_partitions_def,
output_required=False,
)
def parsed_weather(
context: dg.AssetExecutionContext,
) -> Iterator[dg.Output[pl.DataFrame]]:
base = (
Path(context.resources.polars_parquet_io_manager.base_dir).joinpath(
*raw_weather.key.path
)
/ context.partition_key
)
dfs = []
ic(base)
for path in Path(base).rglob("*.json"):
df = pl.read_json(path)
df = df.select(
*(
pl.lit(v).alias(k)
for k, v in zip(
("_latitude", "_longitude", "_time"),
path.relative_to(base).with_suffix("").parts,
)
),
*df.columns,
)
dfs.append(df)
if dfs:
yield dg.Output(pl.concat(dfs))

View File

@@ -1,5 +1,10 @@
import os
import dagster as dg import dagster as dg
latitude_partitions_def = dg.DynamicPartitionsDefinition(name="latitudes") latitude_partitions_def = dg.DynamicPartitionsDefinition(name="latitudes")
longitude_partitions_def = dg.DynamicPartitionsDefinition(name="longitudes") longitude_partitions_def = dg.DynamicPartitionsDefinition(name="longitudes")
location_partitions_def = dg.DynamicPartitionsDefinition(name="locations") location_partitions_def = dg.DynamicPartitionsDefinition(name="locations")
daily_partitions_def = dg.DailyPartitionsDefinition(
start_date="2025-08-01", end_offset=1, timezone=os.environ.get("TZ", "UTC")
)