towards batch download
This commit is contained in:
@@ -4,10 +4,14 @@ from typing import Any
|
||||
|
||||
import requests_cache
|
||||
from config import APP
|
||||
from partitions import location_partitions_def
|
||||
from partitions import (
|
||||
latitude_partitions_def,
|
||||
location_partitions_def,
|
||||
longitude_partitions_def,
|
||||
)
|
||||
from requests import Request
|
||||
from retry_requests import retry
|
||||
from utils import parse_coord
|
||||
from utils import parse_coordinate_str
|
||||
|
||||
import dagster as dg
|
||||
|
||||
@@ -22,7 +26,7 @@ asset = partial(dg.asset, key_prefix=APP)
|
||||
def raw_weather(context: dg.AssetExecutionContext) -> Any:
|
||||
"""Asset to fetch raw weather data for each location."""
|
||||
partition_key = context.partition_key
|
||||
lat, lon = parse_coord(partition_key)
|
||||
lat, lon = parse_coordinate_str(partition_key)
|
||||
context.log.info(
|
||||
f"Fetching weather data for location ({partition_key}): {lat}, {lon}"
|
||||
)
|
||||
@@ -96,3 +100,20 @@ def raw_weather(context: dg.AssetExecutionContext) -> Any:
|
||||
"path_suffix": [date_str, time_str],
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@asset(
    io_manager_key="json_io_manager",
    partitions_def=latitude_partitions_def,
    name="raw_batch",
)
def raw_weather_batch(context: dg.AssetExecutionContext) -> None:
    """Log the keys currently registered in each dynamic partition set.

    Work in progress towards a batch download: for the location, latitude and
    longitude partition definitions, the registered keys are read from the
    Dagster instance and reported. No weather data is fetched or returned yet.

    Args:
        context: Dagster execution context; supplies the instance used to
            look up dynamic partitions and the logger.
    """
    for partitions_def in (
        location_partitions_def,
        latitude_partitions_def,
        longitude_partitions_def,
    ):
        existing_keys = set(
            context.instance.get_dynamic_partitions(partitions_def.name)
        )
        # Report through the Dagster logger (as raw_weather does) instead of
        # the `ic` debug helper, so the output ends up in the run logs and the
        # asset does not depend on a debugging package being available.
        context.log.info(
            f"{partitions_def.name}: {len(existing_keys)} partition keys: "
            f"{sorted(existing_keys)}"
        )
|
||||
|
||||
@@ -25,5 +25,8 @@ definitions = dg.Definitions(
|
||||
"json_io_manager": JsonIOManager(base_dir=storage_dir),
|
||||
"polars_parquet_io_manager": PolarsParquetIOManager(base_dir=storage_dir),
|
||||
},
|
||||
sensors=[sensors.list_locations],
|
||||
sensors=[sensors.list_locations, sensors.list_latitudes, sensors.retrieve_weather],
|
||||
schedules=[
|
||||
# schedules.raw_weather_schedule
|
||||
],
|
||||
)
|
||||
|
||||
@@ -3,5 +3,9 @@ import assets
|
||||
import dagster as dg
|
||||
|
||||
raw_weather_job = dg.define_asset_job(
|
||||
name="weather_data_job", selection=[assets.raw_weather.key]
|
||||
name="raw_weather_job", selection=[assets.raw_weather.key]
|
||||
)
|
||||
|
||||
raw_weather_batch_job = dg.define_asset_job(
|
||||
name="raw_weather_batch_job", selection=[assets.raw_weather_batch.key]
|
||||
)
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
import dagster as dg
|
||||
|
||||
latitude_partitions_def = dg.DynamicPartitionsDefinition(name="latitudes")
|
||||
longitude_partitions_def = dg.DynamicPartitionsDefinition(name="longitudes")
|
||||
location_partitions_def = dg.DynamicPartitionsDefinition(name="locations")
|
||||
|
||||
11
apps/weather/src/schedules.py
Normal file
11
apps/weather/src/schedules.py
Normal file
@@ -0,0 +1,11 @@
|
||||
# build_schedule_from_partitioned_job does not work for raw_weather_job; Dagster raises:
|
||||
# Tried to build a partitioned schedule from an asset job, but received an invalid partitions definition. The permitted partitions definitions are:
|
||||
# 1. TimeWindowPartitionsDefinition
|
||||
# 2. MultiPartitionsDefinition with a single TimeWindowPartitionsDefinition dimension
|
||||
# 3. StaticPartitionsDefinition
|
||||
# Instead, use a sensor to trigger materialization on a schedule
|
||||
# raw_weather_schedule = dg.build_schedule_from_partitioned_job(
|
||||
# job=raw_weather_job,
|
||||
# # cron_schedule="0 * * * *",
|
||||
# default_status=dg.DefaultScheduleStatus.RUNNING,
|
||||
# )
|
||||
@@ -1,25 +1,40 @@
|
||||
import jobs
|
||||
import numpy as np
|
||||
from partitions import location_partitions_def
|
||||
from utils import format_coord
|
||||
from partitions import (
|
||||
latitude_partitions_def,
|
||||
location_partitions_def,
|
||||
longitude_partitions_def,
|
||||
)
|
||||
from utils import coordinate_to_str, latitude_to_str, longitude_to_str
|
||||
|
||||
import dagster as dg
|
||||
|
||||
lat_range = np.arange(51.0, 53.01, 0.25, dtype=float)
|
||||
lon_range = np.arange(3.0, 7.01, 0.25, dtype=float)
|
||||
|
||||
|
||||
@dg.sensor(job=jobs.raw_weather_job)
def list_locations(context: dg.SensorEvaluationContext) -> dg.SensorResult:
    """Sensor that registers 0.25-degree grid locations not yet seen as partitions.

    Builds the latitude, longitude and combined location keys for the grid
    spanned by ``lat_range`` x ``lon_range`` and requests that the unseen ones
    be added to their dynamic partition sets. At most three new locations are
    added per evaluation; all new latitudes and longitudes are added at once.

    Args:
        context: Sensor evaluation context; supplies the Dagster instance and
            the logger.

    Returns:
        A ``SensorResult`` carrying only dynamic-partition add requests.
    """
    instance = context.instance

    # Each key set must be compared against its OWN partition definition.
    # Looking all three up under the "locations" name made every latitude and
    # longitude appear new, because location keys are lat+lon concatenations.
    existing_latitudes = set(
        instance.get_dynamic_partitions(latitude_partitions_def.name)
    )
    existing_longitudes = set(
        instance.get_dynamic_partitions(longitude_partitions_def.name)
    )
    existing_coordinates = set(
        instance.get_dynamic_partitions(location_partitions_def.name)
    )

    latitudes = [latitude_to_str(lat) for lat in lat_range]
    longitudes = [longitude_to_str(lon) for lon in lon_range]
    locations = [coordinate_to_str(lat, lon) for lat in lat_range for lon in lon_range]

    new_latitudes = [lat for lat in latitudes if lat not in existing_latitudes]
    new_longitudes = [lon for lon in longitudes if lon not in existing_longitudes]
    new_locations = [
        location for location in locations if location not in existing_coordinates
    ]
    if new_locations:
        context.log.info(f"Discovered {len(new_locations)} new locations.")

    # Limit to 3 new locations
    selected = new_locations[:3]
    return dg.SensorResult(
        # No runs are requested here; this sensor only registers partitions.
        run_requests=[],
        dynamic_partitions_requests=[
            location_partitions_def.build_add_request(selected),
            latitude_partitions_def.build_add_request(new_latitudes),
            longitude_partitions_def.build_add_request(new_longitudes),
        ],
    )
|
||||
|
||||
|
||||
@dg.sensor(job=jobs.raw_weather_batch_job)
def list_latitudes(context: dg.SensorEvaluationContext) -> dg.SensorResult:
    """Sensor that registers unseen grid latitudes and requests a batch run for each.

    Args:
        context: Sensor evaluation context; supplies the Dagster instance used
            to look up the registered latitude partitions.

    Returns:
        A ``SensorResult`` with one ``RunRequest`` per new latitude key and a
        request to add those keys to the latitude partition set.
    """
    # Compare against the latitude partition set. Reading the "locations" set
    # here meant no latitude key was ever found, so every latitude was treated
    # as new and a run was requested for each on every evaluation.
    existing_latitudes = set(
        context.instance.get_dynamic_partitions(latitude_partitions_def.name)
    )
    latitudes = [latitude_to_str(lat) for lat in lat_range]
    new_latitudes = [lat for lat in latitudes if lat not in existing_latitudes]
    return dg.SensorResult(
        run_requests=[
            dg.RunRequest(partition_key=partition_key)
            for partition_key in new_latitudes
        ],
        dynamic_partitions_requests=[
            latitude_partitions_def.build_add_request(new_latitudes)
        ],
    )
|
||||
|
||||
|
||||
@dg.sensor(job=jobs.raw_weather_job, minimum_interval_seconds=60 * 60)
def retrieve_weather(context: dg.SensorEvaluationContext) -> dg.SensorResult:
    """Request a weather run for every registered location partition.

    The sensor is declared with a minimum interval of one hour. On each
    evaluation it reads the keys currently registered in the location
    partition set and emits one run request per key.

    Args:
        context: Sensor evaluation context; its instance is queried for the
            dynamic partition keys.

    Returns:
        A ``SensorResult`` containing one ``RunRequest`` per existing
        location partition key.
    """
    registered = context.instance.get_dynamic_partitions(
        location_partitions_def.name
    )
    location_keys = set(registered)

    requests = []
    for key in location_keys:
        requests.append(dg.RunRequest(partition_key=key))
    return dg.SensorResult(run_requests=requests)
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import unittest
|
||||
|
||||
from utils import format_coord, parse_coord
|
||||
from utils import coordinate_to_str, parse_coordinate_str
|
||||
|
||||
|
||||
class TestCoordinateFormatting(unittest.TestCase):
|
||||
@@ -15,20 +15,20 @@ class TestCoordinateFormatting(unittest.TestCase):
|
||||
]
|
||||
for lat, lon, expected in cases:
|
||||
with self.subTest(lat=lat, lon=lon):
|
||||
self.assertEqual(format_coord(lat, lon), expected)
|
||||
parsed_lat, parsed_lon = parse_coord(expected)
|
||||
self.assertEqual(coordinate_to_str(lat, lon), expected)
|
||||
parsed_lat, parsed_lon = parse_coordinate_str(expected)
|
||||
print(f"Parsed: {parsed_lat}, {parsed_lon} from {expected}")
|
||||
self.assertAlmostEqual(parsed_lat, lat, places=6)
|
||||
self.assertAlmostEqual(parsed_lon, lon, places=6)
|
||||
|
||||
def test_invalid_length(self):
|
||||
with self.assertRaises(ValueError):
|
||||
parse_coord("N52E4") # too short, malformed
|
||||
parse_coordinate_str("N52E4") # too short, malformed
|
||||
|
||||
def test_negative_coordinates(self):
|
||||
coord = format_coord(-52.25, -4.0)
|
||||
coord = coordinate_to_str(-52.25, -4.0)
|
||||
self.assertEqual(coord, "S5225W0400")
|
||||
lat, lon = parse_coord(coord)
|
||||
lat, lon = parse_coordinate_str(coord)
|
||||
self.assertAlmostEqual(lat, -52.25, places=6)
|
||||
self.assertAlmostEqual(lon, -4.0, places=6)
|
||||
|
||||
|
||||
@@ -1,23 +1,33 @@
|
||||
import re
|
||||
|
||||
|
||||
def format_coord(latitude: float, longitude: float) -> str:
|
||||
def component_to_str(value: float, pos_letter: str, neg_letter: str) -> str:
    """Convert a coordinate value to a string with a hemisphere indicator.

    The magnitude is written as whole degrees (at least two digits) followed
    by two digits of hundredths of a degree, e.g. ``52.25`` -> ``"N5225"``.

    Args:
        value: Signed coordinate in decimal degrees.
        pos_letter: Hemisphere letter used when ``value >= 0``.
        neg_letter: Hemisphere letter used when ``value < 0``.

    Returns:
        The hemisphere letter followed by the degrees and hundredths digits.
    """
    hemi = pos_letter if value >= 0 else neg_letter
    # Round the whole magnitude to hundredths before splitting it, so that a
    # fraction rounding up to 100 carries into the degrees (52.999 -> "N5300")
    # instead of producing a malformed three-digit fraction ("N52100").
    hundredths = int(round(abs(value) * 100))
    degrees, fraction = divmod(hundredths, 100)  # .25 becomes 25
    return f"{hemi}{degrees:02d}{fraction:02d}"
|
||||
|
||||
|
||||
def latitude_to_str(latitude: float) -> str:
    """Render a latitude with its hemisphere letter ("N" if >= 0, else "S")."""
    north, south = "N", "S"
    return component_to_str(latitude, north, south)
|
||||
|
||||
|
||||
def longitude_to_str(latitude: float) -> str:
    """Render a longitude with its hemisphere letter ("E" if >= 0, else "W").

    Note: despite its name, the ``latitude`` parameter carries the longitude
    value; the name is left unchanged so existing keyword callers keep working.
    """
    east, west = "E", "W"
    return component_to_str(latitude, east, west)
|
||||
|
||||
|
||||
def coordinate_to_str(latitude: float, longitude: float) -> str:
    """Format latitude and longitude into a string with hemisphere indicators.

    The result is the latitude string immediately followed by the longitude
    string, e.g. ``(-52.25, -4.0)`` -> ``"S5225W0400"``.

    Args:
        latitude: Signed latitude in decimal degrees.
        longitude: Signed longitude in decimal degrees.

    Returns:
        The concatenated latitude and longitude strings.
    """
    # The per-component formatting lives in latitude_to_str/longitude_to_str;
    # the former nested helper duplicated it and its results were overwritten.
    lat_str = latitude_to_str(latitude)
    lon_str = longitude_to_str(longitude)
    return f"{lat_str}{lon_str}"
|
||||
|
||||
|
||||
def parse_coord(coord: str) -> tuple[float, float]:
|
||||
def parse_coordinate_str(coord: str) -> tuple[float, float]:
|
||||
"""
|
||||
Parse a formatted coordinate string (e.g. 'N5225E0040' or 'S1350W12200')
|
||||
back into (latitude, longitude) float values.
|
||||
|
||||
Reference in New Issue
Block a user