store and register scrape

This commit is contained in:
2025-07-29 17:22:42 +02:00
parent 0539dd9f7e
commit 02db619c6d
4 changed files with 72 additions and 26 deletions

View File

@@ -1,11 +1,10 @@
import re
from collections.abc import Iterator
from datetime import datetime
import jobs
import requests
from bs4 import BeautifulSoup
from config import URL
from utils import extract_date
import dagster as dg
@@ -18,25 +17,13 @@ def check_update(context: dg.SensorEvaluationContext) -> Iterator[dg.RunRequest]
response.raise_for_status()
try:
# Parse with BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")
# Find the first <div> after </header>
if (header := soup.find("header")) and (div := header.find_next_sibling("div")):
# Extract date part using regex
match = re.search(
r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})", div.text
)
if match:
day, _, month, year = match.groups()
date = datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
date_str = date.strftime("%Y-%m-%d ")
context.log.info(f"Found date: {date_str}")
if date_str > context.cursor:
context.update_cursor(date_str)
yield dg.RunRequest()
return
date_obj = next(extract_date(response.text))
date_str = date_obj.strftime("%Y-%m-%d")
context.log.info(f"Found date: {date_str}")
if date_str > context.cursor:
context.update_cursor(date_str)
yield dg.RunRequest()
return
except Exception as e:
context.log.error(f"Parsing error: {e}")