# dagster/apps/stocks/src/utils.py

import re
from collections.abc import Iterator
from datetime import date, datetime

from bs4 import BeautifulSoup


# Day (optionally followed by an ordinal suffix), a month word and a four-digit
# year, e.g. "3rd March 2024" or "12 Sep 2024".
_DATE_PATTERN = re.compile(r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})")

# strptime layouts tried in order: full month name first, then abbreviation.
# NOTE: %B / %b are matched against the current locale's month names.
_DATE_FORMATS = ("%d %B %Y", "%d %b %Y")


def extract_date(page_source: str) -> Iterator[date]:
    """Yield the date found in the first ``<div>`` sibling after ``<header>``.

    The page is parsed with BeautifulSoup's ``html.parser``. The text of the
    first ``<div>`` that follows the ``<header>`` element as a sibling is
    scanned for "<day>[st|nd|rd|th] <month> <year>" candidates, and the first
    candidate that forms a real calendar date is yielded.

    Args:
        page_source: Raw HTML of the page.

    Yields:
        At most one ``datetime`` (a ``date`` subclass) with the time set to
        midnight. Nothing is yielded when the page has no ``<header>``, the
        header has no following ``<div>`` sibling, or the div text contains
        no parseable date.
    """
    soup = BeautifulSoup(page_source, "html.parser")

    header = soup.find("header")
    if header is None:
        return

    div = header.find_next_sibling("div")
    if div is None:
        return

    for candidate in _DATE_PATTERN.finditer(div.text):
        # The ordinal suffix (second group) is irrelevant for parsing.
        day, _, month, year = candidate.groups()
        for fmt in _DATE_FORMATS:
            try:
                parsed = datetime.strptime(f"{day} {month} {year}", fmt)
            except ValueError:
                # Not a month name in this layout, or an impossible date
                # (e.g. "31 February 2024"): try the next layout/candidate.
                continue
            yield parsed
            return