import re
from collections.abc import Iterator
from datetime import date, datetime

from bs4 import BeautifulSoup

# Day (1-2 digits), optional ordinal suffix, a month word, and a 4-digit year,
# e.g. "3rd March 2024" or "21 Jan 2025".
_DATE_PATTERN = re.compile(r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})")

# strptime layouts tried in order: full month name first, then abbreviated.
_DATE_FORMATS = ("%d %B %Y", "%d %b %Y")


def extract_date(page_source: str) -> Iterator[date]:
    """Yield the first date found in the <div> that follows the page's <header>.

    The HTML is parsed with BeautifulSoup's "html.parser". The text of the
    first <div> sibling after the first <header> is scanned for
    "<day>[st|nd|rd|th] <month> <year>" candidates; the first candidate that
    parses as a real calendar date is yielded and the generator stops.

    Args:
        page_source: Raw HTML of the page.

    Yields:
        At most one value: a ``datetime`` at midnight for the matched day
        (``datetime`` is a subclass of ``date``, so it satisfies the
        annotation). Nothing is yielded when the <header>, the following
        <div>, or a parseable date is missing.

    Note:
        Candidates whose month word is not a month name (e.g. "3 items 2020")
        or that are not valid calendar dates (e.g. "31 February 2024") are
        skipped instead of raising ``ValueError``.
    """
    soup = BeautifulSoup(page_source, "html.parser")

    # Find the first <div> sibling after the <header>.
    if (header := soup.find("header")) and (div := header.find_next_sibling("div")):
        for match in _DATE_PATTERN.finditer(div.text):
            # The ordinal suffix (second group) is irrelevant for parsing.
            day, _, month, year = match.groups()
            candidate = f"{day} {month} {year}"
            for fmt in _DATE_FORMATS:
                try:
                    date_obj = datetime.strptime(candidate, fmt)
                except ValueError:
                    # Wrong month spelling for this layout, or impossible date.
                    continue
                yield date_obj
                # Only the first valid date is reported.
                return