20 lines
683 B
Python
20 lines
683 B
Python
import re
|
|
from collections.abc import Iterator
|
|
from datetime import date, datetime
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
# Day (optionally with an ordinal suffix), a month word, and a 4-digit year,
# e.g. "3rd March 2024" or "14 Jan 2023".
_DATE_PATTERN = re.compile(r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})")

# The month word may be spelled out ("January") or abbreviated ("Jan").
_DATE_FORMATS = ("%d %B %Y", "%d %b %Y")


def extract_date(page_source: str) -> Iterator[date]:
    """Yield the date written in the first ``<div>`` sibling after ``<header>``.

    The page is parsed with BeautifulSoup's ``html.parser``. The text of the
    first ``<div>`` that follows the first ``<header>`` as a sibling is
    scanned for "<day>[st|nd|rd|th] <month> <year>" candidates, and the first
    candidate that is a real calendar date is yielded.

    Args:
        page_source: HTML markup of the page.

    Yields:
        At most one ``datetime.date``. Nothing is yielded when the page has no
        ``<header>``, the header has no following ``<div>`` sibling, or the
        div's text contains no parsable date.
    """
    soup = BeautifulSoup(page_source, "html.parser")

    header = soup.find("header")
    if not header:
        return

    div = header.find_next_sibling("div")
    if not div:
        return

    # The pattern is deliberately loose ("12 items 2024" also matches), so walk
    # every candidate instead of trusting the first regex hit.
    for match in _DATE_PATTERN.finditer(div.text):
        day, _suffix, month, year = match.groups()
        for fmt in _DATE_FORMATS:
            try:
                parsed = datetime.strptime(f"{day} {month} {year}", fmt)
            except ValueError:
                # Not a month name in this format, or an impossible day/year.
                continue
            # strptime returns a datetime; the declared contract is a date.
            yield parsed.date()
            return
|