use playwright to scrape stocks page
@@ -1,7 +1,11 @@
import re
from collections.abc import Iterator
from datetime import date
from datetime import datetime

import jobs
import requests
from bs4 import BeautifulSoup
from config import URL

import dagster as dg
@@ -10,6 +14,34 @@ import dagster as dg
def check_update(context: dg.SensorEvaluationContext) -> Iterator[dg.RunRequest]:
    ic(context.cursor)

    yield dg.RunRequest()
    response = requests.get(URL)
    response.raise_for_status()

    context.update_cursor(date.today().strftime("%Y-%m-%d"))
    try:
        # Parse the fetched page with BeautifulSoup
        soup = BeautifulSoup(response.text, "html.parser")

        # Find the first <div> after </header>
        if (header := soup.find("header")) and (div := header.find_next_sibling("div")):
            # Extract the date part using a regex
            match = re.search(
                r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})", div.text
            )
            if match:
                day, _, month, year = match.groups()
                # Parse the extracted day/month/year into an ISO date string
                parsed = datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
                date_str = parsed.strftime("%Y-%m-%d")
                context.log.info(f"Found date: {date_str}")

                if date_str > context.cursor:
                    context.update_cursor(date_str)
                    yield dg.RunRequest()
                return
    except Exception as e:
        context.log.error(f"Parsing error: {e}")

    now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    file = f"{now_str} stocks.html"
    context.log.info(f"Saving file: {file}")
    with open(f"/cache/{file}", "w") as fp:
        fp.write(response.text)
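
The commit message mentions Playwright, while the hunk above still fetches the page with requests.get(URL). A minimal sketch of what a Playwright-based fetch could look like, assuming the synchronous API and a headless Chromium (the fetch_stocks_html helper name is hypothetical, not part of this commit):

from playwright.sync_api import sync_playwright


def fetch_stocks_html(url: str) -> str:
    # Launch a headless Chromium, load the page, and return the rendered HTML.
    # Assumes the browser has been provisioned with `playwright install chromium`.
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto(url)
        html = page.content()
        browser.close()
        return html

Inside check_update, response.text could then be replaced by fetch_stocks_html(URL); there is no direct equivalent of response.raise_for_status(), but the Response object returned by page.goto(url) exposes a status that could be checked instead.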