use playwright to scrape stocks page
@@ -1,7 +1,11 @@
import re
from collections.abc import Iterator
from datetime import date
from datetime import datetime

import jobs
import requests
from bs4 import BeautifulSoup
from config import URL

import dagster as dg
@@ -10,6 +14,34 @@ import dagster as dg
def check_update(context: dg.SensorEvaluationContext) -> Iterator[dg.RunRequest]:
    ic(context.cursor)

    yield dg.RunRequest()
    response = requests.get(URL)
    response.raise_for_status()

    context.update_cursor(date.today().strftime("%Y-%m-%d"))
    try:
        # Parse the fetched page with BeautifulSoup
        soup = BeautifulSoup(response.text, "html.parser")

        # Find the first <div> after </header>
        if (header := soup.find("header")) and (div := header.find_next_sibling("div")):
            # Extract the date part using a regex
            match = re.search(
                r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})", div.text
            )
            if match:
                day, _, month, year = match.groups()
                # Parse the extracted day/month/year into an ISO date string
                parsed = datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
                date_str = parsed.strftime("%Y-%m-%d")
                context.log.info(f"Found date: {date_str}")

                if date_str > context.cursor:
                    context.update_cursor(date_str)
                    yield dg.RunRequest()
                return
    except Exception as e:
        context.log.error(f"Parsing error: {e}")

    now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    file = f"{now_str} stocks.html"
    context.log.info(f"Saving file: {file}")
    with open(f"/cache/{file}", "w") as fp:
        fp.write(response.text)
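
The commit message mentions Playwright, while the hunk above still fetches the page with requests.get(URL). A minimal sketch of what a Playwright-based fetch could look like, assuming the synchronous API and a headless Chromium (the fetch_stocks_html helper name is hypothetical, not part of this commit):

from playwright.sync_api import sync_playwright


def fetch_stocks_html(url: str) -> str:
    # Launch a headless Chromium, load the page, and return the rendered HTML.
    # Assumes the browser has been provisioned with `playwright install chromium`.
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto(url)
        html = page.content()
        browser.close()
        return html

Inside check_update, response.text could then be replaced by fetch_stocks_html(URL); there is no direct equivalent of response.raise_for_status(), but the Response object returned by page.goto(url) exposes a status that could be checked instead.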