use playwright to scrape stocks page

commit 586a4ae904
parent 8aa943b0bf
2025-07-29 11:31:26 +02:00
4 changed files with 94 additions and 56 deletions

assets.py

@@ -1,72 +1,74 @@
-import time
+import asyncio
 from functools import partial
 from config import APP, URL
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.ui import WebDriverWait
+from playwright.async_api import async_playwright
 import dagster as dg
-asset = partial(dg.asset, key_prefix=APP)
+TAGS = {"app": APP}
+asset = partial(dg.asset, key_prefix=APP, tags=TAGS)
-@asset(
-    io_manager_key="html_io_manager",
-    name="raw",
-    tags={"dagster/image": "dagster-code-stocks:playwright"},
-)
-def raw_html(context: dg.AssetExecutionContext) -> None:
-    # Start a headless browser
-    options = webdriver.ChromeOptions()
-    options.add_argument("--headless")
-    driver = webdriver.Chrome(options=options)
-    driver.set_window_size(1000, 2000)
+async def main() -> str:
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        context = await browser.new_context(viewport={"width": 1000, "height": 2000})
+        page = await context.new_page()
-    # Load the page
-    driver.get(URL)
+        await page.goto(URL, timeout=60000)
-    # Wait until at least one toggle button is present
-    wait = WebDriverWait(driver, 20)
-    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "toggle-btn")))
+        # Wait until at least one toggle button is present
+        await page.wait_for_selector(".toggle-btn", timeout=20000)
-    driver.execute_script("document.body.style.zoom='50%'")
+        # Set zoom
+        await page.evaluate("document.body.style.zoom='50%'")
-    # Find all toggle buttons (wait ensures DOM is ready)
-    toggle_buttons = driver.find_elements(By.CLASS_NAME, "toggle-btn")
-    context.log.info(f"Found {len(toggle_buttons)} toggle buttons")
+        # Find all toggle buttons
+        toggle_buttons = await page.query_selector_all(".toggle-btn")
+        print(f"Found {len(toggle_buttons)} toggle buttons")
-    # Click each toggle button if it's visible and enabled
-    for i, btn in enumerate(toggle_buttons):
-        try:
-            # Wait until clickable and click
-            WebDriverWait(driver, 5).until(EC.element_to_be_clickable(btn))
-            btn.click()
-            time.sleep(1)  # short pause for content to load
+        for i, btn in enumerate(toggle_buttons):
+            try:
+                # Ensure it's visible and enabled
+                if await btn.is_visible() and await btn.is_enabled():
+                    await btn.click()
+                    await page.wait_for_timeout(1000)
-            if i == len(toggle_buttons) - 1:
-                break
+                if i == len(toggle_buttons) - 1:
+                    break
-            scroll_step = 500
-            total_height = driver.execute_script("return document.body.scrollHeight")
-            current_position = 0
+                # Scroll down gradually
+                scroll_step = 500
+                total_height = await page.evaluate("() => document.body.scrollHeight")
+                current_position = 0
-            while current_position < total_height:
-                driver.execute_script(f"window.scrollTo(0, {current_position});")
-                time.sleep(0.1)
-                current_position += scroll_step
-                total_height = driver.execute_script(
-                    "return document.body.scrollHeight"
-                )
+                while current_position < total_height:
+                    await page.evaluate(f"window.scrollTo(0, {current_position});")
+                    await page.wait_for_timeout(100)
+                    current_position += scroll_step
+                    total_height = await page.evaluate(
+                        "() => document.body.scrollHeight"
+                    )
-        except Exception as e:
-            context.log.info(f"Skipped button due to error: {e}")
+            except Exception as e:
+                print(f"Skipped button due to error: {e}")
-    # Continue with scraping after all sections are expanded...
-    html = driver.page_source
+        # Get the page content
+        page_source = await page.content()
-    # Close browser when done
-    driver.quit()
+        # Close the browser
+        await browser.close()
-    return html
+        # Continue scraping logic here...
+        print("Scraping done")
+        # Save the page content to a file
+        with open("scraped_page.html", "w") as f:
+            f.write(page_source)
+        return page_source
+@asset(io_manager_key="html_io_manager", name="raw")
+def raw_html() -> str:
+    return asyncio.run(main())
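
The new layout separates browser automation (async main()) from the Dagster asset, which stays synchronous and simply drives the coroutine with asyncio.run(). A minimal sketch of that bridge, with illustrative names not taken from the commit:

    import asyncio
    from playwright.async_api import async_playwright

    async def fetch_html(url: str) -> str:
        # All browser work happens on Playwright's async API.
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()
            await page.goto(url, timeout=60000)
            html = await page.content()
            await browser.close()
            return html

    def fetch_html_sync(url: str) -> str:
        # Dagster assets are plain callables, so run the coroutine at the edge.
        return asyncio.run(fetch_html(url))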

config.py

@@ -2,3 +2,4 @@ import os
 from pathlib import Path
 APP = os.environ.get("APP", Path(__file__).parent.parent.name)
+URL = "https://www.hellostocks.ai/superinvestor/strategies"

definitions.py

@@ -4,6 +4,7 @@ import assets
 import sensors
 from dagster_polars import PolarsParquetIOManager
 from icecream import install
+from resources import HtmlIOManager
 import dagster as dg
 from dagster import load_assets_from_modules
@@ -12,15 +13,17 @@ install()
 APP = os.environ["APP"]
+storage_dir = os.environ.get("STORAGE_DIR", "/storage")
 definitions = dg.Definitions(
     assets=[
         asset.with_attributes(
             group_names_by_key={asset.key: APP},
             tags_by_key={asset.key: {"app": APP}},
         )
         for asset in load_assets_from_modules([assets])
     ],
     resources={
+        "html_io_manager": HtmlIOManager(base_dir=storage_dir),
         "polars_parquet_io_manager": PolarsParquetIOManager(
             base_dir=os.environ.get("STORAGE_DIR", "/storage") + f"/{APP}"
         ),
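
The HtmlIOManager wired in here lives in resources.py, which is not part of this diff. Assuming it follows dagster's ConfigurableIOManager pattern, a hypothetical sketch (names and paths are guesses, not the committed code):

    import os
    import dagster as dg

    class HtmlIOManager(dg.ConfigurableIOManager):
        # Hypothetical reconstruction; the real resources.py is not shown.
        base_dir: str

        def _path(self, context) -> str:
            # One .html file per asset key under base_dir.
            return os.path.join(self.base_dir, *context.asset_key.path) + ".html"

        def handle_output(self, context: dg.OutputContext, html: str) -> None:
            os.makedirs(os.path.dirname(self._path(context)), exist_ok=True)
            with open(self._path(context), "w") as f:
                f.write(html)

        def load_input(self, context: dg.InputContext) -> str:
            with open(self._path(context)) as f:
                return f.read()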

sensors.py

@@ -1,7 +1,11 @@
+import re
 from collections.abc import Iterator
-from datetime import date
+from datetime import datetime
 import jobs
+import requests
+from bs4 import BeautifulSoup
+from config import URL
 import dagster as dg
@@ -10,6 +14,34 @@ import dagster as dg
 def check_update(context: dg.SensorEvaluationContext) -> Iterator[dg.RunRequest]:
     ic(context.cursor)
-    yield dg.RunRequest()
+    response = requests.get(URL)
+    response.raise_for_status()
-    context.update_cursor(date.today().strftime("%Y-%m-%d"))
+    try:
+        # Parse with BeautifulSoup
+        soup = BeautifulSoup(response.text, "html.parser")
+        # Find the first <div> after </header>
+        if (header := soup.find("header")) and (div := header.find_next_sibling("div")):
+            # Extract date part using regex
+            match = re.search(
+                r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})", div.text
+            )
+            if match:
+                day, _, month, year = match.groups()
+                date = datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
+                date_str = date.strftime("%Y-%m-%d")
+                context.log.info(f"Found date: {date_str}")
+                if date_str > (context.cursor or ""):
+                    context.update_cursor(date_str)
+                    yield dg.RunRequest()
+                return
+    except Exception as e:
+        context.log.error(f"Parsing error: {e}")
+    now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    file = f"{now_str} stocks.html"
+    context.log.info(f"Saving file: {file}")
+    with open(f"/cache/{file}", "w") as fp:
+        fp.write(response.text)
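
The sensor's regex converts the page's human-readable date into a sortable cursor string. A quick demonstration with a made-up snippet (the real page markup is not shown in this diff):

    import re
    from datetime import datetime

    text = "Last updated: 29th July 2025"  # hypothetical sample
    match = re.search(r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})", text)
    if match:
        day, _, month, year = match.groups()
        parsed = datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
        print(parsed.strftime("%Y-%m-%d"))  # 2025-07-29

Because "2025-07-29" sorts lexicographically in date order, a plain string comparison against the stored cursor is enough to detect a newer page.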