use playwright to scrape stocks page

2025-07-29 11:31:26 +02:00
parent 8aa943b0bf
commit 586a4ae904
4 changed files with 94 additions and 56 deletions

assets.py

@@ -1,72 +1,74 @@
-import time
+import asyncio
 from functools import partial
 from config import APP, URL
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.ui import WebDriverWait
+from playwright.async_api import async_playwright
 import dagster as dg
 
-asset = partial(dg.asset, key_prefix=APP)
+TAGS = {"app": APP}
+asset = partial(dg.asset, key_prefix=APP, tags=TAGS)
 
-@asset(
-    io_manager_key="html_io_manager",
-    name="raw",
-    tags={"dagster/image": "dagster-code-stocks:playwright"},
-)
-def raw_html(context: dg.AssetExecutionContext) -> None:
-    # Start a headless browser
-    options = webdriver.ChromeOptions()
-    options.add_argument("--headless")
-    driver = webdriver.Chrome(options=options)
-    driver.set_window_size(1000, 2000)
-    # Load the page
-    driver.get(URL)
-    # Wait until at least one toggle button is present
-    wait = WebDriverWait(driver, 20)
-    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "toggle-btn")))
-    driver.execute_script("document.body.style.zoom='50%'")
-    # Find all toggle buttons (wait ensures DOM is ready)
-    toggle_buttons = driver.find_elements(By.CLASS_NAME, "toggle-btn")
-    context.log.info(f"Found {len(toggle_buttons)} toggle buttons")
-    # Click each toggle button if it's visible and enabled
-    for i, btn in enumerate(toggle_buttons):
-        try:
-            # Wait until clickable and click
-            WebDriverWait(driver, 5).until(EC.element_to_be_clickable(btn))
-            btn.click()
-            time.sleep(1)  # short pause for content to load
-            if i == len(toggle_buttons) - 1:
-                break
-            scroll_step = 500
-            total_height = driver.execute_script("return document.body.scrollHeight")
-            current_position = 0
-            while current_position < total_height:
-                driver.execute_script(f"window.scrollTo(0, {current_position});")
-                time.sleep(0.1)
-                current_position += scroll_step
-                total_height = driver.execute_script(
-                    "return document.body.scrollHeight"
-                )
-        except Exception as e:
-            context.log.info(f"Skipped button due to error: {e}")
-    # Continue with scraping after all sections are expanded...
-    html = driver.page_source
-    # Close browser when done
-    driver.quit()
-    return html
+async def main() -> str:
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        context = await browser.new_context(viewport={"width": 1000, "height": 2000})
+        page = await context.new_page()
+        await page.goto(URL, timeout=60000)
+        # Wait until at least one toggle button is present
+        await page.wait_for_selector(".toggle-btn", timeout=20000)
+        # Set zoom
+        await page.evaluate("document.body.style.zoom='50%'")
+        # Find all toggle buttons
+        toggle_buttons = await page.query_selector_all(".toggle-btn")
+        print(f"Found {len(toggle_buttons)} toggle buttons")
+        for i, btn in enumerate(toggle_buttons):
+            try:
+                # Ensure it's visible and enabled
+                if await btn.is_visible() and await btn.is_enabled():
+                    await btn.click()
+                    await page.wait_for_timeout(1000)
+                if i == len(toggle_buttons) - 1:
+                    break
+                # Scroll down gradually
+                scroll_step = 500
+                total_height = await page.evaluate("() => document.body.scrollHeight")
+                current_position = 0
+                while current_position < total_height:
+                    await page.evaluate(f"window.scrollTo(0, {current_position});")
+                    await page.wait_for_timeout(100)
+                    current_position += scroll_step
+                    total_height = await page.evaluate(
+                        "() => document.body.scrollHeight"
+                    )
+            except Exception as e:
+                print(f"Skipped button due to error: {e}")
+        # Get the page content
+        page_source = await page.content()
+        # Close the browser
+        await browser.close()
+        # Continue scraping logic here...
+        print("Scraping done")
+        # Save the page content to a file
+        with open("scraped_page.html", "w") as f:
+            f.write(page_source)
+        return page_source
+
+
+@asset(io_manager_key="html_io_manager", name="raw")
+def raw_html() -> str:
+    return asyncio.run(main())

config.py

@@ -2,3 +2,4 @@ import os
 from pathlib import Path
 
 APP = os.environ.get("APP", Path(__file__).parent.parent.name)
+URL = "https://www.hellostocks.ai/superinvestor/strategies"

definitions.py

@@ -4,6 +4,7 @@ import assets
 import sensors
 from dagster_polars import PolarsParquetIOManager
 from icecream import install
+from resources import HtmlIOManager
 
 import dagster as dg
 from dagster import load_assets_from_modules
@@ -12,15 +13,17 @@ install()
 APP = os.environ["APP"]
+storage_dir = os.environ.get("STORAGE_DIR", "/storage")
 
 definitions = dg.Definitions(
     assets=[
         asset.with_attributes(
             group_names_by_key={asset.key: APP},
-            tags_by_key={asset.key: {"app": APP}},
         )
         for asset in load_assets_from_modules([assets])
     ],
     resources={
+        "html_io_manager": HtmlIOManager(base_dir=storage_dir),
         "polars_parquet_io_manager": PolarsParquetIOManager(
             base_dir=os.environ.get("STORAGE_DIR", "/storage") + f"/{APP}"
         ),
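
HtmlIOManager is imported from resources, but its implementation is not part of this diff. A minimal sketch of what such an IO manager could look like, assuming a ConfigurableIOManager that stores string outputs as .html files under base_dir:

from pathlib import Path

import dagster as dg


class HtmlIOManager(dg.ConfigurableIOManager):
    # Hypothetical implementation; the real resources.HtmlIOManager is not shown in this commit.
    base_dir: str

    def _path(self, context) -> Path:
        # e.g. /storage/stocks/raw.html for the asset key [APP, "raw"]
        return Path(self.base_dir, *context.asset_key.path).with_suffix(".html")

    def handle_output(self, context: dg.OutputContext, obj: str) -> None:
        path = self._path(context)
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(obj)

    def load_input(self, context: dg.InputContext) -> str:
        return self._path(context).read_text()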

sensors.py

@@ -1,7 +1,11 @@
+import re
 from collections.abc import Iterator
-from datetime import date
+from datetime import datetime
 
 import jobs
+import requests
+from bs4 import BeautifulSoup
+from config import URL
 
 import dagster as dg
@@ -10,6 +14,34 @@ import dagster as dg
 def check_update(context: dg.SensorEvaluationContext) -> Iterator[dg.RunRequest]:
     ic(context.cursor)
-    yield dg.RunRequest()
-    context.update_cursor(date.today().strftime("%Y-%m-%d"))
+    response = requests.get(URL)
+    response.raise_for_status()
+    try:
+        # Parse with BeautifulSoup
+        soup = BeautifulSoup(response.text, "html.parser")
+        # Find the first <div> after </header>
+        if (header := soup.find("header")) and (div := header.find_next_sibling("div")):
+            # Extract date part using regex
+            match = re.search(
+                r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})", div.text
+            )
+            if match:
+                day, _, month, year = match.groups()
+                date = datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
+                date_str = date.strftime("%Y-%m-%d")
+                context.log.info(f"Found date: {date_str}")
+                if date_str > (context.cursor or ""):
+                    context.update_cursor(date_str)
+                    yield dg.RunRequest()
+                return
+    except Exception as e:
+        context.log.error(f"Parsing error: {e}")
+    now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    file = f"{now_str} stocks.html"
+    context.log.info(f"Saving file: {file}")
+    with open(f"/cache/{file}", "w") as fp:
+        fp.write(response.text)
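
The sensor's date extraction is easy to verify in isolation. A worked example of the regex and strptime call above, using a hypothetical snippet of div.text:

import re
from datetime import datetime

text = "Portfolio updated 29th July 2025"  # hypothetical page snippet
match = re.search(r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})", text)
assert match is not None
day, _, month, year = match.groups()  # ("29", "th", "July", "2025")
date = datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
print(date.strftime("%Y-%m-%d"))  # 2025-07-29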