Use Playwright to scrape stocks page
@@ -1,72 +1,74 @@
-import time
+import asyncio
 from functools import partial

 from config import APP, URL
-from selenium import webdriver
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.webdriver.support.ui import WebDriverWait
+from playwright.async_api import async_playwright

 import dagster as dg

-asset = partial(dg.asset, key_prefix=APP)
+TAGS = {"app": APP}
+
+asset = partial(dg.asset, key_prefix=APP, tags=TAGS)


-@asset(
-    io_manager_key="html_io_manager",
-    name="raw",
-    tags={"dagster/image": "dagster-code-stocks:playwright"},
-)
-def raw_html(context: dg.AssetExecutionContext) -> None:
-    # Start a headless browser
-    options = webdriver.ChromeOptions()
-    options.add_argument("--headless")
-    driver = webdriver.Chrome(options=options)
-    driver.set_window_size(1000, 2000)
-
-    # Load the page
-    driver.get(URL)
-
-    # Wait until at least one toggle button is present
-    wait = WebDriverWait(driver, 20)
-    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "toggle-btn")))
-
-    driver.execute_script("document.body.style.zoom='50%'")
-
-    # Find all toggle buttons (wait ensures DOM is ready)
-    toggle_buttons = driver.find_elements(By.CLASS_NAME, "toggle-btn")
-    context.log.info(f"Found {len(toggle_buttons)} toggle buttons")
-
-    # Click each toggle button if it's visible and enabled
-    for i, btn in enumerate(toggle_buttons):
-        try:
-            # Wait until clickable and click
-            WebDriverWait(driver, 5).until(EC.element_to_be_clickable(btn))
-            btn.click()
-            time.sleep(1)  # short pause for content to load
-
-            if i == len(toggle_buttons) - 1:
-                break
-
-            scroll_step = 500
-            total_height = driver.execute_script("return document.body.scrollHeight")
-            current_position = 0
-
-            while current_position < total_height:
-                driver.execute_script(f"window.scrollTo(0, {current_position});")
-                time.sleep(0.1)
-                current_position += scroll_step
-                total_height = driver.execute_script(
-                    "return document.body.scrollHeight"
-                )
-
-        except Exception as e:
-            context.log.info(f"Skipped button due to error: {e}")
-
-    # Continue with scraping after all sections are expanded...
-    html = driver.page_source
-
-    # Close browser when done
-    driver.quit()
-
-    return html
+async def main() -> str:
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        context = await browser.new_context(viewport={"width": 1000, "height": 2000})
+        page = await context.new_page()
+
+        await page.goto(URL, timeout=60000)
+
+        # Wait until at least one toggle button is present
+        await page.wait_for_selector(".toggle-btn", timeout=20000)
+
+        # Set zoom
+        await page.evaluate("document.body.style.zoom='50%'")
+
+        # Find all toggle buttons
+        toggle_buttons = await page.query_selector_all(".toggle-btn")
+        print(f"Found {len(toggle_buttons)} toggle buttons")
+
+        for i, btn in enumerate(toggle_buttons):
+            try:
+                # Ensure it's visible and enabled
+                if await btn.is_visible() and await btn.is_enabled():
+                    await btn.click()
+                    await page.wait_for_timeout(1000)
+
+                if i == len(toggle_buttons) - 1:
+                    break
+
+                # Scroll down gradually
+                scroll_step = 500
+                total_height = await page.evaluate("() => document.body.scrollHeight")
+                current_position = 0
+
+                while current_position < total_height:
+                    await page.evaluate(f"window.scrollTo(0, {current_position});")
+                    await page.wait_for_timeout(100)
+                    current_position += scroll_step
+                    total_height = await page.evaluate(
+                        "() => document.body.scrollHeight"
+                    )
+
+            except Exception as e:
+                print(f"Skipped button due to error: {e}")
+
+        # Get the page content
+        page_source = await page.content()
+
+        # Close the browser
+        await browser.close()
+
+        # Continue scraping logic here...
+        print("Scraping done")
+
+        # Save the page content to a file
+        with open("scraped_page.html", "w") as f:
+            f.write(page_source)
+
+        return page_source
+
+
+@asset(io_manager_key="html_io_manager", name="raw")
+def raw_html() -> str:
+    return asyncio.run(main())
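The new asset drives Playwright's async API from a synchronous Dagster asset by delegating to asyncio.run. A minimal way to smoke-test it locally might look like the sketch below, assuming the HtmlIOManager resource imported in the definitions hunk further down; the /tmp/storage path is made up for the example.

import dagster as dg

from assets import raw_html
from resources import HtmlIOManager

if __name__ == "__main__":
    # Materialize just the scraping asset with a throwaway storage dir
    result = dg.materialize(
        [raw_html],
        resources={"html_io_manager": HtmlIOManager(base_dir="/tmp/storage")},
    )
    assert result.success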
@@ -2,3 +2,4 @@ import os
 from pathlib import Path

 APP = os.environ.get("APP", Path(__file__).parent.parent.name)
+URL = "https://www.hellostocks.ai/superinvestor/strategies"
@@ -4,6 +4,7 @@ import assets
 import sensors
 from dagster_polars import PolarsParquetIOManager
 from icecream import install
+from resources import HtmlIOManager

 import dagster as dg
 from dagster import load_assets_from_modules
@@ -12,15 +13,17 @@ install()

 APP = os.environ["APP"]

+storage_dir = os.environ.get("STORAGE_DIR", "/storage")
+
 definitions = dg.Definitions(
     assets=[
         asset.with_attributes(
             group_names_by_key={asset.key: APP},
-            tags_by_key={asset.key: {"app": APP}},
         )
         for asset in load_assets_from_modules([assets])
     ],
     resources={
+        "html_io_manager": HtmlIOManager(base_dir=storage_dir),
         "polars_parquet_io_manager": PolarsParquetIOManager(
             base_dir=os.environ.get("STORAGE_DIR", "/storage") + f"/{APP}"
         ),
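The diff imports HtmlIOManager from a resources module that is not part of this commit. A minimal sketch of what such an IO manager could look like, assuming it simply writes the asset's returned HTML string under base_dir (the path layout here is an assumption):

import os

import dagster as dg


class HtmlIOManager(dg.ConfigurableIOManager):
    base_dir: str

    def _path(self, context) -> str:
        # e.g. /storage/stocks/raw.html for asset key ["stocks", "raw"]
        return os.path.join(self.base_dir, *context.asset_key.path) + ".html"

    def handle_output(self, context: dg.OutputContext, obj: str) -> None:
        path = self._path(context)
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "w") as f:
            f.write(obj)

    def load_input(self, context: dg.InputContext) -> str:
        with open(self._path(context)) as f:
            return f.read()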
@@ -1,7 +1,11 @@
+import re
 from collections.abc import Iterator
-from datetime import date
+from datetime import datetime

 import jobs
+import requests
+from bs4 import BeautifulSoup
+from config import URL

 import dagster as dg

@@ -10,6 +14,34 @@ import dagster as dg
 def check_update(context: dg.SensorEvaluationContext) -> Iterator[dg.RunRequest]:
     ic(context.cursor)

-    yield dg.RunRequest()
+    response = requests.get(URL)
+    response.raise_for_status()

-    context.update_cursor(date.today().strftime("%Y-%m-%d"))
+    try:
+        # Parse with BeautifulSoup
+        soup = BeautifulSoup(response.text, "html.parser")
+
+        # Find the first <div> after </header>
+        if (header := soup.find("header")) and (div := header.find_next_sibling("div")):
+            # Extract date part using regex
+            match = re.search(
+                r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})", div.text
+            )
+            if match:
+                day, _, month, year = match.groups()
+                date = datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
+                date_str = date.strftime("%Y-%m-%d")
+                context.log.info(f"Found date: {date_str}")
+
+                if date_str > (context.cursor or ""):
+                    context.update_cursor(date_str)
+                    yield dg.RunRequest()
+                return
+    except Exception as e:
+        context.log.error(f"Parsing error: {e}")
+
+    now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    file = f"{now_str} stocks.html"
+    context.log.info(f"Saving file: {file}")
+    with open(f"/cache/{file}", "w") as fp:
+        fp.write(response.text)
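The sensor now re-parses the page on every tick and only yields a run when the published date moves past the cursor. A quick standalone check of the date-extraction regex on a made-up sample string (the "12th March 2025" wording is an assumption about the page text):

import re
from datetime import datetime

text = "Portfolio updated on 12th March 2025"
match = re.search(r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s+(\d{4})", text)
if match:
    day, _, month, year = match.groups()
    # "12 March 2025" -> "2025-03-12", the same format the cursor stores
    date = datetime.strptime(f"{day} {month} {year}", "%d %B %Y")
    print(date.strftime("%Y-%m-%d"))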