dagster/apps/vinyl/src/sounds/fetch.py

#!/usr/bin/python3

import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


def get_page_count(html_content):
    """Return the highest page number shown in a listing page's pagination.

    Args:
        html_content: HTML source of a listing page.

    Returns:
        The largest numeric label found among the ``ul.pagination li a``
        links, or 1 when the page has no numeric pagination links (a
        listing that fits on a single page has nothing to paginate).
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Non-numeric links (e.g. previous/next arrows) are ignored. The text is
    # stripped first so labels padded with whitespace still count.
    labels = (
        link.get_text(strip=True) for link in soup.select("ul.pagination li a")
    )
    page_numbers = [int(label) for label in labels if label.isdigit()]

    # default=1 avoids a ValueError from max() on an empty sequence.
    return max(page_numbers, default=1)


def parse_page(html_content):
    """Parse one listing page into a DataFrame with one row per product.

    Every ``div.search-product`` element on the page yields exactly one row.

    Args:
        html_content: HTML source of a listing page.

    Returns:
        A pandas DataFrame with the columns ``id``, ``name``, ``artist``,
        ``title``, ``price``, ``supply``, ``release``, ``genre`` and
        ``detail``. ``price`` is kept as text with the euro sign removed.
        Optional fields that are absent for a product are left as NaN. A
        page without products gives an empty frame with the same columns.

    Raises:
        TypeError: If a product has no ``<a>`` element carrying a ``rel``
            attribute.
        AttributeError: If a product lacks its ``<h5>`` heading or its
            ``span.product-price`` element.
    """
    entries = []
    soup = BeautifulSoup(html_content, "html.parser")
    for product in soup.find_all("div", {"class": "search-product"}):
        # BeautifulSoup exposes ``rel`` as a list of tokens; the first token
        # is used as the item id.
        item_id = product.find("a", rel=True)["rel"][0]
        name = product.find("h5").text.strip()

        # Split on the first hyphen only, so a title that itself contains a
        # hyphen is kept whole. A name without any hyphen becomes the artist
        # with an empty title instead of raising IndexError.
        artist, _, title = name.partition("-")

        price = (
            product.find("span", class_="product-price")
            .text.strip()
            .replace("€", "")
            .strip()
        )

        entry = {
            "id": item_id,
            "name": name,
            "artist": artist.strip(),
            "title": title.strip(),
            "price": price,
        }
        if detail := product.find("h6", {"class": "hide-for-small"}):
            entry["detail"] = detail.text
        if supply := product.find("div", {"class": "product-voorraad"}):
            entry["supply"] = supply.text

        for info in product.find_all("div", {"class": "product-info"}):
            label, separator, value = info.text.partition(":")
            if not separator:
                # Not a "label: value" line; nothing to extract.
                continue
            if "Genre" in label:
                entry["genre"] = value.strip()
            if "Releasedatum" in label:
                entry["release"] = value.strip()

        # Append once per product, after all optional fields are collected.
        # Appending inside the info loop would duplicate the row for every
        # info line and drop products that have none.
        entries.append(entry)

    return pd.DataFrame(entries).reindex(
        columns=[
            "id",
            "name",
            "artist",
            "title",
            "price",
            "supply",
            "release",
            "genre",
            "detail",
        ]
    )


def fetch_deals():
    """Download every page of the sounds.nl LP sale listing.

    The first listing page is fetched to read the page count from its
    pagination, then each page is fetched and parsed in turn, pausing
    between requests.

    Returns:
        A pandas DataFrame concatenating the per-page frames produced by
        ``parse_page``. If no page was fetched, an empty frame with the
        columns ``id``, ``name`` and ``price`` is returned.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    request_timeout = 30

    # Get page count
    first_page = requests.get(
        "https://www.sounds.nl/uitverkoop/1/lp/all/art", timeout=request_timeout
    )
    page_count = get_page_count(first_page.text)
    time.sleep(1)
    print(f"Number of pages: {page_count}")

    # Parse all pages. The count above is read from page 1 of the listing,
    # so pages are requested as 1..page_count inclusive; range(page_count)
    # would request page 0 and never reach the last page.
    base_url = "https://www.sounds.nl/uitverkoop/{page_number}/lp/all"
    dfs = []
    for page_number in tqdm(range(1, page_count + 1)):
        response = requests.get(
            base_url.format(page_number=page_number), timeout=request_timeout
        )
        dfs.append(parse_page(response.text))
        time.sleep(2)

    # Combine dfs
    return pd.concat(dfs) if dfs else pd.DataFrame(columns=["id", "name", "price"])


if __name__ == "__main__":
    # Scrape the full listing and report how many rows came back.
    df = fetch_deals()
    print(f"Found {len(df)} deals")

    # Preview the first ten rows when ordered by the "price" column.
    # NOTE(review): parse_page stores price as text, so this ordering looks
    # lexicographic rather than numeric - confirm whether that is intended.
    cheapest = df.sort_values(by="price").head(10)
    print(cheapest)

    # Persist the complete result to a CSV named after the current local time.
    directory = "/home/bram/src/python"
    prefix = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
    filepath = f"{directory}/{prefix}_sounds.csv"
    print(f"Writing data to {filepath}")
    df.to_csv(filepath)