#!/usr/bin/python3
"""Scrape discounted LPs from the sounds.nl clearance pages and save them to CSV."""

import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm


def get_page_count(html_content):
    """Return the highest page number found in the pagination links."""
    soup = BeautifulSoup(html_content, "html.parser")

    # Find all pagination links
    page_links = soup.select("ul.pagination li a")

    # Extract the numbers from the link texts and convert them to integers
    page_numbers = [
        int(link.get_text()) for link in page_links if link.get_text().isdigit()
    ]

    # Assume a single page when no numeric pagination links are found
    return max(page_numbers, default=1)
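

# Illustrative only: a hand-written pagination snippet that mirrors the
# "ul.pagination li a" selector used in get_page_count. The real sounds.nl
# markup may differ; this is an assumption, not a copy of the live page.
_SAMPLE_PAGINATION_HTML = """
<ul class="pagination">
  <li><a href="#">1</a></li>
  <li><a href="#">2</a></li>
  <li><a href="#">3</a></li>
  <li><a href="#">&raquo;</a></li>
</ul>
"""
# For this snippet, get_page_count(_SAMPLE_PAGINATION_HTML) would return 3;
# the "&raquo;" link is skipped because its text is not a digit.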


def parse_page(html_content):
    """Parse one result page and return its products as a DataFrame."""
    entries = []
    soup = BeautifulSoup(html_content, "html.parser")
    for product in soup.find_all("div", {"class": "search-product"}):
        # The product id is stored in the rel attribute of the product link
        item_id = product.find("a", rel=True)["rel"][0]
        name = product.find("h5").text.strip()

        # Names look like "Artist - Title"; split on the first dash only so
        # dashes inside the title survive, and tolerate names without a dash.
        artist, _, title = (part.strip() for part in name.partition("-"))

        # Price is shown like "€ 9,95"; keep it as text with the euro sign removed
        price = (
            product.find("span", class_="product-price")
            .text.strip()
            .replace("€", "")
            .strip()
        )

        entry = {
            "id": item_id,
            "name": name,
            "artist": artist,
            "title": title,
            "price": price,
        }

        # Optional fields that not every product carries
        if detail := product.find("h6", {"class": "hide-for-small"}):
            entry["detail"] = detail.text
        if supply := product.find("div", {"class": "product-voorraad"}):
            entry["supply"] = supply.text

        # Genre and release date are listed as "Label: value" info rows
        for info in product.find_all("div", {"class": "product-info"}):
            info = info.text.split(":")
            if "Genre" in info[0]:
                entry["genre"] = info[1].strip()
            if "Releasedatum" in info[0]:
                entry["release"] = info[1].strip()
        entries.append(entry)

    return pd.DataFrame(entries).reindex(
        columns=[
            "id",
            "name",
            "artist",
            "title",
            "price",
            "supply",
            "release",
            "genre",
            "detail",
        ]
    )
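

# Illustrative only: a hand-written product block that mirrors the selectors
# parse_page relies on. The real sounds.nl markup may differ; every value here
# is made up for documentation purposes.
_SAMPLE_PRODUCT_HTML = """
<div class="search-product">
  <a rel="12345" href="#">cover</a>
  <h5>Some Artist - Some Title</h5>
  <h6 class="hide-for-small">Limited edition</h6>
  <span class="product-price">&euro; 9,95</span>
  <div class="product-voorraad">Op voorraad</div>
  <div class="product-info">Genre: Rock</div>
  <div class="product-info">Releasedatum: 01-01-2024</div>
</div>
"""
# For this snippet, parse_page(_SAMPLE_PRODUCT_HTML) would return a one-row
# DataFrame with id "12345", artist "Some Artist", title "Some Title" and
# price "9,95".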


def fetch_deals():
    """Fetch every clearance page and return all deals as a single DataFrame."""
    # Get the page count from the first page
    page_count = get_page_count(
        requests.get("https://www.sounds.nl/uitverkoop/1/lp/all/art").text
    )
    time.sleep(1)
    print(f"Number of pages: {page_count}")

    # Parse all pages; page numbers on the site start at 1
    base_url = "https://www.sounds.nl/uitverkoop/{page_number}/lp/all"
    dfs = []
    for i in tqdm(range(1, page_count + 1)):
        df = parse_page(requests.get(base_url.format(page_number=i)).text)
        dfs.append(df)
        # Be polite and pause between requests
        time.sleep(2)

    # Combine the per-page frames
    return pd.concat(dfs) if dfs else pd.DataFrame(columns=["id", "name", "price"])


if __name__ == "__main__":
    df = fetch_deals()
    print(f"Found {len(df)} deals")

    # Show the ten cheapest deals. Prices are scraped as text such as "9,95",
    # so sort on a numeric view rather than lexicographically.
    print(
        df.sort_values(
            by="price",
            key=lambda s: pd.to_numeric(
                s.str.replace(",", ".", regex=False), errors="coerce"
            ),
        ).head(10)
    )

    # Write to file
    now = datetime.now()
    prefix = now.strftime("%Y-%m-%d_%H:%M:%S")
    directory = "/home/bram/src/python"
    filepath = f"{directory}/{prefix}_sounds.csv"
    print(f"Writing data to {filepath}")
    df.to_csv(filepath)