initial commit
This commit is contained in:
84
src/app/vinyl/sounds/fetch.py
Normal file
84
src/app/vinyl/sounds/fetch.py
Normal file
@@ -0,0 +1,84 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def get_page_count(html_content):
    """Return the highest page number shown in the pagination bar.

    The text of every link matched by ``ul.pagination li a`` is inspected;
    links whose text is not purely digits are ignored.

    Args:
        html_content: HTML markup of a listing page.

    Returns:
        The largest numeric link label as an ``int``. When the markup has no
        numeric pagination link, ``1`` is returned so that a listing without
        a pagination bar is treated as a single page instead of raising
        ``ValueError`` from ``max()`` on an empty sequence.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    page_numbers = []
    for link in soup.select("ul.pagination li a"):
        # Strip surrounding whitespace so labels such as " 3 " still count.
        label = link.get_text(strip=True)
        if label.isdigit():
            page_numbers.append(int(label))

    return max(page_numbers, default=1)
|
||||
|
||||
|
||||
def parse_page(html_content):
    """Build a DataFrame of the products found in one listing page.

    Args:
        html_content: HTML markup of a listing page.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``name`` and
        ``price``, built from three independently collected lists:

        * ``name``: stripped text of every ``h5`` tag containing ``" -"``.
        * ``id``: first ``rel`` value of every ``a`` tag that has ``rel``.
        * ``price``: second whitespace-separated token of every
          ``span.product-price`` text, converted to ``float``.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Keep only headings that look like "artist - album"; other h5 texts
    # ('Telefoon', 'E-mail', 'Facebook') contain no " -" and are dropped.
    heading_texts = (heading.get_text(strip=True) for heading in soup.find_all("h5"))
    names = [text for text in heading_texts if " -" in text]

    # The numerical id is the first value of the anchor's rel attribute.
    ids = [anchor["rel"][0] for anchor in soup.find_all("a", rel=True)]

    # The price is the second token of the price label text.
    prices = []
    for price_span in soup.find_all("span", class_="product-price"):
        tokens = price_span.get_text(strip=True).split()
        prices.append(float(tokens[1]))

    return pd.DataFrame({"id": ids, "name": names, "price": prices})
|
||||
|
||||
|
||||
def fetch_deals():
    """Download every sale listing page and return all deals in one DataFrame.

    The first listing page is requested to determine the number of pages,
    then pages ``1`` through ``page_count`` (inclusive) are fetched and
    parsed, pausing between requests.

    Returns:
        The concatenation of the per-page DataFrames produced by
        ``parse_page``, or an empty DataFrame with the columns ``id``,
        ``name`` and ``price`` when no page was fetched.

    Raises:
        requests.exceptions.RequestException: if a request fails or does not
            answer within the timeout.
    """
    # Without a timeout a stalled connection would block the script forever.
    request_timeout = 30

    # Get page count
    first_page = requests.get(
        "https://www.sounds.nl/uitverkoop/1/lp/all/art", timeout=request_timeout
    )
    page_count = get_page_count(first_page.text)
    time.sleep(1)
    print(f"Number of pages: {page_count}")

    # Parse all pages
    # NOTE(review): this URL lacks the "/art" suffix used for the page-count
    # request above -- confirm both address the same listing.
    base_url = "https://www.sounds.nl/uitverkoop/{page_number}/lp/all"
    dfs = []
    # Pages are numbered from 1 (the page-count request uses page 1), so
    # iterate 1..page_count; range(page_count) would request page 0 and
    # never reach the last page.
    for page_number in tqdm(range(1, page_count + 1)):
        response = requests.get(
            base_url.format(page_number=page_number), timeout=request_timeout
        )
        dfs.append(parse_page(response.text))
        # Pause between requests.
        time.sleep(2)

    # Combine dfs
    return pd.concat(dfs) if dfs else pd.DataFrame(columns=["id", "name", "price"])
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: fetch all deals, preview the lowest-priced rows
    # and store the full table in a timestamped CSV file.
    deals = fetch_deals()
    print(f"Found {len(deals)} deals")

    # Show current deals (ten lowest prices)
    cheapest = deals.sort_values(by="price").head(10)
    print(cheapest)

    # Write to file; the name starts with the current local date and time
    timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
    output_dir = "/home/bram/src/python"
    filepath = f"{output_dir}/{timestamp}_sounds.csv"
    print(f"Writing data to {filepath}")
    deals.to_csv(filepath)
|
||||
Reference in New Issue
Block a user