import time
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
# Scrape every backer entry for one CAMPFIRE project, following the
# "next" pagination link until it disappears.
p_number = 12345  # CAMPFIRE project id
url = f"https://camp-fire.jp/projects/{p_number}/backers"
link = url
result = []  # one dict per backer entry

while True:
    print(link)
    # Timeout so a stalled connection cannot hang the script forever.
    r = requests.get(link, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")

    for item in soup.select("li.clearfix"):
        tag = item.select_one("div.body > ul.date > li > a")
        d = {
            "project": p_number,
            "name": tag.get_text(strip=True),
            "profile": tag.get("href"),
            # e.g. "3件" -> "3" (strip the trailing counter suffix)
            "count": item.select_one("div.body > ul.date > li > small > strong")
            .get_text(strip=True)
            .rstrip("件"),
            "datetime": item.select_one(
                "div.body > ul.date > li.rfloat > span.time"
            ).get_text(strip=True),
            "message": item.select_one("div.body > p.readmore").get_text(strip=True),
        }
        result.append(d)

    # rel="next" anchor inside the pagination widget; absent on the last page.
    next_page = soup.select_one('div.pagination > div.clearfix > span > a[rel="next"]')
    if next_page:
        link = urljoin(url, next_page.get("href"))
        time.sleep(1)  # be polite to the server between page fetches
    else:
        break
# Post-process the scraped entries into a DataFrame and export to CSV.
import pandas as pd

df = pd.DataFrame(result)
# Guard: an empty scrape produces a DataFrame with no "profile" column,
# which would raise KeyError below.
if not df.empty:
    # href looks like "/profile/<user_id>"; plain-substring removal,
    # regex=False made explicit (the pandas default changed across versions).
    df["user_id"] = df["profile"].str.replace("/profile/", "", regex=False)
    # Turn the relative profile path into an absolute URL.
    df["profile"] = df["profile"].str.replace(
        "^/", "https://camp-fire.jp/", regex=True
    )
# index=False: don't write the meaningless integer row index as a column.
df.to_csv(f"{p_number}.csv", index=False)