www.mext.go.jp
第3章 情報とデータサイエンス 前半
大量のデータの収集と整理・整形
演習2
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Scrape the MEXT news index page and collect one record per headline link.
url = "https://www.mext.go.jp/b_menu/news/index.html"

# Always pass a timeout: without one, requests can wait forever when the
# server stops responding.
r = requests.get(url, timeout=30)
r.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page

soup = BeautifulSoup(r.content, "html.parser")

# Each record is a dict with the keys "date", "title" and "url".
data = []
for news in soup.select("ul.news_list"):
    # The heading text of the closest preceding <h3> sibling is stored as the
    # record's "date". NOTE: raises AttributeError if a list has no such <h3>.
    date = news.find_previous_sibling("h3").get_text(strip=True)
    for title in news.select("li > span.link > a"):
        d = {
            "date": date,
            "title": title.get_text(strip=True),
            # Resolve the link's href against the index page URL.
            "url": urljoin(url, title.get("href")),
        }
        data.append(d)
import pandas as pd
# Load the scraped records into a table and turn the Japanese-era date
# strings in the "date" column into datetime values.
df = pd.DataFrame(data)

# The named groups become the extracted column names:
# gen (era name), nen (year within the era), month, day.
wareki = r"(?P<gen>令和)(?P<nen>\d{1,2})年(?P<month>\d{1,2})月(?P<day>\d{1,2})日$"

df_date = (
    df["date"]
    .str.extract(wareki, expand=True)
    # Swap the era name for its numeric offset (2018 for 令和).
    .assign(gen=lambda parts: parts["gen"].map({"令和": 2018}))
    # The remaining captures are digit strings; make every column an integer.
    .astype(int)
    # Western calendar year = era offset + year within the era.
    .assign(year=lambda parts: parts["gen"] + parts["nen"])
)

# Build the datetimes from the year/month/day columns.
df["date"] = pd.to_datetime(df_date[["year", "month", "day"]])

# Bare expression: shows the table when evaluated interactively
# (e.g. as the last line of a notebook cell).
df