# Scrape Aichi Prefecture's cluster data and convert it to JSON

import datetime
import io
import json
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

def dumps_json(file_name, json_data):
    """Write *json_data* to *file_name* as indented JSON.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``), so
    the file is opened explicitly as UTF-8. Relying on the platform default
    encoding could produce a file in a different encoding, or raise
    ``UnicodeEncodeError``, depending on the machine running the script.

    Args:
        file_name: Path of the file to create or overwrite.
        json_data: Any object ``json.dump`` can serialize.
    """
    with open(file_name, "w", encoding="utf-8") as fw:
        json.dump(json_data, fw, ensure_ascii=False, indent=2)

# Aichi Prefecture COVID-19 page that is scraped below.
url = "https://www.pref.aichi.jp/site/covid19-aichi/kansensya-kensa.html"

# Send a browser-like User-Agent string with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# Scraping.
# requests applies no timeout by default, so pass one explicitly to keep an
# unresponsive server from hanging the script forever.
r = requests.get(url, headers=headers, timeout=30)

# Stop on HTTP error statuses instead of parsing an error page.
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")


# Locate the table through its summary attribute and fail with a clear
# message (rather than an AttributeError on None) if the page layout changed.
table = soup.find("table", summary="愛知県内の発生状況")
if table is None:
    raise ValueError('table with summary "愛知県内の発生状況" was not found')

caption_tag = table.find("caption")
if caption_tag is None:
    raise ValueError("the table has no caption to read the update time from")
caption = caption_tag.get_text(strip=True)

# Today (naive local time of the machine running the script)
dt_now = datetime.datetime.now()

# Convert the publication date to a datetime.
# The year is not read from the caption, so start from the current year.
y = dt_now.year
# Every run of one or two ASCII digits in the caption; exactly three runs are
# expected and are taken as month, day and hour, in that order.
m, d, h = map(int, re.findall(r"[0-9]{1,2}", caption))

# NOTE: adjust the time by hand if necessary
last_update = datetime.datetime(y, m, d, h, 0)

# A timestamp well in the future means the caption belongs to the previous
# year (e.g. a late-December caption scraped in early January). The one-day
# tolerance absorbs any offset between this machine's clock/timezone and the
# caption's timezone (presumably Japan local time -- confirm).
if last_update - dt_now > datetime.timedelta(days=1):
    last_update = datetime.datetime(y - 1, m, d, h, 0)


# Parse the table into a DataFrame and drop the "人数.1" column.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas releases.
df = pd.read_html(io.StringIO(table.prettify()))[0].drop("人数.1", axis=1)

# Discard rows that are empty in every column.
df.dropna(how="all", inplace=True)

# Strip the trailing "人" (people counter) from the count columns; missing
# values become 0 before converting to int.
for column in ("人数", "うち入院"):
    df[column] = df[column].str.rstrip("人").fillna(0).astype(int)

# The first column has no header in the parsed table; name it "クラスタ".
df.rename(columns={"Unnamed: 0": "クラスタ"}, inplace=True)

# Keep only rows whose label does not end with "計" (total).
df_cluster = df.loc[~df["クラスタ"].str.endswith("計"), :].copy()

# Remove the full-width parenthesised part of each label. regex=True is
# required: since pandas 2.0 str.replace treats the pattern as a literal
# string by default, which would leave the labels untouched.
df_cluster["クラスタ"] = df_cluster["クラスタ"].str.replace(
    "（.+）", "", regex=True
)

# Remove the leading "○" marker from each label.
df_cluster["クラスタ"] = df_cluster["クラスタ"].str.lstrip("○")

# Use the label as the index so to_dict() keys the values by label.
df_cluster.set_index("クラスタ", inplace=True)

# Per-column mapping of cluster label -> value, taken from the cleaned table.
cluster_data = df_cluster.to_dict(orient="dict")

# Update timestamp formatted as "YYYY/MM/DD HH:MM".
published_at = last_update.strftime("%Y/%m/%d %H:%M")

# Final payload: everything is nested under a single "cluster" key.
cluster = {"cluster": {"data": cluster_data, "date": published_at}}

# Write the payload to data_cluster.json.
dumps_json("data_cluster.json", cluster)