兵庫県の新型コロナウイルス感染症の県内検査状況をスクレイピング

github.com

PDFのスクレイピングのプログラムすごいな

めちゃくちゃ勉強になった

グラフ表示用のJavaScriptのデータからスクレイピング

import datetime
import json
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup


def dumps_json(file_name, json_data):
    """Write *json_data* to *file_name* as indented UTF-8 JSON.

    Args:
        file_name: Path of the file to create or overwrite.
        json_data: Any object ``json.dump`` can serialize.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If *json_data* contains a non-serializable object.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding must be pinned; otherwise the platform default is used and the
    # output is not guaranteed to be UTF-8.
    with open(file_name, "w", encoding="utf-8") as fw:
        json.dump(json_data, fw, ensure_ascii=False, indent=4)


# Hyogo prefecture page that is scraped below.
url = "https://web.pref.hyogo.lg.jp/kf16/singatakoronakensa.html"

# Browser-like User-Agent (an IE 11 on Windows 10 string).
# NOTE(review): presumably needed because the site rejects the default
# python-requests agent -- confirm before removing.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

# requests never times out by default; bound the wait so a stalled server
# cannot hang the script forever.
r = requests.get(url, headers=headers, timeout=30)

# Raise requests.HTTPError on a 4xx/5xx response instead of parsing an error page.
r.raise_for_status()

# Parse the raw bytes so BeautifulSoup performs the encoding detection itself.
soup = BeautifulSoup(r.content, "html.parser")

# Scraping

# Locate the inline <script> whose source contains "var dataset".
# `string=` is the current name of this filter; `text=` is deprecated.
tag = soup.find("script", string=re.compile("var dataset"))

if tag is None:
    raise ValueError("no <script> element containing 'var dataset' was found")

# The update note is taken from the first <center> element after that script.
caption = tag.find_next("center")

if caption is None:
    raise ValueError("no <center> element follows the dataset <script>")

t = caption.get_text(strip=True)

# Last update time

# Expected form: "※<month>/<day> <hour>時" at the start of the text.
mt = re.match(r"※(\d+)/(\d+)\s(\d+)時", t)

if mt is None:
    raise ValueError(f"unexpected last-update text: {t!r}")

m, d, h = map(int, mt.groups())

# NOTE(review): the page text carries no year, so 2020 is hard-coded here;
# this needs revisiting if the page is scraped in a later year.
last_update = datetime.datetime(2020, m, d, h, 0)

# Data

# JavaScript source of the <script> element located earlier.
s = tag.get_text(strip=True)

# Capture everything between "var dataset=" and the first ";"
# (non-greedy; DOTALL lets the literal span several lines).
ms = re.match(r"var dataset=(.*?);", s, re.DOTALL)

if ms:

    dataset = json.loads(ms.group(1))

    df = pd.DataFrame(
        dataset,
        columns=["labels", "検査検体数", "検査累計", "陽性確認", "陽性累計"],
    )

    # The labels are prefixed with the year 2020 before being parsed as dates.
    df["date"] = pd.to_datetime("2020/" + df["labels"], format="%Y/%m/%d")

    # Date renderings required by the patients_summary and inspections outputs.
    df["日付"] = df["date"].dt.strftime("%Y-%m-%dT%H:%M:%S+09:00")
    df["判明日"] = df["date"].dt.strftime("%d/%m/%Y")

    # The same timestamp string is written into all three files.
    last_update_text = last_update.strftime("%Y/%m/%d %H:%M")

    # inspections_summary
    #   + date label (e.g. 2/01)
    #   + number of specimens tested
    #   + number of positives
    # Stored column-wise (one list per column).
    df1 = df[["labels", "検査検体数", "陽性確認"]].copy()

    inspections_summary = {
        "data": df1.to_dict(orient="list"),
        "last_update": last_update_text,
    }

    dumps_json("inspections_summary.json", inspections_summary)

    # patients_summary
    #   + date (e.g. 2020-02-01T00:00:00+09:00)
    #   + number of positives, stored under the key "小計"
    # Stored row-wise (one dict per row).
    df2 = df[["日付", "陽性確認"]].rename(columns={"陽性確認": "小計"})

    patients_summary = {
        "data": df2.to_dict(orient="records"),
        "last_update": last_update_text,
    }

    dumps_json("patients_summary.json", patients_summary)

    # inspections
    #   + date (e.g. 31/01/2020)
    #   + number of specimens tested
    #   + number of positives
    # Stored row-wise (one dict per row).
    df3 = df[["判明日", "検査検体数", "陽性確認"]].copy()

    inspections = {
        "data": df3.to_dict(orient="records"),
        "last_update": last_update_text,
    }

    dumps_json("inspections.json", inspections)
import pandas as pd

# Load the PCR workbook straight from the prefecture site, using its first
# column as the index.
pcr_url = "https://web.pref.hyogo.lg.jp/kf16/documents/pcr.xlsx"
df_pcr = pd.read_excel(pcr_url, index_col=0)

# Replace missing cells with 0 so the integer cast below does not hit NaN.
df_pcr = df_pcr.fillna(0)

df_pcr = df_pcr.astype({"PCR検査件数": "int64"})

# Bare expressions kept for notebook display: the table, then per-column totals.
df_pcr

df_pcr.sum()