"""兵庫県の新型コロナウイルスに感染した患者の状況をデータラングリング

web.pref.hyogo.lg.jp
"""

import pathlib
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


def fetch_soup(url, parser="html.parser", timeout=30):
    """Fetch *url* and return the parsed HTML as a BeautifulSoup document.

    Args:
        url: Page URL to fetch.
        parser: BeautifulSoup parser backend (default: stdlib html.parser).
        timeout: Request timeout in seconds; without it requests.get can
            hang forever on an unresponsive server.

    Returns:
        BeautifulSoup: parsed document.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    return BeautifulSoup(r.content, parser)


def fetch_file(url, dir=".", timeout=30):
    """Download *url* into *dir* and return the local path (cached).

    If a file with the same basename already exists locally, the download
    is skipped and the existing path is returned.

    Args:
        url: File URL; its basename is reused as the local filename.
        dir: Destination directory, created if missing. NOTE: shadows the
            ``dir`` builtin, but the name is kept for backward compatibility.
        timeout: Request timeout in seconds (prevents an indefinite hang).

    Returns:
        pathlib.Path: path to the downloaded (or pre-existing) file.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    # Simple cache: only hit the network when the file is absent.
    if not p.exists():
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        p.write_bytes(r.content)

    return p


def excel2date(data):
    """Convert a Series mixing Excel serial numbers and date strings to datetimes.

    Purely numeric entries are treated as Excel serial day counts (epoch
    1899-12-30, the Windows Excel convention); everything else is parsed as a
    date string, with unparseable values coerced to NaT. The returned Series
    keeps the original index, so it aligns correctly on assignment.
    """
    looks_serial = data.astype("str").str.isdigit()

    parsed_strings = pd.to_datetime(data[~looks_serial], errors="coerce")
    parsed_serials = pd.to_datetime(
        data[looks_serial].astype(float), unit="D", origin=pd.Timestamp("1899/12/30")
    )

    return pd.concat([parsed_strings, parsed_serials])


# Column layouts for the two generations of the prefecture's Excel workbooks:
# headers[0] has a 渡航歴 (travel history) column, while headers[1] replaces it
# with three contact-history dummy columns (有/無/調査中).
headers = [
    ["番号", "発表日", "年代", "性別", "管轄", "居住地", "職業", "発症日", "渡航歴", "備考欄"],
    [
        "番号",
        "発表日",
        "年代",
        "性別",
        "管轄",
        "居住地",
        "職業",
        "発症日",
        "接触歴_有",
        "接触歴_無",
        "接触歴_調査中",
        "備考欄",
    ],
]

# Per-workbook download settings: source URL, rows to skip before the data
# starts, which header layout applies, and the Excel column range to read.
# The last entry is the "current" workbook; its URL gets overwritten below
# with the latest link scraped from the prefecture's page.
links = [
    {
        "url": "https://web.pref.hyogo.lg.jp/kk03/documents/corona-kanjajokyou1.xlsx",
        "skip": 5,
        "header": headers[0],
        "usecols": "B:K",
    },
    {
        "url": "https://web.pref.hyogo.lg.jp/kk03/documents/corona-kanjajokyou2.xlsx",
        "skip": 3,
        "header": headers[1],
        "usecols": "B:M",
    },
    {
        "url": "https://web.pref.hyogo.lg.jp/kk03/documents/corona-kanjajokyou3.xlsx",
        "skip": 5,
        "header": headers[1],
        "usecols": "B:M",
    },
    {
        "url": "https://web.pref.hyogo.lg.jp/kk03/documents/corona-kanjajokyou.xlsx",
        "skip": 5,
        "header": headers[1],
        "usecols": "B:M",
    },
]

# Scrape the latest workbook URL from the prefecture's situation page.
url = "https://web.pref.hyogo.lg.jp/kk03/corona_hasseijyokyo.html"

soup = fetch_soup(url)

# Find the Excel link whose text starts with the patient-status title.
tag = soup.find("a", class_="icon_excel", text=re.compile("^新型コロナウイルスに感染した患者の状況"))

# Overwrite the last (current) entry's URL with the freshly scraped link,
# resolving it against the page URL in case it is relative.
if tag:
    links[-1]["url"] = urljoin(url, tag.get("href"))

# Data wrangling: download each workbook and normalize it into a DataFrame.
dfs = []

for d in links:

    p = fetch_file(d["url"])

    # Rows with fewer than 3 non-NA cells are layout artifacts — drop them.
    df_tmp = pd.read_excel(
        p, skiprows=d["skip"], usecols=d["usecols"], header=None
    ).dropna(thresh=3)

    # set_axis(..., inplace=True) was removed in pandas 2.0 — reassign instead.
    df_tmp = df_tmp.set_axis(d["header"], axis=1)

    df_tmp.drop_duplicates(subset="番号", inplace=True)

    df_tmp.set_index("番号", inplace=True)

    # Non-date markers in 発症日 describe symptom status; copy them into 症状
    # (everything else becomes NaN via .where).
    df_tmp["症状"] = df_tmp["発症日"].where(
        df_tmp["発症日"].isin(["症状なし", "調査中", "非公表", "なし"])
    )

    # Normalize the mixed serial/string date columns to real datetimes.
    df_tmp["発表日YMD"] = excel2date(df_tmp["発表日"])
    df_tmp["発症日YMD"] = excel2date(df_tmp["発症日"])

    dfs.append(df_tmp)

# Concatenate all workbooks and sort by the patient-number index.
df = pd.concat(dfs).sort_index()

# utf_8_sig writes a BOM so Excel opens the CSV with correct encoding.
df.to_csv("data.csv", encoding="utf_8_sig")

df.info()

# Days from onset to announcement (negative = onset recorded after announcement).
df["発症日数"] = df["発表日YMD"] - df["発症日YMD"]

# If onset is more than 180 days in the FUTURE relative to the announcement,
# the year was almost certainly entered one too high: subtract one year.
# (Original referenced an undefined `df2` here — fixed to `df`.)
df["発症日YMD"] = df["発症日YMD"].mask(
    df["発症日数"] < pd.Timedelta(-180, unit="D"),
    df["発症日YMD"] - pd.offsets.DateOffset(years=1),
)

# If onset is more than 180 days in the PAST, the year was one too low: add one.
df["発症日YMD"] = df["発症日YMD"].mask(
    df["発症日数"] > pd.Timedelta(180, unit="D"),
    df["発症日YMD"] + pd.offsets.DateOffset(years=1),
)

# Recompute the interval after the year corrections.
df["発症日数"] = df["発表日YMD"] - df["発症日YMD"]

strdt = df["発表日YMD"].min()
enddt = df["発表日YMD"].max()

# Full calendar range so days with zero announcements still appear (as 0).
dt_range = pd.date_range(strdt, enddt)

# NOTE(review): the value_counts() expressions below compute summaries but
# never assign or print them — presumably leftovers from an interactive
# notebook session; confirm whether the results are needed.
df["発表日YMD"].value_counts().reindex(index=dt_range, fill_value=0)

df["年代"].astype(str).value_counts().sort_index()

df["性別"].value_counts().sort_index()

df["管轄"].value_counts().sort_index()

df["居住地"].value_counts().sort_index()

df["職業"].value_counts().sort_index()

# ダミー変数からカテゴリ変数に変換 (convert dummy variables to a categorical variable)

# Boolean matrix of the three one-hot contact-history columns.
df1 = df.loc[:, ["接触歴_有", "接触歴_無", "接触歴_調査中"]].copy().notnull()

# Sanity check: rows flagged in more than one column (result is discarded —
# presumably a notebook inspection; expected to be empty).
df1[df1.sum(axis=1) > 1]

# For rows with exactly one flag set, idxmax yields that column's name;
# strip the prefix to get the category label (有 / 無 / 調査中).
df["接触歴"] = df1[df1.sum(axis=1) == 1].idxmax(axis=1).str.replace("接触歴_", "")

# Split the onset date into year/month/day using the vectorized .dt accessor
# instead of a row-wise apply(lambda ...) — faster and NaT-safe by construction.
df_date = pd.DataFrame(
    {
        "year": df["発症日YMD"].dt.year,
        "month": df["発症日YMD"].dt.month,
        "day": df["発症日YMD"].dt.day,
    }
)