web.pref.hyogo.lg.jp
import pathlib
import re
from urllib.parse import urljoin
import pandas as pd
import requests
from bs4 import BeautifulSoup
def fetch_soup(url, parser="html.parser"):
r = requests.get(url)
r.raise_for_status()
soup = BeautifulSoup(r.content, parser)
return soup
def fetch_file(url, dir="."):
p = pathlib.Path(dir, pathlib.PurePath(url).name)
p.parent.mkdir(parents=True, exist_ok=True)
if not p.exists():
r = requests.get(url)
r.raise_for_status()
with p.open(mode="wb") as fw:
fw.write(r.content)
return p
def excel2date(data):
flg_is_serial = data.astype("str").str.isdigit()
fromSerial = pd.to_datetime(
data[flg_is_serial].astype(float), unit="D", origin=pd.Timestamp("1899/12/30")
)
fromString = pd.to_datetime(data[~flg_is_serial], errors="coerce")
result = pd.concat([fromString, fromSerial])
return result
headers = [
["番号", "発表日", "年代", "性別", "管轄", "居住地", "職業", "発症日", "渡航歴", "備考欄"],
[
"番号",
"発表日",
"年代",
"性別",
"管轄",
"居住地",
"職業",
"発症日",
"接触歴_有",
"接触歴_無",
"接触歴_調査中",
"備考欄",
],
]
links = [
{
"url": "https://web.pref.hyogo.lg.jp/kk03/documents/corona-kanjajokyou1.xlsx",
"skip": 5,
"header": headers[0],
"usecols": "B:K",
},
{
"url": "https://web.pref.hyogo.lg.jp/kk03/documents/corona-kanjajokyou2.xlsx",
"skip": 3,
"header": headers[1],
"usecols": "B:M",
},
{
"url": "https://web.pref.hyogo.lg.jp/kk03/documents/corona-kanjajokyou3.xlsx",
"skip": 5,
"header": headers[1],
"usecols": "B:M",
},
{
"url": "https://web.pref.hyogo.lg.jp/kk03/documents/corona-kanjajokyou.xlsx",
"skip": 5,
"header": headers[1],
"usecols": "B:M",
},
]
url = "https://web.pref.hyogo.lg.jp/kk03/corona_hasseijyokyo.html"
soup = fetch_soup(url)
tag = soup.find("a", class_="icon_excel", text=re.compile("^新型コロナウイルスに感染した患者の状況"))
if tag:
links[-1]["url"] = urljoin(url, tag.get("href"))
dfs = []
for d in links:
p = fetch_file(d["url"])
df_tmp = pd.read_excel(
p, skiprows=d["skip"], usecols=d["usecols"], header=None
).dropna(thresh=3)
df_tmp.set_axis(d["header"], axis=1, inplace=True)
df_tmp.drop_duplicates(subset="番号", inplace=True)
df_tmp.set_index("番号", inplace=True)
df_tmp["症状"] = df_tmp["発症日"].where(df_tmp["発症日"].isin(["症状なし", "調査中", "非公表", "なし"]))
df_tmp["発表日YMD"] = excel2date(df_tmp["発表日"])
df_tmp["発症日YMD"] = excel2date(df_tmp["発症日"])
dfs.append(df_tmp)
df = pd.concat(dfs).sort_index()
df.to_csv("data.csv", encoding="utf_8_sig")
df.info()
df["発症日数"] = df["発表日YMD"] - df["発症日YMD"]
df["発症日YMD"] = df["発症日YMD"].mask(df2["発症日数"] < pd.Timedelta(-180, unit="D"), df["発症日YMD"] - pd.offsets.DateOffset(years=1))
df["発症日YMD"] = df["発症日YMD"].mask(df["発症日数"] > pd.Timedelta(180, unit="D"), df["発症日YMD"] + pd.offsets.DateOffset(years=1))
df["発症日数"] = df["発表日YMD"] - df["発症日YMD"]
strdt = df["発表日YMD"].min()
enddt = df["発表日YMD"].max()
dt_range = pd.date_range(strdt, enddt)
df["発表日YMD"].value_counts().reindex(index=dt_range, fill_value=0)
df["年代"].astype(str).value_counts().sort_index()
df["性別"].value_counts().sort_index()
df["管轄"].value_counts().sort_index()
df["居住地"].value_counts().sort_index()
df["職業"].value_counts().sort_index()
ダミー変数からカテゴリ変数に変換
df1 = df.loc[:, ["接触歴_有", "接触歴_無", "接触歴_調査中"]].copy().notnull()
df1[df1.sum(axis=1) > 1]
df["接触歴"] = df1[df1.sum(axis=1) == 1].idxmax(axis=1).str.replace("接触歴_", "")
df_date = df["発症日YMD"].apply(lambda x: pd.Series([x.year, x.month, x.day])).rename(columns={0:"year", 1:"month", 2:"day"})