github.com
- 症状がわからないので入院中を軽症・中等症に分類しています
- PDFの3ページ目の表に見出し行（タイトル）があるかわからないので、見出し行がある前提で各ページの表を結合しています
apt install python3-tk ghostscript
pip install camelot-py[cv]
pip install pycurl
pip install retry
pip install pandas
pip install beautifulsoup4
pip install requests
import datetime
import io
import json
import pathlib
import re
from urllib.parse import urljoin

import camelot
import pandas as pd
import pycurl
import requests
from bs4 import BeautifulSoup
from retry import retry
@retry(tries=5, delay=5, backoff=3)
def get_file(url, dir="."):
    """Download ``url`` into directory ``dir`` and return the local path.

    The local file name is the last path component of ``url``; the target
    directory is created when it does not exist. The function is wrapped
    by ``retry(tries=5, delay=5, backoff=3)``, and every attempt reopens the
    file in ``"wb"`` mode, so a partial download is truncated on retry.

    Args:
        url: Address of the file to download.
        dir: Destination directory (the name shadows the builtin but is
            kept for backward compatibility with keyword callers).

    Returns:
        pathlib.Path: Path of the downloaded file.

    Raises:
        pycurl.error: If the transfer fails or the server answers with an
            HTTP error status (once the retries are exhausted).
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode="wb") as f:
        c = pycurl.Curl()
        try:
            c.setopt(c.URL, url)
            c.setopt(c.WRITEDATA, f)
            # Without this an HTTP error page would be saved as if it were
            # the requested file and the retry decorator would never fire.
            c.setopt(c.FAILONERROR, True)
            c.perform()
        finally:
            # Release the curl handle even when perform() raises.
            c.close()

    return p
def my_parser(s, now=None):
    """Parse a year-less "month/day" string into a ``pd.Timestamp``.

    The first two 1-2 digit numbers found in ``s`` are taken as month and
    day. Because the source text carries no year, the year of ``now`` is
    tried first; when that date does not exist (e.g. Feb 29) or lies after
    ``now``, the previous year is used instead, so that entries from last
    December are not turned into future dates when parsed in January.

    Args:
        s: Text such as ``"3月4日"``. An empty/falsy value yields ``pd.NaT``.
        now: Naive reference ``datetime``; defaults to the current local
            time.

    Returns:
        pd.Timestamp at midnight of the parsed day, or ``pd.NaT``.

    Raises:
        ValueError: If ``s`` does not contain exactly two numbers or they do
            not form a valid date in either candidate year.
    """
    if not s:
        return pd.NaT

    if now is None:
        now = datetime.datetime.now()

    m, d = map(int, re.findall(r"[0-9]{1,2}", s))
    today = pd.Timestamp(now).normalize()

    try:
        ts = pd.Timestamp(year=now.year, month=m, day=d)
    except ValueError:
        # Not a valid date this year (e.g. Feb 29); fall back to last year.
        ts = None

    if ts is None or ts > today:
        ts = pd.Timestamp(year=now.year - 1, month=m, day=d)

    return ts
# Page that carries the link to the case-list PDF.
url = "https://www.pref.miyazaki.lg.jp/kenko/hoken/kansensho/covid19/hassei.html"

# Browser-like User-Agent sent with every request made by this script.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# ``string=`` is the current name of the deprecated ``text=`` filter.
tag = soup.find("a", class_="icon_pdf", string=re.compile("^宮崎県内における感染者発生状況"))

# Fail with a clear message when the page layout changed instead of an
# AttributeError on None.
if tag is None or not tag.get("href"):
    raise RuntimeError(f"PDF link not found on {url}")

link = urljoin(url, tag.get("href"))

file = get_file(link)
# Extract every table from page 2 to the end of the downloaded PDF.
tables = camelot.read_pdf(
    str(file), pages="2-end", split_text=True, strip_text="\n", line_scale=40
)

# For each table, promote its first row to the column header and index the
# rows by the "No." column, then stack all tables into one frame.
df0 = pd.concat([table.df.T.set_index(0).T.set_index("No.") for table in tables])

# Split "現在の状況" into an optional discharge date and the status keyword.
# Raw string: "\d" in a normal literal is an invalid escape sequence.
df1 = (
    df0["現在の状況"]
    .str.extract(r"(\d{1,2}月\d{1,2}日)?(入院中|退院|死亡)")
    .rename(columns={0: "退院日", 1: "退院"})
)

df_kanja = pd.concat([df0, df1], axis=1)
# Timestamp of this run; also read by my_parser() to supply the year.
dt_now = datetime.datetime.now()
dt_update = dt_now.strftime("%Y/%m/%d")

# Root of the JSON document written at the end of the script.
data = {"lastUpdate": dt_update}

# Convert the year-less date texts into timestamps (missing -> NaT).
df_kanja["判明日"] = df_kanja["判明日"].apply(my_parser)
df_kanja["退院日"] = df_kanja["退院日"].fillna("").apply(my_parser)

df_kanja["リリース日"] = df_kanja["判明日"].dt.strftime("%Y-%m-%dT08:00:00.000Z")
df_kanja["date"] = df_kanja["判明日"].dt.strftime("%Y-%m-%d")

df_patients = df_kanja.loc[:, ["リリース日", "居住地", "年代", "性別", "退院", "date"]]

data["patients"] = {
    "data": df_patients.to_dict(orient="records"),
    "date": dt_update,
}

# Cases per confirmation day, with days that had no case filled in as 0.
daily_counts = df_kanja["判明日"].value_counts().sort_index().asfreq("D", fill_value=0)

# Build the frame explicitly instead of via reset_index(): the column names
# produced by value_counts().reset_index() differ between pandas 1.x
# ("index"/"判明日") and pandas 2.x ("判明日"/"count").
df_patients_sum = pd.DataFrame(
    {
        "小計": daily_counts.to_numpy(),
        "日付": daily_counts.index.strftime("%Y-%m-%dT08:00:00.000Z"),
    }
)

data["patients_summary"] = {
    "data": df_patients_sum.to_dict(orient="records"),
    "date": dt_update,
}
# Page that carries the consultation / test count table.
r = requests.get(
    "https://www.pref.miyazaki.lg.jp/kansensho-taisaku/kenko/hoken/covid19.html",
    headers=headers,
    timeout=30,
)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# The table is the first <table> sibling following the matching <h3>.
# ``string=`` is the current name of the deprecated ``text=`` filter.
heading = soup.find("h3", string="相談・検査件数")
table = heading.find_next_sibling("table") if heading is not None else None

# Fail with a clear message when the page layout changed instead of an
# AttributeError on None.
if table is None:
    raise RuntimeError("Table under heading '相談・検査件数' not found")

# Wrap the markup in StringIO: passing literal HTML to read_html is
# deprecated in recent pandas. The table has a two-row header.
df_test = pd.read_html(io.StringIO(table.prettify()), index_col=0, header=[0, 1])[0]
# Number of patients per status; statuses that do not occur count as 0.
sr_main_sum = (
    df_kanja["退院"].value_counts().reindex(["入院中", "退院", "死亡"], fill_value=0).astype(int)
)

# Scalar lookups in the cumulative ("累計") row. Selecting with a full
# (level0, level1) column key yields a single value, avoiding int() on a
# one-element Series, which is deprecated in recent pandas.
tested_total = int(df_test.loc["累計", ("検査件数", "合計")])
positive_total = int(df_test.loc["累計", ("検査件数", "陽性件数")])

hospitalized = int(sr_main_sum["入院中"])

data["main_summary"] = {
    "attr": "検査実施人数",
    "value": tested_total,
    "children": [
        {
            "attr": "陽性患者数",
            "value": positive_total,
            "children": [
                {
                    "attr": "入院中",
                    "value": hospitalized,
                    # Symptoms are not known from the source, so every
                    # hospitalized patient is counted as mild/moderate and
                    # the severe count is fixed at 0.
                    "children": [
                        {"attr": "軽症・中等症", "value": hospitalized},
                        {"attr": "重症", "value": 0},
                    ],
                },
                {"attr": "退院", "value": int(sr_main_sum["退院"])},
                {"attr": "死亡", "value": int(sr_main_sum["死亡"])},
            ],
        }
    ],
}

# Write the collected data as UTF-8 JSON, keeping Japanese text unescaped.
with open("data.json", "w", encoding="utf-8") as fw:
    json.dump(data, fw, ensure_ascii=False, indent=4)