# Source: www.pref.tochigi.lg.jp (original reference: github.com)
# Setup:
#   apt install python3-tk ghostscript
#   pip install "camelot-py[cv]"
#   pip install jaconv
import requests
from bs4 import BeautifulSoup
import re
import jaconv
import datetime
from urllib.parse import urljoin
import camelot
import pandas as pd
# Fetch the prefecture page, find the link to the case-list PDF, and read
# every table in the PDF into DataFrames.
url = "http://www.pref.tochigi.lg.jp/e04/welfare/hoken-eisei/kansen/hp/coronakensahasseijyoukyou.html"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}
# timeout= prevents the script from hanging forever on a stalled server.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html5lib")
# string= replaces the deprecated text= keyword; matches the anchor whose
# text begins with the case-list title.
tag = soup.find("a", string=re.compile("^栃木県における新型コロナウイルス感染症の発生状況一覧"))
link = urljoin(url, tag.get("href"))
# line_scale=40 makes Camelot's lattice parser detect thinner rule lines;
# strip_text removes embedded newlines inside cells.
tables = camelot.read_pdf(
    link, pages="all", split_text=True, strip_text="\n", line_scale=40
)
dfs = [table.df for table in tables]
df_tmp = pd.concat(dfs)
# The first row of each table carries the column headers; promote it to the
# header via the double transpose, then index rows by the 番号 (case number).
df = df_tmp.T.set_index(0).T.set_index("番号")
# Convert full-width (zenkaku) digits and ASCII to half-width so the
# date regex below can match them; kana are left untouched.
df["陽性判明日"] = df["陽性判明日"].apply(
    lambda s: jaconv.z2h(s, kana=False, digit=True, ascii=True)
)
# Capture "M/D" plus an optional parenthesized "(M/D <status>)" suffix.
# Raw string avoids the invalid-escape-sequence SyntaxWarning for \d and \(.
df_date = df["陽性判明日"].str.extract(
    r"(\d{1,2}/\d{1,2})\s*(\((\d{1,2}/\d{1,2}) +(.+)\))?", expand=True
)
# Assignment instead of inplace=True (inplace is discouraged/deprecated).
df_date = df_date.fillna("")
dt_now = datetime.datetime.now()
def my_parser(s, year=None):
    """Parse a half-width "M/D" string into a pandas Timestamp.

    Args:
        s: Date string such as "4/15"; an empty/falsy value yields NaT.
        year: Year to attach to the parsed month/day. Defaults to the
            year captured in the module-level ``dt_now`` (backward
            compatible with the original behavior).

    Returns:
        pd.Timestamp for the parsed date, or pd.NaT when ``s`` is empty.
    """
    if not s:
        return pd.NaT
    if year is None:
        # NOTE(review): assumes all dates fall in the current year —
        # entries from a previous December would get the wrong year.
        year = dt_now.year
    # Raw string avoids an invalid-escape warning; expects exactly M and D.
    m, d = map(int, re.findall(r"[0-9]{1,2}", s))
    return pd.Timestamp(year=year, month=m, day=d)
# df_date column 0 = confirmation date, column 2 = optional second date,
# column 3 = status text attached to that second date.
df["陽性判明日"] = df_date[0].apply(my_parser)
df_date[2] = df_date[2].apply(my_parser)
# Keep the second date only where the status says 退院 (discharged);
# everything else becomes NaT.
df["退院"] = df_date[2].where(df_date[3] == "退院")
# (Removed a bare `df_date` expression — a notebook-display leftover that
# is a no-op in a script.)
df.to_csv("covid19.csv")