import datetime
import re
import jaconv
import pandas as pd
import requests
from bs4 import BeautifulSoup

def wareki2date(s):
    """Convert a Japanese era date string (昭和/平成/令和) to datetime.date."""
    m = re.match(r"(昭和|平成|令和)(\d{1,2})年(\d{1,2})月(\d{1,2})日", s)
    year = int(m.group(2))
    month = int(m.group(3))
    day = int(m.group(4))
    # Offset the era year to the corresponding Gregorian year
    if m.group(1) == "昭和":
        year += 1925
    elif m.group(1) == "平成":
        year += 1988
    elif m.group(1) == "令和":
        year += 2018
    return datetime.date(year, month, day)
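
# Quick sanity check of the conversion (illustrative date, not taken from the page):
# Reiwa year 2 = 2018 + 2 = 2020, so "令和2年4月21日" should map to 2020-04-21.
assert wareki2date("令和2年4月21日") == datetime.date(2020, 4, 21)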

# Ishikawa Prefecture's COVID-19 case listing page
url = "https://www.pref.ishikawa.lg.jp/kansen/coronakennai.html"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

r = requests.get(url, headers=headers)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")
contents = soup.find("div", id="tmp_contents")
result = []
# Each case is an <h3> heading beginning with "感染者", preceded by an <h2> carrying the announcement date
for tag in contents.find_all("h3"):
    data = {}

    # Announcement date from the nearest preceding <h2>, converted from the Japanese era
    h2 = tag.find_previous_sibling("h2")
    data["日付"] = wareki2date(h2.get_text(strip=True))

    # Case number from the <h3> text
    h3 = tag.get_text(strip=True)
    data["番号"] = int(re.match(r"感染者(\d+)", h3).group(1))

    # Collect the <p>/<div> siblings up to the next heading, normalizing full-width digits and ASCII
    tmp = []
    for i in tag.find_next_siblings():
        if i.name in ["p", "div"]:
            tmp.append(
                jaconv.z2h(i.get_text(strip=True), kana=False, digit=True, ascii=True)
            )
        else:
            break
    p = "\n".join(tmp)

    # Split the numbered fields: (1) age, (2) sex, (3) residence, (4) details
    m = re.search(r"\(1\)年代:?(.+)\(2\)性別(.+)\(3\)居住地(.+)\(4\)(.+)", p, re.DOTALL)
    s = [j.strip() for j in m.groups() if j]
    data["年代"] = s[0]
    data["性別"] = s[1]
    data["居住地"] = s[2]
    data["内容"] = s[3]

    result.append(data)

df = pd.DataFrame(result)

# "(5)" separates symptoms/course from the activity history inside the details field
df_tmp = df["内容"].str.split(r"\(5\)", expand=True)
df_tmp.rename(columns={0: "症状・経過", 1: "行動歴"}, inplace=True)

df_kanja = pd.concat([df, df_tmp], axis=1)
df_kanja.drop(columns="内容", inplace=True)
df_kanja.set_index("番号", inplace=True)
df_kanja.sort_index(inplace=True)

# Case No. 1 apparently lists the two sections in the opposite order on the page, so swap them back
df_kanja.at[1, "症状・経過"], df_kanja.at[1, "行動歴"] = df_kanja.at[1, "行動歴"], df_kanja.at[1, "症状・経過"]

df_kanja
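
# Optional: persist the cleaned table to CSV (the file name here is an arbitrary choice);
# utf-8-sig keeps the Japanese headers readable when the file is opened in Excel.
df_kanja.to_csv("ishikawa_kanja.csv", encoding="utf-8-sig")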