import datetime
import pathlib
import re
from urllib.parse import urljoin

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup
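
# Download the file at url into dest_dir and return the local pathlib.Path.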
def fetch_file(url, dest_dir="."):
    # send the same User-Agent as the page request (headers is defined below,
    # before this function is first called)
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    p = pathlib.Path(dest_dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)
    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p
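
# Convert a Japanese era date string (昭和/平成/令和 N年M月D日) to datetime.date;
# falls back to today's date in JST when no era date is found.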
def wareki2date(s):
    m = re.search(r"(昭和|平成|令和)([ 0-9元]{1,2})年(\d{1,2})月(\d{1,2})日", s)
    if m:
        # "元" (gannen, an era's first year) counts as year 1
        year, month, day = [1 if i == "元" else int(i.strip()) for i in m.group(2, 3, 4)]
        if m.group(1) == "昭和":
            year += 1925
        elif m.group(1) == "平成":
            year += 1988
        elif m.group(1) == "令和":
            year += 2018
        return datetime.date(year, month, day)
    else:
        return dt_now.date()
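
# Parse a bare "M/D" fragment (e.g. "3/15") into a Timestamp in the current
# JST year; returns NaT unless the string contains exactly two numbers.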
def days2date(s):
    y = dt_now.year
    days = re.findall(r"[0-9]{1,2}", s)
    if len(days) == 2:
        m, d = map(int, days)
        return pd.Timestamp(year=y, month=m, day=d)
    else:
        return pd.NaT
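
# Ehime Prefecture's COVID-19 page, which links to the per-announcement PDFs.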
url = "https://www.pref.ehime.jp/h25500/kansen/covid19.html"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}
JST = datetime.timezone(datetime.timedelta(hours=+9), "JST")
dt_now = datetime.datetime.now(JST)
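# Collect links to press-release PDFs titled 新型コロナウイルスの感染の確認について
# ("Confirmation of novel coronavirus infection").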
r = requests.get(url, headers=headers)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
links = [
    urljoin(url, tag.get("href"))
    for tag in soup.find_all(
        "a", href=re.compile(r"\.pdf$"), string=re.compile("新型コロナウイルスの感染の確認について")
    )
]
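
# Download the n most recent press releases and extract the case tables.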
n = 7
dfs = []
for link in links[:n]:
    path_pdf = fetch_file(link)
    with pdfplumber.open(path_pdf) as pdf:
        dt_text = ""
        dt_update = dt_now.date()
        for page in pdf.pages:
            if page.page_number == 1:
                # the release date sits in a band near the top of page 1
                dt_text = page.within_bbox((0, 65, page.width, 100)).extract_text() or ""
                dt_update = wareki2date(re.sub(r"\s", "", dt_text))
            tables = page.extract_tables()
            for table in tables:
                df_tmp = pd.DataFrame(
                    # merged header cells can come back as None from pdfplumber
                    table[1:], columns=[re.sub(r"\s", "", col or "") for col in table[0]]
                )
                nrows, ncols = df_tmp.shape
                # keep only the case tables (several rows, seven or more columns)
                if (nrows > 1) and (ncols > 6):
                    # forward-fill the 事例 (case-cluster) column, whose merged
                    # cells are blank after extraction
                    df_tmp["事例"] = df_tmp["事例"].ffill()
                    df_tmp["陽性者"] = df_tmp["陽性者"].replace(r"\s", "", regex=True)
                    df_tmp["住所地"] = df_tmp["住所地"].replace(r"\s", "", regex=True)
                    df_tmp["職業"] = df_tmp["職業"].replace(r"\s", "", regex=True)
                    df_tmp["公表日"] = dt_update
                    dfs.append(df_tmp)
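
# Combine the per-PDF tables and derive structured columns from the
# free-text 備考 (remarks) field.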
df = pd.concat(dfs).reset_index(drop=True)
# case number, e.g. "123人目" -> 123
df["No"] = df["陽性者"].str.extract(r"(\d{1,3})人目", expand=False).astype(int)
note = df["備考"].fillna("").str.replace(r"\s", "", regex=True)
# onset date, e.g. "3/15発症"
df["発症日"] = note.str.extract(r"(\d{1,2}/\d{1,2})発症", expand=False)
df["発症日"] = df["発症日"].fillna("").apply(days2date)
# symptoms appear in parentheses (full-width in the PDFs) after 発症/症状有
symptom = note.str.extract(r"(発症|症状有)（(.+)）").rename(columns={0: "状況", 1: "症状"})
df["症状"] = symptom["症状"].mask(note.str.contains("症状なし"), "症状なし")
df["入院"] = note.str.extract(r"指定医療機関に(入院済み?|入院予定)", expand=False)
df["県外滞在歴"] = note.str.extract(r"県外滞在歴(あり|なし)", expand=False)
# references to earlier cases, e.g. "12人目の濃厚接触者"
df["接触"] = note.apply(lambda s: ";".join(re.findall(r"(\d+)人目", s)))
df["関係"] = note.str.extract(r"の(濃厚接触者|接触者|関係者|家族)", expand=False)
df["クラスタ"] = df["事例"].fillna("").apply(lambda s: ";".join(re.findall(r"(\d+)事例目", s)))
df.sort_values(by=["クラスタ", "No"], inplace=True)
df.to_csv("ehime.csv")