import io
import pathlib
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Entry page of the search form; also the base for resolving the form action.
base_url = "http://www.qq.pref.ehime.jp/qq38/WP0805/RP080501BL"

# Form fields posted back to the site. The form carries 13 block check flags
# (indices 0-12); only index 3 is switched on ("1"), every other one is "0".
payload = {
    "_blockCd": "",
    "forward_next": "",
    **{
        f"torinBlockDetailInfo.torinBlockDetail[{idx}].blockCheckFlg": (
            "1" if idx == 3 else "0"
        )
        for idx in range(13)
    },
}
# Fetch the search form, copy its CSRF token into the payload, and submit the
# form within the same session so cookies set by the first request are reused.
with requests.Session() as s:
    # Explicit timeouts keep the script from hanging forever on a stalled server.
    r = s.get(base_url, timeout=30)
    r.raise_for_status()

    soup = BeautifulSoup(r.content, "html.parser")

    csrf_input = soup.find("input", attrs={"name": "_csrf"})
    form = soup.find("form", attrs={"id": "_wp0805Form"})
    if csrf_input is None or form is None:
        # Fail with a clear message instead of an AttributeError on None when
        # the page layout is not what this script expects.
        raise RuntimeError(
            "search form or its _csrf field was not found at " + base_url
        )

    token = csrf_input.get("value")
    payload["_csrf"] = token

    # The form action may be relative, so resolve it against the entry URL.
    url = urljoin(base_url, form.get("action"))

    r = s.post(url, data=payload, timeout=30)
    r.raise_for_status()

# Parse every table whose summary attribute marks it as a search-result list.
# The markup is wrapped in StringIO because passing literal HTML text to
# read_html is deprecated in recent pandas releases. Cells containing only the
# "back to top" link text are read as NaN.
dfs = pd.read_html(
    io.StringIO(r.text),
    attrs={"summary": "検索結果一覧を表示しています。"},
    na_values="▲ページトップへ",
)
def dfs_join(dfs):
    """Concatenate raw result tables into one frame with a "日付" column.

    Each table is expected to be laid out as: the value for "日付" in cell
    (0, 0), the column headers in row 1, and the data from row 2 onwards.

    Args:
        dfs: iterable of pandas DataFrames with the layout described above.

    Returns:
        A single DataFrame of all data rows (rows that are entirely NaN are
        dropped), keeping each table's original row labels, with the extra
        "日付" column holding the value taken from that table's first cell.
    """

    def _tidy(table):
        # Promote row 1 to the header and keep only the rows below it.
        body = table.iloc[2:].rename(columns=table.iloc[1])
        body.dropna(how="all", inplace=True)
        body["日付"] = table.iat[0, 0]
        return body

    return pd.concat([_tidy(table) for table in dfs])
def split_tel(se):
    """Split free-text phone fields into daytime and night-time columns.

    Every element of ``se`` is scanned for entries of the form
    ``TEL(昼) <number>`` or ``TEL(夜) <number>``, where the number consists of
    digits and hyphens. The parentheses may be ASCII or full-width.

    Args:
        se: pandas Series of strings.

    Returns:
        DataFrame with exactly the columns "TEL(昼)" and "TEL(夜)", indexed by
        the labels of the rows in which at least one entry was found. A label
        that is absent for a row is NaN; if a label occurs more than once in
        the same row, the first number is kept.
    """
    columns = ["TEL(昼)", "TEL(夜)"]

    # The parentheses around 昼/夜 are literal text, so they are matched with
    # character classes. Left bare they would be regex groups, the literal
    # "(" / ")" in the data would never match, and the captured label could
    # not equal the column names above.
    found = se.str.extractall(r"TEL[(\uFF08]([昼夜])[)\uFF09]\s*([0-9\-]+)")
    if found.empty:
        return pd.DataFrame(index=se.index[:0], columns=columns)

    # Level 0 of the extractall index is the original row label; using it
    # directly avoids depending on the "level_0" name that reset_index()
    # only produces for unnamed indexes.
    rows = found.index.get_level_values(0)
    labels = ("TEL(" + found[0] + ")").to_numpy()
    numbers = pd.Series(
        found[1].to_numpy(), index=pd.MultiIndex.from_arrays([rows, labels])
    )

    # A repeated label inside one row would make the reshape fail; keep the
    # first occurrence.
    numbers = numbers[~numbers.index.duplicated(keep="first")]

    return numbers.unstack().reindex(columns=columns)
# Combine the parsed tables into one frame; the first matched table is skipped.
# NOTE(review): presumably dfs[0] is not a per-date result table -- confirm
# against the page markup.
df0 = dfs_join(dfs[1:]).reset_index(drop=True)

# Remove the "医療法人" prefix together with one following whitespace
# character, then split the remaining text on whitespace into at most three
# parts: name, address, and the rest (phone text).
df0["医療機関情報"] = df0["医療機関情報"].str.replace(
    r"医療法人\s+?", "", regex=True
)
df0[["病院名", "住所", "電話番号"]] = df0["医療機関情報"].str.split(
    n=2, expand=True
)

# Pull the TEL(昼)/TEL(夜) numbers out of the phone text into separate columns.
df_tel = split_tel(df0["電話番号"])
df0 = df0.join(df_tel)

# First whitespace-separated token goes to "日中", the remainder to "夜間".
# reindex() guarantees two columns even when no row contains a second token,
# in which case "夜間" is all NaN instead of the assignment failing.
df0[["日中", "夜間"]] = (
    df0["外来受付時間"].str.split(n=1, expand=True).reindex(columns=[0, 1])
)

# The date text holds the date followed by the weekday.
df0[["日付", "曜日"]] = df0["日付"].str.split(expand=True)

# Convert "YYYY年M月D日" into a real datetime via year/month/day columns.
df_date = df0["日付"].str.extract(r"(\d{4})年(\d{1,2})月(\d{1,2})日").astype(int)
df_date.rename(columns={0: "year", 1: "month", 2: "day"}, inplace=True)
df0["日付"] = pd.to_datetime(df_date)

col = ["日付", "曜日", "病院名", "住所", "TEL(昼)", "TEL(夜)", "診療科目", "日中", "夜間"]
df1 = df0.reindex(columns=col)

# Go long (one row per time slot) so that slots can be relabelled.
df2 = df1.melt(
    id_vars=list(df1.columns[:7]), var_name="時間帯", value_name="診察時間"
).dropna(subset=["診察時間"])

# Any slot whose time text mentions "翌日" is labelled "夜間". The result is
# assigned back explicitly: an in-place mask() on the selected column is a
# chained operation that does not update df2 under pandas Copy-on-Write.
df2["時間帯"] = df2["時間帯"].mask(df2["診察時間"].str.contains("翌日"), "夜間")

# Back to wide form with one row per facility/date and the original column
# order (a slot column that no longer occurs is re-created as NaN).
df3 = (
    df2.pivot(index=list(df2.columns[:7]), columns="時間帯", values="診察時間")
    .reset_index()
    .reindex(columns=col)
)

df3.sort_values(by=["日付", "日中", "夜間"], inplace=True)

# Name the output after the date in the first row after sorting; positional
# access does not depend on which index label ended up first.
dt_str = df3["日付"].iloc[0].strftime("%Y-%m-%d")

p = pathlib.Path("data", f"{dt_str}.csv")
p.parent.mkdir(parents=True, exist_ok=True)

# utf_8_sig writes a BOM at the start of the file.
df3.to_csv(p, encoding="utf_8_sig", index=False)