import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Shared HTTP headers: a desktop IE11 User-Agent so the prefecture site
# serves its normal desktop pages.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0)"
        " like Gecko"
    ),
}
def get_link():
    """Return the absolute URL of the latest COVID-19 patient press release.

    Fetches the Osaka prefecture index page and follows the first link under
    the 最新情報 heading whose text starts with 新型コロナウイルス感染症患者.

    Returns:
        str: absolute URL of the press-release page.

    Raises:
        requests.HTTPError: if the index page cannot be fetched.
        AttributeError: if the expected heading or link is missing.
    """
    url = "http://www.pref.osaka.lg.jp/iryo/osakakansensho/corona.html"
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html.parser")
    h2 = soup.find("h2", string="最新情報")
    link = h2.find_next("a", string=re.compile("^新型コロナウイルス感染症患者")).get("href")
    # hrefs on this site are commonly site-relative; resolve against the
    # final response URL (after redirects) so callers can fetch it directly.
    return urljoin(r.url, link)
# Fetch the latest press-release page and locate the 別紙 (attachment)
# Excel file that carries the per-patient details.
link = get_link()
r = requests.get(link, headers=headers)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")
# Raw string avoids the invalid "\." escape warning; the href may be
# relative, so resolve it against the page URL before handing it to pandas.
url = urljoin(r.url, soup.find("a", string="別紙", href=re.compile(r"\.xlsx$")).get("href"))
import datetime
def excel_date(num):
    """Convert an Excel serial date number to a ``datetime.datetime``.

    Excel's 1900 date system counts days from 1899-12-30 (the -30 absorbs
    Excel's phantom 1900-02-29); a fractional part encodes the time of day,
    so both ints and floats are accepted.

    Args:
        num: Excel serial number (int or float). Any other value — e.g. the
            empty string left behind by ``fillna("")`` — yields ``None``.

    Returns:
        datetime.datetime or None
    """
    # bool is a subclass of int: exclude it so True/False are not dates.
    if isinstance(num, (int, float)) and not isinstance(num, bool):
        return datetime.datetime(1899, 12, 30) + datetime.timedelta(days=num)
    return None
import pandas as pd

# The workbook stacks one small table per patient; each table begins with a
# header row whose first cell is 番号, so a cumulative sum over that marker
# assigns every row to its patient's group.
dfe = pd.read_excel(url, header=None)
grouped_df = dfe.groupby((dfe[0] == "番号").cumsum())

result = []
for i in grouped_df.groups:
    # Fill NaNs on a copy: fillna(inplace=True) on get_group's view raises
    # SettingWithCopyWarning and may silently not stick in newer pandas.
    dfg = grouped_df.get_group(i).fillna("")
    data = {}
    # Fixed cell positions within each per-patient sub-table
    # (row/column offsets relative to the 番号 header row).
    data["番号"] = dfg.iat[1, 0]
    data["年代"] = dfg.iat[1, 1]
    data["家族構成"] = dfg.iat[1, 3]
    data["発症日"] = excel_date(dfg.iat[1, 4])
    # str() guards against stray numeric cells that would break join().
    data["特記事項"] = "\n".join(str(v) for v in dfg.iloc[1:, 5].values).strip()
    data["性別"] = dfg.iat[3, 1]
    data["職業"] = dfg.iat[3, 3]
    data["症状"] = dfg.iat[3, 4]
    data["居住地"] = dfg.iat[6, 1]
    # City name appears wrapped in full- or half-width parentheses.
    data["市"] = "".join(str(v) for v in dfg.iloc[7, 1:3].values).strip("()()")
    data["勤務状況"] = "\n".join(str(v) for v in dfg.iloc[6:8, 3].values).strip()
    data["症状詳細"] = "\n".join(str(v) for v in dfg.iloc[4:, 4].values).strip()
    result.append(data)

df = pd.DataFrame(result)
df.to_csv("osaka.csv")