Data wrangling the Excel file on the status of patients infected with the novel coronavirus in Hyogo Prefecture

# -*- coding: utf-8 -*-

import datetime
import json
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://web.pref.hyogo.lg.jp/kk03/corona_kanjyajyokyo.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# Day-of-week labels, Monday first (matches pandas dt.dayofweek)
days = ["月", "火", "水", "木", "金", "土", "日"]

r = requests.get(url, headers=headers)

r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# Scraping

tag = soup.select_one("a.icon_excel")
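# select_one returns None when no matching link exists; a small guard (a sketch) makes that failure explicit
if tag is None:
    raise ValueError("Excel link (a.icon_excel) not found on the page")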

s = tag.get_text(strip=True)

link = urljoin(url, tag.get("href"))

# Last updated timestamp

ms = re.match(r"新型コロナウイルスに感染した患者の状況[((](\d+)月(\d+)日\s(\d+)時現在[))]", s)
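# re.match returns None if the heading text changes format; fail loudly before unpacking (a sketch)
if ms is None:
    raise ValueError(f"Unexpected link text: {s}")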

m, d, h = map(int, ms.groups())

# The page omits the year, so take it from the current date; the hour is added via timedelta
last_update = datetime.datetime(datetime.datetime.now().year, m, d, 0, 0) + datetime.timedelta(hours=h)

# Data wrangling
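# The Excel link scraped above is otherwise unused; a small download step (a sketch, saving to
# the "data.xlsx" filename that read_excel expects below, using the same request headers)
# ties the two together.
r_xlsx = requests.get(link, headers=headers)
r_xlsx.raise_for_status()

with open("data.xlsx", "wb") as fw:
    fw.write(r_xlsx.content)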

# The 確認日 column holds Excel serial day numbers; Excel's day zero is 1899-12-30
def my_parser(serial):
    return pd.to_datetime(serial, unit="D", origin=pd.Timestamp("1899/12/30"))


df = pd.read_excel("data.xlsx", skiprows=3, skipfooter=2, parse_dates=[2], date_parser=my_parser)

df

# Drop columns that are entirely empty
df.dropna(how='all', axis=1, inplace=True)

df["リリース日"] = df["確認日"].dt.strftime("%Y-%m-%dT%H:%M:%S+09:00")

df["date"] = df["確認日"].dt.strftime("%Y-%m-%d")

df["年代"] = df["年代"].astype(str) + "代"

df["week"] = df["確認日"].dt.dayofweek

df["曜日"] = df["week"].apply(lambda x: days[x])


df.rename(columns={"番号": "No", "備考欄": "備考"}, inplace=True)
df.set_index("No", inplace=True)
df.sort_index(inplace=True)

df1 = df.loc[:, ["リリース日", "曜日", "居住地", "年代", "性別", "備考", "date"]].copy()

df1

patients = {
    "data": df1.to_dict(orient="records"),
    "last_update": last_update.strftime("%Y/%m/%d %H:%M"),
}

patients


with open("patients.json", "w", encoding="utf-8") as fw:
    json.dump(patients, fw, ensure_ascii=False, indent=4)
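# Quick sanity check of the output shape (keys come from the df1 columns selected above;
# the "No" index is not included because orient="records" ignores the index)
print(json.dumps(patients["data"][:1], ensure_ascii=False, indent=4))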

# df1.to_csv("patients.csv", encoding="utf_8_sig")

Occurrence status of patients infected with the novel coronavirus

import requests
from bs4 import BeautifulSoup

url = "https://web.pref.hyogo.lg.jp/kk03/corona_hasseijyokyo.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

r = requests.get(url, headers=headers)

r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

tbl = soup.find("table", class_="datatable")

# Table caption, shown for reference
tbl.caption.get_text(strip=True)

result = []

# Collect the cell text of every data row
for tr in tbl.tbody.find_all("tr"):
    tds = [td.get_text(strip=True) for td in tr.find_all("td")]

    result.append(tds)
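# result is a plain list of row texts; a possible follow-up (a sketch, assuming the table's
# column headers are its <th> cells and that their count matches the data columns) turns it
# into a DataFrame:
import pandas as pd

columns = [th.get_text(strip=True) for th in tbl.find_all("th")]
df2 = pd.DataFrame(result, columns=columns if result and len(columns) == len(result[0]) else None)
df2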