PDFのスクレイピングのプログラムすごいな
めちゃくちゃ勉強になった
グラフ表示用のJavaScriptのデータからスクレイピング
import datetime
import json
import re

import pandas as pd

# Page whose chart <script> embeds the daily figures as ``var dataset=...;``.
url = "https://web.pref.hyogo.lg.jp/kf16/singatakoronakensa.html"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

# The page only carries month/day values, so the year has to be supplied.
YEAR = 2020

# Column names applied to the rows of the embedded dataset.
COLUMNS = ["labels", "検査検体数", "検査累計", "陽性確認", "陽性累計"]

# Format of the "last_update" field written to every output file.
LAST_UPDATE_FORMAT = "%Y/%m/%d %H:%M"


def dumps_json(file_name, json_data):
    """Write *json_data* to *file_name* as indented, non-ASCII-escaped JSON.

    The file is always written as UTF-8: ``ensure_ascii=False`` emits the
    Japanese keys verbatim, so relying on the platform default encoding
    could produce a non-UTF-8 file or raise ``UnicodeEncodeError``.
    """
    with open(file_name, "w", encoding="utf-8") as fw:
        json.dump(json_data, fw, ensure_ascii=False, indent=4)


def parse_last_update(text, year=YEAR):
    """Parse the last-update note (``※<month>/<day> <hour>時...``).

    Args:
        text: Note text; must start with the ``※M/D H時`` pattern.
        year: Year to combine with the month/day found in the note.

    Returns:
        A naive ``datetime.datetime`` at minute 0 of the given hour.

    Raises:
        ValueError: If *text* does not start with the expected pattern.
    """
    found = re.match(r"※(\d+)/(\d+)\s(\d+)時", text)
    if found is None:
        raise ValueError(f"unrecognised last-update note: {text!r}")
    month, day, hour = map(int, found.groups())
    return datetime.datetime(year, month, day, hour, 0)


def parse_dataset(script_text):
    """Extract and decode the JSON value assigned to ``var dataset``.

    Args:
        script_text: Script source; must start with ``var dataset=``.

    Returns:
        The decoded JSON value (everything up to the first ``;``).

    Raises:
        ValueError: If the assignment is missing (``json.JSONDecodeError``,
            a ``ValueError`` subclass, is raised for malformed JSON).
    """
    found = re.match(r"var dataset=(.*?);", script_text, re.DOTALL)
    if found is None:
        raise ValueError("script does not start with 'var dataset=...;'")
    return json.loads(found.group(1))


def build_frame(dataset, year=YEAR):
    """Load *dataset* into a DataFrame and add a parsed ``date`` column.

    ``labels`` holds ``month/day`` strings; they are prefixed with *year*
    before being parsed.
    """
    df = pd.DataFrame(dataset, columns=COLUMNS)
    df["date"] = pd.to_datetime(f"{year}/" + df["labels"], format="%Y/%m/%d")
    return df


def build_inspections_summary(df, last_update):
    """Build inspections_summary: label, tested samples, positives.

    The data is column-oriented (one list per column).
    """
    columns = df.loc[:, ["labels", "検査検体数", "陽性確認"]]
    return {
        "data": columns.to_dict(orient="list"),
        "last_update": last_update.strftime(LAST_UPDATE_FORMAT),
    }


def build_patients_summary(df, last_update):
    """Build patients_summary: one record per day with date and subtotal.

    Dates are formatted like ``2020-02-01T00:00:00+09:00``; the daily
    positive count is emitted under the key ``小計``.
    """
    records = pd.DataFrame(
        {
            "日付": df["date"].dt.strftime("%Y-%m-%dT%H:%M:%S+09:00"),
            "小計": df["陽性確認"],
        }
    )
    return {
        "data": records.to_dict(orient="records"),
        "last_update": last_update.strftime(LAST_UPDATE_FORMAT),
    }


def build_inspections(df, last_update):
    """Build inspections: one record per day with date, samples, positives.

    Dates are formatted day-first, like ``31/01/2020``.
    """
    records = pd.DataFrame(
        {
            "判明日": df["date"].dt.strftime("%d/%m/%Y"),
            "検査検体数": df["検査検体数"],
            "陽性確認": df["陽性確認"],
        }
    )
    return {
        "data": records.to_dict(orient="records"),
        "last_update": last_update.strftime(LAST_UPDATE_FORMAT),
    }


def fetch_source_texts():
    """Download the page and return ``(note_text, script_text)``.

    ``script_text`` is the content of the <script> containing
    ``var dataset``; ``note_text`` is the text of the first <center>
    element after it, which carries the last-update note.

    Raises:
        requests.HTTPError: On a non-success HTTP status.
        ValueError: If either element is missing from the page.
    """
    # Imported here so the pure parsing helpers above stay importable
    # without the network libraries.
    import requests
    from bs4 import BeautifulSoup

    # A timeout keeps an unresponsive server from hanging the script.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    tag = soup.find("script", string=re.compile("var dataset"))
    if tag is None:
        raise ValueError("no <script> containing 'var dataset' was found")
    note = tag.find_next("center")
    if note is None:
        raise ValueError("no <center> element follows the dataset script")
    return note.get_text(strip=True), tag.get_text(strip=True)


def main():
    """Scrape the page and write the three JSON files."""
    note_text, script_text = fetch_source_texts()
    last_update = parse_last_update(note_text)
    df = build_frame(parse_dataset(script_text))

    dumps_json(
        "inspections_summary.json", build_inspections_summary(df, last_update)
    )
    dumps_json("patients_summary.json", build_patients_summary(df, last_update))
    dumps_json("inspections.json", build_inspections(df, last_update))


if __name__ == "__main__":
    main()
import pandas as pd

# Read the PCR workbook from the URL below, using its first column as the
# index, then replace every empty cell with 0.
df_pcr = pd.read_excel(
    "https://web.pref.hyogo.lg.jp/kf16/documents/pcr.xlsx",
    index_col=0,
)
df_pcr = df_pcr.fillna(0)

# With the gaps filled, the test-count column can be stored as integers.
df_pcr["PCR検査件数"] = df_pcr["PCR検査件数"].astype("int64")

# Notebook-style bare expressions: the table itself, then per-column totals.
df_pcr
df_pcr.sum()