github.com
PDFのスクレイピングのプログラムすごいな
めちゃくちゃ勉強になった
グラフ表示用のJavaScriptのデータからスクレイピング
import datetime
import json
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
def dumps_json(file_name, json_data):
    """Write *json_data* to *file_name* as pretty-printed JSON.

    Args:
        file_name: Path of the file to create or overwrite.
        json_data: Any JSON-serializable object.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If *json_data* contains a non-serializable value.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding must be pinned; the platform default may be unable to
    # encode them or may produce a file other tools cannot read.
    with open(file_name, "w", encoding="utf-8") as fw:
        json.dump(json_data, fw, ensure_ascii=False, indent=4)
# Hyogo Prefecture page; the figures are read from the JavaScript data
# (`var dataset`) that the page embeds in a <script> element.
url = "https://web.pref.hyogo.lg.jp/kf16/singatakoronakensa.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

# Without a timeout, requests waits indefinitely on an unresponsive server.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# `string=` is the current name of Beautiful Soup's deprecated `text=` filter.
tag = soup.find("script", string=re.compile("var dataset"))
if tag is None:
    raise ValueError("no <script> element containing 'var dataset' was found")

# The update note is taken from the first <center> element after the script.
center = tag.find_next("center")
if center is None:
    raise ValueError("no <center> element found after the dataset script")
t = center.get_text(strip=True)

# Expected shape: "※<month>/<day> <hour>時..."
mt = re.match(r"※(\d+)/(\d+)\s(\d+)時", t)
if mt is None:
    raise ValueError("unexpected update note format: {!r}".format(t))
m, d, h = map(int, mt.groups())
if h > 24:
    raise ValueError("hour out of range in update note: {!r}".format(t))

# NOTE(review): neither the note nor the labels carry a year, so 2020 is
# hard-coded here and in the "date" column below; confirm before reusing
# this for data from another year.
# Adding the hour as a timedelta gives the same result as passing it to the
# constructor for 0-23, and additionally accepts "24時" (rolled over to
# 00:00 of the following day) instead of raising.
last_update = datetime.datetime(2020, m, d) + datetime.timedelta(hours=h)

s = tag.get_text(strip=True)

# Capture everything between "var dataset=" and the first ";" as JSON.
ms = re.match(r"var dataset=(.*?);", s, re.DOTALL)
if ms is None:
    # Fail here with a clear message rather than with a NameError on
    # `dataset` further down.
    raise ValueError("could not extract 'var dataset=...;' from the script")
dataset = json.loads(ms.group(1))

# Columns: label (month/day), specimens tested, cumulative tested,
# confirmed positives, cumulative positives.
df = pd.DataFrame(dataset, columns=["labels", "検査検体数", "検査累計", "陽性確認", "陽性累計"])

# Labels are prefixed with the year and parsed as "%Y/%m/%d".
df["date"] = pd.to_datetime("2020/" + df["labels"], format="%Y/%m/%d")
"""
# inspections_summary
+ 日付(2/01)
+ 検査検体数
+ 陽性人数
"""
# Keep only the date label, the number of specimens tested and the number
# of confirmed positives.
df1 = df[["labels", "検査検体数", "陽性確認"]].copy()

# orient="list" yields one list of values per column.
inspections_summary = dict(
    data=df1.to_dict(orient="list"),
    last_update=last_update.strftime("%Y/%m/%d %H:%M"),
)

dumps_json("inspections_summary.json", inspections_summary)
"""
# patients_summary
+ 日付(2020-02-01T00:00:00+09:00)
+ 陽性人数
"""
# Timestamp string per row; the "+09:00" offset is literal text in the format.
df["日付"] = df["date"].dt.strftime("%Y-%m-%dT%H:%M:%S+09:00")

# Select the timestamp and the confirmed-positive count, publishing the
# count under the key "小計".
df2 = df[["日付", "陽性確認"]].rename(columns={"陽性確認": "小計"})

# orient="records" yields one dict per row.
patients_summary = dict(
    data=df2.to_dict(orient="records"),
    last_update=last_update.strftime("%Y/%m/%d %H:%M"),
)

dumps_json("patients_summary.json", patients_summary)
"""
# inspections
+ 日付(31/01/2020)
+ 検査検体数
+ 陽性人数
"""
# Date string per row, formatted day-first as DD/MM/YYYY.
df["判明日"] = df["date"].dt.strftime("%d/%m/%Y")

# Keep the formatted date, the number of specimens tested and the number of
# confirmed positives.
df3 = df[["判明日", "検査検体数", "陽性確認"]].copy()

# orient="records" yields one dict per row.
inspections = dict(
    data=df3.to_dict(orient="records"),
    last_update=last_update.strftime("%Y/%m/%d %H:%M"),
)

dumps_json("inspections.json", inspections)
続きを読む