# data.jsonまで作成 (build everything up to and including data.json)
"""Scrape the Yamanashi prefecture COVID-19 case list and write it to data/data.json.

The script downloads the prefectural information page, collects the text that
follows the case-list heading, splits it into one text chunk per <h4> case
heading, extracts the per-patient fields with regular expressions and dumps
the resulting records (in reverse page order) as JSON.
"""
import copy
import datetime
import json
import pathlib
import re

import jaconv
import requests
from bs4 import BeautifulSoup

SOURCE_URL = (
    "https://www.pref.yamanashi.jp/koucho/coronavirus/info_coronavirus_prevention.html"
)
# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30

JST = datetime.timezone(datetime.timedelta(hours=+9))
DATA_DIR = "data"

# Case heading at the start of a chunk. Any number of digits is accepted so
# that cases numbered 1000 and above are not silently skipped.
CASE_HEADING_RE = re.compile(r"県内\d+例目")
# "label:value" lines inside a case chunk (colon is half-width after z2h).
CASE_FIELD_RE = re.compile(r"(発生判明日|年代|性別|居住地):(.+)$", re.MULTILINE)
# Reiwa-era date; the first year may be written as 元 instead of 1.
WAREKI_RE = re.compile(r"令和(元|\d{1,2})年(\d{1,2})月(\d{1,2})日")


def get_title(tag):
    """Return True when *tag* is the <h2> that heads the in-prefecture case list."""
    return (
        tag.name == "h2"
        and tag.get_text(strip=True) == "新型コロナウイルス感染症の県内における発生状況"
    )


def wareki2date(s):
    """Convert a Reiwa-era Japanese date string to a JST-aware datetime.

    Args:
        s: Text starting with a date such as "令和2年3月6日" (half-width digits).

    Returns:
        ``datetime.datetime`` at midnight of that day with ``tzinfo=JST``.

    Raises:
        ValueError: If *s* does not start with a Reiwa-era date, or the
            month/day values do not form a valid calendar date.
    """
    m = WAREKI_RE.match(s)
    if m is None:
        raise ValueError(f"not a Reiwa-era date: {s!r}")
    era_year, month, day = m.groups()
    # Reiwa 1 corresponds to 2019, hence the 2018 offset.
    year = 2018 + (1 if era_year == "元" else int(era_year))
    return datetime.datetime(year, int(month), int(day), tzinfo=JST)


def _normalize(parts):
    """Join collected tag texts (one per line) and convert full-width digits/ASCII to half-width."""
    text = "".join(part + "\n" for part in parts).rstrip()
    return jaconv.z2h(text, kana=False, digit=True, ascii=True)


r = requests.get(SOURCE_URL, timeout=REQUEST_TIMEOUT)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

h2 = soup.find(get_title)
if h2 is None:
    raise RuntimeError(f"case-list heading not found on {SOURCE_URL}")

data = []
parts = []

# Walk the following same-level tags; every <h4> starts a new chunk and the
# next <h2> ends the section. The <h4> text itself becomes the first line of
# the chunk it opens.
for tag in h2.find_next_siblings():
    if tag.name == "h4":
        data.append(_normalize(parts))
        parts = []
    elif tag.name == "h2":
        data.append(_normalize(parts))
        break
    parts.append(tag.get_text(strip=True))
else:
    # No closing <h2> followed the section: flush the last chunk instead of
    # dropping it.
    data.append(_normalize(parts))

result = []

# data[0] is the text between the section heading and the first <h4>.
for d in data[1:]:
    m = CASE_HEADING_RE.match(d)
    if not m:
        continue
    temp = {"No": m.group(0)}
    for field in CASE_FIELD_RE.finditer(d):
        temp[field.group(1)] = field.group(2)
        # A residence line completes one patient record; the fields gathered
        # so far (which must already include 発生判明日) are snapshotted so a
        # chunk may yield several records.
        if field.group(1) == "居住地":
            t = copy.deepcopy(temp)
            t["リリース日"] = wareki2date(t["発生判明日"]).isoformat()
            del t["発生判明日"]
            t["退院"] = None
            result.append(t)

p = pathlib.Path(DATA_DIR, "data.json")
p.parent.mkdir(parents=True, exist_ok=True)

# Records are written in reverse of the order they appear on the page.
with p.open(mode="w", encoding="utf-8") as fw:
    json.dump(result[::-1], fw, ensure_ascii=False, indent=4)