# Scrape the Toyama Prefecture COVID-19 indicator page, download the linked Excel
# workbook, and reshape the indicator table into a time-indexed DataFrame.
import pathlib
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}


def fetch_soup(url, parser="html.parser"):
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    return BeautifulSoup(r.content, parser)


def fetch_file(url, dir="."):
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)
    r = requests.get(url)
    r.raise_for_status()
    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p


url = "https://www.pref.toyama.jp/120507/kurashi/kenkou/kenkou/covid-19/kj00022038.html"
soup = fetch_soup(url)

# Link whose text starts with the indicator-table title
href = soup.find("a", string=re.compile("^強化・緩和の判断指標")).get("href")
link = urljoin(url, href)
p = fetch_file(link)

# Rows 3-9, columns D onward hold the indicators; transpose so each date becomes a row
df0 = (
    pd.read_excel(p, index_col=None, header=None)
    .iloc[2:9, 3:]
    .T.reset_index(drop=True)
    .set_axis(
        ["date", "入院者数", "重症病床稼働率", "新規陽性者数", "感染経路不明者数", "陽性率", "先週対比"],
        axis=1,
    )
)

# Dates appear as "M/D" strings with no year; count Jan 1 rollovers starting from 2020
df_date = df0["date"].str.extract(r"(\d{1,2}/\d{1,2})").rename({0: "date"}, axis=1)
df_date["year"] = 2020 + (df_date["date"] == "1/1").cumsum()
df_date[["month", "day"]] = (
    df_date["date"].str.extract(r"(\d{1,2})/(\d{1,2})", expand=True).astype(int)
)
df_date.drop("date", axis=1, inplace=True)

# Assemble datetimes from the year/month/day columns
df0["date"] = pd.to_datetime(df_date, errors="coerce")
df0.set_index("date", inplace=True)

# "-" marks missing values; mask them before the numeric conversion
df0.mask(df0 == "-", inplace=True)
df1 = df0.astype(float)
df1["日付"] = df1.index.strftime("%Y-%m-%d")

# Latest day's indicators as a dict
df1.iloc[-1].to_dict()
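# Aside: a minimal sketch of the date-assembly trick above, with made-up values.
# pd.to_datetime accepts a DataFrame whose columns are named "year", "month" and
# "day" and builds one datetime per row, which is why df_date is passed in whole.
import pandas as pd

parts = pd.DataFrame({"year": [2020, 2021], "month": [12, 1], "day": [31, 1]})
print(pd.to_datetime(parts))  # 2020-12-31 and 2021-01-01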
# Scrape the daily summary page and pull the named counts out of the body text.
import re
import unicodedata

import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}


def fetch_soup(url, parser="html.parser"):
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    return BeautifulSoup(r.content, parser)


url = "https://www.pref.toyama.jp/120507/kurashi/kenkou/kenkou/covid-19/kj00021798.html"
soup = fetch_soup(url)
summary = soup.find("div", id="tmp_contents").get_text(strip=True)

# NFKC normalization folds full-width digits and spaces to ASCII so the regex matches
text = unicodedata.normalize("NFKC", summary)
print(text)

# Capture "<label> <count>人" pairs for each indicator
data = {}
for i in re.finditer(
    r"(新規感染者数|累計|入院者数|重症者|宿泊療養施設入所者数|自宅療養者数|死亡者数|退院及び療養解除者数|アルファ株|デルタ株|オミクロン株) *?([0-9,]+)人?",
    text,
):
    data[i.group(1)] = int(i.group(2).replace(",", ""))
data
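# Aside: why the NFKC step matters, shown on a made-up string. The page often uses
# full-width digits, which the ASCII class [0-9] in the pattern above would never
# match; NFKC folds them (and full-width spaces) to their ASCII equivalents.
import unicodedata

print(unicodedata.normalize("NFKC", "新規感染者数　１２３人"))  # -> "新規感染者数 123人"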
# Scrape the case-list page, download the linked Excel file, and clean the
# per-case records (dates, gender, age group, residence) before aggregating.
import pathlib
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}


def fetch_soup(url, parser="html.parser"):
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    return BeautifulSoup(r.content, parser)


def fetch_file(url, dir="."):
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)
    r = requests.get(url)
    r.raise_for_status()
    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p


url = "https://www.pref.toyama.jp/120507/kurashi/kenkou/kenkou/covid-19/kj00021798.html"
soup = fetch_soup(url)

# Link to the case-list Excel file
href = soup.find(
    "a",
    string=re.compile("^富山県内における新型コロナウイルス感染症の発生状況一覧"),
    href=re.compile(r"\.xlsx?$"),
).get("href")
link = urljoin(url, href)
p = fetch_file(link)

df0 = pd.read_excel(p, skiprows=2)
df0.rename(columns={"県番号": "No", "検査結果判明日": "日付"}, inplace=True)

# Drop withdrawn case reports
df1 = df0[df0["市番号"] != "患者発生届取り下げ"].copy()

# The date column mixes Excel serial numbers and date strings; convert each separately
flg_is_serial = df1["日付"].astype("str").str.isdigit()
fromSerial = pd.to_datetime(
    df1.loc[flg_is_serial, "日付"].astype(float),
    unit="D",
    origin=pd.Timestamp("1899/12/30"),
)
fromString = pd.to_datetime(df1.loc[~flg_is_serial, "日付"])
df1["日付"] = pd.concat([fromString, fromSerial])
df1  # inspect intermediate result (notebook-style)

# Normalize gender labels
df1["性別"] = df1["性別"].replace({"男": "男性", "女": "女性"})
df1["性別"].unique()

# Age groups: strip whitespace and merge variant labels
df1["年代"] = df1["年代"].str.replace(r"\s", "", regex=True)
df1["年代"] = df1["年代"].replace(["90代", "90代以上"], "90歳以上")
df1["年代"] = df1["年代"].replace(["1歳未満", "10歳未満代"], "10歳未満")
df1["年代"].unique()

# Residence: strip whitespace and fix the 冨山市/富山市 misspelling
df1["居住地"] = df1["居住地"].str.replace(r"\s", "", regex=True)
df1["居住地"] = df1["居住地"].replace("冨山市", "富山市")
df1["居住地"].unique()
df1

# Counts per residence as a list of {label, count} records
data = (
    df1["居住地"]
    .value_counts()
    .reset_index()
    .rename(columns={"index": "label", "居住地": "count"})
    .to_dict(orient="records")
)
data
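# Aside: a quick sanity check of the serial-date conversion above, with an
# illustrative value. Excel stores dates as day counts, and the conventional
# origin 1899-12-30 absorbs Excel's 1900 leap-year quirk; serial 43831 maps
# to 2020-01-01.
import pandas as pd

print(pd.to_datetime(43831.0, unit="D", origin=pd.Timestamp("1899/12/30")))
# -> 2020-01-01 00:00:00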