!apt install python3-tk ghostscript !pip install camelot-py[cv] !pip install jaconv
"""Scrape Nagano Prefecture's COVID-19 status page, download the linked
"グラフPDFデータ" PDF, and extract summary counts into the ``data`` dict."""

import datetime
import pathlib
import re
from urllib.parse import urljoin

import camelot
import jaconv
import requests
from bs4 import BeautifulSoup


def fetch_file(url, dir="."):
    """Download *url* into directory *dir* and return the local Path.

    Raises requests.HTTPError on a non-2xx response. (Parameter name ``dir``
    kept for backward compatibility with keyword callers, although it shadows
    the builtin.)
    """
    r = requests.get(url)
    r.raise_for_status()
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)
    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p


url = "https://www.pref.nagano.lg.jp/hoken-shippei/kenko/kenko/kansensho/joho/corona-doko.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

r = requests.get(url, headers=headers)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# Locate the PDF link. Escape and anchor the extension: the original
# pattern ".pdf" treated "." as "any character" and was unanchored, so
# unrelated hrefs could match.
tag = soup.find("a", text=re.compile("^グラフPDFデータ"), href=re.compile(r"\.pdf$"))
link = urljoin(url, tag.get("href"))

p = fetch_file(link)

# First table of page 1, lattice-less ("stream") extraction.
df = camelot.read_pdf(str(p), pages="1", flavor="stream")[0].df
df

# Drop footnote rows (those whose first two cells start with "・").
df1 = df[~(df[0] + df[1]).str.startswith("・")]

# Flatten every column from row 2 on into one whitespace-free, half-width
# normalized string so the label/number pairs can be found by regex.
temp = []
for _, item in df1.iloc[2:].items():  # .iteritems() was removed in pandas 2.0
    s = "".join(item.str.cat(sep="").split())
    temp.append(jaconv.z2h(s))
text = "".join(temp)

data = {}
# BUG FIX: the original wrote 陽性者数(累積), making (累積) a *capturing*
# group. That shifted the count into group(3) — group(2) was None for every
# other label (AttributeError on .replace) or the string "累積" (ValueError
# in int) — and the branch could never match the literal "(累積)" in the
# text. Escaping the parentheses keeps the count in group(2).
for i in re.finditer(
    r"(検査実施人数|陰性|陽性者数\(累積\)|入院中|重症|退院等|死亡)([0-9,]+)人", text
):
    data[i.group(1)] = int(i.group(2).replace(",", ""))

# Asymptomatic carriers are reported as "うちN名".
m = re.search(r"うち([0-9,]+)名", text)
if m:
    data["無症状病原体保有者"] = int(m.group(1).replace(",", ""))

# Row 1 holds the "M月D日 H時現在" timestamp; normalize digits to half-width
# (kana left untouched) before parsing.
txt = jaconv.z2h(df.iloc[1].str.cat(sep=""), kana=False, digit=True, ascii=True)

m_up = re.search(r"(\d{1,2})月(\d{1,2})日 *(\d{1,2})時現在", txt)
if m_up:
    # Year is not printed in the PDF; hard-coded 2020 as in the original.
    month, day, hour = map(int, m_up.groups())
    dt_now = datetime.datetime(2020, month, day, hour)
else:
    dt_now = datetime.datetime.now()

data["更新日時"] = dt_now.isoformat()

data