!apt install python3-tk ghostscript !pip install camelot-py[cv] !pip install jaconv
import datetime
import pathlib
import re
import urllib.parse

import camelot
import jaconv
import requests

# Matches "<M>月<D>日 <H>時現在" ("as of <H> o'clock on <M>/<D>") after the text
# has been normalised to half-width digits. Raw string so that ``\d`` is a
# regex escape rather than an (invalid) string escape.
_TIMESTAMP_RE = re.compile(r"(\d{1,2})月(\d{1,2})日 *(\d{1,2})時現在")


def fetch_file(url, dir=".", timeout=30):
    """Download *url* into directory *dir* and return the local path.

    Args:
        url: Address of the file to download.
        dir: Destination directory; created (with parents) if missing.
            The name shadows the ``dir`` builtin but is kept so existing
            keyword callers keep working.
        timeout: Seconds passed to ``requests.get``; without a timeout a
            stalled server would block the call indefinitely.

    Returns:
        ``pathlib.Path`` of the written file, named after the last component
        of the URL path.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    # Take the file name from the URL *path* only, so a query string or
    # fragment never ends up in the local file name.
    name = pathlib.PurePath(urllib.parse.urlsplit(url).path).name
    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p


def _parse_count(text):
    """Convert a count cell such as ``"12人"`` to an ``int``.

    Full-width digits/punctuation are normalised, surrounding whitespace and
    the trailing "人" (person counter) are dropped, and thousands separators
    are removed before conversion.
    """
    cleaned = jaconv.z2h(text, kana=False, digit=True, ascii=True)
    cleaned = cleaned.strip().rstrip("人").replace(",", "")
    return int(cleaned)


def _resolve_report_datetime(month, day, hour, now=None):
    """Attach a year to a month/day/hour stamp that carries none.

    Picks the most recent of (current year, previous year) in which the date
    is valid and not in the future relative to *now*.

    Args:
        month, day, hour: Integers parsed from the report text.
        now: Reference ``datetime``; defaults to ``datetime.datetime.now()``.

    Raises:
        ValueError: If the values form no valid, non-future timestamp in
            either candidate year.
    """
    if now is None:
        now = datetime.datetime.now()

    # One day of slack: ``now`` is naive local time, while the report is
    # presumably stamped in Japan time (NOTE(review): confirm), so a fresh
    # report may look a few hours "ahead" on a machine in another zone.
    latest_allowed = now + datetime.timedelta(days=1)

    for year in (now.year, now.year - 1):
        try:
            candidate = datetime.datetime(year, month, day, hour)
        except ValueError:
            # e.g. 29 Feb in a non-leap year; try the other candidate year.
            continue
        if candidate <= latest_allowed:
            return candidate

    raise ValueError(
        "cannot resolve report timestamp {}/{} {}:00 around year {}".format(
            month, day, hour, now.year
        )
    )


p = fetch_file("https://www.pref.ehime.jp/h25500/kansen/documents/kennai_link.pdf")

# First table found on page 2, transposed so that every row of ``df2``
# corresponds to one column of the extracted table.
df2 = camelot.read_pdf(str(p), pages="2", flavor="stream")[0].df.T

data = {}

for _, item in df2.iterrows():
    # Cells 2..-2 hold the (possibly multi-line) label; join them and remove
    # all whitespace. The last cell holds the count, e.g. "12人".
    k = "".join(item.iloc[2:-1].str.cat(sep="").split())
    data[k] = _parse_count(item.iloc[-1])

# Column 1 of the transposed frame is searched for the "as of" timestamp;
# normalise full-width digits/ASCII first so the regex can match.
txt = jaconv.z2h(df2[1].str.cat(sep=""), kana=False, digit=True, ascii=True)

m = _TIMESTAMP_RE.search(txt)

if m:
    month, day, hour = map(int, m.groups())
    # The text has no year, so infer it instead of hard-coding one.
    dt_now = _resolve_report_datetime(month, day, hour)
else:
    # No timestamp found in the document: fall back to the current time.
    dt_now = datetime.datetime.now()

data["更新日時"] = dt_now.isoformat()

# Bare expression so a notebook displays the resulting dict.
data