愛媛県内の状況のPDFから検査陽性者の状況をスクレイピング

!apt install python3-tk ghostscript
!pip install camelot-py[cv]

!pip install jaconv
import datetime
import re
import pathlib

import camelot
import jaconv
import requests

def fetch_file(url, dir=".", timeout=30):
    """Download *url* and save it under *dir*, returning the saved path.

    Args:
        url: Address of the file to download.
        dir: Directory to save into; created (with parents) if missing.
            The name shadows the builtin but is kept so existing keyword
            callers keep working.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block forever on a stalled server.

    Returns:
        ``pathlib.Path`` of the written file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    # Local import so this cell stays self-contained in the notebook.
    from urllib.parse import urlsplit

    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    # Take the file name from the URL path only, so a query string or
    # fragment ("file.pdf?v=2") does not leak into the saved name. Fall back
    # to the whole-URL basename when the URL has no path component.
    name = pathlib.PurePosixPath(urlsplit(url).path).name
    if not name:
        name = pathlib.PurePath(url).name

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p

def _infer_report_datetime(month, day, hour, now):
    """Attach a year to a year-less "M月D日 H時" timestamp.

    The most recent of (this year, last year) that yields a valid date not
    lying in the future is used, so a December report read in January is
    dated to the previous year.

    Raises:
        ValueError: If neither candidate year gives a valid, non-future date.
    """
    # One day of slack absorbs a timezone gap between this machine's clock
    # and the clock the report was stamped with.
    latest_plausible = now + datetime.timedelta(days=1)
    for year in (now.year, now.year - 1):
        try:
            candidate = datetime.datetime(year, month, day, hour)
        except ValueError:
            # e.g. Feb 29 in a non-leap year: try the other candidate year.
            continue
        if candidate <= latest_plausible:
            return candidate
    raise ValueError(
        f"cannot place {month}/{day} {hour}h in {now.year} or {now.year - 1}"
    )


# Download the prefecture's status PDF into the working directory.
p = fetch_file("https://www.pref.ehime.jp/h25500/kansen/documents/kennai_link.pdf")

# First table on page 2, transposed so that iterating rows walks the
# original table's columns.
df2 = camelot.read_pdf(str(p), pages="2", flavor="stream")[0].df.T

data = {}

for _, item in df2.iterrows():
    # Key: cells from index 2 up to (not including) the last one, joined
    # with all whitespace removed.
    k = "".join(item.iloc[2:-1].str.cat(sep="").split())
    # Value: last cell without the trailing "人" counter; thousands
    # separators (half- or full-width) are dropped so int() accepts them.
    v = item.iloc[-1].rstrip("人").replace(",", "").replace("，", "")
    data[k] = int(v)

# Concatenate column 1 and convert full-width digits/ASCII to half-width so
# the regex below can match. Presumably this column carries the
# "as of" timestamp line — verify against the PDF layout.
txt = jaconv.z2h(df2[1].str.cat(sep=""), kana=False, digit=True, ascii=True)

# Raw string: "\d" in a normal string literal is an invalid escape sequence.
m = re.search(r"(\d{1,2})月(\d{1,2})日 *(\d{1,2})時現在", txt)

if m:
    month, day, hour = map(int, m.groups())
    # The PDF text carries no year, so infer it from today's date instead of
    # hard-coding one.
    dt_now = _infer_report_datetime(month, day, hour, datetime.datetime.now())
else:
    # No timestamp found in the table: fall back to the time of scraping.
    dt_now = datetime.datetime.now()

data["更新日時"] = dt_now.isoformat()

data