OCR Spaceで愛知県の検査陽性者の状況をスクレイピング

愛知県公式サイトに画像として掲載されている「検査陽性者の状況」を、OCR SpaceのFree OCR APIで文字認識し、人数・更新日時・備考を取り出す

ocr.space

APIキーは環境変数 OCR_SPACE_API に設定しておく

import datetime
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# API key for OCR.space, read from the OCR_SPACE_API environment variable.
# A missing variable raises KeyError here, before any request is made.
ocr_api_key = os.environ["OCR_SPACE_API"]


def ocr_space_url(
    url, overlay=False, api_key=ocr_api_key, language="eng", timeout=60
):
    """Ask the OCR.space parse endpoint to run OCR on an image given by URL.

    Args:
        url: Address of the image to be recognised; sent as ``url``.
        overlay: Value sent as ``isOverlayRequired``.
        api_key: Value sent as ``apikey``. Defaults to the value
            ``ocr_api_key`` had when this function was defined.
        language: Value sent as ``language``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    payload = {
        "url": url,
        "isOverlayRequired": overlay,
        "apikey": api_key,
        "language": language,
    }
    # requests never times out on its own; without this a stalled server
    # would block the script forever.
    r = requests.post(
        "https://api.ocr.space/parse/image", data=payload, timeout=timeout
    )
    # Fail here on an HTTP error instead of handing an error body to the
    # caller as if it were an OCR result.
    r.raise_for_status()
    return r.json()


# Page that is fetched and searched for the status image.
url = "https://www.pref.aichi.jp/site/covid19-aichi/"

# Browser-like User-Agent sent with the page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

# requests never times out on its own; bound the wait so a stalled server
# cannot hang the script.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")

# The image is located by its alt text, which must end with "検査陽性者".
img_tag = soup.find("img", alt=re.compile("検査陽性者$"))
if img_tag is None:
    # find() returns None when nothing matches; report that explicitly rather
    # than failing with an AttributeError on None.
    raise RuntimeError(
        f"No <img> whose alt text ends with '検査陽性者' was found at {url}"
    )

src = img_tag.get("src")
# src may be relative, so resolve it against the page URL.
img_url = urljoin(url, src)

# Run OCR on the status image, requesting the "jpn" language.
res = ocr_space_url(url=img_url, language="jpn")

# Recognised text of the first parsed result in the response.
text = res["ParsedResults"][0]["ParsedText"]

# Collect every figure written as "<digits>人", dropping thousands separators.
# The pattern requires a leading digit so that OCR noise such as a lone ",人"
# is skipped instead of reaching int() as an empty string (ValueError).
data = [
    int(token.rstrip("人").replace(",", ""))
    for token in re.findall(r"[0-9][0-9,]*人", text)
]
print(data)

# Slide a 12-value window over the OCR'd figures until one passes both sum
# checks. Leading values that do not start such a window are removed from
# `data` in place. `result` stays empty when no window qualifies.
# NOTE(review): presumably the checks are "subtotal equals its breakdown" and
# "total equals subtotal plus the remaining categories" -- confirm against the
# published table.
result = []

while len(data) >= 12:
    window = data[:12]

    # Check 1: value 2 is the sum of values 3-5.
    subtotal_matches = window[2] == sum(window[3:6])

    # Check 2: value 1 is value 2 plus values 6-11.
    if subtotal_matches and window[1] == window[2] + sum(window[6:]):
        result = window
        break

    # Not aligned yet: drop the first value and try the next offset.
    del data[0]

result

# Date and time of the update

# Raw string: \d must reach the regex engine as-is. In a normal string literal
# it is an invalid escape sequence, which Python warns about.
m_date = re.search(r"(\d{4})年(\d{1,2})月(\d{1,2})日(\d{1,2})時", text)

if m_date:
    year, month, day, hour = map(int, m_date.groups())
    dt_update = datetime.datetime(year, month, day, hour)
else:
    # No "YYYY年M月D日H時" stamp in the OCR text: use the current local time.
    dt_update = datetime.datetime.now()

dt_update

# Remarks

# Grab everything from a line starting with "※1" through the last
# "検査を行ったものについて". DOTALL lets ".+" span line breaks.
m = re.search("^※1.+検査を行ったものについて", text, re.DOTALL | re.MULTILINE)

if m:
    remark = m.group(0)
    # Strip the footnote markers.
    for marker in ("※1", "※2", "(注)"):
        remark = remark.replace(marker, "")
    # The match ends at "検査を行ったものについて", so complete the sentence.
    remark = remark.replace("検査を行ったものについて", "検査を行ったものについて掲載。")
    # Collapse the OCR line breaks into a single line.
    remark = "".join(remark.splitlines())
else:
    # The OCR text has no recognisable remark section; use an empty remark
    # rather than failing on m.group() with m being None.
    remark = ""

remark