OCR SpaceのFree OCR APIを使って愛知県の検査陽性者の状況をスクレイピングする
import datetime
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# OCR.space API key, read from the environment (KeyError if it is unset).
ocr_api_key = os.environ["OCR_SPACE_API"]


def ocr_space_url(url, overlay=False, api_key=ocr_api_key, language="eng"):
    """Submit an image URL to the OCR.space parse endpoint.

    Args:
        url: URL of the image; sent as the ``url`` form field.
        overlay: Sent as the ``isOverlayRequired`` form field.
        api_key: Sent as the ``apikey`` form field; defaults to the
            module-level ``ocr_api_key``.
        language: Sent as the ``language`` form field.

    Returns:
        The response body decoded from JSON.
    """
    payload = {
        "url": url,
        "isOverlayRequired": overlay,
        "apikey": api_key,
        "language": language,
    }
    r = requests.post("https://api.ocr.space/parse/image", data=payload)
    return r.json()


# Fetch the prefecture's COVID-19 page and locate the summary image by its
# alt text.
url = "https://www.pref.aichi.jp/site/covid19-aichi/"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

r = requests.get(url, headers=headers)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

src = soup.find("img", alt=re.compile("検査陽性者$")).get("src")
img_url = urljoin(url, src)

# Run OCR on the image and take the text of the first parsed result.
res = ocr_space_url(url=img_url, language="jpn")
text = res["ParsedResults"][0]["ParsedText"]

# Every "<digits and commas>人" token in the OCR text, converted to int.
data = [int(i.rstrip("人").replace(",", "")) for i in re.findall("[0-9,]+人", text)]
print(data)

# Slide a 12-value window over the numbers until two sum checks hold:
#   data[2] == data[3] + data[4] + data[5]
#   data[1] == data[2] + data[6] + ... + data[11]
# Leading values are dropped from ``data`` itself while searching, so after
# the loop ``data`` starts at the accepted window (or holds fewer than 12
# values when no window passed, in which case ``result`` stays empty).
result = []
while len(data) >= 12:
    if data[2] == sum(data[3:6]) and data[1] == data[2] + sum(data[6:12]):
        result = data[:12]
        break
    data.pop(0)

# Bare expression: has no effect when the file is run as a script.
result

# Update date/time
m_date = re.search(r"(\d{4})年(\d{1,2})月(\d{1,2})日(\d{1,2})時", text)

dt_update = None
if m_date:
    year, month, day, hour = map(int, m_date.groups())
    try:
        dt_update = datetime.datetime(year, month, day, hour)
    except ValueError:
        # The matched digits do not form a valid date/time (e.g. an OCR
        # misread); use the same fallback as when no date is found.
        dt_update = None

if dt_update is None:
    dt_update = datetime.datetime.now()

dt_update

# Remarks
m = re.search("^※1.+検査を行ったものについて", text, re.DOTALL | re.MULTILINE)

if m:
    remark_text = m.group(0)
    # Strip the footnote markers, in the same order as before.
    for marker in ("※1", "※2", "(注)"):
        remark_text = remark_text.replace(marker, "")
    remark_text = remark_text.replace(
        "検査を行ったものについて", "検査を行ったものについて掲載。"
    )
    # Join the OCR'd lines into a single line.
    remark = "".join(remark_text.splitlines())
else:
    # The expected footnote was not found in the OCR text; leave the
    # remark empty instead of failing on a None match.
    remark = ""

remark