PDFをテキスト化して抽出する方が簡単だったorz
import pathlib
import re

import pdfbox
import requests

# Scrape figures from a two-page PDF: download it, let PDFBox dump each page
# to a text file, then pull the numbers out of the text with regular
# expressions.


def _to_int(text):
    """Convert a digit string that may contain thousands separators to int."""
    return int(text.replace(",", ""))


def _require_match(match, label):
    """Return *match* unchanged, or fail loudly when the pattern was not found.

    Args:
        match: Result of ``re.search`` (a match object or ``None``).
        label: Human-readable name of the field being looked up, used in the
            error message.

    Raises:
        ValueError: If *match* is ``None``, i.e. the extracted text no longer
            contains the expected pattern.
    """
    if match is None:
        raise ValueError(f"could not find {label} in the extracted PDF text")
    return match


url = "https://www.pref.ehime.jp/h25500/kansen/documents/kennai_link.pdf"

# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url, timeout=30)
r.raise_for_status()

# Save the PDF to disk; PDFBox reads its input from a file path.
p = pathlib.Path("data.pdf")

with p.open(mode="wb") as fw:
    fw.write(r.content)

pdf = pdfbox.PDFBox()

# Dump page 1 to data1.txt and page 2 to data2.txt.
for i in range(1, 3):
    pdf.extract_text("data.pdf", f"data{i}.txt", sort=True, start_page=i, end_page=i)

# PDFBox's text extraction writes UTF-8 by default, so decode explicitly
# instead of relying on the platform's locale encoding.
with open("data1.txt", encoding="utf-8") as fr:
    txt1 = fr.read()

with open("data2.txt", encoding="utf-8") as fr:
    txt2 = fr.read()

print(txt1)
print(txt2)

# Page 1: the "合 計" (total) row carries three numeric columns.
m1 = _require_match(
    re.search(r"^合 計 ([0-9,]+) ([0-9,]+) ([0-9,]+)$", txt1, re.M),
    "the 合 計 (total) row",
)
検査 = [_to_int(i) for i in m1.groups()]

# Page 1: "治療中" (under treatment) count. The character class allows commas
# so values of 1,000 or more still match before the separators are stripped.
m2 = _require_match(re.search(r"治療中:([0-9,]+)人", txt1), "the 治療中 count")
治療中 = _to_int(m2.group(1))

# Page 1: "退院等" (discharged etc.) count, parsed the same way.
m3 = _require_match(re.search(r"退院等:([0-9,]+)人", txt1), "the 退院等 count")
退院等 = _to_int(m3.group(1))

# Page 2: every "<number>人" figure, in order of appearance.
状況 = [_to_int(i) for i in re.findall(r"([0-9,]+)人", txt2)]

print(検査)
print(治療中)
print(退院等)
print(状況)