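# 兵庫県の圏域別受入可能病床数を取得(pdfminer)
# (Fetch the number of available hospital beds per medical region in Hyogo Prefecture, using pdfminer)
#
# Requirements:
#   pip install jaconv
#   pip install pdfminer.six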
import re
from typing import Dict, List, Union
from urllib.parse import urljoin

import jaconv
import requests
from bs4 import BeautifulSoup
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage


def get_numbers_in_text(text: str) -> List[int]:
    """Return every run of digits found in ``text`` as a list of ints.

    The text is first passed through ``jaconv.z2h(..., digit=True)`` so that
    full-width digits are converted to half-width ones before the ASCII
    ``[0-9]+`` pattern is applied. Numbers are returned in order of
    appearance; an input without digits yields an empty list.
    """
    normalized = jaconv.z2h(text, digit=True)
    return [int(token) for token in re.findall('[0-9]+', normalized)]


# Scraping: find the PDF link on the prefecture page and download the PDF.
url = "https://web.pref.hyogo.lg.jp/kk03/200129.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

# A timeout keeps the script from hanging forever if the server stops responding.
r = requests.get(url, headers=headers, timeout=30)

r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# `string=` is the current name of Beautiful Soup's deprecated `text=` filter.
tag = soup.find("a", class_="icon_pdf", string=re.compile("医療提供体制の確保について"))

# Fail with a clear message instead of an AttributeError on None when the
# page layout changes and the expected anchor is no longer present.
if tag is None or not tag.get("href"):
    raise RuntimeError(f"PDF link not found on {url}")

link = urljoin(url, tag.get("href"))

# Save the PDF locally as corona.pdf. This replaces the notebook-only shell
# command `!wget $link -O corona.pdf`, which was commented out here and left
# `link` unused, so the PDF was never actually fetched.
pdf_response = requests.get(link, headers=headers, timeout=30)
pdf_response.raise_for_status()

with open("corona.pdf", "wb") as pdf_file:
    pdf_file.write(pdf_response.content)

# PDF processing: collect the horizontal text boxes of the first page.

resourceManager = PDFResourceManager()
device = PDFPageAggregator(resourceManager, laparams=LAParams())
interpreter = PDFPageInterpreter(resourceManager, device)

boxes = []

with open("corona.pdf", "rb") as pdf_file:
    # maxpages=1 limits parsing to a single page.
    for first_page in PDFPage.get_pages(pdf_file, maxpages=1):
        # Process the page so the aggregator device can build its layout.
        interpreter.process_page(first_page)

        # Get the resulting LTPage object and keep only horizontal text boxes.
        page_layout = device.get_result()
        boxes.extend(
            element
            for element in page_layout
            if isinstance(element, LTTextBoxHorizontal)
        )

# Order the boxes by descending y1 first, then by ascending x0.
boxes.sort(key=lambda box: (-box.y1, box.x0))

text = "\n".join(box.get_text().strip() for box in boxes)

print(text)


# Locate the text between the "【圏域別受入可能病床数】" heading and the heading of
# the following section; re.DOTALL lets `.+` span multiple lines.
m = re.search("【圏域別受入可能病床数】\n(.+)\n3  今後の状況の進展に応じて必要となる取組等", text, re.DOTALL)

# Fail with a clear message instead of an AttributeError on None when the
# expected headings are not present in the extracted PDF text.
if m is None:
    raise ValueError("bed-count section not found in the extracted PDF text")

# Use the captured body (group 1), not the whole match: group(0) also contains
# the following heading, whose leading "3" would be returned as a bogus number.
data = get_numbers_in_text(m.group(1))

print(data)