# Requirements (run in a shell before executing this script):
#   pip install jaconv
#   pip install pdfminer.six
import re
from typing import Dict, List, Union
from urllib.parse import urljoin
import jaconv
import requests
from bs4 import BeautifulSoup
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
# Translation table folding full-width digits (U+FF10..U+FF19) to ASCII 0-9.
_FULLWIDTH_DIGITS = str.maketrans("０１２３４５６７８９", "0123456789")


def get_numbers_in_text(text: str) -> List[int]:
    """Return every non-negative integer found in *text*, in order.

    Full-width digits are folded to ASCII before scanning, so a mixed run
    such as "１2３" is read as the single number 123. Only the digits are
    converted; all other characters are left untouched because they cannot
    affect which digit runs are found.

    Args:
        text: Arbitrary text, possibly containing full-width digits.

    Returns:
        The integers formed by each maximal run of digits; an empty list
        when the text contains no digits.
    """
    normalized = text.translate(_FULLWIDTH_DIGITS)
    return [int(run) for run in re.findall(r"[0-9]+", normalized)]
# Page that is scraped for the link to the PDF report.
url = "https://web.pref.hyogo.lg.jp/kk03/200129.html"

# Desktop-browser (IE 11) User-Agent string sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

# A timeout prevents the script from hanging forever on a stalled connection.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# Find the PDF anchor by its CSS class and a partial match on its link text.
# `string=` is the current name of the argument formerly called `text=`.
tag = soup.find("a", class_="icon_pdf", string=re.compile("医療提供体制の確保について"))
if tag is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError(f"PDF link not found on {url}")

href = tag.get("href")
if not href:
    raise RuntimeError(f"PDF link on {url} has no href attribute")

# Resolve a possibly relative href against the page URL.
link = urljoin(url, href)
# Download the PDF that `link` points to. Previously `link` was computed but
# never used, and "corona.pdf" was opened without ever being created.
pdf_response = requests.get(link, headers=headers, timeout=30)
pdf_response.raise_for_status()
with open("corona.pdf", "wb") as fp:
    fp.write(pdf_response.content)

resourceManager = PDFResourceManager()
device = PDFPageAggregator(resourceManager, laparams=LAParams())

# Horizontal text boxes collected from the analysed page layout.
boxes = []

with open("corona.pdf", "rb") as fp:
    interpreter = PDFPageInterpreter(resourceManager, device)
    # maxpages=1: only the first page of the document is processed.
    for page in PDFPage.get_pages(fp, maxpages=1):
        interpreter.process_page(page)
        layout = device.get_result()
        boxes.extend(
            item for item in layout if isinstance(item, LTTextBoxHorizontal)
        )

# Order boxes by descending top edge (y1), then ascending left edge (x0);
# presumably this yields top-to-bottom, left-to-right reading order.
boxes.sort(key=lambda b: (-b.y1, b.x0))

text = "\n".join(box.get_text().strip() for box in boxes)
print(text)
# Capture the text that lies between the bed-count heading and the heading of
# the following section; re.DOTALL lets "." span multiple lines.
m = re.search("【圏域別受入可能病床数】\n(.+)\n3 今後の状況の進展に応じて必要となる取組等", text, re.DOTALL)
if m is None:
    # The expected headings were not found in the extracted text.
    raise RuntimeError("Bed-count section not found in the extracted PDF text")

# Use group(1), the captured body. group(0) also contains the closing heading,
# whose leading "3" would be parsed as a spurious trailing data value.
data = get_numbers_in_text(m.group(1))
print(data)