# 最新情報から5件分取得
# 記事内容を確認するため、抽出前の情報も表示しています
"""Scrape the five most recent COVID-19 case press releases from MHLW.

Fetches the MHLW listing page, follows the first five announcement links,
and for each one prints a cleaned summary plus per-prefecture counts of
positive patients and asymptomatic carriers. The pre-extraction text is
also printed so the output can be checked against the article content.
"""

import re
import time  # fixed: original had "impot time", a SyntaxError

import requests
from bs4 import BeautifulSoup

# Desktop IE11 user agent; some government pages serve different markup to
# clients without a browser-like UA.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}


def get_links():
    """Return hrefs of the domestic case-report announcements.

    Scans the listing page for the "国内の患者発生" (domestic cases) section
    heading and collects every link in the following grid whose text starts
    with the standard announcement title.

    Returns:
        list[str]: announcement URLs, newest first (page order).

    Raises:
        requests.HTTPError: if the listing page request fails.
    """
    url = "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/0000121431_00086.html"
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html5lib")
    result = [
        i.get("href")
        for i in soup.find("h2", string="国内の患者発生")
        .find_next("div", class_="m-grid")
        .find_all("a", string=re.compile(r"^新型コロナウイルスに関連した患者の発生について"))
    ]
    return result


links = get_links()
print(links)

for link in links[:5]:
    r = requests.get(link, headers=headers)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "html5lib")

    # Publication date of the press release.
    p_day = soup.find("p", class_="m-boxInfo__date").get_text()
    # Full body text of the article.
    t = soup.select_one("div.m-grid > div.m-grid__col1").get_text()

    # Extract the summary block running from "概要:" up to "以下".
    m = re.search("概要:(.+)以下", t, re.DOTALL)
    if m is None:
        # Robustness fix: the original crashed with AttributeError on pages
        # without the expected summary block; skip such pages instead.
        print("summary not found, skipping:", link)
        print("=" * 75)
        time.sleep(3)
        continue

    # Normalize the summary: drop ideographic spaces, protect the one comma
    # that is part of a label (so the later splits on "、" stay clean), then
    # collapse newline+indent runs into enumeration commas.
    text = re.sub(
        r"\n\s+",
        "、",
        m.group().replace("　", "").strip().replace("(高齢者、死亡例)", "(高齢者 死亡例)"),
    )
    print(text)
    print("-" * 75)
    print(p_day)

    print("\n【陽性患者】")
    # Lines like "〇〇県:…患者n例(内訳)" — capture the prefecture and the
    # parenthesized breakdown, then split the breakdown on "、".
    for i in re.findall(r"^(.+?):.*?患者.+?例((.+?))", text, re.MULTILINE):
        for j in i[1].split("、"):
            parts = j.split(":")  # renamed from "l" (PEP 8 E741: ambiguous name)
            print(i[0], parts[0], parts[1])

    print("\n【無症状病原体保有者】")
    # Asymptomatic carriers: "〇〇県:…無症状病原体保有者n名(内訳)".
    for i in re.findall(r"(.+?):.*?無症状病原体保有者.+?名((.+?))", text, re.MULTILINE):
        for j in i[1].split("、"):
            if j:  # skip empty fragments left by the split
                print(i[0], j)

    print("=" * 75)
    time.sleep(3)  # be polite to the server between article fetches