# Setup (run in a shell or notebook cell): pip install jaconv
# プログラム (program)
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
# Request headers passed to every requests.get() call in this script.
# The User-Agent is a browser-style string; presumably the site is expected
# to respond as it would to a desktop browser -- TODO confirm it is required.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
def get_link(timeout: float = 30) -> str:
    """Return the absolute URL of the report linked from the MHLW index page.

    Downloads the index page, looks inside ``<div class="l-contentMain">`` for
    the first text node that starts with
    "新型コロナウイルス感染症の現在の状況と厚生労働省の対応について",
    and resolves the ``href`` of the ``<a>`` element enclosing that text
    against the index page URL.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` would block indefinitely on a stalled
            connection.

    Returns:
        The absolute URL of the linked page.

    Raises:
        requests.HTTPError: If the index page responds with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
        AttributeError: If the container, the matching text, or its parent
            link is missing (a BeautifulSoup lookup returned ``None``).
    """
    url = "https://www.mhlw.go.jp/stf/seisakunitsuite/bunya/0000121431_00086.html"

    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    soup = BeautifulSoup(r.content, "html5lib")

    # Locate the link by its visible text rather than by position, then walk
    # up from the text node to the <a> element that carries the href.
    href = (
        soup.find("div", class_="l-contentMain")
        .find(string=re.compile(r"^新型コロナウイルス感染症の現在の状況と厚生労働省の対応について"))
        .find_parent("a")
    )

    # The href may be relative; resolve it against the index page URL.
    link = urljoin(url, href.get("href"))

    return link
import base64

from IPython.display import Image

# Resolve the report URL from the index page and download the report itself.
url = get_link()
print(url)

# A timeout keeps a stalled connection from hanging the script forever.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html5lib")

# Print the page's first <h1> text.
print(soup.find("h1").get_text(strip=True))

# Find the first <img> whose src is an inline PNG data URI
# ("data:image/png;base64,<payload>") rather than a link to a file.
img = soup.find("img", src=re.compile("^data:image/png;base64,"))

# Strip the data-URI prefix so only the base64 payload remains.
img_b64 = img.get("src").replace("data:image/png;base64,", "")

png = base64.b64decode(img_b64)

# Save the decoded image next to the script.
with open("corona.png", "wb") as fw:
    fw.write(png)

# Bare expression: builds an IPython Image for the saved file. Presumably
# this was the last line of a notebook cell, where it renders the image.
Image("./corona.png")
import pandas as pd
import jaconv
from io import StringIO

# Parse every <table> in the page into a DataFrame, using the first row as the
# header and the first column as the index. The HTML is wrapped in StringIO
# because passing a literal HTML string to read_html is deprecated in recent
# pandas releases; a file-like object is accepted by old and new versions.
dfs = pd.read_html(StringIO(soup.prettify()), header=0, index_col=0)

# Work on a copy of the second table so the parsed original stays untouched.
df1 = dfs[1].copy()
def str2int(x: str) -> int:
    """Convert a count string such as ``"1,234名"`` to an ``int``.

    Surrounding whitespace, a trailing "名" suffix, and thousands separators
    (ASCII ``,`` or full-width ``，``) are removed before conversion. Digits
    may be ASCII or full-width.

    Args:
        x: The text to convert.

    Returns:
        The integer value of the cleaned text.

    Raises:
        ValueError: If the cleaned text is not a valid integer.
        AttributeError: If ``x`` is not a string.
    """
    # Trim whitespace first so that a "名" followed by stray spaces is really
    # at the end of the string when rstrip() looks for it.
    text = x.strip().rstrip("名")

    # Drop thousands separators in both their ASCII and full-width forms.
    text = text.replace(",", "").replace("，", "")

    # int() accepts any Unicode decimal digits (including full-width ones) and
    # ignores surrounding whitespace, so no extra width conversion is needed.
    return int(text)
# Tidy the row labels: remove any leading/trailing "※" marks and spaces.
df1.index = df1.index.str.strip("※ ")

# Convert the two count columns (感染者数 = infected, 死亡者数 = deaths)
# from text to integers.
for column in ("感染者数", "死亡者数"):
    df1[column] = df1[column].apply(str2int)

# Bare expressions: they evaluate the cleaned table and the third and fourth
# parsed tables without storing the result. Presumably each was the last line
# of a notebook cell, where the value is displayed.
df1
dfs[2]
dfs[3]