# Source site: www.pref.chiba.lg.jp
# Published spreadsheet: https://docs.google.com/spreadsheets/d/e/2PACX-1vR-bY3elqTA7sEThEP4GOOuOaLtE0VReY8-KeE25eFkHIGhR_x9tQFdirliUWVhHfPN6RPB4oT5kNAw/pub?output=xlsx
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pathlib
import pdfplumber
import pandas as pd
# Shared HTTP request headers for all page fetches; presents an IE11-style
# User-Agent string to the prefecture site.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}
def fetch_soup(url, parser="html.parser", timeout=30):
    """Download *url* and return its content parsed as BeautifulSoup.

    Args:
        url: Page URL to fetch.
        parser: Parser backend passed to BeautifulSoup.
        timeout: Seconds before the request is aborted; added with a
            default so existing callers are unaffected.  Without it a
            stalled server would hang the script indefinitely.

    Raises:
        requests.HTTPError: On a non-2xx response.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    return BeautifulSoup(r.content, parser)
def fetch_pdf(url):
    """Download the PDF linked from the press-release page at *url*.

    The page is expected to contain an ``<a class="icon_pdf">`` anchor;
    its href, resolved against *url*, is downloaded to the current
    directory via :func:`fetch_file`.

    Returns:
        pathlib.Path of the downloaded PDF.

    Raises:
        ValueError: If the page has no ``a.icon_pdf`` link (previously
            this surfaced as an opaque AttributeError on ``None.get``).
    """
    soup = fetch_soup(url)
    tag = soup.select_one("a.icon_pdf")
    if tag is None:
        raise ValueError(f"no a.icon_pdf link found on {url}")
    link = urljoin(url, tag.get("href"))
    return fetch_file(link)
def fetch_file(url, dir="."):
    """Download *url* into directory *dir* and return the local path.

    The local filename is taken from the last path component of the URL.
    ``dir`` shadows the builtin but is kept for caller compatibility.

    Raises:
        requests.HTTPError: On a non-2xx response.
    """
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)
    # Use the same headers as fetch_soup (previously omitted here) and a
    # timeout so a stalled download cannot hang the script.
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()
    p.write_bytes(r.content)
    return p
url = "https://www.pref.chiba.lg.jp/shippei/press/2019/ncov-index.html"
soup = fetch_soup(url)

dfs = []
# Each list item under the variant-case heading links to a press-release
# page, which in turn links to a PDF holding one table per page.
# NOTE: bs4's `text=` keyword is deprecated; `string=` is the current name.
for i in (
    soup.find("h2", string="新型コロナウイルス感染症(変異株)患者等の発生状況")
    .find_next_sibling("ul")
    .select("li > a")
):
    link = urljoin(url, i.get("href"))
    p = fetch_pdf(link)
    with pdfplumber.open(p) as pdf:
        for page in pdf.pages:
            table = page.extract_table()
            if table is None:
                # A page without a detectable table would otherwise
                # append an empty DataFrame and corrupt the concat.
                continue
            dfs.append(pd.DataFrame(table))
# Stack all extracted page tables and assign the column names.
df0 = pd.concat(dfs).set_axis(
    ["No.", "年代", "性別", "居住地", "症状・経過", "備考"], axis=1
)

# Drop repeated header rows (rows whose 備考 cell is the literal title).
df1 = df0[df0["備考"] != "備考"].copy()

# Normalize every text cell: trim whitespace and fold fullwidth/halfwidth
# variants with NFKC.
for col in df1.select_dtypes(include=object).columns:
    df1[col] = df1[col].str.strip().str.normalize("NFKC")

# Index by case number, in order.  (A bare `df1` display expression from
# the original notebook was removed; it is a no-op in a script.)
df1["No."] = df1["No."].astype(int)
df1.set_index("No.", inplace=True)
df1.sort_index(inplace=True)
# Split 症状・経過 on whitespace into onset-period and symptom columns.
df2 = df1["症状・経過"].str.split(expand=True).rename(columns={0: "時期", 1: "症状"})

# 備考 packs up to three "・"-separated attributes.  Remove all whitespace
# first — use a raw string for the regex, since "\s" in a plain literal is
# an invalid string escape (DeprecationWarning, future SyntaxError) — then
# split into named columns, padding missing ones with "".
df3 = (
    df1["備考"]
    .str.replace(r"\s", "", regex=True)
    .str.strip("・")
    .str.split("・", expand=True)
    .fillna("")
    .rename(columns={0: "海外滞在歴", 1: "不特定多数との接触", 2: "濃厚接触者"})
)

# Strip the label text, keeping only the value part of each attribute.
df3["海外滞在歴"] = df3["海外滞在歴"].str.replace("海外滞在歴", "")
df3["不特定多数との接触"] = df3["不特定多数との接触"].str.replace("不特定多数との接触", "")

# Assemble the final table and write it with a UTF-8 BOM (Excel-friendly).
df = pd.concat([df1.loc[:, ["年代", "性別", "居住地"]], df2, df3], axis=1)
df.to_csv("chiba.csv", encoding="utf_8_sig")