千葉県の新型コロナウイルス感染症(変異株)患者等の発生状況をCSV変換

www.pref.chiba.lg.jp

  • xlsxファイル

https://docs.google.com/spreadsheets/d/e/2PACX-1vR-bY3elqTA7sEThEP4GOOuOaLtE0VReY8-KeE25eFkHIGhR_x9tQFdirliUWVhHfPN6RPB4oT5kNAw/pub?output=xlsx

import requests
from bs4 import BeautifulSoup

from urllib.parse import urljoin

import pathlib

import pdfplumber
import pandas as pd

# HTTP request headers used when fetching pages in this script; the
# User-Agent value is an Internet Explorer 11 style browser string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}


def fetch_soup(url, parser="html.parser", timeout=30):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.
        parser: Name of the BeautifulSoup parser to use.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The parsed ``BeautifulSoup`` object.

    Raises:
        requests.HTTPError: If the response status indicates an error.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    # Hand over the raw bytes so BeautifulSoup works out the encoding itself.
    return BeautifulSoup(r.content, parser)


def fetch_pdf(url):
    """Download the PDF linked from the page at *url*.

    The page is searched for the first ``a.icon_pdf`` element; the file its
    ``href`` points to (resolved relative to *url*) is saved with
    ``fetch_file``.

    Args:
        url: Address of the page that contains the PDF link.

    Returns:
        The path returned by ``fetch_file`` for the downloaded PDF.

    Raises:
        ValueError: If the page has no ``a.icon_pdf`` link with an ``href``.
    """
    soup = fetch_soup(url)

    tag = soup.select_one("a.icon_pdf")
    href = tag.get("href") if tag is not None else None
    if not href:
        # Fail loudly: a missing tag would otherwise surface as an opaque
        # AttributeError, and an empty href would make urljoin return the
        # page URL itself, silently saving the HTML page instead of a PDF.
        raise ValueError(f"no PDF link (a.icon_pdf) found on {url}")

    link = urljoin(url, href)

    return fetch_file(link)


def fetch_file(url, dir=".", timeout=30):
    """Download *url* into directory *dir* and return the saved path.

    The file name is the last segment of the URL's path component, so a
    query string or fragment never ends up in the file name.

    Args:
        url: Address of the file to download.
        dir: Target directory; created (with parents) if it does not exist.
            The name shadows the builtin but is kept for compatibility.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``pathlib.Path`` of the written file.

    Raises:
        ValueError: If no file name can be derived from *url*.
        requests.HTTPError: If the response status indicates an error.
        requests.Timeout: If the server does not answer within *timeout*.
    """
    from urllib.parse import urlparse

    # Use only the path part of the URL; "?query" and "#fragment" are dropped.
    name = pathlib.PurePosixPath(urlparse(url).path).name
    if not name:
        raise ValueError(f"cannot derive a file name from {url}")

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    # Send the same headers as fetch_soup so both requests look alike to the
    # server, and bound the wait so a stalled download cannot hang the script.
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p


# Index page from which the per-release pages are discovered.
url = "https://www.pref.chiba.lg.jp/shippei/press/2019/ncov-index.html"

soup = fetch_soup(url)

# One DataFrame per extracted PDF page table; concatenated afterwards.
dfs = []

# Locate the <h2> with the section title, then the <ul> that follows it.
# `string=` is the current name of the keyword formerly called `text=`,
# which is deprecated in recent BeautifulSoup releases.
heading = soup.find("h2", string="新型コロナウイルス感染症(変異株)患者等の発生状況")
if heading is None:
    raise ValueError(f"section heading not found on {url}")

link_list = heading.find_next_sibling("ul")
if link_list is None:
    raise ValueError(f"no link list follows the section heading on {url}")

for a_tag in link_list.select("li > a"):

    # Each list entry links to a page that in turn links to the PDF.
    link = urljoin(url, a_tag.get("href"))

    p = fetch_pdf(link)

    with pdfplumber.open(p) as pdf:

        for page in pdf.pages:

            table = page.extract_table()

            # Skip pages on which no table was detected instead of
            # appending an empty frame.
            if not table:
                continue

            df_tmp = pd.DataFrame(table)

            dfs.append(df_tmp)

# Stack all page tables and label the six columns.
column_names = ["No.", "年代", "性別", "居住地", "症状・経過", "備考"]
df0 = pd.concat(dfs)
df0 = df0.set_axis(column_names, axis=1)

# Rows whose last cell equals the column title are header rows repeated
# from the PDF tables; keep everything else.
is_header_row = df0["備考"] == "備考"
df1 = df0[~is_header_row].copy()

# Trim surrounding whitespace and apply NFKC normalization to every
# object-typed column.
text_columns = df1.select_dtypes(include=object).columns
for name in text_columns:
    trimmed = df1[name].str.strip()
    df1[name] = trimmed.str.normalize("NFKC")

# Use the numeric case number as a sorted index.
df1["No."] = df1["No."].astype(int)
df1 = df1.set_index("No.").sort_index()

# Bare expression: shows the frame when run as a notebook cell.
df1

# Split "症状・経過" on whitespace into separate columns; the first two pieces
# are labelled, any further pieces keep their integer column labels.
df2 = df1["症状・経過"].str.split(expand=True).rename(columns={0: "時期", 1: "症状"})

# Break "備考" into its "・"-separated items after removing all whitespace.
# The pattern is a raw string so that "\s" is not treated as an (invalid)
# string escape sequence.
df3 = (
    df1["備考"]
    .str.replace(r"\s", "", regex=True)
    .str.strip("・")
    .str.split("・", expand=True)
    .fillna("")
    .rename(columns={0: "海外滞在歴", 1: "不特定多数との接触", 2: "濃厚接触者"})
)

# Remove the item label from each value, leaving only what follows it.
# regex=False makes the literal replacement explicit, independent of the
# default of the installed pandas version.
df3["海外滞在歴"] = df3["海外滞在歴"].str.replace("海外滞在歴", "", regex=False)

df3["不特定多数との接触"] = df3["不特定多数との接触"].str.replace(
    "不特定多数との接触", "", regex=False
)

# Combine the basic attributes with the split-out columns, aligned on the index.
df = pd.concat([df1.loc[:, ["年代", "性別", "居住地"]], df2, df3], axis=1)

# Write the result as UTF-8 with a byte-order mark.
df.to_csv("chiba.csv", encoding="utf_8_sig")