# Build a summary table of COVID-19 case announcements for Ehime Prefecture

import datetime
import pathlib
import re
from urllib.parse import urljoin

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup

def fetch_file(url, dir="."):
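    """Download the file at url into dir and return its local path."""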

    r = requests.get(url)
    r.raise_for_status()

    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p

def wareki2date(s):
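    """Convert a Japanese-era (wareki) date string to a datetime.date.

    元 denotes year 1 of an era; the offsets below map era year 1 to
    Showa 1 = 1926, Heisei 1 = 1989, Reiwa 1 = 2019.
    """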

    m = re.search(r"(昭和|平成|令和)([ 0-9元]{1,2})年(\d{1,2})月(\d{1,2})日", s)

    if m:

        year, month, day = [1 if i == "元" else int(i.strip()) for i in m.group(2, 3, 4)]

        if m.group(1) == "昭和":
            year += 1925
        elif m.group(1) == "平成":
            year += 1988
        elif m.group(1) == "令和":
            year += 2018

        return datetime.date(year, month, day)

    else:
        # Fall back to today's date when no era date is found
        return dt_now.date()

def days2date(s):
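    """Parse an 'M/D' fragment into a Timestamp in the current (JST) year."""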

    y = dt_now.year

    days = re.findall("[0-9]{1,2}", s)

    if len(days) == 2:
        m, d = map(int, days)
        return pd.Timestamp(year=y, month=m, day=d)
    else:
        return pd.NaT

# Ehime Prefecture COVID-19 information page
url = "https://www.pref.ehime.jp/h25500/kansen/covid19.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

JST = datetime.timezone(datetime.timedelta(hours=+9), "JST")
dt_now = datetime.datetime.now(JST)
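# dt_now (JST) supplies the fallback publication date and the year for days2date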

r = requests.get(url, headers=headers)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# Collect links to the case-announcement PDFs
links = [
    urljoin(url, tag.get("href"))
    for tag in soup.find_all(
        "a", href=re.compile(r"\.pdf$"), string=re.compile("新型コロナウイルスの感染の確認について")
    )
]

# Number of most recent announcements to process
n = 7

dfs = []

for link in links[:n]:

    path_pdf = fetch_file(link)

    with pdfplumber.open(path_pdf) as pdf:
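        # Page 1 carries the publication date; every page may carry case tables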

        dt_text = ""
        dt_update = dt_now.date()

        for page in pdf.pages:

            if page.page_number == 1:

                # The publication date sits in a band near the top of page 1
                dt_text = page.within_bbox((0, 65, page.width, 100)).extract_text()
                dt_update = wareki2date(re.sub(r"\s", "", dt_text or ""))
 
            # Pull every table on the page; case tables are filtered below by shape
            tables = page.extract_tables()

            for table in tables:

                df_tmp = pd.DataFrame(
                    table[1:], columns=[re.sub(r"\s", "", col or "") for col in table[0]]
                )

                rows, cols = df_tmp.shape

                # Case tables have at least two rows and seven columns
                if (rows > 1) and (cols > 6):

                    df_tmp["事例"] = df_tmp["事例"].fillna(method="ffill")

                    # Strip whitespace characters
                    df_tmp["陽性者"] = df_tmp["陽性者"].replace(r"\s", "", regex=True)
                    df_tmp["住所地"] = df_tmp["住所地"].replace(r"\s", "", regex=True)
                    df_tmp["職業"] = df_tmp["職業"].replace(r"\s", "", regex=True)
                    df_tmp["公表日"] = dt_update

                    dfs.append(df_tmp)

df = pd.concat(dfs).reset_index(drop=True)
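# The steps below derive structured columns from the 陽性者 and 備考 free text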

df["No"] = df["陽性者"].str.extract("(\d{1,3})人目").astype(int)

# Normalize the remarks: fill missing values so the regex helpers never see NaN
note = df["備考"].fillna("").str.replace(r"\s", "", regex=True)

df["発症日"] = note.str.extract("(\d{1,2}/\d{1,2})発症")

df["発症日"] = df["発症日"].fillna("").apply(days2date)

# The PDF writes symptoms in full-width parentheses: 発症（…） / 症状有（…）
symptom = note.str.extract(r"(発症|症状有)（(.+)）").rename(columns={0: "状況", 1: "症状"})

df["症状"] = symptom["症状"].mask(note.str.contains("症状なし"), "症状なし")

df["入院"] = note.str.extract("指定医療機関に(入院済み?|入院予定)")

df["県外滞在歴"] = note.str.extract("県外滞在歴(あり|なし)")

df["接触"] = note.apply(lambda s: ";".join(re.findall("(\d+)人目", s)))

df["関係"] = note.str.extract("の(濃厚接触者|接触者|関係者|家族)")

df["クラスタ"] = df["事例"].apply(lambda s: ";".join(re.findall("(\d+)事例目", s)))

df.sort_values(by=["クラスタ", "No"], inplace=True)

df.to_csv("ehime.csv")