Scraping the list of COVID-19 cases in Tochigi Prefecture

The prefecture publishes the case list as a PDF on its website (www.pref.tochigi.lg.jp); the script below finds the link to that PDF, extracts the tables with camelot, and saves the result as covid19.csv.

apt install python3-tk ghostscript
pip install camelot-py[cv]
pip install jaconv
# the script below also imports these
pip install requests beautifulsoup4 html5lib
import requests
from bs4 import BeautifulSoup

import re
import jaconv
import datetime

from urllib.parse import urljoin

import camelot
import pandas as pd

url = "http://www.pref.tochigi.lg.jp/e04/welfare/hoken-eisei/kansen/hp/coronakensahasseijyoukyou.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

r = requests.get(url, headers=headers)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html5lib")

tag = soup.find("a", text=re.compile("^栃木県における新型コロナウイルス感染症の発生状況一覧"))
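# Optional safety check: the scrape depends on this link text, so fail clearly if it is not found
if tag is None:
    raise ValueError("PDF link not found on the page")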

link = urljoin(url, tag.get("href"))

tables = camelot.read_pdf(
    link, pages="all", split_text=True, strip_text="\n", line_scale=40
)
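
# Optional check: each camelot Table exposes a parsing_report (accuracy, whitespace, page number),
# which helps spot pages that were parsed poorly
for table in tables:
    print(table.parsing_report)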

dfs = [table.df for table in tables]

df_tmp = pd.concat(dfs)

# Promote the first extracted row to the column header, then index by 番号 (case number)
df = df_tmp.T.set_index(0).T.set_index("番号")

df["陽性判明日"] = df["陽性判明日"].apply(
    lambda s: jaconv.z2h(s, kana=False, digit=True, ascii=True)
)
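
# Illustration with a made-up value: jaconv.z2h("１２／３１", kana=False, digit=True, ascii=True) -> "12/31"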

# Extract the positive-test date plus an optional parenthesized "date + status" annotation (e.g. 退院)
df_date = df["陽性判明日"].str.extract(
    r"(\d{1,2}/\d{1,2})\s*(\((\d{1,2}/\d{1,2}) +(.+)\))?", expand=True
)

df_date.fillna("", inplace=True)

dt_now = datetime.datetime.now()


def my_parser(s):
    # The dates in the PDF carry no year, so the current year is assumed
    if s:
        y = dt_now.year
        m, d = map(int, re.findall("[0-9]{1,2}", s))
        return pd.Timestamp(year=y, month=m, day=d)
    else:
        return pd.NaT


df["陽性判明日"] = df_date[0].apply(my_parser)

df_date[2] = df_date[2].apply(my_parser)

df["退院"] = df_date[2].where(df_date[3] == "退院")

df_date

df.to_csv("covid19.csv")
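
As a quick check (a usage sketch of my own, assuming the CSV written above), the file can be read back with the date columns parsed:

import pandas as pd

df2 = pd.read_csv("covid19.csv", index_col="番号", parse_dates=["陽性判明日", "退院"])
print(df2.dtypes)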