Data wrangling Chiba Prefecture's PDF on the status of COVID-19 patients reported in the prefecture

  • Removed the footer page numbers with pdfplumber's filter
  • Depending on the page, columns seem to be misrecognized (?) and blank cells appear, so those columns are dropped (see the toy sketch below)
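The cleanup in the second bullet, as a minimal self-contained sketch (df_demo is a toy stand-in for one page's extract_table output, not the real data):

import pandas as pd

# Toy page where the middle column was extracted as blank everywhere
df_demo = pd.DataFrame([["1", "", "千葉市"], ["2", None, "市川市"]])

# Mask empty strings to NaN, then drop columns that are entirely NaN
df_demo = df_demo.mask((df_demo == "") | df_demo.isna()).dropna(how="all", axis=1)

print(df_demo)  # the all-blank middle column is gone
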
import datetime
import pathlib
import re
from urllib.parse import urljoin

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup


def fetch_file(url, dir="."):
    # Download the file at url into dir (created if needed) and return the local path

    r = requests.get(url)
    r.raise_for_status()

    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p


def days2date(s):
    # Parse a "M月D日"-style string into a Timestamp, assuming the current year

    y = dt_now.year

    days = re.findall("[0-9]{1,2}", s)

    if len(days) == 2:
        m, d = map(int, days)
        return pd.Timestamp(year=y, month=m, day=d)
    else:
        print(s)
        return pd.NaT
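
# Example with a hypothetical input: days2date("8月15日") returns a Timestamp
# for August 15 of the current year; strings without a month/day pair are
# printed and mapped to pd.NaT.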


def wareki2date(s):
    # Convert a Japanese-era date (昭和/平成/令和; 元 = year 1) to datetime.date

    m = re.search(r"(昭和|平成|令和)([0-9元]{1,2})年(\d{1,2})月(\d{1,2})日", s)

    if m:

        year, month, day = [1 if i == "元" else int(i.strip()) for i in m.group(2, 3, 4)]

        if m.group(1) == "昭和":
            year += 1925
        elif m.group(1) == "平成":
            year += 1988
        elif m.group(1) == "令和":
            year += 2018

        return datetime.date(year, month, day)

    else:
        # No era date found: fall back to today's date
        return dt_now.date()
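
# Examples with hypothetical inputs:
#   wareki2date("令和3年5月1日")   -> datetime.date(2021, 5, 1)
#   wareki2date("平成31年4月30日") -> datetime.date(2019, 4, 30)
#   wareki2date("令和元年5月1日")  -> datetime.date(2019, 5, 1)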


JST = datetime.timezone(datetime.timedelta(hours=+9), "JST")
dt_now = datetime.datetime.now(JST)

url = "https://www.pref.chiba.lg.jp/shippei/press/2019/ncov-index.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

r = requests.get(url, headers=headers)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# Find the link whose text starts with the report title
tag = soup.find("a", text=re.compile("^新型コロナウイルス感染症患者等の県内発生状況について"))

link = urljoin(url, tag.get("href"))

path_pdf = fetch_file(link)

# Cell boundaries come from the ruled lines on the page
table_settings = {
    "vertical_strategy": "lines",
    "horizontal_strategy": "lines",
    "intersection_tolerance": 3,
}

with pdfplumber.open(path_pdf) as pdf:

    dfs = []
    dt_text = ""

    for page in pdf.pages:

        if page.page_number == 1:

            # The announcement date sits in the header band of the first page
            dt_text = page.within_bbox((0, 0, page.width, 140)).extract_text()

        # No longer needed: the footer page number no longer overflows into
        # the table area. Kept for reference.
        #
        # def test(obj):
        #     if obj["object_type"] == "char":
        #         if obj["bottom"] > 1073:
        #             return False
        #     return True
        #
        # filtered = page.filter(test)
        #
        # table = filtered.extract_table(table_settings)

        table = page.extract_table(table_settings)

        df_tmp = pd.DataFrame(table)

        # Mask empty strings as NaN, then drop columns that are entirely blank
        df_tmp = df_tmp.mask((df_tmp == "") | df_tmp.isna()).dropna(how="all", axis=1)

        # Rebuild with positional column labels so pages align when concatenated
        dfs.append(pd.DataFrame(df_tmp.values))

# Parse the announcement date from the first page's header text
dt_update = wareki2date(re.sub(r"\s", "", dt_text))
dt_update

df = pd.concat(dfs)

df

# Each "No." header row starts a new table, so the cumulative count of those
# rows serves as a group key that splits the concatenated rows back apart
dfg = df.groupby((df[0] == "No.").cumsum())
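
# For instance, if df[0] were ["No.", "1", "2", "No.", "1"], then
# (df[0] == "No.").cumsum() would give [1, 1, 1, 2, 2], making get_group(1)
# the first table and get_group(2) the second.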


def set_col(data):
    # Promote the first row to the column header

    df = pd.DataFrame(data[1:], columns=data[0])

    return df


# First table: patients (患者)
df1 = set_col(dfg.get_group(1).values).set_index("No.")

path_csv1 = pathlib.Path(f'{dt_update.strftime("%Y%m%d")}_患者.csv')

df1.to_csv(path_csv1, encoding="utf_8_sig")

# Second table: asymptomatic carriers (無症状)
df2 = set_col(dfg.get_group(2).dropna(how="all", axis=1).values).set_index("No.")

df2

path_csv2 = pathlib.Path(f'{dt_update.strftime("%Y%m%d")}_無症状.csv')

df2.to_csv(path_csv2, encoding="utf_8_sig")