熊本市ごみカレンダーのPDFからCSV作成2

github.com

!pip install pdfplumber
!apt install libmagickwand-dev ghostscript

!wget "https://www.city.kumamoto.jp/common/UploadFileDsp.aspx?c_id=5&id=4638&sub_id=20&flid=239879" -O data.pdf
import io
import re

import pandas as pd
import pdfplumber


def make_cal(se0, year, n, start_month=4, tokens=None):
    """Convert the cell texts of one month table into a date-indexed Series.

    Args:
        se0: pandas Series of strings, one entry per table cell. Each entry
            is split on whitespace into tokens.
        year: Calendar year in which the first month (``n == 0``) falls.
        n: Zero-based month counter; ``n == 0`` maps to ``start_month`` of
            ``year`` and larger values roll over into following years.
        start_month: Month number (1-12) that ``n == 0`` corresponds to.
            The default of 4 reproduces the original hard-coded ``n + 3``.
        tokens: Iterable of strings to keep (day numbers and collection
            kinds). When omitted, the module-level ``days + kind`` is used.

    Returns:
        pandas Series named ``"kind"`` indexed by a ``"date"`` datetime
        index. Multiple kinds on the same day are joined with ``"・"``.
        Days without any kind are not included.
    """
    if tokens is None:
        # Fall back to the vocabularies defined at module level.
        tokens = days + kind
    tokens = list(tokens)

    # Month offset counted from January of `year`.
    y, m = divmod(n + start_month - 1, 12)

    year += y
    month = m + 1

    # One row per cell, one column per whitespace-separated token.
    df0 = se0.str.split(expand=True).reset_index(drop=True)

    # Blank out every token that is neither a day number nor a known kind,
    # then discard cells that contained nothing useful.
    df1 = df0[df0.isin(tokens)].copy().dropna(how="all")

    # Shift the surviving tokens of each row to the left.
    df2 = df1.apply(lambda x: x.dropna().reset_index(drop=True), axis=1)

    # Round-trip through CSV text: a 29/30/31 that appears after a comma
    # (i.e. not as the first token of its row) starts a record of its own.
    s0 = df2.to_csv(index=False, header=False)
    s1 = re.sub(r",(29|30|31)", r"\n\1", s0)

    df3 = (
        pd.read_csv(io.StringIO(s1), header=None, index_col=0)
        .dropna(how="all", axis=1)
        .dropna(how="all")
        .fillna("")
        .sort_index()
    )

    # Join however many kind columns survived. The original indexed columns
    # 1 and 2 directly, which raised KeyError when no day had a second kind
    # and ignored any third kind.
    joined = [
        "・".join(v for v in row if v)
        for row in df3.itertuples(index=False, name=None)
    ]

    kinds = pd.Series(joined, index=df3.index, name="kind")
    df4 = kinds.rename_axis("day").reset_index()

    df4["year"] = year
    df4["month"] = month

    df4["date"] = pd.to_datetime(df4[["year", "month", "day"]])

    df4.set_index("date", inplace=True)

    return df4["kind"]


# Offsets of the table grid lines relative to a month table's bounding box:
# `tate` is added to bbox[0] to give the explicit vertical lines (8 lines,
# 7 columns) and `yoko` is added to bbox[1] to give the explicit horizontal
# lines (6 lines, 5 rows).
tate = [0, 32, 94, 155, 216, 277, 339, 400]
yoko = [23, 90, 158, 226, 292, 360]

# Bounding boxes of the four month tables on each page, passed to
# `page.within_bbox`. NOTE(review): presumably (x0, top, x1, bottom) in PDF
# points, per pdfplumber's bbox convention -- confirm against the PDF.
bboxs = [
    [18, 154, 419, 516],
    [424, 154, 825, 516],
    [18, 693, 419, 1056],
    [424, 693, 825, 1056],
]

# Tokens that make_cal keeps: day numbers "1".."31" and collection kinds.
days = list(map(str, range(1, 32)))
kind = ["燃やすごみ", "紙", "プラ容器包装", "資源物", "ペットボトル", "特定品目", "埋立ごみ"]

dfs = []
n = 0

# Open the PDF in a context manager so the file handle is released even if
# table extraction fails part-way through.
with pdfplumber.open("data.pdf") as pdf:

    # Pages at index 1..3, four tables each -> make_cal is called with n = 0..11.
    for i in range(1, 4):

        page = pdf.pages[i]

        for bbox in bboxs:

            crop = page.within_bbox(bbox)

            # Translate the relative grid offsets into page coordinates.
            vertical = [offset + bbox[0] for offset in tate]
            horizontal = [offset + bbox[1] for offset in yoko]

            table_settings = {
                "vertical_strategy": "explicit",
                "explicit_vertical_lines": vertical,
                "horizontal_strategy": "explicit",
                "explicit_horizontal_lines": horizontal,
            }

            # One string per cell; a space is inserted after every "日" so
            # that whatever follows it becomes a separate token when
            # make_cal splits on whitespace.
            se_tmp = (
                pd.DataFrame(crop.extract_table(table_settings))
                .stack()
                .str.replace("日", "日 ", regex=True)
            )

            se = make_cal(se_tmp, 2021, n)

            dfs.append(se)
            n += 1

# Stack the per-month Series into a single date-indexed Series.
df0 = pd.concat(dfs)

# Every calendar day from 2021-04-01 through 2022-03-31.
dt_range = pd.date_range(start="2021-04-01", end="2022-03-31")

# Days missing from the extracted data are filled with "収集なし"; the date
# index is then turned into a regular column and both columns are renamed.
df1 = df0.reindex(dt_range, fill_value="収集なし")
df1 = df1.reset_index()
df1 = df1.rename({"index": "収集日", "kind": "収集区分"}, axis=1)

df1.to_csv("kumamoto.csv", index=False)