熊本市ごみカレンダーのPDFからCSV作成1

# Notebook setup (IPython shell escapes): install the PDF parsing library.
!pip install pdfplumber
# System packages: ImageMagick development files and Ghostscript.
# NOTE(review): presumably needed for the `to_image()` rendering used below
# on older pdfplumber releases - confirm for the installed version.
!apt install libmagickwand-dev ghostscript

# Download the Kumamoto City garbage-collection calendar and save it as data.pdf.
!wget "https://www.city.kumamoto.jp/common/UploadFileDsp.aspx?c_id=5&id=4638&sub_id=20&flid=239879" -O data.pdf
import re

import pandas as pd
import pdfplumber

# Offsets of the table's vertical grid lines, measured from the left edge
# (bbox[0]) of a calendar region. Eight lines delimit seven columns
# (presumably one per weekday - confirm against the PDF).
tate = [0, 32, 94, 155, 216, 277, 339, 400]
# Offsets of the table's horizontal grid lines, measured from the top edge
# (bbox[1]) of a calendar region. Six lines delimit five rows.
yoko = [23, 90, 158, 226, 292, 360]

# Crop rectangles for the four calendar regions on each page; each one is
# passed to `page.within_bbox`, and its first two entries are used as the
# x / y origin for the grid-line offsets above.
bboxs = [
    [18, 154, 419, 516],
    [424, 154, 825, 516],
    [18, 693, 419, 1056],
    [424, 693, 825, 1056],
]

# Opened once at module level and read by the extraction loop further down.
# NOTE(review): the handle is not closed here.
pdf = pdfplumber.open("data.pdf")


def make_cal(se0, year, n):
    """Parse the cell texts of one monthly calendar into a dated Series.

    Args:
        se0: pandas Series of strings, one per calendar cell (day number
            followed by up to two collection kinds). Missing values are
            ignored.
        year: Year that the first calendar (``n == 0``) belongs to.
        n: Zero-based position of the calendar. ``0`` maps to April of
            ``year``; each step advances one month and rolls over into
            the following year after December.

    Returns:
        A Series named ``"kind"`` indexed by a ``DatetimeIndex`` named
        ``"date"``. When a day has two kinds they are joined with "・".
        An empty Series is returned when no entry is recognised.

    Raises:
        ValueError: If a parsed day number does not exist in the month
            (raised by ``pd.to_datetime``).
    """
    # The calendars start in April, hence the offset of three months.
    y, m = divmod(n + 3, 12)
    year += y
    month = m + 1

    se1 = (
        # Missing cells would break the "\n".join below.
        se0.dropna()
        # Holiday names carry no collection information; remove them.
        .str.replace(
            "(元日|成人の日|建国記念の日|天皇誕生日|春分の日|昭和の日|憲法記念日|みどりの日|こどもの日|海の日|山の日|敬老の日|秋分の日|スポーツの日|文化の日|勤労感謝の日|振替休日)",
            "",
            regex=True,
        )
        # Remove the label that is wrapped over two lines inside a cell.
        .str.replace("特定\n品目", "", regex=False)
        # regex=True must be explicit: pandas >= 2.0 defaults to a literal
        # match, which would leave runs of newlines untouched.
        .str.replace(r"\n+", "\n", regex=True)
    )

    s = "\n".join(se1.tolist())

    kinds = "燃やすごみ|紙|プラ容器包装|資源物|ペットボトル|特定品目|埋立ごみ"
    # Day number, a collection kind, and an optional second kind, each
    # separated by at most one whitespace character.
    data = re.findall(
        r"(\d{1,2})\s?(" + kinds + r")\s?(" + kinds + r")?",
        s,
    )

    if not data:
        # Nothing recognised: return an empty result with the usual shape
        # instead of failing on the missing columns below.
        return pd.Series(
            [],
            index=pd.DatetimeIndex([], name="date"),
            name="kind",
            dtype=object,
        )

    df0 = pd.DataFrame(data).set_index(0)

    # An absent second kind is an empty string, so strip the dangling "・".
    df1 = df0[1].str.cat(df0[2], sep="・").str.strip("・").reset_index()
    # Plain assignment instead of set_axis(inplace=True), which was removed
    # in pandas 2.0.
    df1.columns = ["day", "kind"]

    df1["day"] = df1["day"].astype(int)
    df1["year"] = year
    df1["month"] = month

    df1["date"] = pd.to_datetime(df1[["year", "month", "day"]])

    return df1.set_index("date")["kind"]


# Walk over pages 2-4 of the PDF (indices 1-3), four calendar regions per
# page, and parse each region as one month. `n` counts the regions in order
# and is passed to make_cal together with the start year 2021.
dfs = []
n = 0

try:
    for i in range(1, 4):

        page = pdf.pages[i]

        for bbox in bboxs:

            crop = page.within_bbox(bbox)

            # Save a snapshot of the cropped region for visual inspection.
            im = crop.to_image()
            im.save(f"image{n}.png", format="PNG")

            # Shift the relative grid offsets to the region's origin.
            vertical = [x + bbox[0] for x in tate]
            horizontal = [y + bbox[1] for y in yoko]

            table_settings = {
                "vertical_strategy": "explicit",
                "explicit_vertical_lines": vertical,
                "horizontal_strategy": "explicit",
                "explicit_horizontal_lines": horizontal,
            }

            # stack() flattens the table into one Series of cell texts.
            se_tmp = pd.DataFrame(crop.extract_table(table_settings)).stack()

            se = make_cal(se_tmp, 2021, n)

            dfs.append(se)
            n += 1
finally:
    # Release the file handle once extraction is done (or has failed).
    pdf.close()

df0 = pd.concat(dfs)

# Every day of the period; days without an entry are marked "収集なし".
dt0 = pd.date_range(start="2021-04-01", end="2022-03-31")

df1 = (
    df0.reindex(dt0, fill_value="収集なし")
    .reset_index()
    .rename({"index": "収集日", "kind": "収集区分"}, axis=1)
)

df1.to_csv("kumamoto.csv", index=False)