# github.com
!pip install pdfplumber
!apt install libmagickwand-dev ghostscript
!wget "https://www.city.kumamoto.jp/common/UploadFileDsp.aspx?c_id=5&id=4638&sub_id=20&flid=239879" -O data.pdf
import io
import re
import pandas as pd
import pdfplumber
def make_cal(se0, year, n, valid_days=None, valid_kinds=None):
    """Convert the text cells of one monthly calendar into a dated Series.

    Each cell is split on whitespace and only tokens that are a valid day
    number or a valid collection kind are kept.  The first kept token of a
    cell is taken as the day, the following ones as that day's kinds.  A cell
    that also carries a second day numbered 29, 30 or 31 is split into two
    days at that number.

    Parameters
    ----------
    se0 : pandas.Series of str
        Text of the calendar cells, one cell per element.
    year : int
        Year that the month for ``n == 0`` belongs to.
    n : int
        Zero-based month offset counted from April: ``0`` is April of
        ``year``, ``9`` is January of ``year + 1``.
    valid_days : iterable of str, optional
        Tokens accepted as day numbers.  Defaults to the module-level
        ``days`` list.
    valid_kinds : iterable of str, optional
        Tokens accepted as collection kinds.  Defaults to the module-level
        ``kind`` list.

    Returns
    -------
    pandas.Series
        Named ``"kind"`` and indexed by a datetime index named ``"date"``.
        Each value is the day's kinds joined with ``"・"``.  Days without any
        kind are omitted; the result is empty when no usable token is found.
    """
    if valid_days is None:
        valid_days = days
    if valid_kinds is None:
        valid_kinds = kind

    empty_result = pd.Series(
        [],
        index=pd.DatetimeIndex([], name="date"),
        name="kind",
        dtype="object",
    )

    # n == 0 is April, so shift by three months before splitting the offset
    # into whole years and a zero-based month.
    years_ahead, month_index = divmod(n + 3, 12)
    year += years_ahead
    month = month_index + 1

    tokens = se0.str.split(expand=True).reset_index(drop=True)
    keep = tokens.isin(list(valid_days) + list(valid_kinds))
    wanted = tokens[keep].dropna(how="all")
    if wanted.empty:
        return empty_result

    # Pack the surviving tokens of every cell to the left: day first, then
    # its kinds.
    packed = wanted.apply(lambda row: row.dropna().reset_index(drop=True), axis=1)

    # Round-trip through CSV so that a second day (29/30/31) sharing a cell
    # can be moved onto a line of its own.
    csv_text = packed.to_csv(index=False, header=False)
    csv_text = re.sub(r",(29|30|31)", r"\n\1", csv_text)

    table = (
        pd.read_csv(
            io.StringIO(csv_text),
            header=None,
            # Fix the column count up front; otherwise it is inferred from
            # the first line, which may be one of the shortened split lines.
            names=list(range(packed.shape[1])),
            index_col=0,
        )
        .dropna(how="all", axis=1)
        .dropna(how="all")
        .fillna("")
        .sort_index()
    )
    if table.empty:
        return empty_result

    # Join however many kind columns exist (one, two or more) instead of
    # assuming that exactly two are present.
    labels = [
        "・".join(str(value) for value in row if value != "")
        for row in table.itertuples(index=False, name=None)
    ]

    df4 = pd.DataFrame({"day": table.index.to_numpy(), "kind": labels})
    df4["year"] = year
    df4["month"] = month
    df4["date"] = pd.to_datetime(df4[["year", "month", "day"]])
    return df4.set_index("date")["kind"]
# Offsets of the explicit table lines inside one calendar crop.  `tate` is
# added to bbox[0] to place the vertical lines (8 lines -> 7 columns) and
# `yoko` is added to bbox[1] to place the horizontal lines (6 lines -> 5 rows).
tate = [0, 32, 94, 155, 216, 277, 339, 400]
yoko = [23, 90, 158, 226, 292, 360]

# Crop boxes of the four calendars found on every processed page.
bboxs = [
    [18, 154, 419, 516],
    [424, 154, 825, 516],
    [18, 693, 419, 1056],
    [424, 693, 825, 1056],
]

# Tokens that make_cal keeps: day numbers "1".."31" and the collection kinds.
days = [str(d) for d in range(1, 32)]
kind = ["燃やすごみ", "紙", "プラ容器包装", "資源物", "ペットボトル", "特定品目", "埋立ごみ"]

dfs = []
n = 0  # month offset handed to make_cal (0 = April 2021), one per calendar

# The context manager closes the PDF once every calendar has been read.
with pdfplumber.open("data.pdf") as pdf:
    # Pages at index 1-3 with four calendars each give twelve months.
    for i in range(1, 4):
        page = pdf.pages[i]
        for bbox in bboxs:
            crop = page.within_bbox(bbox)
            vertical = [x + bbox[0] for x in tate]
            horizontal = [y + bbox[1] for y in yoko]
            table_settings = {
                "vertical_strategy": "explicit",
                "explicit_vertical_lines": vertical,
                "horizontal_strategy": "explicit",
                "explicit_horizontal_lines": horizontal,
            }
            cells = pd.DataFrame(crop.extract_table(table_settings)).stack()
            # Put a space after every "日" so that whatever follows it becomes
            # a separate whitespace-delimited token for make_cal.
            se_tmp = cells.str.replace("日", "日 ", regex=False)
            se = make_cal(se_tmp, 2021, n)
            dfs.append(se)
            n += 1

df0 = pd.concat(dfs)

# Cover the whole fiscal year; dates absent from the calendars get "収集なし".
dt_range = pd.date_range(start="2021-04-01", end="2022-03-31")
df1 = (
    df0.reindex(dt_range, fill_value="収集なし")
    .reset_index()
    .rename({"index": "収集日", "kind": "収集区分"}, axis=1)
)
df1.to_csv("kumamoto.csv", index=False)