!pip install pdfplumber !apt install libmagickwand-dev ghostscript !wget "https://www.city.kumamoto.jp/common/UploadFileDsp.aspx?c_id=5&id=4638&sub_id=20&flid=239879" -O data.pdf
"""Turn the collection-calendar PDF (data.pdf) into a one-row-per-day CSV.

Pages 1-3 of the PDF are each cropped into four regions; every region is
parsed as one month's table, the months are concatenated, and the result is
written to kumamoto.csv with days that have no entry marked as "収集なし".
"""
import re

import pandas as pd

# Offsets (relative to a crop box's left edge) of the explicit vertical table
# lines, and offsets (relative to its top edge) of the explicit horizontal
# lines.  NOTE(review): presumably 7 weekday columns x 5 week rows — confirm
# against the PDF layout.
tate = [0, 32, 94, 155, 216, 277, 339, 400]
yoko = [23, 90, 158, 226, 292, 360]

# Crop boxes handed to page.within_bbox(); four regions per page.
bboxs = [
    [18, 154, 419, 516],
    [424, 154, 825, 516],
    [18, 693, 419, 1056],
    [424, 693, 825, 1056],
]

# Holiday labels that are removed from the cell text before parsing.
_HOLIDAYS = (
    "元日",
    "成人の日",
    "建国記念の日",
    "天皇誕生日",
    "春分の日",
    "昭和の日",
    "憲法記念日",
    "みどりの日",
    "こどもの日",
    "海の日",
    "山の日",
    "敬老の日",
    "秋分の日",
    "スポーツの日",
    "文化の日",
    "勤労感謝の日",
    "振替休日",
)
_HOLIDAY_PATTERN = "(" + "|".join(_HOLIDAYS) + ")"

# Collection categories recognised after a day number.
_KINDS = "|".join(
    (
        "燃やすごみ",
        "紙",
        "プラ容器包装",
        "資源物",
        "ペットボトル",
        "特定品目",
        "埋立ごみ",
    )
)

# Day number followed by one category and an optional second category, each
# separated by at most one whitespace character.  Raw strings keep `\d` and
# `\s` from being treated as (invalid) string escapes.
_DAY_KIND_RE = re.compile(
    r"(\d{1,2})\s?(" + _KINDS + r")\s?(" + _KINDS + r")?"
)


def _empty_calendar() -> pd.Series:
    """Return an empty result with the same name/index name as a parsed month."""
    return pd.Series(
        [], dtype=object, name="kind", index=pd.DatetimeIndex([], name="date")
    )


def make_cal(se0: pd.Series, year: int, n: int) -> pd.Series:
    """Parse one month's table cells into a date-indexed Series of categories.

    Args:
        se0: Cell texts of one calendar table (any index); missing cells
            (None/NaN) are ignored.
        year: Year of the first calendar month (April, ``n == 0``).
        n: Zero-based month offset counted from April of ``year``; offsets
            that pass December roll over into the following year.

    Returns:
        Series named ``"kind"`` indexed by a ``"date"`` DatetimeIndex.  Two
        categories on the same day are joined with "・".  Empty when no
        "day + category" pair is found.

    Raises:
        ValueError: from ``pd.to_datetime`` if a parsed day number is not a
            valid day of the computed month.
    """
    # n counts months from April: shift by 3 so divmod yields (years past, month-1).
    year_offset, month_index = divmod(n + 3, 12)
    year += year_offset
    month = month_index + 1

    # Missing cells would otherwise survive as NaN and break the join below.
    cells = se0.dropna()
    if cells.empty:
        return _empty_calendar()

    cleaned = (
        cells.str.replace(_HOLIDAY_PATTERN, "", regex=True)
        # NOTE(review): this deletes the line-wrapped "特定\n品目" label
        # entirely rather than rejoining it to "特定品目" — kept as in the
        # original; confirm that this is intended.
        .str.replace("特定\n品目", "", regex=False)
        # Must be a regex: pandas >= 2.0 defaults to literal matching, which
        # would leave runs of newlines in place and make the day/category
        # pattern miss those cells.
        .str.replace("\n+", "\n", regex=True)
    )
    text = "\n".join(cleaned.tolist())

    matches = _DAY_KIND_RE.findall(text)
    if not matches:
        return _empty_calendar()

    frame = pd.DataFrame(matches, columns=["day", "first", "second"])
    # An unmatched optional group is "", so strip the dangling separator.
    frame["kind"] = (
        frame["first"].str.cat(frame["second"], sep="・").str.strip("・")
    )
    frame["date"] = pd.to_datetime(
        pd.DataFrame(
            {"year": year, "month": month, "day": frame["day"].astype(int)}
        )
    )
    return frame.set_index("date")["kind"]


def extract_schedule(pdf_path: str = "data.pdf", start_year: int = 2021) -> pd.Series:
    """Read the twelve monthly tables from the PDF and concatenate them.

    Pages with index 1-3 are each cropped with the four boxes in ``bboxs``;
    the crops are taken in order as month offsets 0..11 for ``make_cal``.
    A PNG of every crop is written to ``image{n}.png`` in the working
    directory.

    Args:
        pdf_path: Path of the calendar PDF.
        start_year: Year passed to ``make_cal`` for every month offset.

    Returns:
        The concatenated per-month Series returned by ``make_cal``.
    """
    # Imported lazily so make_cal stays importable without the PDF toolchain.
    import pdfplumber

    monthly = []
    n = 0
    # The context manager closes the PDF even if parsing fails midway.
    with pdfplumber.open(pdf_path) as pdf:
        for i in range(1, 4):
            page = pdf.pages[i]
            for bbox in bboxs:
                crop = page.within_bbox(bbox)
                crop.to_image().save(f"image{n}.png", format="PNG")

                table_settings = {
                    "vertical_strategy": "explicit",
                    "explicit_vertical_lines": [x + bbox[0] for x in tate],
                    "horizontal_strategy": "explicit",
                    "explicit_horizontal_lines": [y + bbox[1] for y in yoko],
                }
                cells = pd.DataFrame(crop.extract_table(table_settings)).stack()
                monthly.append(make_cal(cells, start_year, n))
                n += 1
    return pd.concat(monthly)


if __name__ == "__main__":
    df0 = extract_schedule("data.pdf", 2021)

    # One row per day; days without an extracted entry are filled in.
    dt0 = pd.date_range(start="2021-04-01", end="2022-03-31")
    df1 = (
        df0.reindex(dt0, fill_value="収集なし")
        .reset_index()
        .rename({"index": "収集日", "kind": "収集区分"}, axis=1)
    )
    df1.to_csv("kumamoto.csv", index=False)