令和3年度県立高等学校学科別入学志願者数(志願変更後)のPDFをCSVに変換する
# Download the source PDF and save it locally as data.pdf.
# The leading "!" is IPython/Jupyter shell-escape syntax, so this line only
# runs inside a notebook (or IPython), not under plain `python`.
!wget https://ehime-c.esnet.ed.jp/koukou/nyuusi/r02nyuusi/r02isi_ato/atozenniti.pdf -O data.pdf
import pdfplumber
import pandas as pd

# Column names of the output table, in order. These strings become the CSV
# header and are used as lookup keys below, so they must stay exactly as-is.
COLUMN_NAMES = [
    "学校名",
    "本校・分校",
    "学科名",
    "定員",
    "入学志願者数",
    "推薦",
    "倍率",
]

# Columns holding integers that may be printed with thousands separators.
INT_COLUMNS = ("定員", "入学志願者数", "推薦")

# Read one table per page; the first extracted row is used as the header.
dfs = []
with pdfplumber.open("data.pdf") as pdf:
    for page in pdf.pages:
        table = page.extract_table()
        # extract_table() returns None when no table is found on a page;
        # skip such pages instead of failing on table[1:].
        if not table:
            continue
        dfs.append(pd.DataFrame(table[1:], columns=table[0]))

df = pd.concat(dfs)

# Each extracted row is treated as two records side by side (columns 0-6 and
# columns 7 onward; presumably the PDF prints two column groups per page).
# Stack the right half under the left half, then keep only rows that have at
# least 5 non-missing cells.
# NOTE: pd.concat aligns on column labels, so this relies on both halves
# carrying the same header labels.
df = pd.concat([df.iloc[:, :7], df.iloc[:, 7:]]).dropna(thresh=5)

# DataFrame.set_axis(..., inplace=True) was removed in pandas 2.0; assigning
# to .columns renames the columns on every pandas version.
df.columns = COLUMN_NAMES

# Normalise every missing marker (e.g. None) to NaN.
df = df.mask(df.isna())

# A blank school name inherits the last one seen above it.
# (Series.ffill() replaces the deprecated fillna(method="ffill").)
df["学校名"] = df["学校名"].ffill()

# Strip thousands separators before converting the count columns to int.
for col in INT_COLUMNS:
    df[col] = df[col].str.replace(",", "").astype(int)

df["倍率"] = df["倍率"].astype(float)

# Forward-fill the main/branch-campus column within each school only, so a
# value never leaks from one school into the next.
df["本校・分校"] = df.groupby("学校名")["本校・分校"].ffill()

# Renumber rows from 1 and write the result; the index is written as the
# first CSV column.
df.reset_index(drop=True, inplace=True)
df.index += 1

df.to_csv("data.csv")