# Download the source PDF from mhlw.go.jp and save it as data.pdf,
# the file name the parsing step below opens.
wget --output-document=data.pdf https://www.mhlw.go.jp/content/10906000/000784439.pdf
import camelot
import pandas as pd


def str2date(s: pd.Series) -> pd.Series:
    """Convert strings containing ``YYYY年M月D日`` into datetimes.

    Args:
        s: Series of strings that may contain a Japanese-style date.

    Returns:
        A datetime Series; entries without a recognisable date are ``NaT``.
    """
    parts = (
        s.str.extract(r"(\d{4})年(\d{1,2})月(\d{1,2})日")
        .rename(columns={0: "year", 1: "month", 2: "day"})
        # Rows without a match are NaN; fill with 0 so the int cast succeeds
        # and let errors="coerce" below turn those rows into NaT.
        .fillna(0)
        .astype(int)
    )
    return pd.to_datetime(parts, errors="coerce")


tables = camelot.read_pdf("data.pdf", pages="2-15")

# Skip the first two rows of every extracted table
# (presumably header rows -- TODO confirm against the PDF).
dfs = [table.df.iloc[2:] for table in tables]

# Each slice restarts its row labels, so renumber them to keep the index unique.
df = pd.concat(dfs, ignore_index=True)

# ``set_axis(..., inplace=True)`` no longer exists in pandas 2.x; reassign.
df = df.set_axis(
    [
        "No",
        "年齢",
        "性別",
        "接種日",
        "発生日",
        "ロット番号",
        "接種回数",
        "基礎疾患等",
        "死因等",
        "報告医が死因等の判断に至った検査",
        "因果関係",
        "他要因の可能性の有無",
        "前回_因果関係評価",
        "前回_コメント",
        "現在_因果関係評価",
        "現在_コメント",
    ],
    axis=1,
)

# Remove "注<digit>" markers from the case number. ``regex=True`` must be
# explicit: pandas 2.x treats the pattern as a literal string by default.
df["No"] = df["No"].str.replace(r"注\d", "", regex=True)

df["接種日"] = str2date(df["接種日"])
df["発生日"] = str2date(df["発生日"])

# Nullable Int64 keeps rows whose age text has no "<digits>歳" as <NA>
# instead of raising during the integer cast.
df["年齢"] = pd.to_numeric(
    df["年齢"].str.extract(r"(\d+)歳", expand=False)
).astype("Int64")

# Strip a parenthesised note (ASCII or full-width parentheses) from the lot
# number. The earlier pattern "([\s\S]*)" was an unescaped capture group,
# which as a regex matches the entire cell rather than the parentheses.
lot_raw = df["ロット番号"]
df["ロット番号"] = (
    lot_raw.str.replace(r"[\((][\s\S]*[\))]", "", regex=True)
    .str.strip()
    .mask(lot_raw == "不明")  # "不明" cells become missing values
)

# Drop the trailing "回目" characters; "不明" cells become missing values.
dose_raw = df["接種回数"]
df["接種回数"] = dose_raw.str.rstrip("回目").mask(dose_raw == "不明")

df1 = df.loc[:, ["No", "年齢", "性別", "接種日", "発生日", "ロット番号", "接種回数"]]

# utf_8_sig writes a BOM at the start of the file.
df1.to_csv("output.csv", encoding="utf_8_sig", index=False)

# Bare expression kept from the original; it has no effect when run as a script.
df1