2021/03/19以降の一覧PDF
requirements.txt
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0319besshi.pdf https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0320_besshi.pdf https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0321besshi.pdf
ダウンロード
!wget -i requirements.txt -P download -w 1 --random-wait
import datetime
import pathlib
from typing import Optional

import pandas as pd

# Reference time used to infer the year of "month/day" strings: a date that
# would fall after this moment is assumed to belong to the previous year.
dt_now = datetime.datetime.now()


def load_tables(directory="download"):
    """Extract one DataFrame per PDF page from every ``*.pdf`` in *directory*.

    The first row of each extracted table is used as the header, the "№"
    column is renamed to "No" and becomes the index, and line breaks are
    removed from the column names.

    Pages on which pdfplumber finds no table are skipped.

    Returns:
        A list of DataFrames, one per page that contained a table.
    """
    # Imported lazily so the date helper below stays usable (and testable)
    # in environments where pdfplumber is not installed.
    import pdfplumber

    frames = []
    for path in pathlib.Path(directory).glob("*.pdf"):
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                table = page.extract_table()
                # extract_table() yields None when the page has no table;
                # indexing it would raise a TypeError.
                if not table:
                    continue
                frame = pd.DataFrame(table[1:], columns=table[0]).rename(
                    columns={"№": "No"}
                )
                frame.columns = frame.columns.str.replace("\n", "")
                frame.set_index("No", inplace=True)
                frames.append(frame)
    return frames


def _to_int_or_zero(value) -> int:
    """Return ``int(value)`` for a captured digit string, 0 for a missing match."""
    return int(value) if isinstance(value, str) else 0


def str2date(s: pd.Series, now: Optional[datetime.datetime] = None) -> pd.Series:
    """Convert strings containing ``M/D`` into full dates.

    The year is not present in the input, so it is inferred: the year of
    *now* is tried first, and if the resulting date lies after *now* the
    previous year is used instead.

    Args:
        s: Series of strings; the first ``M/D`` occurrence in each is used.
        now: Reference time. Defaults to the module-level ``dt_now``.

    Returns:
        A datetime Series aligned with *s*; entries without a valid
        ``M/D`` pattern become ``NaT``.
    """
    if now is None:
        now = dt_now

    extracted = s.str.extract(r"(\d{1,2})/(\d{1,2})")
    # Missing matches become month 0 / day 0, which to_datetime then coerces
    # to NaT.
    parts = pd.DataFrame(
        {
            "month": extracted[0].map(_to_int_or_zero),
            "day": extracted[1].map(_to_int_or_zero),
        },
        index=s.index,
    ).astype(int)

    parts["year"] = now.year
    in_current_year = pd.to_datetime(parts, errors="coerce")
    # Dates that would be in the future belong to the previous year.
    parts["year"] = parts["year"].mask(in_current_year > now, parts["year"] - 1)

    return pd.to_datetime(parts, errors="coerce")


if __name__ == "__main__":
    dfs = load_tables("download")

    # NOTE(review): the index holds the "No" values as extracted from the
    # PDF, so if they are strings this sort is lexicographic — confirm that
    # is the intended order.
    df = pd.concat(dfs).sort_index()

    # Trim whitespace, apply NFKC normalisation and drop embedded line breaks
    # in every object-typed column.
    for col in df.select_dtypes(include=object).columns:
        df[col] = (
            df[col].str.strip().str.normalize("NFKC").str.replace("\n", "", regex=True)
        )

    # Date conversion
    df["発症日YMD"] = str2date(df["発症日"])
    df["確認日YMD"] = str2date(df["陽性判明"])

    df.to_csv("sendai2.csv", encoding="utf_8_sig")