PDFのURLのスクレイピングは省略してダウンロード
PDFファイルをダウンロード
wget https://www.pref.aichi.jp/uploaded/attachment/328890.pdf -O data.pdf
!apt install python3-tk ghostscript
!pip install camelot-py[cv]
import datetime
import re

import pandas as pd
import camelot

# Extract the tables from all pages ("1-end") of the downloaded PDF.
# strip_text="\n" removes newline characters from the extracted cell text.
tables = camelot.read_pdf(
    "data.pdf",
    pages="1-end",
    split_text=True,
    strip_text="\n",
    line_scale=40,
)

# Stack the per-table frames and write them without pandas' own index and
# header; the read-back below takes its column names from the first CSV row.
df_csv = pd.concat([table.df for table in tables])
df_csv.to_csv("data.csv", index=False, header=False)


def my_parser(s):
    """Convert a month/day string into a ``pd.Timestamp``.

    The string must contain exactly two runs of one or two digits, read as
    month then day. It carries no year, so the year is inferred: the current
    year is used unless that would place the date after today, in which case
    the previous year is used.

    Args:
        s: Text holding the month and day numbers.

    Returns:
        A ``pd.Timestamp`` for the inferred year, month and day.

    Raises:
        ValueError: If ``s`` does not contain exactly two digit groups, or
            if they do not form a valid calendar date in the inferred year.
    """
    today = datetime.datetime.now()
    month, day = map(int, re.findall(r"[0-9]{1,2}", s))
    year = today.year
    # NOTE: only a one-year window can be disambiguated; a month/day later
    # than today is assumed to belong to the previous year.
    if (month, day) > (today.month, today.day):
        year -= 1
    return pd.Timestamp(year=year, month=month, day=day)


# ``date_parser=`` is deprecated in pandas >= 2.0, so the date column is read
# as text and converted afterwards; missing cells are left untouched.
df = pd.read_csv("data.csv")
df["発表日"] = df["発表日"].map(my_parser, na_action="ignore")
# Same effect as ``index_col=0``: the first CSV column becomes the index.
df = df.set_index(df.columns[0])
df