import sys
import camelot
import pandas as pd
# Extract every table from the published PDF with camelot and dump the
# combined result to stdout as CSV.
source_url = "http://www.pref.saitama.lg.jp/a0001/news/page/2020/documents/021125-0902.pdf"

# pages="all" parses the whole document; strip_text lists the characters
# (space and newline) that camelot removes from cell text.
tables = camelot.read_pdf(source_url, pages="all", strip_text=" \n")

# Transpose each table and discard the first two rows of the transposed
# frame (i.e. the first two columns of the table as extracted).
dfs = []
for table in tables:
    transposed = table.df.T
    dfs.append(transposed.iloc[2:])

# Stack the per-table frames and renumber the rows from zero.
df = pd.concat(dfs)
df = df.reset_index(drop=True)

# Emit bare data only: no index column and no header row.
df.to_csv(sys.stdout, index=False, header=False)
# --- pdfplumber ---
# Setup (run these in a shell before executing the script below):
#   pip install pdfplumber
#   wget http://www.pref.saitama.lg.jp/a0001/news/page/2020/documents/021125-0902.pdf -O data.pdf
import sys
import pdfplumber
import pandas as pd
# Extract the table from every page of the local file data.pdf with
# pdfplumber and dump the combined result to stdout as CSV.
with pdfplumber.open("data.pdf") as pdf:
    # Per page: extracted table -> DataFrame, transposed, with the first two
    # rows of the transposed frame dropped. The list is fully built inside
    # the `with` block, so the PDF can be closed before further processing.
    dfs = [pd.DataFrame(page.extract_table()).T.iloc[2:] for page in pdf.pages]

# Remove every whitespace character from the cells. The pattern is a raw
# string so the backslash reaches the regex engine untouched instead of
# being treated as an (invalid) string escape sequence.
df = pd.concat(dfs).replace(r"\s", "", regex=True)

# Emit bare data only: no index column and no header row.
df.to_csv(sys.stdout, index=False, header=False)