PDFの行ずれをグループで結合

qiita.com

!pip install camelot
!pip install pikepdf

import camelot
import pandas as pd
import pikepdf

# Remove the protection: open the source PDF with pikepdf and write it back
# out as "output.pdf", which is the file parsed in the following steps.
with pikepdf.open("data.pdf") as protected_pdf:
    protected_pdf.save("output.pdf")

# Parse the unlocked PDF with the "stream" flavor.
# strip_text lists the characters removed from every cell: spaces, periods
# and newlines.
# NOTE(review): periods are stripped too, so decimal points in numeric cells
# would be lost -- confirm this is intended for the target PDF.
tables = camelot.read_pdf(
    "output.pdf",
    flavor="stream",
    strip_text=" .\n",
)

# Rows of the first detected table, as a list of lists of cell strings.
first_table_rows = tables[0].data

# Title: the cells of the first row joined into one string.
title = "".join(first_table_rows[0])

# Show how many tables were detected.
print(tables.n)

# When several tables are detected, raise the index used above:
# tables[1], tables[2], ...
# Every row after the title row becomes the raw DataFrame.
df0 = pd.DataFrame(first_table_rows[1:])

# Export the raw table (utf_8_sig writes a BOM-prefixed UTF-8 file).
df0.to_csv("result.csv", encoding="utf_8_sig")

# Optional post-processing from here on.

# Subtitle: every cell of the first column concatenated into one string.
subtitle = df0[0].str.cat(sep="")

# Work on the remaining columns. Take an explicit copy so that adding the
# "grp" column below does not write into a slice of df0.
df1 = df0.iloc[:, 1:].copy()

# A row containing an empty cell is treated as a fragment of the row below
# it, so both must share a group id. A new group therefore starts only when
# the *previous* row had no empty cell.
has_blank = (df1 == "").any(axis=1)
# shift(fill_value=False) keeps the Series boolean. shift() followed by
# fillna() goes through object dtype, where `~` can end up acting as integer
# bitwise NOT (-1 / -2) and break the cumulative group numbering.
starts_group = ~has_blank.shift(1, fill_value=False)
df1["grp"] = starts_group.cumsum()

# Notebook display of the grouped frame for inspection.
df1

# Merge each group into one row by concatenating the cell strings per column.
df2 = df1.groupby("grp").agg("".join)

# NOTE: this writes to the same "result.csv" as the raw export above and
# overwrites it.
df2.to_csv("result.csv", encoding="utf_8_sig")