www.city.nagoya.jp
oku.edu.mie-u.ac.jp
togetter.com
PDFをXMLに変換し、top・leftの値で並べ替えて抽出する
完成したCSVファイル
drive.google.com
Gist
github.com
PDF変換ソフトをインストール・ダウンロード
# Install poppler-utils; the conversion cells below call its pdftohtml command.
!apt install poppler-utils
# Download the PDFBox 2.0.17 command-line app and save it as pdfbox-app.jar,
# the name the PDFSplit commands below expect.
# NOTE(review): this is a mirror URL for an old release - confirm it still resolves.
!wget https://www-eu.apache.org/dist/pdfbox/2.0.17/pdfbox-app-2.0.17.jar -O pdfbox-app.jar
PDFをXMLに変換 ※以下のうち、対象のPDFに対応するセルだけを実行
!wget http://www.city.nagoya.jp/kenkofukushi/cmsfiles/contents/0000088/88972/kaitodeta1.pdf -O data.pdf
!java -jar pdfbox-app.jar PDFSplit -split 155 data.pdf
!for i in {1..16}; do pdftohtml -q -xml data-${i}.pdf data-${i}.xml; done
n = 17
!wget http://www.city.nagoya.jp/kenkofukushi/cmsfiles/contents/0000088/88972/kaitodeta2.pdf -O data.pdf
!java -jar pdfbox-app.jar PDFSplit -split 155 data.pdf
!for i in {1..16}; do pdftohtml -q -xml data-${i}.pdf data-${i}.xml; done
n = 17
!wget http://www.city.nagoya.jp/kenkofukushi/cmsfiles/contents/0000088/88972/kaitodeta3.pdf -O data.pdf
!java -jar pdfbox-app.jar PDFSplit -split 155 data.pdf
!for i in {1..25}; do pdftohtml -q -xml data-${i}.pdf data-${i}.xml; done
n = 26
!wget http://www.city.nagoya.jp/kenkofukushi/cmsfiles/contents/0000088/88972/kaitodeta4.pdf -O data.pdf
!java -jar pdfbox-app.jar PDFSplit -split 155 data.pdf
!for i in {1..17}; do pdftohtml -q -xml data-${i}.pdf data-${i}.xml; done
n = 18
!wget http://www.city.nagoya.jp/kenkofukushi/cmsfiles/contents/0000088/88972/kaitodeta5.pdf -O data.pdf
!java -jar pdfbox-app.jar PDFSplit -split 175 data.pdf
!for i in {1..16}; do pdftohtml -q -xml data-${i}.pdf data-${i}.xml; done
n = 17
from xml.etree import ElementTree as ET
import csv
def _element_text(item):
    """Return the whitespace-stripped text of a <text> element.

    ``item.text`` is None when the element's content starts with a child
    tag (for example when the whole string is wrapped in a nested element),
    which made the original ``item.text.strip()`` raise AttributeError.
    Only in that case the text of all descendants is joined instead, so
    elements that already worked produce exactly the same value as before.
    """
    raw = item.text
    if raw is None:
        raw = "".join(item.itertext())
    return raw.strip()


# Collect one dict per non-empty <text> element of data-1.xml .. data-(n-1).xml.
# Each dict holds the element's XML attributes plus:
#   side - number of the split file the element came from
#   page - the "number" attribute of the enclosing <page>
#   text - the stripped text content
result = []
for i in range(1, n):
    root = ET.parse(f"data-{i}.xml").getroot()
    for page in root.findall("page"):
        for item in page.findall("text"):
            text = _element_text(item)
            if not text:
                # Whitespace-only element: show what was skipped, as before.
                print(item.text)
                continue
            # Copy the attributes so the parsed tree itself is not modified.
            row = dict(item.attrib)
            row["side"] = i
            row["text"] = text
            row["page"] = page.get("number")
            result.append(row)

# Write all collected rows to data.csv (UTF-8, header row included).
with open("data.csv", "w", newline="", encoding="utf-8") as fw:
    fieldnames = ["page", "side", "top", "left", "width", "height", "font", "text"]
    writer = csv.DictWriter(fw, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(result)
データ調整
import pandas as pd
# Load the text items extracted from the XML files.
df1 = pd.read_csv("data.csv")

# Discard the rows that are on page 1 with top < 510; keep everything else.
_on_first_page = df1["page"] == 1
_above_cutoff = df1["top"] < 510
df2 = df1[~(_on_first_page & _above_cutoff)]

# Keep only the columns needed to rebuild the table layout.
_layout_columns = ["page", "side", "top", "left", "text"]
df3 = df2.loc[:, _layout_columns]
df3

# Count the items per (side, left) pair; the result is displayed and saved
# to count.tsv so the horizontal positions can be inspected.
df_count = pd.pivot_table(df3, index=["side", "left"], aggfunc="count")
df_count

df_count.to_csv("count.tsv", sep="\t")
ずれ調整 ※以下のうち、対象のPDFに対応する調整だけを実行
# Manual corrections of the "left" coordinate, keyed by (side, left).
# Each statement rewrites one "left" value to a neighbouring value (off by one)
# for a single side, so that the items end up in the same pivot column.
#
# NOTE(review): according to the heading above, only the statements for the
# PDF being processed are meant to be run; where one PDF's group ends and the
# next begins is not marked in this listing - confirm against the notebook.
#
# DataFrame.append was removed in pandas 2.0, so the padding rows are added
# with pd.concat instead.  They are built as DataFrames with the same index
# labels and column order the appended Series had.
df3.loc[(df3["side"] == 8) & (df3["left"] == 383), "left"] = 382
df3.loc[(df3["side"] == 8) & (df3["left"] == 765), "left"] = 766
df3.loc[(df3["side"] == 12) & (df3["left"] == 646), "left"] = 647
df3.loc[(df3["side"] == 12) & (df3["left"] == 882), "left"] = 883
df3.loc[(df3["side"] == 14) & (df3["left"] == 648), "left"] = 647
df3.loc[(df3["side"] == 14) & (df3["left"] == 680), "left"] = 681
# Two rows with empty text on page 1, side 10, top 522 (left 600 and 800);
# presumably they force otherwise-missing columns to exist in the pivot.
_pad_rows = pd.DataFrame(
    [[1, 10, 522, 600, ""], [1, 10, 522, 800, ""]],
    columns=df3.columns,
    index=[9999998, 9999999],
)
df3 = pd.concat([df3, _pad_rows])
df3.loc[(df3["side"] == 9) & (df3["left"] == 383), "left"] = 382
df3.loc[(df3["side"] == 12) & (df3["left"] == 356), "left"] = 355
df3.loc[(df3["side"] == 12) & (df3["left"] == 857), "left"] = 856
df3.loc[(df3["side"] == 12) & (df3["left"] == 1125), "left"] = 1126
df3.loc[(df3["side"] == 13) & (df3["left"] == 560), "left"] = 559
df3.loc[(df3["side"] == 13) & (df3["left"] == 592), "left"] = 593
# One row with empty text on page 1, side 9, top 519, left 900.
_pad_row = pd.DataFrame(
    [[1, 9, 519, 900, ""]],
    columns=df3.columns,
    index=[9999999],
)
df3 = pd.concat([df3, _pad_row])
df3.loc[(df3["side"] == 24) & (df3["left"] == 739), "left"] = 740
df3.loc[(df3["side"] == 24) & (df3["left"] == 852), "left"] = 851
df3.loc[(df3["side"] == 24) & (df3["left"] == 884), "left"] = 885
df3.loc[(df3["side"] == 6) & (df3["left"] == 682), "left"] = 683
df3.loc[(df3["side"] == 15) & (df3["left"] == 651), "left"] = 652
df3.loc[(df3["side"] == 8) & (df3["left"] == 767), "left"] = 768
df3.loc[(df3["side"] == 11) & (df3["left"] == 356), "left"] = 355
df3.loc[(df3["side"] == 13) & (df3["left"] == 1027), "left"] = 1026
df3.loc[(df3["side"] == 13) & (df3["left"] == 1060), "left"] = 1059
df3.loc[(df3["side"] == 15) & (df3["left"] == 470), "left"] = 469
df3.loc[(df3["side"] == 15) & (df3["left"] == 502), "left"] = 503
TSVに変換
def _join_cell_texts(values):
    """Join all values that land in one pivot cell, separated by one space."""
    return " ".join(map(str, values))


# Rebuild the table: one output row per (page, top), one output column per
# (side, left); items sharing a cell are joined by _join_cell_texts.
df = pd.pivot_table(
    df3,
    values="text",
    index=["page", "top"],
    columns=["side", "left"],
    aggfunc=_join_cell_texts,
)

# Save the cell values only, without the index and without a header row.
df.to_csv("result.tsv", sep="\t", header=False, index=False)
TSV調整
import re
# Read the pivoted table back as plain text.  The encoding is given
# explicitly: pandas wrote result.tsv as UTF-8, while open() without an
# encoding uses the platform's locale default, which is not always UTF-8.
with open("result.tsv", encoding="utf-8") as fr:
    tsv_text = fr.read()

# A single space directly between two digits becomes a tab, splitting values
# that were joined into one cell back into separate cells.
pattern1 = re.compile(r"(?<=\d) (?=\d)")
tsv_text = pattern1.sub("\t", tsv_text)

# " <digit><TAB><TAB>" becomes "<TAB><digit><TAB>": a digit that follows a
# space and is followed by two tabs is moved into a cell of its own.
pattern2 = re.compile(r" (\d)\t\t")
tsv_text = pattern2.sub(r"\t\1\t", tsv_text)

# pdf_num is expected to hold the number (1-5) of the PDF being processed, so
# the output name matches the result-{i}.tsv files read by the merge step.
# Written as UTF-8 because that is what pandas.read_csv assumes by default.
with open(f"result-{pdf_num}.tsv", mode="w", encoding="utf-8") as fw:
    fw.write(tsv_text)
TSVファイルを結合
# Load result-1.tsv .. result-5.tsv without a header row, keeping every cell
# as an object (string) value.
dfs = [
    pd.read_csv(f"result-{i}.tsv", delimiter="\t", header=None, dtype=object)
    for i in range(1, 6)
]
df4 = pd.concat(dfs)

# Fold columns 275-277 into column 274 by string concatenation (missing
# cells count as empty strings), then remove the three folded columns.
_merged = df4[274].fillna("")
for _col in (275, 276, 277):
    _merged = _merged + df4[_col].fillna("")
df4[274] = _merged
df4 = df4.drop(columns=[275, 276, 277])

# Save the combined table as CSV without index and header.
df4.to_csv("result_all.csv", header=False, index=False)
追記
kaitodeta1.pdf
- 行2920列219を分解し220、221にそれぞれ入力 ※1
kaitodeta1.pdfとkaitodeta2.pdf
- 列275の結合について、3箇所は境界部分の文字が重なっているため、上記のCSVでは削除しています。