子宮頸がん予防接種調査の結果のPDFをCSV化

www.city.nagoya.jp

oku.edu.mie-u.ac.jp

togetter.com

PDFをXMLに変換し、TOP・LEFTで並び替えて抽出する

完成したCSVファイル

drive.google.com

Gist

github.com

PDF変換ソフトをインストール・ダウンロード

# Install poppler-utils, which provides the pdftohtml command used below.
# -y answers apt's confirmation prompt so the cell cannot stall waiting for input.
!apt install -y poppler-utils

# Download the PDFBox 2.0.17 command-line app as pdfbox-app.jar.
# Superseded releases are removed from the www-eu.apache.org/dist mirror;
# archive.apache.org keeps every release permanently.
!wget https://archive.apache.org/dist/pdfbox/2.0.17/pdfbox-app-2.0.17.jar -O pdfbox-app.jar

PDFをXMLに変換 ※目的のファイルを実行

# kaitodeta1.pdf
# Download the source PDF and save it locally as data.pdf.
!wget http://www.city.nagoya.jp/kenkofukushi/cmsfiles/contents/0000088/88972/kaitodeta1.pdf -O data.pdf

# Split data.pdf into pieces with PDFBox PDFSplit (-split 155);
# the pieces are consumed below as data-<i>.pdf.
!java -jar pdfbox-app.jar PDFSplit -split 155 data.pdf

# Convert pieces data-1.pdf .. data-16.pdf to XML (data-<i>.xml) with pdftohtml.
!for i in {1..16}; do pdftohtml -q -xml data-${i}.pdf data-${i}.xml; done

# Exclusive upper bound for range(1, n) in the XML-to-CSV step (16 pieces + 1).
n = 17
# kaitodeta2.pdf
# Download the source PDF and save it locally as data.pdf.
!wget http://www.city.nagoya.jp/kenkofukushi/cmsfiles/contents/0000088/88972/kaitodeta2.pdf -O data.pdf

# Split data.pdf into pieces with PDFBox PDFSplit (-split 155);
# the pieces are consumed below as data-<i>.pdf.
!java -jar pdfbox-app.jar PDFSplit -split 155 data.pdf

# Convert pieces data-1.pdf .. data-16.pdf to XML (data-<i>.xml) with pdftohtml.
!for i in {1..16}; do pdftohtml -q -xml data-${i}.pdf data-${i}.xml; done

# Exclusive upper bound for range(1, n) in the XML-to-CSV step (16 pieces + 1).
n = 17
# kaitodeta3.pdf
# Download the source PDF and save it locally as data.pdf.
!wget http://www.city.nagoya.jp/kenkofukushi/cmsfiles/contents/0000088/88972/kaitodeta3.pdf -O data.pdf

# Split data.pdf into pieces with PDFBox PDFSplit (-split 155);
# the pieces are consumed below as data-<i>.pdf.
!java -jar pdfbox-app.jar PDFSplit -split 155 data.pdf

# Convert pieces data-1.pdf .. data-25.pdf to XML (data-<i>.xml) with pdftohtml.
!for i in {1..25}; do pdftohtml -q -xml data-${i}.pdf data-${i}.xml; done

# Exclusive upper bound for range(1, n) in the XML-to-CSV step (25 pieces + 1).
n = 26
# kaitodeta4.pdf
# Download the source PDF and save it locally as data.pdf.
!wget http://www.city.nagoya.jp/kenkofukushi/cmsfiles/contents/0000088/88972/kaitodeta4.pdf -O data.pdf

# Split data.pdf into pieces with PDFBox PDFSplit (-split 155);
# the pieces are consumed below as data-<i>.pdf.
!java -jar pdfbox-app.jar PDFSplit -split 155 data.pdf

# Convert pieces data-1.pdf .. data-17.pdf to XML (data-<i>.xml) with pdftohtml.
!for i in {1..17}; do pdftohtml -q -xml data-${i}.pdf data-${i}.xml; done

# Exclusive upper bound for range(1, n) in the XML-to-CSV step (17 pieces + 1).
n = 18
# kaitodeta5.pdf
# Download the source PDF and save it locally as data.pdf.
!wget http://www.city.nagoya.jp/kenkofukushi/cmsfiles/contents/0000088/88972/kaitodeta5.pdf -O data.pdf

# Split data.pdf into pieces with PDFBox PDFSplit; note this file uses
# -split 175 rather than the 155 used for the other PDFs.
!java -jar pdfbox-app.jar PDFSplit -split 175 data.pdf

# Convert pieces data-1.pdf .. data-16.pdf to XML (data-<i>.xml) with pdftohtml.
!for i in {1..16}; do pdftohtml -q -xml data-${i}.pdf data-${i}.xml; done

# Exclusive upper bound for range(1, n) in the XML-to-CSV step (16 pieces + 1).
n = 17

XMLからCSVに変換

from xml.etree import ElementTree as ET
import csv

# One dict per non-empty <text> element found in data-1.xml .. data-(n-1).xml.
# Each dict carries the element's own attributes plus "side" (index of the
# split piece), "page" (the page's number attribute) and "text".
result = []

for i in range(1, n):

    root = ET.parse(f"data-{i}.xml").getroot()

    for page in root.findall("page"):

        page_number = page.get("number")

        for item in page.findall("text"):

            # item.text is None when the element starts with child markup
            # (e.g. <text><b>...</b></text>), which would make .strip() raise;
            # itertext() also picks up text nested inside such children.
            text = "".join(item.itertext()).strip()

            if not text:
                # Whitespace-only element: show it for inspection and skip it.
                print(item.text)
                continue

            # Copy the attributes so the parsed tree itself is left untouched.
            row = dict(item.attrib)
            row["side"] = i
            row["text"] = text
            row["page"] = page_number
            result.append(row)

with open("data.csv", "w", newline="", encoding="utf-8") as fw:

    fieldnames = ["page", "side", "top", "left", "width", "height", "font", "text"]

    writer = csv.DictWriter(fw, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(result)

データ調整

import pandas as pd

# Load the per-text-element rows written by the XML-to-CSV step.
df1 = pd.read_csv("data.csv")

# Remove the header area of the first page (page 1, top < 510).
first_page_header = (df1["page"] == 1) & (df1["top"] < 510)
df2 = df1.drop(df1.loc[first_page_header].index)

# Keep only page, side, vertical/horizontal position and the text itself.
wanted_columns = ["page", "side", "top", "left", "text"]
df3 = df2.loc[:, wanted_columns]

df3

# Count the rows found at each X coordinate, per side.
df_count = df3.pivot_table(aggfunc="count", index=["side", "left"])

## Inspect the counts to spot X coordinates that are slightly off.
df_count
df_count.to_csv("count.tsv", sep="\t")

ずれ調整 ※目的のファイルを実行

# kaitodeta1.pdf
# Snap slightly shifted X coordinates onto the column they belong to:
# (side, observed left, corrected left).
for _side, _old_left, _new_left in (
    (8, 383, 382),
    (8, 765, 766),
    (12, 646, 647),
    (12, 882, 883),
    (14, 648, 647),
    (14, 680, 681),
):
    df3.loc[(df3["side"] == _side) & (df3["left"] == _old_left), "left"] = _new_left

# DataFrame.append was removed in pandas 2.0, so the placeholder rows are added
# with pd.concat. infer_objects() keeps page/side/top/left as integers, as
# append did, instead of turning them into object columns.

# Placeholder row with empty text so that an empty column is created at
# column 220 (side 10, left 600).
s1 = pd.Series([1, 10, 522, 600, ""], index=df3.columns, name=9999998)
df3 = pd.concat([df3, s1.to_frame().T.infer_objects()])

# Placeholder row with empty text so that an empty column is created at
# column 223 (side 10, left 800).
s2 = pd.Series([1, 10, 522, 800, ""], index=df3.columns, name=9999999)
df3 = pd.concat([df3, s2.to_frame().T.infer_objects()])
# kaitodeta2.pdf
# Snap slightly shifted X coordinates onto the column they belong to:
# (side, observed left, corrected left).
for _side, _old_left, _new_left in (
    (9, 383, 382),
    (12, 356, 355),
    (12, 857, 856),
    (12, 1125, 1126),
    (13, 560, 559),
    (13, 592, 593),
):
    df3.loc[(df3["side"] == _side) & (df3["left"] == _old_left), "left"] = _new_left

# Placeholder row with empty text so that an empty column is created at
# column 220 (side 9, left 900).
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead;
# infer_objects() keeps page/side/top/left as integers, as append did.
s1 = pd.Series([1, 9, 519, 900, ""], index=df3.columns, name=9999999)
df3 = pd.concat([df3, s1.to_frame().T.infer_objects()])
# kaitodeta3.pdf
# Snap slightly shifted X coordinates onto the column they belong to:
# (side, observed left, corrected left).
for _side, _old_left, _new_left in (
    (24, 739, 740),
    (24, 852, 851),
    (24, 884, 885),
):
    _shifted = (df3["side"] == _side) & (df3["left"] == _old_left)
    df3.loc[_shifted, "left"] = _new_left
# kaitodeta4.pdf
# Snap slightly shifted X coordinates onto the column they belong to:
# (side, observed left, corrected left).
for _side, _old_left, _new_left in (
    (6, 682, 683),
    (15, 651, 652),
):
    _shifted = (df3["side"] == _side) & (df3["left"] == _old_left)
    df3.loc[_shifted, "left"] = _new_left
# kaitodeta5.pdf
# Snap slightly shifted X coordinates onto the column they belong to:
# (side, observed left, corrected left).
for _side, _old_left, _new_left in (
    (8, 767, 768),
    (11, 356, 355),
    (13, 1027, 1026),
    (13, 1060, 1059),
    (15, 470, 469),
    (15, 502, 503),
):
    _shifted = (df3["side"] == _side) & (df3["left"] == _old_left)
    df3.loc[_shifted, "left"] = _new_left

TSVに変換

def _join_cell_texts(values):
    """Join every text that falls into one pivot cell with single spaces."""
    return " ".join(map(str, values))


# Arrange the texts on a grid: one row per (page, top), one column per
# (side, left).
df = df3.pivot_table(
    values="text",
    index=["page", "top"],
    columns=["side", "left"],
    aggfunc=_join_cell_texts,
)

# Write the grid out as tab-separated text without row or column labels.
df.to_csv("result.tsv", sep="\t", header=False, index=False)

TSV調整

import re

# result.tsv was written by pandas, which defaults to UTF-8. Read it back with
# the same explicit encoding so the Japanese text is not decoded with the
# platform's locale codec.
with open("result.tsv", encoding="utf-8") as fr:
    tsv_text = fr.read()

# "digit space digit" -> "digit tab digit"
pattern1 = re.compile(r"(?<=\d) (?=\d)")
tsv_text = pattern1.sub("\t", tsv_text)

# "space digit tab tab" -> "tab digit tab"
pattern2 = re.compile(r" (\d)\t\t")
tsv_text = pattern2.sub(r"\t\1\t", tsv_text)

# Run for kaitodeta1.pdf only (see note ※1 in the appendix)
# tsv_text = re.sub(r"0\t\[04\] 月経量の異常による低血圧\t0\t\t00", r"0\t[04]\t月経量の異常による低血圧\t0\t00", tsv_text)

# NOTE(review): pdf_num is not assigned anywhere in this snippet; presumably it
# must be set to the number (1-5) of the PDF being processed before this runs.
# Written as UTF-8 because the merge step reads result-<i>.tsv with
# pd.read_csv, whose default encoding is UTF-8.
with open(f"result-{pdf_num}.tsv", mode="w", encoding="utf-8") as fw:
    fw.write(tsv_text)

TSVファイルを結合

# Read result-1.tsv .. result-5.tsv as headerless, all-object tables.
dfs = [
    pd.read_csv(f"result-{file_no}.tsv", sep="\t", header=None, dtype=object)
    for file_no in range(1, 6)
]

df4 = pd.concat(dfs)

# Column 275 (question 5_2, "other" reason for stopping) is spread over
# positions 274-277: concatenate the pieces, treating missing cells as empty.
merged_reason = df4[274].fillna("")
for extra_col in (275, 276, 277):
    merged_reason = merged_reason + df4[extra_col].fillna("")
df4[274] = merged_reason

# The pieces at 275-277 are now redundant, so drop them.
df4 = df4.drop(columns=[275, 276, 277])

# Write the final result without row or column labels.
df4.to_csv("result_all.csv", header=False, index=False)

追記

kaitodeta1.pdf

  • 行2920列219を分解し220、221にそれぞれ入力 ※1

kaitodeta1.pdfとkaitodeta2.pdf

  • 列275の結合では、3箇所で境界部分の文字が重なっているため、上記のCSVではその部分を削除しています。