# Download the source PDF and save it locally as list.pdf.
!wget https://www.city.uwajima.ehime.jp/uploaded/attachment/24937.pdf -O list.pdf
# Download the PDFBox 2.0.17 command-line application jar as pdfbox-app.jar.
# NOTE(review): this is a mirror URL pinned to one release; verify it still
# resolves before relying on it.
!wget https://www-eu.apache.org/dist/pdfbox/2.0.17/pdfbox-app-2.0.17.jar -O pdfbox-app.jar
# Run PDFBox ExtractText on list.pdf, starting at page 2, with the -sort option
# and UTF-8 encoding.
# NOTE(review): no output file is named here; presumably PDFBox writes list.txt
# next to the PDF, since the Python code that follows opens "list.txt" -- confirm.
!java -jar pdfbox-app.jar ExtractText -startPage 2 -sort -encoding UTF-8 list.pdf
import csv
import re
# A complete record line: shop name, an address beginning with "宇和島市",
# and a phone number that starts with one of the listed prefixes.
reg1 = re.compile(r"^(.+)\s(宇和島市.+)\s((089|070|080|090|0120)(.+))$")
# A record line that carries a shop name and an address but no phone number.
reg2 = re.compile(r"^(.+)\s(宇和島市.+)$")


def parse_rows(lines):
    """Yield TSV rows parsed from the extracted text lines.

    The first row yielded is always the header.  Each following row is
    ``[genre, shop name, address, phone]``.  A line that matches neither
    record pattern is treated as a wrapped continuation of the previous
    record: its first chunk is appended to the shop name and, when the line
    contains a run of two or more whitespace characters, the remainder is
    appended to the address.

    Args:
        lines: Iterable of text lines (trailing newlines are allowed).

    Yields:
        Lists of four strings, header first.
    """
    yield ["業種", "店舗名", "住所", "電話番号"]

    genre = ""
    # The most recent record is held back until the next record (or the end
    # of input) so that continuation lines can still be merged into it.
    pending = None

    for raw in lines:
        line = raw.rstrip()

        # Blank lines carry no data; without this guard they would add a
        # stray trailing space to the pending shop name.
        if not line:
            continue

        # Page titles, "as of" date lines, column headers and page footers.
        if line.endswith(("取扱店一覧", "現在", "電話番号", "ページ")):
            continue

        # Section heading such as "【...】" sets the genre for later records.
        if line.startswith("【") and line.endswith("】"):
            genre = line.strip("【】")
            continue

        m1 = reg1.search(line)
        m2 = None if m1 else reg2.search(line)

        if m1 or m2:
            if m1:
                fields = [m1.group(i).strip() for i in range(1, 4)]
            else:
                # No phone number on this line: leave the column empty.
                fields = [m2.group(i).strip() for i in range(1, 3)] + [""]
            if pending is not None:
                yield pending
            pending = [genre] + fields
            continue

        # Continuation line.  Ignore it when no record has been seen yet,
        # otherwise it would be glued onto the header row.
        if pending is None:
            continue

        parts = re.split(r"\s{2,}", line.strip(), maxsplit=1)
        pending[1] += " " + parts[0]
        if len(parts) == 2:
            pending[2] += " " + parts[1]
        # A wrapped "(...)" suffix should attach directly to the text before it.
        pending[1] = pending[1].replace(" (", "(")
        pending[2] = pending[2].replace(" (", "(")

    # Flush the last record once the input is exhausted.
    if pending is not None:
        yield pending


def convert_list(src_path="list.txt", dst_path="result.tsv"):
    """Convert the extracted text file into a tab-separated file.

    Args:
        src_path: Path of the UTF-8 text file to read.
        dst_path: Path of the UTF-8 TSV file to write.
    """
    # Open the input first so a missing input does not truncate an existing
    # output file.
    with open(src_path, mode="rt", encoding="utf-8") as fr:
        # newline="" lets the csv writer control line endings itself.
        with open(dst_path, "w", encoding="utf-8", newline="") as fw:
            writer = csv.writer(fw, dialect=csv.excel_tab, lineterminator="\n")
            writer.writerows(parse_rows(fr))


if __name__ == "__main__":
    convert_list()