宇和島市プレミアム付商品券が使えるお店をスクレイピング

# Download the PDF listing the participating stores from the Uwajima City site.
!wget https://www.city.uwajima.ehime.jp/uploaded/attachment/24937.pdf -O list.pdf
# Download the PDFBox command-line app. Superseded releases such as 2.0.17 are
# served from archive.apache.org rather than the regular download mirrors.
!wget https://archive.apache.org/dist/pdfbox/2.0.17/pdfbox-app-2.0.17.jar -O pdfbox-app.jar

# Extract the text from page 2 onwards, position-sorted, as UTF-8. The output
# file is named explicitly because the parsing script reads "list.txt".
!java -jar pdfbox-app.jar ExtractText -startPage 2 -sort -encoding UTF-8 list.pdf list.txt
import csv
import re


# Store row with a phone number: "<name> <address starting with 宇和島市> <phone>".
reg1 = re.compile(r"^(.+)\s(宇和島市.+)\s((089|070|080|090|0120)(.+))$")
# Store row without a phone number: "<name> <address starting with 宇和島市>".
reg2 = re.compile(r"^(.+)\s(宇和島市.+)$")

# Column titles written as the first row of the TSV.
HEADER = ["業種", "店舗名", "住所", "電話番号"]

# Lines ending with one of these are page titles, dates, table headers or
# page numbers, not store data.
SKIP_SUFFIXES = ("取扱店一覧", "現在", "電話番号", "ページ")


def _tidy(row):
    """Remove the space in front of an opening parenthesis in name and address."""
    row[1] = row[1].replace(" (", "(")
    row[2] = row[2].replace(" (", "(")


def parse_lines(lines):
    """Yield ``[genre, name, address, phone]`` rows parsed from text lines.

    Args:
        lines: Iterable of text lines (trailing newlines are allowed).

    Yields:
        One list of four strings per store. ``phone`` is ``""`` when the line
        carries no phone number. Lines that match neither store pattern are
        treated as wrapped continuations of the previous store: their first
        column is appended to the name and, when present, their second column
        (separated by two or more whitespace characters) to the address.
    """
    genre = ""
    pending = None  # store row still open for continuation lines

    for raw in lines:
        line = raw.rstrip()

        # Blank lines carry no data; without this guard they would be treated
        # as continuations and append a stray space to the previous name.
        if not line:
            continue

        # Excluded lines (titles, dates, table header, page numbers).
        if line.endswith(SKIP_SUFFIXES):
            continue

        # Genre heading such as "【...】".
        if line.startswith("【") and line.endswith("】"):
            genre = line.strip("【】")
            continue

        fields = None
        with_phone = reg1.search(line)
        if with_phone:
            fields = [with_phone.group(i).strip() for i in range(1, 4)]
        else:
            without_phone = reg2.search(line)
            if without_phone:
                # Leave the phone column empty.
                fields = [without_phone.group(i).strip() for i in range(1, 3)]
                fields.append("")

        if fields is not None:
            # A new store starts: the previous one is complete.
            if pending is not None:
                yield pending
            pending = [genre] + fields
        elif pending is not None:
            # Continuation of a wrapped name and/or address.
            parts = re.split(r"\s{2,}", line.strip(), maxsplit=1)
            pending[1] += " " + parts[0]
            if len(parts) == 2:
                pending[2] += " " + parts[1]
        else:
            # Stray text before the first store: there is nothing to attach
            # it to, and it must not be glued onto the header row.
            continue

        _tidy(pending)

    if pending is not None:
        yield pending


def main(src="list.txt", dst="result.tsv"):
    """Convert the extracted text file ``src`` into the TSV file ``dst``.

    Args:
        src: Path of the UTF-8 text file to parse.
        dst: Path of the UTF-8 TSV file to write; the first row is the header.

    Raises:
        OSError: If ``src`` cannot be read or ``dst`` cannot be written.
    """
    # Open the input first so a missing source does not truncate an existing
    # result file. newline="" keeps the explicit "\n" terminator untouched on
    # every platform, as the csv module expects.
    with open(src, mode="rt", encoding="utf-8") as fr, open(
        dst, "w", encoding="utf-8", newline=""
    ) as fw:
        writer = csv.writer(fw, dialect=csv.excel_tab, lineterminator="\n")
        writer.writerow(HEADER)
        writer.writerows(parse_lines(fr))


if __name__ == "__main__":
    main()