# 建設業許可番号から会社情報取得しJSON保存 (fetch company information by construction-business license number and save it as JSON)

import json

import requests
from bs4 import BeautifulSoup


def get_title(table, css):
    """Return the stripped text of every cell in the selected table rows.

    Args:
        table: Object exposing ``select``; the call sites in this file pass
            a BeautifulSoup table element.
        css: Selector fragment appended directly after ``tr`` (for example
            a class selector such as ``".re_summ_ev"``).

    Returns:
        A list with one string per matched ``td``, in document order.
    """
    selector = f"tbody > tr{css} > td"

    # Licensed-trade labels: one entry per cell.
    titles = []
    for cell in table.select(selector):
        titles.append(cell.get_text(strip=True))

    return titles


def get_data(table, css):
    """Return the integer value of every cell in the selected table rows.

    Args:
        table: Object exposing ``select``; the call sites in this file pass
            a BeautifulSoup table element.
        css: Selector fragment appended directly after ``tr``.

    Returns:
        A list with one int per matched ``td``; a cell whose stripped text
        is empty contributes ``0``.

    Raises:
        ValueError: If a non-empty cell text cannot be parsed by ``int``.
    """
    cells = table.select(f"tbody > tr{css} > td")

    # Lazily strip each cell so text extraction and conversion stay interleaved.
    texts = (cell.get_text(strip=True) for cell in cells)

    return [int(text) if text else 0 for text in texts]


def get_web(number, *, timeout=30):
    """Fetch the company record for a license number and save it as JSON.

    Posts to the ``ksGaiyo.do`` overview page, parses the company summary
    and the licensed-trade table, then requests one follow-up page per
    license-date link. When the overview page contains the office-tab image,
    the ``ksEigyo.do`` page is requested as well and its office rows are
    parsed. The collected dict is written to ``"<company>.json"`` (relative
    to the current working directory).

    Args:
        number: License number, sent as the ``sv_licenseNo`` parameter.
        timeout: Seconds passed as ``timeout`` to every HTTP request so a
            stalled server cannot block the script forever.

    Returns:
        None. Nothing is written when the first response is not OK or when
        the parsed company name is empty.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
        IndexError, ValueError: If the returned page does not contain the
            cells/tables this parser indexes into.
    """
    # NOTE(review): this endpoint uses http while the office endpoint below
    # uses https -- confirm whether both should be https.
    url = "http://etsuran.mlit.go.jp/TAKKEN/ksGaiyo.do"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
    }

    params = {"CMD": "", "sv_licenseNo": number, "caller": "KS"}

    with requests.Session() as s:

        r = s.post(url, headers=headers, params=params, timeout=timeout)

        # Nothing to parse unless the overview page came back OK.
        if r.status_code != requests.codes.ok:
            return

        soup = BeautifulSoup(r.content, "html5lib")

        result = {}

        # Company information: every text fragment of the summary cells,
        # flattened into one list and addressed by position below.
        tds = [
            t
            for td in soup.select("table.re_summ > tbody > tr > td")
            for t in td.stripped_strings
        ]

        # License number: first fragment split once on whitespace.
        result["auth"], result["lic_num"] = tds[0].split(None, 1)

        # Trade name
        result["company"] = tds[2]

        # Trade name (reading)
        result["company_yomi"] = tds[1]

        # Representative's name
        result["name"] = tds[4]

        # Representative's name (reading)
        result["name_yomi"] = tds[3]

        # Postal code
        result["postal_code"] = tds[5]

        # Address (two consecutive fragments joined)
        result["address"] = tds[6] + tds[7]

        # Telephone number
        result["tel"] = tds[8]

        # Licensed trades (all)
        tables = soup.select("table.re_summ_3")

        # Trade names
        result["const"] = get_title(tables[0], ".re_summ_ev")

        # License types
        result["license"] = get_data(tables[0], ".re_summ_odd")

        # Switch the command parameter for the per-date requests.
        params["CMD"] = "init"

        parmit = []

        for i in soup.select("table.re_summ_4 > tbody > tr > td > a"):

            # License date (link text)
            date = i.get_text(strip=True)

            params["licenseDay"] = date

            # Fetch the page for this license date.
            d_res = s.post(url, headers=headers, params=params, timeout=timeout)

            # Dates whose request fails are skipped.
            if d_res.status_code != requests.codes.ok:
                continue

            d_soup = BeautifulSoup(d_res.content, "html5lib")

            # Append the trade list taken from the second re_summ_3 table.
            parmit.append(
                {
                    date: get_data(
                        d_soup.select("table.re_summ_3")[1], ".re_summ_odd"
                    )
                }
            )

        result["license_day"] = parmit

        # Offices: only requested when the office-tab image is on the page.
        if soup.find("img", src="/TAKKEN/images/btn_tab_office_off.png"):

            params["licenseDay"] = ""

            o_res = s.post(
                "https://etsuran.mlit.go.jp/TAKKEN/ksEigyo.do",
                headers=headers,
                params=params,
                timeout=timeout,
            )

            # Check the status like the other requests do, so an error page
            # is never parsed as an office list.
            if o_res.status_code == requests.codes.ok:

                o_soup = BeautifulSoup(o_res.content, "html5lib")

                o_list = []

                # The first row is skipped.
                for j in o_soup.select("table.re_office > tbody > tr")[1:]:

                    office = {}

                    # Direct children only, so cells of the nested table
                    # are not picked up.
                    o_tds = j.find_all("td", recursive=False)

                    office["id"] = o_tds[0].get_text(strip=True)
                    office["name"], office["tel"] = o_tds[1].stripped_strings
                    office["postal_code"], office["address"] = o_tds[2].stripped_strings

                    office["license"] = get_data(
                        o_tds[3].select_one("table.re_office3"), ":nth-of-type(2)"
                    )

                    o_list.append(office)

                result["office"] = o_list

        if result["company"]:

            filename = result["company"] + ".json"

            # json.dump escapes non-ASCII by default; the explicit encoding
            # keeps the file independent of the platform default.
            with open(filename, "w", encoding="utf-8") as fw:
                json.dump(result, fw)


if __name__ == "__main__":

    # Ask the user for the license number to look up.
    number = input("許可番号:")

    # Reject anything that is not exactly eight decimal digits.
    if len(number) != 8 or not number.isdecimal():
        print("エラー")

    else:
        get_web(number)