import time
import urllib.parse

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Common request headers; the site expects a browser-like User-Agent.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}


def fetch_soup(url, parser="html.parser"):
    """Fetch a URL and return a parsed BeautifulSoup object."""
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    return BeautifulSoup(r.content, parser)


def get_url(page=1):
    """Build the search-result URL for the given page number."""
    return (
        "http://carinf.mlit.go.jp/jidosha/carinf/opn/search.html"
        "?selCarTp=1&lstCarNo=000&txtFrDat=1000/01/01&txtToDat=9999/12/31"
        f"&txtNamNm=&txtMdlNm=&txtEgmNm=&chkDevCd=&page={page}"
    )


def gen_url(page):
    """Yield the search-result URLs for pages 1 through `page`."""
    for i in range(1, page + 1):
        yield get_url(i)


# Find the link behind the "最後のページ" ("last page") image on page 1 and
# read its `page` query parameter to learn how many pages exist.
soup = fetch_soup(get_url())
link = soup.select_one(
    'div.ContainerRight > ul > li > a > img[alt="最後のページ"]'
).parent.get("href")
qs = urllib.parse.urlparse(link).query
qs_d = urllib.parse.parse_qs(qs)
last_page = int(qs_d["page"][0])

# Scrape the result table on every page, building one DataFrame per page.
# The first table row holds the column headers; the rest are data rows.
dfs = []
for url in gen_url(last_page):
    soup = fetch_soup(url)
    data = []
    for tr in soup.select("table.tablecar > * > tr"):
        d = []
        for td in tr.select("th,td"):
            d.extend(list(td.stripped_strings))
        data.append(d)
    df_tmp = pd.DataFrame(data[1:], columns=data[0])
    dfs.append(df_tmp)
    time.sleep(1)  # be polite to the server between requests

df = pd.concat(dfs)
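
# A minimal follow-up sketch, not part of the original script: persisting the
# combined DataFrame. The file name "carinf.csv" is an assumption made here
# purely for illustration.
df = df.reset_index(drop=True)  # page-local indices overlap after concat
df.to_csv("carinf.csv", index=False, encoding="utf-8-sig")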