Each shop page embeds JSON-LD, so the shop information below can be extracted easily (a small extraction sketch follows this list).
- Shop name
- Address
- Postal code
- Latitude and longitude
- Number of reviews
- Rating score
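
As a reference, here is a minimal sketch of pulling those fields out of one shop page's JSON-LD. The key names (`name`, `address.postalCode`, `geo.latitude`, `aggregateRating`, ...) assume the standard schema.org Restaurant vocabulary, and the shop URL is a placeholder, so check both against the actual Tabelog markup.

```python
import json

import requests
from bs4 import BeautifulSoup

# Hypothetical shop URL, used only for illustration
url = "https://tabelog.com/ehime/A3802/A380201/00000000/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
}

r = requests.get(url, headers=headers, timeout=3)
soup = BeautifulSoup(r.content, "html5lib")

# Parse the first JSON-LD block on the page
ld_json = json.loads(soup.find("script", type="application/ld+json").text)

# Key names assume schema.org Restaurant; verify against the real response
shop = {
    "name": ld_json.get("name"),
    "postal_code": ld_json.get("address", {}).get("postalCode"),
    "address": ld_json.get("address", {}),
    "latitude": ld_json.get("geo", {}).get("latitude"),
    "longitude": ld_json.get("geo", {}).get("longitude"),
    "rating": ld_json.get("aggregateRating", {}).get("ratingValue"),
    "review_count": ld_json.get("aggregateRating", {}).get("ratingCount"),
}
print(shop)
```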
```python
import json
import time

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
}

with requests.Session() as s:
    for page in tqdm_notebook(range(1, 61), desc="page"):
        url = f"https://tabelog.com/ehime/A3802/A380201/rstLst/{page}/?Srt=D&SrtT=rt"
        # print(url)

        r = s.get(url, headers=headers, timeout=3)

        result = []

        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "html5lib")

            # Each <li> in the ranking list is one restaurant
            for li in tqdm_notebook(soup.select("ul.js-rstlist-info.rstlist-info > li.list-rst.js-bookmark.js-rst-cassette-wrap.list-rst--ranking")):
                link = li.select_one("a.list-rst__rst-name-target.cpy-rst-name").get("href")
                # print(link)

                time.sleep(3)

                r2 = s.get(link, headers=headers, timeout=3)

                if r2.status_code == 200:
                    soup2 = BeautifulSoup(r2.content, "html5lib")

                    # Extract the JSON-LD embedded in the shop page
                    ld_json = json.loads(soup2.find("script", type="application/ld+json").text)

                    # Flag shops that are paid (premium) members
                    if soup2.find('h3', class_='pr-comment-title js-pr-title'):
                        ld_json["isPremium"] = True
                    else:
                        ld_json["isPremium"] = False

                    result.append(ld_json)

        # Save this page's results to a file
        with open(f"imabari_{page:02}.json", "w") as fw:
            json.dump(result, fw)
```
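
Each pass through the outer loop writes that page's shops to its own file (imabari_01.json through imabari_60.json), so the per-page files still need to be merged afterwards.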
Merge the per-page JSON files into one.
```python
import glob
import json

objs = []

for fn in glob.glob('./*.json'):
    try:
        with open(fn) as f:
            arr = json.load(f)
    except Exception as exc:
        print(exc)
        continue

    for obj in arr:
        print(obj)
        objs.append(obj)

# Save everything into a single JSON file
with open("result.json", "w") as fw:
    json.dump(objs, fw)
```
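
Note that result.json is written into the same directory the glob scans, so re-running this cell would pick up the merged file as well and duplicate entries; writing it elsewhere or narrowing the pattern (e.g. imabari_*.json) avoids that.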
```python
import pandas as pd

# Flatten the nested JSON-LD objects into a tabular DataFrame
# (pd.io.json.json_normalize is deprecated; pd.json_normalize is the current API)
df = pd.json_normalize(objs)
```
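
After normalization the nested JSON-LD keys become dotted column names. As a rough sketch, the fields listed at the top can then be pulled out and saved to CSV; the column names below mirror the schema.org keys and the output filename is hypothetical, so check `df.columns` for what is actually present.

```python
# Columns assumed to exist after json_normalize; filter to the ones present
cols = [
    "name",                          # shop name
    "address.postalCode",            # postal code
    "address.addressRegion",         # prefecture part of the address
    "address.addressLocality",       # city part of the address
    "address.streetAddress",         # street part of the address
    "geo.latitude",                  # latitude
    "geo.longitude",                 # longitude
    "aggregateRating.ratingValue",   # rating score
    "aggregateRating.ratingCount",   # number of reviews (name may differ, e.g. reviewCount)
    "isPremium",                     # paid-member flag added during scraping
]
available = [c for c in cols if c in df.columns]  # keep only the columns that exist
df[available].to_csv("imabari_shops.csv", index=False)  # hypothetical output filename
```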