食べログをスクレイピングしてJSON-LDを取得

ページ内にJSON-LDがあるので簡単に店舗情報が取得できます。

参考: 構造化データ(JSON-LD)については developers.google.com のドキュメントを参照してください。

  • 店舗名
  • 住所
  • 郵便番号
  • 緯度経度
  • 口コミ数
  • 評価点
import time
import json

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook

# Browser-like User-Agent sent with every request below.
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
}

# Walk list pages 1-60, follow each listed restaurant link, read the first
# JSON-LD <script> block of the restaurant page, and write one JSON array
# per list page to imabari_NN.json.
with requests.Session() as s:

    for page in tqdm_notebook(range(1, 61), desc="page"):

        url = f"https://tabelog.com/ehime/A3802/A380201/rstLst/{page}/?Srt=D&SrtT=rt"

        r = s.get(url, headers=headers, timeout=3)

        # Records collected for this list page only.
        result = []

        if r.status_code == 200:

            soup = BeautifulSoup(r.content, "html5lib")

            for li in tqdm_notebook(soup.select("ul.js-rstlist-info.rstlist-info > li.list-rst.js-bookmark.js-rst-cassette-wrap.list-rst--ranking")):

                anchor = li.select_one("a.list-rst__rst-name-target.cpy-rst-name")
                link = anchor.get("href") if anchor is not None else None

                # Skip list items that carry no restaurant link instead of
                # crashing on a missing anchor.
                if not link:
                    continue

                # Pause between detail-page requests.
                time.sleep(3)

                r2 = s.get(link, headers=headers, timeout=3)

                # Only successful responses yield a record; otherwise the
                # previous restaurant's data would be appended a second time.
                if r2.status_code != 200:
                    continue

                soup2 = BeautifulSoup(r2.content, "html5lib")

                # Fetch the JSON-LD block; skip pages that have none.
                ld_tag = soup2.find("script", type="application/ld+json")
                if ld_tag is None:
                    continue

                ld_json = json.loads(ld_tag.text)

                # Add the paid-member flag: true when the page contains the
                # PR comment heading.
                ld_json["isPremium"] = bool(
                    soup2.find('h3', class_='pr-comment-title js-pr-title'))

                result.append(ld_json)

        # Save this page's records; the context manager flushes and closes
        # the file.
        with open(f"imabari_{page:02}.json", "w") as fw:
            json.dump(result, fw)

JSONファイルの結合

import glob
import os

# Merge every per-page JSON array in the current directory into one list.
objs = []

# Sorted so the merged order is stable across runs and platforms.
for fn in sorted(glob.glob('./*.json')):
    # result.json is this step's own output; reading it back on a re-run
    # would duplicate every record.
    if os.path.basename(fn) == "result.json":
        continue
    try:
        with open(fn) as fp:
            arr = json.load(fp)
    except (OSError, ValueError) as exc:
        # Unreadable or malformed file (JSON decode errors are ValueError
        # subclasses): report it and keep going.
        print(exc)
        continue
    if not isinstance(arr, list):
        # Only JSON arrays hold records; anything else is not a page file.
        print(f"skipping {fn}: not a JSON array")
        continue
    for obj in arr:
        print(obj)
        objs.append(obj)

# Save the merged records to a JSON file.
with open("result.json", "w") as fw:
    json.dump(objs, fw)
import pandas as pd

# Flatten the nested JSON-LD records into a table, one row per record.
# pandas 1.0 exposed json_normalize at the top level and deprecated the
# pd.io.json path; fall back to the old location on older pandas.
try:
    _json_normalize = pd.json_normalize
except AttributeError:
    from pandas.io.json import json_normalize as _json_normalize

df = _json_normalize(objs)