import time
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook
# Romanised slugs of Japan's 47 prefectures, in the usual north-to-south
# order. Each slug is interpolated into the listing URL as its first path
# segment, so the spelling must match what the site expects.
pref_list = [
    # Hokkaido / Tohoku
    "hokkaido", "aomori", "iwate", "miyagi", "akita", "yamagata", "fukushima",
    # Kanto
    "ibaraki", "tochigi", "gunma", "saitama", "chiba", "tokyo", "kanagawa",
    # Chubu
    "niigata", "toyama", "ishikawa", "fukui", "yamanashi", "nagano",
    "gifu", "shizuoka", "aichi",
    # Kinki
    "mie", "shiga", "kyoto", "osaka", "hyogo", "nara", "wakayama",
    # Chugoku
    "tottori", "shimane", "okayama", "hiroshima", "yamaguchi",
    # Shikoku
    "tokushima", "kagawa", "ehime", "kochi",
    # Kyushu / Okinawa
    "fukuoka", "saga", "nagasaki", "kumamoto", "oita", "miyazaki",
    "kagoshima", "okinawa",
]
# Crawl the per-prefecture restaurant listing pages and collect one dict per
# shop in ``result``.
#
# Every entry has the keys "pref", "name", "link", "rate" (float) and
# "amount" (int, the review count shown in the listing).  When ``mode`` is
# True, "isLoad" and "isPremium" are added from the shop's detail page.

# CSS selectors for one listing page; kept byte-for-byte as the site markup
# requires.
LIST_ITEM_SELECTOR = (
    "ul.js-rstlist-info.rstlist-info > "
    "li.list-rst.js-bookmark.js-rst-cassette-wrap.list-rst--ranking"
)
NAME_SELECTOR = "a.list-rst__rst-name-target.cpy-rst-name"
RATE_SELECTOR = "span.c-rating__val.c-rating__val--strong.list-rst__rating-val"
REVIEW_COUNT_SELECTOR = "em.list-rst__rvw-count-num.cpy-review-count"

result = []

# When True, each shop's own page is fetched as well (one extra request and a
# 3 s pause per shop) to fill in "isLoad" / "isPremium".
mode = False

with requests.Session() as s:
    for pref in tqdm_notebook(pref_list, desc='pref loop'):
        for page in tqdm_notebook(range(1, 61), desc=pref):
            url = f"https://tabelog.com/{pref}/rstLst/{page}/?Srt=D&SrtT=rvcn"
            # Errors on the listing request (timeout, connection failure) are
            # deliberately left to propagate so a page is never dropped
            # silently.
            r = s.get(url, timeout=3)
            if r.status_code != 200:
                # NOTE(review): a non-200 listing response just moves on to
                # the next page, without the politeness delay -- confirm this
                # is the intended handling.
                continue

            soup = BeautifulSoup(r.content, "html5lib")
            for li in soup.select(LIST_ITEM_SELECTOR):
                # Parse the review count first: once it drops below 100 the
                # rest of this prefecture is abandoned, so there is no point
                # parsing (and possibly failing on) the other fields.
                # Presumably ``Srt=D&SrtT=rvcn`` orders the listing by review
                # count, descending, which is what makes this early exit
                # valid -- TODO confirm.
                amount = int(
                    li.select_one(REVIEW_COUNT_SELECTOR).get_text(strip=True)
                )
                if amount < 100:
                    break

                name_link = li.select_one(NAME_SELECTOR)
                shop = {
                    "pref": pref,
                    "name": name_link.get_text(strip=True),
                    "link": name_link.get("href"),
                    "rate": float(
                        li.select_one(RATE_SELECTOR).get_text(strip=True)
                    ),
                    "amount": amount,
                }

                if mode:
                    time.sleep(3)
                    try:
                        rr = s.get(shop["link"], timeout=3)
                    except requests.RequestException:
                        # A failed detail fetch is recorded through "isLoad"
                        # below instead of aborting the whole crawl.
                        rr = None

                    if rr is not None and rr.status_code == 200:
                        shop["isLoad"] = True
                        ssoup = BeautifulSoup(rr.content, "html5lib")
                        # Premium flag = the detail page contains this <h3>.
                        pr_title = ssoup.find(
                            'h3', class_='pr-comment-title js-pr-title'
                        )
                        shop["isPremium"] = pr_title is not None
                    else:
                        shop["isLoad"] = False
                        shop["isPremium"] = False

                result.append(shop)
            else:
                # The page was consumed without hitting the cut-off: pause,
                # then fetch the next page of the same prefecture.
                time.sleep(3)
                continue
            # Only reached when the shop loop hit ``break`` (cut-off found):
            # stop paging this prefecture and move on to the next one.
            break