Tabelog Scraping

An article on scraping Tabelog appears in "Pythonクローリング&スクレイピング[増補改訂版] ―データ収集・解析のための実践開発ガイド―" (Python Crawling & Scraping, Revised and Expanded Edition: A Practical Development Guide for Data Collection and Analysis), published via gihyo.jp.

There is sample code as well (see ch06 and 6-7). For scraping on a regular schedule, Scrapy seems like the better choice; a sketch follows below.
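As a rough idea, a Scrapy version of the list-page crawl might look like this (a minimal sketch, untested; the start URL and CSS selectors are assumptions based on the requests/BeautifulSoup code later in this post):

import scrapy


class TabelogSpider(scrapy.Spider):
    name = "tabelog"
    # Crawl politely against the production site
    custom_settings = {"DOWNLOAD_DELAY": 3}
    start_urls = ["https://tabelog.com/ehime/rstLst/1/?Srt=D&SrtT=rvcn"]

    def parse(self, response):
        for li in response.css("li.list-rst"):
            yield {
                "name": li.css("a.list-rst__rst-name-target::text").get(),
                "link": li.css("a.list-rst__rst-name-target::attr(href)").get(),
            }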

Tabelog Scraping 2

The graphs differ, probably because the number of extracted records differs.

import pandas as pd
import numpy as np

# result is the list of shop dicts collected by the scraping code at the end of this post
df = pd.DataFrame(result)
df.describe()

df_osaka = df[df["pref"] == "osaka"]
df_osaka.describe()

df_tokyo = df[df["pref"] == "tokyo"]
df_tokyo.describe()
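The same side-by-side comparison can be done in one call with groupby (a small addition, not in the original notebook):

df[df["pref"].isin(["osaka", "tokyo"])].groupby("pref")["rate"].describe()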
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns  # imported but unused here

# Make matplotlib render Japanese labels
import japanize_matplotlib

mpl.rcParams['figure.dpi'] = 200
# Nationwide
edges = np.arange(3, 5, 0.02)
ax = df["rate"].plot.hist(title=f"全国の店舗({len(df)})食べログ評価点分布", xlim=(3, 5), width=0.02, color='skyblue', bins=edges)

ax.set_ylabel("店舗数")
ax.set_xlabel("評価点")

ax.axvline(x=3.6, linestyle="--", color="tomato", label='評価点 3.6')
ax.axvline(x=3.8, linestyle="--", color="limegreen", label='評価点 3.8')

# Show the legend for the guide lines
ax.legend()
plt.show()
# Osaka
ax = df_osaka["rate"].plot.hist(title=f"大阪の店舗({len(df_osaka)})食べログ評価点分布", xlim=(3, 5), width=0.02, color='skyblue', bins=edges)

ax.set_ylabel("店舗数")
ax.set_xlabel("評価点")

ax.axvline(x=3.6, linestyle="--", color="tomato", label='評価点 3.6')
ax.axvline(x=3.8, linestyle="--", color="limegreen", label='評価点 3.8')

ax.legend()
plt.show()
# Tokyo
ax = df_tokyo["rate"].plot.hist(title=f"東京の店舗({len(df_tokyo)})食べログ評価点分布", xlim=(3, 5), width=0.02, color='skyblue', bins=edges)

ax.set_ylabel("店舗数")
ax.set_xlabel("評価点")

ax.axvline(x=3.6, linestyle="--", color="tomato", label='評価点 3.6')
ax.axvline(x=3.8, linestyle="--", color="limegreen", label='評価点 3.8')

ax.legend()
plt.show()
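The three plot cells differ only in the data and the title, so they could be folded into a helper like this (a refactoring sketch; the output should match the cells above):

def plot_rating_hist(data, area):
    ax = data["rate"].plot.hist(
        title=f"{area}の店舗({len(data)})食べログ評価点分布",
        xlim=(3, 5), width=0.02, color="skyblue", bins=edges,
    )
    ax.set_ylabel("店舗数")
    ax.set_xlabel("評価点")
    ax.axvline(x=3.6, linestyle="--", color="tomato", label="評価点 3.6")
    ax.axvline(x=3.8, linestyle="--", color="limegreen", label="評価点 3.8")
    ax.legend()
    plt.show()


plot_rating_hist(df, "全国")
plot_rating_hist(df_osaka, "大阪")
plot_rating_hist(df_tokyo, "東京")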

f:id:imabari_ehime:20191014212552p:plain
(nationwide)

f:id:imabari_ehime:20191014212603p:plain
(Osaka)

f:id:imabari_ehime:20191014212612p:plain
(Tokyo)

Tweeting Today's Emergency Hospitals

# -*- coding: utf-8 -*-

import datetime
import re
from urllib.parse import urljoin

import requests
import twitter
from bs4 import BeautifulSoup


def scraping(html):

    soup = BeautifulSoup(html, "html.parser")

    # Scrape the search-result tables
    tables = soup.find_all(
        "table", class_="comTblGyoumuCommon", summary="検索結果一覧を表示しています。"
    )

    today = datetime.date.today()
    # today = datetime.date(2019, 10, 24)  # pin the date when testing

    for table in tables:

        date, week = table.td.get_text(strip=True).split()
        day = datetime.datetime.strptime(date, "%Y年%m月%d日")

        if day.date() == today:

            result = []

            # Previous-row data (initial values)
            dprev = ["今治市医師会市民病院", "今治市別宮町7-1-40"]

            for trs in table.find_all("tr", id=["1", "2", "3"]):

                row_id = trs.get("id")

                data = list(trs.stripped_strings)

                # Drop the municipality name from the first row
                if row_id == "1" and data[0] == "今治市":
                    del data[0]

                # Drop the phone numbers (up to two "TEL" entries)
                for _ in range(2):
                    if len(data) > 4 and data[2].startswith("TEL"):
                        del data[2:4]

                # id="2" rows are continuations: reuse the previous row's
                # hospital name and address
                if row_id == "2":
                    data = dprev[:2] + data

                # print(row_id, data)

                # Keep this row as the previous-row data
                dprev = data

                hospital = dict(zip(["name", "address", "subject"], data[0:4]))

                hospital["class"] = 8

                # Split the outpatient reception hours on "〜"
                t = [j for i in data[3:] for j in i.split("〜")]

                # Default: first start time through last end time
                hospital["time"] = "~".join([t[0], t[-1]])

                # If the morning end time and afternoon start time differ,
                # keep the two ranges on separate lines
                if len(t) == 4 and t[1] != t[2]:
                    hospital["time"] = "\n".join(["~".join(t[:2]), "~".join(t[2:])])
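                # e.g. data[3:] == ["08:30〜12:00", "13:00〜17:00"] (hypothetical
                # times) gives t == ["08:30", "12:00", "13:00", "17:00"] and the
                # two-line form; if the ranges touched (t[1] == t[2]), the
                # single range "08:30~17:00" would stand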

                # Department classes (used for sorting)
                # emergency   : 0
                # surgical    : 1
                # internal    : 2
                # pediatrics  : 4
                # other       : 8
                # island areas: 9

                # Surgical
                if "外科" in hospital["subject"]:
                    hospital["class"] = 1

                # Internal medicine
                elif "内科" in hospital["subject"]:
                    hospital["class"] = 2

                # Pediatrics
                elif hospital["subject"] == "小児科":
                    hospital["class"] = 4

                # Emergency (no department specified)
                elif hospital["subject"] == "指定なし":
                    hospital["class"] = 0
                    hospital["subject"] = ""

                # If the address is in the island areas, reclassify the department as "islands"
                match = re.search("(吉海町|宮窪町|伯方町|上浦町|大三島町|関前)", hospital["address"])

                if match:

                    hospital["class"] = 9
                    hospital["subject"] = "島嶼部"

                # Wrap the department name in 【】
                if hospital["subject"]:
                    hospital["subject"] = f'【{hospital["subject"]}】'

                # Build the hospital's tweet text
                hospital["text"] = "\n".join(
                    [hospital["subject"], hospital["name"], hospital["time"]]
                ).strip()

                # Add to the result list
                result.append(hospital)

            # Sort by department class, then by time
            result.sort(key=lambda x: (x["class"], x["time"]))

            # Date line
            twit_date = f"{date} {week}"

            # Join the mainland entries (class < 9)
            twit_riku = "\n\n".join(
                [i["text"] for i in result if i["class"] < 9]
            ).strip()

            # Join the island entries (class 9)
            twit_sima = "\n\n".join(
                [i["text"] for i in result if i["class"] > 8]
            ).strip()

            # Combine the date, mainland, and island parts
            twit_all = "\n\n".join([twit_date, twit_riku, twit_sima]).strip()

            # print(twit_all)
            # print("-" * 20)

            api = twitter.Api(
                consumer_key="",
                consumer_secret="",
                access_token_key="",
                access_token_secret="",
            )

            # Does everything fit in one 140-character tweet?
            if len(twit_all) < 140:
                # Tweet it all at once
                api.PostUpdate(twit_all)

            else:
                # Tweet the island part
                api.PostUpdate("\n\n".join([twit_date, twit_sima]).strip())
                # Tweet the mainland part
                api.PostUpdate("\n\n".join([twit_date, twit_riku]).strip())

            break


if __name__ == "__main__":

    base_url = "http://www.qq.pref.ehime.jp/qq38/WP0805/RP080501BL.do"

    payload = {
        "blockCd[3]": "",
        "forward_next": "",
        "torinBlockDetailInfo.torinBlockDetail[0].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[1].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[2].blockCheckFlg": "0",
        # Index 3 is the one checked region block (appears to be the Imabari area)
        "torinBlockDetailInfo.torinBlockDetail[3].blockCheckFlg": "1",
        "torinBlockDetailInfo.torinBlockDetail[4].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[5].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[6].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[7].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[8].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[9].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[10].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[11].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[12].blockCheckFlg": "0",
    }

    # Open a session on the region-selection page
    with requests.Session() as s:

        r = s.get(base_url)

        soup = BeautifulSoup(r.content, "html.parser")

        # Get the Struts synchronizer token
        token = soup.find(
            "input", attrs={"name": "org.apache.struts.taglib.html.TOKEN"}
        ).get("value")

        # Set the token in the payload
        payload["org.apache.struts.taglib.html.TOKEN"] = token

        # Build the submit URL from the form action
        url = urljoin(
            base_url, soup.find("form", attrs={"name": "wp0805Form"}).get("action")
        )

        # Check the URL
        # print(url)

        # Submit the search form
        r = s.post(url, data=payload)

    scraping(r.content)
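For testing the parser without hitting the site, scraping() can also be fed a saved copy of the search-result page (the filename here is hypothetical; note it will still tweet if today's row is present):

with open("kyukyu_result.html", "rb") as f:
    scraping(f.read())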

Tabelog Scraping

import time

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook

pref_list = [
    "hokkaido",
    "aomori",
    "iwate",
    "miyagi",
    "akita",
    "yamagata",
    "fukushima",
    "ibaraki",
    "tochigi",
    "gunma",
    "saitama",
    "chiba",
    "tokyo",
    "kanagawa",
    "niigata",
    "toyama",
    "ishikawa",
    "fukui",
    "yamanashi",
    "nagano",
    "gifu",
    "shizuoka",
    "aichi",
    "mie",
    "shiga",
    "kyoto",
    "osaka",
    "hyogo",
    "nara",
    "wakayama",
    "tottori",
    "shimane",
    "okayama",
    "hiroshima",
    "yamaguchi",
    "tokushima",
    "kagawa",
    "ehime",
    "kochi",
    "fukuoka",
    "saga",
    "nagasaki",
    "kumamoto",
    "oita",
    "miyazaki",
    "kagoshima",
    "okinawa",
]

result = []

# Also fetch each shop's detail page to check for paid (premium) membership
mode = False

with requests.Session() as s:

    for pref in tqdm_notebook(pref_list, desc='pref loop'):

        # Up to 60 list pages per prefecture, sorted by review count, descending (SrtT=rvcn)
        for page in tqdm_notebook(range(1, 61), desc=pref):

            url = f"https://tabelog.com/{pref}/rstLst/{page}/?Srt=D&SrtT=rvcn"

            # print(url)

            r = s.get(url, timeout=3)

            if r.status_code == 200:

                soup = BeautifulSoup(r.content, "html5lib")

                for li in soup.select("ul.js-rstlist-info.rstlist-info > li.list-rst.js-bookmark.js-rst-cassette-wrap.list-rst--ranking"):

                    shop = {}
                    shop["pref"] = pref
                    shop["name"] = li.select_one("a.list-rst__rst-name-target.cpy-rst-name").get_text(strip=True)
                    shop["link"] = li.select_one("a.list-rst__rst-name-target.cpy-rst-name").get("href")
                    shop["rate"] = float(li.select_one("span.c-rating__val.c-rating__val--strong.list-rst__rating-val").get_text(strip=True))
                    shop["amount"] = int(li.select_one("em.list-rst__rvw-count-num.cpy-review-count").get_text(strip=True))

                    # The list is sorted by review count, so once a shop has
                    # fewer than 100 reviews, stop scanning this prefecture
                    if shop["amount"] < 100:
                        break

                    if mode:
                        time.sleep(3)

                        rr = s.get(shop["link"], timeout=3)

                        if rr.status_code == 200:

                            shop["isLoad"] = True

                            ssoup = BeautifulSoup(rr.content, "html5lib")

                            if ssoup.find('h3', class_='pr-comment-title js-pr-title'):
                                shop["isPremium"] = True
                            else:
                                shop["isPremium"] = False
                        else:
                            shop["isLoad"] = False
                            shop["isPremium"] = False

                    result.append(shop)

                    # print(shop)

                # for/else: the shop loop finished without a break, so wait
                # politely and continue to the next list page
                else:
                    time.sleep(3)
                    continue
                # Reached only when the shop loop broke (a shop fell below
                # 100 reviews): skip this prefecture's remaining pages
                break
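To hand the collected data over to the analysis in the earlier section, it helps to persist result; a minimal sketch (the filename is my choice):

import pandas as pd

# Save the scraped shops; the analysis notebook can reload this CSV
pd.DataFrame(result).to_csv("tabelog_shops.csv", index=False)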