業務スーパーの各店舗の地図から位置情報をスクレイピング

qiita.com

上の記事を書いた頃より、東北・関東の店舗が増えている

f:id:imabari_ehime:20210323162004p:plain

地図から位置情報をスクレイピング

CSV結果

https://docs.google.com/spreadsheets/d/e/2PACX-1vSAX3Wm3SvT6NjTEqI9BHd8jLPdgAh3oi9Nnlq8OShfFevNAV08Ktk71ofp_EzQD0Y_EA_HXEtQJXVW/pub?gid=1781326690&single=true&output=csv

スクレイピング

import requests
from bs4 import BeautifulSoup

import pathlib
import time
import urllib.parse

# Request headers sent with every page fetch: a browser-style User-Agent
# string is used instead of the HTTP library's default one.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}


def fetch_soup(url, parser="html.parser", timeout=30):
    """Download *url* and return the page parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        parser: Name of the parser handed to ``BeautifulSoup``.
        timeout: Seconds to wait for the server before giving up.  Without
            a timeout a single stalled connection would block the whole
            crawl indefinitely.

    Returns:
        The ``BeautifulSoup`` object built from the response body.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within *timeout*.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    # The raw bytes (not the decoded text) are handed to the parser.
    return BeautifulSoup(r.content, parser)


def fetch_shop(url):
    """Scrape a single store page and return its details as a dict.

    The returned dict always has the keys ``店舗名``, ``郵便番号``, ``住所``,
    ``電話番号`` and ``営業時間``.  The keys ``緯度`` and ``経度`` (floats) are
    added only when the embedded map iframe exposes an ``ll`` query
    parameter of the form ``"lat,lng"``; otherwise they are left out.

    Args:
        url: Address of the store detail page.

    Returns:
        dict mapping the column names above to the scraped values.

    Raises:
        ValueError: If the ``ll`` parameter is present but is not two
            comma-separated numbers.
        AttributeError: If one of the required text elements (postal code,
            address, phone, opening hours) is missing from the page.
    """
    # Progress output: show which page is being processed.
    print(url)

    shop = {}

    soup = fetch_soup(url)

    # Coordinates are optional, so a page without a map iframe (or an iframe
    # without ``src``) must not abort the crawl; it simply yields no lat/lng.
    iframe = soup.select_one("div.imap > iframe")
    link = iframe.get("src") if iframe is not None else None

    if link:
        qs = urllib.parse.urlparse(link).query
        qs_d = urllib.parse.parse_qs(qs)

        if qs_d.get("ll"):
            # parse_qs returns a list per key; the first value is "lat,lng".
            lat, lng = map(float, qs_d["ll"][0].split(","))

            shop["緯度"] = lat
            shop["経度"] = lng

    # The store name may be spread over several <p class="left"> elements.
    shop["店舗名"] = " ".join(
        [i.get_text(strip=True) for i in soup.select("h3.shop_title p.left")]
    )
    shop["郵便番号"] = soup.select_one("div.dboxleft").get_text(strip=True)
    # The address is taken from the second child node of div.dboxright.
    shop["住所"] = soup.select_one("div.dboxright").contents[1].string
    # Phone number and opening hours sit in the <div> that follows the
    # container of their respective label image.
    shop["電話番号"] = (
        soup.select_one('div.shop_info > div > img[alt="TEL"]')
        .parent.find_next_sibling("div")
        .get_text(strip=True)
    )
    shop["営業時間"] = (
        soup.select_one('div#openday > div > img[alt="営業時間"]')
        .parent.find_next_sibling("div")
        .get_text(strip=True)
    )

    return shop


# Walk the store list page of every pref_id from 1 to 47 and collect one
# record per store detail page linked from it.
shop_list = []

for pref_id in range(1, 48):

    list_url = f"https://www.gyomusuper.jp/shop/list.php?pref_id={pref_id}"
    anchors = fetch_soup(list_url).select("div.list_box_l > div > div > a")

    for href in [anchor.get("href") for anchor in anchors]:

        # hrefs may be relative, so resolve them against the list page URL.
        detail_url = urllib.parse.urljoin(list_url, href)
        shop_list.append(fetch_shop(detail_url))

        # Pause after each store page.
        time.sleep(2)

    # Longer pause before moving on to the next list page.
    time.sleep(5)

CSV変換

import pandas as pd

# One row per scraped store; keys of the dicts become the columns.
df = pd.DataFrame(data=shop_list)

# Print the per-column summary (non-null counts and dtypes).
df.info()

# Rows in which at least one column has no value.
has_missing = df.isna().any(axis="columns")
df.loc[has_missing]

# Write the table (including the index) as UTF-8 with a BOM.
df.to_csv(path_or_buf="gyoumu.csv", encoding="utf_8_sig")
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 911 entries, 0 to 910
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   緯度      895 non-null    float64
 1   経度      895 non-null    float64
 2   店舗名     911 non-null    object 
 3   郵便番号    911 non-null    object 
 4   住所      911 non-null    object 
 5   電話番号    911 non-null    object 
 6   営業時間    911 non-null    object 
dtypes: float64(2), object(5)
memory usage: 49.9+ KB

位置情報が欠損しているのは、全911店舗中16件

f:id:imabari_ehime:20210322113530p:plain

地図

import folium
from folium import plugins

# Named ``shop_map`` rather than ``map`` so the builtin ``map`` (which
# fetch_shop relies on) is not shadowed within the same session.
shop_map = folium.Map(location=[34.06604300, 132.99765800], zoom_start=6)

# Nearby markers are grouped into clusters on the map.
marker_cluster = plugins.MarkerCluster().add_to(shop_map)

# Stores without coordinates cannot be placed on the map, so rows whose
# latitude or longitude is missing are skipped.
located = df.dropna(subset=["緯度", "経度"])

for _, row in located.iterrows():
    folium.Marker(
        location=[row["緯度"], row["経度"]],
        popup=folium.Popup(row["店舗名"], max_width=300),
    ).add_to(marker_cluster)

# Last expression of the cell: displays the map in a notebook.
shop_map