# Twitterから楽天モバイル基地局情報を抽出＆データクレンジング

import os
import re
import time
import urllib.parse

import pandas as pd
import requests
import tweepy

# Twitter API bearer token. Read from the environment so the credential does
# not have to be written into the source; falls back to "" when unset.
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN", "")

# Municipality code -> municipality name, keyed by the code that the reverse
# geocoder returns as "muniCd". Built from two tables: cities first, then
# towns (whose names carry their district prefix).
_ehime_cities = {
    "38201": "松山市",
    "38202": "今治市",
    "38203": "宇和島市",
    "38204": "八幡浜市",
    "38205": "新居浜市",
    "38206": "西条市",
    "38207": "大洲市",
    "38210": "伊予市",
    "38213": "四国中央市",
    "38214": "西予市",
    "38215": "東温市",
}

_ehime_towns = {
    "38356": "越智郡上島町",
    "38386": "上浮穴郡久万高原町",
    "38401": "伊予郡松前町",
    "38402": "伊予郡砥部町",
    "38422": "喜多郡内子町",
    "38442": "西宇和郡伊方町",
    "38484": "北宇和郡松野町",
    "38488": "北宇和郡鬼北町",
    "38506": "南宇和郡愛南町",
}

ehime_names = {**_ehime_cities, **_ehime_towns}

# Column order of the exported TSV files, assembled from smaller groups.
_place_columns = ["場所", "場所補足", "設置タイプ", "市区町村"]
_report_columns = ["説明", "情報提供", "状況"]
_cell_columns = ["sector", "sub6", "ミリ波", "eNB-LCID", "PCI"]
_coordinate_columns = ["緯度", "経度"]

columns = [
    *_place_columns,
    *_report_columns,
    *_cell_columns,
    *_coordinate_columns,
]


def reverse_geo(row):
    """Reverse-geocode a row's coordinates into a municipality and address.

    Sends the row's latitude/longitude to the GSI reverse geocoder
    (mreversegeocoder.gsi.go.jp) and translates the returned "muniCd"
    municipality code through ``ehime_names``.

    Args:
        row: Mapping-like object (for example a DataFrame row) providing
            "緯度" (latitude) and "経度" (longitude).

    Returns:
        pd.Series with two strings: the municipality name, and that name
        followed by the "lv01Nm" address part of the response. The
        municipality is "" when the code is not in ``ehime_names``; both
        values are "" when a coordinate is missing or the response carries
        no result.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    lat, lon = row["緯度"], row["経度"]

    # Coordinates are missing when the tweet text did not match the
    # extraction pattern; there is nothing to look up, so skip the request.
    if pd.isna(lat) or pd.isna(lon):
        return pd.Series(["", ""])

    endpoint = "https://mreversegeocoder.gsi.go.jp/reverse-geocoder/LonLatToAddress"

    # A timeout keeps one stalled request from hanging the whole run.
    r = requests.get(endpoint, params={"lat": lat, "lon": lon}, timeout=10)

    # Pause between calls so consecutive rows do not hammer the service.
    time.sleep(1)

    r.raise_for_status()

    # "results" may be absent (or null) when nothing matches the coordinates.
    data = r.json().get("results") or {}

    address = data.get("lv01Nm", "")
    city = ehime_names.get(data.get("muniCd", ""), "")

    return pd.Series([city, city + address])


# Search recent tweets carrying the three hashtags, excluding retweets.
client = tweepy.Client(bearer_token)

tweets = client.search_recent_tweets(
    "#愛媛 #楽天モバイル #基地局 -is:retweet",
    expansions=["author_id", "referenced_tweets.id"],
    tweet_fields=["created_at", "referenced_tweets"],
    user_fields=["verified"],
    max_results=100,
)

# With no matching tweets there is no data and no "users" expansion, and the
# code below would fail with an unrelated-looking KeyError. Stop early with a
# clear message instead.
if not tweets.data or "users" not in tweets.includes:
    raise SystemExit("No tweets matched the search query; nothing to process.")

# Tweets
df_data = pd.DataFrame(tweets.data)

# Tweet authors; "id" is renamed so it can be joined against the tweets
df_user = pd.DataFrame(tweets.includes["users"]).rename(columns={"id": "author_id"})

df0 = pd.merge(df_data, df_user, on="author_id")

# Convert timestamps to Japan time
df0["created_at"] = df0["created_at"].dt.tz_convert("Asia/Tokyo")

# Oldest first
df0.sort_values(by="created_at", inplace=True)

# Reset the index after sorting
df0.reset_index(drop=True, inplace=True)

# Notebook-style preview; has no effect when run as a script
df0

# Data cleansing

# Newly opened stations: tweets whose text starts with "○新規開局"

df1 = df0[df0["text"].str.startswith("○新規開局")].copy()

if df1.empty:
    # No matching tweets: Series.apply would return a Series here and the
    # column rename below would fail, so start from an empty frame and let
    # the export write a header-only file.
    df2 = pd.DataFrame(columns=columns)
else:
    # Split each tweet on its 【…】 headings; the text between headings
    # becomes one positional column, named below.
    df2 = (
        df1["text"]
        .apply(lambda s: pd.Series([i.strip() for i in re.split("【.+?】", s)]))
        .rename(
            columns={
                0: "type",
                1: "date",
                2: "情報提供",
                3: "place",
                4: "contents",
                5: "others",
            }
        )
    )

    # "place" is an address line followed by "(lat, lon)" on the next line.
    # Raw string: "\(" is an invalid escape sequence in a normal literal.
    df2[["address", "緯度", "経度"]] = df2["place"].str.extract(
        r"(.+)\n\(([0-9.]+), *([0-9.]+)\)"
    )

    df2[["eNB-LCID", "PCI"]] = df2["contents"].str.extract(
        "・eNB-LCID: +([0-9,-]+)\n・PCI: +([0-9,-]+)"
    )

    # One reverse-geocoding request per row
    df2[["市区町村", "場所"]] = df2.apply(reverse_geo, axis=1)

    # Notebook-style preview of tweeted vs. geocoded address; has no effect
    # when run as a script
    df2[["address", "場所"]]

df2["状況"] = "open"

# Notebook-style preview; has no effect when run as a script
df2

df2.reindex(columns=columns).to_csv("open.tsv", sep="\t", index=False)

# Newly discovered stations: tweets whose text starts with "○新規発見"

df3 = df0[df0["text"].str.startswith("○新規発見")].copy()

if df3.empty:
    # No matching tweets: Series.apply would return a Series here and the
    # column rename below would fail, so start from an empty frame and let
    # the export write a header-only file.
    df4 = pd.DataFrame(columns=columns)
else:
    # Split each tweet on its 【…】 headings; the text between headings
    # becomes one positional column, named below.
    df4 = (
        df3["text"]
        .apply(lambda s: pd.Series([i.strip() for i in re.split("【.+?】", s)]))
        .rename(
            columns={0: "type", 1: "date", 2: "情報提供", 3: "place", 4: "others"}
        )
    )

    # "place" is an address line followed by "(lat, lon)" on the next line.
    # Raw string: "\(" is an invalid escape sequence in a normal literal.
    df4[["address", "緯度", "経度"]] = df4["place"].str.extract(
        r"(.+)\n\(([0-9.]+), *([0-9.]+)\)"
    )

    # One reverse-geocoding request per row
    df4[["市区町村", "場所"]] = df4.apply(reverse_geo, axis=1)

    # Notebook-style preview of tweeted vs. geocoded address; has no effect
    # when run as a script
    df4[["address", "場所"]]

df4["状況"] = "close"

# Notebook-style preview; has no effect when run as a script
df4

df4.reindex(columns=columns).to_csv("close.tsv", sep="\t", index=False)