Scraping weather warnings from the Japan Meteorological Agency (JMA)

Reference: code4sabae.github.io

import pathlib
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

pref_code = {
    "01": "北海道",
    "02": "青森県",
    "03": "岩手県",
    "04": "宮城県",
    "05": "秋田県",
    "06": "山形県",
    "07": "福島県",
    "08": "茨城県",
    "09": "栃木県",
    "10": "群馬県",
    "11": "埼玉県",
    "12": "千葉県",
    "13": "東京都",
    "14": "神奈川県",
    "15": "新潟県",
    "16": "富山県",
    "17": "石川県",
    "18": "福井県",
    "19": "山梨県",
    "20": "長野県",
    "21": "岐阜県",
    "22": "静岡県",
    "23": "愛知県",
    "24": "三重県",
    "25": "滋賀県",
    "26": "京都府",
    "27": "大阪府",
    "28": "兵庫県",
    "29": "奈良県",
    "30": "和歌山県",
    "31": "鳥取県",
    "32": "島根県",
    "33": "岡山県",
    "34": "広島県",
    "35": "山口県",
    "36": "徳島県",
    "37": "香川県",
    "38": "愛媛県",
    "39": "高知県",
    "40": "福岡県",
    "41": "佐賀県",
    "42": "長崎県",
    "43": "熊本県",
    "44": "大分県",
    "45": "宮崎県",
    "46": "鹿児島県",
    "47": "沖縄県",
}

pref = list(pref_code.values())

url = "https://www.jma.go.jp/jp/warn/"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

"""# スクレイピング"""

# Reuse a single HTTP session for all requests
with requests.Session() as s:

    r = s.get(url, headers=headers)

    r.raise_for_status()

    base = BeautifulSoup(r.content, "html5lib")

    htmls = []

    # Each link on the top page points to one regional warnings page
    for tag in tqdm(base.select("div#title > noscript > table > tbody > tr > td > a")):

        area = tag.get_text(strip=True)
        link = urljoin(url, tag.get("href"))

        r = s.get(link, headers=headers)
        r.raise_for_status()

        soup = BeautifulSoup(r.content, "html5lib")

        # Save the fetched page under html/<filename> for offline parsing
        p = pathlib.Path("html", pathlib.PurePath(link).name)
        p.parent.mkdir(parents=True, exist_ok=True)

        with p.open(mode="w", encoding="utf-8") as fw:
            fw.write(soup.prettify())

        htmls.append({"area": area, "url": link, "path": p})

        time.sleep(1)  # throttle requests so the JMA server is not hammered
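
A quick sanity check on the download step (a minimal sketch that only inspects the `htmls` list built above): print how many regional pages were saved and a few of their entries.

print(len(htmls))  # one entry per regional link found on the top page
for h in htmls[:3]:
    print(h["area"], h["url"], h["path"])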

import pandas as pd


def fetch_warn(p, area):

    # The warnings live in the table with id="WarnTableTable"; the first three
    # (unnamed) columns identify the area, the remaining columns hold the alerts
    tmp = pd.read_html(p.open(mode="r", encoding="utf-8"), attrs={"id": "WarnTableTable"})[0]

    df = tmp.melt(
        id_vars=[
            ("Unnamed: 0_level_0", "Unnamed: 0_level_1"),
            ("Unnamed: 1_level_0", "Unnamed: 1_level_1"),
            ("Unnamed: 2_level_0", "Unnamed: 2_level_1"),
        ]
    ).dropna(thresh=5)

    df = df.set_axis(
        ["area1", "area2", "city", "level", "alert", "value"], axis=1
    )

    df["pref"] = area

    return df
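
Before concatenating everything, a single-file check can confirm the reshaped columns (a minimal sketch; it assumes at least one page was downloaded in the scraping step above).

sample = fetch_warn(htmls[0]["path"], htmls[0]["area"])
print(sample.columns.tolist())  # expected: area1, area2, city, level, alert, value, pref
print(sample.head())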


dfs = [fetch_warn(html["path"], html["area"]) for html in htmls]

df = pd.concat(dfs).reset_index(drop=True)

# Normalize full-width characters and strip whitespace in every text column
for col in df.select_dtypes(include=object).columns:
    df[col] = df[col].str.normalize("NFKC").str.replace(r"\s", "", regex=True)

df["pref"].replace(
    {
        "宗谷地方": "北海道",
        "上川・留萌地方": "北海道",
        "網走・北見・紋別地方": "北海道",
        "釧路・根室・十勝地方": "北海道",
        "胆振・日高地方": "北海道",
        "石狩・空知・後志地方": "北海道",
        "渡島・檜山地方": "北海道",
        "沖縄本島地方": "沖縄県",
        "大東島地方": "沖縄県",
        "宮古島地方": "沖縄県",
        "八重山地方": "沖縄県",
    },
    inplace=True,
)

df["value"] = (df["value"] == "●").astype(int)

# Count active warnings (警報) and advisories (注意報) per prefecture
df_alert = df.pivot_table(
    index="pref", columns="level", values="value", aggfunc="sum"
).reindex(index=pref, columns=["警報", "注意報"])

df_alert

df.to_csv("alert.csv", encoding="utf_8_sig")