多摩川の水位をスクレイピング

qiita.com

データのある年抽出

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# Fetch the search page for station ID 303051283310030.
# A timeout is set so an unresponsive server cannot block the script forever.
r = requests.get(
    "http://www1.river.go.jp/cgi-bin/SrchWaterData.exe?ID=303051283310030&KIND=2&PAGE=0",
    headers=headers,
    timeout=30,
)

# Abort on HTTP error status instead of parsing an error page.
r.raise_for_status()

soup = BeautifulSoup(r.content, "html5lib")

# Collected years whose icon in the table is "ari.gif".
years = []

# Walk the rows of the sixth table, skipping its first two rows
# (presumably header rows -- verify against the page layout).
for trs in soup.select("table:nth-of-type(6) tr")[2:]:

    # The highlighted label cell holds the row's base year with "*" as a
    # placeholder digit (presumably a decade such as "195*" -- TODO confirm);
    # "*" is replaced by "0" to obtain an integer.
    y = int(
        trs.find("td", attrs={"nowrap": "", "bgcolor": "#FFFFCC"})
        .get_text(strip=True)
        .replace("*", "0")
    )

    # Each icon's position within the row is its offset from the base year.
    for i, img in enumerate(trs.select("td > font > img")):
        # Default to "" so an <img> without a src attribute is simply ignored
        # rather than raising AttributeError on None.
        if img.get("src", "").endswith("ari.gif"):
            years.append(y + i)

スクレイピング

import pandas as pd
import numpy as np
import time
from tqdm import tqdm_notebook

# Monthly data page for station ID 303051283310030; the year and the
# zero-padded month are substituted into the BGNDATE parameter.
MONTHLY_URL = (
    "http://www1.river.go.jp/cgi-bin/DspWaterData.exe"
    "?KIND=2&ID=303051283310030"
    "&BGNDATE={year}{month:02}01&ENDDATE=20191231&KAWABOU=NO"
)

# One DataFrame per (year, month) page, in request order.
dfs = []

for year in tqdm_notebook(years):
    for month in tqdm_notebook(range(1, 13), desc=str(year)):

        # Only tables containing the text "単位:m" are parsed; the cells
        # "欠測" and "閉局" are read as NaN.
        data = pd.read_html(
            MONTHLY_URL.format(year=year, month=month),
            match="単位:m",
            skiprows=1,
            header=0,
            index_col=0,
            na_values=["欠測", "閉局"],
        )

        # Keep the first matching table, minus its last row.
        dfs.append(data[0].iloc[:-1])

        # Pause between consecutive requests.
        time.sleep(3)

# Stack all monthly tables and persist them.
df = pd.concat(dfs)

df.to_csv("result.csv")
# Reload the saved table with explicit column names: "date" followed by the
# integers 1..24 (one column per hour slot). The CSV's own header row is skipped.
names = ["date"] + list(range(1, 25))
df1 = pd.read_csv("result.csv", parse_dates=["date"], names=names, header=None, skiprows=1)

# Wide -> long: one row per (date, hour) pair with the value in "level".
df2 = pd.melt(df1, id_vars="date", var_name="hour", value_name="level")
df2 = df2.sort_values(["date", "hour"]).reset_index(drop=True)

# Build the timestamp as date + hour offset. pandas' own timedelta conversion
# is used so the block does not rely on the `datetime` module, which the file
# never imports (the previous datetime.timedelta call raised NameError).
# Hour 24 therefore maps to 00:00 of the following day.
df2["datetime"] = df2["date"] + pd.to_timedelta(df2["hour"].astype(int), unit="h")
df2.set_index("datetime", inplace=True)

# Only "level" remains, indexed by timestamp.
df2.drop(["date", "hour"], axis=1, inplace=True)

df2.plot(linewidth=0.5)

# Second plot restricted to rows strictly after 1951-11-01 00:00.
df3 = df2[df2.index > pd.Timestamp(1951, 11, 1)]
df3.plot(linewidth=0.5)

[f:id:imabari_ehime:20191101004129p:plain]

[f:id:imabari_ehime:20191101004136p:plain]