# データのある年抽出 (Extract the years for which data exists)
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# Fetch the search page for this station ID. requests applies no timeout by
# default, so an explicit one keeps the script from hanging forever if the
# server stops responding.
r = requests.get(
    "http://www1.river.go.jp/cgi-bin/SrchWaterData.exe?ID=303051283310030&KIND=2&PAGE=0",
    headers=headers,
    timeout=30,
)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html5lib")

# Years collected from the page; consumed by the download step that follows.
years = []

# Walk the rows of the sixth table, skipping its first two rows
# (presumably header rows -- confirm against the live page).
for trs in soup.select("table:nth-of-type(6) tr")[2:]:
    # The highlighted label cell holds the row's base year with "*" as a
    # placeholder; replacing "*" with "0" yields an integer
    # (presumably a decade label such as "195*" -> 1950 -- TODO confirm).
    y = int(
        trs.find("td", attrs={"nowrap": "", "bgcolor": "#FFFFCC"})
        .get_text(strip=True)
        .replace("*", "0")
    )

    # The i-th icon in the row maps to year y + i. Only icons whose source
    # ends with "ari.gif" are kept (presumably the "data available" marker).
    for i, img in enumerate(trs.select("td > font > img")):
        if img.get("src").endswith("ari.gif"):
            years.append(y + i)
import time

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

# Download one table per (year, month) pair, then stack everything into a
# single frame and persist it as CSV.
dfs = []

for year in tqdm_notebook(years):
    months = range(1, 13)
    for month in tqdm_notebook(months, desc=str(year)):
        # BGNDATE is the first day of the requested month; ENDDATE is fixed.
        url = f"http://www1.river.go.jp/cgi-bin/DspWaterData.exe?KIND=2&ID=303051283310030&BGNDATE={year}{month:02}01&ENDDATE=20191231&KAWABOU=NO"

        # Only tables containing the text "単位:m" are parsed; the listed
        # status markers are read as NaN.
        tables = pd.read_html(
            url,
            match="単位:m",
            skiprows=1,
            header=0,
            index_col=0,
            na_values=["欠測", "閉局"],
        )

        # Keep the first matching table without its last row.
        first_table = tables[0]
        dfs.append(first_table.iloc[:-1])

        # Pause between requests to avoid hammering the server.
        time.sleep(3)

df = pd.concat(dfs)
df.to_csv("result.csv")
# `datetime` is used below but was never imported in this script; import it
# here so the block does not fail with NameError.
import datetime

# Column layout of result.csv: a date column followed by 24 hourly columns
# labelled 1..24.
names = ["date"] + list(range(1, 25))

# Re-read the saved CSV, discarding its own header row (skiprows=1) and
# applying the column names defined above.
df1 = pd.read_csv(
    "result.csv",
    parse_dates=["date"],
    names=names,
    header=None,
    skiprows=1,
)

# Wide -> long: one row per (date, hour) with the reading in "level".
df2 = pd.melt(df1, id_vars="date", var_name="hour", value_name="level")
df2 = df2.sort_values(["date", "hour"]).reset_index(drop=True)

# Build the timestamp as date + hour offset. The offset is computed in one
# vectorized call instead of a per-row lambda; hour 24 rolls over to 00:00
# of the following day, exactly as the timedelta addition did before.
df2["datetime"] = pd.to_datetime(
    df2["date"] + pd.to_timedelta(df2["hour"].astype(int), unit="h")
)

# Index by timestamp and drop the columns it was derived from.
df2 = df2.set_index("datetime").drop(["date", "hour"], axis=1)

df2.plot(linewidth=0.5)

# Restrict to readings strictly after 1951-11-01 00:00 and plot again.
df3 = df2[df2.index > datetime.datetime(1951, 11, 1)]
df3.plot(linewidth=0.5)