imabari.hateblo.jp
!pip install lxml
import pandas as pd
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
# First attempt: download the dam data file and look at the storage-rate
# column (貯水率), letting read_csv combine the date and time columns itself.

# Landing page of the data viewer; the downloadable file is linked from it.
url = 'http://www1.river.go.jp/cgi-bin/DspDamData.exe?ID=1368080150020&KIND=3&PAGE=0'

# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
}

# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() reports an HTTP error here instead of leaving `link`
# undefined for the statements below.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, 'html.parser')

# The data file is referenced by the first anchor matching this selector.
anchor = soup.select_one('body > center > p > a')
if anchor is None:
    raise RuntimeError('download link not found in ' + url)

# The href may be relative, so resolve it against the page URL.
link = urljoin(url, anchor.get('href'))
print(link)

# Read the Shift_JIS encoded file straight from the resolved URL.
# The first 9 lines are skipped (presumably a metadata header - TODO confirm)
# and explicit column names are supplied instead.
# na_values=0 makes pandas treat 0 as a missing value.
# NOTE(review): combining columns through a parse_dates dict is deprecated in
# recent pandas releases - confirm against the pandas version in use.
df = pd.read_csv(
    link,
    skiprows=9,
    encoding='shift_jis',
    names=[
        '年月日', '時刻', '流域平均雨量', '流域平均雨量属性', '貯水量', '貯水量属性', '流入量', '流入量属性',
        '放流量', '放流量属性', '貯水率', '貯水率属性'],
    parse_dates={'日時': ['年月日', '時刻']},
    index_col=['日時'],
    na_values=0
)

# Drop every column except the storage rate (貯水率).
df.drop(['流域平均雨量', '流域平均雨量属性', '貯水量', '貯水量属性', '流入量', '流入量属性', '放流量', '放流量属性', '貯水率属性'], axis=1, inplace=True)
df

df.plot()

# Rows that contain at least one missing value.
df[df.isnull().any(axis=1)]

# Fill each gap with the last value seen above it.
# .ffill() replaces the deprecated fillna(method='ffill') spelling.
dfh = df.ffill()
dfh.plot()

# Inspect the resulting column dtypes and the type of the index entries.
df.dtypes
type(df.index[0])
わからない
- 年月日と時刻を結合して日時を作成できたがdatetimeにならない
- 欠測、未受信の場合の欠損値の補完
- 正常値の属性は半角スペース
- 欠測は属性全部が$
- 未受信の場合流域平均雨量属性は-でそれ以外の属性は#
- 半角スペース以外の時は欠損値を上から補完
!pip install lxml
import pandas as pd
import datetime
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
# Second attempt: download the dam data file and parse only the date column;
# the time column is left as text so it can be converted separately.

# Landing page of the data viewer; the downloadable file is linked from it.
url = 'http://www1.river.go.jp/cgi-bin/DspDamData.exe?ID=1368080700010&KIND=3&PAGE=0'

# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
}

# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() reports an HTTP error here instead of leaving `link`
# undefined for the statements below.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, 'html.parser')

# The data file is referenced by the first anchor matching this selector.
anchor = soup.select_one('body > center > p > a')
if anchor is None:
    raise RuntimeError('download link not found in ' + url)

# The href may be relative, so resolve it against the page URL.
link = urljoin(url, anchor.get('href'))
print(link)

# Read the Shift_JIS encoded file straight from the resolved URL.
# The first 9 lines are skipped (presumably a metadata header - TODO confirm)
# and explicit column names are supplied instead.
# Only the date column (年月日) is parsed as a date here.
df = pd.read_csv(
    link,
    skiprows=9,
    encoding='shift_jis',
    names=[
        '年月日', '時刻', '流域平均雨量', '流域平均雨量属性', '貯水量', '貯水量属性', '流入量', '流入量属性',
        '放流量', '放流量属性', '貯水率', '貯水率属性'],
    parse_dates=['年月日'],
)
def conv(temp):
    """Convert a clock string into a ``datetime.timedelta`` offset.

    The result is a duration rather than a time of day, so the hour field is
    not limited to 0-23: ``"24:00"`` becomes exactly one day.

    Args:
        temp: Time text in ``H:M`` form. An optional third ``:S`` field
            (seconds) is accepted as well.

    Returns:
        datetime.timedelta: Offset of that many hours, minutes and seconds.

    Raises:
        ValueError: If a field is not an integer, or the text does not have
            two or three colon-separated fields.
    """
    fields = [int(part) for part in temp.split(':')]

    # Seconds are optional; default them to zero for plain "H:M" input.
    if len(fields) == 2:
        fields.append(0)
    if len(fields) != 3:
        raise ValueError('expected H:M or H:M:S, got %r' % (temp,))

    hours, minutes, seconds = fields
    return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
# Build the timestamp index: the parsed date plus the time-of-day offset
# produced by conv() for each row.
df['日時'] = df['年月日'] + df['時刻'].apply(conv)
df.set_index('日時', inplace=True)

df.dtypes
df.head(10)

# Keep a measurement only where its companion attribute column (named
# "<column>属性") is a single half-width space; every other row becomes NaN.
for value_col in ['流域平均雨量', '貯水量', '流入量', '放流量', '貯水率']:
    df[value_col] = df[value_col].where(df[value_col + '属性'] == ' ')

# Work on the storage-rate series (貯水率) from here on.
rate_se = df.loc[:, '貯水率']
rate_se.head(10)

# Number of missing values, and the rows where they occur.
rate_se.isnull().sum()
rate_se[rate_se.isnull()]

rate_se.plot()

# Fill gaps from the previous valid value, then back-fill only what is still
# missing (gaps at the very start of the series have no previous value).
# Previously the forward fill was computed and immediately overwritten by a
# backward fill, so it never took effect.
rate_correct = rate_se.ffill().bfill()
rate_correct

rate_correct.plot()

# Re-sample the filled series onto an hourly grid.
# NOTE(review): newer pandas releases prefer the lowercase 'h' alias - confirm
# against the pandas version in use.
rato_hour = rate_correct.asfreq(freq='H')
rato_hour

rato_hour.plot()

# Alternative to filling: simply drop the missing rows.
rate_del = rate_se.dropna()
rate_del.plot()