Colaboratoryでスクレイピング

news.mynavi.jp

https://colab.research.google.com/

[f:id:imabari_ehime:20180416172850p:plain]

import pandas as pd
import requests
from bs4 import BeautifulSoup


# Scrape a dam overview table from river.go.jp into a pandas DataFrame.
# The page is selected by the query parameters embedded in the URL
# (areaCd, prefCd, ...).
url = 'http://www.river.go.jp/kawabou/ipDamGaikyo.do?init=init&areaCd=88&prefCd=3801&townCd=&gamenId=01-0903&fldCtlParty=no'

# Send a browser-like (IE 11 style) User-Agent with the request.
# NOTE(review): presumably the site needs a browser UA - confirm.
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
}

# Without a timeout, requests waits forever on an unresponsive server and the
# notebook cell never finishes.
r = requests.get(url, headers=headers, timeout=30)

# Fail loudly on an HTTP error. Previously a non-200 response skipped the
# parsing step and the final `df` line raised NameError (or silently showed a
# stale `df` left over from an earlier run of the cell).
r.raise_for_status()

soup = BeautifulSoup(r.content, 'html5lib')

# One inner list per table row, holding the stripped text of every header and
# data cell in that row.
# NOTE(review): the selector requires a <tbody>; presumably this relies on the
# html5lib parser normalising the table markup - confirm before changing the
# parser.
result = [[
    x.get_text(strip=True) for x in y.find_all(['th', 'td'])
] for y in soup.select('body > div.gaikyoCntt > table > tbody > tr ')]

# An empty result means the page layout no longer matches the selector; report
# that explicitly instead of failing with a bare IndexError on result[0].
if not result:
    raise ValueError('no table rows matched the selector; '
                     'the page layout may have changed')

# The first row supplies the column names; the remaining rows are the data.
df = pd.DataFrame(data=result[1:], columns=result[0])

# Index the table by the 'ダム名' (dam name) column.
df.set_index('ダム名', inplace=True)

# Bare expression: a notebook renders the DataFrame as the cell output.
df

Pandas

pip3 ではなく pip を使うと lxml をインストールできた

!pip install lxml
import pandas as pd

# Same dam overview page, this time parsed directly by pandas: read_html
# returns one DataFrame per <table> element it finds at the URL.
url = (
    'http://www.river.go.jp/kawabou/ipDamGaikyo.do'
    '?init=init&areaCd=88&prefCd=3801&townCd=&gamenId=01-0903&fldCtlParty=no'
)

# Use the first row of each table as the header and the first column as the
# index.
dfs = pd.read_html(io=url, header=0, index_col=0)

# Show the table at position 7 of the returned list as the cell output.
# NOTE(review): the position is hard-coded and depends on the page layout -
# confirm it still points at the intended table.
dfs[7]

[f:id:imabari_ehime:20180417085825g:plain]