Scraping dam information from the Ehime Prefecture river and flood-control information site

import csv
import datetime
import time

import requests
from bs4 import BeautifulSoup


def date_span(start_date, end_date, hour_interval):

    # Yield datetimes after start_date in steps of hour_interval hours,
    # stopping once end_date is reached or passed
    n = start_date

    while n < end_date:
        n += datetime.timedelta(hours=hour_interval)

        yield n


# GRP = USR004:玉川ダム、USR005:台ダム、USR010:鹿野川ダム、USR011:野村ダム
grp = 'USR011'

# KTM = 1: hourly, 2: every 30 minutes, 3: every 10 minutes
ktm = 3

# Hours covered per page, i.e. the step per request: hourly data: 24, 30-minute data: 12, 10-minute data: 4
hour_interval = 4

with open('dam.tsv', 'w', encoding='utf-8', newline='') as fw:

    writer = csv.writer(fw, dialect='excel-tab', lineterminator='\n')

    # Write the header row
    writer.writerow(['日付', '時刻', '貯水位', '全流入量', '全放流量', '貯水量', '貯水率'])

    # Start datetime
    start_date = datetime.datetime(2018, 7, 5)

    # End datetime
    end_date = datetime.datetime(2018, 7, 10)

    for k in date_span(start_date, end_date, hour_interval):

        url = 'http://183.176.244.72/cgi/170_USER_010_01.cgi?GID=170_USER_010&UI=U777&SI=00000&MNU=1&LO=88&BTY=IE6X&NDT=1&SK=0000000&DT={0}&GRP={1}&TPG=1&PG=1&KTM={2}'.format(
            k.strftime('%Y%m%d%H%M'), grp, ktm)

        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
        }

        r = requests.get(url, headers=headers)

        if r.status_code == requests.codes.ok:

            soup = BeautifulSoup(r.content, 'html5lib')

            for trs in soup.select('body > table:nth-of-type(7) > tbody > tr'):

                # For each nested table in the row, collect the text of every cell, row by row
                temp = [[[i.get_text(strip=True) for i in tr.select('td')]
                         for tr in tds.select('tr')]
                        for tds in trs.select('td > table > tbody')]

                # Transpose rows and columns
                dam = list(map(list, zip(*temp)))

                for j in dam:

                    # Flatten
                    data = sum(j, [])

                    writer.writerow(data)

        time.sleep(1)

Converting decimal character references in Feed43

Is there a way to convert decimal (numeric) character references in Feed43?

BeautifulSoup converts them automatically, though:

import requests
from bs4 import BeautifulSoup

from urllib.parse import urljoin

url = 'http://ehime.force.com/PUB_VF_Detail_Docs'

headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
}

r = requests.get(url, headers=headers)

if r.status_code == requests.codes.ok:

    soup = BeautifulSoup(r.content, 'html5lib')

    with open('test.html', mode='w', encoding='utf-8') as fw:
        fw.write(soup.prettify())

    for link in soup.find_all('a', target='_blank'):

        # Remove the <i> tag if needed
        # link.i.extract()
        
        print(link.get_text(strip=True))
        print(urljoin(url, link.get('href')))

It converted them automatically: 災害対策本部・災害警戒本部関係情報
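
For reference, the standard library's html.unescape decodes these decimal character references too, without BeautifulSoup; a minimal sketch (the sample string below is made up for illustration):

import html

# Decimal numeric character references, e.g. &#28797; is 災
s = '&#28797;&#23475;&#23550;&#31574;&#26412;&#37096;'

print(html.unescape(s))  # -> 災害対策本部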

Posting the 蒼社川 (片山) water level and a photo to Twitter on a schedule

import datetime
import tempfile

import requests
import twitter
from apscheduler.schedulers.blocking import BlockingScheduler
from bs4 import BeautifulSoup


def scraping():

    url = 'http://183.176.244.72/cgi/050_HQ_100_03.cgi?GID=050_HQ_100&UI=U777&SI=00000&DT=000000000000&DBDT=0000000000&MNU=1&DTO=-1&DN=0972900400025&KTM=3&GHK=3&YSK=0&SRO=1&LO=88&TD=0000&BTY=IE6X&cSessionID=0000000000000000000000&UIS=000000000000&SIMU=0&ZM=0'

    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html5lib')

    result = []

    dt_now = datetime.datetime.now()

    for trs in soup.select('body > table > tbody > tr'):
        data = [i.get_text(strip=True) for i in trs.select('td.right-10')]

        # Fill in a blank date from the previous row
        if data[0]:
            cache = data[0]
        else:
            data[0] = cache

        # Date (month/day)
        m, d = map(int, data[0].split('/'))

        # Time (hour:minute)
        H, M = map(int, data[1].split(':'))

        # Combine into a full datetime
        dt_date = datetime.datetime(dt_now.year, m, d) + datetime.timedelta(
            hours=H, minutes=M)

        # If the parsed date is in the future, the data is from the previous year
        if dt_date > dt_now:
            dt_date = dt_date.replace(year=dt_date.year - 1)

        # Convert to float; skip rows that cannot be converted
        try:
            water_level = float(data[2])
        except ValueError:
            pass
        else:
            result.append([dt_date, water_level])

    # Return (timestamp, current water level, previous water level)
    return (result[-1][0], result[-1][1], result[-2][1])


sched = BlockingScheduler()


@sched.scheduled_job('interval', minutes=10)
def river_job():

    wl_date, wl_now, wl_before = scraping()

    if wl_now > 2 or wl_before > 2:

        # Alert thresholds
        if wl_now >= 2.85:
            wl_alert = 'はん濫危険水位'
        elif wl_now >= 2.60:
            wl_alert = '避難判断水位'
        elif wl_now >= 2.40:
            wl_alert = 'はん濫注意水位'
        elif wl_now >= 2.10:
            wl_alert = '水防団待機水位'
        else:
            wl_alert = ''

        # Water level trend
        if wl_now > wl_before:
            wl_sign = '↗'
        elif wl_now < wl_before:
            wl_sign = '↘'
        else:
            wl_sign = '➡'

        twit = '{}現在\n蒼社川(片山)の水位は{}m{}{}です。'.format(
            wl_date.strftime('%Y/%m/%d %H:%M'), wl_now, wl_sign, wl_alert)

        res = requests.get(
            'http://www.pref.ehime.jp/kasen/Jpeg/Cam006/00_big.jpg')

        if res.status_code == requests.codes.ok:
            with tempfile.TemporaryFile() as fp:
                fp.write(res.content)
                # Rewind so the media upload reads from the start of the file
                fp.seek(0)

                api = twitter.Api(
                    consumer_key='',
                    consumer_secret='',
                    access_token_key='',
                    access_token_secret='')

                print(twit)
                status = api.PostUpdate(twit, media=fp)


sched.start()

Speeding up scraping with asyncio

import asyncio

import aiohttp
from bs4 import BeautifulSoup


async def scraping(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:

            html = await response.text()
            soup = BeautifulSoup(html, 'html5lib')

            # Parse soup here and collect the data you need
            result = []

    return result


if __name__ == "__main__":

    urls = []

    loop = asyncio.get_event_loop()
    # asyncio.wait returns the finished tasks as a set, so results are not in input order
    done, pending = loop.run_until_complete(
        asyncio.wait([scraping(url) for url in urls]))
    result = [d.result() for d in done]
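
The skeleton above uses the pre-3.7 event-loop API, and asyncio.wait returns results in completion order. On Python 3.7+, asyncio.run with asyncio.gather is a simpler equivalent that keeps results in input order; a minimal sketch reusing the scraping coroutine above:

async def main(urls):
    # gather runs the coroutines concurrently and preserves input order
    return await asyncio.gather(*[scraping(url) for url in urls])


if __name__ == "__main__":
    urls = []
    result = asyncio.run(main(urls))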

Speeding up scraping with ThreadPoolExecutor

from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup


def scraping(url):

    r = requests.get(url)

    if r.status_code == requests.codes.ok:

        soup = BeautifulSoup(r.content, 'html5lib')

        result = []
        return result


if __name__ == "__main__":

    urls = []

    with ThreadPoolExecutor() as pool:
        result = list(pool.map(scraping, urls))

Creating an RSS feed with Feed43 for sites that don't have one

Build the feed with Feed43:

feed43.com

Step 1. Specify source page address (URL)


  • Address (enter the URL of the page you want to scrape)
http://www.city.imabari.ehime.jp/whatsnew.html

Check the page source.

Step 2. Define extraction rules


  • Global Search Pattern (optional) (enter this to narrow the search to part of the page)
<dl>{%}</dl>
  • Item (repeatable) Search Pattern (enter the markup surrounding each item you want to extract)
<dt>{%}</dt>{*}<dd><a href="{%}">{%}</a></dd>

Use {%} for the parts you want to capture and {*} for anything you don't need, including line breaks.
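
Roughly speaking, each {%} behaves like a capture group and {*} like a non-greedy wildcard. A rough Python equivalent of the Step 2 item pattern, using a made-up snippet in the same <dt>/<dd> shape, would be (the group numbers match the {%1}/{%2}/{%3} placeholders used in Step 3):

import re
from urllib.parse import urljoin

base = 'http://www.city.imabari.ehime.jp/whatsnew.html'

# Hypothetical snippet shaped like the whatsnew.html list
page = '''<dl>
<dt>2018.06.24</dt>
<dd><a href="press/new.html">お知らせ</a></dd>
</dl>'''

# <dt>{%}</dt>{*}<dd><a href="{%}">{%}</a></dd>
item = re.compile(r'<dt>(.*?)</dt>.*?<dd><a href="(.*?)">(.*?)</a></dd>', re.S)

for date, href, text in item.findall(page):
    print('{} {}'.format(date, text))   # Item Title Template: {%1} {%3}
    print(urljoin(base, href))          # Item Link Template: {%2}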

Step 3. Define output format (enter the feed title, link, and description)


  • Feed Title
今治市役所
  • Feed Link
http://www.city.imabari.ehime.jp/whatsnew.html
  • Feed Description
今治市役所 ホームページの更新情報

RSS item properties

  • Item Title Template (title)
{%1} {%3}
  • Item Link Template (link)
{%2}
  • Item Content Template (description)


Imabari City Hall updates https://feed43.com/imabari-city.xml

Suspicious-person reports in the Imabari police jurisdiction https://feed43.com/imabari-fushinsha.xml

FC Imabari news https://feed43.com/fc-imabari.xml

Ehime incidents and accidents https://feed43.com/ehime-jiken.xml

POST-request scraping with Python requests

Scrape the emergency hospitals on duty in the Imabari area and aggregate them by day of week and by hospital.

Send the POST request with requests instead of using Selenium.

Check the POST payload with the Firefox developer tools.

In the Network panel, select the POST request and inspect the form data among its parameters.

"blockCd[3]": "",
"forward_next": "",
"torinBlockDetailInfo.torinBlockDetail[0].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[1].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[2].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[3].blockCheckFlg": "1",
"torinBlockDetailInfo.torinBlockDetail[4].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[5].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[6].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[7].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[8].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[9].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[10].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[11].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[12].blockCheckFlg": "0",

# Only this value changes from session to session
"org.apache.struts.taglib.html.TOKEN" : "06136a1c3e9558818de3ee18fc48393f"

# Searching the HTML source for org.apache.struts.taglib.html.TOKEN turns up the same value, so scrape it from there
# -*- coding: utf-8 -*-

import csv
import datetime
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Convert a time string (optionally prefixed with 翌日, "next day") into a timedelta
def date_conv(t_time):

    if t_time.startswith('翌日'):
        t_time = t_time.replace('翌日', '')
        next_day = datetime.timedelta(days=1)
    else:
        next_day = datetime.timedelta(days=0)

    H, M = map(int, t_time.split(':'))

    result = datetime.timedelta(hours=H, minutes=M)

    return result + next_day

# Scrape the search-result HTML and write result.csv
def scraping(html):

    soup = BeautifulSoup(html, "html.parser")

    # Save as CSV
    with open('result.csv', 'w', encoding='utf-8', newline='') as fw:
        writer = csv.writer(fw, dialect='excel', lineterminator='\n')
        writer.writerow(
            ['医療機関', '住所', 'TEL(昼)', 'TEL(夜)', '診療科目', '曜日', '開始時刻', '終了時刻'])

        table = soup.find_all(
            'table', class_='comTblGyoumuCommon', summary='検索結果一覧を表示しています。')

        shimanami = ['吉海町', '宮窪町', '伯方町', '上浦町', '大三島町', '関前']

        for i in table:

            cache = []

            date, week = i.td.get_text(strip=True).split()
            today = datetime.datetime.strptime(date, '%Y年%m月%d日')

            for tr in i.find_all('tr', id=re.compile('1|2|3')):
                data = tr.get_text('\t', strip=True).split()

                # Fields: hospital, address, TEL(昼), day phone, TEL(夜), night phone, department, reception hours
                if tr['id'] == '1':
                    hospital = data[1:]

                elif tr['id'] == '2':
                    hospital = cache + data

                elif tr['id'] == '3':
                    hospital = data

                # Insert blanks when there is no night-time phone number
                if hospital[4] != 'TEL(夜)':
                    hospital.insert(4, 'TEL(夜)')
                    hospital.insert(5, None)

                result = hospital[:7]

                result.remove('TEL(昼)')
                result.remove('TEL(夜)')

                # If the address is in the island area, change 指定なし departments to 島嶼部
                for j in shimanami:
                    if j in result[1]:
                        if result[4] == '指定なし':
                            result[4] = '島嶼部'
                        break

                start_1st, end_1st = map(date_conv, hospital[7].split('〜'))

                # Only the first time slot exists
                if len(hospital) < 9:
                    writer.writerow(result + [week] +
                                    [today + start_1st, today + end_1st])

                # Both first and second time slots exist
                else:
                    start_2nd, end_2nd = map(date_conv, hospital[8].split('〜'))

                    # If the first slot ends exactly when the second begins, merge them
                    if end_1st == start_2nd:

                        writer.writerow(result + [week] +
                                        [today + start_1st, today + end_2nd])

                    # Otherwise write the two slots as separate rows
                    else:

                        writer.writerow(result + [week] +
                                        [today + start_1st, today + end_1st])
                        writer.writerow(result + [week] +
                                        [today + start_2nd, today + end_2nd])

                cache = hospital[:6]

# Main
if __name__ == '__main__':

    base_url = "http://www.qq.pref.ehime.jp/qq38/WP0805/RP080501BL.do"

    region_requests = {
        "blockCd[3]": "",
        "forward_next": "",
        "torinBlockDetailInfo.torinBlockDetail[0].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[1].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[2].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[3].blockCheckFlg": "1",
        "torinBlockDetailInfo.torinBlockDetail[4].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[5].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[6].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[7].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[8].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[9].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[10].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[11].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[12].blockCheckFlg": "0",
    }

    # Start a session and load the region-selection page
    s = requests.Session()
    resp = s.get(base_url, timeout=1)
    soup = BeautifulSoup(resp.content, "html.parser")

    # Get the Struts token
    token = soup.find(
        "input", attrs={
            "name": "org.apache.struts.taglib.html.TOKEN"
        }).get("value")

    region_requests["org.apache.struts.taglib.html.TOKEN"] = token

    url = urljoin(
        base_url,
        soup.find("form", attrs={
            "name": "wp0805Form"
        }).get("action"))

    # Submit the region selection
    resp = s.post(url, data=region_requests)

    # Scrape the result page
    scraping(resp.content)

    # Load the data
    df = pd.read_csv('result.csv', parse_dates=['開始時刻', '終了時刻'])
    df['時間'] = df['終了時刻'] - df['開始時刻']
    df.sort_values(['開始時刻', '診療科目'], ascending=[True, True], inplace=True)

    # Use a Japanese-capable font so labels render correctly
    plt.rcParams['font.family'] = 'IPAPGothic'

    # Total hours by hospital and department
    table = pd.pivot_table(
        df,
        values='時間',
        index=['医療機関'],
        columns=['診療科目'],
        fill_value=pd.Timedelta(hours=0),
        aggfunc=np.sum)

    # Plot hours by hospital and department
    table.plot.barh()

    # Keep only the general emergency rows (department = 指定なし)
    df2 = df[df['診療科目'] == '指定なし']

    # Total hours by day of week and hospital
    tb1 = pd.pivot_table(
        df2,
        values='時間',
        index=['曜日'],
        columns=['医療機関'],
        fill_value=pd.Timedelta(hours=0),
        aggfunc=np.sum)
 
    # Plot hours by day of week and hospital
    tb1.plot.barh(subplots=True, layout=(3, 3), figsize=(20, 20))

    # Daily totals to check that no days are missing
    df3 = df2.loc[:, ['医療機関', '診療科目', '開始時刻', '時間']]
    df3.set_index('開始時刻', inplace=True)
    daily = df3.resample('D').sum()
    daily.plot.barh()