PythonのrequestsでPOST送信スクレイピング

今治地区の救急病院をスクレイピングし曜日別・医療機関別に集計する

seleniumを使わずにrequestsでpost送信

Firefoxの開発ツールでpost内容を確認

ネットワークタブの中からメソッドがPOSTのリクエストを選び、パラメーターのフォームデータを確認する

[f:id:imabari_ehime:20180622180935p:plain]

"blockCd[3]": "",
"forward_next": "",
"torinBlockDetailInfo.torinBlockDetail[0].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[1].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[2].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[3].blockCheckFlg": "1",
"torinBlockDetailInfo.torinBlockDetail[4].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[5].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[6].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[7].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[8].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[9].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[10].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[11].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[12].blockCheckFlg": "0",

# ここだけ変わる
"org.apache.struts.taglib.html.TOKEN" : "06136a1c3e9558818de3ee18fc48393f"

# HTMLのソースから「org.apache.struts.taglib.html.TOKEN」を検索すると同じ値が埋め込まれているので、その値をスクレイピングで取得する
# -*- coding: utf-8 -*-

import csv
import datetime
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# 日付変換
def date_conv(t_time):
    """Convert a reception-time string into an offset from midnight.

    Args:
        t_time: A time written as ``H:M``. When the string starts with
            ``翌日`` (next day), that marker is stripped and one extra day
            is added to the result.

    Returns:
        A ``datetime.timedelta`` holding the hours and minutes, plus one day
        when the next-day marker was present.

    Raises:
        ValueError: If the remaining text is not exactly two integers
            separated by ``:``.
    """
    is_next_day = t_time.startswith('翌日')
    clock = t_time.replace('翌日', '') if is_next_day else t_time

    hours, minutes = (int(part) for part in clock.split(':'))

    return datetime.timedelta(
        days=1 if is_next_day else 0, hours=hours, minutes=minutes)

# スクレイピング
def scraping(html):
    """Extract duty-hospital entries from the result page into ``result.csv``.

    Every ``table`` with class ``comTblGyoumuCommon`` and the search-result
    summary text is treated as one day: the text of its first ``td`` is split
    into the date and the weekday, and each ``tr`` whose id is ``1``, ``2`` or
    ``3`` yields one hospital entry.  One CSV row is written per reception
    period, with the columns listed in the header row below.

    Args:
        html: Markup of the search result page, as accepted by
            ``BeautifulSoup``.

    Returns:
        None.  The rows are written to ``result.csv`` in the current working
        directory, replacing any existing file.

    Raises:
        ValueError, IndexError: If a table or row does not have the expected
            cells (date/weekday pair, phone labels, ``start〜end`` period).
    """
    soup = BeautifulSoup(html, "html.parser")

    # Anchored on purpose: BeautifulSoup applies the pattern with ``search``,
    # so an unanchored '1|2|3' would also accept ids such as "10" that none
    # of the branches below knows how to handle.
    row_id = re.compile(r'^[123]$')

    # Place names that mark an address as belonging to the island area.
    shimanami = ['吉海町', '宮窪町', '伯方町', '上浦町', '大三島町', '関前']

    # Explicit UTF-8 keeps the file readable by pandas (UTF-8 by default) on
    # any platform; newline='' is what the csv module requires so that the
    # '\n' line terminator is written untranslated.
    with open('result.csv', 'w', encoding='utf-8', newline='') as fw:
        writer = csv.writer(fw, dialect='excel', lineterminator='\n')
        writer.writerow(
            ['医療機関', '住所', 'TEL(昼)', 'TEL(夜)', '診療科目', '曜日', '開始時刻', '終了時刻'])

        tables = soup.find_all(
            'table', class_='comTblGyoumuCommon', summary='検索結果一覧を表示しています。')

        for day_table in tables:

            # Name, address and phone fields of the previous row, reused by
            # rows with id "2".
            cache = []

            date, week = day_table.td.get_text(strip=True).split()
            today = datetime.datetime.strptime(date, '%Y年%m月%d日')

            for tr in day_table.find_all('tr', id=row_id):
                data = tr.get_text('\t', strip=True).split()

                # Target layout of ``hospital``:
                # name, address, day label, day phone, night label,
                # night phone, department, reception period(s)
                if tr['id'] == '1':
                    # The first cell of this row type is not part of the
                    # hospital record.
                    hospital = data[1:]
                elif tr['id'] == '2':
                    # Only department and periods are present; the rest is
                    # taken from the previous row.
                    hospital = cache + data
                else:
                    hospital = data

                # Rows without a night phone get a label and an empty value so
                # that the later indices line up.
                if hospital[4] != 'TEL(夜)':
                    hospital.insert(4, 'TEL(夜)')
                    hospital.insert(5, None)

                # Keep the values, drop the two phone labels.
                result = hospital[:7]
                result.remove('TEL(昼)')
                result.remove('TEL(夜)')

                # An unspecified department at an island address is reported
                # as the island area instead.
                if (any(place in result[1] for place in shimanami)
                        and result[4] == '指定なし'):
                    result[4] = '島嶼部'

                start_1st, end_1st = map(date_conv, hospital[7].split('〜'))
                periods = [(start_1st, end_1st)]

                # A second reception period is present.
                if len(hospital) >= 9:
                    start_2nd, end_2nd = map(date_conv, hospital[8].split('〜'))

                    if end_1st == start_2nd:
                        # Back-to-back periods are merged into one.
                        periods = [(start_1st, end_2nd)]
                    else:
                        periods.append((start_2nd, end_2nd))

                for start, end in periods:
                    writer.writerow(
                        result + [week, today + start, today + end])

                cache = hospital[:6]

# メイン
if __name__ == '__main__':
    # Script entry point: fetch the region-selection page, submit the form
    # for the selected region, write the parsed result to result.csv via
    # scraping(), then aggregate and plot the reception hours.

    base_url = "http://www.qq.pref.ehime.jp/qq38/WP0805/RP080501BL.do"

    # Name of the hidden form field whose value changes on every page load.
    token_name = "org.apache.struts.taglib.html.TOKEN"

    # Seconds to wait for the server on each request; without a timeout a
    # stalled connection would hang the script indefinitely.
    timeout = 10

    # Form fields of the region-selection page.  Only index 3 is ticked --
    # presumably the Imabari area this script targets; verify against the
    # live form if the region list changes.
    region_requests = {
        "blockCd[3]": "",
        "forward_next": "",
        "torinBlockDetailInfo.torinBlockDetail[0].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[1].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[2].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[3].blockCheckFlg": "1",
        "torinBlockDetailInfo.torinBlockDetail[4].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[5].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[6].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[7].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[8].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[9].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[10].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[11].blockCheckFlg": "0",
        "torinBlockDetailInfo.torinBlockDetail[12].blockCheckFlg": "0",
    }

    # One session for both requests so the cookies set by the first page are
    # sent along with the form submission; closed automatically afterwards.
    with requests.Session() as session:
        resp = session.get(base_url, timeout=timeout)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, "html.parser")

        # The token and the form action are both read from the fetched page.
        token_input = soup.find("input", attrs={"name": token_name})
        form = soup.find("form", attrs={"name": "wp0805Form"})
        if token_input is None or form is None:
            raise RuntimeError(
                "region-selection page does not contain the expected "
                "token field or form")

        region_requests[token_name] = token_input.get("value")
        url = urljoin(base_url, form.get("action"))

        # Submit the region selection.
        resp = session.post(url, data=region_requests, timeout=timeout)
        resp.raise_for_status()

    # Parse the result page into result.csv.
    scraping(resp.content)

    # Load the CSV back and derive the duration of every reception period.
    df = pd.read_csv('result.csv', parse_dates=['開始時刻', '終了時刻'])
    df['時間'] = df['終了時刻'] - df['開始時刻']
    df.sort_values(['開始時刻', '診療科目'], ascending=[True, True], inplace=True)

    # Font able to render the Japanese labels.
    plt.rcParams['font.family'] = 'IPAPGothic'

    # Total hours per hospital, one column per department.
    table = pd.pivot_table(
        df,
        values='時間',
        index=['医療機関'],
        columns=['診療科目'],
        fill_value=pd.Timedelta(hours=0),
        aggfunc='sum')

    table.plot.barh()

    # Keep only the entries whose department is unspecified.
    df2 = df[df['診療科目'] == '指定なし']

    # Total hours per weekday, one column per hospital.
    tb1 = pd.pivot_table(
        df2,
        values='時間',
        index=['曜日'],
        columns=['医療機関'],
        fill_value=pd.Timedelta(hours=0),
        aggfunc='sum')

    tb1.plot.barh(subplots=True, layout=(3, 3), figsize=(20, 20))

    # Hours per calendar day, to spot days that are not fully covered.
    df3 = df2.loc[:, ['医療機関', '診療科目', '開始時刻', '時間']].set_index('開始時刻')
    daily = df3.resample('D').sum()
    daily.plot.barh()

    # Without this the figures are never displayed when run as a script.
    plt.show()