今治地区の救急病院をスクレイピングし曜日別・医療機関別に集計する
seleniumを使わずにrequestsでpost送信
Firefoxの開発ツールでpost内容を確認
ネットワークの中からメソッドPOSTを選び、パラメーターのフォームデータを確認

"blockCd[3]": "",
"forward_next": "",
"torinBlockDetailInfo.torinBlockDetail[0].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[1].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[2].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[3].blockCheckFlg": "1",
"torinBlockDetailInfo.torinBlockDetail[4].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[5].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[6].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[7].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[8].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[9].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[10].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[11].blockCheckFlg": "0",
"torinBlockDetailInfo.torinBlockDetail[12].blockCheckFlg": "0",

# ここだけ変わる
"org.apache.struts.taglib.html.TOKEN" : "06136a1c3e9558818de3ee18fc48393f"

# HTMLのソースから「org.apache.struts.taglib.html.TOKEN」を検索すると同じ値があるのでスクレイピング
# -*- coding: utf-8 -*-
import csv
import datetime
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Seconds to wait for the server on each HTTP request.
REQUEST_TIMEOUT = 10

# Name of the hidden form field carrying the per-session token.
TOKEN_FIELD = "org.apache.struts.taglib.html.TOKEN"

# Only rows whose id is exactly "1", "2" or "3" are data rows.  The pattern is
# anchored because BeautifulSoup applies regex filters with search semantics,
# so an unanchored '1|2|3' would also accept ids such as "10" or "23".
ROW_ID_PATTERN = re.compile(r'^[123]$')

# Town names identifying an island-area address.
SHIMANAMI = ('吉海町', '宮窪町', '伯方町', '上浦町', '大三島町', '関前')


def date_conv(t_time):
    """Convert a clock string to an offset from midnight.

    Args:
        t_time: Text of the form ``'H:M'``, optionally prefixed with
            ``'翌日'`` (next day).

    Returns:
        A ``datetime.timedelta`` of the given hours and minutes, plus one
        day when the ``'翌日'`` prefix is present.

    Raises:
        ValueError: If the text is not two integers separated by ``':'``.
    """
    next_day = datetime.timedelta(days=0)
    if t_time.startswith('翌日'):
        t_time = t_time.replace('翌日', '')
        next_day = datetime.timedelta(days=1)

    hours, minutes = map(int, t_time.split(':'))
    return datetime.timedelta(hours=hours, minutes=minutes) + next_day


def scraping(html, csv_path='result.csv'):
    """Parse the search-result page and write one CSV row per duty period.

    Args:
        html: Markup of the result page (anything BeautifulSoup accepts).
        csv_path: Destination file; defaults to ``'result.csv'``.

    The file is written as UTF-8 with ``newline=''`` so that the Japanese
    text round-trips through ``pandas.read_csv`` on every platform and the
    csv module controls the line terminator itself.
    """
    soup = BeautifulSoup(html, "html.parser")

    with open(csv_path, 'w', encoding='utf-8', newline='') as fw:
        writer = csv.writer(fw, dialect='excel', lineterminator='\n')
        writer.writerow(
            ['医療機関', '住所', 'TEL(昼)', 'TEL(夜)', '診療科目', '曜日', '開始時刻', '終了時刻'])

        tables = soup.find_all(
            'table', class_='comTblGyoumuCommon', summary='検索結果一覧を表示しています。')

        for day_table in tables:
            # First six fields of the previous row, reused by id "2" rows.
            cache = []

            # The first cell of each table holds "<date> <weekday>".
            date, week = day_table.td.get_text(strip=True).split()
            today = datetime.datetime.strptime(date, '%Y年%m月%d日')

            for tr in day_table.find_all('tr', id=ROW_ID_PATTERN):
                data = tr.get_text('\t', strip=True).split()

                # Expected field order after normalisation:
                # name, address, 'TEL(昼)', day phone, 'TEL(夜)', night phone,
                # department, reception hours [, second reception hours]
                if tr['id'] == '1':
                    # The leading token is not part of the hospital record.
                    hospital = data[1:]
                elif tr['id'] == '2':
                    # Row omits the hospital fields; take them from the
                    # previous row.
                    hospital = cache + data
                else:  # id == '3'
                    hospital = data

                # Insert a blank night phone where the page lists none, so
                # the positions of the later fields stay fixed.
                if hospital[4] != 'TEL(夜)':
                    hospital.insert(4, 'TEL(夜)')
                    hospital.insert(5, None)

                # Drop the two label tokens, leaving
                # [name, address, day phone, night phone, department].
                result = hospital[:7]
                result.remove('TEL(昼)')
                result.remove('TEL(夜)')

                # Island-area hospitals with no specific department are
                # reported under their own category.
                if result[4] == '指定なし' and any(
                        town in result[1] for town in SHIMANAMI):
                    result[4] = '島嶼部'

                start_1st, end_1st = map(date_conv, hospital[7].split('〜'))

                if len(hospital) < 9:
                    # Only one reception period.
                    periods = [(start_1st, end_1st)]
                else:
                    start_2nd, end_2nd = map(date_conv,
                                             hospital[8].split('〜'))
                    if end_1st == start_2nd:
                        # Back-to-back periods are merged into one.
                        periods = [(start_1st, end_2nd)]
                    else:
                        periods = [(start_1st, end_1st),
                                   (start_2nd, end_2nd)]

                for start, end in periods:
                    writer.writerow(result + [week, today + start, today + end])

                cache = hospital[:6]


def fetch_result_page(base_url, region_requests):
    """Submit the region-selection form and return the result page bytes.

    Args:
        base_url: URL of the region-selection page.
        region_requests: Form fields to post; the token field is added to
            this dict in place.

    Raises:
        requests.RequestException: On a network error, timeout or HTTP
            error status.
        RuntimeError: If the token input or the form is missing from the
            region-selection page.
    """
    with requests.Session() as session:
        resp = session.get(base_url, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, "html.parser")

        # The token changes on every visit, so it is read from the page
        # rather than hard-coded.
        token_input = soup.find("input", attrs={"name": TOKEN_FIELD})
        form = soup.find("form", attrs={"name": "wp0805Form"})
        if token_input is None or form is None:
            raise RuntimeError(
                "region selection page has no token input or wp0805Form form")

        region_requests[TOKEN_FIELD] = token_input.get("value")
        url = urljoin(base_url, form.get("action"))

        resp = session.post(url, data=region_requests,
                            timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        return resp.content


def analyze(csv_path='result.csv'):
    """Load the scraped CSV and plot duty hours by hospital, weekday and day.

    Args:
        csv_path: CSV file produced by ``scraping``.
    """
    df = pd.read_csv(csv_path, parse_dates=['開始時刻', '終了時刻'])
    df['時間'] = df['終了時刻'] - df['開始時刻']
    df = df.sort_values(['開始時刻', '診療科目'], ascending=[True, True])

    # Font capable of rendering the Japanese labels.
    plt.rcParams['font.family'] = 'IPAPGothic'

    # Total hours per hospital, one column per department.
    table = pd.pivot_table(
        df,
        values='時間',
        index=['医療機関'],
        columns=['診療科目'],
        fill_value=pd.Timedelta(hours=0),
        aggfunc=np.sum)
    table.plot.barh()

    # Keep only rows with no specific department.
    df2 = df[df['診療科目'] == '指定なし']

    # Total hours per weekday, one column per hospital.
    tb1 = pd.pivot_table(
        df2,
        values='時間',
        index=['曜日'],
        columns=['医療機関'],
        fill_value=pd.Timedelta(hours=0),
        aggfunc=np.sum)
    tb1.plot.barh(subplots=True, layout=(3, 3), figsize=(20, 20))

    # Total hours per calendar day, to spot days with missing coverage.
    # Only the duration column is summed; the text columns carry no total.
    daily = df2.set_index('開始時刻')[['時間']].resample('D').sum()
    daily.plot.barh()

    # Needed when run as a script; figures are not displayed otherwise.
    plt.show()


def main():
    """Scrape the result page into result.csv, then plot the summaries."""
    base_url = "http://www.qq.pref.ehime.jp/qq38/WP0805/RP080501BL.do"

    region_requests = {
        "blockCd[3]": "",
        "forward_next": "",
    }
    # Thirteen region check flags; only index 3 is selected.
    region_requests.update({
        f"torinBlockDetailInfo.torinBlockDetail[{i}].blockCheckFlg":
            "1" if i == 3 else "0"
        for i in range(13)
    })

    html = fetch_result_page(base_url, region_requests)
    scraping(html)
    analyze()


if __name__ == '__main__':
    main()