Cloud9でPython3・BeautifulSoup4・Selenium・PhantomJSを使ってスクレイピング

[f:id:imabari_ehime:20170601231528p:plain]

c9.io

git clone https://xxx@bitbucket.org/xxx/imabari119.git

cd imabari119
echo "# My project's README" >> README.md

git add README.md
git commit -m "Initial commit"
git push -u origin master

Python3設定

Edit > Code Formatting > Open Language & Formatting Preferences...

Python Support

Python Version: Python3

Format Code on Save: ON

Custom Code Formatter: yapf -i "$file"

# Python3に切替
sudo mv /usr/bin/python /usr/bin/python2
sudo ln -s /usr/bin/python3 /usr/bin/python
python --version

# Phantomjsをインストール
npm install phantomjs

sudo -H pip3 install scrapy
sudo -H pip3 install yapf
sudo -H pip3 install beautifulsoup4
sudo -H pip3 install selenium
# -*- coding: utf-8 -*-
"""
えひめ医療情報ネットの今治市地区の当番医案内から医療機関のリストを取得
"""

import csv
import re
from selenium import webdriver

from bs4 import BeautifulSoup

# Headless browser: the PhantomJS binary that `npm install phantomjs` placed
# under the workspace's node_modules directory (a Linux path, not Windows).
driver = webdriver.PhantomJS(
    '/home/ubuntu/workspace/node_modules/.bin/phantomjs')

try:
    # Browser operations: open the top page, click the menu button, select
    # the block checkbox "id_blockCd000004", then move on to the next page.
    driver.get("http://www.qq.pref.ehime.jp/qq38/qqport/kenmintop/")
    driver.find_element_by_css_selector(
        "div.group2 > input.each-menu-citizen__button-hover").click()
    driver.find_element_by_id("id_blockCd000004").click()
    driver.find_element_by_name("forward_next").click()

    # Screenshot (uncomment when debugging what the browser actually shows)
    # driver.save_screenshot("ss.png")

    # HTML of the result page, parsed later in the script.
    html = driver.page_source
finally:
    # Always shut the browser down, even when a step above raises (e.g. an
    # element is not found); otherwise the PhantomJS process is left running.
    driver.quit()

# Place names used to recognise addresses in the island area (島嶼部).
shimanami = ['吉海町', '宮窪町', '伯方町', '上浦町', '大三島町', '関前']

# Hospital rows are the <tr> elements whose id is exactly "1", "2" or "3".
# BeautifulSoup applies a compiled pattern with search(), so the pattern must
# be anchored: an unanchored '1|2|3' would also accept ids such as "10", which
# none of the branches below handle and which would re-emit a stale row.
hospital_row_id = re.compile(r'\A[123]\Z')

# newline='' lets the csv module control line endings itself, as the csv
# documentation requires for file objects passed to csv.writer.
with open('imabari119_cal.csv', 'wt', encoding='utf8', newline='') as fw:

    writer = csv.writer(fw, lineterminator='\n', dialect=csv.excel)

    soup = BeautifulSoup(html, 'html.parser')

    # One result table per listed day -- NOTE(review): presumed from the date
    # being read once per table; confirm against the live page.
    tables = soup.find_all(
        'table', class_='comTblGyoumuCommon', summary='検索結果一覧を表示しています。')

    for table in tables:

        # Date: whitespace-separated tokens of the table's first cell
        # (per the column list below, presumably the date and the weekday).
        date = table.td.get_text(strip=True).split()

        # Previous hospital's row, reused by id="2" rows.
        mae = []

        for hospital in table.find_all('tr', id=hospital_row_id):

            temp = hospital.get_text('|', strip=True).split('|')
            row_id = hospital['id']

            # Columns: date, weekday, hospital name, address, daytime,
            # daytime TEL, night, night TEL, department, reception hours
            if row_id == '1':
                # The first text fragment of the row is dropped.
                result = date + temp[1:]

            elif row_id == '2':
                # Reuse four fields (indices 2-5) from the previous hospital.
                result = date + mae[2:6] + temp

            else:  # row_id == '3'
                result = date + temp

            # Hospitals without a night-time phone: insert two blank columns
            # so the later columns stay aligned.
            if result[6] != 'TEL(夜)':
                result.insert(6, None)
                result.insert(7, None)

            # More than ten fields means the reception hours arrived in
            # several fragments (same hospital for daytime and night): join
            # them and drop the duplicated '17:30' boundary.
            if len(result) > 10:
                jikan = ''.join(result[9:]).replace('17:3017:30~', '')
                result[9] = jikan

            # If the address is in the island area, change an unspecified
            # department ('指定なし') to '島嶼部'.
            for j in shimanami:
                if j in result[3]:
                    if result[8] == '指定なし':
                        result[8] = '島嶼部'
                    break

            # Remember this hospital for a following id="2" row.
            mae = result[:10]

            # Write the row (first ten columns only).
            writer.writerow(result[:10])