Running a Twitter bot on a schedule on Heroku

pipenv install python-twitter
pipenv install html5lib
pipenv install beautifulsoup4
pipenv install apscheduler
pipenv install requests

pipenv run pip freeze > requirements.txt
echo "python-3.6.7" > runtime.txt
echo "clock: python clock.py" > Procfile

git config --global user.name "xxxxx"
git config --global user.email "xxxxx@gmail.com"

rm -rf .git

git init
git add .
git commit -m "my first commit"

heroku create xxxxx

git push heroku master
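
Note that a clock process type is not started automatically; after the push, scale it to one dyno:

heroku ps:scale clock=1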

clock.py

from apscheduler.schedulers.blocking import BlockingScheduler
import os

sched = BlockingScheduler()

# every 30 minutes
# @sched.scheduled_job('interval', minutes=30)
# every hour
# @sched.scheduled_job('interval', hours=1)

# at a fixed time of day
@sched.scheduled_job('cron', minute=0, hour=7)
def dam_job():
    os.system('python tamagawa-dam.py')


@sched.scheduled_job('cron', minute=30, hour=7)
def hospital_job():
    os.system('python today-hospital.py')

sched.start()
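
The job scripts themselves (tamagawa-dam.py, today-hospital.py) aren't included in these notes. As a rough sketch only, assuming python-twitter for posting, credentials stored as Heroku config vars, and a hypothetical URL and selector, such a script might look like this:

import os

import requests
import twitter
from bs4 import BeautifulSoup

# credentials stored as Heroku config vars (heroku config:set CONSUMER_KEY=... etc.)
api = twitter.Api(
    consumer_key=os.environ['CONSUMER_KEY'],
    consumer_secret=os.environ['CONSUMER_SECRET'],
    access_token_key=os.environ['ACCESS_TOKEN_KEY'],
    access_token_secret=os.environ['ACCESS_TOKEN_SECRET'])

r = requests.get('http://example.com/dam')  # hypothetical URL
r.raise_for_status()

soup = BeautifulSoup(r.content, 'html.parser')
text = soup.select_one('#status').get_text(strip=True)  # hypothetical selector

api.PostUpdate(text[:140])  # stay inside the tweet length limit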
To update the bot from another machine, install the Heroku CLI, clone the app, and push changes:

sudo snap install --classic heroku

heroku login -i

heroku git:clone -a xxxxx

git init
git add .
git commit -m "my first commit"
git push heroku master

heroku logout

Headless Selenium in Python (Firefox / Chrome)

Firefox

Download geckodriver from its GitHub releases page (github.com) and copy it onto the PATH:

sudo cp geckodriver /usr/local/bin
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup

options = Options()
options.headless = True
driver = webdriver.Firefox(options=options)

driver.get('http://www.yahoo.co.jp')

# browser interaction goes here

driver.save_screenshot("ss.png")

html = driver.page_source
driver.quit()

soup = BeautifulSoup(html, 'html.parser')

Chrome

ChromeDriver is available from its download page (sites.google.com).

from selenium import webdriver
from bs4 import BeautifulSoup

options = webdriver.ChromeOptions()
options.headless = True
driver = webdriver.Chrome(options=options)

driver.get('http://www.yahoo.co.jp')

# browser interaction goes here

driver.save_screenshot("ss.png")

html = driver.page_source
driver.quit()

soup = BeautifulSoup(html, 'html.parser')
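
On current Selenium (4.x) the set_headless() / chrome_options style above is deprecated or removed. A minimal equivalent sketch, assuming Selenium 4 with a chromedriver it can locate on its own:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless=new')  # use plain '--headless' on older Chrome builds
driver = webdriver.Chrome(options=options)

driver.get('http://www.yahoo.co.jp')
html = driver.page_source
driver.quit()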

ConnectionResetError 104 with headless Chrome on Heroku

The job fails with this error roughly once every three days:

Traceback (most recent call last):
  File "today-hospital.py", line 17, in <module>
    driver = webdriver.Chrome(chrome_options=options)
  File "/app/.heroku/python/lib/python3.6/site-packages/selenium/webdriver/chrome/webdriver.py", line 75, in __init__
    desired_capabilities=desired_capabilities)
  File "/app/.heroku/python/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 154, in __init__
    self.start_session(desired_capabilities, browser_profile)
  File "/app/.heroku/python/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 243, in start_session
    response = self.execute(Command.NEW_SESSION, parameters)
  File "/app/.heroku/python/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py", line 310, in execute
    response = self.command_executor.execute(driver_command, params)
  File "/app/.heroku/python/lib/python3.6/site-packages/selenium/webdriver/remote/remote_connection.py", line 466, in execute
    return self._request(command_info[0], url, body=data)
  File "/app/.heroku/python/lib/python3.6/site-packages/selenium/webdriver/remote/remote_connection.py", line 490, in _request
    resp = self._conn.getresponse()
  File "/app/.heroku/python/lib/python3.6/http/client.py", line 1331, in getresponse
    response.begin()
  File "/app/.heroku/python/lib/python3.6/http/client.py", line 297, in begin
    version, status, reason = self._read_status()
  File "/app/.heroku/python/lib/python3.6/http/client.py", line 258, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "/app/.heroku/python/lib/python3.6/socket.py", line 586, in readinto
    return self._sock.recv_into(b)
ConnectionResetError: [Errno 104] Connection reset by peer

A similar report on stackoverflow.com suggested the fixes tried below.

# 1/30: tried adding a wait, but the error recurred
driver.implicitly_wait(10)

# 2/2: added the following
options.add_argument('--no-sandbox')
driver.set_page_load_timeout(60)

# 2/5: switched to PhantomJS

Hopefully the errors stop; if not, I'll probably switch to PhantomJS. 2/9: no errors since switching to PhantomJS.
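
Since the reset looks transient, another option (not tried in these notes) is to simply retry session creation, which is where the traceback above originates:

import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def make_driver(retries=3, wait=10):
    # retry Chrome startup; the ConnectionResetError above is raised here
    options = Options()
    options.headless = True
    options.add_argument('--no-sandbox')

    for attempt in range(retries):
        try:
            return webdriver.Chrome(options=options)
        except ConnectionResetError:
            if attempt == retries - 1:
                raise
            time.sleep(wait)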

Status of class closures due to influenza

import requests
from bs4 import BeautifulSoup

url = 'http://www.city.imabari.ehime.jp/gakukyou/info_influenza/'
r = requests.get(url)

# proceed only if the request succeeded

if r.status_code == requests.codes.ok:

    soup = BeautifulSoup(r.content, 'html.parser')

    for i in soup.select('#main_container > div > table > tbody > tr'):
        print('-' * 30)
        print('インフルエンザによる学級閉鎖等の状況(市立小・中学校)')  # heading: class closures due to influenza (municipal elementary / junior high schools)
        print(i.get_text(' ', strip=True))
        print(url)
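
A variant (my addition, not in the original) that raises on HTTP errors instead of skipping silently, so failures show up in the Heroku logs:

import requests
from bs4 import BeautifulSoup

url = 'http://www.city.imabari.ehime.jp/gakukyou/info_influenza/'

r = requests.get(url, timeout=30)
r.raise_for_status()  # raise an exception on HTTP errors

soup = BeautifulSoup(r.content, 'html.parser')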

Related articles on Python scraping

(Link embeds from the original post; only the domains survive: adventar.org, qiita.com, vaaaaaanquish.hatenablog.com, orangain.hatenablog.com, blog.mursts.jp, www.yoheim.net, dev.classmethod.jp, www.mediaplex.co.jp, kiito.hatenablog.com)

Scraping general-election data with Python

Scraping AKB48 general-election data

import csv
from urllib.request import urlopen

from bs4 import BeautifulSoup

url = 'http://www.akb48.co.jp/sousenkyo_45th/result.php'
html = urlopen(url).read()

soup = BeautifulSoup(html, 'html.parser')

with open('akb48.csv', 'w') as fw:
    writer = csv.writer(fw, dialect='excel', lineterminator='\n')

    # header row
    writer.writerow(['rank', 'akb_names', 'akb_count'])

    for i in soup.select('#main_area > div.frameFix > div > ul > li'):

        # rank: strip the surrounding 第/位, then convert to int
        rank = int(
            i.select_one('p.result_rank').get_text(strip=True).strip('第位'))

        # vote count: strip the trailing 票, drop the commas, then convert to int
        count = int(
            i.select_one('p.result_count').get_text(
                strip=True).rstrip('票').replace(',', ''))

        # name
        name = i.select_one('h4.result_name').get_text(strip=True)

        writer.writerow([rank, name, count])
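
A quick way to sanity-check the output (my addition) is to read the CSV back with csv.DictReader, using the header names written above:

import csv

with open('akb48.csv', newline='') as fr:
    for row in csv.DictReader(fr):
        print(row['rank'], row['akb_names'], row['akb_count'])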

Scraping 2017 general-election data (Tokyo)

from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

url = 'http://www.asahi.com/senkyo/senkyo2017/kaihyo/A13.html'
html = urlopen(url).read()

soup = BeautifulSoup(html, 'html.parser')

with open('hr2017_tokyo.csv', 'w') as fw:
    writer = csv.writer(fw, dialect='excel', lineterminator='\n')

    # header row
    writer.writerow(
        ['num', 'name', 'age', 'count', 'party', 'status', 'previous'])

    for num, i in enumerate(
            soup.select('div.areabox > table > tbody > tr'), start=1):

        # surname
        sei = i.select_one('td.namae > div > span.sei').get_text(strip=True)

        # given name
        mei = i.select_one('td.namae > div > span.mei').get_text(strip=True)

        # age: strip the surrounding parentheses, then convert to int
        age = int(
            i.select_one('td.namae > div > span.age').get_text(
                strip=True).strip('()'))

        # vote count: drop the commas, then convert to int
        count = int(
            i.select_one('td.num > div').contents[0].strip().replace(',', ''))

        # party
        party = i.select_one('td.party > div').get_text(strip=True)

        # incumbent / newcomer status
        status = i.select_one('td.status > div').get_text(strip=True)

        # times elected: strip the trailing 回, then convert to int
        previous = int(
            i.select_one('td.tosenkaisu > div').get_text(
                strip=True).rstrip('回'))

        writer.writerow(
            [num, sei + ' ' + mei, age, count, party, status, previous])

Permutations with repeated elements using Python multithreading

import concurrent.futures


def permutations(head, rest):

    if len(rest) == 0:
        return [head]

    else:
        res = []

        # use a set to drop duplicate values, then sort
        data = sorted(set(rest))

        for i in data:

            # copy the list
            restx = rest[:]

            # remove the chosen element
            restx.remove(i)

            headx = head + [i]
            res += permutations(headx, restx)

        return res


if __name__ == '__main__':

    data = [1, 1, 1, 1, 2, 2, 2, 3, 3, 4]
    data_list = []

    for i in set(data):

        temp = data[:]
        temp.remove(i)

        data_list.append([[i], temp])

    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:

        futures = [
            executor.submit(permutations, head, rest)
            for head, rest in data_list
        ]

        for future in concurrent.futures.as_completed(futures):
            res = future.result()

            print(res)
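
One caveat to the title: because of the GIL, ThreadPoolExecutor gives no real parallelism for CPU-bound recursion like this; swapping in concurrent.futures.ProcessPoolExecutor (same interface) is the usual fix. Either way, the result can be checked against itertools: for this data the number of distinct permutations is 10! / (4! * 3! * 2! * 1!) = 12,600.

import itertools

data = [1, 1, 1, 1, 2, 2, 2, 3, 3, 4]

# generate all 10! orderings and deduplicate with a set;
# fine at this size, but it does enumerate 3,628,800 tuples
unique = set(itertools.permutations(data))
print(len(unique))  # 12600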