requests-html

Requests-HTML: HTML Parsing for Humans (writing Python 3)! — requests-HTML v0.3.4 documentation

from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://www.jleague.jp/sp/club/sapporo/day/#player')

# URL抽出 ページ記載のままのリンク(相対パスを含む)
r.html.links

# URL抽出 絶対リンク
r.html.absolute_links

# CSSセレクタ
about = r.html.find('#about', first=True)

# xpath
r.html.xpath('a')

# テキスト検索
r.html.search('Python is a {} language')[0]

# テキスト
about.text

# 属性
about.attrs

# HTML
about.html

about.find('a')

# Javascriptレンダリング
r.html.render()

dockerインストール・splashインストール 2018

docker

docs.docker.com

sudo apt-get update
sudo apt-get install apt-transport-https ca-certificates curl software-properties-common
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
sudo apt-key fingerprint 0EBFCD88
sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable"
sudo apt-get install -y docker-ce

docs.docker.com

sudo groupadd docker
sudo usermod -aG docker $USER
# グループ変更の反映にはログアウトして再ログインが必要(または newgrp docker)

Installation — Splash 3.2 documentation

splash

docker pull scrapinghub/splash
docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash

http://0.0.0.0:8050/

techblog.scouter.co.jp

pip3 install scrapy-splash --user

teratail.com

settings.py

AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60