Beautiful Soup Documentation — Beautiful Soup 4.4.0 documentation

ウェブページをHTMLで保存し、サーバーへのアクセス回数を減らす

初回アクセス時にHTMLファイルを保存

from urllib.request import urlopen
from bs4 import BeautifulSoup

# Placeholder: replace with the address of the page to fetch.
url = 'アドレス'

# Download the raw bytes of the page. The with-block closes the HTTP
# response (and its underlying socket) even if read() raises.
with urlopen(url) as response:
    html = response.read()

soup = BeautifulSoup(html, 'html.parser')
# Alternative parser:
# soup = BeautifulSoup(html, 'html5lib')

# Save the prettify()-formatted markup so that later runs can parse the
# local file instead of hitting the server again.
with open('test.html', mode='w', encoding='utf-8') as fw:
    fw.write(soup.prettify())

requestsを使う場合

import requests
from bs4 import BeautifulSoup

# Needed when links on the page are relative addresses:
# from urllib.parse import urljoin
# urljoin(url, 'index.html')

url = 'http://www.example.com/'

# Send a browser-like User-Agent with the request.
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'
}

# requests waits indefinitely by default; an explicit timeout (seconds)
# keeps the script from hanging on an unresponsive server.
r = requests.get(url, headers=headers, timeout=10)

# Alternative to the status check below: raise on 4xx/5xx responses.
# r.raise_for_status()

if r.status_code == requests.codes.ok:

    # r.content is the undecoded body (bytes).
    soup = BeautifulSoup(r.content, 'html.parser')
    # Alternative parser:
    # soup = BeautifulSoup(r.content, 'html5lib')

    # Save the prettify()-formatted markup so that later runs can parse
    # the local file instead of hitting the server again.
    with open('test.html', mode='w', encoding='utf-8') as fw:
        fw.write(soup.prettify())

２回目以降はHTMLファイルを読み込み

from bs4 import BeautifulSoup

# From the second run onward, parse the locally saved copy instead of
# requesting the page again. The with-block closes the file handle once
# parsing is done (the original left it open).
with open('test.html', encoding='utf-8') as fr:
    soup = BeautifulSoup(fr, 'html.parser')
    # Alternative parser:
    # soup = BeautifulSoup(fr, 'html5lib')

抽出コマンド

タグで抽出

# Get elements by tag name (all matches).
# General form: first argument is the tag name, second is an attribute
# filter. The quoted Japanese words are placeholders for "tag name" and
# "attribute".
soup.find_all('タグ名', '属性')

# A plain string as the second positional argument is matched against the
# CSS class (per the Beautiful Soup docs), so ("p", "title") looks for
# <p class="title">.
soup.find_all("title")
soup.find_all("p", "title")
soup.find_all("a")
soup.find_all(id="link2")
# class_ (trailing underscore) is used because `class` is a Python keyword.
soup.find_all("a", class_="sister")

# Several attribute filters at once: as keyword arguments, or as one
# attrs dictionary.
soup.find_all("a", class_="link", href="/link")
soup.find_all("a", attrs={"class": "link", "href": "/link"})


# Attribute values can also be filtered with a compiled regular expression.
import re
soup.find_all(href=re.compile("elsie"))
soup.find_all(href=re.compile("elsie"), id='link1')
soup.find_all(class_=re.compile("itl"))

# Passing a list of tag names matches any of them.
soup.find_all(['th','td'])


# Get an element by tag name (first match only).
soup.find('タグ名', '属性')

# string= filters on the text content rather than on tags/attributes.
import re
soup.find(string=re.compile("sisters"))

CSS セレクタで抽出

# Get elements by CSS selector (all matches).
# The quoted Japanese text is a placeholder for "CSS selector".
soup.select("CSSセレクタ")

# Tag name, and the pseudo-class form. The pseudo-class needs its leading
# colon: the original "p nth-of-type(3)" is not a valid selector.
soup.select("title")
soup.select("p:nth-of-type(3)")

# Descendant combinator (space): tags anywhere beneath other tags.
soup.select("body a")
soup.select("html head title")

# Child combinator (>): tags directly beneath other tags.
soup.select("head > title")
soup.select("p > a")
soup.select("p > a:nth-of-type(2)")
soup.select("p > #link1")
soup.select("body > a")

# Sibling combinators: ~ (any following sibling), + (next sibling).
soup.select("#link1 ~ .sister")
soup.select("#link1 + .sister")

# By CSS class.
soup.select(".sister")
soup.select("[class~=sister]")

# By id.
soup.select("#link1")
soup.select("a#link2")

# By exact attribute value.
soup.select('a[class="link1"]')
soup.select('a[id="link2"]')


# Get an element by CSS selector (first match only).
soup.select_one('CSSセレクタ')

# Get an attribute value (here: href of the first <a> tag).
soup.a.get("href")

# Get the string inside a tag.
soup.a.string

# Get all strings in the document, joined with the separator "|" and with
# leading/trailing whitespace stripped from each piece.
soup.get_text("|", strip=True)

Python 3.x - Beautiful Soup 4で簡単なスクレイピングができない…stringが効かない…｜teratail

文字取得

text
string
get_text()
contents

from bs4 import BeautifulSoup

# Minimal document: a <div> holding a bare text node followed by a <b> tag.
html = """
<div>Sample <b>Text</b></div>
"""

soup = BeautifulSoup(html, "html.parser")

div = soup.select_one("div")

# Text retrieval: .text gives all descendant text concatenated.
print('text: ', div.text)

# .string is None when the tag holds more than one child (here a text
# node plus <b>); it works on <b>, which holds a single string.
print('string: ', div.string)
print('b.string: ', div.b.string)

print('get_text: ', div.get_text())
print('contents:', div.contents)

print('-'*20)

# Three ways to get only the leading "Sample" text.

# 1) First child of the div is the bare text node.
print('contents.string:', div.contents[0].string)

# 2) .strings yields each text piece in document order.
#    (Named `strings`, not `str`, so the builtin is not shadowed.)
strings = list(div.strings)
print('strings: ', strings[0])

# 3) Join the pieces with a space, then split on whitespace; this also
#    drops the trailing space after "Sample".
words = div.get_text(' ').split()
print('get_text(sep): ', words[0])

# Remove the <b> tag from the tree; the div then holds a single string,
# so .string works again.
div.b.extract()
print('extract: ', div.string)

text:  Sample Text
string:  None
b.string:  Text
get_text:  Sample Text
contents: ['Sample ', <b>Text</b>]
--------------------
contents.string: Sample 
strings:  Sample 
get_text(sep):  Sample
extract:  Sample

# Remove a tag from the tree: the first element matching span.b.
# Indexing with [0] raises IndexError when nothing matches.
soup.select('span.b')[0].extract()

# Dump an element's instance attributes to inspect its internals.
import pprint

# The original called doc.find(...), but `doc` is never defined; the
# parsed document is named `soup` everywhere else in these notes.
elm = soup.find("span", class_="a")
pprint.pprint(vars(elm))

# Response headers
r.headers

# Request headers (the headers that were sent with the request)
r.request.headers

CSS セレクタの基本

www.htmq.com

imabari.hateblo.jp

XpathとCSSpathのテスト

ChromeのConsoleで

Xpath

$x('XXXXX')

CSSpath

inspect($$('XXXXX'))

でテストできる

XPATH

qiita.com

メモ

Pythonでスクレイピング　基本

ウェブページをHTMLで保存し、サーバーへのアクセス回数を減らす

初回アクセス時にHTMLファイルを保存

requestsを使う場合

２回目以降はHTMLファイルを読み込み

抽出コマンド

タグで抽出

CSS セレクタで抽出

文字取得

CSS セレクタの基本

XpathとCSSpathのテスト

Xpath

CSSpath

XPATH

おすすめ

ウェブページをHTMLで保存し、サーバーへのアクセス回数を減らす

初回アクセス時にHTMLファイルを保存

requestsを使う場合

２回目以降はHTMLファイルを読み込み

抽出コマンド

タグで抽出

CSSセレクタで抽出

文字取得

CSSセレクタの基本

XpathとCSSpathのテスト

Xpath

CSSpath

XPATH

おすすめ

CSS セレクタで抽出

CSS セレクタの基本