某サイトの買取金額はスクレイピング対策として数字画像で表示されているため、OCRを使って金額をスクレイピングする

  1. 買取金額表示用の数字画像を取得
  2. OCRで画像を数字の文字列に変換
  3. 各桁の切り抜き位置（background-position）から数字番号（文字列内の位置）を取得
  4. 数字番号をOCR結果の文字列で引いて実際の数字に変換

CSSをパースして値を取り出すのは今回が初めてだった

pypi.org

teratail.com

teratail.com

pip install easyocr
pip install cssutils
import pathlib
from urllib.parse import urljoin

import cssutils
import easyocr
import requests
from bs4 import BeautifulSoup

# Scraping

# Browser-like User-Agent string sent with page requests.
_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"

# Default HTTP request headers used by the fetch helpers.
headers = {"User-Agent": _USER_AGENT}


def fetch_soup(url, parser="html5lib", timeout=30):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.
        parser: Name of the BeautifulSoup parser backend to use.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled server.

    Returns:
        The parsed ``BeautifulSoup`` object.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    # Hand over the raw bytes (not r.text) so the parser sees the page as sent.
    return BeautifulSoup(r.content, parser)


def fetch_file(url, fn, timeout=30):
    """Download *url* and save the response body to the file *fn*.

    Args:
        url: Address of the resource to download.
        fn: Destination file path; missing parent directories are created.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled server.

    Returns:
        A ``pathlib.Path`` pointing at the written file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    p = pathlib.Path(fn)
    p.parent.mkdir(parents=True, exist_ok=True)

    # Send the same browser-like headers that fetch_soup uses, so the file
    # request is not distinguishable from the page request by User-Agent.
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    p.write_bytes(r.content)

    return p

# No URL is given here; set the target page URL before running.
url = ""

soup = fetch_soup(url)

# Keep a prettified copy of the fetched page on disk for inspection.
pathlib.Path("test.html").write_text(soup.prettify(), encoding="utf-8")

# encrypt-num: find the digit sprite image referenced by the page's inline
# CSS and download it.

css = soup.select_one("div.topic > style").get_text(strip=True)

sheet = cssutils.parseString(css)

img_url = ""

for rule in sheet:
    if rule.type != rule.STYLE_RULE:
        continue
    # "prop" rather than "property" so the builtin is not shadowed.
    for prop in rule.style:
        if prop.name == "background-image":
            # Drop the url(...) wrapper, then any surrounding whitespace or
            # quotes left around the address.
            img_url = (
                prop.value.replace("url(", "").replace(")", "").strip().strip("\"'")
            )
            break
    # Stop at the first rule that yields an address; without this the outer
    # loop keeps scanning and a later rule would overwrite the match.
    if img_url:
        break

if not img_url:
    # Otherwise urljoin(url, "") returns the page URL itself and the HTML
    # page would be saved as the PNG.
    raise ValueError("no background-image URL found in the inline <style> block")

link = urljoin(url, img_url)
print(link)

p = fetch_file(link, "encrypt-num.png")


# Run OCR over the downloaded sprite, restricted to the characters 0-9.
reader = easyocr.Reader(["en"], gpu=False)
ocr_texts = reader.readtext("encrypt-num.png", detail=0, allowlist="0123456789")

# Only the first recognised text fragment is used. A "," is appended as one
# extra lookup character after the digits (presumably the sprite ends with a
# thousands separator that the digit-only allowlist cannot read; confirm).
result = f"{ocr_texts[0]},"

print(result)

# Print each product's title followed by its decoded price(s).
for product in soup.select("div.item.item-thumbnail.item-product-list"):

    print(product.select_one("h4.item-title").get_text(strip=True))

    for price_box in product.select("div.item-price.encrypt-price"):

        # First pass: turn every character span into an index into `result`.
        indices = []
        for span in price_box.select("span.encrypt-num"):
            position = cssutils.parseStyle(span.get("style"))["background-position"]
            # strip("-px") removes any leading/trailing '-', 'p' and 'x'
            # characters, leaving the bare pixel offset.
            offset = int(position.strip("-px"))
            # Offset / 10 is the character's position in the OCR string
            # (presumably each sprite cell is 10px wide; confirm).
            indices.append(int(offset / 10))

        # Second pass: look the characters up and print the price.
        print("".join(result[k] for k in indices))

    print("-" * 20)