- 買取金額表示用の数字画像を取得
- OCRで画像から数字に変換
- 切り抜き位置から数字番号取得
- 数字に変換
CSSからのパースはじめてした
pip install easyocr
pip install cssutils
import pathlib
import re
from urllib.parse import urljoin

import cssutils
import easyocr
import requests
from bs4 import BeautifulSoup

# Scraping
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30

# Horizontal size in pixels of one glyph cell in the sprite image; a span's
# background-position offset divided by this gives the glyph index.
DIGIT_WIDTH_PX = 10


def fetch_soup(url, parser="html5lib", timeout=REQUEST_TIMEOUT):
    """Download *url* and return it parsed as a ``BeautifulSoup`` tree.

    Args:
        url: Page address to request.
        parser: Parser name handed to ``BeautifulSoup``.
        timeout: Seconds passed to ``requests.get`` as the request timeout.

    Returns:
        The parsed document.

    Raises:
        requests.HTTPError: If the response status indicates an error.
        requests.RequestException: On connection problems or timeout.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    return BeautifulSoup(r.content, parser)


def fetch_file(url, fn, timeout=REQUEST_TIMEOUT):
    """Download *url* and store the response body at path *fn*.

    Missing parent directories of *fn* are created. The same request headers
    as ``fetch_soup`` are sent so both requests look alike to the server.

    Args:
        url: Address of the file to download.
        fn: Destination file name or path.
        timeout: Seconds passed to ``requests.get`` as the request timeout.

    Returns:
        ``pathlib.Path`` of the written file.

    Raises:
        requests.HTTPError: If the response status indicates an error.
        requests.RequestException: On connection problems or timeout.
    """
    p = pathlib.Path(fn)
    p.parent.mkdir(parents=True, exist_ok=True)

    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p


def _css_url_target(value):
    """Return the target of a CSS ``url(...)`` value without wrapper or quotes."""
    target = value.strip()
    if target.lower().startswith("url(") and target.endswith(")"):
        target = target[4:-1]
    # The target may be written as url("...") or url('...'); drop the quotes.
    return target.strip().strip("\"'")


def _sprite_index(style, cell_width=DIGIT_WIDTH_PX):
    """Return the glyph index encoded in an inline style's background-position.

    Only the leading (horizontal) offset is used and its sign is ignored, so
    ``-30px``, ``-30px 0`` and ``30px`` all map to index 3 for a 10px cell.

    Raises:
        ValueError: If the background-position does not start with a number.
    """
    position = cssutils.parseStyle(style)["background-position"]
    match = re.match(r"\s*-?(\d+)", position)
    if match is None:
        raise ValueError(f"unsupported background-position: {position!r}")
    return int(match.group(1)) // cell_width


# No URL is provided here; set it to the target page before running.
url = ""

soup = fetch_soup(url)

# Save the prettified HTML to a file
with open("test.html", mode="w", encoding="utf-8") as fw:
    fw.write(soup.prettify())

# encrypt-num: locate the sprite image referenced by the inline stylesheet
style_tag = soup.select_one("div.topic > style")
if style_tag is None:
    raise RuntimeError("no <style> element found under div.topic")

css = style_tag.get_text(strip=True)
sheet = cssutils.parseString(css)

img_url = ""

for rule in sheet:
    if rule.type == rule.STYLE_RULE:
        # Find the background-image property of this rule. The break only
        # leaves the property loop, so when several rules declare one, the
        # last rule in the sheet wins.
        for prop in rule.style:
            if prop.name == "background-image":
                img_url = _css_url_target(prop.value)
                break

if not img_url:
    # Without this check urljoin() would return the page URL itself and the
    # HTML page would be saved and OCR'd as if it were the sprite image.
    raise RuntimeError("no background-image found in the inline stylesheet")

link = urljoin(url, img_url)
print(link)

p = fetch_file(link, "encrypt-num.png")

# Read the glyph order from the sprite image with OCR.
reader = easyocr.Reader(["en"], gpu=False)
ocr_lines = reader.readtext(str(p), detail=0, allowlist="0123456789")
if not ocr_lines:
    raise RuntimeError("OCR did not recognise any digits in the sprite image")

# A comma is appended so the index just past the recognised digits resolves
# to "," (presumably the sprite's last cell is a thousands separator; confirm
# against the image).
result = ocr_lines[0] + ","
print(result)

for tag in soup.select("div.item.item-thumbnail.item-product-list"):
    title = tag.select_one("h4.item-title").get_text(strip=True)
    print(title)

    for price in tag.select("div.item-price.encrypt-price"):
        # Each span shows one glyph of the price via its sprite offset.
        number = [
            _sprite_index(i.get("style")) for i in price.select("span.encrypt-num")
        ]
        print("".join([result[j] for j in number]))

    print("-" * 20)