読者です 読者をやめる 読者になる 読者になる

netkeibaのスクレイピング2

また新しいお題がでていたので

ja.stackoverflow.com

from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

# Scrape the start-list ("shutuba") table of one netkeiba race page and dump
# every row of the table (header cells included) to race.csv, one CSV row
# per <tr>.
url = 'http://race.netkeiba.com/?pid=race&id=c201605050211&mode=shutuba'

# Read the page inside a context manager so the HTTP response is always
# closed, even if read() raises (the original leaked the response object).
with urlopen(url) as resp:
    html = resp.read()

soup = BeautifulSoup(html, 'html5lib')

tr = soup.select('#shutuba > table > tbody > tr')

# get_text("\n", True) joins each cell's text fragments with newlines and
# strips whitespace/blank pieces — it replaces the manual
# splitlines/strip/filter/join dance in one call.
td = [[x.get_text("\n", True) for x in y.find_all(['th', 'td'])] for y in tr]

with open('race.csv', 'wt', encoding='utf-8') as fw:
    writer = csv.writer(fw, lineterminator='\n')
    writer.writerows(td)

ja.stackoverflow.com

from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

# From the same start-list table, pull the names out of the 7th cell of each
# runner row (span.h_name holds the main name; span.txt_smaller holds extra
# parenthesised names) and write one CSV row per runner to name.csv.
url = 'http://race.netkeiba.com/?pid=race&id=c201605050211&mode=shutuba'

# Context manager closes the HTTP response deterministically (the original
# left it open).
with urlopen(url) as resp:
    html = resp.read()

soup = BeautifulSoup(html, 'html5lib')

with open('name.csv', 'wt', encoding='utf-8') as fw:
    writer = csv.writer(fw, lineterminator='\n')

    # [1:] skips the table's header row, which has no runner name cell.
    for tr in soup.select('#shutuba > table > tbody > tr')[1:]:
        # 7th column holds the name block; equivalent to tr.select('td')[6].
        td = tr.select_one('td:nth-of-type(7)')

        names = [td.select_one('span.h_name > a').get_text().strip()]
        # The smaller text contains whitespace-separated tokens such as
        # "(name)"; strip the surrounding parentheses from each token.
        names.extend(i.strip('()')
                     for i in td.select_one('span.txt_smaller').get_text().split())

        writer.writerow(names)

imabari.hateblo.jp