読者です 読者をやめる 読者になる 読者になる

netkeibaのスクレイピング

okwave.jp

Python3ならできるんだけどPython2ではCSV保存のところでエラーがでてわからない

from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

# Jockey result listing on netkeiba; {0} is the 1-based page number.
base_url = 'http://db.netkeiba.com/?pid=jockey_detail&id=00663&page={0}'
data = []

for num, i in enumerate([1, 2]):

    url = base_url.format(i)
    html = urlopen(url).read()
    soup = BeautifulSoup(html, 'html5lib')

    # Append the header row only once, from the first page (num == 0).
    if not num:
        header = [x.get_text().strip() for x in soup.select('#contents_liquid > table > thead > tr > th')]
        data.append(header)

    # One list of cell texts per body row.
    tr = soup.select('#contents_liquid > table > tbody > tr')
    td = [[x.get_text().strip() for x in y.select('td')] for y in tr]

    data.extend(td)

# newline='' is required by the csv module in Python 3 (avoids blank lines on
# Windows); an explicit encoding keeps the Japanese text platform-independent.
with open('horse.csv', 'w', newline='', encoding='utf-8') as fw:
    writer = csv.writer(fw, lineterminator='\n')
    writer.writerows(data)

ヘッダーとデータを両方取っておいて1ページのみヘッダー追加

from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

# Jockey result listing on netkeiba; {0} is the 1-based page number.
base_url = 'http://db.netkeiba.com/?pid=jockey_detail&id=00663&page={0}'
data = []

for num, i in enumerate([1, 2]):

    url = base_url.format(i)
    html = urlopen(url).read()
    soup = BeautifulSoup(html, 'html5lib')

    # Grab every row (header th + data td alike) so td[0] is the header row.
    tr = soup.find('div', {'id':'contents_liquid'}).find_all('tr')
    td = [[x.get_text().strip() for x in y.find_all(['th','td'])] for y in tr]

    if num:
        # Pages after the first: drop the repeated header row.
        data.extend(td[1:])
    else:
        data.extend(td)

# newline='' is required by the csv module in Python 3 (avoids blank lines on
# Windows); an explicit encoding keeps the Japanese text platform-independent.
with open('horse.csv', 'w', newline='', encoding='utf-8') as fw:
    writer = csv.writer(fw, lineterminator='\n')
    writer.writerows(data)

teratail.com

from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

# Pedigree ("blood") table of a single horse page on netkeiba.
url = 'http://db.netkeiba.com/horse/1994103997/'
html = urlopen(url).read()
soup = BeautifulSoup(html, 'html5lib')

# Each cell becomes its own one-column CSV row.
td = [ [i.get_text().strip()] for i in soup.select('#db_main_box > div.db_main_deta > div > div.db_prof_area_02 > div > dl > dd > table > tbody > tr > td')]

# newline='' is required by the csv module in Python 3 (avoids blank lines on
# Windows); an explicit encoding keeps the Japanese text platform-independent.
with open('blood.csv', 'w', newline='', encoding='utf-8') as fw:
    writer = csv.writer(fw, lineterminator='\n')
    writer.writerows(td)

teratail.com

from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

# Race card (shutuba table) for one race on netkeiba.
url = 'http://race.netkeiba.com/?pid=race_old&id=c201604020801'
html = urlopen(url).read()
soup = BeautifulSoup(html, 'html5lib')

tr = soup.select('#shutuba > diary_snap > table > tbody > tr')

# newline='' is required by the csv module in Python 3 (avoids blank lines on
# Windows); an explicit encoding keeps the Japanese text platform-independent.
with open('race.csv', 'w', newline='', encoding='utf-8') as fw:

    writer = csv.writer(fw, lineterminator='\n')

    # The first three rows are header/decoration rows — skip them.
    for y in tr[3:]:

        td = [x.get_text().strip() for x in y.select('td')]

        # Drop the "your mark" column.
        del td[2]

        # Drop the trailing "favorite horse (register / your memo)" columns.
        del td[-2:]

        writer.writerow(td)

teratail.com

from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

# Race result page on netkeiba: extract the race name plus the
# slash-separated summary line (course / weather / going / start time).
url = 'http://db.netkeiba.com/race/201508050411/'
html = urlopen(url).read()

soup = BeautifulSoup(html, 'html5lib')

race = soup.select_one('#main > div > div > div > diary_snap > div > div > dl > dd > h1').get_text().strip()

span = soup.select_one(
    '#main > div > div > div > diary_snap > div > div > dl > dd > p > diary_snap_cut > span').get_text().strip()

# "key : value / key : value / ..." -> [[key, value], ...]; the first segment
# (course/distance) has no colon, so data[0][0] holds the whole segment.
data = [[j.strip() for j in i.split(':', 1)] for i in span.split('/')]

# newline='' is required by the csv module in Python 3 (avoids blank lines on
# Windows); an explicit encoding keeps the Japanese text platform-independent.
with open('race.csv', 'w', newline='', encoding='utf-8') as fw:
    writer = csv.writer(fw, lineterminator='\n')
    writer.writerow([race, data[0][0], data[1][1], data[2][1], data[3][1]])