"""JFLの試合結果から得点ランキング作成 (build a goal-scorer ranking from JFL match results)."""

#%%
import csv
from urllib.parse import urljoin
import time

import requests
from bs4 import BeautifulSoup


def cleaning(info, team, data):
    """Normalize scraped goal rows into flat CSV-ready records.

    Args:
        info: list of match-level fields (match no., season, matchday, ...).
        team: team name the goals belong to.
        data: iterable of table-row elements; each exposes ``select('th, td')``
            whose cells expose ``get_text(strip=True)``.

    Returns:
        List of rows shaped ``info + [team] + [minute, player, ...]`` with the
        minute as an int and the '(PK)' marker stripped from the player name.
    """
    result = []

    for trs in data:

        temp = [cell.get_text(strip=True) for cell in trs.select('th, td')]

        # The minute cell looks like '45分' or '45+2分' (stoppage time).
        # Parse it arithmetically: eval() on scraped web text is a
        # code-injection risk and unnecessary for simple addition.
        temp[0] = sum(int(part) for part in temp[0].rstrip('分').split('+'))

        # Drop the penalty-kick marker from the player name.
        temp[1] = temp[1].replace('(PK)', '').strip()

        result.append(info + [team] + temp)

    return result


def scraping(n, url):
    """Scrape one match-detail page and return its goal rows.

    Args:
        n: sequential match number, prepended to every row.
        url: URL of the match detail page.

    Returns:
        List of goal rows (home rows followed by away rows), or ``None`` when
        the request fails or the page has no goal table.
    """
    # timeout guards against the script hanging forever on a dead server
    r = requests.get(url, timeout=10)

    if r.status_code != requests.codes.ok:
        return None

    soup = BeautifulSoup(r.content, 'html5lib')

    # Season / matchday, e.g. ['2018', '第1節']
    score_season = soup.select_one(
        'div.score-header > h2.score-meta > span.score-season').get_text(
            strip=True).split()

    # Keep only the matchday number (strip the '第' / '節' wrappers)
    score_season[1] = score_season[1].strip('第節')

    # Date and kickoff time
    score_date = soup.select_one(
        'div.score-header > h2.score-meta > span.score-date').get_text(
            strip=True).split()

    # Team names
    score_table = soup.select_one('table.score-table')
    home_team = score_table.select_one('th.score-team1').get_text(strip=True)
    away_team = score_table.select_one('th.score-team2').get_text(strip=True)

    # Match-level fields shared by every goal row of this game
    game_info = [n] + score_season + score_date + [home_team, away_team]

    for heading in soup.select('div.section > h3'):

        # Only the section headed '得 点' (goals) holds the score tables
        if heading.text == '得 点':

            home_rows = heading.parent.select(
                'div.score-frame > div.score-left > table > tbody > tr')
            away_rows = heading.parent.select(
                'div.score-frame > div.score-right > table > tbody > tr')

            return (cleaning(game_info, home_team, home_rows)
                    + cleaning(game_info, away_team, away_rows))

    return None

if __name__ == "__main__":

    url = 'http://www.jfl.or.jp/jfl-pc/view/s.php?a=1270&f=2018A001_spc.html'

    r = requests.get(url)

    if r.status_code == requests.codes.ok:

        soup = BeautifulSoup(r.content, 'html5lib')

        with open('result.csv', 'w') as fw:
            writer = csv.writer(fw, dialect='excel', lineterminator='\n')
            writer.writerow(
                ['試合', 'シーズン', '節', '日付', '時刻', 'ホーム', 'アウェイ', '所属チーム', '時間', '背番号', '名前'])
            
            n = 0

            for link in soup.select('td.detail-link > a'):

                # 詳細のリンクか確認
                if link.text == '詳細':
                    
                    n += 1

                    spc_url = urljoin(url, link.get('href'))

                    # 詳細をスクレイピング
                    score_data = scraping(n, spc_url)

                    # CSVに保存
                    if score_data:
                        writer.writerows(score_data)

                    # 1秒待機
                    time.sleep(1)
#%%
# Local import: this analysis cell is the only part of the file using pandas,
# and the original file never imported it (NameError as written).
import pandas as pd

df = pd.read_csv('result.csv')
df['得点'] = 1  # one point per goal row

# Goals per player/team. Column names must match the CSV header written
# above: '名前' (player) and '所属チーム' (team) — the original referenced
# non-existent '選手名' / 'チーム名' columns and raised KeyError.
pv = df.pivot_table(
    values='得点', index=['名前', '所属チーム'], aggfunc='sum', fill_value=0)
pv = pv.reset_index()

# Drop own goals — they are recorded under 'オウンゴール', not a player name
pv.drop(pv.index[pv['名前'] == 'オウンゴール'], inplace=True)

# Competition-style ranking: tied scorers share the lowest rank number
pv['順位'] = pv['得点'].rank(ascending=False, method='min')

# Sort by rank, then team, then player (all ascending)
pv.sort_values(
    ['順位', '所属チーム', '名前'], ascending=[True, True, True], inplace=True)

pv.set_index('順位', inplace=True)

# '順位' is now the index, so select only the remaining columns — the
# original also selected '順位' as a column here, which raises KeyError.
df_goal = pv.loc[:, ['名前', '所属チーム', '得点']]

df_goal.to_excel('jfl_goal_ranking.xlsx')