import csv
import time
from urllib.parse import urljoin
import gspread
import pandas as pd
import requests
from bs4 import BeautifulSoup
from oauth2client.service_account import ServiceAccountCredentials
from tqdm import tqdm

def cleaning(info, team, data):
    result = []
    for trs in data:
        # Extract goal time, shirt number and player name from each table row
        temp = [i.get_text(strip=True) for i in trs.select("th, td")]
        # Convert the goal time to an integer; handles stoppage time such as "45+2分"
        temp[0] = sum(int(t) for t in temp[0].rstrip("分").split("+"))
        # Strip the penalty marker "(PK)" from the player name
        temp[2] = temp[2].replace("(PK)", "").strip()
        result.append(info + [team] + temp)
    return result

def scraping(n, url):
    r = requests.get(url)
    if r.status_code == requests.codes.ok:
        soup = BeautifulSoup(r.content, "html5lib")
        # Matchday number, e.g. "第5節" -> "5"
        score_season = soup.select_one(
            "div.score-header > h2.score-meta > span.score-season"
        ).get_text(strip=True)
        score_season = score_season.strip("第節")
        # Date and kickoff time
        score_date = (
            soup.select_one("div.score-header > h2.score-meta > span.score-date")
            .get_text(strip=True)
            .split()
        )
        score_table = soup.select_one("table.score-table")
        home_team = score_table.select_one("th.score-team1").get_text(strip=True)
        away_team = score_table.select_one("th.score-team2").get_text(strip=True)
        game_info = [n, score_season] + score_date + [home_team, away_team]
        # Find the goal-scorer section and collect the rows for both teams
        for i in soup.select("div.section > h3"):
            if i.text == "得 点":
                table = i.parent.select(
                    "div.score-frame > div.score-left > table > tbody > tr"
                )
                home_data = cleaning(game_info, home_team, table)
                table = i.parent.select(
                    "div.score-frame > div.score-right > table > tbody > tr"
                )
                away_data = cleaning(game_info, away_team, table)
                score_data = home_data + away_data
                return score_data
    return None

if __name__ == "__main__":
    url = "http://www.jfl.or.jp/jfl-pc/view/s.php?a=1411&f=2019A001_spc.html"
    r = requests.get(url)
    if r.status_code == requests.codes.ok:
        soup = BeautifulSoup(r.content, "html5lib")
        with open("result.csv", "w", encoding="utf-8") as fw:
            writer = csv.writer(fw, dialect="excel", lineterminator="\n")
            writer.writerow(
                ["試合", "節", "日付", "時刻", "ホーム", "アウェイ", "チーム名", "時間", "背番号", "選手名"]
            )
            n = 0
            # Follow each 「詳細」 (match detail) link and scrape its scorers
            links = soup.select("td.detail-link > a")
            for link in tqdm(links):
                if link.text == "詳細":
                    n += 1
                    spc_url = urljoin(url, link.get("href"))
                    score_data = scraping(n, spc_url)
                    if score_data:
                        writer.writerows(score_data)
                    # Be polite to the server between requests
                    time.sleep(3)
    # Aggregate goals per player
    df = pd.read_csv("result.csv")
    df["得点"] = 1
    pv_goal = df.pivot_table(
        values="得点", index=["選手名", "チーム名", "背番号"], aggfunc=sum, fill_value=0
    )
    pv_goal = pv_goal.reset_index()
    # Own goals are excluded from the individual ranking
    pv_goal.drop(pv_goal.index[pv_goal["選手名"] == "オウンゴール"], inplace=True)
    pv_goal["背番号"] = pv_goal["背番号"].astype(int)
    # Rank by goals scored; tied players share the same (minimum) rank
    pv_goal["順位"] = pv_goal["得点"].rank(ascending=False, method="min").astype(int)
    jfl_2019 = [
        "Honda FC",
        "FC大阪",
        "ソニー仙台FC",
        "FC今治",
        "東京武蔵野シティFC",
        "MIOびわこ滋賀",
        "奈良クラブ",
        "ヴェルスパ大分",
        "ラインメール青森",
        "ヴィアティン三重",
        "テゲバジャーロ宮崎",
        "FCマルヤス岡崎",
        "ホンダロックSC",
        "流経大ドラゴンズ龍ケ崎",
        "松江シティFC",
        "鈴鹿アンリミテッド",
    ]
    # Assign each team a fixed ID from the list order, used only to order tied players
    team = {name: i for i, name in enumerate(jfl_2019, 1)}
    pv_goal["チームID"] = pv_goal["チーム名"].map(team)
    pv_goal.sort_values(
        ["順位", "チームID", "背番号"], ascending=[True, True, True], inplace=True
    )
    pv_goal.drop(["チームID", "背番号"], axis=1, inplace=True)
    pv_goal.set_index("順位", inplace=True)
    # Authorize with a service-account key and write the ranking to Google Sheets
    scope = [
        "https://spreadsheets.google.com/feeds",
        "https://www.googleapis.com/auth/drive",
    ]
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        "jfl-ranking.json", scope
    )
    gc = gspread.authorize(credentials)
    workbook = gc.open_by_key("xxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
    # Flatten the ranking into one list of cell values (順位, 選手名, チーム名, 得点 per row)
    rank_data = pv_goal.reset_index().values.flatten().tolist()
    worksheet = workbook.worksheet("得点ランキング")
    cell_list = worksheet.range("A2:D201")
    for cell, v in zip(cell_list, rank_data):
        cell.value = v
    worksheet.update_cells(cell_list)
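
As a side note, more recent versions of gspread (roughly 3.3 and later) can write the whole range in a single batch call with Worksheet.update(), instead of filling a cell_list by hand. This is only a minimal sketch, assuming the pv_goal and workbook objects built above and a reasonably recent gspread; it is not part of the script itself.

# Minimal sketch (assumption: gspread >= 3.3; same sheet name and start cell as above)
rows = pv_goal.reset_index().values.tolist()  # [[順位, 選手名, チーム名, 得点], ...]
worksheet = workbook.worksheet("得点ランキング")
worksheet.update("A2", rows)  # gspread 6.x prefers the values-first order: update(rows, "A2")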