公開交通取締りのPDFからスクレイピング
!pip install tabula-py
# Scrape the public traffic-enforcement schedule PDF published by the
# Ehime prefectural police, normalize the table, and export it as CSV.
import datetime
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabula import read_pdf

# --- Locate the PDF link on the index page --------------------------------
url = "https://www.police.pref.ehime.jp/kotsusidou/koukaitorishimari.htm"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}
# timeout so a stalled server cannot hang the script forever
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")
# bs4 deprecated the ``text=`` keyword in favour of ``string=``.
anchor = soup.find("a", string=re.compile("公開交通取締"))
if anchor is None:
    raise RuntimeError("PDF link not found on the index page")
link = urljoin(url, anchor.get("href"))
print(link)

# --- Extract the table(s) from the PDF ------------------------------------
# tabula-py returns a *list* of DataFrames when multiple_tables=True (the
# default for pages="all"); concatenate so downstream code sees one frame.
tables = read_pdf(
    link,
    pages="all",
    lattice=True,
    pandas_options={"names": ["day", "time", "type", "place", "route"]},
)
df = pd.concat(tables, ignore_index=True) if isinstance(tables, list) else tables

# Drop repeated header rows ("時 間" is the in-table column title).
df = df[df["time"] != "時 間"]

# Keep everything; uncomment the filter to restrict to Imabari only.
df1 = df.copy()
# df1 = df[df["place"].str.contains("今治")].copy()

# "15日" -> 15
df1["day"] = df1["day"].str.rstrip("日").astype(int)

# The PDF gives only the day of month, so fill in today's year and month.
# NOTE(review): this is wrong around a month boundary if the PDF already
# covers the next month — confirm against the published schedule.
today = datetime.date.today()
df1["date"] = pd.to_datetime(df1["day"], format="%d").apply(
    lambda ts: ts.replace(year=today.year, month=today.month)
)

# Split "HH:MM~HH:MM" into start/end columns and trim surrounding spaces.
df_time = df1["time"].str.split("~", expand=True)
df_time[0] = df_time[0].str.strip()
df_time[1] = df_time[1].str.strip()


def time_conv(x):
    """Convert "HH:MM" (or a bare "HH") into a timedelta from midnight."""
    parts = list(map(int, x.split(":")))
    parts.append(0)  # default minutes when only the hour is given
    return datetime.timedelta(hours=parts[0], minutes=parts[1])


# DataFrame.applymap is deprecated since pandas 2.1 (removed in 3.0);
# prefer DataFrame.map, falling back for older pandas versions.
if hasattr(df_time, "map"):
    df2 = df_time.map(time_conv)
else:
    df2 = df_time.applymap(time_conv)
df2.columns = ["start", "end"]

# Merge the date with the time offsets and order rows chronologically.
df3 = pd.concat([df1, df2], axis=1).drop("time", axis=1)
df3["start"] = df3["date"] + df3["start"]
df3["end"] = df3["date"] + df3["end"]
df3.sort_values(["date", "start", "end", "route", "place"], inplace=True)
df3.reset_index(inplace=True)

# Explicit .copy() so the string-formatting assignments below do not
# trigger SettingWithCopyWarning on a view of df3.
df4 = df3.loc[:, ["date", "start", "end", "type", "place", "route"]].copy()
df4["date"] = df4["date"].dt.strftime("%Y-%m-%d")
df4["start"] = df4["start"].dt.strftime("%H:%M")
df4["end"] = df4["end"].dt.strftime("%H:%M")
df4.to_csv("result.csv")