imabari.hateblo.jp
サンプル
github.com
Webページ(リンク先抽出)
import requests
from bs4 import BeautifulSoup
import re
import datetime
from urllib.parse import urljoin
# Page that is scraped for the link to the data file.
url = "http://example.jp"

# Browser-like User-Agent header sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

# Without a timeout, requests waits indefinitely on an unresponsive server.
# 30 seconds is an arbitrary but generous limit; adjust as needed.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# Find the first <a> whose text starts with the expected label.
# `string=` is the current spelling of the deprecated `text=` argument.
tag = soup.find("a", string=re.compile("^○○○○○○○○"))

# Fail with an explicit message instead of an AttributeError on None, or a
# `link` that silently falls back to the page URL when href is missing.
href = tag.get("href") if tag is not None else None
if not href:
    raise ValueError(f"no matching link with an href was found on {url}")

# Resolve a possibly relative href against the page URL.
link = urljoin(url, href)
PDF
import camelot
import pandas as pd
# Extract tables from every page of the PDF referenced by `link`.
# The remaining keyword arguments are camelot parsing options, passed
# through unchanged.
tables = camelot.read_pdf(
    link,
    pages="all",
    split_text=True,
    strip_text="\n",
    line_scale=40,
)

# Collect the DataFrame of each extracted table, then stack them into one.
dfs = [extracted.df for extracted in tables]
df = pd.concat(dfs)
XLSX
import pandas as pd

# Load the Excel file referenced by `link` into a DataFrame.
df = pd.read_excel(link)
import pandas as pd

# CSV variant: load the file referenced by `link` into a DataFrame using
# pandas' CSV reader.
df = pd.read_csv(link)