INIAD Syllabusのスクレイピング

"""Scrape the INIAD (Toyo University) syllabus search results page."""

import requests
from bs4 import BeautifulSoup

# Syllabus search endpoint; the results come back as an HTML table.
url = "https://g-sys.toyo.ac.jp/syllabus/result"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

# Search-form fields posted to the endpoint.
# perPage=1000 requests all results in a single page.
payload = {
    "year": "2020",
    "course": "0",
    "faculty": "1F000-2017",  # presumably selects the INIAD faculty -- confirm against the form
    "department": "",
    "course_name": "",
    "instructor": "",
    "language": "",
    "keyword1": "",
    "condition1": "",
    # NOTE(review): "conjuntion" looks misspelled, but it may be the exact
    # field name the server expects -- verify against the search form HTML
    # before "fixing" it.
    "conjuntion": "",
    "keyword2": "",
    "condition2": "",
    "perPage": "1000",
}

# timeout added so a stalled server cannot hang the script indefinitely.
r = requests.post(url, headers=headers, data=payload, timeout=30)

# Fail fast on HTTP errors (4xx/5xx) instead of parsing an error page.
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

import re

# One list of cell texts per table row; for data rows, the last cell is
# replaced by the two numeric IDs pulled from the syllabus-button onclick
# handlers (Japanese and English versions).
data = []

for tr in soup.find("table", id="result_table").find_all("tr"):
    cells = tr.find_all(["th", "td"])

    # Cell text with inner line breaks kept, and " / " separators
    # normalized to newlines so they can be split into columns later.
    tds = [cell.get_text("\n", strip=True).replace(" / ", "\n") for cell in cells]

    # Only data rows end in a <td> (header rows end in <th>).
    # Guarding on `cells` fixes a latent bug: the original tested the loop
    # variable `td` after the loop, which on an empty row still held the
    # previous row's last cell.
    if cells and cells[-1].name == "td":
        last = cells[-1]

        syllabus_jp = last.find("input", class_="btn_syllabus_jp")
        # Second number in the onclick argument list -- presumably the
        # syllabus ID (confirm against the page markup). Raw strings avoid
        # the invalid-escape warning that "\d+" raises on modern Python.
        jp = re.findall(r"\d+", syllabus_jp.get("onclick"))[1] if syllabus_jp else ""

        syllabus_en = last.find("input", class_="btn_syllabus_en")
        en = re.findall(r"\d+", syllabus_en.get("onclick"))[1] if syllabus_en else ""

        tds[-1] = f"{jp}\n{en}"

    data.append(tds)

import pandas as pd

# Raw table: one column per original table cell; cells may contain several
# newline-joined values.
df_temp = pd.DataFrame(data)

# Expand every column into its newline-separated sub-columns, then stitch
# them back together side by side.
split_cols = [df_temp[col].str.split("\n", expand=True) for col in df_temp.columns]
df = pd.concat(split_cols, axis=1)

# utf_8_sig (UTF-8 with BOM) lets Excel open the Japanese text correctly.
df.to_csv("data.csv", header=None, index=None, encoding="utf_8_sig")

# Sanity check: re-read the file just written.
# NOTE(review): the CSV was written *without* a header row, yet header=0
# treats the first data row as a header -- confirm whether header=None was
# intended here.
pd.read_csv("data.csv", header=0)

新型コロナウイルスの表をtesseractでスクレイピング

binary-star.net

code-graffiti.com

blog.machine-powers.net

qiita.com

ni4muraano.hatenablog.com

京都府

www.pref.kyoto.jp

f:id:imabari_ehime:20200511230455p:plain

愛知県

www.pref.aichi.jp

f:id:imabari_ehime:20200511230520p:plain

最後のその他の陽性者の状況の表だけが順番がずれるので取れない

テスト

日付 京都 愛知
5/10
5/11
5/12
5/13
5/14