import datetime
import os
import re
import time
from urllib.parse import urljoin
import requests
import tweepy
from bs4 import BeautifulSoup
# Twitter API credentials are read from the environment so they never
# appear in source control.  A missing variable raises KeyError at import
# time, which is intentional: the bot cannot run without them.
consumer_key = os.environ["CONSUMER_KEY"]
consumer_secret = os.environ["CONSUMER_SECRET"]
access_token = os.environ["ACCESS_TOKEN"]
access_token_secret = os.environ["ACCESS_TOKEN_SECRET"]
def scraping(html):
    """Parse the hospital-search results page and tweet today's on-duty list.

    Finds the results table whose date equals today (JST), normalizes each
    hospital row, sorts the rows by a derived department class, and posts the
    formatted text to Twitter.  If the combined text exceeds 140 characters it
    is split into a mainland tweet and an islands tweet, 30 s apart.
    """
    soup = BeautifulSoup(html, "html.parser")
    # The site marks each day's results table with this class/summary pair.
    tables = soup.find_all(
        "table", class_="comTblGyoumuCommon", summary="検索結果一覧を表示しています。"
    )
    JST = datetime.timezone(datetime.timedelta(hours=+9), "JST")
    dt_now = datetime.datetime.now(JST)
    for table in tables:
        # First cell holds the date and the weekday, e.g. "2020年1月1日 (水)".
        date, week = table.td.get_text(strip=True).split()
        day = datetime.datetime.strptime(date, "%Y年%m月%d日")
        if day.date() == dt_now.date():
            result = []
            # Name/address carried over to rows that omit them (id="2"
            # continuation rows); seeded with a default hospital.
            dprev = ["今治市医師会市民病院", "今治市別宮町7-1-40"]
            # Relevant rows carry id="1", "2" or "3".
            for trs in table.find_all("tr", id=[1, 2, 3]):
                id = trs.get("id")  # NOTE(review): shadows the builtin id()
                data = list(trs.stripped_strings)
                if id == "1" and data[0] == "今治市":
                    # Drop the leading city-name cell.
                    del data[0]
                # Remove up to two "TEL…" label/number pairs so the layout
                # becomes [name, address, subject, time, ...].
                for _ in range(2):
                    if len(data) > 4 and data[2].startswith("TEL"):
                        del data[2:4]
                if id == "2":
                    # Continuation row: reuse previous name/address.
                    data = dprev[:2] + data
                dprev = data
                # data[0]=name, data[1]=address, data[2]=subject; data[3:]
                # holds one or two "HH:MM~HH:MM"-style time ranges.
                hospital = dict(zip(["name", "address", "subject"], data[0:4]))
                hospital["class"] = 8  # default sort class (lowest priority)
                # Flatten the time cells into individual endpoints.
                t = [j for i in data[3:] for j in i.split("~")]
                hospital["time"] = "~".join([t[0], t[-1]])
                # Two non-contiguous slots -> show both ranges on two lines.
                if len(t) == 4 and t[1] != t[2]:
                    hospital["time"] = "\n".join(["~".join(t[:2]), "~".join(t[2:])])
                # Department-based sort class; order of checks matters because
                # "外科" is tested before "内科".
                if "外科" in hospital["subject"]:
                    hospital["class"] = 1
                elif "内科" in hospital["subject"]:
                    hospital["class"] = 2
                elif hospital["subject"] == "小児科":
                    hospital["class"] = 4
                elif hospital["subject"] == "指定なし":
                    hospital["class"] = 0
                    hospital["subject"] = ""
                # Island-district addresses override the class and label so
                # those hospitals sort last and are grouped together.
                match = re.search("(吉海町|宮窪町|伯方町|上浦町|大三島町|関前)", hospital["address"])
                if match:
                    hospital["class"] = 9
                    hospital["subject"] = "島嶼部"
                if hospital["subject"]:
                    hospital["subject"] = f'【{hospital["subject"]}】'
                hospital["text"] = "\n".join(
                    [hospital["subject"], hospital["name"], hospital["time"]]
                ).strip()
                result.append(hospital)
            result.sort(key=lambda x: (x["class"], x["time"]))
            twit_date = f"{date} {week}"
            # class < 9: mainland hospitals; class 9: island hospitals.
            twit_riku = "\n\n".join(
                [i["text"] for i in result if i["class"] < 9]
            ).strip()
            twit_sima = "\n\n".join(
                [i["text"] for i in result if i["class"] > 8]
            ).strip()
            twit_all = "\n\n".join([twit_date, twit_riku, twit_sima]).strip()
            auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
            auth.set_access_token(access_token, access_token_secret)
            api = tweepy.API(auth)
            if len(twit_all) <= 140:
                api.update_status(twit_all)
            else:
                # Too long for one tweet: mainland first, then islands.
                api.update_status("\n\n".join([twit_date, twit_riku]).strip())
                time.sleep(30)
                api.update_status("\n\n".join([twit_date, twit_sima]).strip())
            # Today's table found and posted — no need to scan further days.
            break
if __name__ == "__main__":
    base_url = "http://www.qq.pref.ehime.jp/qq38/WP0805/RP080501BL"
    # Build the search form: only block index 3 is checked (presumably the
    # Imabari block — confirm against the site's form).
    payload = {"_blockCd": "", "forward_next": ""}
    for idx in range(13):
        flag = "1" if idx == 3 else "0"
        payload[f"torinBlockDetailInfo.torinBlockDetail[{idx}].blockCheckFlg"] = flag
    with requests.Session() as session:
        landing = session.get(base_url)
        form_soup = BeautifulSoup(landing.content, "html.parser")
        # Carry the page's CSRF token into the search POST.
        token = form_soup.find("input", attrs={"name": "_csrf"}).get("value")
        payload["_csrf"] = token
        form = form_soup.find("form", attrs={"id": "_wp0805Form"})
        search_url = urljoin(base_url, form.get("action"))
        response = session.post(search_url, data=payload)
        scraping(response.content)
import datetime
from urllib.parse import urljoin
import pathlib
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Current date in Japan Standard Time (UTC+9); dt_str names the output CSV.
JST = datetime.timezone(datetime.timedelta(hours=+9), "JST")
dt_now = datetime.datetime.now(JST)
dt_str = dt_now.date().isoformat()
def scraping(html):
    """Parse every results table into uniform rows and write data/<date>.csv.

    NOTE(review): this file appears to be two concatenated scripts — this
    definition shadows the earlier scraping() above.

    Each <tr id="1|2|3"> becomes one record.  Continuation rows (id="2")
    inherit name/address/TEL from the previous row, and missing TEL or
    daytime columns are padded with None so every row matches the DataFrame
    header before the CSV is written.
    """
    soup = BeautifulSoup(html, "html.parser")
    tables = soup.find_all(
        "table", class_="comTblGyoumuCommon", summary="検索結果一覧を表示しています。"
    )
    result = []
    # Seed carry-over for the first continuation row: name, address,
    # day-TEL label/number, night-TEL label and a None placeholder.
    before = ["今治市医師会市民病院", "今治市別宮町7−1−40", "TEL(昼)", "0898-22-7611", "TEL(夜)", None]
    for table in tables:
        # First cell holds the date string and the weekday.
        date, week = table.td.get_text(strip=True).split()
        for trs in table.find_all("tr", id=[1, 2, 3]):
            id = trs.get("id")  # NOTE(review): shadows the builtin id()
            data = list(trs.stripped_strings)
            if id == "3":
                temp = [id, date, week] + data
            elif id == "2":
                # Continuation row: prepend the previous row's name/address/TEL.
                temp = [id, date, week] + before + data
            else:
                # id == "1": drop the leading city-name cell.
                temp = [id, date, week] + data[1:]
            # Pad optional columns so indices line up with the header below
            # (NOTE(review): assumes temp has at least 11 items by here).
            if temp[5] != "TEL(昼)":
                temp = temp[:5] + ["TEL(昼)", None] + temp[5:]
            if temp[7] != "TEL(夜)":
                temp = temp[:7] + ["TEL(夜)", None] + temp[7:]
            if not temp[10].startswith("0"):
                # Presumably daytime slots start with a leading zero hour
                # (e.g. "09:00…"); anything else means the daytime column is
                # absent — TODO confirm against the live page.
                temp = temp[:10] + [None] + temp[10:]
            result.append(temp)
            # This row's name/address/TEL carry over to a following id="2".
            before = temp[3:9]
    df = pd.DataFrame(
        result,
        columns=[
            "ID",
            "日付",
            "曜日",
            "病院名",
            "住所",
            "昼",
            "TEL(昼)",
            "夜",
            "TEL(夜)",
            "科目",
            "日中",
            "夜間",
        ],
    )
    # "昼"/"夜" hold only the literal TEL labels — drop them, keep the numbers.
    df.drop(columns=["昼", "夜"], inplace=True)
    p = pathlib.Path("data", f"{dt_str}.csv")
    p.parent.mkdir(parents=True, exist_ok=True)
    # utf_8_sig = UTF-8 with BOM; presumably so Excel opens the Japanese
    # headers correctly.
    df.to_csv(p, index=False, encoding="utf_8_sig")
if __name__ == "__main__":
    base_url = "http://www.qq.pref.ehime.jp/qq38/WP0805/RP080501BL"
    # Search form: only block index 3 is checked (presumably the Imabari
    # block — confirm against the site's form).
    payload = {
        "_blockCd": "",
        "forward_next": "",
        **{
            f"torinBlockDetailInfo.torinBlockDetail[{n}].blockCheckFlg": (
                "1" if n == 3 else "0"
            )
            for n in range(13)
        },
    }
    with requests.Session() as session:
        landing = session.get(base_url)
        doc = BeautifulSoup(landing.content, "html.parser")
        # The search POST must echo the page's CSRF token.
        payload["_csrf"] = doc.find("input", attrs={"name": "_csrf"}).get("value")
        action = doc.find("form", attrs={"id": "_wp0805Form"}).get("action")
        reply = session.post(urljoin(base_url, action), data=payload)
        scraping(reply.content)