近鉄

import pathlib
import pickle

import backoff
import requests
from bs4 import BeautifulSoup

# Kintetsu service-status page to poll.
url = "https://www.kintetsu.jp/unkou/unkou.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# Cache file holding the previously seen status message.
p = pathlib.Path("kintetsu.pickle")

# Last fetched message; empty string when no cache exists yet.
b = ""
if p.exists():
    with p.open(mode="rb") as fr:
        b = pickle.load(fr)

@backoff.on_exception(
    backoff.expo,
    requests.exceptions.RequestException,
    max_tries=5,
    # BUG FIX: connection errors and timeouts carry e.response == None, so
    # the original lambda raised AttributeError inside the retry handler.
    # Only give up early when we actually received a 4xx response.
    giveup=lambda e: e.response is not None and 400 <= e.response.status_code < 500,
)
def fetch_message(url):
    """Fetch the Kintetsu service-status page and return the notice text.

    Retries with exponential backoff (max 5 tries) on transient request
    failures; gives up immediately on 4xx responses.

    Raises:
        requests.exceptions.HTTPError: on a non-2xx final response.
    """
    # Explicit timeout so a hung server cannot stall the job forever.
    r = requests.get(url, headers=headers, timeout=30)
    r.raise_for_status()

    soup = BeautifulSoup(r.content, "lxml")

    # The status message sits in a legacy table layout on this page.
    message = soup.select_one("body > div > table > tr > td > font").get_text(
        strip=True
    )

    return message


s = fetch_message(url)

# Act only when the message differs from the cached one.
if s != b:
    # Trim to 140 chars, leaving room for the ellipsis character.
    truncated = s if len(s) < 140 else s[:138] + "…"
    print(truncated)

    # Persist the new message for the next run.
    with p.open(mode="wb") as fw:
        pickle.dump(s, fw)

Android機種変更

  • Googleアカウントのパスワード確認(旧端末だと再設定可能かも)
  • LINEのメッセージをバックアップ
  • LINEのメールアドレスを設定
  • LINEのパスワード確認
  • +メッセージのバックアップ
  • dアカウントのIDとパスワード確認
  • dアプリ関係はつかわない

milktea.skr.jp

iPhoneのほうがすごい楽

移行ツール

www.docomo.ne.jp

support.google.com

xperia.sony.jp

www.galaxymobile.jp

救急病院202211

import datetime
import pathlib
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup


base_url = "http://www.qq.pref.ehime.jp/qq38/WP0805/RP080501BL"

# Region-block checkboxes: only index 3 is enabled ("1"); presumably this
# selects the Imabari block, given the 今治市 handling later — confirm.
payload = {"_blockCd": "", "forward_next": ""}
payload.update(
    {
        f"torinBlockDetailInfo.torinBlockDetail[{i}].blockCheckFlg": (
            "1" if i == 3 else "0"
        )
        for i in range(13)
    }
)


with requests.Session() as s:
    # Load the region-selection page to obtain the CSRF token.
    first = s.get(base_url)
    soup = BeautifulSoup(first.content, "html.parser")

    # Copy the hidden CSRF field into the form payload.
    payload["_csrf"] = soup.find("input", attrs={"name": "_csrf"}).get("value")

    # Resolve the form's action attribute against the base URL.
    action = soup.find("form", attrs={"id": "_wp0805Form"}).get("action")
    url = urljoin(base_url, action)

    # Submit the search form; the response holds the result tables.
    r = s.post(url, data=payload)

# スクレイピング

soup = BeautifulSoup(r.content, "html.parser")

# Each result table holds one day's on-duty hospitals.
tables = soup.find_all("table", class_="comTblGyoumuCommon", summary="検索結果一覧を表示しています。")

result = []

for table in tables:
    # First cell carries the date and day-of-week as one string.
    date, week = table.td.get_text(strip=True).split()

    for trs in table.find_all("tr", id=[1, 2, 3]):
        cells = [list(td.stripped_strings) for td in trs.find_all("td", recursive=False)]
        row = [None] + cells + [date, week]
        # Keep only the trailing five fields so continuation rows (which
        # omit the leading hospital cell) align with full rows.
        result.append(row[-5:])

# Data wrangling

# Forward-fill continuation rows (rows that omitted hospital info inherit
# it from the row above), then label the columns.
# FIX: fillna(method="ffill") is deprecated (removed in pandas 3.0); use
# DataFrame.ffill() instead.
df0 = (
    pd.DataFrame(result)
    .ffill()
    .set_axis(["医療機関情報", "診療科目", "外来受付時間", "日付", "曜日"], axis=1)
)

# Parse "YYYY年M月D日" into a date object.
# FIX: raw string for the regex — "\d" in a plain string is a
# SyntaxWarning on Python 3.12+.
df0["date"] = pd.to_datetime(
    df0["日付"]
    .str.extract(r"(?P<year>\d{4})年(?P<month>\d{1,2})月(?P<day>\d{1,2})日")
    .astype(int)
).dt.date

# Hospital info: expand the list column and name the useful fields
# (indices 2 and 4 are the "TEL(昼)"/"TEL(夜)" label cells).
df1 = (
    df0["医療機関情報"]
    .apply(pd.Series)
    .drop([2, 4], axis=1)
    .rename(columns={0: "病院名", 1: "住所", 3: "TEL(昼)", 5: "TEL(夜)"})
)

# Medical subject
df2 = df0["診療科目"].apply(pd.Series).rename(columns={0: "診療科目"})

# Outpatient reception hours (day / night)
df3 = df0["外来受付時間"].apply(pd.Series).rename(columns={0: "日中", 1: "夜間"})

# Combine everything into one frame
df4 = pd.concat([df0[["日付", "曜日", "date"]], df1, df2, df3], axis=1)

# Subject-class ID base mapping
df4["診療科目ID"] = df4["診療科目"].map({"指定なし": 0, "内科": 2, "小児科": 7})

# FIX: `df4["col"].mask(..., inplace=True)` mutates through a column
# selection (chained assignment); under pandas Copy-on-Write this no
# longer writes back. Assign the result explicitly instead.

# Surgical departments
df4["診療科目ID"] = df4["診療科目ID"].mask(df4["診療科目"].str.contains("外科", na=False), 1)

# Internal-medicine departments
df4["診療科目ID"] = df4["診療科目ID"].mask(df4["診療科目"].str.contains("内科", na=False), 2)

# Island districts override the subject class
df4["診療科目ID"] = df4["診療科目ID"].mask(
    df4["住所"].str.contains("吉海町|宮窪町|伯方町|上浦町|大三島町|関前", na=False), 9
)

# Everything else
df4["診療科目ID"] = df4["診療科目ID"].fillna(8).astype(int)

# Daytime-window start time as a timedelta, for comparison and sorting.
df4["開始時間"] = pd.to_timedelta(df4["日中"].str.split("~").str[0] + ":00")

# Slots starting at or after 17:00 belong in the night column.
flag = df4["開始時間"] >= pd.Timedelta("17:00:00")

df4.loc[flag, "夜間"] = df3.loc[flag, "日中"]
df4.loc[flag, "日中"] = df3.loc[flag, "夜間"]

# BUG FIX: the original called
#   df4.sort_values(...).reset_index(drop=True, inplace=True)
# which ran reset_index inplace on a temporary copy, so the sorted order
# was silently discarded. Rebind df4 to keep the sort.
df4 = df4.sort_values(by=["date", "診療科目ID", "開始時間"]).reset_index(drop=True)

df = df4.reindex(
    columns=["日付", "曜日", "病院名", "住所", "TEL(昼)", "TEL(夜)", "診療科目", "日中", "夜間"]
)

# Today's date (JST) for the output filename.
JST = datetime.timezone(datetime.timedelta(hours=+9))
dt_str = datetime.datetime.now(JST).date().isoformat()

p = pathlib.Path("data", f"{dt_str}.csv")
p.parent.mkdir(parents=True, exist_ok=True)

# BOM-prefixed UTF-8 so Excel opens the CSV correctly.
df.to_csv(p, index=False, encoding="utf_8_sig")

救急病院ツイート用

import datetime
import os
import re
import time
from urllib.parse import urljoin

import requests
import tweepy
from bs4 import BeautifulSoup


# Twitter API credentials, injected via environment variables
# (raises KeyError when any of them is missing).
consumer_key, consumer_secret, access_token, access_token_secret = (
    os.environ[name]
    for name in ("CONSUMER_KEY", "CONSUMER_SECRET", "ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
)


def scraping(html):
    """Parse the search-result HTML and tweet today's on-duty hospitals.

    Finds the result table whose date equals today (JST), normalizes each
    hospital row, sorts entries by subject class and time, then posts one
    tweet when the text fits in 140 chars, otherwise two tweets split into
    mainland and island parts.
    """

    soup = BeautifulSoup(html, "html.parser")

    # One result table per day.
    tables = soup.find_all(
        "table", class_="comTblGyoumuCommon", summary="検索結果一覧を表示しています。"
    )

    JST = datetime.timezone(datetime.timedelta(hours=+9), "JST")
    dt_now = datetime.datetime.now(JST)

    for table in tables:

        # First cell carries "YYYY年M月D日 曜日".
        date, week = table.td.get_text(strip=True).split()
        day = datetime.datetime.strptime(date, "%Y年%m月%d日")

        # Only process the table for today's date.
        if day.date() == dt_now.date():

            result = []

            # Previous row's fields; seed for a leading continuation row.
            dprev = ["今治市医師会市民病院", "今治市別宮町7-1-40"]

            for trs in table.find_all("tr", id=[1, 2, 3]):

                id = trs.get("id")

                data = list(trs.stripped_strings)

                # Drop the leading municipality cell.
                if id == "1" and data[0] == "今治市":
                    del data[0]

                # Drop the phone-number label/value cells (day and night).
                for _ in range(2):
                    if len(data) > 4 and data[2].startswith("TEL"):
                        del data[2:4]

                # id=2 rows omit name/address; inherit from previous row.
                if id == "2":
                    data = dprev[:2] + data

                # print(id, data)

                # Remember this row for the next continuation row.
                dprev = data

                hospital = dict(zip(["name", "address", "subject"], data[0:4]))

                # Default class: "other".
                hospital["class"] = 8

                # Split each reception-hour range on the tilde.
                t = [j for i in data[3:] for j in i.split("~")]

                # Default display: first start time through last end time.
                hospital["time"] = "~".join([t[0], t[-1]])

                # Two disjoint windows: keep each range on its own line.
                if len(t) == 4 and t[1] != t[2]:
                    hospital["time"] = "\n".join(["~".join(t[:2]), "~".join(t[2:])])

                # Subject class codes:
                # emergency (指定なし) : 0
                # surgical (外科)      : 1
                # internal (内科)      : 2
                # pediatrics (小児科)  : 4
                # other                : 8
                # islands (島嶼部)     : 9

                # Surgical departments
                if "外科" in hospital["subject"]:
                    hospital["class"] = 1

                # Internal-medicine departments
                elif "内科" in hospital["subject"]:
                    hospital["class"] = 2

                # Pediatrics
                elif hospital["subject"] == "小児科":
                    hospital["class"] = 4

                # Emergency (no specific subject); blank the label
                elif hospital["subject"] == "指定なし":
                    hospital["class"] = 0
                    hospital["subject"] = ""

                # Island-district addresses override the subject class.
                match = re.search("(吉海町|宮窪町|伯方町|上浦町|大三島町|関前)", hospital["address"])

                if match:

                    hospital["class"] = 9
                    hospital["subject"] = "島嶼部"

                # Wrap the subject label in 【】 brackets for display.
                if hospital["subject"]:
                    hospital["subject"] = f'【{hospital["subject"]}】'

                # Render one tweet paragraph per hospital.
                hospital["text"] = "\n".join(
                    [hospital["subject"], hospital["name"], hospital["time"]]
                ).strip()

                # Collect the entry.
                result.append(hospital)

            # Sort by subject class, then by time string.
            result.sort(key=lambda x: (x["class"], x["time"]))

            # Date header line.
            twit_date = f"{date} {week}"

            # Mainland entries (class < 9).
            twit_riku = "\n\n".join(
                [i["text"] for i in result if i["class"] < 9]
            ).strip()

            # Island entries (class 9).
            twit_sima = "\n\n".join(
                [i["text"] for i in result if i["class"] > 8]
            ).strip()

            # Full tweet: date + mainland + islands.
            twit_all = "\n\n".join([twit_date, twit_riku, twit_sima]).strip()

            # print(twit_all)
            # print("-" * 20)

            auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
            auth.set_access_token(access_token, access_token_secret)

            api = tweepy.API(auth)

            # Fits in a single 140-char tweet?
            if len(twit_all) <= 140:
                # Tweet everything at once.
                api.update_status(twit_all)

            else:
                # Mainland tweet first...
                api.update_status("\n\n".join([twit_date, twit_riku]).strip())

                time.sleep(30)

                # ...then the island tweet.
                api.update_status("\n\n".join([twit_date, twit_sima]).strip())


            break


if __name__ == "__main__":

    base_url = "http://www.qq.pref.ehime.jp/qq38/WP0805/RP080501BL"

    # Region-block checkboxes: only index 3 is enabled.
    payload = {"_blockCd": "", "forward_next": ""}
    for i in range(13):
        payload[f"torinBlockDetailInfo.torinBlockDetail[{i}].blockCheckFlg"] = (
            "1" if i == 3 else "0"
        )

    # Session against the region-selection page
    with requests.Session() as s:

        r = s.get(base_url)

        soup = BeautifulSoup(r.content, "html.parser")

        # Copy the hidden CSRF token into the payload
        payload["_csrf"] = soup.find("input", attrs={"name": "_csrf"}).get("value")

        # Resolve the form action into an absolute URL
        url = urljoin(
            base_url, soup.find("form", attrs={"id": "_wp0805Form"}).get("action")
        )

        # print(url)

        # Submit the search form
        r = s.post(url, data=payload)

    scraping(r.content)
import datetime
from urllib.parse import urljoin
import pathlib

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Current date in JST; used to name the output CSV.
JST = datetime.timezone(datetime.timedelta(hours=+9), "JST")
dt_now = datetime.datetime.now(JST)
dt_str = dt_now.strftime("%Y-%m-%d")


def scraping(html):
    """Parse the search-result HTML and dump all rows to data/<date>.csv.

    Normalizes every table row to a fixed 12-field layout — padding
    missing phone-number pairs and a missing daytime slot with None —
    before building the DataFrame.
    """

    soup = BeautifulSoup(html, "html.parser")

    # One result table per day.
    tables = soup.find_all(
        "table", class_="comTblGyoumuCommon", summary="検索結果一覧を表示しています。"
    )

    result = []
    # Previous row's hospital fields; seed for a leading continuation row.
    before = ["今治市医師会市民病院", "今治市別宮町7−1−40", "TEL(昼)", "0898-22-7611", "TEL(夜)", None]

    for table in tables:

        # First cell carries "YYYY年M月D日 曜日".
        date, week = table.td.get_text(strip=True).split()

        for trs in table.find_all("tr", id=[1, 2, 3]):

            id = trs.get("id")

            data = list(trs.stripped_strings)

            # id=3: full row; id=2: inherit hospital fields from the
            # previous row; id=1: drop the leading municipality cell.
            if id == "3":
                temp = [id, date, week] + data
            elif id == "2":
                temp = [id, date, week] + before + data
            else:
                temp = [id, date, week] + data[1:]

            # Pad a missing daytime-phone label/value pair.
            if temp[5] != "TEL(昼)":
                temp = temp[:5] + ["TEL(昼)", None] + temp[5:]

            # Pad a missing nighttime-phone label/value pair.
            if temp[7] != "TEL(夜)":
                temp = temp[:7] + ["TEL(夜)", None] + temp[7:]

            # A time not starting with "0" is treated as a night-only slot;
            # insert None so it lands in the 夜間 column (assumes daytime
            # starts are all 0X:XX — TODO confirm against the site data).
            if not temp[10].startswith("0"):
                temp = temp[:10] + [None] + temp[10:]

            result.append(temp)
            # Remember hospital name/address/phones for id=2 rows.
            before = temp[3:9]

    df = pd.DataFrame(
        result,
        columns=[
            "ID",
            "日付",
            "曜日",
            "病院名",
            "住所",
            "昼",
            "TEL(昼)",
            "夜",
            "TEL(夜)",
            "科目",
            "日中",
            "夜間",
        ],
    )

    # The "昼"/"夜" columns only held the TEL labels; drop them.
    df.drop(columns=["昼", "夜"], inplace=True)

    p = pathlib.Path("data", f"{dt_str}.csv")
    p.parent.mkdir(parents=True, exist_ok=True)

    # BOM-prefixed UTF-8 so Excel opens the CSV correctly.
    df.to_csv(p, index=False, encoding="utf_8_sig")


if __name__ == "__main__":

    base_url = "http://www.qq.pref.ehime.jp/qq38/WP0805/RP080501BL"

    # Region-block checkboxes: only index 3 is enabled.
    payload = {"_blockCd": "", "forward_next": ""}
    for i in range(13):
        payload[f"torinBlockDetailInfo.torinBlockDetail[{i}].blockCheckFlg"] = (
            "1" if i == 3 else "0"
        )

    # Session against the region-selection page
    with requests.Session() as s:

        r = s.get(base_url)

        soup = BeautifulSoup(r.content, "html.parser")

        # Copy the hidden CSRF token into the payload
        payload["_csrf"] = soup.find("input", attrs={"name": "_csrf"}).get("value")

        # Resolve the form action into an absolute URL
        url = urljoin(
            base_url, soup.find("form", attrs={"id": "_wp0805Form"}).get("action")
        )

        # print(url)

        # Submit the search form
        r = s.post(url, data=payload)

    scraping(r.content)

pdftohtmlでワンライナーxml変換

wget "https://www.mlit.go.jp/totikensangyo/const/content/001520358.pdf"
# Page count via pdfinfo
n=$(pdfinfo 001520358.pdf | awk '/Pages/{print $2}')
# Convert 500 pages at a time, one output file per chunk.
# BUG FIX: the output name used "$1" (a positional script argument,
# normally empty), so every chunk overwrote "data.xml"; use the loop
# upper bound "$i" so each chunk gets a distinct dataN.xml.
i=0; c=500; while [ $i -lt $n ]; do b=$i; i=$(($i+$c)); if [ $i -ge $n ]; then i=$n; fi; pdftohtml -f $(($b+1)) -l $i -xml 001520358.pdf data$i.xml; done

TEI XML

yuranhiko.hatenablog.com

blog.imind.jp

orangain.hatenablog.com

lxml.de

from lxml import etree
import pathlib

# FIX: the original snippet used `p` before it was defined and referenced
# an undefined `ns` mapping; also the bare "p" XPath step never matches
# namespaced TEI <p> elements. TEI puts every element in this namespace:
ns = {"tei": "http://www.tei-c.org/ns/1.0"}

# recover=True lets lxml keep parsing mildly malformed XML.
parser = etree.XMLParser(recover=True)

for p in pathlib.Path("cam_jp_xml").glob("*.xml"):
    print(str(p))

    tree = etree.parse(str(p), parser)
    root = tree.getroot()

    # Debugging aids: namespace mapping and pretty-printed tree.
    # print(root.nsmap)
    # print(etree.tostring(root, pretty_print=True, encoding="utf-8").decode())

    # Extract binding-description paragraphs from each physical description.
    for i in root.xpath("//tei:physDesc", namespaces=ns):
        print(i.xpath(".//tei:bindingDesc/tei:p/text()", namespaces=ns))

pipx pdm

pypa.github.io

zenn.dev

zenn.dev

qiita.com

# Install pipx and put its bin directory on PATH.
sudo apt install pipx
pipx ensurepath
# Install PDM via pipx; enable the global package cache to share wheels.
pipx install pdm
pdm config --global install.cache True

# Create a project directory and initialize it with PDM.
mkdir jupyter
cd jupyter

pdm init

# Core notebook stack plus the Japanese language pack.
pdm add jupyterlab jupyterlab-language-pack-ja-JP
pdm add pandas openpyxl matplotlib japanize_matplotlib
pdm add beautifulsoup4 requests lxml html5lib

# Dev-only tooling (linters / formatters).
pdm add -d flake8 mypy black isort

#pdm add plotly jupyter-contrib-nbextensions ipywidgets

# Launch JupyterLab on all interfaces without an auth token (local dev only).
pdm run jupyter lab --no-browser --port 8888 --ip=0.0.0.0 --allow-root --LabApp.token=''

# Variable-inspector extension needs Node.js.
pdm add lckr-jupyterlab-variableinspector
curl -fsSL https://deb.nodesource.com/setup_lts.x | sudo -E bash -
sudo apt install -y nodejs
[tool.pdm.scripts]
# Launch JupyterLab on all interfaces without an auth token (local dev only).
start_jupyter = "jupyter lab --no-browser --port 8888 --ip=0.0.0.0 --allow-root --LabApp.token=''"
# Formatters.
black = "black ."
isort = "isort ."
# Linters: flake8 never fails the task; mypy reports column numbers.
flake8 = "flake8 --exit-zero ."
mypy = "mypy --show-column-numbers ."
# Composite tasks: "check" runs format then lint.
format = { composite = ["black", "isort"] }
lint = { composite = ["flake8", "mypy"] }
check = { composite = ["format", "lint"] }