PDFの康熙部首・CJK部首補助の文字化け変換

github.com

github.com

# Install the build toolchain, the autotools suite and the qpdf development package.
!apt update
!apt install build-essential
!apt install autoconf automake libtool
!apt install libqpdf-dev

# Fetch the pdf-fix-tuc sources.
!git clone https://github.com/trueroad/pdf-fix-tuc.git

%cd pdf-fix-tuc

# presumably generates the configure script used below — TODO confirm
!./autogen.sh

# Build in a separate directory so the source tree stays clean.
!mkdir build
%cd build

!../configure
!make
!make install

# Run the installed command without arguments (presumably prints its usage — TODO confirm).
!pdf-fix-tuc

%cd /content

# Convert data.pdf and write the result to data-fix.pdf.
!pdf-fix-tuc data.pdf data-fix.pdf

imabari.hateblo.jp

imabari.hateblo.jp

気象庁の気象警報をスクレイピング

code4sabae.github.io

import pathlib
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

# Prefecture names in ascending prefecture-code order ("01" first, "47" last).
pref = [
    "北海道",
    "青森県",
    "岩手県",
    "宮城県",
    "秋田県",
    "山形県",
    "福島県",
    "茨城県",
    "栃木県",
    "群馬県",
    "埼玉県",
    "千葉県",
    "東京都",
    "神奈川県",
    "新潟県",
    "富山県",
    "石川県",
    "福井県",
    "山梨県",
    "長野県",
    "岐阜県",
    "静岡県",
    "愛知県",
    "三重県",
    "滋賀県",
    "京都府",
    "大阪府",
    "兵庫県",
    "奈良県",
    "和歌山県",
    "鳥取県",
    "島根県",
    "岡山県",
    "広島県",
    "山口県",
    "徳島県",
    "香川県",
    "愛媛県",
    "高知県",
    "福岡県",
    "佐賀県",
    "長崎県",
    "熊本県",
    "大分県",
    "宮崎県",
    "鹿児島県",
    "沖縄県",
]

# Two-digit, zero-padded prefecture code -> prefecture name.
pref_code = {f"{number:02d}": name for number, name in enumerate(pref, start=1)}

# Index page the warning scraper starts from.
url = "https://www.jma.go.jp/jp/warn/"

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

"""# スクレイピング"""

# Fetch the index page, then download every linked area page over one session.
with requests.Session() as s:

    index_res = s.get(url, headers=headers)
    index_res.raise_for_status()

    base = BeautifulSoup(index_res.content, "html5lib")
    area_links = base.select("div#title > noscript > table > tbody > tr > td > a")

    # One record per downloaded page: area label, source URL and local file path.
    htmls = []

    for tag in tqdm(area_links):

        page_url = urljoin(url, tag.get("href"))

        res = s.get(page_url, headers=headers)
        res.raise_for_status()

        # Round-trip through BeautifulSoup so the saved copy is the parser's
        # pretty-printed serialisation of the page.
        html_text = BeautifulSoup(res.content, "html5lib").prettify()

        # Save under ./html/ using the last path component of the URL as file name.
        dest = pathlib.Path("html") / pathlib.PurePath(page_url).name
        dest.parent.mkdir(parents=True, exist_ok=True)
        dest.write_text(html_text)

        htmls.append(
            {"area": tag.get_text(strip=True), "url": page_url, "path": dest}
        )

        # Wait one second between page requests.
        time.sleep(1)

import pandas as pd


def fetch_warn(p, area):
    """Parse one saved warning page into a long-format DataFrame.

    Args:
        p: ``pathlib.Path`` of the saved HTML page.
        area: Label stored in the ``pref`` column of every returned row.

    Returns:
        DataFrame with the columns ``area1``, ``area2``, ``city``, ``level``,
        ``alert``, ``value`` and ``pref``.
    """

    # Close the file as soon as it is parsed instead of leaking the handle.
    # The encoding is left at the platform default on purpose so that it
    # matches however the page was written to disk.
    with p.open(mode="r") as fr:
        tmp = pd.read_html(fr, attrs={"id": "WarnTableTable"})[0]

    # The first three (unnamed, two-level) header columns identify the place;
    # every other column is unpivoted into rows. Rows with fewer than five
    # non-missing values are dropped.
    df = tmp.melt(
        id_vars=[
            ("Unnamed: 0_level_0", "Unnamed: 0_level_1"),
            ("Unnamed: 1_level_0", "Unnamed: 1_level_1"),
            ("Unnamed: 2_level_0", "Unnamed: 2_level_1"),
        ]
    ).dropna(thresh=5)

    # Plain assignment replaces ``set_axis(..., inplace=True)``, whose
    # ``inplace`` argument no longer exists in pandas 2.x.
    df.columns = ["area1", "area2", "city", "level", "alert", "value"]

    df["pref"] = area

    return df


# Parse every downloaded page and stack the results into one frame.
dfs = [fetch_warn(html["path"], html["area"]) for html in htmls]

df = pd.concat(dfs).reset_index(drop=True)

# Normalise every text column: NFKC-fold the characters and remove all
# whitespace. The pattern is a raw string and ``regex=True`` is explicit
# because pandas 2.x treats the pattern as a literal by default.
for col in df.select_dtypes(include=object).columns:
    df[col] = df[col].str.normalize("NFKC").str.replace(r"\s", "", regex=True)

# Fold the sub-prefecture regions of Hokkaido and Okinawa into their
# prefecture. Assigning the result back (instead of an in-place replace on
# the selected column) guarantees the frame itself is updated.
df["pref"] = df["pref"].replace(
    {
        "宗谷地方": "北海道",
        "上川・留萌地方": "北海道",
        "網走・北見・紋別地方": "北海道",
        "釧路・根室・十勝地方": "北海道",
        "胆振・日高地方": "北海道",
        "石狩・空知・後志地方": "北海道",
        "渡島・檜山地方": "北海道",
        "沖縄本島地方": "沖縄県",
        "大東島地方": "沖縄県",
        "宮古島地方": "沖縄県",
        "八重山地方": "沖縄県",
    }
)

# 1 where the cell holds the "●" marker, 0 otherwise.
df["value"] = (df["value"] == "●").astype(int)

# Number of marked cells per prefecture and level, in prefecture-code order.
df_alert = df.pivot_table(
    index="pref", columns="level", values="value", aggfunc="sum"
).reindex(index=pref, columns=["警報", "注意報"])

df_alert

df.to_csv("alert.csv", encoding="utf_8_sig")

paddleocr

www.paddlepaddle.org.cn

github.com

pypi.org

zenn.dev

OK f:id:imabari_ehime:20210109222231p:plain

NG メモリクラッシュ f:id:imabari_ehime:20210109222252j:plain

!python -m pip install "paddlepaddle==2.0.0rc1" -i https://mirror.baidu.com/pypi/simple
!pip install "paddleocr>=2.0.1"

ランタイムを再起動

from paddleocr import PaddleOCR,draw_ocr

# OCR engine configured for Japanese with the angle classifier switched on.
ocr = PaddleOCR(lang="japan", use_angle_cls=True)

result = ocr.ocr("main.jpg", cls=True)

# Print each entry of the OCR result on its own line.
for detection in result:
    print(detection)

愛知県の市町村別人口データのjson作成

import datetime
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Entry page from which the population data link is discovered
# (presumably the Aichi prefecture statistics top page — confirm).
url = "https://www.pref.aichi.jp/toukei/"

# Browser-like User-Agent sent by get_link() with each request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}


def get_link(url, text):
    """Fetch ``url`` and return the first ``<a>`` tag whose string matches ``text``.

    Args:
        url: Page to download (requested with the module-level ``headers``).
        text: Exact string or compiled regular expression matched against
            the anchor's string.

    Returns:
        The matching tag, or ``None`` when the page has no such anchor.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """

    r = requests.get(url, headers=headers)
    r.raise_for_status()

    soup = BeautifulSoup(r.content, "html.parser")

    # ``string=`` is the current spelling of the filter that used to be
    # passed as ``text=`` (deprecated in Beautiful Soup).
    return soup.find("a", string=text)


import json

JST = datetime.timezone(datetime.timedelta(hours=+9), "JST")

# Fallback timestamp, used when no date can be read from the link text.
dt_now = datetime.datetime.now(JST)

# The href may be relative, so resolve it against the page it was found on
# before requesting it (an absolute href passes through urljoin unchanged).
link = urljoin(url, get_link(url, "あいちの人口").get("href"))

tag = get_link(link, re.compile("現在人口"))

csv_url = urljoin(link, tag.get("href"))

# Raw string so that ``\d`` reaches the regex engine untouched.
m = re.match(r"(\d{4})年(\d{1,2})月(\d{1,2})日", tag.get_text())

if m:
    year, month, day = map(int, m.groups())
    # NOTE: this value is timezone-naive, unlike the fallback above, so
    # "lastUpdateAt" carries no UTC offset in this case.
    dt_now = datetime.datetime(year, month, day)

dt_now


df = pd.read_csv(csv_url, index_col="県市町村").dropna(how="all")

# Keep only the rows for both sexes combined, with the code and total columns.
df_city = (
    df[df["性別"] == "男女"]
    .loc[:, ["コード", "総数"]]
    .rename(columns={"コード": "市町村コード", "総数": "人口"})
)

# Remove exactly one trailing 県/市/町/村. ``rstrip("県市町村")`` strips every
# trailing character from that set, which would also eat a name's own final
# character when it happens to be one of them (e.g. "四日市市" -> "四日").
df_city["市町村名"] = df_city.index.str.replace(r"[県市町村]$", "", regex=True)

city_data = df_city.to_dict(orient="index")

data = {"lastUpdateAt": dt_now.isoformat(), "data": city_data}

with open("city_data.json", "w", encoding="utf-8") as fw:
    json.dump(data, fw, ensure_ascii=False, indent=4)
{
    "lastUpdateAt": "2020-11-01T00:00:00",
    "data": {
        "愛知県": {
            "市町村コード": 23000,
            "人口": 7538701,
            "市町村名": "愛知"
        },
        "名古屋市": {
            "市町村コード": 23100,
            "人口": 2327689,
            "市町村名": "名古屋"
        },
        "豊橋市": {
            "市町村コード": 23201,
            "人口": 371856,
            "市町村名": "豊橋"
        },
        "岡崎市": {
            "市町村コード": 23202,
            "人口": 385371,
            "市町村名": "岡崎"
        },
        "一宮市": {
            "市町村コード": 23203,
            "人口": 378768,
            "市町村名": "一宮"
        },
        "瀬戸市": {
            "市町村コード": 23204,
            "人口": 127265,
            "市町村名": "瀬戸"
        },
        "半田市": {
            "市町村コード": 23205,
            "人口": 117742,
            "市町村名": "半田"
        },
        "春日井市": {
            "市町村コード": 23206,
            "人口": 306538,
            "市町村名": "春日井"
        },
        "豊川市": {
            "市町村コード": 23207,
            "人口": 183958,
            "市町村名": "豊川"
        },
        "津島市": {
            "市町村コード": 23208,
            "人口": 61030,
            "市町村名": "津島"
        },
        "碧南市": {
            "市町村コード": 23209,
            "人口": 72601,
            "市町村名": "碧南"
        },
        "刈谷市": {
            "市町村コード": 23210,
            "人口": 153481,
            "市町村名": "刈谷"
        },
        "豊田市": {
            "市町村コード": 23211,
            "人口": 422878,
            "市町村名": "豊田"
        },
        "安城市": {
            "市町村コード": 23212,
            "人口": 188705,
            "市町村名": "安城"
        },
        "西尾市": {
            "市町村コード": 23213,
            "人口": 169017,
            "市町村名": "西尾"
        },
        "蒲郡市": {
            "市町村コード": 23214,
            "人口": 79419,
            "市町村名": "蒲郡"
        },
        "犬山市": {
            "市町村コード": 23215,
            "人口": 72926,
            "市町村名": "犬山"
        },
        "常滑市": {
            "市町村コード": 23216,
            "人口": 57689,
            "市町村名": "常滑"
        },
        "江南市": {
            "市町村コード": 23217,
            "人口": 97541,
            "市町村名": "江南"
        },
        "小牧市": {
            "市町村コード": 23219,
            "人口": 148128,
            "市町村名": "小牧"
        },
        "稲沢市": {
            "市町村コード": 23220,
            "人口": 134893,
            "市町村名": "稲沢"
        },
        "新城市": {
            "市町村コード": 23221,
            "人口": 43778,
            "市町村名": "新城"
        },
        "東海市": {
            "市町村コード": 23222,
            "人口": 113284,
            "市町村名": "東海"
        },
        "大府市": {
            "市町村コード": 23223,
            "人口": 92418,
            "市町村名": "大府"
        },
        "知多市": {
            "市町村コード": 23224,
            "人口": 84059,
            "市町村名": "知多"
        },
        "知立市": {
            "市町村コード": 23225,
            "人口": 71868,
            "市町村名": "知立"
        },
        "尾張旭市": {
            "市町村コード": 23226,
            "人口": 82176,
            "市町村名": "尾張旭"
        },
        "高浜市": {
            "市町村コード": 23227,
            "人口": 48765,
            "市町村名": "高浜"
        },
        "岩倉市": {
            "市町村コード": 23228,
            "人口": 47941,
            "市町村名": "岩倉"
        },
        "豊明市": {
            "市町村コード": 23229,
            "人口": 69381,
            "市町村名": "豊明"
        },
        "日進市": {
            "市町村コード": 23230,
            "人口": 92713,
            "市町村名": "日進"
        },
        "田原市": {
            "市町村コード": 23231,
            "人口": 59148,
            "市町村名": "田原"
        },
        "愛西市": {
            "市町村コード": 23232,
            "人口": 61000,
            "市町村名": "愛西"
        },
        "清須市": {
            "市町村コード": 23233,
            "人口": 69771,
            "市町村名": "清須"
        },
        "北名古屋市": {
            "市町村コード": 23234,
            "人口": 86101,
            "市町村名": "北名古屋"
        },
        "弥富市": {
            "市町村コード": 23235,
            "人口": 43083,
            "市町村名": "弥富"
        },
        "みよし市": {
            "市町村コード": 23236,
            "人口": 62919,
            "市町村名": "みよし"
        },
        "あま市": {
            "市町村コード": 23237,
            "人口": 87616,
            "市町村名": "あま"
        },
        "長久手市": {
            "市町村コード": 23238,
            "人口": 62414,
            "市町村名": "長久手"
        },
        "東郷町": {
            "市町村コード": 23302,
            "人口": 44093,
            "市町村名": "東郷"
        },
        "豊山町": {
            "市町村コード": 23342,
            "人口": 15708,
            "市町村名": "豊山"
        },
        "大口町": {
            "市町村コード": 23361,
            "人口": 24248,
            "市町村名": "大口"
        },
        "扶桑町": {
            "市町村コード": 23362,
            "人口": 34260,
            "市町村名": "扶桑"
        },
        "大治町": {
            "市町村コード": 23424,
            "人口": 32550,
            "市町村名": "大治"
        },
        "蟹江町": {
            "市町村コード": 23425,
            "人口": 36827,
            "市町村名": "蟹江"
        },
        "飛島村": {
            "市町村コード": 23427,
            "人口": 4604,
            "市町村名": "飛島"
        },
        "阿久比町": {
            "市町村コード": 23441,
            "人口": 28182,
            "市町村名": "阿久比"
        },
        "東浦町": {
            "市町村コード": 23442,
            "人口": 49156,
            "市町村名": "東浦"
        },
        "南知多町": {
            "市町村コード": 23445,
            "人口": 16788,
            "市町村名": "南知多"
        },
        "美浜町": {
            "市町村コード": 23446,
            "人口": 22490,
            "市町村名": "美浜"
        },
        "武豊町": {
            "市町村コード": 23447,
            "人口": 43115,
            "市町村名": "武豊"
        },
        "幸田町": {
            "市町村コード": 23501,
            "人口": 42409,
            "市町村名": "幸田"
        },
        "設楽町": {
            "市町村コード": 23561,
            "人口": 4408,
            "市町村名": "設楽"
        },
        "東栄町": {
            "市町村コード": 23562,
            "人口": 2937,
            "市町村名": "東栄"
        },
        "豊根村": {
            "市町村コード": 23563,
            "人口": 996,
            "市町村名": "豊根"
        }
    }
}

愛知県のコロナ

下記の住居地は「県外」として処理

住居地 カウント
岐阜県 23
東京都 18
尾張地方 11
三重県 11
三河地方 5
神奈川県 3
静岡県 3
兵庫県 2
千葉県 2
中国武漢 2
福岡県 2
京都府 2
大阪府 2
福井県 1
滋賀県 1
土岐市 1
可児市 1
一宮保健所管内 1
不定 1
愛媛県 1
茨城県 1
石川県 1
沖縄県 1
岐阜市 1

imabari.hateblo.jp

開始日と終了日が固定であれば、reindexで欠損日を補完するだけなので簡単

以下のコードをPandasに移植

github.com

github.com

スクレイピング

!pip install pdfplumber
!pip install simplejson
import datetime
import pathlib
import re
import sys
from urllib.parse import urljoin

import pandas as pd
import pdfplumber
import requests
from bs4 import BeautifulSoup

def fetch_file(url, dir="."):
    """Download ``url`` into directory ``dir`` and return the saved path.

    The file name is the last path component of ``url``; the target
    directory is created if it does not exist yet.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """

    response = requests.get(url)
    response.raise_for_status()

    dest = pathlib.Path(dir) / pathlib.PurePath(url).name
    dest.parent.mkdir(parents=True, exist_ok=True)
    dest.write_bytes(response.content)

    return dest

def days2date(s):
    """Turn a row's "発表日" text into a ``pd.Timestamp``.

    The text carries no year: rows whose label (``s.name``) is greater than
    16576 are dated 2021, all others 2020. Exactly two groups of one or two
    digits must be found in the text (month, then day); otherwise
    ``pd.NaT`` is returned.
    """

    year = 2021 if s.name > 16576 else 2020

    numbers = re.findall("[0-9]{1,2}", s["発表日"])

    # Anything other than a month/day pair cannot be interpreted.
    if len(numbers) != 2:
        return pd.NaT

    month, day = (int(number) for number in numbers)
    return pd.Timestamp(year=year, month=month, day=day)

url = "https://www.pref.aichi.jp/site/covid19-aichi/"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
}

# Timestamp of this run in JST, formatted once for reuse.
JST = datetime.timezone(datetime.timedelta(hours=+9), "JST")
dt_now = datetime.datetime.now(JST)
str_now = dt_now.strftime("%Y/%m/%d %H:%M")

r = requests.get(url, headers=headers)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# One DataFrame per PDF page that contains a table.
dfs = []

# Collect the PDF links that share a parent with the "▶ 愛知県内の発生事例"
# label and visit them in reverse page order. The dot is escaped so that only
# a real ".pdf" extension matches, and ``string=`` replaces the deprecated
# ``text=`` filter.
for tag in soup.find("span", string="▶ 愛知県内の発生事例").parent.find_all(
    "a", href=re.compile(r"\.pdf$")
)[::-1]:

    link = urljoin(url, tag.get("href"))

    path_pdf = fetch_file(link)

    with pdfplumber.open(path_pdf) as pdf:

        for page in pdf.pages:

            table = page.extract_table()

            # extract_table() yields None for a page without a table;
            # skip such pages instead of failing on ``table[1:]``.
            if not table:
                continue

            # The first row of each page's table is used as the header.
            df_tmp = pd.DataFrame(table[1:], columns=table[0])

            dfs.append(df_tmp)

df = pd.concat(dfs).set_index("No")

# Drop rows whose announcement date is missing.
df.dropna(subset=["発表日"], inplace=True)

df.index = df.index.astype(int)

df.sort_index(inplace=True)

# Replace the year-less date text with real timestamps (NaT when unparsable).
df["発表日"] = df.apply(days2date, axis=1)

df["date"] = df["発表日"].dt.strftime("%Y-%m-%d")

# pandas numbers weekdays Monday=0 .. Sunday=6; shift so that Sunday becomes 0.
df["w"] = (df["発表日"].dt.dayofweek + 1) % 7
df["w"] = df["w"].astype(str)

# The backslash is part of the emitted text ("MM\/DD").
df["short_date"] = df["発表日"].dt.strftime("%m\\/%d")

df["発表日"] = df["発表日"].dt.strftime("%Y/%m/%d %H:%M")

# Radical-form / variant glyphs that NFKC leaves untouched, mapped to the
# ordinary characters.
cjk = str.maketrans("⻲⻑黑戶⻯⻄⻘⻤", "亀長黒戸竜西青鬼")

# ``.str.translate`` passes missing values through, whereas calling
# ``.translate`` inside ``apply`` raises on the first empty cell.
df["住居地"] = df["住居地"].str.normalize("NFKC").str.translate(cjk)

df["年代・性別"] = df["年代・性別"].str.normalize("NFKC")

p = pathlib.Path("/data/patients.csv")
p.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(p, encoding="utf_8_sig")

集計

"""
aichi_citys = [
    "名古屋市",
    "一宮市",
    "瀬戸市",
    "春日井市",
    "犬山市",
    "江南市",
    "小牧市",
    "稲沢市",
    "尾張旭市",
    "岩倉市",
    "豊明市",
    "日進市",
    "清須市",
    "北名古屋市",
    "長久手市",
    "東郷町",
    "豊山町",
    "大口町",
    "扶桑町",
    "津島市",
    "愛西市",
    "弥富市",
    "あま市",
    "大治町",
    "蟹江町",
    "飛島村",
    "半田市",
    "常滑市",
    "東海市",
    "大府市",
    "知多市",
    "阿久比町",
    "東浦町",
    "南知多町",
    "美浜町",
    "武豊町",
    "岡崎市",
    "碧南市",
    "刈谷市",
    "豊田市",
    "安城市",
    "西尾市",
    "知立市",
    "高浜市",
    "みよし市",
    "幸田町",
    "豊橋市",
    "豊川市",
    "蒲郡市",
    "新城市",
    "田原市",
    "設楽町",
    "東栄町",
    "豊根村",
]
"""

# This script uses ``json`` here, before any other import of it.
import json

# Load city_data.json. It is written as UTF-8, so read it as UTF-8 rather than
# with the platform default encoding.
with open("city_data.json", "r", encoding="utf-8") as fr:
    json_load = json.load(fr)

# Municipality name -> municipality code.
city_dic = {k: v["市町村コード"] for k, v in json_load["data"].items()}

# Every entry except the prefecture-wide one.
aichi_citys = [k for k in city_dic if k != "愛知県"]

# Age brackets kept as-is; every other value is replaced by "その他" below.
ages_list = [
    "10歳未満", "10代", "20代", "30代", "40代", "50代",
    "60代", "70代", "80代", "90代", "100代",
]

df = pd.read_csv("/data/patients.csv", keep_default_na=False)

# Raw rows as plain lists, captured before any column is transformed.
patients_list = df.values.tolist()

df["発表日"] = pd.to_datetime(df["発表日"])

# Residences not in the municipality list are grouped under "県外".
in_aichi = df["住居地"].isin(aichi_citys)
df["住居地"] = df["住居地"].where(in_aichi, "県外")

# Split the combined column into age bracket and sex.
df_ages = df["年代・性別"].str.extract("(.+)(男性|女性|その他)")
df_ages.columns = ["年代", "性別"]
df = df.join(df_ages)

stripped_ages = df["年代"].str.strip()
df["年代"] = stripped_ages.where(stripped_ages.isin(ages_list), "その他")

df["年代"].unique()

args = sys.argv

# The end date is a required command-line argument; stop with a readable
# message instead of a bare IndexError when it is missing.
if len(args) < 2:
    sys.exit("usage: pass the end date as the first argument (YYYY-MM-DD)")

# Build the list of dates.
strdt = datetime.datetime.strptime("2020-01-26", "%Y-%m-%d")  # start date
enddt = datetime.datetime.strptime(args[1], "%Y-%m-%d")  # end date

dt_range = pd.date_range(strdt, enddt)

# Cases per day; days without any case are filled with 0.
df_date_num = df["発表日"].value_counts().reindex(index=dt_range, fill_value=0)

df_date_num

# Cases per day and residence, with one column per municipality plus "県外".
df_date_place = pd.crosstab(df["発表日"], df["住居地"]).reindex(
    index=dt_range, columns=aichi_citys + ["県外"], fill_value=0
)

df_date_place

# Cases per day, with (sex, age bracket) column pairs.
df_date_age_sex = pd.crosstab(index=df["発表日"], columns=[df["性別"], df["年代"]]).reindex(
    index=dt_range, fill_value=0
)

# Add the code 99999 for "県外".
city_dic.update({"県外": 99999})

# Replace municipality names with municipality codes.
df_date_place.rename(columns=city_dic, inplace=True)

df_date_age_sex

# One summary record per calendar day in the range: daily total, breakdown by
# residence, and breakdown by age bracket and sex.
patients_summary_list = [
    {
        "日付": dt_date.strftime("%Y-%m-%d"),
        "小計": int(df_date_num.loc[dt_date]),
        "住居地": df_date_place.loc[dt_date].to_dict(),
        "年代": df_date_age_sex.loc[dt_date].unstack(fill_value=0).to_dict(),
    }
    for dt_date in dt_range
]

# Empty cells are kept as empty strings (keep_default_na=False) in all three files.
main_summary_history_df = pd.read_csv(
    "/data/main_summary_history.csv", keep_default_na=False
)

inspection_persons_summary_df = pd.read_csv(
    "/data/inspection_persons_summary.csv", keep_default_na=False
)

# Parse the first column as dates, rename the two columns that are kept and
# drop the two that are not needed.
inspections_summary_df = (
    pd.read_csv(
        "/data/inspections_summary.csv", parse_dates=[0], keep_default_na=False
    )
    .rename(columns={"検査日": "日付", "PCR検査件数(件)": "小計"})
    .drop(columns=["陽性者数(人)", "備考"])
)

inspections_summary_df["日付"] = inspections_summary_df["日付"].dt.strftime("%Y-%m-%d")

inspections_summary_list = inspections_summary_df.to_dict(orient="records")

inspections_summary_list

import json

# Payload of each section, in output order; each one is wrapped below with
# the timestamp of this run.
sections = {
    "patients": patients_list,
    "patients_summary": patients_summary_list,
    "inspections_summary": inspections_summary_list,
    "main_summary_history": main_summary_history_df.to_dict(orient="records"),
    "inspection_persons_summary": inspection_persons_summary_df.to_dict(
        orient="records"
    ),
}

data = {"lastUpdate": str_now}
data.update(
    {name: {"date": str_now, "data": rows} for name, rows in sections.items()}
)

with open("data.json", "w", encoding="utf-8") as fw:
    json.dump(data, fw, ensure_ascii=False, indent=4)