# 新潟県のコロナウイルス (COVID-19 in Niigata Prefecture)

# -*- coding: utf-8 -*-

import datetime
import pathlib
import re
from urllib.parse import urljoin

import jaconv
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Fixed UTC+09:00 offset used as Japan Standard Time.
JST = datetime.timezone(datetime.timedelta(hours=9))
# Captured once at import time; str2date() takes the year from this value.
dt_now = datetime.datetime.now(tz=JST)

# Page that is scraped first; also the base against which relative links are resolved.
BASE_URL = "https://www.pref.niigata.lg.jp/site/shingata-corona/index.html"

# Local-government code -> municipality name for Niigata Prefecture.
# Codes 151017-151084 are the wards of Niigata City (151009).
niigata_names = {
    151009: "新潟市",
    151017: "新潟市北区",
    151025: "新潟市東区",
    151033: "新潟市中央区",
    151041: "新潟市江南区",
    151050: "新潟市秋葉区",
    151068: "新潟市南区",
    151076: "新潟市西区",
    151084: "新潟市西蒲区",
    152021: "長岡市",
    152048: "三条市",
    152056: "柏崎市",
    152064: "新発田市",
    152081: "小千谷市",
    152099: "加茂市",
    152102: "十日町市",
    152111: "見附市",
    152129: "村上市",
    152137: "燕市",
    152161: "糸魚川市",
    152170: "妙高市",
    152188: "五泉市",
    152226: "上越市",
    152234: "阿賀野市",
    152242: "佐渡市",
    152251: "魚沼市",
    152269: "南魚沼市",
    152277: "胎内市",
    153079: "聖籠町",
    153427: "弥彦村",
    153613: "田上町",
    153851: "阿賀町",
    154059: "出雲崎町",
    154610: "湯沢町",
    154822: "津南町",
    155047: "刈羽村",
    155811: "関川村",
    155861: "粟島浦村",
}

# Reverse lookup: municipality name -> local-government code.
niigata_codes = dict(zip(niigata_names.values(), niigata_names.keys()))


def niigata_get_code(s):
    """Return the local-government code for the municipality name *s*.

    Leading and trailing whitespace in *s* is ignored. A name that is not
    a key of ``niigata_codes`` yields ``0``.
    """
    name = s.strip()
    if name in niigata_codes:
        return niigata_codes[name]
    return 0


def str2date(s, today=None):
    """Convert a month/day string such as ``"4月1日"`` into a ``pd.Timestamp``.

    The text carries no year, so one is inferred from *today*: the date is
    placed in the same year as *today* unless that would put it after
    *today*, in which case it is assigned to the previous year (for example
    ``"12月30日"`` read in early January).

    Args:
        s: Text in which the pattern ``[0-9]{1,2}`` must match exactly
            twice; the matches are read as month, then day.
        today: Reference date; any object with ``year``, ``month`` and
            ``day`` attributes. Defaults to the module-level ``dt_now``.

    Returns:
        The resulting ``pd.Timestamp`` (midnight, timezone-naive), or
        ``pd.NaT`` when the pattern does not match exactly twice.

    Raises:
        ValueError: If the month/day pair is not a valid calendar date in
            the inferred year.
    """
    numbers = re.findall(r"[0-9]{1,2}", s)
    if len(numbers) != 2:
        return pd.NaT

    month, day = map(int, numbers)

    if today is None:
        today = dt_now

    year = today.year
    # Compare (month, day) pairs rather than building a Timestamp first, so a
    # date that only exists in the previous year (Feb 29) is handled as well.
    if (month, day) > (today.month, today.day):
        year -= 1

    return pd.Timestamp(year, month, day)


def df_update(df1, df2):
    """Return *df1* extended to the union of both indexes and updated from *df2*.

    The result is a new frame: *df1* reindexed on the union of the two
    indexes, then passed through ``DataFrame.update(df2)``.
    """
    combined_index = df1.index.union(df2.index)

    merged = df1.reindex(index=combined_index)
    merged.update(df2)

    return merged


def fetch_yousei(url):
    """Write the hospitalised / discharged totals found at *url* to a CSV file.

    The first HTML table on the page is read with its first column as the
    index and then transposed. The "入院中 (予定含む)" and "退院" rows of the
    "累計" column are saved under the labels ``hospitalization`` and
    ``discharge`` in ``dist/csv/hospitalization.csv`` (UTF-8 with BOM).
    """
    table = pd.read_html(url, index_col=0)[0].T
    table = table.rename(
        index={"入院中 (予定含む)": "hospitalization", "退院": "discharge"},
        columns={"累計": "count"},
    )

    counts = table.loc[["hospitalization", "discharge"], "count"].copy()
    counts.index.name = "type"

    csv_path = pathlib.Path("dist") / "csv" / "hospitalization.csv"
    csv_path.parent.mkdir(parents=True, exist_ok=True)

    counts.to_csv(csv_path, encoding="utf_8_sig")


def fetch_excel(url, text):
    """Download the Excel file linked from the page at *url*.

    The page is searched for the first ``<a>`` element whose text starts
    with *text* and whose ``href`` ends in ``xls``, ``xlsx`` or ``xlsm``.
    The linked file is saved under ``dist/excel`` via ``fetch_file``.

    Args:
        url: Address of the HTML page containing the link.
        text: Literal prefix of the link text (not a regular expression).

    Returns:
        The ``pathlib.Path`` returned by ``fetch_file``.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        FileNotFoundError: If no matching link is present on the page.
    """
    # Without a timeout an unresponsive server would block the run forever.
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    soup = BeautifulSoup(r.content, "html.parser")

    # re.escape keeps regex metacharacters in *text* from changing the match.
    tag = soup.find(
        "a",
        text=re.compile("^" + re.escape(text)),
        href=re.compile(r"xls[mx]?$"),
    )

    if tag is None:
        raise FileNotFoundError("Excelファイルが見つかりません")

    # The href may be relative, so resolve it against the page address.
    link = urljoin(url, tag.get("href"))
    return fetch_file(link, r"dist/excel")


def fetch_file(url, dir="."):
    """Download *url* and save it inside directory *dir*.

    The file name is the last segment of the URL's path component, so a
    query string or fragment never ends up in the name. Missing parent
    directories are created.

    Args:
        url: Address of the file to download.
        dir: Destination directory (created if necessary).

    Returns:
        ``pathlib.Path`` of the written file.

    Raises:
        requests.HTTPError: If the request returns an error status.
    """
    from urllib.parse import urlsplit

    # Without a timeout an unresponsive server would block the run forever.
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    # Fall back to the previous derivation when the URL has no path segment.
    name = pathlib.PurePosixPath(urlsplit(url).path).name or pathlib.PurePath(url).name

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    p.write_bytes(r.content)
    return p


def fetch_kanja(url):
    """Scrape the patient list at *url* and export it as TSV and CSV.

    Steps:
      * read the first HTML table, treating the dash strings as missing;
      * drop rows with fewer than four non-missing cells;
      * normalise remarks, age group, confirmation date and residence;
      * write the cleaned table to ``kanja.tsv``;
      * rename and reorder the columns and write
        ``dist/csv/150002_niigata_covid19_patients.csv`` (UTF-8 with BOM).
    """

    def to_halfwidth(value):
        # Full-width digits/ASCII become half-width, then ASCII spaces are removed.
        return jaconv.z2h(value, kana=False, digit=True, ascii=True).replace(" ", "")

    def inside_parentheses(value):
        # Keep the text after the last "(" once trailing ")" are stripped;
        # a value without parentheses is returned unchanged.
        return value.rstrip(")").split("(")[-1]

    patients = pd.read_html(url, na_values=["-", "-", "―"])[0]
    patients = patients.rename(
        columns={"患者 No. ※報道発表資料へリンク": "No", "患者 No. ※報道発表資料へリンク.1": "報道発表資料"},
    )

    patients.dropna(thresh=4, inplace=True)

    patients["備考"] = patients["備考"].fillna("").astype(str)

    # Any age label starting with "10歳未満" is collapsed to exactly that label.
    age = patients["年代"].fillna("").astype(str)
    patients["年代"] = age
    patients["年代"] = patients["年代"].mask(
        patients["年代"].str.startswith("10歳未満"), "10歳未満"
    )

    patients["判明日"] = patients["判明日"].apply(to_halfwidth)
    patients["判明日"] = patients["判明日"].apply(str2date)

    patients["居住地"] = patients["居住地"].apply(to_halfwidth)
    patients["居住地"] = patients["居住地"].apply(inside_parentheses)

    patients = patients.sort_values("No").reset_index(drop=True)

    patients.to_csv("kanja.tsv", sep="\t")

    # rename() returns a new frame, so the columns added below do not touch `patients`.
    opendata = patients.rename(
        columns={
            "判明日": "公表_年月日",
            "居住地": "患者_居住地",
            "年代": "患者_年代",
            "性別": "患者_性別",
            "職業": "患者_職業",
        },
    )

    opendata["都道府県名"] = "新潟県"
    opendata["市区町村名"] = opendata["患者_居住地"]
    opendata["全国地方公共団体コード"] = opendata["市区町村名"].apply(niigata_get_code)

    # Columns missing from the scraped table are created empty by reindex().
    output_columns = [
        "No",
        "全国地方公共団体コード",
        "都道府県名",
        "市区町村名",
        "公表_年月日",
        "発症_年月日",
        "患者_居住地",
        "患者_年代",
        "患者_性別",
        "患者_職業",
        "患者_状態",
        "患者_症状",
        "患者_渡航歴の有無フラグ",
        "患者_退院済フラグ",
        "備考",
    ]
    opendata = opendata.reindex(columns=output_columns)
    opendata = opendata.set_index("No")

    csv_path = pathlib.Path("dist", "csv", "150002_niigata_covid19_patients.csv")
    csv_path.parent.mkdir(parents=True, exist_ok=True)

    opendata.to_csv(csv_path, encoding="utf_8_sig")


def fetch_soudan(url):
    """Export the call-center consultation counts linked from *url*.

    The Excel file whose link text starts with "センター相談件数" is
    downloaded with ``fetch_excel`` and read, skipping 3 header rows and 4
    footer rows. Only rows whose date cell is an all-digit value are kept;
    those values are converted as day counts from 1899-12-30. The cleaned
    rows go to ``soudan.tsv``. They are then merged with ``df_update`` onto
    the previously published call-center CSV and written to
    ``dist/csv/150002_niigata_covid19_call_center.csv`` (UTF-8 with BOM).

    Raises:
        FileNotFoundError: Propagated from ``fetch_excel`` when the Excel
            link is missing.
    """
    p_soudan = fetch_excel(url, "センター相談件数")

    df = pd.read_excel(p_soudan, skiprows=3, skipfooter=4)

    # Plain assignment instead of set_axis(..., inplace=True): the `inplace`
    # keyword of set_axis was removed in pandas 2.0.
    df.columns = ["年", "受付_年月日", "曜日", "相談件数", "紹介人数", "備考"]

    is_serial = df["受付_年月日"].astype("str").str.isdigit()

    from_serial = pd.to_datetime(
        df.loc[is_serial, "受付_年月日"].astype(float),
        unit="D",
        origin=pd.Timestamp("1899/12/30"),
    )
    from_string = df.loc[~is_serial, "受付_年月日"]

    # Index alignment puts every converted value back on its own row.
    df["受付_年月日"] = pd.concat([from_string, from_serial])

    daily = df.loc[is_serial].copy()
    daily.drop(["年", "曜日"], axis=1, inplace=True)
    daily.reset_index(drop=True, inplace=True)

    daily.to_csv("soudan.tsv", sep="\t")

    opendata = daily.copy()

    opendata["全国地方公共団体コード"] = 150002
    opendata["都道府県名"] = "新潟県"
    opendata["市区町村名"] = ""
    # The remarks read from the sheet are overwritten with empty strings.
    opendata["備考"] = ""

    opendata = opendata.reindex(
        columns=["受付_年月日", "全国地方公共団体コード", "都道府県名", "市区町村名", "相談件数", "備考"]
    )
    opendata.set_index("受付_年月日", inplace=True)

    # Merge onto the published call-center history. The previous code loaded
    # the test-count CSV here, so the output was built on the wrong rows and
    # columns and 相談件数 never reached it.
    published = pd.read_csv(
        "https://raw.githubusercontent.com/CodeForNiigata/covid19-data-niigata/master/dist/csv/150002_niigata_covid19_call_center.csv",
        index_col=0,
        parse_dates=True,
        dtype={"全国地方公共団体コード": "Int64", "相談件数": "Int64"},
    )

    merged = df_update(published, opendata)
    merged.index = merged.index.strftime("%Y-%m-%d")

    p_callcenter_csv = pathlib.Path(
        "dist", "csv", "150002_niigata_covid19_call_center.csv"
    )
    p_callcenter_csv.parent.mkdir(parents=True, exist_ok=True)

    merged.to_csv(p_callcenter_csv, encoding="utf_8_sig")


def fetch_kensa(url):
    """Export the daily test counts linked from *url*.

    The Excel file whose link text starts with "検査件数一覧表" is downloaded
    with ``fetch_excel`` and read, skipping 2 header rows and 2 footer rows.
    Only rows whose date cell is an all-digit value are kept; those values
    are converted as day counts from 1899-12-30. Missing counts become 0.
    The cleaned rows go to ``kensa.tsv``. They are then merged with
    ``df_update`` onto the previously published test-count CSV and written
    to ``dist/csv/150002_niigata_covid19_test_count.csv`` (UTF-8 with BOM).

    Raises:
        FileNotFoundError: Propagated from ``fetch_excel`` when the Excel
            link is missing.
    """
    p_kensa = fetch_excel(url, "検査件数一覧表")

    df = pd.read_excel(p_kensa, skiprows=2, skipfooter=2)

    # Plain assignment instead of set_axis(..., inplace=True): the `inplace`
    # keyword of set_axis was removed in pandas 2.0.
    df.columns = ["年", "実施_年月日", "曜日", "検査実施_件数", "PCRセンター実施件数", "陽性件数"]

    is_serial = df["実施_年月日"].astype("str").str.isdigit()

    from_serial = pd.to_datetime(
        df.loc[is_serial, "実施_年月日"].astype(float),
        unit="D",
        origin=pd.Timestamp("1899/12/30"),
    )
    from_string = df.loc[~is_serial, "実施_年月日"]

    # Index alignment puts every converted value back on its own row.
    df["実施_年月日"] = pd.concat([from_string, from_serial])

    daily = df.loc[is_serial].copy()
    daily.drop(["年", "曜日"], axis=1, inplace=True)

    # Empty cells count as zero; Int64 keeps the values integral in the output.
    for column in ("検査実施_件数", "PCRセンター実施件数", "陽性件数"):
        daily[column] = daily[column].fillna(0).astype("Int64")

    daily.reset_index(drop=True, inplace=True)

    daily.to_csv("kensa.tsv", sep="\t")

    opendata = daily.copy()

    opendata["全国地方公共団体コード"] = 150002
    opendata["都道府県名"] = "新潟県"
    opendata["市区町村名"] = ""
    opendata["備考"] = ""

    # Only the total test count is published; the other count columns are dropped here.
    opendata = opendata.reindex(
        columns=["実施_年月日", "全国地方公共団体コード", "都道府県名", "市区町村名", "検査実施_件数", "備考"]
    )
    opendata.set_index("実施_年月日", inplace=True)

    published = pd.read_csv(
        "https://raw.githubusercontent.com/CodeForNiigata/covid19-data-niigata/master/dist/csv/150002_niigata_covid19_test_count.csv",
        index_col=0,
        parse_dates=True,
        dtype={"全国地方公共団体コード": "Int64", "検査実施_件数": "Int64"},
    )

    merged = df_update(published, opendata)
    merged.index = merged.index.strftime("%Y-%m-%d")

    p_testcount_csv = pathlib.Path(
        "dist", "csv", "150002_niigata_covid19_test_count.csv",
    )
    p_testcount_csv.parent.mkdir(parents=True, exist_ok=True)

    merged.to_csv(p_testcount_csv, encoding="utf_8_sig")


def main():
    """Scrape the top page and, if the detail link exists, run every exporter.

    When the "県内における発生状況の詳細はこちら" link is not found on the top
    page, nothing is fetched or written.
    """
    response = requests.get(BASE_URL)
    response.raise_for_status()

    page = BeautifulSoup(response.content, "html.parser")
    anchor = page.find("a", text="県内における発生状況の詳細はこちら")

    if not anchor:
        return

    # The href may be relative, so resolve it against the top page address.
    detail_url = urljoin(BASE_URL, anchor.get("href"))

    fetch_yousei(BASE_URL)
    fetch_kanja(detail_url)
    fetch_soudan(detail_url)
    fetch_kensa(detail_url)


if __name__ == "__main__":
    main()