仙台市コロナ陽性患者情報2021/03/19以前

ファイル名で並び替え済み

https://docs.google.com/spreadsheets/d/e/2PACX-1vSyOb6XDxKblcW0OYHzbszLljvGgHQZZw2WwsAjLSmi4wQDBw79y0Xsm0rM7z_-kWNcO8j0jLCtQJMb/pub?gid=2130592432&single=true&output=csv

requirements.txt

https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0101shiryo.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0102besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0103shiryo.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0104shiryo.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0105besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0106besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0107besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0108besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0110besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0111besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0112besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0113besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0114besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0115besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0116shiryo.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0117besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0118besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0119besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0120besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0121besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0122besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0123besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0124besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0125shiryo.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0126besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0127besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0128shiryo.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0129_beshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0130besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0131besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0201besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0202besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0203besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0204besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0205shiryo.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0206besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0207besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0209_besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0210besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0211besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0212_besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0213besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0214_besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0215besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0216besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0217_besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0218besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0219_besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0220besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0221besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0223_besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0224besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0225_besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0226besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0227_besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0228shiryo.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0301besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0302_besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0303besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0304besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0305_besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0306besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0307besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0308besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0309besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0310_besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0311besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0312_besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0313_besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0314besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0315besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0316besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0317besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/0318_besshi.pdf
https://www.city.sendai.jp/kenkoanzen-kansen/shise/koho/kisha/r3/documents/1009shiryo.pdf

PDFファイルリンク抽出

import requests
from bs4 import BeautifulSoup

import re
import time
from urllib.parse import urljoin

# Browser-like User-Agent header sent with every request made by fetch_soup.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) "
        "like Gecko"
    ),
}


def fetch_soup(url, parser="html.parser", timeout=30):
    """Download *url* and return its body parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        parser: Name of the BeautifulSoup parser used for the response body.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        A ``BeautifulSoup`` object built from the raw response bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: If the server does not respond within *timeout*.
    """
    # requests has no default timeout; without one a stalled connection
    # would block the whole crawl indefinitely.
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()

    # Parse the raw bytes so BeautifulSoup performs the encoding detection.
    return BeautifulSoup(r.content, parser)


# Monthly press-release index pages to crawl (months 01-03 under "r2").
urls = [
    f"https://www.city.sendai.jp/shise/koho/kisha/r2/{i:02}/index.html"
    for i in range(1, 4)
]

# Link-text patterns, compiled once instead of on every loop iteration.
title_pattern = re.compile("^新型コロナウイルス感染症の患者の発生について")
attachment_pattern = re.compile("^別紙")

# Absolute URLs of the PDF attachments found, in crawl order.
links = []

for url in urls:

    soup1 = fetch_soup(url)

    # Every anchor on the index page whose text starts with the announcement title.
    for tag in soup1.find_all("a", text=title_pattern):
        link1 = urljoin(url, tag.get("href"))

        soup2 = fetch_soup(link1)
        href = soup2.find("a", class_="icon_pdf", text=attachment_pattern)

        # find() returns None when the article page has no matching PDF link;
        # report and skip it instead of crashing with AttributeError.
        if href is None:
            print(f"no attachment link found: {link1}")
            continue

        # The href was read from the article page, so a relative href must be
        # resolved against that page (link1), not against the index page.
        link2 = urljoin(link1, href.get("href"))

        links.append(link2)

import pandas as pd

# Write the collected PDF URLs one per line (no header row, no index column);
# the resulting file is the input list for `wget -i requirements.txt`.
se = pd.Series(data=links)

se.to_csv("requirements.txt", header=False, index=False)

ダウンロード

wget -i requirements.txt -P download -w 1 --random-wait

www.city.sendai.jp

変換のみ

import pdfplumber
import pandas as pd

import pathlib

# One Series per extracted table; each Series becomes one row of the result.
dfs = []

for pdf_path in pathlib.Path("download").glob("*.pdf"):
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                # Transpose the table rows into columns. The unpacking
                # requires exactly two columns: labels and their values.
                labels, values = zip(*table)

                record = pd.Series(values, index=labels)
                # Remember which PDF the record came from.
                record["file"] = pdf_path.name
                dfs.append(record)

# Place the records side by side, then transpose so each record is a row.
df = pd.concat(dfs, axis=1).transpose()

df

# "utf_8_sig" writes UTF-8 with a leading byte-order mark.
df.to_csv("sendai.csv", encoding="utf_8_sig")

日付変換、ソート

import pdfplumber
import pandas as pd

import datetime
import pathlib

# One Series per extracted table; each Series becomes one row of the result.
dfs = []

# PDF conversion
for pdf_path in pathlib.Path("download").glob("*.pdf"):
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                # Transpose the table rows into columns. The unpacking
                # requires exactly two columns: labels and their values.
                labels, values = zip(*table)

                record = pd.Series(values, index=labels)
                # Remember which PDF the record came from.
                record["file"] = pdf_path.name
                dfs.append(record)

# Combine: one row per record; reset_index() exposes the extraction order
# as an "index" column.
df = pd.concat(dfs, axis=1).T.reset_index()

# Sort by file name, keeping the extraction order within each file, then
# discard the helper column and renumber the rows.
df = (
    df.sort_values(["file", "index"])
    .drop(columns="index")
    .reset_index(drop=True)
)

# Strip surrounding whitespace, apply NFKC normalization, remove line breaks.
for col in df.select_dtypes(include=object).columns:
    cleaned = df[col].str.strip()
    cleaned = cleaned.str.normalize("NFKC")
    df[col] = cleaned.str.replace("\n", "", regex=True)

# Date conversion
# Reference timestamp captured once at run time; str2date reads it to choose
# the year for month/day strings that carry no year of their own.
dt_now = datetime.datetime.now()


def str2date(s: pd.Series, now: "datetime.datetime | None" = None) -> pd.Series:
    """Convert "M月D日" strings to timestamps, inferring the missing year.

    The year of the reference date is tried first; any resulting date that
    lies after the reference date is moved back one year, so every result
    is on or before the reference date.

    Args:
        s: Strings containing a ``<month>月<day>日`` fragment. Entries
            without such a fragment (or missing entries) yield ``NaT``.
        now: Naive reference datetime used for the year inference. Defaults
            to the module-level ``dt_now``. Pass a fixed value to get
            reproducible results when re-processing historical data.

    Returns:
        A datetime Series aligned with *s*; unparsable or impossible dates
        are ``NaT``.
    """
    reference = dt_now if now is None else now

    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    ymd = (
        s.str.extract(r"(\d{1,2})月(\d{1,2})日")
        .rename(columns={0: "month", 1: "day"})
        # Fill gaps with the string "0" so each column holds only digit
        # strings before the integer cast; month/day 0 later coerces to NaT.
        .fillna("0")
        .astype(int)
    )

    ymd["year"] = reference.year

    # First attempt, assuming every date falls in the reference year.
    tentative = pd.to_datetime(ymd, errors="coerce")

    # A date after the reference point must belong to the previous year.
    ymd["year"] = ymd["year"].mask(tentative > reference, ymd["year"] - 1)

    return pd.to_datetime(ymd, errors="coerce")


# Derive timestamp columns from the raw month/day text columns.
for raw_col, ymd_col in (
    ("発症日(無症状の場合は検体採取日)", "発症日YMD"),
    ("陽性確認日", "確認日YMD"),
):
    df[ymd_col] = str2date(df[raw_col])

# "utf_8_sig" writes UTF-8 with a leading byte-order mark.
df.to_csv("sendai.csv", encoding="utf_8_sig")