jsonschema

app.quicktype.io

import requests

# JSON Schema (draft-07) for inspections_summary.json.
# Only "data" and "last_update" are required; "labels" and the two series
# under "data" are validated only when they are present.
INSPECTIONS_SUMMARY_SCHEMA = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "required": [
        "data",
        "last_update"
    ],
    "properties": {
        # Timestamp text shaped like "YYYY/MM/DD hh:mm".
        "last_update": {
            # "/" is written unescaped: it has no special meaning in a regex,
            # and backslash-slash is not a valid Python string escape.
            "pattern": "^[0-9]{4}/[0-9]{2}/[0-9]{2} [0-9]{2}:[0-9]{2}$",
            "type": "string"
        },
        # Labels shaped like "MM/DD".
        "labels": {
            "type": "array",
            "items": {
                "type": "string",
                "pattern": "^[0-9]{2}/[0-9]{2}$"
            }
        },
        # Two integer series keyed by their Japanese names.
        "data": {
            "type": "object",
            "properties": {
                "検査検体数": {
                    "type": "array",
                    "default": [],
                    "items": {
                        "type": "integer",
                        "default": 0
                    }
                },
                "陽性確認": {
                    "type": "array",
                    "default": [],
                    "items": {
                        "type": "integer",
                        "default": 0
                    }
                }
            }
        }
    }
}

# JSON Schema (draft-07) for patients_summary.json: a "last_update" timestamp
# plus a "data" array whose items each require "日付" (a date-time string)
# and "小計" (an integer).
PATIENTS_SUMMARY_SCHEMA = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "required": ["data", "last_update"],
    "properties": {
        # Timestamp text shaped like "YYYY/MM/DD hh:mm".
        "last_update": {
            "pattern": "^[0-9]{4}/[0-9]{2}/[0-9]{2} [0-9]{2}:[0-9]{2}$",
            "type": "string",
        },
        "data": {
            "type": "array",
            "items": {
                "type": "object",
                "required": ["日付", "小計"],
                "properties": {
                    "日付": {"type": "string", "format": "date-time"},
                    "小計": {"default": 0, "type": "integer"},
                },
            },
        },
    },
}

!pip install jsonschema

import jsonschema

# Download each published JSON file and check it against its schema.
# jsonschema.validate raises when the document does not conform.

# inspections_summary.json

r = requests.get("https://raw.githubusercontent.com/stop-covid19-hyogo/covid19/development/data/inspections_summary.json")

# Stop on an HTTP error status instead of trying to decode an error page as JSON.
r.raise_for_status()

d = r.json()

jsonschema.validate(d, INSPECTIONS_SUMMARY_SCHEMA)

# patients_summary.json

r = requests.get("https://raw.githubusercontent.com/stop-covid19-hyogo/covid19/development/data/patients_summary.json")

# Stop on an HTTP error status instead of trying to decode an error page as JSON.
r.raise_for_status()

d = r.json()

jsonschema.validate(d, PATIENTS_SUMMARY_SCHEMA)

兵庫県のデータ

新型コロナウイルスに感染した患者の状況

https://web.pref.hyogo.lg.jp/kk03/corona_kanjyajyokyo.html

新型コロナウイルス感染症の県内検査状況

https://web.pref.hyogo.lg.jp/kk03/documents/pcr.xlsx

  • inspections.json(日別検査・陽性合計) ※不要
  • inspections_summary.json(日別検査・陽性合計)日付簡略

  • patients_summary.json(日別陽性合計)

新型コロナウイルス陽性者の状況(推移)

https://web.pref.hyogo.lg.jp/kk03/documents/yousei.xlsx

  • main_summary.json(検査・患者症状別合計)
  • sickbeds_summary.json(入院・病床)

兵庫県の圏域別受入可能病床数を取得(pdfminer)

pip install jaconv
pip install pdfminer.six
import re
from typing import Dict, List, Union
from urllib.parse import urljoin

import jaconv
import requests
from bs4 import BeautifulSoup
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage


def get_numbers_in_text(text: str) -> List[int]:
    """Return every run of ASCII digits found in *text* as a list of ints.

    The text is first passed through ``jaconv.z2h(..., digit=True)``
    (full-width to half-width conversion with digit conversion enabled),
    so numbers written with full-width digits are picked up as well.
    Numbers are returned in the order they appear; an input without
    digits yields an empty list.
    """
    normalized = jaconv.z2h(text, digit=True)
    return [int(token) for token in re.findall('[0-9]+', normalized)]


# Scraping: find the link to the PDF on the prefecture's page.
url = "https://web.pref.hyogo.lg.jp/kk03/200129.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

r = requests.get(url, headers=headers)

r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# First <a class="icon_pdf"> whose text matches the document title.
tag = soup.find("a", class_="icon_pdf", text=re.compile("医療提供体制の確保について"))

# soup.find returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the next line.
if tag is None:
    raise ValueError(f"PDF link not found on {url}")

# The href may be relative, so resolve it against the page URL.
link = urljoin(url, tag.get("href"))

# The PDF processing step reads "corona.pdf"; in a notebook, download it with:
#!wget $link -O corona.pdf

# PDF processing: collect the horizontal text boxes of the first page of
# "corona.pdf", join them into one text, and pull the numbers out of the
# section on acceptable beds per region.

resourceManager = PDFResourceManager()
device = PDFPageAggregator(resourceManager, laparams=LAParams())

boxes = []

with open("corona.pdf", "rb") as fp:

    interpreter = PDFPageInterpreter(resourceManager, device)

    # maxpages=1: only the first page is read.
    for page in PDFPage.get_pages(fp, maxpages=1):

        # Process the page.
        interpreter.process_page(page)

        # Fetch the layout (LTPage) object for the processed page.
        layout = device.get_result()

        # Keep only horizontal text boxes.
        boxes.extend(item for item in layout if isinstance(item, LTTextBoxHorizontal))

# Order by descending y1, then ascending x0
# (presumably top-to-bottom, left-to-right on the page -- confirm).
boxes.sort(key=lambda b: (-b.y1, b.x0))

text = "\n".join([box.get_text().strip() for box in boxes])

print(text)


# re.DOTALL lets "." span the line breaks between the two headings.
m = re.search("【圏域別受入可能病床数】\n(.+)\n3  今後の状況の進展に応じて必要となる取組等", text, re.DOTALL)

# re.search returns None when the headings are not found.
if m is None:
    raise ValueError("section '【圏域別受入可能病床数】' not found in corona.pdf")

# Use only the captured text between the headings: group(0) is the whole
# match and would add the "3" of the following heading to the numbers.
data = get_numbers_in_text(m.group(1))

print(data)

兵庫県の圏域別受入可能病床数を取得

Javaのインストールが必要

!pip install tabula-py
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

from tabula import read_pdf

# Scraping: find the link to the PDF on the prefecture's page.
url = "https://web.pref.hyogo.lg.jp/kk03/200129.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

r = requests.get(url, headers=headers)

r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# First <a class="icon_pdf"> whose text matches the document title.
tag = soup.find("a", class_="icon_pdf", text=re.compile("医療提供体制の確保について"))

# soup.find returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the next line.
if tag is None:
    raise ValueError(f"PDF link not found on {url}")

# The href may be relative, so resolve it against the page URL.
link = urljoin(url, tag.get("href"))

# Read the tables of page 1 straight from the URL (lattice mode).
dfs = read_pdf(link, pages="1", lattice=True)

# First row of the third table returned for the page.
# NOTE(review): index 2 depends on the current PDF layout -- confirm when the document changes.
s = dfs[2].iloc[0]

# Column name -> value for that row.
d = s.to_dict()

# Bare expression: shows the dict when this is the last line of a notebook cell.
d

# Remove the "合計" (total) entry from the dict and keep its value.
total = d.pop("合計")

print(total)

兵庫県の新型コロナウイルス感染症の県内検査状況からGASで簡易APIを作成

qiita.com

officeforest.org

script.google.com

ライブラリからParserを追加

https://script.google.com/macros/s/AKfycby-SbnAWcB_P8h4y1cy4dx8hHHkCTYbONFnXiixYhuXnjhPFwbr/exec

/**
 * Fetches the inspection-status page, extracts the `dataset` array literal
 * embedded in it, and writes the rows to the active sheet starting at row 2.
 *
 * Rows shorter than 5 columns are padded with zeros. The sheet is only
 * written when its last row number does not exceed the number of fetched rows.
 */
function myFunction() {

    // Scraping
    const html = UrlFetchApp.fetch('https://web.pref.hyogo.lg.jp/kf16/singatakoronakensa.html').getContentText();

    // Turn the JavaScript array literal into valid JSON:
    //  - single quotes become double quotes;
    //  - an empty element (a comma followed by another comma) becomes 0.
    // The lookahead leaves the second comma unconsumed, so runs of empty
    // elements such as ",,," are all filled in.
    const json = Parser.data(html)
        .from('var dataset=')
        .to(';var start')
        .build()
        .replace(/'/g, '"')
        .replace(/, *(?=,)/g, ', 0');

    // Logger.log(json);

    const data = JSON.parse(json);

    // Pad short rows with zeros so each row has at least 5 columns.
    for (const v of data) {
        for (let i = v.length; i < 5; i++) {
            v.push(0);
        }
    }

    // Write to the spreadsheet
    const mySheet = SpreadsheetApp.getActiveSheet();

    const lastRow = mySheet.getLastRow();
    const rows = data.length;

    // Logger.log takes a format string first, then the values.
    Logger.log('lastRow=%s, rows=%s', lastRow, rows);

    // Update check; an empty dataset is skipped because a range needs at least one row.
    if (rows > 0 && lastRow <= rows) {

        mySheet.getRange(2, 1, rows, 5).setValues(data);

    }

}

/**
 * Reads the active sheet and returns its rows as objects.
 *
 * The first row of the data range supplies the property names; every
 * following row becomes one object mapping those names to its cell values.
 *
 * @return {Object[]} One object per row after the header row.
 */
function getData() {
    const sheet = SpreadsheetApp.getActiveSheet();

    // Split the values into the header row and the remaining records.
    const [header, ...records] = sheet.getDataRange().getValues();

    return records.map((record) => {
        const entry = {};
        record.forEach((cell, column) => {
            entry[header[column]] = cell;
        });
        return entry;
    });
}

/**
 * GET handler: responds with the sheet data from getData() as
 * JSON text indented by two spaces.
 *
 * @param {Object} e Request event; not used.
 * @return {TextOutput} Text output with the JSON MIME type.
 */
function doGet(e) {
    const body = JSON.stringify(getData(), null, 2);
    const output = ContentService.createTextOutput(body);

    return output.setMimeType(ContentService.MimeType.JSON);
}