import requests INSPECTIONS_SUMMARY_SCHEMA = { "$schema": "http://json-schema.org/draft-07/schema", "type": "object", "required": [ "data", "last_update" ], "properties": { "last_update": { "pattern": "^[0-9]{4}\/[0-9]{2}\/[0-9]{2} [0-9]{2}:[0-9]{2}$", "type": "string" }, "labels": { "type": "array", "items": { "type": "string", "pattern": "^[0-9]{2}/[0-9]{2}$" } }, "data": { "type": "object", "properties": { "検査検体数": { "type": "array", "default": [], "items": { "type": "integer", "default": 0 } }, "陽性確認": { "type": "array", "default": [], "items": { "type": "integer", "default": 0 } } } } } } PATIENTS_SUMMARY_SCHEMA = { "$schema": "http://json-schema.org/draft-07/schema#", "type": "object", "required": [ "data", "last_update" ], "properties": { "last_update": { "pattern": "^[0-9]{4}/[0-9]{2}/[0-9]{2} [0-9]{2}:[0-9]{2}$", "type": "string" }, "data": { "type": "array", "items": { "type": "object", "required": [ "日付", "小計" ], "properties": { "日付": { "type": "string", "format": "date-time" }, "小計": { "default": 0, "type": "integer" } } } } } } !pip install jsonschema import jsonschema # inspections_summary.json r = requests.get("https://raw.githubusercontent.com/stop-covid19-hyogo/covid19/development/data/inspections_summary.json") d = r.json() jsonschema.validate(d, INSPECTIONS_SUMMARY_SCHEMA) # patients_summary.json r = requests.get("https://raw.githubusercontent.com/stop-covid19-hyogo/covid19/development/data/patients_summary.json") d = r.json() jsonschema.validate(d, PATIENTS_SUMMARY_SCHEMA)
兵庫県のデータ
新型コロナウイルスに感染した患者の状況
https://web.pref.hyogo.lg.jp/kk03/corona_kanjyajyokyo.html
新型コロナウイルス感染症の県内検査状況
https://web.pref.hyogo.lg.jp/kk03/documents/pcr.xlsx
- inspections.json(日別検査・陽性合計) ※不要
inspections_summary.json(日別検査・陽性合計)日付簡略
patients_summary.json(日別陽性合計)
新型コロナウイルス陽性者の状況(推移)
兵庫県の圏域別受入可能病床数を取得(pdfminer)
pip install jaconv
pip install pdfminer.six
import re
from typing import Dict, List, Union
from urllib.parse import urljoin

import jaconv
import requests
from bs4 import BeautifulSoup
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBoxHorizontal
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage


def get_numbers_in_text(text: str) -> List[int]:
    """Return every run of digits in *text* as an int, in order of appearance.

    The text is first passed through ``jaconv.z2h(..., digit=True)`` so that
    full-width digits are converted and matched by the ASCII ``[0-9]+``
    pattern as well.
    """
    normalized = jaconv.z2h(text, digit=True)
    return [int(token) for token in re.findall(r"[0-9]+", normalized)]


# Scraping: locate the PDF link on the prefecture page.
url = "https://web.pref.hyogo.lg.jp/kk03/200129.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# `string=` is the current name of the deprecated `text=` argument.
tag = soup.find("a", class_="icon_pdf", string=re.compile("医療提供体制の確保について"))
if tag is None:
    # Without this guard the failure is an opaque AttributeError on None.
    raise RuntimeError("PDF link not found on " + url)

link = urljoin(url, tag.get("href"))

# Download the PDF to corona.pdf. The notebook did this with
# `!wget $link -O corona.pdf`, which was commented out, so the file opened
# below was never fetched by this code.
pdf_response = requests.get(link, headers=headers, timeout=30)
pdf_response.raise_for_status()
with open("corona.pdf", "wb") as fw:
    fw.write(pdf_response.content)

# PDF processing
resourceManager = PDFResourceManager()
device = PDFPageAggregator(resourceManager, laparams=LAParams())

boxes = []

with open("corona.pdf", "rb") as fp:
    interpreter = PDFPageInterpreter(resourceManager, device)

    # Only the first page is read (maxpages=1).
    for page in PDFPage.get_pages(fp, maxpages=1):
        # Process the page.
        interpreter.process_page(page)

        # Fetch the LTPage layout object and keep its horizontal text boxes.
        layout = device.get_result()
        boxes.extend(
            item for item in layout if isinstance(item, LTTextBoxHorizontal)
        )

# Order boxes top-to-bottom (descending y1), then left-to-right (ascending x0).
boxes.sort(key=lambda b: (-b.y1, b.x0))

text = "\n".join(box.get_text().strip() for box in boxes)
print(text)

m = re.search("【圏域別受入可能病床数】\n(.+)\n3 今後の状況の進展に応じて必要となる取組等", text, re.DOTALL)
if m is None:
    raise RuntimeError("bed-capacity section not found in the PDF text")

# Use the captured section only. group(0) also contains the next heading,
# whose leading "3" would be appended to the numbers as a spurious value.
data = get_numbers_in_text(m.group(1))
print(data)
兵庫県の圏域別受入可能病床数を取得
※JAVAのインストールが必要
!pip install tabula-py
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tabula import read_pdf

# Locate the PDF link on the prefecture page.
url = "https://web.pref.hyogo.lg.jp/kk03/200129.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html.parser")

# `string=` is the current name of the deprecated `text=` argument.
tag = soup.find("a", class_="icon_pdf", string=re.compile("医療提供体制の確保について"))
if tag is None:
    # Without this guard the failure is an opaque AttributeError on None.
    raise RuntimeError("PDF link not found on " + url)

link = urljoin(url, tag.get("href"))

# Extract the tables on page 1 using lattice (ruled-line) detection.
dfs = read_pdf(link, pages="1", lattice=True)

# First row of the third extracted table.
# NOTE(review): index 2 depends on the current PDF layout; confirm it is
# still the bed-capacity table when the document is revised.
s = dfs[2].iloc[0]

d = s.to_dict()

# Remove the "合計" entry from the dict and report it separately.
total = d.pop("合計")
print(total)
兵庫県の新型コロナウイルス感染症の県内検査状況からGASで簡易APIを作成
ライブラリからParserを追加
https://script.google.com/macros/s/AKfycby-SbnAWcB_P8h4y1cy4dx8hHHkCTYbONFnXiixYhuXnjhPFwbr/exec
/**
 * Fetches the prefecture's inspection page, extracts the `dataset` array
 * embedded in its script, and writes it to the active sheet from row 2.
 * The sheet is only rewritten when its last row does not exceed the number
 * of fetched rows.
 */
function myFunction() {
  // Number of columns written per row; shorter rows are padded with 0.
  const COLUMNS = 5;

  // Scraping
  const html = UrlFetchApp.fetch('https://web.pref.hyogo.lg.jp/kf16/singatakoronakensa.html').getContentText();

  // Convert the array literal to JSON: single quotes become double quotes
  // and every empty slot becomes 0. The lookahead leaves the following
  // comma unconsumed, so consecutive empty slots (", , ,") are all filled;
  // consuming it would skip every second slot and break JSON.parse.
  const json = Parser.data(html)
    .from('var dataset=')
    .to(';var start')
    .build()
    .replace(/'/g, '"')
    .replace(/,(?= *,)/g, ', 0');

  const data = JSON.parse(json);

  // Pad each row with trailing zeros up to COLUMNS entries.
  const result = data.map(function (row) {
    const padded = row.slice();
    while (padded.length < COLUMNS) {
      padded.push(0);
    }
    return padded;
  });

  // Write to the spreadsheet
  const mySheet = SpreadsheetApp.getActiveSheet();
  const lastRow = mySheet.getLastRow();
  const rows = result.length;

  // Logger.log treats its first argument as a format string.
  Logger.log('lastRow=%s, rows=%s', lastRow, rows);

  // Update check. getRange() rejects a zero-row range, so skip empty data.
  if (rows > 0 && lastRow <= rows) {
    mySheet.getRange(2, 1, rows, COLUMNS).setValues(result);
  }
}

/**
 * Reads the active sheet and returns its rows as objects keyed by the
 * values of the first (header) row.
 *
 * @return {Object[]} One object per row after the header.
 */
function getData() {
  const mySheet = SpreadsheetApp.getActiveSheet();
  const rows = mySheet.getDataRange().getValues();

  // The first row holds the keys; the rest are records.
  const keys = rows.shift();

  return rows.map(function (row) {
    const obj = {};
    row.forEach(function (item, index) {
      obj[keys[index]] = item;
    });
    return obj;
  });
}

/**
 * Web app GET handler: returns the sheet contents as pretty-printed JSON.
 *
 * @param {Object} e Request event object (unused).
 * @return {TextOutput} JSON response.
 */
function doGet(e) {
  const data = getData();
  return ContentService.createTextOutput(JSON.stringify(data, null, 2)).setMimeType(ContentService.MimeType.JSON);
}