愛知県の検査陽性者の状況のjpegからOCRでスクレイピング(表抽出・縦線除去)

# Add the alex-p/tesseract-ocr PPA as an apt source (-y skips the prompt).
!add-apt-repository ppa:alex-p/tesseract-ocr -y

# Refresh the package index so packages from the new source can be found.
!apt update

# Install the tesseract-ocr and libtesseract-dev packages.
!apt install tesseract-ocr
!apt install libtesseract-dev

# Print the installed Tesseract version.
!tesseract -v

# Install the Japanese language/script data packages, including the
# "-vert" variants.
!apt install tesseract-ocr-jpn  tesseract-ocr-jpn-vert
!apt install tesseract-ocr-script-jpan tesseract-ocr-script-jpan-vert

# List the languages Tesseract now reports, then install the pytesseract
# Python package used by the rest of this notebook.
!tesseract --list-langs
!pip install pytesseract
import pathlib
import re
from urllib.parse import urljoin

import numpy as np
import requests
from bs4 import BeautifulSoup

import cv2
import pytesseract
from google.colab.patches import cv2_imshow

# Scraping: fetch the Aichi prefecture COVID-19 page and resolve the
# absolute URL of the status image whose alt text ends in "検査陽性者".
url = "https://www.pref.aichi.jp/site/covid19-aichi/"

# Browser-like User-Agent sent with the page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}

# requests has no default timeout; without one a stalled connection would
# hang the script indefinitely.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()

soup = BeautifulSoup(r.content, "html5lib")

# soup.find returns None when nothing matches; fail with a clear message
# instead of an AttributeError on None.
img_tag = soup.find("img", alt=re.compile("検査陽性者$"))
if img_tag is None or not img_tag.get("src"):
    raise RuntimeError(
        f"no <img> with alt ending in '検査陽性者' and a src was found on {url}"
    )
src = img_tag.get("src")

# The src attribute may be relative, so resolve it against the page URL.
link = urljoin(url, src)
print(link)

# ダウンロード
def get_file(url, dir="."):
    """Download *url* and save it under *dir*, returning the saved path.

    The file name is the last segment of the URL's path component, so a
    query string or fragment never ends up in the name.

    Args:
        url: Address of the file to download.
        dir: Directory to save into; created (with parents) if missing.
            The name shadows the builtin but is kept for compatibility
            with existing keyword callers.

    Returns:
        pathlib.Path of the written file.

    Raises:
        requests.HTTPError: The server answered with an error status.
        ValueError: No file name can be derived from *url*.
    """
    from urllib.parse import urlsplit

    # A timeout prevents hanging forever on a stalled connection.
    r = requests.get(url, timeout=30)
    # Without this check an HTTP error page would be saved as the file.
    r.raise_for_status()

    name = pathlib.PurePosixPath(urlsplit(url).path).name
    if not name:
        raise ValueError(f"cannot derive a file name from URL: {url}")

    p = pathlib.Path(dir, name)
    p.parent.mkdir(parents=True, exist_ok=True)

    with p.open(mode="wb") as fw:
        fw.write(r.content)

    return p

jpg_path = get_file(link)

# cv2.imread returns None (it does not raise) when a file is missing or
# unreadable, so both results are checked before being sliced.

# Latest file, just downloaded.
latest = cv2.imread(str(jpg_path))

# Archived file for re-processing a past day. When it is present it takes
# precedence over the latest download; when it is absent the latest
# download is used instead of crashing.
past = cv2.imread("2020082918.jpg")

picked = past if past is not None else latest
if picked is None:
    raise FileNotFoundError(
        f"could not read an image from '2020082918.jpg' or '{jpg_path}'"
    )

# Trim 2 pixels from every edge of the image.
src = picked[2:-2, 2:-2]

# Preprocessing pipeline: turns the colour image `src` into `large` (a
# cleaned, 1200-px-wide grayscale image used for OCR) and `connected` (a
# binary image used for contour detection).

# Convert to grayscale.
gray = cv2.cvtColor(src, cv2.COLOR_BGR2GRAY)

# Dilate with a 5x5 rectangular kernel and take the absolute difference from
# the undilated image; inverting that difference gives dark strokes on a
# light background.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
dilated = cv2.dilate(gray, kernel, iterations=1)
diff = cv2.absdiff(dilated, gray)

contour = cv2.bitwise_not(diff)

# Remove gray JPEG noise: every pixel brighter than 200 becomes pure white.
contour[contour > 200] = 255
# contour[contour < 100] = 0

# Resize to a fixed width of 1200 px, scaling the height by the same ratio.
# NOTE: despite its name, `wide` holds the resized *height*; it is passed as
# the second element of the (width, height) size given to cv2.resize.
h, w = contour.shape[:2]
wide = int(1200 / w * h)

large = cv2.resize(contour, (1200, wide))

# 3x3 elliptical kernel.
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))

# Morphological gradient (object boundaries).
grad = cv2.morphologyEx(large, cv2.MORPH_GRADIENT, kernel)

# Binarize; THRESH_OTSU is set, so the threshold is chosen by Otsu's method
# and the 0.0 passed here is only a placeholder.
_, bw = cv2.threshold(grad, 0.0, 255.0, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

# (9, 1) rectangular kernel.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 1))

# Noise removal: morphological closing with the kernel above.
connected = cv2.morphologyEx(bw, cv2.MORPH_CLOSE, kernel)

# Detect contours on the closed binary image.
#   cv2.RETR_EXTERNAL     -> contour retrieval mode
#   cv2.CHAIN_APPROX_NONE -> keep every point of each contour
contours, hierarchy = cv2.findContours(
    connected.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE
)

# Scratch mask that each contour is filled into.
mask = np.zeros(bw.shape, dtype=np.uint8)

rects = []

for idx, cnt in enumerate(contours):
    # Bounding rectangle of this contour.
    left, top, width, height = cv2.boundingRect(cnt)
    window = (slice(top, top + height), slice(left, left + width))

    # Clear the window, then paint the filled contour into the mask.
    mask[window] = 0
    cv2.drawContours(mask, contours, idx, (255, 255, 255), -1)

    # Fraction of the bounding rectangle covered by the filled contour.
    fill_ratio = cv2.countNonZero(mask[window]) / (width * height)

    # Keep only table-like regions: well filled, wide and tall enough.
    if fill_ratio <= 0.45 or width <= 850 or height <= 100:
        continue

    rects.append((left, left + width, top, top + height))

# Order by bottom edge first, then by left edge.
rects.sort(key=lambda rc: (rc[3], rc[0]))

# Number of tables found, then their (x1, x2, y1, y2) coordinates.
print(len(rects))

print(rects)

# Coordinates of the first table in sort order (the one whose bottom edge
# is highest on the page).
x1, x2, y1, y2 = rects[0]

# Cut out the date area: everything above that table, from x=700 up to
# (but not including) the last column.
dst = large[:y1, 700:-1].copy()
cv2_imshow(dst)

# OCR as a single text block (--psm 6), then drop periods, commas and
# spaces in one pass.
raw = pytesseract.image_to_string(dst, lang="jpn", config="--psm 6")
txt = raw.strip().translate(str.maketrans("", "", "., "))
print(txt)

# Height of the bottom 23% of the top table.
y_crop = int((y2 - y1) * 0.23)

# Cut out that bottom strip, trimming 5 px from the left, right and bottom
# edges of the table.
dst = large[y1:y2, x1:x2][-y_crop:-5, 5:-5].copy()

cv2_imshow(dst)

# Detect straight lines; an angular resolution of pi/2 restricts the
# candidates to theta == 0 and theta == pi/2.
edges = cv2.Canny(dst, 100, 200, apertureSize=3)
lines = cv2.HoughLines(edges, 1, np.pi / 2, 25)

# Erase vertical lines (theta == 0) by painting over them in white.
# cv2.HoughLines returns None when nothing is detected, so guard the loop.
if lines is not None:
    strip_height = dst.shape[0]

    for line in lines:
        for rho, theta in line:
            if theta != 0:
                continue

            # For theta == 0 the line is x = rho * cos(0) = rho.
            x0 = int(rho)

            # Cover the full height of the strip so tall crops are cleaned
            # all the way down, not just the first 100 rows.
            cv2.line(dst, (x0, 0), (x0, strip_height - 1), (255, 255, 255), 3)

def data_check(text):
    """Print *text*, the integers found in it, and a consistency verdict.

    Every run of digits in *text* is parsed as an integer. Exactly 12
    numbers are expected, and two totals are verified:

    * ``data[2]`` must equal ``data[3] + data[4] + data[5]``
      (reported as a hospitalization-total mismatch otherwise);
    * ``data[1]`` must equal ``data[2]`` plus ``data[6]`` .. ``data[11]``
      (reported as a positive-count-total mismatch otherwise).

    Args:
        text: OCR output to validate.

    Returns:
        None. The verdict is printed: "OK" or one of the error messages.
    """
    print(text)

    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    data = list(map(int, re.findall(r"\d+", text)))
    print(data)

    if len(data) != 12:
        print("データ数が足りません")
        return

    if data[2] != data[3] + data[4] + data[5]:
        print("入院の集計があいません")
        return

    if data[1] != data[2] + sum(data[6:12]):
        print("陽性者数の集計があいません")
        return

    print("OK")

# OCR the cleaned strip with page segmentation modes 3 and 6, validating
# the numbers each time. Periods and commas are removed before parsing.
for psm in (3, 6):
    recognized = pytesseract.image_to_string(dst, lang="jpn", config=f"--psm {psm}")
    txt = recognized.strip().replace(".", "").replace(",", "")
    data_check(txt)

# Show the strip that is being recognized.
cv2_imshow(dst)

# Final attempt with page segmentation mode 11.
recognized = pytesseract.image_to_string(dst, lang="jpn", config="--psm 11")
txt = recognized.strip().replace(".", "").replace(",", "")
data_check(txt)

# Save the processed strip.
cv2.imwrite("main.png", dst)

# Coordinates of the second and third tables in sort order.
x1, x2, y1, y2 = rects[1]
x3, x4, y3, y4 = rects[2]

# Cut out the band between the bottom of the second table and the top of
# the third, across every column except the last.
dst = large[y2:y3, :-1].copy()
cv2_imshow(dst)

# OCR as a single text block (--psm 6), then drop periods, commas and
# spaces in one pass.
recognized = pytesseract.image_to_string(dst, lang="jpn", config="--psm 6")
txt = recognized.strip().translate(str.maketrans("", "", "., "))
print(txt)

# Resize the colour source to the same 1200 x `wide` size used for detection
# so the rectangle coordinates line up.
img = cv2.resize(src, (1200, wide))

# Outline every detected table in a random colour and label it with its
# index at the bottom-right corner.
for idx, (x1, x2, y1, y2) in enumerate(rects):
    color = np.random.randint(0, 255, 3).tolist()
    top_left, bottom_right = (x1, y1), (x2, y2)

    cv2.rectangle(img, top_left, bottom_right, color, 2)
    cv2.putText(img, str(idx), bottom_right, cv2.FONT_HERSHEY_SIMPLEX, 0.8, color, 3)

cv2_imshow(img)