# Notebook environment setup (IPython "!" shell commands, not plain Python).
# Register the alex-p/tesseract-ocr PPA as an apt source, then refresh the index.
!add-apt-repository ppa:alex-p/tesseract-ocr -y
!apt update
# Install the Tesseract OCR engine and its development package.
!apt install tesseract-ocr
!apt install libtesseract-dev
# Print the installed Tesseract version.
!tesseract -v
# Install the Japanese language data (horizontal and vertical variants) and the
# Japanese script data packages.
!apt install tesseract-ocr-jpn tesseract-ocr-jpn-vert
!apt install tesseract-ocr-script-jpan tesseract-ocr-script-jpan-vert
# List the languages Tesseract can see, to confirm the packages were picked up.
!tesseract --list-langs
# Python wrapper used by the OCR calls further down.
!pip install pytesseract
import pathlib
import re
from urllib.parse import urljoin
import numpy as np
import requests
from bs4 import BeautifulSoup
import cv2
import pytesseract
from google.colab.patches import cv2_imshow
# Fetch the Aichi prefecture COVID-19 page and locate the summary image whose
# alt text ends with "検査陽性者"; `link` is the absolute URL of that image.
url = "https://www.pref.aichi.jp/site/covid19-aichi/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
}
# A timeout keeps the notebook from hanging forever on a stalled connection.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html5lib")
tag = soup.find("img", alt=re.compile("検査陽性者$"))
# Fail with a clear message when the page layout changed: without this check
# a missing tag raises an opaque AttributeError, and a missing src attribute
# makes urljoin() silently return the page URL itself.
if tag is None or not tag.get("src"):
    raise RuntimeError("summary image not found on {}".format(url))
src = tag.get("src")
link = urljoin(url, src)
print(link)
def get_file(url, dir=".", timeout=30):
    """Download ``url`` into directory ``dir`` and return the saved path.

    The local file name is the last path component of ``url``. The target
    directory is created if it does not exist.

    Args:
        url: Address of the file to download.
        dir: Directory the file is written to (default: current directory).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        pathlib.Path of the written file.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never saved as if it were the requested file.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    p = pathlib.Path(dir, pathlib.PurePath(url).name)
    p.parent.mkdir(parents=True, exist_ok=True)
    with p.open(mode="wb") as fw:
        fw.write(r.content)
    return p
# Download the summary image and load it. The previous version immediately
# re-read a hard-coded "2020082918.jpg", discarding the file just downloaded
# (and failing on any day that file is absent); the downloaded path is the
# single source now.
jpg_path = get_file(link)
src = cv2.imread(str(jpg_path))
# cv2.imread signals failure by returning None instead of raising.
if src is None:
    raise RuntimeError("could not read image: {}".format(jpg_path))
# Drop a 2-pixel border on every side.
src = src[2:-2, 2:-2]
# Grayscale version of the cropped source image.
gray = cv2.cvtColor(src, cv2.COLOR_BGR2GRAY)

# Dilate with a 5x5 square, take the absolute difference against the
# undilated image and invert it; then force everything brighter than 200
# to pure white.
square5 = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
thickened = cv2.dilate(gray, square5, iterations=1)
outline = cv2.bitwise_not(cv2.absdiff(thickened, gray))
outline[outline > 200] = 255

# Rescale to a fixed width of 1200 px. `wide` is the matching height
# (cv2.resize takes the size as (width, height)); it is reused further down.
src_h, src_w = outline.shape[:2]
wide = int(1200 / src_w * src_h)
large = cv2.resize(outline, (1200, wide))
# Morphological gradient (3x3 ellipse) followed by Otsu binarisation.
ellipse3 = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
gradient = cv2.morphologyEx(large, cv2.MORPH_GRADIENT, ellipse3)
_, bw = cv2.threshold(gradient, 0.0, 255.0, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

# Close with a 9x1 kernel so horizontally adjacent pixels merge.
horizontal = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 1))
connected = cv2.morphologyEx(bw, cv2.MORPH_CLOSE, horizontal)

contours, hierarchy = cv2.findContours(
    connected.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE
)

# Keep the bounding boxes of contours that are large (wider than 850 px,
# taller than 100 px) and whose filled shape covers more than 45% of the box.
# Boxes are stored as (left, right, top, bottom).
mask = np.zeros(bw.shape, dtype=np.uint8)
rects = []
for idx, outline_pts in enumerate(contours):
    left, top, width, height = cv2.boundingRect(outline_pts)
    right, bottom = left + width, top + height
    mask[top:bottom, left:right] = 0
    cv2.drawContours(mask, contours, idx, (255, 255, 255), -1)
    fill_ratio = float(cv2.countNonZero(mask[top:bottom, left:right])) / (width * height)
    if fill_ratio <= 0.45 or width <= 850 or height <= 100:
        continue
    rects.append((left, right, top, bottom))

# Order by bottom edge first, then by left edge.
rects.sort(key=lambda box: (box[3], box[0]))
print(len(rects))
print(rects)
# Bounds of the first detected rectangle; the following cell reads these too.
x1, x2, y1, y2 = rects[0]

# OCR the strip above that rectangle, from column 700 up to (not including)
# the last column.
dst = large[:y1, 700:-1].copy()
cv2_imshow(dst)

raw_text = pytesseract.image_to_string(dst, lang="jpn", config="--psm 6")
# Trim surrounding whitespace, then delete every period, comma and space.
txt = raw_text.strip().translate(str.maketrans("", "", "., "))
print(txt)
# Crop the lower 23% of the first rectangle (x1, x2, y1, y2 come from
# rects[0]), trimming 5 px from the bottom, left and right edges.
y_crop = int((y2 - y1) * 0.23)
dst = large[y1:y2, x1:x2][-y_crop:-5, 5:-5].copy()
cv2_imshow(dst)

# Detect lines with an angular step of pi/2 and paint the vertical ones
# (theta == 0) white so they are not part of what gets OCR'd.
edges = cv2.Canny(dst, 100, 200, apertureSize=3)
lines = cv2.HoughLines(edges, 1, np.pi / 2, 25)

# cv2.HoughLines returns None (not an empty array) when no line is found;
# iterating over it would raise TypeError.
if lines is not None:
    # Span the whole crop height instead of a hard-coded y = +/-100, so the
    # stroke also covers crops taller than 100 px.
    last_row = dst.shape[0] - 1
    for line in lines:
        for rho, theta in line:
            if theta == 0:
                # For theta == 0 the line is x = rho.
                x0 = int(np.cos(theta) * rho)
                cv2.line(dst, (x0, 0), (x0, last_row), (255, 255, 255), 3)
def data_check(text):
    """Print ``text``, the integers found in it, and a consistency verdict.

    Every run of digits in ``text`` is parsed as an integer. Exactly 12
    numbers are expected, and two sums are verified:

    * ``data[2] == data[3] + data[4] + data[5]``
    * ``data[1] == data[2] + data[6] + ... + data[11]``

    One line is printed as the verdict: ``OK`` when both sums hold, otherwise
    a Japanese message saying that the data count is wrong, that the
    hospitalisation subtotal does not add up, or that the positive-case total
    does not add up.

    Args:
        text: OCR output to validate.

    Returns:
        None. All results are reported on standard output.
    """
    print(text)
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    data = list(map(int, re.findall(r"\d+", text)))
    print(data)

    if len(data) != 12:
        print("データ数が足りません")
        return
    if data[2] != data[3] + data[4] + data[5]:
        print("入院の集計があいません")
        return
    if data[1] != data[2] + sum(data[6:12]):
        print("陽性者数の集計があいません")
        return
    print("OK")
# OCR the cleaned crop with three page-segmentation modes and validate each
# result. The crop is displayed once more just before the last pass.
strip_punct = str.maketrans("", "", ".,")
passes = (
    ("--psm 3", False),
    ("--psm 6", False),
    ("--psm 11", True),
)
for psm_config, show_first in passes:
    if show_first:
        cv2_imshow(dst)
    recognised = pytesseract.image_to_string(dst, lang="jpn", config=psm_config)
    # Trim surrounding whitespace, then delete periods and commas.
    txt = recognised.strip().translate(strip_punct)
    data_check(txt)

# Keep the crop on disk.
cv2.imwrite("main.png", dst)
# OCR the horizontal band that lies between the bottom edge of the second
# rectangle and the top edge of the third one (all columns but the last).
band_top = rects[1][3]
band_bottom = rects[2][2]
dst = large[band_top:band_bottom, 0:-1].copy()
cv2_imshow(dst)

recognised = pytesseract.image_to_string(dst, lang="jpn", config="--psm 6")
# Trim surrounding whitespace, then delete every period, comma and space.
txt = recognised.strip().translate(str.maketrans("", "", "., "))
print(txt)
# Debug view: draw every detected rectangle, in a random colour and labelled
# with its index, on the source image resized to the working resolution.
img = cv2.resize(src, (1200, wide))
for idx, (left, right, top, bottom) in enumerate(rects):
    color = np.random.randint(0, 255, 3).tolist()
    lower_right = (right, bottom)
    cv2.rectangle(img, (left, top), lower_right, color, 2)
    cv2.putText(img, str(idx), lower_right, cv2.FONT_HERSHEY_SIMPLEX, 0.8, color, 3)
cv2_imshow(img)