Python OCR #
Overview #
How to get text from an image using Tesseract which is an OCR (Optical Character Recognition) engine and Python.
Precondition #
Version #
- macOS: Big Sur 11.4
- Docker: 20.10.5, build 55c4c88
- docker-compose: 1.29.0, build 07737305
Simple sample #
Version #
- Debian GNU/Linux: 10 (buster)
- Python: 3.9.6
- pyocr: 0.8
- opencv-python: 4.5.3.56
Directory structure #
python-ocr/
├── README.md
├── img
│ ├── .gitkeep
│ └── sample.jpg
├── setup
│ ├── Dockerfile
│ └── docker-compose.yml
└── simple.py
Source #
Dockerfile
FROM python:latest
RUN apt-get update
RUN apt-get -y install \
tesseract-ocr* \
libgl1-mesa-dev \
vim
RUN apt-get clean
RUN pip install --upgrade pip
RUN pip install \
pillow \
pyocr \
# cv2
opencv-python \
opencv-contrib-python
WORKDIR /opt/
ENTRYPOINT ["/usr/bin/tail", "-f", "/dev/null"]
docker-compose.yml
version: '3'
services:
python-ocr:
build:
context: ../
dockerfile: setup/Dockerfile
container_name: python-ocr
volumes:
- ../:/opt
simple.py
import os
import pyocr
import sys
from PIL import Image
if len(sys.argv) != 2:
print("Please input an image file name in the first argument.")
exit()
img_dir = os.path.dirname(__file__) + '/img/'
img_file_name = os.path.basename(sys.argv[1])
img_path = img_dir + img_file_name
img_obj = Image.open(img_path)
builder = pyocr.builders.TextBuilder(tesseract_layout = 6)
ocr_engine = pyocr.get_available_tools()[0]
langs = ocr_engine.get_available_languages()
text = ocr_engine.image_to_string(img_obj, lang='jpn', builder=builder)
print(langs)
print(text)
Preparation #
pytyon-ocr
directory on your host machine
docker-compose -f setup/docker-compose.yml up -d --build
docker exec -it python-ocr bash
Run #
docker container
cd /opt/
python simple.py sample.jpg
Advanced sample #
- grayscale
- threshold processing.
- named argument
Version #
- Same as “simple sample” above.
Directory structure #
python-ocr/
├── README.md
├── img
│ ├── .gitkeep
│ └── sample.jpg
├── library
│ ├── base.py
│ ├── cv.py
│ └── ocr.py
├── main.py
├── setup
│ ├── Dockerfile
│ └── docker-compose.yml
└── simple.py
Source #
base.py
import os
from typing import NoReturn
class Base:
arg_parser = None
img_dir = ''
def __init__(self) -> NoReturn:
self.set_img_dir()
def get_file_name(self, file_path: str) -> str:
return os.path.basename(file_path)
# Get file name without extension.
def get_file_stem(self, file_path: str) -> str:
file_name = self.get_file_name(file_path)
return os.path.splitext(file_name)[0]
def is_file(self, file_path: str) -> bool:
return os.path.isfile(file_path)
def set_img_dir(self, img_dir: str = '') -> NoReturn:
if img_dir != '':
self.img_dir = img_dir
else:
dir = os.path.dirname(os.path.abspath(__file__)) + '/../img/'
self.img_dir = os.path.abspath(dir)
def get_img_dir(self) -> str:
return self.img_dir
cv.py
import cv2 # OpenCV (Open Source Computer Vision Library)
from typing import NoReturn
class ComputerVision:
def get_obj(self, img_path: str) -> cv2:
return cv2.imread(img_path)
def get_grayscale_obj(self, obj: cv2) -> cv2:
return cv2.cvtColor(obj, cv2.COLOR_BGR2GRAY)
# Thresholding
# Convert colors other than black (if a RBG value is greater than a threshold) to white RGB(255,255,255).
def get_threshold_obj(self, obj: cv2, threshold: int = 170) -> cv2:
return cv2.threshold(obj, threshold, 255, cv2.THRESH_BINARY)[1]
def write_obj_to_img(self, obj: cv2, img_path: str) -> NoReturn:
cv2.imwrite(img_path, obj)
ocr.py
import pyocr
from PIL import Image
from pyocr.builders import TextBuilder
from typing import NoReturn
class OpticalCharacterRecognition:
ocr_engine = None
def __init__(self) -> NoReturn:
self.__set_ocr_engine()
def get_img_obj(self, img_path: str) -> Image:
return Image.open(img_path)
def get_available_languages(self) -> list:
return self.ocr_engine.get_available_languages()
def get_text_by_img_obj(self, img_obj: Image, builder: TextBuilder,
lang: str = 'eng') -> str:
return self.ocr_engine.image_to_string(img_obj, lang=lang,
builder=builder)
def get_builder(self, tesseract_layout: int = 3) -> TextBuilder:
return TextBuilder(tesseract_layout=tesseract_layout)
def __set_ocr_engine(self) -> NoReturn:
self.ocr_engine = pyocr.get_available_tools()[0]
main.py
import argparse
from library import base, cv, ocr
# Initialize classes.
sample_base = base.Base()
sample_cv = cv.ComputerVision()
sample_ocr = ocr.OpticalCharacterRecognition()
langs = sample_ocr.get_available_languages()
# Get arguments.
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('--img', type=str, required=True)
args = arg_parser.parse_args()
# Set image paths.
img_file_name = sample_base.get_file_name(args.img)
img_file_stem = sample_base.get_file_stem(args.img)
img_dir = sample_base.get_img_dir() + '/'
img_org_path = img_dir + img_file_name
img_gray_path = img_dir + img_file_stem + '-gray.png'
img_threshold_path = img_dir + img_file_stem + '-threshold.png'
if sample_base.is_file(img_org_path) == False:
print("The file doesn't exist: " + img_org_path)
exit()
# Get CV (Computer Vision Library) object.
cv_obj = sample_cv.get_obj(img_org_path)
# Apply grayscale to the image.
cv_obj = sample_cv.get_grayscale_obj(cv_obj)
sample_cv.write_obj_to_img(cv_obj, img_gray_path)
# Apply Threshold processing to the image.
cv_obj = sample_cv.get_threshold_obj(cv_obj)
sample_cv.write_obj_to_img(cv_obj, img_threshold_path)
# Get text by the processed image.
img_obj = sample_ocr.get_img_obj(img_threshold_path)
builder = sample_ocr.get_builder()
text = sample_ocr.get_text_by_img_obj(img_obj, builder=builder, lang='jpn')
print(langs)
print(text)
Run #
Run below if the docker container is stopped.pytyon-ocr
directory on your host machine
docker-compose -f setup/docker-compose.yml up -d
docker exec -it python-ocr bash
docker container
cd /opt/
python main.py --img sample.jp