Skip to content

Commit

Permalink
🔨 Fixing and processing OCR output text
Browse files Browse the repository at this point in the history
  • Loading branch information
Achraf KHAZRI authored and Achraf KHAZRI committed Sep 26, 2019
1 parent 4da1a5e commit 524d5b8
Showing 1 changed file with 14 additions and 3 deletions.
17 changes: 14 additions & 3 deletions ocr_api/ocr_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from ocr_api.ctpn import CtpnDetector
from ocr_api.preprocessing import PreProcess
from ocr_api.tesseract_engine import TesseractEngine
from textblob import TextBlob
from nltk.tokenize import word_tokenize


class OcrEngine():
Expand Down Expand Up @@ -46,8 +48,8 @@ def run(self, image_path):
processed_image = image_text_only + white_mask

# Save and load processed image
cv2.imwrite("out.png", processed_image)
img = cv2.imread("out.png")
cv2.imwrite("tmp/out.png", processed_image)
img = cv2.imread("tmp/out.png")
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Deskew
Expand All @@ -58,5 +60,14 @@ def run(self, image_path):
# Text recognition using tesseract
output_recognition = self.recogniser.img2txt(deskewed, 'eng')

return output_recognition
processed_text = ""

tokens = word_tokenize(output_recognition)
for t in tokens:
processed_text = processed_text + " " + t

b = TextBlob(processed_text)
corrected_text = b.correct()

return corrected_text

0 comments on commit 524d5b8

Please sign in to comment.