🔨 Fixing and processing OCR output text

GuiltyNeuron · Sep 26, 2019 · 524d5b8 · 524d5b8
1 parent 4da1a5e
commit 524d5b8
Showing 1 changed file with 14 additions and 3 deletions.
diff --git a/ocr_api/ocr_engine.py b/ocr_api/ocr_engine.py
@@ -3,6 +3,8 @@
 from ocr_api.ctpn import CtpnDetector
 from ocr_api.preprocessing import PreProcess
 from ocr_api.tesseract_engine import TesseractEngine
+from textblob import TextBlob
+from nltk.tokenize import word_tokenize
 
 
 class OcrEngine():
@@ -46,8 +48,8 @@ def run(self, image_path):
         processed_image = image_text_only + white_mask
 
         # Save and load processed image
-        cv2.imwrite("out.png", processed_image)
-        img = cv2.imread("out.png")
+        cv2.imwrite("tmp/out.png", processed_image)
+        img = cv2.imread("tmp/out.png")
         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
 
         # Deskew
@@ -58,5 +60,14 @@ def run(self, image_path):
         # Text recognition using tesseract
         output_recognition = self.recogniser.img2txt(deskewed, 'eng')
 
-        return output_recognition
+        processed_text = ""
+
+        tokens = word_tokenize(output_recognition)
+        for t in tokens:
+            processed_text = processed_text + " " + t
+
+        b = TextBlob(processed_text)
+        corrected_text = b.correct()
+
+        return corrected_text