Pytesseract result includes unexpected content "\n\x0c"-CodePudding

I'm using Python OCR (pytesseract) to convert images to text and then check whether any of the results are duplicates. I'm checking the crops one by one so that the duplicates are easier to locate.

pic link: https://imgur.com/a/0BGmtEV

Main issue: from the original picture (see the pic link above) I saved the image-to-text result of each crop, e.g. CAT4B5, CA7T4BB, CATAAF ..., in a list, but when I print that list it looks like the list shown below:

How can I remove the \n\n and \n\x0c?

I researched this for a while and found "Remove '\n\n\n', '\n' from python list", but I want to avoid the problem at the point where the text is produced, not clean the list up afterwards. There should be some way to prevent it from the beginning.

[' \n\nCAT4B5\n\x0c', 'CA7T4BB\n\x0c', 'CATAAF\n\x0c', 'CAT4C1\n\x0c', '‘CAT4C7\n\x0c', 'CAT4B6\n\x0c', 'CAT4B0\n\x0c', 'CAT4BC\n\x0c', 'CAT4C2\n\x0c', 'CAT4C8\n\x0c', ' \n\x0c', ' \n\nCAT4B7\n\x0c', 'CATAC3\n\x0c', 'CAT4C9\n\x0c', ' \n\nCAT4B2\n\x0c', ' \n\nCA7T4B8\n\x0c', ' \n\nCATACS\n\x0c', 'CATAC4\n\x0c', 'CATACA\n\x0c', ' \n\nCATABS\n\x0c', ' \n\nCAT4B9\n\x0c', ' \n\nCAT4BF\n\x0c', 'CAT4CS\n\x0c', 'CAT4CB\n\x0c', ' \n\nCAT4B4\n\x0c', 'CATABA\n\x0c', ' \n\nCATACE\n\x0c', 'CATACE\n\x0c', 'CAT4CC\n\x0c']

the whole .py script:

import os
import cv2
import numpy as np
from PIL import Image
import pytesseract

# Find every tile in the sheet, OCR the label inside each tile, and record the
# grid positions whose OCR text has already been seen (duplicates).
image = cv2.imread("/home/student_DC/desktop/optimization_11_10/original_duplicate.png")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Inverse-threshold the grayscale image, then dilate with a wide kernel so that
# neighbouring marks merge into one contour.
_, thresh = cv2.threshold(gray, 60, 255, cv2.THRESH_BINARY_INV)
blur = cv2.medianBlur(thresh, 1)
kernel = np.ones((10, 20), np.uint8)
img_dilation = cv2.dilate(blur, kernel, iterations=1)

# findContours returns (image, contours, hierarchy) on OpenCV 3.x but
# (contours, hierarchy) on OpenCV 4.x; the contours are second-to-last in both.
ctrs = cv2.findContours(img_dilation.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]

# Visit the contours from left to right.
sorted_ctrs = sorted(ctrs, key=lambda ctr: cv2.boundingRect(ctr)[0])

xy_list = []                     # [x, y] of every crop that was OCR'ed
listOfElems = []                 # distinct OCR strings, in order of first appearance
listOfDuplicate = []
list_for_duplicate_x_and_y = []  # [x, y] of crops whose text was already in listOfElems

for ctr in sorted_ctrs:
    # Get bounding box
    x, y, w, h = cv2.boundingRect(ctr)
    roi = image[y:y + h, x:x + w]  # only used by the commented-out debug dump below

    # Skip contours that are too small or too tall.
    if not ((h > 50 and w > 50) and h < 200):
        continue

    cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), 1)

    print(x, y)

    # Snap x to a hard-coded column position (within 20 px) and shift it by the
    # fixed offset; 0 means "no column matched".
    for xc in (45, 150, 255, 360, 465, 570):
        if xc - 20 < x < xc + 20:
            x = xc + 26
            break
    else:
        x = 0

    # Same for y against the hard-coded row positions.
    for yc in (132, 243, 586, 357, 470):
        if yc - 20 < y < yc + 20:
            y = yc + 48
            break
    else:
        y = 0

    print("new number", x, y)

    if (x != 0) and (y != 0):
        # cv2.imwrite(f"/home/student_DC/desktop/optimization_11_10/output_11_10__001/output_y:{y}_x:{x}.png", roi)
        position = [x, y]
        xy_list.append(position)

        # Fixed-size crop at the snapped position.
        w = 59
        h = 23
        new_crop = image[y:y + h, x:x + w]
        # cv2.imwrite(f"/home/student_DC/desktop/optimization_11_10/output_11_10__002/output_y:{y}_x:{x}.png" , new_crop)

        # NOTE: the OCR result is stored exactly as returned, so it keeps any
        # surrounding whitespace such as the trailing '\n\x0c'.
        text = pytesseract.image_to_string(new_crop, lang='eng')

        if text not in listOfElems:
            listOfElems.append(text)
            print(text)
            print("=  =  =  =  =  ")
        else:
            print("Duplicate text is here:")
            print("x :", x, "y :", y)
            list_for_duplicate_x_and_y.append(position)

            print("=  =  =  =  =  ")


print("len is : ", len(xy_list))

# Order the positions row by row (y first, then x) for readability.
aaa_list = sorted(xy_list, key=lambda k: [k[1], k[0]])
print(aaa_list)

print("list_for_duplicate_x_and_y is :")

print(list_for_duplicate_x_and_y)

print("listOfElems is :")
print(listOfElems)


# Reload a clean copy of the input and write it out; the drawing calls that
# would mark duplicates are still commented out.
img = cv2.imread("/home/student_DC/desktop/optimization_11_10/original_duplicate.png")

#               cv2.rectangle(img, (duplicate_x, duplicate_y), (duplicate_x + 92, duplicate_y + 82), (0, 255, 5), 5)              # thin green frame
# cv2.rectangle(img, (120, 120), (150, 150), (255, 0, 0), 5)         # thick blue frame

cv2.imwrite("/home/student_DC/desktop/optimization_11_10/original_duplicate_output.png", img)

the entire output:

0 0
new number 0 0
44 472
new number 71 518
 

CAT4B5


=  =  =  =  =  
44 357
new number 71 405
CA7T4BB


=  =  =  =  =  
45 586
new number 71 634
CATAAF


=  =  =  =  =  
46 242
new number 71 291
CAT4C1


=  =  =  =  =  
50 132
new number 71 180
‘CAT4C7


=  =  =  =  =  
148 472
new number 176 518
CAT4B6


=  =  =  =  =  
149 587
new number 176 634
CAT4B0


=  =  =  =  =  
149 357
new number 176 405
CAT4BC


=  =  =  =  =  
150 243
new number 176 291
CAT4C2


=  =  =  =  =  
153 132
new number 176 180
CAT4C8


=  =  =  =  =  
253 588
new number 281 634
 


=  =  =  =  =  
253 473
new number 281 518
 

CAT4B7


=  =  =  =  =  
254 357
new number 281 405
Duplicate text is here:
x : 281 y : 405
=  =  =  =  =  
255 243
new number 281 291
CATAC3


=  =  =  =  =  
257 132
new number 281 180
CAT4C9


=  =  =  =  =  
357 588
new number 386 634
 

CAT4B2


=  =  =  =  =  
358 473
new number 386 518
 

CA7T4B8


=  =  =  =  =  
358 361
new number 386 405
 

CATACS


=  =  =  =  =  
359 243
new number 386 291
CATAC4


=  =  =  =  =  
360 132
new number 386 180
CATACA


=  =  =  =  =  
461 589
new number 491 634
 

CATABS


=  =  =  =  =  
462 474
new number 491 518
 

CAT4B9


=  =  =  =  =  
463 358
new number 491 405
 

CAT4BF


=  =  =  =  =  
463 243
new number 491 291
CAT4CS


=  =  =  =  =  
464 131
new number 491 180
CAT4CB


=  =  =  =  =  
566 589
new number 596 634
 

CAT4B4


=  =  =  =  =  
567 474
new number 596 518
CATABA


=  =  =  =  =  
567 361
new number 596 405
 

CATACE


=  =  =  =  =  
568 244
new number 596 291
CATACE


=  =  =  =  =  
568 131
new number 596 180
CAT4CC


=  =  =  =  =  
len is :  30
[[71, 180], [176, 180], [281, 180], [386, 180], [491, 180], [596, 180], [71, 291], [176, 291], [281, 291], [386, 291], [491, 291], [596, 291], [71, 405], [176, 405], [281, 405], [386, 405], [491, 405], [596, 405], [71, 518], [176, 518], [281, 518], [386, 518], [491, 518], [596, 518], [71, 634], [176, 634], [281, 634], [386, 634], [491, 634], [596, 634]]
list_for_duplicate_x_and_y is :
[[281, 405]]
listOfElems is :
[' \n\nCAT4B5\n\x0c', 'CA7T4BB\n\x0c', 'CATAAF\n\x0c', 'CAT4C1\n\x0c', '‘CAT4C7\n\x0c', 'CAT4B6\n\x0c', 'CAT4B0\n\x0c', 'CAT4BC\n\x0c', 'CAT4C2\n\x0c', 'CAT4C8\n\x0c', ' \n\x0c', ' \n\nCAT4B7\n\x0c', 'CATAC3\n\x0c', 'CAT4C9\n\x0c', ' \n\nCAT4B2\n\x0c', ' \n\nCA7T4B8\n\x0c', ' \n\nCATACS\n\x0c', 'CATAC4\n\x0c', 'CATACA\n\x0c', ' \n\nCATABS\n\x0c', ' \n\nCAT4B9\n\x0c', ' \n\nCAT4BF\n\x0c', 'CAT4CS\n\x0c', 'CAT4CB\n\x0c', ' \n\nCAT4B4\n\x0c', 'CATABA\n\x0c', ' \n\nCATACE\n\x0c', 'CATACE\n\x0c', 'CAT4CC\n\x0c']

CodePudding user response：

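Use the strip method to remove the unwanted characters from the string when assigning the string value to the text variable. Called without arguments, str.strip() removes all leading and trailing whitespace, which includes spaces, newlines (\n) and the form feed character (\x0c) that Tesseract appends as a page separator.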

text = pytesseract.image_to_string(new_crop, lang='eng').strip()

Example:

t = ' \n\nCAT4B5\n\x0c'
t.strip()
# 'CAT4B5'