I'm using Python OCR (pytesseract) to convert image regions to text and then check whether any of the results are duplicates. I check the regions one by one so that I can locate a duplicate more easily.
pic link: https://imgur.com/a/0BGmtEV
Main issue: from the original picture (see the pic link) I saved each image-to-text result (e.g. CAT4B5, CA7T4BB, CATAAF ...)
in a list, but when I print that list it looks like the output below:
How can I remove `\n\n` and `\n\x0c`?
I did some research and found this: "Remove '\n\n\n', '\n' from python list". However, I want to avoid this situation from the beginning rather than clean it up afterwards. There should be some way to avoid it at the source.
[' \n\nCAT4B5\n\x0c', 'CA7T4BB\n\x0c', 'CATAAF\n\x0c', 'CAT4C1\n\x0c', '‘CAT4C7\n\x0c', 'CAT4B6\n\x0c', 'CAT4B0\n\x0c', 'CAT4BC\n\x0c', 'CAT4C2\n\x0c', 'CAT4C8\n\x0c', ' \n\x0c', ' \n\nCAT4B7\n\x0c', 'CATAC3\n\x0c', 'CAT4C9\n\x0c', ' \n\nCAT4B2\n\x0c', ' \n\nCA7T4B8\n\x0c', ' \n\nCATACS\n\x0c', 'CATAC4\n\x0c', 'CATACA\n\x0c', ' \n\nCATABS\n\x0c', ' \n\nCAT4B9\n\x0c', ' \n\nCAT4BF\n\x0c', 'CAT4CS\n\x0c', 'CAT4CB\n\x0c', ' \n\nCAT4B4\n\x0c', 'CATABA\n\x0c', ' \n\nCATACE\n\x0c', 'CATACE\n\x0c', 'CAT4CC\n\x0c']
- the whole `.py` script:
import os
import cv2
import numpy as np
from PIL import Image
import pytesseract
# Script: find the cells of a grid picture, OCR the label inside each cell,
# and report the cells whose OCR text was already seen in an earlier cell.

# Load the picture and build an inverted binary mask (pixels darker than 60
# become white in the mask).
image = cv2.imread("/home/student_DC/desktop/optimization_11_10/original_duplicate.png")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
white_bg = 255 * np.ones_like(image)
ret, thresh = cv2.threshold(gray, 60, 255, cv2.THRESH_BINARY_INV)
blur = cv2.medianBlur(thresh, 1)

# Dilate with a wide kernel before contour detection
# (presumably so that the parts of one cell merge into a single contour).
kernel = np.ones((10, 20), np.uint8)
img_dilation = cv2.dilate(blur, kernel, iterations=1)

# findContours returns three values on OpenCV 3.x but two on 2.x / 4.x;
# the last two are always (contours, hierarchy), so slice instead of
# unpacking a fixed count.
ctrs, hier = cv2.findContours(
    img_dilation.copy(), cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE
)[-2:]

# Visit the contours from left to right (sorted by bounding-box x).
sorted_ctrs = sorted(ctrs, key=lambda ctr: cv2.boundingRect(ctr)[0])

xy_list = []                      # crop origin [x, y] of every OCR'd cell
listOfElems = []                  # distinct OCR strings, in order of first appearance
listOfDuplicate = []
list_for_duplicate_x_and_y = []   # crop origin [x, y] of every cell flagged as duplicate

for i, ctr in enumerate(sorted_ctrs):
    # Get bounding box
    x, y, w, h = cv2.boundingRect(ctr)
    roi = image[y:y + h, x:x + w]

    # Only handle contours whose size matches the filter below.
    if (h > 50 and w > 50) and h < 200:
        cv2.rectangle(image, (x, y), (x + w, y + h), (255, 255, 255), 1)
        print(x, y)

        # Snap x to a fixed crop origin: if x lies within +/-20 px (exclusive)
        # of one of the listed values, use that value + 26; otherwise the
        # for/else marks "no match" with 0.
        # NOTE(review): the listed values look like the column / row origins
        # of the cells in this particular picture -- confirm before reuse.
        for xc in (45, 150, 255, 360, 465, 570):
            if xc - 20 < x < xc + 20:
                x = xc + 26
                break
        else:
            x = 0

        # Same snapping for y, with an offset of 48.
        for yc in (132, 243, 586, 357, 470):
            if yc - 20 < y < yc + 20:
                y = yc + 48
                break
        else:
            y = 0

        print("new number", x, y)
        tem_list_x_and_y = []
        tem_list_for_duplicate_x_and_y = []

        if (x != 0) and (y != 0):
            # cv2.imwrite(f"/home/student_DC/desktop/optimization_11_10/output_11_10__001/output_y:{y}_x:{x}.png", roi)
            tem_list_x_and_y.append(x)
            tem_list_x_and_y.append(y)
            xy_list.append(tem_list_x_and_y)

            # Fixed-size crop (59 x 23) starting at the snapped origin.
            w = 59
            h = 23
            new_crop = image[y:y + h, x:x + w]
            # cv2.imwrite(f"/home/student_DC/desktop/optimization_11_10/output_11_10__002/output_y:{y}_x:{x}.png" , new_crop)

            # NOTE: image_to_string returns the raw OCR text. As the printed
            # list shows, it can carry leading/trailing whitespace such as
            # '\n' and '\x0c', and the membership test below compares those
            # raw strings, so the same label with different padding is NOT
            # reported as a duplicate. Normalising with .strip() here would
            # avoid that.
            text = pytesseract.image_to_string(new_crop, lang='eng')

            if text not in listOfElems:
                listOfElems.append(text)
                print(text)
                print("= = = = = ")
            else:
                print("Duplicate text is here:")
                print("x :", x, "y :", y)
                tem_list_for_duplicate_x_and_y.append(x)
                tem_list_for_duplicate_x_and_y.append(y)
                # Store the list built for this duplicate rather than the
                # object that is already held by xy_list.
                list_for_duplicate_x_and_y.append(tem_list_for_duplicate_x_and_y)
                print("= = = = = ")

print("len is : ", len(xy_list))

# Order the crop origins row by row (y first, then x) for printing.
aaa_list = sorted(xy_list, key=lambda k: [k[1], k[0]])
print(aaa_list)

print("list_for_duplicate_x_and_y is :")
print(list_for_duplicate_x_and_y)

print("listOfElems is :")
print(listOfElems)

# Re-read the untouched picture and write it back out; the marking
# rectangles below are currently disabled.
img = cv2.imread("/home/student_DC/desktop/optimization_11_10/original_duplicate.png")
# cv2.rectangle(img, (duplicate_x, duplicate_y), (duplicate_x + 92, duplicate_y + 82), (0, 255, 5), 5)  # thin green frame
# cv2.rectangle(img, (120, 120), (150, 150), (255, 0, 0), 5)  # thick blue frame
cv2.imwrite("/home/student_DC/desktop/optimization_11_10/original_duplicate_output.png", img)
- the entire output:
0 0
new number 0 0
44 472
new number 71 518
CAT4B5
= = = = =
44 357
new number 71 405
CA7T4BB
= = = = =
45 586
new number 71 634
CATAAF
= = = = =
46 242
new number 71 291
CAT4C1
= = = = =
50 132
new number 71 180
‘CAT4C7
= = = = =
148 472
new number 176 518
CAT4B6
= = = = =
149 587
new number 176 634
CAT4B0
= = = = =
149 357
new number 176 405
CAT4BC
= = = = =
150 243
new number 176 291
CAT4C2
= = = = =
153 132
new number 176 180
CAT4C8
= = = = =
253 588
new number 281 634
= = = = =
253 473
new number 281 518
CAT4B7
= = = = =
254 357
new number 281 405
Duplicate text is here:
x : 281 y : 405
= = = = =
255 243
new number 281 291
CATAC3
= = = = =
257 132
new number 281 180
CAT4C9
= = = = =
357 588
new number 386 634
CAT4B2
= = = = =
358 473
new number 386 518
CA7T4B8
= = = = =
358 361
new number 386 405
CATACS
= = = = =
359 243
new number 386 291
CATAC4
= = = = =
360 132
new number 386 180
CATACA
= = = = =
461 589
new number 491 634
CATABS
= = = = =
462 474
new number 491 518
CAT4B9
= = = = =
463 358
new number 491 405
CAT4BF
= = = = =
463 243
new number 491 291
CAT4CS
= = = = =
464 131
new number 491 180
CAT4CB
= = = = =
566 589
new number 596 634
CAT4B4
= = = = =
567 474
new number 596 518
CATABA
= = = = =
567 361
new number 596 405
CATACE
= = = = =
568 244
new number 596 291
CATACE
= = = = =
568 131
new number 596 180
CAT4CC
= = = = =
len is : 30
[[71, 180], [176, 180], [281, 180], [386, 180], [491, 180], [596, 180], [71, 291], [176, 291], [281, 291], [386, 291], [491, 291], [596, 291], [71, 405], [176, 405], [281, 405], [386, 405], [491, 405], [596, 405], [71, 518], [176, 518], [281, 518], [386, 518], [491, 518], [596, 518], [71, 634], [176, 634], [281, 634], [386, 634], [491, 634], [596, 634]]
list_for_duplicate_x_and_y is :
[[281, 405]]
listOfElems is :
[' \n\nCAT4B5\n\x0c', 'CA7T4BB\n\x0c', 'CATAAF\n\x0c', 'CAT4C1\n\x0c', '‘CAT4C7\n\x0c', 'CAT4B6\n\x0c', 'CAT4B0\n\x0c', 'CAT4BC\n\x0c', 'CAT4C2\n\x0c', 'CAT4C8\n\x0c', ' \n\x0c', ' \n\nCAT4B7\n\x0c', 'CATAC3\n\x0c', 'CAT4C9\n\x0c', ' \n\nCAT4B2\n\x0c', ' \n\nCA7T4B8\n\x0c', ' \n\nCATACS\n\x0c', 'CATAC4\n\x0c', 'CATACA\n\x0c', ' \n\nCATABS\n\x0c', ' \n\nCAT4B9\n\x0c', ' \n\nCAT4BF\n\x0c', 'CAT4CS\n\x0c', 'CAT4CB\n\x0c', ' \n\nCAT4B4\n\x0c', 'CATABA\n\x0c', ' \n\nCATACE\n\x0c', 'CATACE\n\x0c', 'CAT4CC\n\x0c']
CodePudding user response:
Use the `strip` method to remove the unwanted characters from the string when assigning the string value to the `text` variable.
# .strip() removes the leading/trailing whitespace (here '\n' and '\x0c') from the OCR result before it is used.
text = pytesseract.image_to_string(new_crop, lang='eng').strip()
Example:
# Sample OCR result: leading space and blank lines, trailing newline and form feed.
t = ' \n\nCAT4B5\n\x0c'
# With no arguments, strip() removes all leading and trailing whitespace, including '\x0c'.
t.strip()
# 'CAT4B5'