I am trying to extract text from .tif files with OCR, store the results in a list `data`,
and build a pandas DataFrame `df0` from that list:
# OCR every .tif page, pull a few fields out of the recognised text with
# regular expressions, and build a DataFrame with one row per page.
# NOTE(review): nesting below is reconstructed; the final data.append([...])
# is assumed to sit inside the loop -- confirm against the original script.
data = []
listOfPages = glob.glob(r"C:/Users/name/*.tif")
for entry in listOfPages:
    # Run OCR on one page image; the result is the page text as a string.
    # NOTE(review): Tesseract language codes are normally three letters
    # (e.g. "eng") -- confirm that an "en" language pack is installed.
    text = pytesseract.image_to_string(
        Image.open(entry), lang="en"
    )
    # NOTE(review): this appends the bare OCR string as its own entry, while
    # the end of the loop appends a six-element list for the same page.
    # `data` therefore mixes two row shapes, which does not fit the six
    # column names passed to pd.DataFrame below.
    data.append(text)
    # The patterns do not depend on the page, yet are rebuilt every iteration.
    # NOTE(review): inside a raw string, `\\n` matches a literal backslash
    # followed by "n", not a newline -- confirm this is intended.
    duck1 = re.compile(r'(CHE)(.*)\\n', flags = re.DOTALL | re.MULTILINE)
    asker1 = re.compile(r'(my|the)\s zzzz(.*)office', flags = re.DOTALL | re.MULTILINE)
    date1 = re.compile(r'\s dfg(\d{2}\.\d{2}\.\d{4})', flags = re.DOTALL | re.MULTILINE)
    th1 = re.compile(r'(gh|gh)\s fg\s sdf(.*)(rrr,\s rtr)', flags = re.DOTALL | re.MULTILINE)
    frage1 = re.compile(r'(\\neee)(.*)(we|wzz)\s drte\s Srrr:', flags = re.DOTALL | re.MULTILINE)
    # Each section below searches the page text with one pattern and keeps a
    # single group of the match, or None when the pattern is not found.
    # NOTE(review): the bare `except: pass` hides every error; if the body
    # fails, the target variable keeps its previous value or stays undefined.
    try:
        d2 = duck1.search(text)
        if d2:
            dru = d2.group(1)  # first group, i.e. the literal "CHE"
        else:
            dru = None
    except:
        pass
    try:
        asker2 = asker1.search(text)
        if asker2:
            asker = asker2.group(1)  # first group: "my" or "the"
        else:
            asker = None
    except:
        pass
    try:
        date2 = date1.search(text)
        if date2:
            datr = date2.group(0)  # group 0 is the whole match, not only the date digits
        else:
            datr = None
    except:
        pass
    try:
        # NOTE(review): `thema1` is not defined in this snippet (the pattern
        # above is named `th1`); the resulting NameError is swallowed by the
        # bare except, so `thema` is never assigned here.
        thema2 = thema1.search(text)
        if thema2:
            thema = thema2.group(1)
        else:
            thema = None
    except:
        pass
    try:
        frage2 = frage1.search(text)
        if frage2:
            frage = frage2.group(1)  # first group of the frage1 pattern
        else:
            frage = None
    except:
        pass
    # One six-field row per page, in the same order as the column names below.
    data.append([text, dru, asker, datr, thema, frage])
# Build the table; six column names require every row to have six fields.
df0 = pd.DataFrame(data, columns =['raw_text', 'wer', 'asker', 'date', 'area', 'que_text'])
print(df0)
ValueError: Shape of passed values is (20, 1), indices imply (20, 6)
What am I doing wrong? From reading about the same error in other scenarios, my understanding is that there are overlapping indices which I would need to drop before appending. Is that correct?
CodePudding user response:
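You just have to stop appending `text` to `data` at the start of the loop.
You already append the complete row `[text, dru, asker, datr, thema, frage]` at the end of each iteration,
and that alone makes `data` a list of lists with shape (20, 6),
which is what the six-column DataFrame needs.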
# OCR every page in `listOfPages`, extract a few fields from the recognised
# text, and build a DataFrame with exactly one six-field row per page.

# Start from an empty list so rows left over from an earlier run cannot mix
# differently shaped entries into the table.
data = []

# Compile the patterns once; they do not depend on the page being processed.
# NOTE(review): inside a raw string, `\\n` matches a literal backslash
# followed by "n", not a newline -- confirm this is intended for duck1/frage1.
duck1 = re.compile(r'(CHE)(.*)\\n', flags=re.DOTALL | re.MULTILINE)
asker1 = re.compile(r'(my|the)\s zzzz(.*)office', flags=re.DOTALL | re.MULTILINE)
date1 = re.compile(r'\s dfg(\d{2}\.\d{2}\.\d{4})', flags=re.DOTALL | re.MULTILINE)
th1 = re.compile(r'(gh|gh)\s fg\s sdf(.*)(rrr,\s rtr)', flags=re.DOTALL | re.MULTILINE)
frage1 = re.compile(r'(\\neee)(.*)(we|wzz)\s drte\s Srrr:', flags=re.DOTALL | re.MULTILINE)

for entry in listOfPages:
    # Close the image file as soon as OCR is done instead of leaking the handle.
    # NOTE(review): Tesseract language codes are normally three letters
    # (e.g. "eng") -- confirm that an "en" language pack is installed.
    with Image.open(entry) as page:
        text = pytesseract.image_to_string(page, lang="en")

    # Do NOT append `text` on its own here: the full row is appended at the
    # end of the iteration, and every row must have the same six fields.

    # Each field is one group of the first match, or None when the pattern
    # is absent. No try/except is needed: `search` returns None on a miss.
    d2 = duck1.search(text)
    dru = d2.group(1) if d2 else None

    asker2 = asker1.search(text)
    asker = asker2.group(1) if asker2 else None

    date2 = date1.search(text)
    # group(0) is the whole match (including the leading " dfg" part),
    # exactly as in the original snippet.
    datr = date2.group(0) if date2 else None

    # The compiled pattern is named `th1`; the original searched with the
    # undefined name `thema1`, which left `thema` unassigned.
    thema2 = th1.search(text)
    thema = thema2.group(1) if thema2 else None

    frage2 = frage1.search(text)
    frage = frage2.group(1) if frage2 else None

    # Field order matches the column names passed to pd.DataFrame below.
    data.append([text, dru, asker, datr, thema, frage])

df0 = pd.DataFrame(
    data,
    columns=['raw_text', 'wer', 'asker', 'date', 'area', 'que_text'],
)
print(df0)
This should work fine.