ValueError: Shape of passed values is (20, 1), indices imply (20, 6)-CodePudding

I am trying to extract text from TIF files with OCR, store the results in a list called `data`, and create a pandas DataFrame (`df0`) from that list.

# Run OCR over every .tif file in the folder, extract five fields from each
# page's text with regular expressions, and build a DataFrame from the results.
data = []
listOfPages = glob.glob(r"C:/Users/name/*.tif")
for entry in listOfPages:
    text = pytesseract.image_to_string(
            Image.open(entry), lang="en"
        )
    # Adds the bare OCR string to data as its own element, separate from the
    # 6-item list that is appended at the end of this loop body.
    data.append(text)
    
    # All five patterns are compiled with DOTALL ('.' also matches newlines)
    # and MULTILINE. In these raw strings, '\\n' matches a literal backslash
    # followed by 'n', not a newline character.
    duck1 = re.compile(r'(CHE)(.*)\\n', flags = re.DOTALL | re.MULTILINE)
    asker1 = re.compile(r'(my|the)\s zzzz(.*)office', flags = re.DOTALL | re.MULTILINE)
    date1 = re.compile(r'\s dfg(\d{2}\.\d{2}\.\d{4})', flags = re.DOTALL | re.MULTILINE)
    th1 = re.compile(r'(gh|gh)\s fg\s sdf(.*)(rrr,\s rtr)', flags = re.DOTALL | re.MULTILINE)
    frage1 = re.compile(r'(\\neee)(.*)(we|wzz)\s drte\s Srrr:', flags = re.DOTALL | re.MULTILINE)
    # Each try block below searches the text with one pattern and stores the
    # selected group, or None when there is no match. The bare except silently
    # ignores any error, which leaves the target variable unassigned or still
    # holding its value from a previous iteration.
    try:
        d2 = duck1.search(text)
        if d2:
            dru = d2.group(1)
        else:
            dru = None
    except:
        pass
    try:
        asker2 = asker1.search(text)
        if asker2:
            asker = asker2.group(1)
        else:
            asker = None
    except:
        pass
    try:
        date2 = date1.search(text)
        if date2:
            # group(0) is the entire match, including the text that precedes
            # the date digits; the digits alone are group(1).
            datr = date2.group(0)
        else:
            datr = None
    except:
        pass
    try:
        # NOTE(review): thema1 is not defined in this snippet (the pattern
        # above is bound to th1). Unless it is defined elsewhere, the resulting
        # NameError is swallowed by the bare except and thema is never assigned.
        thema2 = thema1.search(text)
        if thema2:
            thema = thema2.group(1)
        else:
            thema = None
    except:
        pass
    try:
        frage2 = frage1.search(text)
        if frage2:
            frage = frage2.group(1)
        else:
            frage = None
    except:
        pass
    # One 6-item row per page, in the same order as the column names below.
    data.append([text, dru, asker, datr, thema, frage])
    
# Six column names are supplied, so each element of data is expected to be a
# 6-item row.
df0 = pd.DataFrame(data, columns =['raw_text', 'wer', 'asker', 'date', 'area', 'que_text'])
print(df0)

ValueError: Shape of passed values is (20, 1), indices imply (20, 6)

What am I doing wrong? From reading about the same error in other scenarios, my understanding is that it is caused by overlapping indices that I would need to drop before appending. Is that the case here?

CodePudding user response：

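You just have to remove the first `data.append(text)` at the top of the loop. The text is already appended at the end of each iteration as the first element of the six-item list, and the extra append puts bare strings into `data`, so its elements no longer all have six fields. With only the final append, `data` is a list of six-item lists, which gives the (20, 6) shape you need.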

# Build one 6-field row per OCR'd page and load the rows into a DataFrame.
#
# Relies on names set up earlier in the script: listOfPages (the list of .tif
# paths), plus the re, pytesseract, Image and pd imports.

# The patterns do not depend on the page, so compile them once outside the
# loop. Each entry is (compiled pattern, group to extract), listed in the same
# order as the output columns that follow 'raw_text'.
# NOTE(review): in these raw strings '\\n' matches a literal backslash followed
# by 'n', not a newline character -- confirm that is what the OCR text contains.
regex_flags = re.DOTALL | re.MULTILINE
field_patterns = [
    (re.compile(r'(CHE)(.*)\\n', flags=regex_flags), 1),                     # wer
    (re.compile(r'(my|the)\s zzzz(.*)office', flags=regex_flags), 1),        # asker
    # Group 0 is the whole match, including the prefix before the digits;
    # switch to group 1 if only the date itself is wanted.
    (re.compile(r'\s dfg(\d{2}\.\d{2}\.\d{4})', flags=regex_flags), 0),      # date
    (re.compile(r'(gh|gh)\s fg\s sdf(.*)(rrr,\s rtr)', flags=regex_flags), 1),   # area
    (re.compile(r'(\\neee)(.*)(we|wzz)\s drte\s Srrr:', flags=regex_flags), 1),  # que_text
]

# Start from an empty list so entries left over from an earlier run in the
# same session cannot end up in the DataFrame.
data = []
for entry in listOfPages:
    # Close the image file as soon as OCR is done instead of leaking the handle.
    # NOTE(review): Tesseract's stock English model is usually named 'eng';
    # confirm that an 'en' language pack is actually installed.
    with Image.open(entry) as page_image:
        text = pytesseract.image_to_string(page_image, lang="en")

    # Every row is exactly [raw_text, wer, asker, date, area, que_text]; a
    # field whose pattern does not match is stored as None. No try/except is
    # needed: search() returns None on a miss rather than raising, and every
    # value is assigned fresh for each page.
    row = [text]
    for pattern, group in field_patterns:
        match = pattern.search(text)
        row.append(match.group(group) if match else None)
    data.append(row)

df0 = pd.DataFrame(
    data,
    columns=['raw_text', 'wer', 'asker', 'date', 'area', 'que_text'],
)
print(df0)

This should work fine.