Python: How to treat CSV with text delimiter at the beginning and end of the line?-CodePudding

I need the reading of the files to derive to the dataframe, but I didn't find a solution to handle this file format:

Line 12 of CSV

Date,Entity ID,Entity Name,Time,Contacts Recvd Revisado,Contacts Recvd Act,Contacts Handled Sched,Contacts Handled Access%,Contacts Abndn Act,Contacts Abndn Perc,AHT Revisado,AHT Act,Service Level Revisado,Service Level Act,Occupancy Revisado,Occupancy Act,ASA Revisado,ASA Act,Requirements Revisado,Requirements Act,Requirements  /-,Sched Open,Staff Est.,Staff - Req
"11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.00,60.24,""3,911.69"",0.00,0.00,75,0,100.00,0.00,2.56,0.00,0,0,0.69,0.00,0.00,5.00,0.00,0.00",,,,,,,,,,,,,,,,,,,,,,,

this is the code i'm using

import os    
import glob    
import pandas as pd

path = r'\\Srvflssp03\gto\Planejamento_Operacional\Forecast\2021\11 Novembro\Relatórios MIS'

file_list = glob.glob(os.path.join(path, '**/*.csv'), recursive=True)

combined_csv = pd.concat([pd.read_csv(f,quotechar = '"', sep = ',') for f in file_list ])

DataFrame = pd.DataFrame(combined_csv)

DataFrame.head(11)

result in python dataframe

Could anyone tell me what I can do to fix this?

CodePudding user response：

You could read it as normal text, repair line with " ", write all back to file as text, and later read all with pd.read_csv() without problems.

If you already read it then you may try to repair it. You can get text from first column Date and use io.StringIO to read it as csv to another DataFrame - and then you can copy row back to original place.

text = '''Date,Entity ID,Entity Name,Time,Contacts Recvd Revisado,Contacts Recvd Act,Contacts Handled Sched,Contacts Handled Access%,Contacts Abndn Act,Contacts Abndn Perc,AHT Revisado,AHT Act,Service Level Revisado,Service Level Act,Occupancy Revisado,Occupancy Act,ASA Revisado,ASA Act,Requirements Revisado,Requirements Act,Requirements  /-,Sched Open,Staff Est.,Staff - Req
"11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.00,60.24,""3,911.69"",0.00,0.00,75,0,100.00,0.00,2.56,0.00,0,0,0.69,0.00,0.00,5.00,0.00,0.00",,,,,,,,,,,,,,,,,,,,,,,
'''

import pandas as pd
import io

# here I use `io` only to simulate file
df = pd.read_csv(io.StringIO(text))
print(df)

# get string from first column (`Date`) in row `0`
line = df.iloc[0]['Date']
print(line)

# convert to new dataframe
df2 = pd.read_csv(io.StringIO(line), header=None)
print(df2)

# copy back to original dataframe
df.iloc[0] = df2.iloc[0]

print(df)

Result:

# df - before changes
                                                Date  ...  Staff - Req
0  11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.0...  ...          NaN

[1 rows x 24 columns]

# string `line`
11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.00,60.24,"3,911.69",0.00,0.00,75,0,100.00,0.00,2.56,0.00,0,0,0.69,0.00,0.00,5.00,0.00,0.00

# df2
        0     1                 2        3     4   ...   19   20   21   22   23
0  11/1/21  2559  AUTORIZACOES BDC  2:30 AM  1.54  ...  0.0  0.0  5.0  0.0  0.0

[1 rows x 24 columns]

# df - after changes
      Date  Entity ID       Entity Name  ... Sched Open  Staff Est.  Staff - Req
0  11/1/21     2559.0  AUTORIZACOES BDC  ...        5.0         0.0          0.0

[1 rows x 24 columns]

For your full data it will need to use row 10

line = df.iloc[10]['Date']

# ...

df.iloc[10] = df2.iloc[0]  # `df2` uses `0`

EDIT:

Version with more rows and with for-loop

text = '''Date,Entity ID,Entity Name,Time,Contacts Recvd Revisado,Contacts Recvd Act,Contacts Handled Sched,Contacts Handled Access%,Contacts Abndn Act,Contacts Abndn Perc,AHT Revisado,AHT Act,Service Level Revisado,Service Level Act,Occupancy Revisado,Occupancy Act,ASA Revisado,ASA Act,Requirements Revisado,Requirements Act,Requirements  /-,Sched Open,Staff Est.,Staff - Req
"11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.00,60.24,""3,911.69"",0.00,0.00,75,0,100.00,0.00,2.56,0.00,0,0,0.69,0.00,0.00,5.00,0.00,0.00",,,,,,,,,,,,,,,,,,,,,,,
11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.00,60.24,"3,911.69",0.00,0.00,75,0,100.00,0.00,2.56,0.00,0,0,0.69,0.00,0.00,5.00,0.00,0.00
"11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.00,60.24,""3,911.69"",0.00,0.00,75,0,100.00,0.00,2.56,0.00,0,0,0.69,0.00,0.00,5.00,0.00,0.00",,,,,,,,,,,,,,,,,,,,,,,
'''

import pandas as pd
import io

# here I use `io` only to simulate file
df = pd.read_csv(io.StringIO(text))
print(df)

for index, row in df.iterrows():
    # get string from first column (`Date`) in row
    line = row['Date']

    if ',' in line:
        #print(line)

        # convert to new dataframe
        temp_df = pd.read_csv(io.StringIO(line), header=None)
        #print(temp_df)
    
        # copy back to original dataframe
        df.iloc[index] = temp_df.iloc[0]
    
print(df)

and the same with apply() instead of for-loop

text = '''Date,Entity ID,Entity Name,Time,Contacts Recvd Revisado,Contacts Recvd Act,Contacts Handled Sched,Contacts Handled Access%,Contacts Abndn Act,Contacts Abndn Perc,AHT Revisado,AHT Act,Service Level Revisado,Service Level Act,Occupancy Revisado,Occupancy Act,ASA Revisado,ASA Act,Requirements Revisado,Requirements Act,Requirements  /-,Sched Open,Staff Est.,Staff - Req
"11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.00,60.24,""3,911.69"",0.00,0.00,75,0,100.00,0.00,2.56,0.00,0,0,0.69,0.00,0.00,5.00,0.00,0.00",,,,,,,,,,,,,,,,,,,,,,,
11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.00,60.24,"3,911.69",0.00,0.00,75,0,100.00,0.00,2.56,0.00,0,0,0.69,0.00,0.00,5.00,0.00,0.00
"11/1/21,2559,AUTORIZACOES BDC,2:30 AM,1.54,0.00,60.24,""3,911.69"",0.00,0.00,75,0,100.00,0.00,2.56,0.00,0,0,0.69,0.00,0.00,5.00,0.00,0.00",,,,,,,,,,,,,,,,,,,,,,,
'''

import pandas as pd
import io

# here I use `io` only to simulate file
df = pd.read_csv(io.StringIO(text))
print(df)

def convert(row):
    # get string from first column (`Date`) in row
    line = row['Date']

    if ',' in line:
        # convert to new dataframe
        temp_df = pd.read_csv(io.StringIO(line), header=None, names=df.columns)
        #print(temp_df)
    
        # copy back to original dataframe
        row = temp_df.iloc[0]

    return row

df = df.apply(convert, axis=1)
    
print(df)