How to compare 2 csv and write result to new csv using pandas-CodePudding

I want to compare the sku of source and destination file and if destination sku matches with source, i want to copy the meta_title, meta_description and description of that row

import pandas as pd

source = pd.read_csv('/content/source-source.csv')
destination = pd.read_csv('/content/out.csv')

for i in range(0, len(source)):
    try: 
        source_sku = source.iloc[i]['sku']
        destination_sku = destination.iloc[i]['sku']

        source_meta_title = source.iloc[i]['meta_title']
        source_meta_description = source.iloc[i]['meta_description']
        source_description = source.iloc[i]['description']

        if source_sku == destination_sku:
            destination.loc[i, 'meta_title'] = source_meta_title
            destination.loc[i, 'meta_description'] = source_meta_description
            destination.loc[i, 'description'] = source_description
         
        destination.to_csv('merged.csv', index=False, encoding='utf-8-sig')
    except ValueError:
        break

source.csv

sku meta_title meta_description description
a    ab         ab               ab
b    bb         bb               bb

destination.csv

sku meta_title meta_description description
a    bb         bb               bb
b    ab         ab               ab

merged.csv

sku meta_title meta_description description
a    ab         ab               ab
b    bb         bb               bb

my code is working now, but if the number of lines changes increase, then it throw error

CodePudding user response：

I think this is what you want. It keeps the set of SKUs from destination and replaces the values for SKUs that are also in source.

import pandas as pd

source = pd.read_csv('/content/source-source.csv')
destination = pd.read_csv('/content/out.csv')

destination = destination.merge(
    source,
    how='left',
    on=['sku'],
    suffixes=('_old', ''),
)

value_cols = ['meta_title', 'meta_description', 'description']

for c in value_cols:
    destination[c] = destination[c].fillna(destination[f'{c}_old'])

destination = destination.drop(columns=[
    f'{c}_old'
    for c
    in value_cols
])

destination.to_csv('merged.csv', index=False, encoding='utf-8-sig')

CodePudding user response：

If in the merged dataset, you want to keep the rows in destination and source with a different value in ['sku'], you could try doing this way

import pandas as pd

source = pd.DataFrame({'sku':['a','b', 'c'], 'meta_title':['ab','bb', 'ac'], 'meta_description':['ab','bb', 'ac'], 'description':['ab','bb', 'ac']})
destination = pd.DataFrame({'sku':['a','b', 'd'], 'meta_title':['bb','ab', 'dd'], 'meta_description':['bb','ab', 'dd'], 'description':['bb','ab', 'dd']})

df = source.merge(destination, how='outer', on='sku', indicator=True, suffixes=['', '_2'])

cols = ['sku', 'meta_title', 'meta_description', 'description', '_merge']

df = df[cols].loc[df['sku'].drop_duplicates().index].reset_index(drop=True)


# To check only rows with the same SKUs
df[df['_merge']=='both']