How to Replace Dataframe Column Values Based on Condition of Second Dataframe Values-CodePudding

When a match of replacements.csv > Link Changed > 'Yes' is found, I want to carry out the following:

match column replacements.csv > Fruit to main.csv > External Links
replace matching fruits found in main.csv > External Links with replacements.csv > Fruit Link

To demonstrate, I need the required output to be shown as below:

replacements.csv

Fruit,Fruit Link,Link Changed
banana,https://en.wikipedia.org/wiki/Banana,
blueberry,https://en.wikipedia.org/wiki/Blueberry,
strawberry,https://en.wikipedia.org/wiki/Strawberry,Yes
raspberry,https://en.wikipedia.org/wiki/Raspberry,Yes
cherry,https://en.wikipedia.org/wiki/Cherry,
apple,https://en.wikipedia.org/wiki/Apple,Yes

main.csv

Title,External Links
Smoothie Recipes,"['banana', 'blueberry', 'strawberry', 'raspberry', 'apple']"
Fruit Pies,"['cherry', 'apple']"

required output

Title,External Links
Smoothie Recipes,"['banana', 'blueberry', 'https://en.wikipedia.org/wiki/Strawberry', 'https://en.wikipedia.org/wiki/Raspberry', 'https://en.wikipedia.org/wiki/Apple']"
Fruit Pies,"['cherry', 'https://en.wikipedia.org/wiki/Apple']"

Code

import pandas as pd

replacements = pd.read_csv('replacements.csv')
main = pd.read_csv('main.csv')

all_scrapes = []
fruits_found = []

## Replace main.csv > External Links when replacements.csv > Link Changed = Yes
def swap_urls(fruit, fruit_link):

    counter = 0

    while counter < len(main):
        title = main['Title'][counter]
        external_links = main['External Links'][counter]

        fruit_count = len(external_links.split(","))
        fruit_item_row = main['External Links'][counter].replace("'","").replace("[","").replace("]","").replace(" ","") # [0] represents main.csv row

        items = 0
        while items < fruit_count:
          single_fruit_list = fruit_item_row.split(',')[items]

          if fruit in single_fruit_list:
            print('Current Fruit Item:', single_fruit_list)
            external_links = external_links.replace(fruit, fruit_link)
            #fruits_found.append(fruit)

            product = {
              'Title': title,
              'External Link': external_links,
              #'Fruits Found': fruits_found,
              }

            print('  Product:', product)
            all_scrapes.append(product)
          else:
            pass

          items  =1

        counter  =1
    return all_scrapes


## Pass Fruit & Fruit Link values to function swap_urls when replacements.csv > Link Changed = Yes
y = 0
while y < len(replacements):
  fruit = replacements['Fruit'][y]
  fruit_link = replacements['Fruit Link'][y]
  link_changed = replacements['Link Changed'][y]

  if replacements['Link Changed'][y] == 'Yes':
      print(f'replacement.csv row [{y}]: {fruit}, Fruit Link: {fruit_link}, Link Changed: \x1b[92m{link_changed}\x1b[0m')
      swap_urls(fruit, fruit_link)
  else:
      print(f'replacement.csv row [{y}]: {fruit}, Fruit Link: {fruit_link}, Link Changed: No')
  y  =1


## Save results to File
df = pd.DataFrame(all_scrapes)
print('DF:\n', df)
df.to_excel('Result.xlsx', index=False)

Issue

I'm able to identify the fruits in replacements.csv with their counterparts in main.csv, however I'm unable to update main.csv > External Links as a single entry when multiple fruits are found. See generated output file results.xlsx

Any help would be much appreciated.

CodePudding user response：

import pandas as pd

replacements = pd.read_csv("replacements.csv")
main = pd.read_csv("main.csv")


# returns replacement link or fruit
def fruit_link(x):
    if x[1:-1] not in (replacements['Fruit'].values):
        return x[1:-1]
    return replacements.loc[replacements['Fruit'] == x[1:-1], 'Fruit Link'].values[0]\
    if replacements.loc[replacements['Fruit'] == x[1:-1], 'Link Changed'].values == 'Yes' else x[1:-1]


# split string of list to list
main["External Links"] = main["External Links"].apply(lambda x: x[1:-1].split(', '))

# explode main to fruits
main = main.explode("External Links")

# applying fruit_link to retrieve link or fruit
main["External Links"] = main["External Links"].apply(fruit_link)

# implode back
main = main.groupby('Title').agg({'External Links': lambda x: x.tolist()}).reset_index()

OUTPUT:

              Title                                                                                                                                   External Links
0        Fruit Pies                                                                                                  ['cherry', https://en.wikipedia.org/wiki/Apple]
1  Smoothie Recipes  ['banana', 'blueberry', https://en.wikipedia.org/wiki/Strawberry, https://en.wikipedia.org/wiki/Raspberry, https://en.wikipedia.org/wiki/Apple]

CodePudding user response：

Here is a relatively simple way to do this:

r = pd.read_csv('replacements.csv')
df = pd.read_csv('main.csv')

# make a proper list from the strings in 'External Links':
import ast

df['External Links'] = df['External Links'].apply(ast.literal_eval)

# make a dict for mapping
dct = r.dropna(subset='Link Changed').set_index('Fruit')['Fruit Link'].to_dict()
>>> dct
{'strawberry': 'https://en.wikipedia.org/wiki/Strawberry',
 'raspberry': 'https://en.wikipedia.org/wiki/Raspberry',
 'apple': 'https://en.wikipedia.org/wiki/Apple'}

# map, leaving the key by default
df['External Links'] = (
    df['External Links'].explode().map(lambda k: dct.get(k, k))
    .groupby(level=0).apply(pd.Series.tolist)
)

# result
>>> df
              Title                                     External Links
0  Smoothie Recipes  [banana, blueberry, https://en.wikipedia.org/w...
1        Fruit Pies      [cherry, https://en.wikipedia.org/wiki/Apple]

# result, as csv (to show quotation marks etc.)
>>> df.to_csv(index=False)
Title,External Links
Smoothie Recipes,"['banana', 'blueberry', 'https://en.wikipedia.org/wiki/Strawberry', 'https://en.wikipedia.org/wiki/Raspberry', 'https://en.wikipedia.org/wiki/Apple']"
Fruit Pies,"['cherry', 'https://en.wikipedia.org/wiki/Apple']"