import pandas as pd
# Source data
# Four sample rows: 'A' holds comma-separated tokens, 'B' holds the string '1'.
df = pd.DataFrame(
    data={
        "A": [
            "aaa, aaa, aaa",
            "aaa, aaa, bbb",
            "bbb, aaa, aaa",
            "aaa, bbb, ccc",
        ],
        "B": ["1"] * 4,
    }
)
# Attempt under discussion: overwrite column 'A' with the unique values taken
# from its duplicated entries.
# NOTE(review): as written this is not valid Python -- the loop body is not
# indented, and an opening '[' appears to be missing before the boolean mask
# (presumably df['A'][df['A'].duplicated()].unique() was intended; confirm).
# NOTE(review): `row` is never used by the assignment, and Series.duplicated()
# compares whole cell values, not the comma-separated tokens inside one cell.
for row in df.iterrows():
df['A'] = df['A']df['A'].duplicated()].unique()
Result:
ValueError: Length of values (100) does not match length of index (1254)
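I would like to keep, for each row, only the values that are duplicated within that row's column A (and an empty string when nothing repeats):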
# Desired result: each 'A' cell reduced to its repeated token, '' when none repeats.
expected_a = ["aaa", "aaa", "aaa", ""]
expected_b = ["1", "1", "1", "1"]
df_2 = pd.DataFrame(data={"A": expected_a, "B": expected_b})
CodePudding user response:
Use collections.Counter to count the comma-separated values in each row and keep only those that occur more than once:
from collections import Counter
def _repeated_tokens(cell):
    """Return the ", "-separated tokens of *cell* seen more than once, joined by ","."""
    tallies = Counter(cell.split(", "))
    return ",".join(token for token, count in tallies.items() if count > 1)


# Replace every 'A' cell with only its repeated tokens ('' when nothing repeats).
df["A"] = [_repeated_tokens(cell) for cell in df["A"]]
print(df)
Output
     A  B
0  aaa  1
1  aaa  1
2  aaa  1
3       1