Iterate Over Rows in Pandas DataFrame Deleting All Values Within a Specified Number of Columns After-CodePudding

As the title implies, I would like to iterate over the rows of the dataframe shown below.

I have a specific string that occurs somewhere within every row of my dataframe. For each row, I would like to delete the value in every column that comes after that string, up to (but not including) a specific column (in this case 'zz').

In every row, every value after the specific string ('a') should be deleted, up until 'zz'. I do not want to delete any values in column 'zz', or in any column after 'zz' (i.e. column 'aa').

# Marker string: in each row, the values that follow it are to be deleted.
afterString = 'a'

# Input data, keyed by column name; each list holds one value per row.
df = {
    'bb': ['a', 'z', 'y'],
    'vv': ['b', 'a', 'z'],
    'ww': ['c', 'b', 'a'],
    'xx': ['d', 'c', 'b'],
    'yy': ['e', 'd', 'c'],
    'zz': ['f', 'e', 'd'],
    'aa': ['g', 'f', 'e'],
}

# Desired result: values after 'a' are blanked in each row, while columns
# 'zz' and 'aa' keep their original contents.
output = {
    'bb': ['a', 'z', 'y'],
    'vv': ['', 'a', 'z'],
    'ww': ['', '', 'a'],
    'xx': ['', '', ''],
    'yy': ['', '', ''],
    'zz': ['f', 'e', 'd'],  # the comma after this entry was missing (SyntaxError)
    'aa': ['g', 'f', 'e'],
}

CodePudding user response：

Here is an iterative solution. It may not be very efficient on large dataframes, but it does the job:

import pandas as pd

# Sample input: one keyword per column, one list entry per row.
data = dict(
    bb=['a', 'z', 'y'],
    vv=['b', 'a', 'z'],
    ww=['c', 'b', 'a'],
    xx=['d', 'c', 'b'],
    yy=['e', 'd', 'c'],
    zz=['f', 'e', 'd'],
    aa=['g', 'f', 'e'],
)
df = pd.DataFrame(data)

def check_row(row, after_string='a'):
    """Blank out every value that follows the first marker in a row.

    Args:
        row: A pandas Series holding one row, e.g. as passed by
            ``DataFrame.apply(..., axis=1)``.
        after_string: Marker value to look for. Defaults to ``'a'``.

    Returns:
        A copy of ``row`` in which every position after the first
        occurrence of ``after_string`` is set to ``None``. This includes
        any later repeats of the marker itself. If the marker does not
        occur, ``row`` is returned unchanged.
    """
    try:
        first_hit = row.to_list().index(after_string)
    except ValueError:
        # Marker absent: nothing to delete in this row.
        return row

    # Work on a copy so the caller's Series is not modified in place.
    cleaned = row.copy()
    # A positional slice clears the whole tail in one step and does not
    # depend on the index labels being unique or sliceable.
    cleaned.iloc[first_hit + 1:] = None
    return cleaned

# Columns that may be edited: every column except 'zz' and 'aa'.
editable_cols = df.columns[~df.columns.isin(['zz', 'aa'])]
# Run check_row over each row of that column subset and write the result back.
df[editable_cols] = df[editable_cols].apply(check_row, axis=1)

Result:

	bb	vv	ww	zz	aa
0	a			f	g
1	z	a		e	f
2	y	z	a	d	e

CodePudding user response：

Please see my answer below:

import pandas as pd
import numpy as np

# Sample input, keyed by column name; each list holds one value per row.
d = {
    "bb": ["a", "z", "y"],
    "vv": ["b", "a", "z"],
    "ww": ["c", "b", "a"],
    "xx": ["d", "c", "b"],
    "yy": ["e", "d", "c"],
    "zz": ["f", "e", "d"],
    "aa": ["g", "f", "e"],
}

df = pd.DataFrame(d)

def edit_rows(row, afterString):
    """Replace every value after the first ``afterString`` in a row with NaN.

    Args:
        row: A pandas Series holding one row, e.g. as passed by
            ``DataFrame.apply(..., axis=1)``.
        afterString: Marker value to look for.

    Returns:
        A copy of ``row`` in which every position after the first
        occurrence of ``afterString`` is ``np.nan``. If the marker does
        not occur, ``row`` is returned unchanged.
    """
    try:
        a_pos = row.to_list().index(afterString)
    except ValueError:
        # The marker is not present in this row at all.
        return row

    # Work on a copy so the caller's Series is not modified in place.
    edited = row.copy()
    # Use .iloc for positional access: plain ``row[int]`` on a Series whose
    # index holds string labels relies on deprecated positional fallback.
    edited.iloc[a_pos + 1:] = np.nan
    return edited
    

# Marker value handed to edit_rows for every row.
afterString = 'a'

# Only the columns positioned before "zz" are edited; "zz" and everything
# after it stay as they are.
zz_pos = df.columns.get_loc("zz")
df.iloc[:, :zz_pos] = df.iloc[:, :zz_pos].apply(edit_rows, axis=1, args=(afterString,))