Assign a common random value per group in pandas-CodePudding

I have a df:

customer_id     order_id    product_id  refund  price   
2QUKPQ3GB1      OE5Q7M1     WFL2ILKU3Z  False   692    
ISGF2N3OQL      I8E6PGG     WFL2ILKU3Z  False   668    
ISGF2N3OQL      I8E6PGG     WFL2ILKU3Z  True    541

I have 2 functions that return a random positive or negative comment with the following structure:

def random_negative_sentence():
    """Return one randomly assembled negative review sentence.

    Every fragment is drawn independently with ``random.choice`` from the
    word lists ``products``, ``negative``, ``colors``, ``person``,
    ``negative_verbs`` and ``l_neg`` (defined elsewhere, omitted here).
    One of four sentence templates is then picked at random and returned
    as a ``str``.
    """
    # omitting the list of random words

    product = random.choice(products)
    adj = random.choice(negative)
    color = random.choice(colors)
    who = random.choice(person)
    verb = random.choice(negative_verbs)
    end = random.choice(l_neg)

    # The closing remark is appended with an explicit "+"; an f-string
    # directly followed by a bare name is a syntax error.
    sentence1 = f'{who} {verb} the {color} {adj} {product}. {end}.'
    sentence2 = f'The {color} {product} is so {adj} that even {who} {verb} it! ' + end
    sentence3 = f'{who} {verb} the {color} {product} because it is so {adj}! ' + end
    # sentence4 draws a second, independent adjective from the same list
    sentence4 = f'{who} {verb} the {color} {product} because it is {adj} and {random.choice(negative)}! ' + end

    return random.choice([sentence1, sentence2, sentence3, sentence4])

# Example call: the sample output below is a negative review, so it is
# produced by the negative generator shown above.
random_negative_sentence()
# my kids hate the blue rocking char because it is hard to assemble and clumsy! That was the last time I used this service

I am now trying to assign a random review to my df depending on whether the order was refunded. I am able to do this using df['review'] = df['refund'].apply(lambda x: random_positive_sentence() if x == False else random_negative_sentence())

But then I realised that the reviews for the same product_id mention different products. I still want everything else to be chosen randomly depending on the refund column, but the product should be the same for every row that shares a product_id, so that all of the randomly generated reviews for one product_id are about the same product while the reviews themselves still differ.

How can I assign reviews depending on the refund column per groups of product_id? So that the end result would look like this:

customer_id     order_id    product_id  refund  price   review
2QUKPQ3GB1      OE5Q7M1     WFL2ILKU3Z  False   692    'I love the zaffre sturdy rocking char. Will recommend to my coworkers'
ISGF2N3OQL      I8E6PGG     WFL2ILKU3Z  False   668    'my grandmom appreciate the amber rocking char because it is reliable and easy to move! Will recommend to my family.'
ISGF2N3OQL      I8E6PGG     WFL2ILKU3Z  True    541    'my grandpa abhor the yellow hard to assemble rocking char. I will never use this service again.'

CodePudding user response：

Can you pass the id to the function? You might have to do something with product_id for indexing products, if the product names in products are made up (e.g. use rank to create a dummy product_id)

def random_negative_sentence(prod_id_idx):
    """Return a random negative review about one specific product.

    Args:
        prod_id_idx: 0-based position of the product in the ``products``
            list. Passing the same index for every row of a product_id
            keeps all of that product's reviews about the same item.

    Returns:
        One of four sentence templates, filled with fragments drawn with
        ``random.choice`` from ``negative``, ``colors``, ``person``,
        ``negative_verbs`` and ``l_neg`` (defined elsewhere, omitted here).

    Raises:
        IndexError: if ``prod_id_idx`` is outside ``products``.
    """
    # omitting the list of random words

    product = products[prod_id_idx]
    adj = random.choice(negative)
    color = random.choice(colors)
    who = random.choice(person)
    verb = random.choice(negative_verbs)
    end = random.choice(l_neg)

    # The closing remark is appended with an explicit "+"; an f-string
    # directly followed by a bare name is a syntax error.
    sentence1 = f'{who} {verb} the {color} {adj} {product}. {end}.'
    sentence2 = f'The {color} {product} is so {adj} that even {who} {verb} it! ' + end
    sentence3 = f'{who} {verb} the {color} {product} because it is so {adj}! ' + end
    # sentence4 draws a second, independent adjective from the same list
    sentence4 = f'{who} {verb} the {color} {product} because it is {adj} and {random.choice(negative)}! ' + end

    return random.choice([sentence1, sentence2, sentence3, sentence4])

# Give every distinct product_id a 0-based position that can index `products`
# (a dense rank starts at 1, hence the "- 1"). `products` must hold at least
# as many entries as there are distinct product_ids.
# axis=1 hands each *row* to the lambda; the default axis=0 would pass whole
# columns, on which x['refund'] does not exist.
df['review'] = (df
    .assign(prod_id_idx = lambda x: x['product_id'].rank(method="dense").astype(int) - 1)
    .apply(
        lambda x: random_negative_sentence(x['prod_id_idx'])
        if x['refund']
        else random_positive_sentence(x['prod_id_idx']),
        axis=1,
    )
)

CodePudding user response：

I'd recommend using numpy's random module here, as you already have numpy installed if you're using pandas, and it offers the ability to generate large vectors of random values and additional options such as sampling without replacement for a large set of values.

For this problem, I'd first use np.random.Generator.choice with replace=False to assign product names to each row in your dataframe. Then, you can construct random messages for each using the name:

rng = np.random.default_rng()

# Draw one product name per *distinct* product_id and broadcast it to the rows,
# so every review of the same product_id talks about the same product.
# replace=False keeps the names distinct across product_ids; it requires
# len(products) >= number of distinct product_ids.
unique_ids = df['product_id'].unique()
name_by_id = dict(zip(unique_ids, rng.choice(products, size=len(unique_ids), replace=False)))
df['product_name'] = df['product_id'].map(name_by_id)

Now, rewrite your function to work on a row of your dataframe (a series) rather than independent of your data:

def random_negative_sentence(row):
    """Return a random negative review about the product named in ``row``.

    Args:
        row: one row of the dataframe (a Series, or any mapping) whose
            ``'product_name'`` entry holds the product to write about.

    Returns:
        One of four sentence templates, filled with fragments drawn with
        ``random.choice`` from ``negative``, ``colors``, ``person``,
        ``negative_verbs`` and ``l_neg`` (defined elsewhere, omitted here).
    """
    # omitting the list of random words

    # Use item access on the 'product_name' column: attribute access via
    # `row.product` would return the Series.product *method*, not a value.
    product = row['product_name']
    adj = random.choice(negative)
    color = random.choice(colors)
    who = random.choice(person)
    verb = random.choice(negative_verbs)
    end = random.choice(l_neg)

    # The closing remark is appended with an explicit "+"; an f-string
    # directly followed by a bare name is a syntax error.
    sentence1 = f'{who} {verb} the {color} {adj} {product}. {end}.'
    sentence2 = f'The {color} {product} is so {adj} that even {who} {verb} it! ' + end
    sentence3 = f'{who} {verb} the {color} {product} because it is so {adj}! ' + end
    # sentence4 draws a second, independent adjective from the same list
    sentence4 = f'{who} {verb} the {color} {product} because it is {adj} and {random.choice(negative)}! ' + end

    return random.choice([sentence1, sentence2, sentence3, sentence4])

Now, you can apply this to your frame:

# axis=1 calls the function once per row; axis=0 would pass each column instead.
df["review"] = df.apply(random_negative_sentence, axis=1)

You could also have a wrapper function which dispatches based on negative or positive since you'll have row.refund to work with. Since you don't provide random_positive_sentence I'll let you work that out but it should be very straightforward.

Additionally, you could refactor this further to use vectorized string operations and do this entirely in numpy and pandas:

# One product name per distinct product_id, broadcast to the rows, so that all
# reviews of a product_id are about the same product. replace=False keeps the
# names distinct and requires len(products) >= number of distinct product_ids.
unique_ids = df["product_id"].unique()
product_by_id = dict(zip(unique_ids, rng.choice(products, size=len(unique_ids), replace=False)))
product = df["product_id"].map(product_by_id)

# Everything else is drawn independently for every row.
adj = rng.choice(negative, size=len(df))
color = rng.choice(colors, size=len(df))
who = rng.choice(person, size=len(df))
verb = rng.choice(negative_verbs, size=len(df))
end = rng.choice(l_neg, size=len(df))
# second adjective for sentence4; kept under its own name so the `negative`
# word list is not overwritten
extra_adj = rng.choice(negative, size=len(df))

# convert to pd.Series with an index matching df so you can add these together.
# Also ensure "object" data type so the strings can be expanded
product = pd.Series(product, index=df.index).astype("O")
adj = pd.Series(adj, index=df.index).astype("O")
color = pd.Series(color, index=df.index).astype("O")
who = pd.Series(who, index=df.index).astype("O")
verb = pd.Series(verb, index=df.index).astype("O")
end = pd.Series(end, index=df.index).astype("O")
extra_adj = pd.Series(extra_adj, index=df.index).astype("O")

# element-wise string concatenation: one candidate sentence per row and template
negative_sentences = pd.DataFrame({
    "sentence1": (
        who + " " + verb + " the " + color + " " + adj
        + " " + product + ". " + end + "."
    ),
    "sentence2": (
        "The " + color + " " + product + " is so " + adj
        + " that even " + who + " " + verb + " it! " + end
    ),
    "sentence3": (
        who + " " + verb + " the " + color + " " + product
        + " because it is so " + adj + "! " + end
    ),
    "sentence4": (
        who + " " + verb + " the " + color + " " + product
        + " because it is " + adj + " and " + extra_adj + "! " + end
    ),
})

# randomly choose one of these sentences for every row: pick a column position
# per row (Generator.integers excludes the upper bound, so every template is
# reachable) and pull out the matching cell with positional indexing
chosen = rng.integers(0, negative_sentences.shape[1], size=len(df))
df["review"] = negative_sentences.to_numpy()[np.arange(len(df)), chosen]

Again, choosing from positive vs. negative sentences could be something simple, e.g.:

# NOTE: `positive_sentences` is assumed to be a frame built the same way as
# `negative_sentences`, one column per positive template.
row_pos = np.arange(len(df))

# pick one sentence column per row from each frame
# (Generator.integers excludes the upper bound, so every column is reachable)
negative_review = pd.Series(
    negative_sentences.to_numpy()[
        row_pos, rng.integers(0, negative_sentences.shape[1], size=len(df))
    ],
    index=df.index,
)
positive_review = pd.Series(
    positive_sentences.to_numpy()[
        row_pos, rng.integers(0, positive_sentences.shape[1], size=len(df))
    ],
    index=df.index,
)

# keep the negative review where refund is True, otherwise use the positive one
df["review"] = negative_review.where(df["refund"], positive_review)