Home > OS >  Pandas groupby with multiple conditions
Pandas groupby with multiple conditions

Time:12-23

I'm trying to create a summary of call logs. There are 4 cases

  1. There is only one call log record for a phone and it has outcome, we choose its values for duration, status and outcome_record
  2. Multiple call logs of same phone has outcome, we choose the summary, duration and outcome_record of call log with max duration
  3. There is only one call log record for a phone and it doesn't have outcome, we choose its values for duration and status. outcome_record will be None
  4. Multiple call logs of same phone doesn't have outcome, we choose the summary and duration of call log with max duration. outcome_record will be None

What I tried is looping on the groups. But it is terribly slow when dealing with huge amount of data. I think I need to use pandas methods instead of looping. How to use pandas methods to achieve the same, with multiple conditions. Thanks.

import pandas as pd
def get_summarized_call_logs_df(df):
    data_list = []
    phone_groups = df.groupby('phone')
    unique_phones = df.phone.unique()
    for ph in unique_phones:
        row_data = {"phone": ph}
        group = phone_groups.get_group(ph)
        group_len = len(group)
        if True in group['outcome'].to_list():
            outcome = group.loc[group['outcome'] == True]
            row_data.update({"has_outcome": True})
            if outcome.phone.count() == 1:
                # Cases where there is outcome for single calls
                row_data.update({"status": outcome.status.iloc[0],
                                 "duration": outcome.duration.iloc[0],
                                 "outcome_record": outcome.id.iloc[0]})
            else:
                # Cases where there is outcome for multiple calls
                # We choose the status and duration of outcome record with maximum duration
                out_rec = outcome.loc[outcome['duration'] == outcome['duration'].max()]
                row_data.update({"status": out_rec.status.iloc[0],
                                 "duration": out_rec.duration.iloc[0],
                                 "outcome_record": out_rec.id.iloc[0]})
        else:
            row_data.update({"has_outcome": False, "outcome_record": None})
            if group_len == 1:
                # Cases where there is no outcome for single calls
                row_data.update({"status": group.status.iloc[0], "duration": group.duration.iloc[0]})
            else:
                # Cases where there is no outcome for multiple calls
                # We choose the status and duration of the record with maximum duration
                row_data.update({"status": group.loc[group['duration'] == group['duration'].max()].status.iloc[0],
                                "duration": group.loc[group['duration'] == group['duration'].max()].duration.iloc[0]})
        data_list.append(row_data)
    new_df = pd.DataFrame(data_list)
    return new_df

if __name__ == "__main__":
    data = [
    {"id": 1, "phone": "123", "outcome": True, "status": "sale", "duration": 1550},
    {"id": 2, "phone": "123", "outcome": False, "status": "failed", "duration": 3},
    {"id": 3, "phone": "123", "outcome": False, "status": "no_ring", "duration": 5},
    {"id": 4, "phone": "456", "outcome": True, "status": "call_back", "duration": 550},
    {"id": 5, "phone": "456", "outcome": True, "status": "sale", "duration": 2500},
    {"id": 6, "phone": "456", "outcome": False, "status": "no_ring", "duration": 5},
    {"id": 7, "phone": "789", "outcome": False, "status": "no_pick", "duration": 4},
    {"id": 8, "phone": "741", "outcome": False, "status": "try_again", "duration": 25},
    {"id": 9, "phone": "741", "outcome": False, "status": "try_again", "duration": 10},
    {"id": 10, "phone": "741", "outcome": False, "status": "no_ring", "duration": 5},
    ]
    df = pd.DataFrame(data)
    new_df = get_summarized_call_logs_df(df)
    print(new_df)

It should produce an output

  phone  has_outcome     status  duration  outcome_record
0   123         True       sale      1550             1.0
1   456         True       sale      2500             5.0
2   789        False    no_pick         4             NaN
3   741        False  try_again        25             NaN

CodePudding user response:

I think you can simplify the logic. If you sort your values mainly by 'outcome' and 'duration', you just have to drop duplicates and keep the last row of each sorted groups like this:

cols = ['phone', 'outcome', 'duration']
new_df = df.sort_values(cols).drop_duplicates('phone', keep='last')
print(new_df)

# Output:
   id phone  outcome     status  duration
0   1   123     True       sale      1550
4   5   456     True       sale      2500
7   8   741    False  try_again        25
6   7   789    False    no_pick         4

CodePudding user response:

Just to give an alternative option, based on convtools:

from convtools import conversion as c

# fmt: off
data = [
    {"id": 1, "phone": "123", "outcome": True, "status": "sale", "duration": 1550},
    {"id": 2, "phone": "123", "outcome": False, "status": "failed", "duration": 3},
    {"id": 3, "phone": "123", "outcome": False, "status": "no_ring", "duration": 5},
    {"id": 4, "phone": "456", "outcome": True, "status": "call_back", "duration": 550},
    {"id": 5, "phone": "456", "outcome": True, "status": "sale", "duration": 2500},
    {"id": 6, "phone": "456", "outcome": False, "status": "no_ring", "duration": 5},
    {"id": 7, "phone": "789", "outcome": False, "status": "no_pick", "duration": 4},
    {"id": 8, "phone": "741", "outcome": False, "status": "try_again", "duration": 25},
    {"id": 9, "phone": "741", "outcome": False, "status": "try_again", "duration": 10},
    {"id": 10, "phone": "741", "outcome": False, "status": "no_ring", "duration": 5},
]
# fmt: on

# you are interested in rows with max duration
max_duration_call_log = c.ReduceFuncs.MaxRow(c.item("duration"))

# you need to know whether there's been an outcome
has_outcome = c.ReduceFuncs.Count(where=c.item("outcome")) > 0

converter = (
    c.group_by(c.item("phone"))
    .aggregate(
        {
            "phone": c.item("phone"),
            "has_outcome": has_outcome,
            "status": max_duration_call_log.item("status"),
            "duration": max_duration_call_log.item("duration"),
            "outcome_record": c.if_(
                has_outcome,
                max_duration_call_log.item("id"),
                None,
            ),
        }
    )
    # this step generates and compiles ad hoc function
    .gen_converter()
)

# fmt: off
assert converter(data) == [
    {'phone': '123', 'has_outcome': True, 'status': 'sale', 'duration': 1550, 'outcome_record': 1},
    {'phone': '456', 'has_outcome': True, 'status': 'sale', 'duration': 2500, 'outcome_record': 5},
    {'phone': '789', 'has_outcome': False, 'status': 'no_pick', 'duration': 4, 'outcome_record': None},
    {'phone': '741', 'has_outcome': False, 'status': 'try_again', 'duration': 25, 'outcome_record': None},
]
# fmt: on

  • Related