I'm trying to create a summary of call logs. There are 4 cases
- There is only one call log record for a phone and it has outcome, we choose its values for duration, status and outcome_record
- Multiple call logs of same phone has outcome, we choose the summary, duration and outcome_record of call log with max duration
- There is only one call log record for a phone and it doesn't have outcome, we choose its values for duration and status. outcome_record will be None
- Multiple call logs of same phone doesn't have outcome, we choose the summary and duration of call log with max duration. outcome_record will be None
What I tried is looping on the groups. But it is terribly slow when dealing with huge amount of data. I think I need to use pandas methods instead of looping. How to use pandas methods to achieve the same, with multiple conditions. Thanks.
import pandas as pd
def get_summarized_call_logs_df(df):
data_list = []
phone_groups = df.groupby('phone')
unique_phones = df.phone.unique()
for ph in unique_phones:
row_data = {"phone": ph}
group = phone_groups.get_group(ph)
group_len = len(group)
if True in group['outcome'].to_list():
outcome = group.loc[group['outcome'] == True]
row_data.update({"has_outcome": True})
if outcome.phone.count() == 1:
# Cases where there is outcome for single calls
row_data.update({"status": outcome.status.iloc[0],
"duration": outcome.duration.iloc[0],
"outcome_record": outcome.id.iloc[0]})
else:
# Cases where there is outcome for multiple calls
# We choose the status and duration of outcome record with maximum duration
out_rec = outcome.loc[outcome['duration'] == outcome['duration'].max()]
row_data.update({"status": out_rec.status.iloc[0],
"duration": out_rec.duration.iloc[0],
"outcome_record": out_rec.id.iloc[0]})
else:
row_data.update({"has_outcome": False, "outcome_record": None})
if group_len == 1:
# Cases where there is no outcome for single calls
row_data.update({"status": group.status.iloc[0], "duration": group.duration.iloc[0]})
else:
# Cases where there is no outcome for multiple calls
# We choose the status and duration of the record with maximum duration
row_data.update({"status": group.loc[group['duration'] == group['duration'].max()].status.iloc[0],
"duration": group.loc[group['duration'] == group['duration'].max()].duration.iloc[0]})
data_list.append(row_data)
new_df = pd.DataFrame(data_list)
return new_df
if __name__ == "__main__":
data = [
{"id": 1, "phone": "123", "outcome": True, "status": "sale", "duration": 1550},
{"id": 2, "phone": "123", "outcome": False, "status": "failed", "duration": 3},
{"id": 3, "phone": "123", "outcome": False, "status": "no_ring", "duration": 5},
{"id": 4, "phone": "456", "outcome": True, "status": "call_back", "duration": 550},
{"id": 5, "phone": "456", "outcome": True, "status": "sale", "duration": 2500},
{"id": 6, "phone": "456", "outcome": False, "status": "no_ring", "duration": 5},
{"id": 7, "phone": "789", "outcome": False, "status": "no_pick", "duration": 4},
{"id": 8, "phone": "741", "outcome": False, "status": "try_again", "duration": 25},
{"id": 9, "phone": "741", "outcome": False, "status": "try_again", "duration": 10},
{"id": 10, "phone": "741", "outcome": False, "status": "no_ring", "duration": 5},
]
df = pd.DataFrame(data)
new_df = get_summarized_call_logs_df(df)
print(new_df)
It should produce an output
phone has_outcome status duration outcome_record
0 123 True sale 1550 1.0
1 456 True sale 2500 5.0
2 789 False no_pick 4 NaN
3 741 False try_again 25 NaN
CodePudding user response:
I think you can simplify the logic. If you sort your values mainly by 'outcome' and 'duration', you just have to drop duplicates and keep the last row of each sorted groups like this:
cols = ['phone', 'outcome', 'duration']
new_df = df.sort_values(cols).drop_duplicates('phone', keep='last')
print(new_df)
# Output:
id phone outcome status duration
0 1 123 True sale 1550
4 5 456 True sale 2500
7 8 741 False try_again 25
6 7 789 False no_pick 4
CodePudding user response:
Just to give an alternative option, based on convtools:
from convtools import conversion as c
# fmt: off
data = [
{"id": 1, "phone": "123", "outcome": True, "status": "sale", "duration": 1550},
{"id": 2, "phone": "123", "outcome": False, "status": "failed", "duration": 3},
{"id": 3, "phone": "123", "outcome": False, "status": "no_ring", "duration": 5},
{"id": 4, "phone": "456", "outcome": True, "status": "call_back", "duration": 550},
{"id": 5, "phone": "456", "outcome": True, "status": "sale", "duration": 2500},
{"id": 6, "phone": "456", "outcome": False, "status": "no_ring", "duration": 5},
{"id": 7, "phone": "789", "outcome": False, "status": "no_pick", "duration": 4},
{"id": 8, "phone": "741", "outcome": False, "status": "try_again", "duration": 25},
{"id": 9, "phone": "741", "outcome": False, "status": "try_again", "duration": 10},
{"id": 10, "phone": "741", "outcome": False, "status": "no_ring", "duration": 5},
]
# fmt: on
# you are interested in rows with max duration
max_duration_call_log = c.ReduceFuncs.MaxRow(c.item("duration"))
# you need to know whether there's been an outcome
has_outcome = c.ReduceFuncs.Count(where=c.item("outcome")) > 0
converter = (
c.group_by(c.item("phone"))
.aggregate(
{
"phone": c.item("phone"),
"has_outcome": has_outcome,
"status": max_duration_call_log.item("status"),
"duration": max_duration_call_log.item("duration"),
"outcome_record": c.if_(
has_outcome,
max_duration_call_log.item("id"),
None,
),
}
)
# this step generates and compiles ad hoc function
.gen_converter()
)
# fmt: off
assert converter(data) == [
{'phone': '123', 'has_outcome': True, 'status': 'sale', 'duration': 1550, 'outcome_record': 1},
{'phone': '456', 'has_outcome': True, 'status': 'sale', 'duration': 2500, 'outcome_record': 5},
{'phone': '789', 'has_outcome': False, 'status': 'no_pick', 'duration': 4, 'outcome_record': None},
{'phone': '741', 'has_outcome': False, 'status': 'try_again', 'duration': 25, 'outcome_record': None},
]
# fmt: on