how to count between A and B in same columns dataframe (pandas)
count "CUT" in m/c code column between "STD" and "STD" which are repeated in many time in columns
see below image attached
CodePudding user response:
There are obv many different ways.
One solution is this:
import pandas as pd
def c_counter(df, A, B, C):
starts = [i for i, x in enumerate(df['m/c code']) if x == 'A']
ends = [i for i, x in enumerate(df['m/c code']) if x == 'B']
df['Count'] = ''
for start, end in zip(starts, ends):
df['Count'][start:end] = sum(df['m/c code'][start:end] == 'C')
return(df)
df = pd.DataFrame({'m/c code': ['A', 'X', 'C', 'X', 'C', 'X', 'B', 'X', 'A', 'C', 'X', 'B']})
A = 'A'
B = 'B'
C = 'C'
c_counter(df, A, B, C)
Out:
m/c code Count
0 A 2
1 X 2
2 C 2
3 X 2
4 C 2
5 X 2
6 B
7 X
8 A 1
9 C 1
10 X 1
11 B
Next time, please make sure to include sample code.
CodePudding user response:
Another solution to your problem would be to create an auxiliary column that labels each interval. Then you can apply a groupby
alongside the transform method, to perform the counting. Here's the code:
from __future__ import annotations
import pandas as pd
import numpy as np
# == Helper functions (Not part of the actual solution) =======================
# You can ignore these functions, as they don't actually are part of the
# solution, but rather a way to generate some data to test the implementation.
def random_dates(
start_date: str | pd.Timestamp,
end_date: str | pd.Timestamp,
size: int = 10,
) -> pd.DatetimeIndex:
"""Generate random dates between two dates.
Parameters
----------
start_date : str | pd.Timestamp
Start date.
end_date : str | pd.Timestamp
End date.
size : int, optional
Number of dates to generate, by default 10.
Returns
-------
pd.DatetimeIndex
Random dates.
Examples
--------
>>> random_dates("2020-01-01", "2020-01-31", size=5) # doctest: ELLIPSIS
DatetimeIndex(['2020-01-05', '2020-01-12', ...], dtype='datetime64[ns]', freq=None)
"""
start_u = pd.to_datetime(start_date).value // 10**9
end_u = pd.to_datetime(end_date).value // 10**9
return pd.to_datetime(np.random.randint(start_u, end_u, size), unit="s")
def generate_random_frame(
start_date: str | pd.Timestamp,
end_date: str | pd.Timestamp,
size: int = 10,
) -> pd.DataFrame:
"""
Generate a DataFrame to test the solution.
Parameters
----------
start_date : str | pd.Timestamp
Start date. Must be a string representing a date, like "YYYY-MM-DD",
or "YYYY-MM-DD HH:MM:SS". Optionally, can also be a pandas Timestamp
object.
end_date : str | pd.Timestamp
End date. Must be a string representing a date, like "YYYY-MM-DD",
or "YYYY-MM-DD HH:MM:SS". Optionally, can also be a pandas Timestamp.
size : int, default 10
Number of rows to generate.
Returns
-------
pd.DataFrame
DataFrame with random dates and random values. The resulting DataFrame
has the following columns:
- "Time": random datetimes between `start_date` and `end_date`.
- "m/c code": random strings from a set of 7 possible values:
"END", "CUT", "STD", "BL1", "ALS", "ST1", or "CLN".
"""
mc_code_choices = ["END", "CUT", "STD", "BL1", "ALS", "ST1", "CLN"]
return pd.DataFrame(
{
"Time": random_dates(start_date, end_date, size),
"m/c code": np.random.choice(mc_code_choices, size),
}
)
# == Solution ==================================================================
def flag_groups_and_count(
df: pd.DataFrame,
group_colname: str = "m/c code",
lowbound_value: str = "END",
upbound_value: str = "STD",
value_to_count: str = "CUT",
count_colname: str = "Count",
flag_colname: str = "FLAG",
) -> pd.DataFrame:
"""
Flag groups and count the number of times a specified value appears on each group.
Groups are defined by values between `lowbound_value` and `upbound_value`
in the column `group_colname`. The flag is set to 1 for the first group.
Subsequent groups are flagged as 2, 3, etc. Groups are flagged as 0
represent the absence of a group.
After flagging the groups, function counts the number of times
the value specified by the `value_to_count` parameter appears.
Parameters
----------
df : pd.DataFrame
DataFrame to flag.
group_colname : str, default "m/c code"
Column name to group by.
lowbound_value : str, default "END"
Value to start each group.
upbound_value : str, default "STD"
Value to end each group.
value_to_count : str, default "CUT"
Value to count inside the groupby.
count_colname : str, default "Count"
Name of the column to store the counts.
flag_colname : str, default "FLAG"
Name of the column to store each group.
Returns
-------
pd.DataFrame
Original DataFrame with the added flag column.
"""
# Set the initial parameters, used to control the creation of the groups.
current_group = 1 # The current group number.
flag_row = False # Indicates whether the current row should be flagged.
# Create the column that stores the group numbers.
# Set all values initially to 0
df[flag_colname] = 0
# Iterate over each row of the dataframe.
# - index: index of each row. Same values you find by calling df.index
# - row: a pandas Series object with the values of each row.
for index, row in df.iterrows():
# If the current row has a 'm/c code' value equal to 'END',
# then set the flag_row variable to True to indicate that
# the next rows should be set to `current_group` untill
# it finds a row with 'm/c code' value that equals to 'STD'.
if row[group_colname] == lowbound_value:
flag_row = True
# Does this row belong to a group? If so, set it to `current_group`.
if flag_row:
df.loc[df.index.isin([index]), flag_colname] = current_group
# If the current row has a 'm/c code' value equal to 'STD',
# then we reached the end of a group. Set the flag_row variable
# to False indicating that the next rows should not be flagged to a
# group.
if row[group_colname] == upbound_value:
# Did we reach the end of a group, or simply found another value
# equal to "STD" before the next interval starts?
# This is to avoid incrementing the group number when in fact we didn't
# reach a new interval.
if flag_row:
current_group = 1
flag_row = False
# Groupby 'm/c code' column values by the newly created flag column.
# Inside this groupby, use the `transform` method to count the number of
# times the value "CUT" appears inside each group.
# Store the count in a new column called "Count".
df[count_colname] = df.groupby(flag_colname, as_index=False)[
group_colname
].transform(lambda group: (group == value_to_count).sum())
# Same as:
# df["Count"] = df.groupby("FLAG", as_index=False)[
# "m/c code"
# ].transform(lambda group: (group == "CUT").sum())
# When the flag column is equal to 0, it means that there's no interval.
# Therefore, set such counts to 0. Intervals represent the rows with
# values for the 'm/c code' column between adjacent "END" and "STD" values.
df.loc[test_df[flag_colname] == 0, count_colname] = 0
# Same as: test_df.loc[test_df["FLAG"] == 0, "Count"] = 0
return df
# == Test our code =============================================================
# Parameters to use for generating test DataFrame:
start_date = "2021-07-01 00:00:00"
end_date = "2021-07-03 00:00:00"
# Generate test DataFrame
test_df = generate_random_frame(start_date, end_date, size=30)
# Call the function that defines the implementation to the problem.
test_df = test_df.pipe(flag_groups_and_count)
test_df
Here's a screenshot of the output: