how to count between A and B in same columns dataframe (pandas)



count "CUT" in m/c code column between "STD" and "STD" which are repeated in many time in columns

(screenshot of the sample data attached to the original post)
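The screenshot is not reproduced here, so the data presumably looks something like this (the column name comes from the answers below; the values are assumed for illustration):

import pandas as pd

# Hypothetical sample: "STD" rows delimit the intervals and the "CUT" rows
# inside each interval are the ones to be counted.
df = pd.DataFrame(
    {"m/c code": ["STD", "CUT", "BL1", "CUT", "STD", "ALS", "CUT", "STD"]}
)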

CodePudding user response:

There are obviously many different ways.

One solution is this:

import pandas as pd

def c_counter(df, A, B, C):
    # Row positions where an interval opens (value == A) and closes (value == B).
    starts = [i for i, x in enumerate(df['m/c code']) if x == A]
    ends = [i for i, x in enumerate(df['m/c code']) if x == B]

    df['Count'] = ''
    for start, end in zip(starts, ends):
        # Count the C values inside each interval and write that count onto
        # every row of the interval (the closing row is left blank).
        df.loc[start:end - 1, 'Count'] = (df['m/c code'].iloc[start:end] == C).sum()

    return df
    
df = pd.DataFrame({'m/c code': ['A', 'X', 'C', 'X', 'C', 'X', 'B', 'X', 'A', 'C', 'X', 'B']})
A = 'A'
B = 'B'
C = 'C'

c_counter(df, A, B, C)

Out: 
   m/c code Count
0         A     2
1         X     2
2         C     2
3         X     2
4         C     2
5         X     2
6         B      
7         X      
8         A     1
9         C     1
10        X     1
11        B 
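Note that when the opening and closing markers are the same value, as in the question ("STD" ... "STD"), pairing the i-th start with the i-th end no longer works, because both lists are identical. Here is a minimal sketch of one way to adapt the same idea, assuming consecutive 'STD' rows delimit each interval:

import pandas as pd

df = pd.DataFrame(
    {'m/c code': ['STD', 'CUT', 'X', 'CUT', 'STD', 'X', 'CUT', 'STD']}
)

# Positions of every 'STD' row; each pair of consecutive positions bounds one interval.
stds = [i for i, x in enumerate(df['m/c code']) if x == 'STD']

df['Count'] = ''
for start, end in zip(stds[:-1], stds[1:]):
    # Count the 'CUT' rows inside the interval and write the count onto its rows.
    df.loc[start:end - 1, 'Count'] = (df['m/c code'].iloc[start:end] == 'CUT').sum()

print(df)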

Next time, please make sure to include sample code.

CodePudding user response:

Another solution to your problem is to create an auxiliary column that labels each interval. Then you can apply a groupby together with the transform method to perform the counting. Here's the code:

from __future__ import annotations
import pandas as pd
import numpy as np


# == Helper functions (Not part of the actual solution) =======================
# You can ignore these functions, as they are not actually part of the
# solution; they only generate some data to test the implementation.
def random_dates(
    start_date: str | pd.Timestamp,
    end_date: str | pd.Timestamp,
    size: int = 10,
) -> pd.DatetimeIndex:
    """Generate random dates between two dates.

    Parameters
    ----------
    start_date : str | pd.Timestamp
        Start date.
    end_date : str | pd.Timestamp
        End date.
    size : int, optional
        Number of dates to generate, by default 10.

    Returns
    -------
    pd.DatetimeIndex
        Random dates.

    Examples
    --------
    >>> random_dates("2020-01-01", "2020-01-31", size=5) # doctest: +ELLIPSIS
    DatetimeIndex(['2020-01-05', '2020-01-12', ...], dtype='datetime64[ns]', freq=None)
    """
    start_u = pd.to_datetime(start_date).value // 10**9
    end_u = pd.to_datetime(end_date).value // 10**9
    return pd.to_datetime(np.random.randint(start_u, end_u, size), unit="s")


def generate_random_frame(
    start_date: str | pd.Timestamp,
    end_date: str | pd.Timestamp,
    size: int = 10,
) -> pd.DataFrame:
    """
    Generate a DataFrame to test the solution.

    Parameters
    ----------
    start_date : str | pd.Timestamp
        Start date. Must be a string representing a date, like "YYYY-MM-DD",
        or "YYYY-MM-DD HH:MM:SS". Optionally, can also be a pandas Timestamp
        object.
    end_date : str | pd.Timestamp
        End date. Must be a string representing a date, like "YYYY-MM-DD",
        or "YYYY-MM-DD HH:MM:SS". Optionally, can also be a pandas Timestamp.
    size : int, default 10
        Number of rows to generate.

    Returns
    -------
    pd.DataFrame
        DataFrame with random dates and random values. The resulting DataFrame
        has the following columns:

            - "Time": random datetimes between `start_date` and `end_date`.
            - "m/c code": random strings from a set of 7 possible values:
              "END", "CUT", "STD", "BL1", "ALS", "ST1", or "CLN".
    """

    mc_code_choices = ["END", "CUT", "STD", "BL1", "ALS", "ST1", "CLN"]
    return pd.DataFrame(
        {
            "Time": random_dates(start_date, end_date, size),
            "m/c code": np.random.choice(mc_code_choices, size),
        }
    )


# == Solution ==================================================================
def flag_groups_and_count(
    df: pd.DataFrame,
    group_colname: str = "m/c code",
    lowbound_value: str = "END",
    upbound_value: str = "STD",
    value_to_count: str = "CUT",
    count_colname: str = "Count",
    flag_colname: str = "FLAG",
) -> pd.DataFrame:
    """
    Flag groups and count the number of times a specified value appears on each group.

    Groups are defined by values between `lowbound_value` and `upbound_value`
    in the column `group_colname`. The first group is flagged as 1, and
    subsequent groups are flagged as 2, 3, and so on. Rows flagged as 0 do not
    belong to any group.

    After flagging the groups, the function counts how many times the value
    specified by the `value_to_count` parameter appears in each group.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to flag.
    group_colname : str, default "m/c code"
        Column name to group by.
    lowbound_value : str, default "END"
        Value to start each group.
    upbound_value : str, default "STD"
        Value to end each group.
    value_to_count : str, default "CUT"
        Value to count inside the groupby.
    count_colname : str, default "Count"
        Name of the column to store the counts.
    flag_colname : str, default "FLAG"
        Name of the column to store each group.

    Returns
    -------
    pd.DataFrame
        Original DataFrame with the added flag column.
    """

    # Set the initial parameters, used to control the creation of the groups.
    current_group = 1  # The current group number.
    flag_row = False   # Indicates whether the current row should be flagged.
    
    # Create the column that stores the group numbers.
    # Set all values initially to 0
    df[flag_colname] = 0

    # Iterate over each row of the dataframe.
    # - index: index of each row. Same values you find by calling df.index
    # - row: a pandas Series object with the values of each row.
    for index, row in df.iterrows():
        # If the current row's 'm/c code' value equals 'END', set the
        # flag_row variable to True to indicate that the following rows
        # belong to `current_group` until a row whose 'm/c code' value
        # equals 'STD' is found.
        if row[group_colname] == lowbound_value:
            flag_row = True
        # Does this row belong to a group? If so, set it to `current_group`.
        if flag_row:
            df.loc[df.index.isin([index]), flag_colname] = current_group
        # If the current row has a 'm/c code' value equal to 'STD',
        # then we reached the end of a group. Set the flag_row variable
        # to False indicating that the next rows should not be flagged to a
        # group.
        if row[group_colname] == upbound_value:
            # Did we reach the end of a group, or simply found another value
            # equal to "STD" before the next interval starts?
            # This is to avoid incrementing the group number when in fact we didn't
            # reach a new interval.
            if flag_row:
                current_group += 1
            flag_row = False

    # Groupby 'm/c code' column values by the newly created flag column.
    # Inside this groupby, use the `transform` method to count the number of
    # times the value "CUT" appears inside each group.
    # Store the count in a new column called "Count".
    df[count_colname] = df.groupby(flag_colname, as_index=False)[
        group_colname
    ].transform(lambda group: (group == value_to_count).sum())
    # Same as:
    # df["Count"] = df.groupby("FLAG", as_index=False)[
    #     "m/c code"
    # ].transform(lambda group: (group == "CUT").sum())

    # When the flag column is equal to 0, it means that there's no interval.
    # Therefore, set such counts to 0. Intervals represent the rows with 
    # values for the 'm/c code' column between adjacent "END" and "STD" values.
    df.loc[df[flag_colname] == 0, count_colname] = 0
    # Same as: df.loc[df["FLAG"] == 0, "Count"] = 0
    
    return df


# == Test our code =============================================================

# Parameters to use for generating test DataFrame:
start_date = "2021-07-01 00:00:00"
end_date = "2021-07-03 00:00:00"

# Generate test DataFrame
test_df = generate_random_frame(start_date, end_date, size=30)

# Call the function that defines the implementation to the problem.
test_df = test_df.pipe(flag_groups_and_count)
test_df


Here's a screenshot of the output:

(output screenshot attached to the original post)
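For reference, the auxiliary flag column described above can also be built without an explicit loop over the rows. This is only a minimal sketch of the same idea, assuming (as in the question) that consecutive "STD" rows delimit the intervals:

import pandas as pd

df = pd.DataFrame(
    {"m/c code": ["STD", "CUT", "BL1", "CUT", "STD", "ALS", "CUT", "STD"]}
)

# Every "STD" row starts a new interval, so the cumulative sum of the boolean
# mask gives each row the number of the interval it belongs to.
df["FLAG"] = (df["m/c code"] == "STD").cumsum()

# Count the "CUT" rows per interval and broadcast the count back onto each row.
df["Count"] = df.groupby("FLAG")["m/c code"].transform(lambda g: (g == "CUT").sum())
print(df)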
