Pandas Similar rows Search-CodePudding

How would I filter data on multiple criteria through the spreadsheet using python(pandas)?

I am trying to filter transactions with all Curr1=USD, where Trade Time within 1 minute, Have the same Notional 1 and have the Price within .5% spread between transactions. Then the row with the furthest(highest) Maturity would be moved to a different Sheet in excel.

Example of the data: GoogleDrive Excel File

Thank you in advance!

CodePudding user response：

from openpyxl import load_workbook
import pandas as pd

path = 'Import.xlsx'
sheet_name = 'DifferentSheet'
currency = 'USD'
max_spread = 0.005

def filter_transactions(transactions, currency, max_spread):
    df = transactions.copy()
    df['Trade Time'] = df['Trade Time'].dt.round(freq='min')
    df = df[df['Curr 1'] == currency]
    instruments = set(df['Instrument'])
    cols = ['instrument', 'notional', 'minute', 'max_maturity', 'currency2', 'notional2']
    filtered_transactions = pd.DataFrame(columns=cols)
    for instrument in instruments:
        df_instr = df[df['Instrument'] == instrument]
        for notional in set(df_instr['Notional 1']):
            df_notional = df_instr[df_instr['Notional 1'] == notional]
            minutes = pd.date_range(
                df_notional['Trade Time'].min(),
                df_notional['Trade Time'].max(),
                freq='min'
            )
            for minute in minutes:
                df_minute = df_notional[df_notional['Trade Time'] == minute]
                if df_minute.shape[0] > 0:
                    pricedeltas = df_minute['Price'][::-1].pct_change().fillna(0)[::-1]
                    deltacond = (pricedeltas.abs() <= max_spread)
                    max_maturity = pd.to_datetime('1970-01-01')
                    for cond, maturity, curr2, notional2 in zip(
                        deltacond, df_minute['Maturity'],
                        df_minute['Curr 2'], df_minute['Notional 2']
                    ):
                        if cond:
                            if maturity > max_maturity:
                                max_maturity = maturity
                        else:
                            obs = pd.DataFrame(
                                [[instrument, notional, minute, max_maturity, curr2, notional2]],
                                columns=cols
                            )
                            filtered_transactions = pd.concat([filtered_transactions, obs])
                            max_maturity = pd.to_datetime('1970-01-01')
    return filtered_transactions

def add_sheet(filtered_transactions, path, sheet_name):
    book = load_workbook(path)
    writer = pd.ExcelWriter(path, engine='openpyxl')
    writer.book = book
    transactions_filtered.to_excel(writer, sheet_name=sheet_name, index=False)
    writer.save()

transactions = pd.read_excel(path)
transactions_filtered = filter_transactions(transactions, currency, max_spread)
add_sheet(transactions_filtered, path, sheet_name)
print(transactions_filtered)

prints

index	instrument	notional	minute	max_maturity	currency2	notional2
0	USD COP NDF	17000000	2022-08-04 08:50:00	1970-01-01 00:00:00	COP	73380500000
1	USD COP NDF	4000000	2022-08-04 08:38:00	1970-01-01 00:00:00	COP	17304000000
2	USD COP NDF	1000000	2022-08-04 08:49:00	1970-01-01 00:00:00	COP	4326100000
3	USD COP NDF	10000000	2022-08-04 08:32:00	1970-01-01 00:00:00	COP	43260000000
...	...	...	...	...	...	...

This approach adds a new sheet to the existing Excel file with columns ['instrument', 'notional', 'minute', 'max_maturity', 'currency2', 'notional2'] that contains a row with the maximum observed maturity (and 'Currency 2' as well as 'Notional 2') for each group of consecutive transactions with absolute prices differences less than 0.5% and identical ['instrument', 'notional' and 'minute'], effectively filtering from 1113 down to just 112 rows.