How would I filter data on multiple criteria through the spreadsheet using python(pandas)?
I am trying to filter transactions with all Curr1=USD, where Trade Time within 1 minute, Have the same Notional 1 and have the Price within .5% spread between transactions. Then the row with the furthest(highest) Maturity would be moved to a different Sheet in excel.
Example of the data: GoogleDrive Excel File
Thank you in advance!
CodePudding user response:
from openpyxl import load_workbook
import pandas as pd
path = 'Import.xlsx'
sheet_name = 'DifferentSheet'
currency = 'USD'
max_spread = 0.005
def filter_transactions(transactions, currency, max_spread):
df = transactions.copy()
df['Trade Time'] = df['Trade Time'].dt.round(freq='min')
df = df[df['Curr 1'] == currency]
instruments = set(df['Instrument'])
cols = ['instrument', 'notional', 'minute', 'max_maturity', 'currency2', 'notional2']
filtered_transactions = pd.DataFrame(columns=cols)
for instrument in instruments:
df_instr = df[df['Instrument'] == instrument]
for notional in set(df_instr['Notional 1']):
df_notional = df_instr[df_instr['Notional 1'] == notional]
minutes = pd.date_range(
df_notional['Trade Time'].min(),
df_notional['Trade Time'].max(),
freq='min'
)
for minute in minutes:
df_minute = df_notional[df_notional['Trade Time'] == minute]
if df_minute.shape[0] > 0:
pricedeltas = df_minute['Price'][::-1].pct_change().fillna(0)[::-1]
deltacond = (pricedeltas.abs() <= max_spread)
max_maturity = pd.to_datetime('1970-01-01')
for cond, maturity, curr2, notional2 in zip(
deltacond, df_minute['Maturity'],
df_minute['Curr 2'], df_minute['Notional 2']
):
if cond:
if maturity > max_maturity:
max_maturity = maturity
else:
obs = pd.DataFrame(
[[instrument, notional, minute, max_maturity, curr2, notional2]],
columns=cols
)
filtered_transactions = pd.concat([filtered_transactions, obs])
max_maturity = pd.to_datetime('1970-01-01')
return filtered_transactions
def add_sheet(filtered_transactions, path, sheet_name):
book = load_workbook(path)
writer = pd.ExcelWriter(path, engine='openpyxl')
writer.book = book
transactions_filtered.to_excel(writer, sheet_name=sheet_name, index=False)
writer.save()
transactions = pd.read_excel(path)
transactions_filtered = filter_transactions(transactions, currency, max_spread)
add_sheet(transactions_filtered, path, sheet_name)
print(transactions_filtered)
prints
index | instrument | notional | minute | max_maturity | currency2 | notional2 |
---|---|---|---|---|---|---|
0 | USD COP NDF | 17000000 | 2022-08-04 08:50:00 | 1970-01-01 00:00:00 | COP | 73380500000 |
1 | USD COP NDF | 4000000 | 2022-08-04 08:38:00 | 1970-01-01 00:00:00 | COP | 17304000000 |
2 | USD COP NDF | 1000000 | 2022-08-04 08:49:00 | 1970-01-01 00:00:00 | COP | 4326100000 |
3 | USD COP NDF | 10000000 | 2022-08-04 08:32:00 | 1970-01-01 00:00:00 | COP | 43260000000 |
... | ... | ... | ... | ... | ... | ... |
This approach adds a new sheet to the existing Excel file with columns ['instrument', 'notional', 'minute', 'max_maturity', 'currency2', 'notional2']
that contains a row with the maximum observed maturity (and 'Currency 2'
as well as 'Notional 2'
) for each group of consecutive transactions with absolute prices differences less than 0.5% and identical ['instrument', 'notional' and 'minute']
, effectively filtering from 1113
down to just 112
rows.