I'm looking for a one line column calculation that can find out the index OFFSET from the current row backwards, using a window size.
For example, given this dataset:
import numpy as np
import pandas as pd
# Create a reproducible, static dataframe.
# 1 minute SPY data. Skip to the bottom...
df = pd.DataFrame([
{
"time": "2021-10-26 9:30",
"open": "457.2",
"high": "457.29",
"low": "456.78",
"close": "456.9383",
"volume": "594142"
},
{
"time": "2021-10-26 9:31",
"open": "456.94",
"high": "457.07",
"low": "456.8",
"close": "456.995",
"volume": "194061"
},
{
"time": "2021-10-26 9:32",
"open": "456.99",
"high": "457.22",
"low": "456.84",
"close": "457.21",
"volume": "186114"
},
{
"time": "2021-10-26 9:33",
"open": "457.22",
"high": "457.45",
"low": "457.2011",
"close": "457.308",
"volume": "294158"
},
{
"time": "2021-10-26 9:34",
"open": "457.31",
"high": "457.4",
"low": "457.25",
"close": "457.32",
"volume": "172574"
},
{
"time": "2021-10-26 9:35",
"open": "457.31",
"high": "457.48",
"low": "457.18",
"close": "457.44",
"volume": "396668"
},
{
"time": "2021-10-26 9:36",
"open": "457.48",
"high": "457.6511",
"low": "457.44",
"close": "457.57",
"volume": "186777"
},
{
"time": "2021-10-26 9:37",
"open": "457.5699",
"high": "457.73",
"low": "457.5699",
"close": "457.69",
"volume": "187596"
},
{
"time": "2021-10-26 9:38",
"open": "457.7",
"high": "457.73",
"low": "457.54",
"close": "457.63",
"volume": "185570"
},
{
"time": "2021-10-26 9:39",
"open": "457.63",
"high": "457.64",
"low": "457.31",
"close": "457.59",
"volume": "164707"
},
{
"time": "2021-10-26 9:40",
"open": "457.59",
"high": "457.72",
"low": "457.46",
"close": "457.7199",
"volume": "167438"
},
{
"time": "2021-10-26 9:41",
"open": "457.72",
"high": "457.8",
"low": "457.68",
"close": "457.72",
"volume": "199951"
},
{
"time": "2021-10-26 9:42",
"open": "457.73",
"high": "457.74",
"low": "457.6",
"close": "457.62",
"volume": "152134"
},
{
"time": "2021-10-26 9:43",
"open": "457.6",
"high": "457.65",
"low": "457.45",
"close": "457.5077",
"volume": "142530"
},
{
"time": "2021-10-26 9:44",
"open": "457.51",
"high": "457.64",
"low": "457.4001",
"close": "457.61",
"volume": "122575"
},
{
"time": "2021-10-26 9:45",
"open": "457.61",
"high": "457.76",
"low": "457.58",
"close": "457.75",
"volume": "119886"
},
{
"time": "2021-10-26 9:46",
"open": "457.74",
"high": "457.75",
"low": "457.37",
"close": "457.38",
"volume": "183157"
},
{
"time": "2021-10-26 9:47",
"open": "457.42",
"high": "457.49",
"low": "457.37",
"close": "457.44",
"volume": "128542"
},
{
"time": "2021-10-26 9:48",
"open": "457.43",
"high": "457.49",
"low": "457.33",
"close": "457.44",
"volume": "154181"
},
{
"time": "2021-10-26 9:49",
"open": "457.43",
"high": "457.5898",
"low": "457.42",
"close": "457.47",
"volume": "163063"
},
{
"time": "2021-10-26 9:50",
"open": "457.45",
"high": "457.59",
"low": "457.44",
"close": "457.555",
"volume": "96229"
},
{
"time": "2021-10-26 9:51",
"open": "457.56",
"high": "457.61",
"low": "457.31",
"close": "457.4217",
"volume": "110380"
},
{
"time": "2021-10-26 9:52",
"open": "457.42",
"high": "457.56",
"low": "457.42",
"close": "457.47",
"volume": "107518"
},
{
"time": "2021-10-26 9:53",
"open": "457.475",
"high": "457.51",
"low": "457.4",
"close": "457.48",
"volume": "78062"
},
{
"time": "2021-10-26 9:54",
"open": "457.49",
"high": "457.57",
"low": "457.42",
"close": "457.46",
"volume": "133883"
},
{
"time": "2021-10-26 9:55",
"open": "457.47",
"high": "457.56",
"low": "457.45",
"close": "457.51",
"volume": "98998"
},
{
"time": "2021-10-26 9:56",
"open": "457.51",
"high": "457.54",
"low": "457.43",
"close": "457.43",
"volume": "110237"
},
{
"time": "2021-10-26 9:57",
"open": "457.43",
"high": "457.65",
"low": "457.375",
"close": "457.65",
"volume": "98794"
},
{
"time": "2021-10-26 9:58",
"open": "457.66",
"high": "457.69",
"low": "457.35",
"close": "457.45",
"volume": "262154"
},
{
"time": "2021-10-26 9:59",
"open": "457.45",
"high": "457.47",
"low": "457.33",
"close": "457.4",
"volume": "74685"
},
{
"time": "2021-10-26 10:00",
"open": "457.41",
"high": "457.48",
"low": "457.18",
"close": "457.38",
"volume": "166617"
},
{
"time": "2021-10-26 10:01",
"open": "457.39",
"high": "457.7",
"low": "457.39",
"close": "457.5",
"volume": "265649"
},
{
"time": "2021-10-26 10:02",
"open": "457.51",
"high": "457.57",
"low": "457.39",
"close": "457.53",
"volume": "131947"
},
{
"time": "2021-10-26 10:03",
"open": "457.53",
"high": "457.54",
"low": "457.4",
"close": "457.51",
"volume": "80111"
},
{
"time": "2021-10-26 10:04",
"open": "457.51",
"high": "457.62",
"low": "457.5",
"close": "457.6101",
"volume": "117174"
},
{
"time": "2021-10-26 10:05",
"open": "457.621",
"high": "457.64",
"low": "457.51",
"close": "457.58",
"volume": "168758"
},
{
"time": "2021-10-26 10:06",
"open": "457.58",
"high": "457.64",
"low": "457.46",
"close": "457.61",
"volume": "84076"
},
{
"time": "2021-10-26 10:07",
"open": "457.62",
"high": "457.7401",
"low": "457.62",
"close": "457.66",
"volume": "125156"
},
{
"time": "2021-10-26 10:08",
"open": "457.665",
"high": "457.69",
"low": "457.5",
"close": "457.67",
"volume": "116919"
},
{
"time": "2021-10-26 10:09",
"open": "457.69",
"high": "457.72",
"low": "457.5",
"close": "457.57",
"volume": "102551"
},
{
"time": "2021-10-26 10:10",
"open": "457.56",
"high": "457.75",
"low": "457.56",
"close": "457.7",
"volume": "109165"
},
{
"time": "2021-10-26 10:11",
"open": "457.7",
"high": "457.725",
"low": "457.63",
"close": "457.66",
"volume": "146209"
},
{
"time": "2021-10-26 10:12",
"open": "457.665",
"high": "457.88",
"low": "457.64",
"close": "457.86",
"volume": "210620"
},
{
"time": "2021-10-26 10:13",
"open": "457.855",
"high": "457.96",
"low": "457.83",
"close": "457.95",
"volume": "159975"
},
{
"time": "2021-10-26 10:14",
"open": "457.95",
"high": "458.02",
"low": "457.93",
"close": "457.95",
"volume": "152042"
},
{
"time": "2021-10-26 10:15",
"open": "457.96",
"high": "458.15",
"low": "457.96",
"close": "458.08",
"volume": "146047"
},
{
"time": "2021-10-26 10:16",
"open": "458.085",
"high": "458.17",
"low": "457.99",
"close": "458.15",
"volume": "100732"
},
{
"time": "2021-10-26 10:17",
"open": "458.17",
"high": "458.33",
"low": "458.155",
"close": "458.245",
"volume": "235072"
},
{
"time": "2021-10-26 10:18",
"open": "458.25",
"high": "458.29",
"low": "458.14",
"close": "458.16",
"volume": "422002"
},
{
"time": "2021-10-26 10:19",
"open": "458.17",
"high": "458.2801",
"low": "458.1699",
"close": "458.28",
"volume": "114611"
},
{
"time": "2021-10-26 10:20",
"open": "458.29",
"high": "458.39",
"low": "458.24",
"close": "458.37",
"volume": "241797"
},
{
"time": "2021-10-26 10:21",
"open": "458.37",
"high": "458.42",
"low": "458.31",
"close": "458.345",
"volume": "124824"
},
{
"time": "2021-10-26 10:22",
"open": "458.33",
"high": "458.49",
"low": "458.33",
"close": "458.47",
"volume": "132125"
}
])
... and using the below code...
# Convert close to numeric re: the .csv to .json
# converter tool I used online...
df['close'] = pd.to_numeric(df['close'])
# Define the lookback length
lookback_window = 20
# Define a blank target column
df['MaxIndexOffset'] = None
# WORKING EXAMPLE
# Iterate through the df
for i in range(lookback_window, len(df)):
# Convert the current chunk of rows to a list and reverse it
curr_range = df['close'].iloc[i-lookback_window:i].to_list()
curr_range.reverse()
# Then find the max between 0 and the first instance of the max
# value. The first instance (because we reversed the order) will
# be the number I need. If the current row IS the max value, then
# the offset will be 0. If the max value is 2 bars ago, it'll be 2, etc.
x1 = max(0, curr_range.index(max(curr_range)))
df['MaxIndexOffset'].iloc[i] = x1
# Export the correct answer dataset to compare the next function
# to below
df.to_csv("correct_answers.csv", index=False)
# NON-WORKING EXAMPLE
# What I'd like to do is NOT use a for loop to do the above. I envision
# we can use np.where() here? But I don't know how yet. Psuedo might look
# something like:
df['MaxIndexOffset2'] = i - (df['close'].rolling(window=lookback_window).tolist().reverse().index(max(df['close'].rolling(window=lookback_window))))
# ... but obviously doesn't work. "AttributeError: 'Rolling' object has no attribute 'to_list'"
# Check for matches
df.to_csv("correct_answers2.csv", index=False)
...is there a way to do this without the for
loop? I'm looking to get the offset (i.e. number of rows back from the current row, up to the lookback_window
) of the max value in that window. If there are multiples of the same max value, it should just return the closest one from the current row. Any ideas??? Thanks!
CodePudding user response:
Reverse the close
column, then apply np.argmax
on a rolling
window
of size equal to lookback_window
then shift
the result upwards and assign it to the column MaxIndexOffset
df['MaxIndexOffset'] = df['close'][::-1].rolling(lookback_window)\
.apply(np.argmax, raw=True).shift(-lookback_window 1)
time open high low close volume MaxIndexOffset
0 2021-10-26 9:30 457.2 457.29 456.78 456.9383 594142 NaN
1 2021-10-26 9:31 456.94 457.07 456.8 456.9950 194061 NaN
2 2021-10-26 9:32 456.99 457.22 456.84 457.2100 186114 NaN
3 2021-10-26 9:33 457.22 457.45 457.2011 457.3080 294158 NaN
4 2021-10-26 9:34 457.31 457.4 457.25 457.3200 172574 NaN
5 2021-10-26 9:35 457.31 457.48 457.18 457.4400 396668 NaN
6 2021-10-26 9:36 457.48 457.6511 457.44 457.5700 186777 NaN
7 2021-10-26 9:37 457.5699 457.73 457.5699 457.6900 187596 NaN
8 2021-10-26 9:38 457.7 457.73 457.54 457.6300 185570 NaN
9 2021-10-26 9:39 457.63 457.64 457.31 457.5900 164707 NaN
10 2021-10-26 9:40 457.59 457.72 457.46 457.7199 167438 NaN
11 2021-10-26 9:41 457.72 457.8 457.68 457.7200 199951 NaN
12 2021-10-26 9:42 457.73 457.74 457.6 457.6200 152134 NaN
13 2021-10-26 9:43 457.6 457.65 457.45 457.5077 142530 NaN
14 2021-10-26 9:44 457.51 457.64 457.4001 457.6100 122575 NaN
15 2021-10-26 9:45 457.61 457.76 457.58 457.7500 119886 NaN
16 2021-10-26 9:46 457.74 457.75 457.37 457.3800 183157 NaN
17 2021-10-26 9:47 457.42 457.49 457.37 457.4400 128542 NaN
18 2021-10-26 9:48 457.43 457.49 457.33 457.4400 154181 NaN
19 2021-10-26 9:49 457.43 457.5898 457.42 457.4700 163063 4.0
20 2021-10-26 9:50 457.45 457.59 457.44 457.5550 96229 5.0
21 2021-10-26 9:51 457.56 457.61 457.31 457.4217 110380 6.0
22 2021-10-26 9:52 457.42 457.56 457.42 457.4700 107518 7.0
23 2021-10-26 9:53 457.475 457.51 457.4 457.4800 78062 8.0
24 2021-10-26 9:54 457.49 457.57 457.42 457.4600 133883 9.0
25 2021-10-26 9:55 457.47 457.56 457.45 457.5100 98998 10.0
26 2021-10-26 9:56 457.51 457.54 457.43 457.4300 110237 11.0
27 2021-10-26 9:57 457.43 457.65 457.375 457.6500 98794 12.0
28 2021-10-26 9:58 457.66 457.69 457.35 457.4500 262154 13.0
29 2021-10-26 9:59 457.45 457.47 457.33 457.4000 74685 14.0
30 2021-10-26 10:00 457.41 457.48 457.18 457.3800 166617 15.0
31 2021-10-26 10:01 457.39 457.7 457.39 457.5000 265649 16.0
32 2021-10-26 10:02 457.51 457.57 457.39 457.5300 131947 17.0
33 2021-10-26 10:03 457.53 457.54 457.4 457.5100 80111 18.0
34 2021-10-26 10:04 457.51 457.62 457.5 457.6101 117174 19.0
35 2021-10-26 10:05 457.621 457.64 457.51 457.5800 168758 8.0
36 2021-10-26 10:06 457.58 457.64 457.46 457.6100 84076 9.0
37 2021-10-26 10:07 457.62 457.7401 457.62 457.6600 125156 0.0
38 2021-10-26 10:08 457.665 457.69 457.5 457.6700 116919 0.0
39 2021-10-26 10:09 457.69 457.72 457.5 457.5700 102551 1.0
40 2021-10-26 10:10 457.56 457.75 457.56 457.7000 109165 0.0
41 2021-10-26 10:11 457.7 457.725 457.63 457.6600 146209 1.0
42 2021-10-26 10:12 457.665 457.88 457.64 457.8600 210620 0.0
43 2021-10-26 10:13 457.855 457.96 457.83 457.9500 159975 0.0
44 2021-10-26 10:14 457.95 458.02 457.93 457.9500 152042 0.0
45 2021-10-26 10:15 457.96 458.15 457.96 458.0800 146047 0.0
46 2021-10-26 10:16 458.085 458.17 457.99 458.1500 100732 0.0
47 2021-10-26 10:17 458.17 458.33 458.155 458.2450 235072 0.0
48 2021-10-26 10:18 458.25 458.29 458.14 458.1600 422002 1.0
49 2021-10-26 10:19 458.17 458.2801 458.1699 458.2800 114611 0.0
50 2021-10-26 10:20 458.29 458.39 458.24 458.3700 241797 0.0
51 2021-10-26 10:21 458.37 458.42 458.31 458.3450 124824 1.0
52 2021-10-26 10:22 458.33 458.49 458.33 458.4700 132125 0.0