Sum and Count for similar events over the date range PostgreSQL-CodePudding

In Postgres I have a dataset of events, I'd like to count number of the event and profit for them having same ticker and same filter_name which occurred within time range, for example in the last 3 hours.

id  ticker  filter_name     notification_date           price
1   "A"     "#FILTER_V2"    "2022-04-07 20:36:13.332"   0.0680000
2   "B"     "#FILTER_V1"    "2022-04-07 20:59:06.35"    0.5
3   "C"     "#FILTER_V2"    "2022-04-07 20:59:06.355"   1
4   "A"     "#FILTER_V1"    "2022-04-07 21:53:49.15"    0.0694000
5   "D"     "#FILTER_V2"    "2022-04-07 21:53:49.155"   0.8
6   "B"     "#FILTER_V2"    "2022-04-07 22:07:44.331"   0.51
7   "C"     "#FILTER_V2"    "2022-04-07 22:14:41.905"   2

Desired output is:

id  ticker  filter_name     notification_date          same_event_in_last_3_hours         same_signal_profit_in_last_3_hours     
1   "A"     "#FILTER_V2"    "2022-04-07 20:36:13.332"  0                                  0 or null
2   "B"     "#FILTER_V1"    "2022-04-07 20:59:06.35"   0                                  0 or null   
3   "C"     "#FILTER_V2"    "2022-04-07 20:59:06.355"  0                                  0 or null
4   "A"     "#FILTER_V2"    "2022-04-07 21:53:49.15"   1                                  2.05
5   "D"     "#FILTER_V2"    "2022-04-07 21:53:49.155"  0                                  0 or null
6   "B"     "#FILTER_V2"    "2022-04-07 22:07:44.331"  0                                  0 or null
7   "C"     "#FILTER_V2"    "2022-04-07 22:14:41.905"  1                                  100

Currently I've created a following sql to solve it:

select
id
,ticker 
,filter_name
,notification_date
,(select count(*) from notifications as q where 
  n.notification_date - q.notification_date <= interval '3 hours'
  and q.notification_date < n.notification_date
  and q.ticker = n.ticker 
  and q.filter_name = n.filter_name
) as same_event_in_last_3_hours
,(select TRUNC(sum((n.price - q.price)/ q.price * 100), 2)  from notifications as q where 
  n.notification_date - q.notification_date <= interval '3 hours'
  and q.notification_date < n.notification_date
  and q.ticker = n.ticker 
  and q.filter_name = n.filter_name
) as same_signal_profit_in_last_3_hours
from notifications n
order by id

But it works slowly and I don't know how to optimize it. Below is query plan:

"Limit  (cost=0.42..27444866.46 rows=2000 width=86) (actual time=47.399..93729.981 rows=2000 loops=1)"
"  Output: n.id, n.ticker, n.filter_name, n.notification_date, n.price, ((SubPlan 1)), ((SubPlan 2))"
"  Buffers: shared hit=12316848"
"  ->  Index Scan using notifications_pkey on public.notifications n  (cost=0.42..2306686101.61 rows=168096 width=86) (actual time=47.398..93728.629 rows=2000 loops=1)"
"        Output: n.id, n.ticker, n.filter_name, n.notification_date, n.price, (SubPlan 1), (SubPlan 2)"
"        Buffers: shared hit=12316848"
"        SubPlan 1"
"          ->  Aggregate  (cost=6861.17..6861.18 rows=1 width=8) (actual time=23.425..23.425 rows=1 loops=2000)"
"                Output: count(*)"
"                Buffers: shared hit=6158000"
"                ->  Seq Scan on public.notifications q  (cost=0.00..6861.16 rows=3 width=0) (actual time=15.115..23.407 rows=1 loops=2000)"
"                      Output: q.id, q.basis, q.filter_name, q.highest_since_notified, q.lowest_since_notified, q.notification_date, q.price, q.ticker, q.exchange_name, q.predictions"
"                      Filter: ((q.notification_date < n.notification_date) AND ((q.ticker)::text = (n.ticker)::text) AND ((q.filter_name)::text = (n.filter_name)::text) AND ((n.notification_date - q.notification_date) <= '03:00:00'::interval))"
"                      Rows Removed by Filter: 168450"
"                      Buffers: shared hit=6158000"
"        SubPlan 2"
"          ->  Aggregate  (cost=6861.19..6861.20 rows=1 width=32) (actual time=23.422..23.423 rows=1 loops=2000)"
"                Output: trunc(sum((((n.price - q_1.price) / q_1.price) * '100'::numeric)), 2)"
"                Buffers: shared hit=6158000"
"                ->  Seq Scan on public.notifications q_1  (cost=0.00..6861.16 rows=3 width=6) (actual time=15.102..23.401 rows=1 loops=2000)"
"                      Output: q_1.id, q_1.basis, q_1.filter_name, q_1.highest_since_notified, q_1.lowest_since_notified, q_1.notification_date, q_1.price, q_1.ticker, q_1.exchange_name, q_1.predictions"
"                      Filter: ((q_1.notification_date < n.notification_date) AND ((q_1.ticker)::text = (n.ticker)::text) AND ((q_1.filter_name)::text = (n.filter_name)::text) AND ((n.notification_date - q_1.notification_date) <= '03:00:00'::interval))"
"                      Rows Removed by Filter: 168450"
"                      Buffers: shared hit=6158000"
"Planning Time: 0.189 ms"
"Execution Time: 93731.304 ms"

CodePudding user response：

select id, ticker, filter_name, notification_date,
    count(*) over (partition by ticker, filter_name
                   order by notification_date
                   range between interval '3 hours' preceding and current row) - 1
                   as same_event_in_prior_3_hours
from notifications;

https://dbfiddle.uk/?rdbms=postgres_14&fiddle=02429f61288a493a52ce5fbae127c2d9

Doing some algebra(*) on the profit calculation would let you transform it into a sum of reciprocals:

sum(1/price) over (<the same window>) - 1/price as reciprocal_sum

So you can combine that with the previous result:

with data as (
select id, ticker, filter_name, notification_date, price,
    -- subtract current row results
    (count(*) over w) - 1 as same_event_in_prior_3_hours,
    (sum(1 / price) over w) - (1 / price) as reciprocal_sum
from notifications
window (partition by ticker, filter_name
        order by notification_date
        range between interval '3 hours' preceding and current row)
)
select *,
    (price * reciprocal_sum - same_event_in_prior_3_hours) * 100
        as same_signal_profit_in_last_3_hours
from data;

https://dbfiddle.uk/?rdbms=postgres_14&fiddle=9916601bed0a587c205dc98d1661a6e1

Algebra:

sum((p0 - pn) / pn) = (p0 - p1) / p1   (p0 - p2) / p2   (p0 - p3) / p3 ...
                    = p0 / p1 - 1   p0 / p2 - 1   p0 / p3 - 1 ...
                    = p0 * (1 / p1   1 / p2   1 / p3 ...   1 / pn) - n

CodePudding user response：

I believe join is faster than subquery.

SELECT
    n1.id,
    n1.ticker,
    n1.filter_name,
    n1.notification_date,
    count(n2.id) AS same_event
FROM
    notifications n1
    LEFT OUTER JOIN notifications n2 ON n1.id != n2.id
    AND n1.ticker = n2.ticker
    AND n1.filter_name = n2.filter_name
    AND n1.notification_date - n2.notification_date <= interval '3 hours'
    AND n1.notification_date - n2.notification_date >= '0 hours'
GROUP BY
    n1.id,
    n1.ticker,
    n1.filter_name,
    n1.notification_date;