SQLite Window functions-CodePudding

This is a simplified ER diagram of my database:

What I'd like to retrieve is, for each vendor_item:

The highest price (excluding the last capture)
The lowest price (excluding the last capture)
The current price (i.e. the last capture)

This is some sample data of the PRICE_DATA table to give you an idea:

vendor_item_id	capture_ts	price
124	2022-03-02 09:00:12.851043	46.78
124	2022-03-02 14:07:49.423343	42.99
124	2022-03-04 08:20:07.636140	43.99
124	2022-03-05 08:29:20.421764	42.99
124	2022-03-08 08:33:59.043372	42.99
129	2022-03-02 08:55:14.401816	21.52
129	2022-03-02 14:11:20.544427	25.54
129	2022-03-04 08:24:06.976667	25.72
129	2022-03-08 08:22:46.734662	30.83
132	2022-03-02 09:04:18.144494	41.99
132	2022-03-03 08:29:15.981712	42.99
132	2022-03-04 08:27:39.327779	41.99
132	2022-03-07 08:29:41.236009	42.99
132	2022-03-08 08:27:44.318570	40.99

This is the SQL statement I have so far:

select distinct vendor_item_id
      ,last_value(price) over win as curr_price
      ,min(price) over win as low_price
      ,max(price) over win as high_price
from price_data
window win as (partition by vendor_item_id 
               order by capture_ts 
               rows between unbounded preceding 
                        and unbounded following);

While this gives me more or less what I'm looking for, there are a couple of issues:

The highest and lowest price take into account all records, instead of excluding the most recent capture.
If I don't add distinct to the query, I end up with duplicate records (this is probably my fault, for failing to properly grasp the windowing functionality).

Desired result:

vendor_item_id	curr_price	low_price	high_price
124	42.99	42.99	46.78
129	30.83	21.52	25.72
132	40.99	41.99	42.99

Thanks for your help!

CodePudding user response：

Use a CTE that returns the max capture_ts for each vendor_item_id and then get low_price and high_price with conditional aggregation:

WITH cte AS (
  SELECT *, MAX(capture_ts) OVER (PARTITION BY vendor_item_id) max_capture_ts
  FROM price_data
)
SELECT DISTINCT vendor_item_id,
       FIRST_VALUE(price) OVER (PARTITION BY vendor_item_id ORDER BY capture_ts DESC) curr_price,
       MIN(CASE WHEN capture_ts < max_capture_ts THEN price END) OVER (PARTITION BY vendor_item_id) low_price, 
       MAX(CASE WHEN capture_ts < max_capture_ts THEN price END) OVER (PARTITION BY vendor_item_id) high_price
FROM cte;

See the demo.

CodePudding user response：

I ended up using CTEs and regular aggregate functions to solve the question:

with v_last_capture as (
    select vendor_item_id
          ,max(capture_ts) last_capture_ts
      from price_data pd
     group by vendor_item_id
)
, v_curr_price as (
    select pd.*
      from price_data pd 
      inner join v_last_capture vc 
            on (pd.vendor_item_id = vc.vendor_item_id and 
                pd.capture_ts = vc.last_capture_ts)
)
, v_other_prices as (
    select vendor_item_id
          ,min(pd.price) as min_price
          ,max(pd.price) as max_price
      from price_data pd
     where id not in (select id from v_curr_price)
     group by vendor_item_id 
)
select vc.id
      ,vc.vendor_item_id 
      ,vc.price as curr_price 
      ,vc.stock
      ,vo.min_price
      ,vo.max_price 
  from v_curr_price vc 
  left join v_other_prices vo on (vc.vendor_item_id = vo.vendor_item_id)

Explain plan:

QUERY PLAN
|--MATERIALIZE 4
|  |--SCAN TABLE price_data AS pd
|  `--USE TEMP B-TREE FOR GROUP BY
|--MATERIALIZE 5
|  |--SCAN TABLE price_data AS pd
|  |--LIST SUBQUERY 6
|  |  |--MATERIALIZE 8
|  |  |  |--SCAN TABLE price_data AS pd
|  |  |  `--USE TEMP B-TREE FOR GROUP BY
|  |  |--SCAN SUBQUERY 8 AS vc
|  |  `--SEARCH TABLE price_data AS pd USING AUTOMATIC COVERING INDEX (vendor_item_id=? AND capture_ts=?)
|  `--USE TEMP B-TREE FOR GROUP BY
|--SCAN TABLE price_data AS pd
|--SEARCH SUBQUERY 4 AS vc USING AUTOMATIC COVERING INDEX (vendor_item_id=?)
`--SEARCH SUBQUERY 5 AS vo USING AUTOMATIC COVERING INDEX (vendor_item_id=?)

The answer of @forpas works just as well (and the query is more concise). Here's the explain plan for his query:

QUERY PLAN
|--CO-ROUTINE 3
|  |--CO-ROUTINE 4
|  |  |--CO-ROUTINE 1
|  |  |  |--CO-ROUTINE 5
|  |  |  |  |--SCAN TABLE price_data
|  |  |  |  `--USE TEMP B-TREE FOR ORDER BY
|  |  |  `--SCAN SUBQUERY 5
|  |  |--SCAN SUBQUERY 1
|  |  `--USE TEMP B-TREE FOR ORDER BY
|  |--SCAN SUBQUERY 4
|  `--USE TEMP B-TREE FOR ORDER BY
|--SCAN SUBQUERY 3
`--USE TEMP B-TREE FOR DISTINCT

CodePudding user response：

You can use window filters to remove the last row for your "except the latest capture" requirement

select distinct
    p.vendor_item_id
    ,last_value(p.price) over vendor_item as curr_price
    ,min(price) filter (where p.capture_ts < latest.capture_ts) over vendor_item as low_price
    ,max(price) filter (where p.capture_ts < latest.capture_ts) over vendor_item as high_price
from
    price_data p
    inner join (
        select vendor_item_id, max(capture_ts) capture_ts from price_data group by vendor_item_id
    ) latest on latest.vendor_item_id = p.vendor_item_id
window
    vendor_item as (
        partition by p.vendor_item_id
        order by p.capture_ts 
        rows between unbounded preceding and unbounded following
    );

result

124 42.99   42.99   46.78
129 30.83   21.52   25.72
132 40.99   41.99   42.99

I suppose that capture_ts is unique per vendor_item_id, otherwise you'd have to create a smarter filter.

Query plan on a naked price_data table with no indexes defined:

QUERY PLAN
|--CO-ROUTINE 3
|  |--MATERIALIZE 1
|  |  |--SCAN TABLE price_data
|  |  `--USE TEMP B-TREE FOR GROUP BY
|  |--SCAN TABLE price_data AS p
|  |--SEARCH SUBQUERY 1 AS latest USING AUTOMATIC COVERING INDEX (vendor_item_id=?)
|  `--USE TEMP B-TREE FOR ORDER BY
|--SCAN SUBQUERY 3
`--USE TEMP B-TREE FOR DISTINCT

when a covering index is defined (create index ix_price_data on price_data (vendor_item_id, capture_ts, price)), things get a tiny bit simpler:

QUERY PLAN
|--CO-ROUTINE 3
|  |--MATERIALIZE 1
|  |  `--SCAN TABLE price_data USING COVERING INDEX ix_price_data
|  |--SCAN SUBQUERY 1 AS latest
|  |--SEARCH TABLE price_data AS p USING COVERING INDEX ix_price_data (vendor_item_id=?)
|  `--USE TEMP B-TREE FOR ORDER BY
|--SCAN SUBQUERY 3
`--USE TEMP B-TREE FOR DISTINCT