Home > Blockchain >  Translation of Postgres Query to be Snowflake compatible
Translation of Postgres Query to be Snowflake compatible

Time:10-22

I'm trying to convert this pl/pgSQL function to work on Snowflake. Unfortunately, I just started using Snowflake and cannot manage to convert it properly. Snowflake plans to support pgSQL queries by the end of the year, unfortunately, it's not the case yet..

Here is a quick introduction for you to understand what my tables store and what the function does. I have three tables. You can find DDL statements for all tables and sample data at the bottom of this question.

  1. Events - this is the source table.
  2. Durations - this is the target table.
  3. Properties - this is a table used for reference that is updated whenever the function is called.

Once a day, I import data to the events table. The events we are interested in are events with device_types 1 and 2 (entry/exit). Then I run my function and it calculates the correct durations between events with the same card_nr. After that, I import those durations into my duration table and update the properties table.

Here's an example of the events:

Data

And here's an example of the durations once the function was called:

enter image description here

The most important things I need to cover are:

  1. I should filter on the manufacturer to always be 'XX' since there are manufacturers like YY or ZZ
  2. I should only calculate the durations where events.event_time >= durationLimitDate from properties
  3. I need to make sure that the event_id_arrival and event_id_departure are not in the target table (duration) already to avoid duplicates!
  4. Once I inserted the calculations to the duration table, I must update the durationLimitDate in the properties. Knowing that durationLimitDate = (Max(event_time) - durationLimitDays))

Function

CREATE OR REPLACE FUNCTION calculateduration() RETURNS void AS $function$

WITH cte AS (SELECT e.id, e.card_nr, e.event_time, e.ticket_type, e.manufacturer, e.carpark_id, e.device_type,
ROW_NUMBER() OVER (ORDER BY e.card_nr, e.carpark_id, e.event_time, e.device_type) AS rn
FROM events e
LEFT JOIN durations d ON d.event_id_arrival = e.id OR d.event_id_departure = e.id
WHERE e.event_time >= (SELECT PROP_VALUE::timestamp FROM properties WHERE prop_key = 'DURATION.LIMIT.DATE')
AND e.device_type IN (1, 2)
AND event_type = 2
AND e.manufacturer LIKE 'XX%'
AND d.id IS NULL)

INSERT INTO durations (id, odb_created_at, event_id_arrival, event_id_departure,
event_time_arrival, event_time_departure,
card_nr, ticket_type, duration, manufacturer, carpark_id)

SELECT nextval('durations_id_seq'),
current_timestamp,
arrived_entry.id,
departed_entry.id,
arrived_entry.event_time,
departed_entry.event_time,
arrived_entry.card_nr,
arrived_entry.ticket_type,
date_part('epoch', departed_entry.event_time::timestamp - arrived_entry.event_time::timestamp),
arrived_entry.manufacturer,
arrived_entry.carpark_id
FROM (SELECT * FROM cte WHERE cte.device_type = 1) AS arrived_entry
INNER JOIN (SELECT * FROM cte WHERE cte.device_type = 2) AS departed_entry ON arrived_entry.card_nr = departed_entry.card_nr
AND arrived_entry.carpark_id = departed_entry.carpark_id
AND arrived_entry.rn   1 = departed_entry.rn;


UPDATE properties
SET PROP_VALUE = (SELECT (MAX(event_time) - ((SELECT PROP_VALUE FROM properties WHERE prop_key = 'DURATION.LIMIT.DAYS') ||' day')::interval) FROM events WHERE event_time >= (SELECT PROP_VALUE::timestamp FROM properties WHERE prop_key = 'DURATION.LIMIT.DATE'))
WHERE PROP_KEY ='DURATION.LIMIT.DATE';

$function$

LANGUAGE sql;

DDL SCRIPTS

-- events
        CREATE TABLE IF NOT EXISTS events (
        id bigint NOT NULL autoincrement start 1 increment 1 PRIMARY KEY,
        odb_created_at timestamp without time zone NOT NULL,
        event_time timestamp without time zone NOT NULL,
        device_type integer NOT NULL,
        event_type integer NOT NULL,
        ticket_type integer NOT NULL,
        card_nr character varying(100),
        count integer DEFAULT 1 NOT NULL,
        manufacturer character varying(200),
        carpark_id bigint
    ); 

     -- durations
    CREATE TABLE IF NOT EXISTS durations (
        id bigint NOT NULL autoincrement start 1 increment 1 PRIMARY KEY,
        odb_created_at timestamp without time zone NOT NULL,
        event_id_arrival bigint,
        event_id_departure bigint,
        event_time_arrival timestamp without time zone,
        event_time_departure timestamp without time zone,
        card_nr character varying(100),
        ticket_type integer,
        duration integer,
        manufacturer character varying(200),
        carpark_id bigint
    );

    --properties
    create or replace TABLE PROPERTIES (
        PROP_KEY VARCHAR(80) NOT NULL,
        PROP_VALUE VARCHAR(250),
        primary key (PROP_KEY)
    );

Sample data:

INSERT INTO properties (prop_key,prop_value) VALUES
     ('DURATION.LIMIT.DAYS','30'),
     ('DURATION.LIMIT.DATE','2021-08-01 00:00:00.00');

    INSERT INTO public.events (id, odb_created_at, event_time, device_type, event_type, ticket_type, card_nr, count, manufacturer, carpark_id) VALUES(188160996, '2021-10-02 04:28:26.338', '2021-10-01 09:14:41.32', 1, 2, 11, '03998988030897300007782', 1, 'XX', 1852);
INSERT INTO public.events (id, odb_created_at, event_time, device_type, event_type, ticket_type, card_nr, count, manufacturer, carpark_id) VALUES(188160790, '2021-10-02 04:28:26.248', '2021-10-01 09:31:10.94', 2, 2, 11, '03998988030897300007782', 1, 'XX', 1852);
INSERT INTO public.events (id, odb_created_at, event_time, device_type, event_type, ticket_type, card_nr, count, manufacturer, carpark_id) VALUES(188146489, '2021-10-02 04:26:55.069', '2021-10-01 10:03:01.57', 1, 2, 500, '01479804030429500089598', 1, 'XX', 1563);
INSERT INTO public.events (id, odb_created_at, event_time, device_type, event_type, ticket_type, card_nr, count, manufacturer, carpark_id) VALUES(188146069, '2021-10-02 04:26:54.852', '2021-10-01 11:49:58.45', 2, 2, 500, '01479804030429500089598', 1, 'XX', 1563);
INSERT INTO public.events (id, odb_created_at, event_time, device_type, event_type, ticket_type, card_nr, count, manufacturer, carpark_id) VALUES(188161161, '2021-10-02 04:28:26.372', '2021-10-01 18:44:33.62', 1, 2, 11, '03998988030897300007782', 1, 'XX', 1852);
INSERT INTO public.events (id, odb_created_at, event_time, device_type, event_type, ticket_type, card_nr, count, manufacturer, carpark_id) VALUES(188160950, '2021-10-02 04:28:26.329', '2021-10-01 18:45:51.903', 2, 2, 11, '03998988030897300007782', 1, 'XX', 1852);
INSERT INTO public.events (id, odb_created_at, event_time, device_type, event_type, ticket_type, card_nr, count, manufacturer, carpark_id) VALUES(188161227, '2021-10-02 04:28:26.374', '2021-10-01 23:21:18.58', 1, 2, 11, '04139733030897300003136', 1, 'XX', 1852);
INSERT INTO public.events (id, odb_created_at, event_time, device_type, event_type, ticket_type, card_nr, count, manufacturer, carpark_id) VALUES(188160974, '2021-10-02 04:28:26.334', '2021-10-01 23:24:03.29', 2, 2, 11, '04139733030897300003136', 1, 'XX', 1852);
INSERT INTO public.events (id, odb_created_at, event_time, device_type, event_type, ticket_type, card_nr, count, manufacturer, carpark_id) VALUES(188239864, '2021-10-03 04:24:43.345', '2021-10-02 06:49:55.97', 1, 2, 11, '01719400030897300061410', 1, 'XX', 1852);
INSERT INTO public.events (id, odb_created_at, event_time, device_type, event_type, ticket_type, card_nr, count, manufacturer, carpark_id) VALUES(188239649, '2021-10-03 04:24:43.308', '2021-10-02 07:02:08.72', 2, 2, 11, '01719400030897300061410', 1, 'XX', 1852);

Thank you!

UNIQUE TEST

      CREATE TABLE IF NOT EXISTS test (
        id bigint NOT NULL AUTOINCREMENT PRIMARY KEY -- Check the syntax!
        , odb_created_at timestamp without time zone NOT NULL
        , event_time timestamp without time zone NOT NULL
        , device_type integer NOT NULL
        , event_type integer NOT NULL
        , ticket_type integer NOT NULL
        , card_nr character varying(100)
        , count integer DEFAULT 1 NOT NULL
        , manufacturer character varying(200)
        , carpark_id bigint
        , UNIQUE (card_nr, event_time) -- NATURAL KEY
    );
   
   
   INSERT INTO EUILOGS_DEV.ILOGS.TEST (ODB_CREATED_AT, EVENT_TIME, DEVICE_TYPE, EVENT_TYPE, TICKET_TYPE, CARD_NR, COUNT, MANUFACTURER, CARPARK_ID) 
   VALUES(current_timestamp(), '2021-01-01 15:00:00.000', 1, 1, 1, 'CARD1', 1, 'MAN1', 1);
  INSERT INTO EUILOGS_DEV.ILOGS.TEST (ODB_CREATED_AT, EVENT_TIME, DEVICE_TYPE, EVENT_TYPE, TICKET_TYPE, CARD_NR, COUNT, MANUFACTURER, CARPARK_ID) 
   VALUES(current_timestamp(), '2021-01-01 15:00:00.000', 1, 1, 1, 'CARD1', 1, 'MAN1', 1);

CodePudding user response:

You need to use sequence to generate auto numbers: https://docs.snowflake.com/en/user-guide/querying-sequences.html

So create a sequence first to use it:

create or replace sequence seq1;

Then use the below function:

create or replace procedure calculateduration() 
RETURNS string
LANGUAGE JAVASCRIPT 
AS $$
var query1 = 
`
  INSERT INTO durations (id, odb_created_at, event_id_arrival, event_id_departure,
  event_time_arrival, event_time_departure,
  card_nr, ticket_type, duration, manufacturer, carpark_id)

  WITH cte AS (
      SELECT e.id, e.card_nr, e.event_time, e.ticket_type, e.manufacturer, e.carpark_id, e.device_type,
      ROW_NUMBER() OVER (ORDER BY e.card_nr, e.carpark_id, e.event_time, e.device_type) AS rn
      FROM events e
      LEFT JOIN durations d ON d.event_id_arrival = e.id OR d.event_id_departure = e.id
      WHERE e.event_time >= (SELECT PROP_VALUE::timestamp FROM properties WHERE prop_key = 'DURATION.LIMIT.DATE')
      AND e.device_type IN (1, 2)
      AND event_type = 2
      AND e.manufacturer LIKE 'XX%'
      AND d.id IS NULL
  )
  SELECT 
    seq1.nextval,
    current_timestamp(),
    arrived_entry.id,
    departed_entry.id,
    arrived_entry.event_time,
    departed_entry.event_time,
    arrived_entry.card_nr,
    arrived_entry.ticket_type,
    timestampdiff(second, arrived_entry.event_time, departed_entry.event_time),
    arrived_entry.manufacturer,
    arrived_entry.carpark_id
  FROM (SELECT * FROM cte WHERE cte.device_type = 1) AS arrived_entry
  INNER JOIN (SELECT * FROM cte WHERE cte.device_type = 2) AS departed_entry 
    ON arrived_entry.card_nr = departed_entry.card_nr
  AND arrived_entry.carpark_id = departed_entry.carpark_id
  AND arrived_entry.rn   1 = departed_entry.rn
`;

snowflake.execute({ sqlText: query1 });

var query2 = "SELECT PROP_VALUE FROM properties WHERE prop_key = 'DURATION.LIMIT.DAYS'";
var stmt = snowflake.createStatement({ sqlText: query2 });
var resultSet = stmt.execute(); 
resultSet.next();
var prop_value = resultSet.getColumnValue(1); 

var query3 =
`
  UPDATE properties
  SET PROP_VALUE = (
    SELECT dateadd(day, -1 * ${prop_value}, MAX(event_time)) FROM events 
    WHERE event_time >= (
      SELECT PROP_VALUE::timestamp FROM properties WHERE prop_key = 'DURATION.LIMIT.DATE'
    )
  )
  WHERE PROP_KEY ='DURATION.LIMIT.DATE';
`

stmt = snowflake.createStatement({ sqlText: query3 });
stmt.execute();

return 'true';
$$;

Then call the procedure:

call calculateduration();

The code is simple without much validation and checking, but it should do the job you need.

CodePudding user response:

(this is a plain-sql rewrite)

  • The serials/IDENTITY should not be assigned (except maybe for populating the initial tables)
  • to avoid inserting duplicates, you need a natural key
  • (short table aliases are easier to read)

\i tmp.sql

-- DDL SCRIPTS

-- events
        CREATE TABLE IF NOT EXISTS events (
        id bigint NOT NULL GENERATED BY DEFAULT AS IDENTITY (start 1 increment 1) PRIMARY KEY -- Check the syntax!
        , odb_created_at timestamp without time zone NOT NULL
        , event_time timestamp without time zone NOT NULL
        , device_type integer NOT NULL
        , event_type integer NOT NULL
        , ticket_type integer NOT NULL
        , card_nr character varying(100)
        , count integer DEFAULT 1 NOT NULL
        , manufacturer character varying(200)
        , carpark_id bigint
        , UNIQUE (card_nr, event_time) -- NATURAL KEY
    );

     -- durations
    CREATE TABLE IF NOT EXISTS durations (
        id bigint NOT NULL GENERATED BY DEFAULT AS IDENTITY (start 1 increment 1) PRIMARY KEY -- Check the syntax!
        , odb_created_at timestamp without time zone NOT NULL
        , event_id_arrival bigint
        , event_id_departure bigint
        , event_time_arrival timestamp without time zone
        , event_time_departure timestamp without time zone
        , card_nr character varying(100)
        , ticket_type integer
        , duration integer
        , manufacturer character varying(200)
        , carpark_id bigint
        , UNIQUE (card_nr,event_id_arrival) -- NATURAL KEY
        , UNIQUE (card_nr,event_id_departure) -- (alternate) NATURAL KEY
    );

    --properties
    DROP TABLE IF EXISTS properties ;
    CREATE TABLE IF NOT EXISTS properties (
        prop_key VARCHAR(80) NOT NULL
        , prop_value VARCHAR(250)
        , primary key (prop_key)
    );
-- Sample data:

INSERT INTO properties (prop_key,prop_value) VALUES
     ('DURATION.LIMIT.DAYS','30'),
     ('DURATION.LIMIT.DATE','2021-08-01 00:00:00.00');

    INSERT INTO events (id, odb_created_at, event_time, device_type, event_type, ticket_type, card_nr, count, manufacturer, carpark_id)
    VALUES(188160996, '2021-10-02 04:28:26.338', '2021-10-01 09:14:41.32', 1, 2, 11, '03998988030897300007782', 1, 'XX', 1852)
        , (188160790, '2021-10-02 04:28:26.248', '2021-10-01 09:31:10.94', 2, 2, 11, '03998988030897300007782', 1, 'XX', 1852)
        , (188146489, '2021-10-02 04:26:55.069', '2021-10-01 10:03:01.57', 1, 2, 500, '01479804030429500089598', 1, 'XX', 1563)
        , (188146069, '2021-10-02 04:26:54.852', '2021-10-01 11:49:58.45', 2, 2, 500, '01479804030429500089598', 1, 'XX', 1563)
        , (188161161, '2021-10-02 04:28:26.372', '2021-10-01 18:44:33.62', 1, 2, 11, '03998988030897300007782', 1, 'XX', 1852)
        , (188160950, '2021-10-02 04:28:26.329', '2021-10-01 18:45:51.903', 2, 2, 11, '03998988030897300007782', 1, 'XX', 1852)
        , (188161227, '2021-10-02 04:28:26.374', '2021-10-01 23:21:18.58', 1, 2, 11, '04139733030897300003136', 1, 'XX', 1852)
        , (188160974, '2021-10-02 04:28:26.334', '2021-10-01 23:24:03.29', 2, 2, 11, '04139733030897300003136', 1, 'XX', 1852)
        , (188239864, '2021-10-03 04:24:43.345', '2021-10-02 06:49:55.97', 1, 2, 11, '01719400030897300061410', 1, 'XX', 1852)
        , (188239649, '2021-10-03 04:24:43.308', '2021-10-02 07:02:08.72', 2, 2, 11, '01719400030897300061410', 1, 'XX', 1852)
        ;
        -- Rewritten funtion body
-- EXPLAIN ANALYZE
WITH omg AS (
        SELECT e.id, e.card_nr, e.event_time, e.ticket_type, e.manufacturer, e.carpark_id, e.device_type
        , row_number() OVER (ORDER BY e.card_nr, e.carpark_id, e.event_time, e.device_type) AS rn
        FROM events e
        WHERE e.event_time >= (SELECT prop_value::timestamp FROM properties WHERE prop_key = 'DURATION.LIMIT.DATE')
        AND e.device_type IN (1, 2)
        AND e.event_type = 2
        AND e.manufacturer LIKE 'XX%'
        AND NOT EXISTS (
                SELECT *
                FROM durations d WHERE d.event_id_arrival = e.id OR d.event_id_departure = e.id
                )
        )

INSERT INTO durations (odb_created_at
        , event_id_arrival, event_id_departure
        , event_time_arrival, event_time_departure
        , card_nr, ticket_type
        , duration, manufacturer, carpark_id)

SELECT -- nextval('durations_id_seq'),
        current_timestamp
        , t0.id, t1.id
        , t0.event_time, t1.event_time
        , t0.card_nr
        , t0.ticket_type
        , date_part('epoch', t1.event_time::timestamp - t0.event_time::timestamp)
        , t0.manufacturer
        , t0.carpark_id
FROM (SELECT * FROM omg WHERE device_type = 1) AS t0
JOIN (SELECT * FROM omg WHERE device_type = 2) AS t1
        ON t0.card_nr = t1.card_nr
        AND t0.carpark_id = t1.carpark_id
        AND t0.rn   1 = t1.rn -- this is questionable
        -- Avoid duplicate insertions
WHERE NOT EXISTS (
        SELECT * FROM durations nx
        WHERE nx.card_nr = t0.card_nr
        AND nx.event_time_arrival = t0.event_time
        )
        ;
        -- This is beyond repair, IMHO
UPDATE properties
SET prop_value = (
        SELECT (MAX(event_time) - ((SELECT prop_value FROM properties
        WHERE prop_key = 'DURATION.LIMIT.DAYS') ||' day')::interval)
        FROM events WHERE event_time >= (SELECT prop_value::timestamp FROM properties
        WHERE prop_key = 'DURATION.LIMIT.DATE')
        )
WHERE prop_key ='DURATION.LIMIT.DATE';

select * from durations;
select * from properties;
  • Related