Home > Software engineering >  Using table columns as a filter/regexp for another table
Using table columns as a filter/regexp for another table

Time:12-01

Need to figure out an efficient way to query a table using another table as a filter/config (postgres 14.5).

The filter table has 4 levels that match 4 levels in the data. Each level can be read as a RegExp with wildcard, where wildcard is null. But the rules are MUTUALLY EXCLUSIVE. meaning rule aa-* excludes the rule aa-ab-* and vice versa.

e.g. Filter Table

ID Category Level 1 Level 2 Level 3 Level 4
Rule1 A aa null null null
Rule2 A aa ab null null
Rule3 A ab null null null
Rule4 A ab ac aa null

In this case filter rule #1 is matching all aa-* data, except when it's aa-ab-* (rule #2) Similarly, rule #3 will match ab-* data, except ab-ac-aa-*

e.g. Data Table

Data Category Level 1 Level 2 Level 3 Level 4
Data1 A aa aa ac aa
Data2 A aa aa null null
Data3 A aa ab null null
Data4 A ab ab null null
Data5 A ab ac null null
Data6 A ab ac dd null

This way the join between the two should produce result like:

Data ID Rule ID
Data1 Rule1
Data2 Rule1
Data3 Rule2
Data4 Rule3
Data5 Rule3
Data6 Rule3

Approach toward combining all levels into a string with "level-combo" seem to be inefficient and has gaps. The join based on coalesce(lvl, '') is not helpful either, it's not addressing the wildcard rules.

where coalesce(a.lvl1,'') = coalesce(c.lvl1,'')
and coalesce(a.lvl2,'') = coalesce(c.lvl2,'')
and coalesce(a.lvl3,'') = coalesce(c.lvl3,'')
and coalesce(a.lvl4,'') = coalesce(c.lvl4,'')

DATA to replicate:

WITH config (id, category, lvl1, lvl2, lvl3, lvl4) AS (
  VALUES
   (1, 's', null,  null, null, null ),
   (2, 's', 'u7', null,  null, null ),
   (3, 's', 'u6', 'u1',  null, null ),
   (4, 's', 'u5', 'ud',  'u2', null ),
   (5, 's', 'u5', 'ud',  'u3', null ),
   (6, 's', 'u5', 'ud',  'u4', 'ok' ),
   (9, 's', 'u4', null,  null, null ),
   (7, 's', 'u4', 'u1',  'u2', 'u3' ),
   (8, 's', 'u4', 'cu',  'u2', null )
),
datum (id, data_id, internal_id, start_date, end_date, category, lvl1, lvl2, lvl3, lvl4) AS (
  VALUES
   (1,  'x1', '111',  '2022-01-01', '2022-12-01', 's', null,  null, null, null ),
   (2,  'x2', '112',  '2022-01-01', '2022-12-01', 's', 'u7', null,  null, null ),
   (3,  'x3', '113',  '2022-01-01', '2022-12-01', 's', 'u6', 'u1',  null, null ),
   (4,  'x4', '114',  '2022-01-01', '2022-12-01', 's', 'u5', 'ud',  'u2', null ),
   (5,  'x5', '115',  '2022-01-01', '2022-12-01', 's', 'u5', 'ud',  'u3', null ),
   (6,  'x6', '116',  '2022-01-01', '2022-12-01', 's', 'u5', 'ud',  'u4', 'ok' ),
   (9,  'x9', '119',  '2022-01-01', '2022-12-01', 's', 'u4', null,  null, null ),
   (7,  'x7', '117',  '2022-01-01', '2022-12-01', 's', 'u4', 'u1',  'u2', 'u3' ),
   (8,  'x8', '118',  '2022-01-01', '2022-12-01', 's', 'u4', 'cu',  'u2', null ),
   (9,  'x2', '112',  '2022-01-01', '2022-12-01', 's', 'u9', null,  null, null ),
   (10, 'x3', '113',  '2022-01-01', '2022-12-01', 's', 'u5', 'u1',  null, null ),
   (11, 'x4', '114',  '2022-01-01', '2022-12-01', 's', 'u5', 'dd',  'u2', null ),
   (12, 'x5', '115',  '2022-01-01', '2022-12-01', 's', 'u5', 'ud',  'u3', 'ck' ),
   (13, 'x6', '116',  '2022-01-01', '2022-12-01', 's', 'u5', 'ud',  'u4', 'no' )
  )
SELECT
   *
FROM config c
join datum d on c.category = d.category
and coalesce(c.lvl1, '') = coalesce(d.lvl1, '')
and ...  ;

CodePudding user response:

String concatenation maybe? (Note this is SSMS syntax, I don't know postgresql)

SELECT d.dataID, r.ruleID
FROM data d
LEFT JOIN rule r ON 

d.Level1   
COALESCE(d.Level2   '-')   
COALESCE(d.Level3   '-')   
COALESCE(d.Level4   '-') 

LIKE 
--% is any number of characters
'%'   r.Level1   
COALESCE(r.Level2   '-')   
COALESCE(r.Level3   '-')   
COALESCE(r.Level4   '-')   '%'

This will return all matches, so you'll also need to order your rules by priority, group the matches by dataID and return the minimum rule.

CodePudding user response:

This will only work if id is unique.

your second datum table has a double 9

This will sum up all hits between both tables and the highest score for a c id wins

WITH config (id, category, lvl1, lvl2, lvl3, lvl4) AS (
  VALUES
   (1, 's', null,  null, null, null ),
   (2, 's', 'u7', null,  null, null ),
   (3, 's', 'u6', 'u1',  null, null ),
   (4, 's', 'u5', 'ud',  'u2', null ),
   (5, 's', 'u5', 'ud',  'u3', null ),
   (6, 's', 'u5', 'ud',  'u4', 'ok' ),
   (9, 's', 'u4', null,  null, null ),
   (7, 's', 'u4', 'u1',  'u2', 'u3' ),
   (8, 's', 'u4', 'cu',  'u2', null )
),
datum (id, data_id, internal_id, start_date, end_date, category, lvl1, lvl2, lvl3, lvl4) AS (
  VALUES
   (1,  'x1', '111',  '2022-01-01', '2022-12-01', 's', null,  null, null, null ),
   (2,  'x2', '112',  '2022-01-01', '2022-12-01', 's', 'u7', null,  null, null ),
   (3,  'x3', '113',  '2022-01-01', '2022-12-01', 's', 'u6', 'u1',  null, null ),
   (4,  'x4', '114',  '2022-01-01', '2022-12-01', 's', 'u5', 'ud',  'u2', null ),
   (5,  'x5', '115',  '2022-01-01', '2022-12-01', 's', 'u5', 'ud',  'u3', null ),
   (6,  'x6', '116',  '2022-01-01', '2022-12-01', 's', 'u5', 'ud',  'u4', 'ok' ),
   (7,  'x9', '119',  '2022-01-01', '2022-12-01', 's', 'u4', null,  null, null ),
   (8,  'x7', '117',  '2022-01-01', '2022-12-01', 's', 'u4', 'u1',  'u2', 'u3' ),
   (9,  'x8', '118',  '2022-01-01', '2022-12-01', 's', 'u4', 'cu',  'u2', null ),
   (10,  'x2', '112',  '2022-01-01', '2022-12-01', 's', 'u9', null,  null, null ),
   (11, 'x3', '113',  '2022-01-01', '2022-12-01', 's', 'u5', 'u1',  null, null ),
   (12, 'x4', '114',  '2022-01-01', '2022-12-01', 's', 'u5', 'dd',  'u2', null ),
   (13, 'x5', '115',  '2022-01-01', '2022-12-01', 's', 'u5', 'ud',  'u3', 'ck' ),
   (14, 'x6', '116',  '2022-01-01', '2022-12-01', 's', 'u5', 'ud',  'u4', 'no' )
  ),
  SUM_lvl as (
SELECT
   c.id as C_id,d.id as d_id,
  (coalesce(c.lvl1, '')  = coalesce(d.lvl1, ''))::int  
    (coalesce(c.lvl2, '')  = coalesce(d.lvl2, ''))::int  
  (coalesce(c.lvl3, '')  = coalesce(d.lvl3, ''))::int  
    (coalesce(c.lvl3, '')  = coalesce(d.lvl3, ''))::int  as level
FROM config c
join datum d on c.category = d.category)
, selhigh as (SELECT 
c_id,d_id,level,
  ROW_NUMBER() OVER(PARTITION BY c_id ORDER BY level DESC,d_id ASC) rn
FROM SUM_lvl)
SELECT c_id,d_id FROM selhigh WHERE rn = 1


c_id d_id
1 1
2 2
3 3
4 4
5 5
6 6
7 8
8 9
9 7
SELECT 9

fiddle

  • Related