I am trying to split up and identify some IDs, which are poorly constructed manually.
Input (left) and expected output (right):
- '145.100, 150.200' => '145.100', '150.20'
- '160.10A' => '160.10A'
- '145.100, 150.200' => '145.100', '150.200'
- '10.545' => '10.545'
- '132.134, .9999, .456' => '132.134, '132.9999', '132.456'
- '121.111, .6577, .88A, 112.4444' => '121.111', '121.6577', '121.88A', '112.4444'
#6 is for me the tricky one, and i can't figure out how to solve it. The rest i have solved by dividing the concatenated ID in 1st half (before dot) and 2nd half and then putting them together after.
Small example within postgres is the following:
WITH
tmp AS
(
SELECT '145.100, 150.200' AS dgunr
UNION ALL
SELECT '160.10A'
UNION ALL
SELECT ' 10 . 545'
UNION ALL
SELECT '132.134, .6577, .456'
UNION ALL
SELECT '121.111, .6577, .88c, .999, 112.4444'
),
h_dgu_re AS
(
SELECT
dgunr,
',' || REPLACE(regexp_replace(upper(dgunr), '\s', '', 'g'), '.', '_') || ',' AS dgunr_re
FROM tmp
),
h_dgu_devide AS
(
SELECT
dgunr,
dgunr_re,
(REGEXP_MATCHES(dgunr_re, '(,)([a-zA-Z0-9] )(_)([a-zA-Z0-9] )', 'g')) AS dguall,
(REGEXP_MATCHES(dgunr_re, '(,)([a-zA-Z0-9] )', 'g'))[2] AS dgu1,
(REGEXP_MATCHES(dgunr_re, '(_)([a-zA-Z0-9] )', 'g'))[2] AS dgu2,
(REGEXP_MATCHES(dgunr_re, '(,_)([a-zA-Z0-9] )', 'g'))[2] AS dgu3
FROM h_dgu_re
)
SELECT
dgunr,
dgunr_re,
dguall[2] || '.' || dguall[4],
dgu1,
dgu3
FROM h_dgu_devide
;
The sql output:
dgunr | dgunr_re | dgu_concat | dgu1 | dgu2 | dgu3 |
---|---|---|---|---|---|
160.10A | ,160_10A, | 160.10A | 160 | 10A | |
10 . 545 | ,10_545, | 10.545 | 10 | 545 | |
132.134, .6577, .456 | ,132_134,_6577,_456, | 132.134 | 132 | 134 | 6577 |
132.134, .6577, .456 | ,132_134,_6577,_456, | 6577 | 456 | ||
132.134, .6577, .456 | ,132_134,_6577,_456, 456 | ||||
121.111, .6577, .88c, .999, 112.4444 | ,121_111,_6577,_88C,_999,112_4444, | 121.111 | 121 | 111 | 6577 |
121.111, .6577, .88c, .999, 112.4444 | ,121_111,_6577,_88C,_999,112_4444, | 112.4444 | 112 | 6577 | 88C |
121.111, .6577, .88c, .999, 112.4444 | ,121_111,_6577,_88C,_999,112_4444, | 88C | 999 | ||
121.111, .6577, .88c, .999, 112.4444 | ,121_111,_6577,_88C,_999,112_4444, | 999 | |||
121.111, .6577, .88c, .999, 112.4444 | ,121_111,_6577,_88C,_999,112_4444, | 4444 |
CodePudding user response:
So far this (not so elegant solution) fixed my problem
WITH
tmp AS
(
SELECT '145.100, 150.200' AS dgunr
UNION ALL
SELECT '160.10A'
UNION ALL
SELECT ' 10 . 545'
UNION ALL
SELECT '132. 134, .3456, .456'
UNION ALL
SELECT '121.111, .6577, .88c, .999, 112.4444'
UNION ALL
SELECT '10.12B, .7b, .8b, 11.6B, .8E, .9B, 144.4F'
),
tmp2 AS
(
SELECT
ROW_NUMBER() OVER () AS object_num,
upper(dgunr) AS dgunr
FROM tmp
),
tmp3 AS
(
SELECT
object_num,
dgunr,
REGEXP_MATCHES(dgunr, '([0-9]*)(\s*\.\s*)([A-Z0-9] )', 'g') AS dguall
FROM tmp2
),
tmp4 AS
(
SELECT
dgunr,
object_num,
ROW_NUMBER() OVER (PARTITION BY object_num) AS version_num,
NULLIF(dguall[1], '') AS dgu1,
dguall[3] AS dgu2
FROM tmp3
),
tmp5 AS
(
SELECT
dgunr, object_num, version_num,
COALESCE(
dgu1,
LAG(dgu1, 1) OVER (PARTITION BY object_num ORDER BY version_num),
LAG(dgu1, 2) OVER (PARTITION BY object_num ORDER BY version_num),
LAG(dgu1, 3) OVER (PARTITION BY object_num ORDER BY version_num),
LAG(dgu1, 4) OVER (PARTITION BY object_num ORDER BY version_num),
LAG(dgu1, 5) OVER (PARTITION BY object_num ORDER BY version_num),
LAG(dgu1, 6) OVER (PARTITION BY object_num ORDER BY version_num),
LAG(dgu1, 7) OVER (PARTITION BY object_num ORDER BY version_num),
LAG(dgu1, 8) OVER (PARTITION BY object_num ORDER BY version_num),
LAG(dgu1, 9) OVER (PARTITION BY object_num ORDER BY version_num),
LAG(dgu1, 10) OVER (PARTITION BY object_num ORDER BY version_num),
LAG(dgu1, 11) OVER (PARTITION BY object_num ORDER BY version_num),
LAG(dgu1, 12) OVER (PARTITION BY object_num ORDER BY version_num),
LAG(dgu1, 13) OVER (PARTITION BY object_num ORDER BY version_num),
LAG(dgu1, 14) OVER (PARTITION BY object_num ORDER BY version_num),
LAG(dgu1, 15) OVER (PARTITION BY object_num ORDER BY version_num),
LAG(dgu1, 16) OVER (PARTITION BY object_num ORDER BY version_num),
LAG(dgu1, 17) OVER (PARTITION BY object_num ORDER BY version_num),
LAG(dgu1, 18) OVER (PARTITION BY object_num ORDER BY version_num),
LAG(dgu1, 19) OVER (PARTITION BY object_num ORDER BY version_num),
LAG(dgu1, 20) OVER (PARTITION BY object_num ORDER BY version_num)
) AS dgu1,
dgu2
FROM tmp4
ORDER BY object_num, version_num
)
SELECT
dgunr, dgu1 || '.' || dgu2 AS dgunr_edit
FROM tmp5
;
CodePudding user response:
It's not entirely clear to me if you want the results as separate columns or an array. Here is how I would approach getting the results into separate columns. First, the regex approach seems overly complicated. I would remove all spaces and use string_to_array to separate each element into an array. Then, just check to see if the element starts with '.' or not and concatenate as needed:
WITH
tmp AS
(
SELECT '145.100, 150.200' AS dgunr
UNION ALL
SELECT '160.10A'
UNION ALL
SELECT ' 10 . 545'
UNION ALL
SELECT '132.134, .6577, .456'
UNION ALL
SELECT '121.111, .6577, .88c, .999, 112.4444'
),
id_array AS (
SELECT dgunr,
string_to_array(replace(dgunr, ' ', ''), ',') as id_array
FROM tmp
)
SELECT dgunr,
id_array[1] as dg1,
CASE WHEN starts_with(id_array[2], '.') THEN split_part(id_array[1], '.', 1) || id_array[2] ELSE id_array[2] END as dgu2,
CASE WHEN starts_with(id_array[3], '.') THEN split_part(id_array[1], '.', 1) || id_array[3] ELSE id_array[3] END as dgu2,
CASE WHEN starts_with(id_array[4], '.') THEN split_part(id_array[1], '.', 1) || id_array[4] ELSE id_array[4] END as dgu3,
CASE WHEN starts_with(id_array[5], '.') THEN split_part(id_array[1], '.', 1) || id_array[5] ELSE id_array[5] END as dgu4
FROM id_array;
dgunr | dg1 | dgu2 | dgu2 | dgu3 | dgu4
-------------------------------------- --------- ---------- --------- --------- ----------
145.100, 150.200 | 145.100 | 150.200 | | |
160.10A | 160.10A | | | |
10 . 545 | 10.545 | | | |
132.134, .6577, .456 | 132.134 | 132.6577 | 132.456 | |
121.111, .6577, .88c, .999, 112.4444 | 121.111 | 121.6577 | 121.88c | 121.999 | 112.4444
(5 rows)