I am working on the below regex. I need to extract the substring after "NO." but I have an issue with 4 and 5 ids below where the expected output is 10A - 20A and 10 - 20 but instead I am getting 10A and 10.
with test (id, col) as (
select 1, 'JOHN NO.10A 1234 TEST1' from dual union all
select 2, 'JOHN NO.10 1234 TEST1' from dual union all
select 3, 'JOHN NO.10A 1234 TEST1' from dual union all
select 4, 'JOHN NO.10A TEST1 - JOHN NO.20A TEST2' from dual union all
select 5, 'JOHN NO.10 TEST1 - JOHN NO.20 TEST2' from dual union all
select 6, 'JOHN NO.10A TEST1234' from dual union all
select 7, 'JOHN 10A TEST1234' from dual
)
select
id,
col,
regexp_substr(col, '(NO.)(\S*)(\s?)',1,1,'i',2) as qwe
from test
ID | COL | EXPECTED |
---|---|---|
1 | JOHN NO.10A 1234 TEST1 | 10A |
2 | JOHN NO.10 1234 TEST1 | 10 |
3 | JOHN NO.10A 1234 TEST1 | 10A |
4 | JOHN NO.10A TEST1 - JOHN NO.20A TEST2 | 10A - 20A |
5 | JOHN NO.10 TEST1 - JOHN NO.20 TEST2 | 10 - 20 |
6 | JOHN NO.10A TEST1234 | 10A |
7 | JOHN 10A TEST1234 | Null |
db<>fiddle here
CodePudding user response:
You could use a recursive CTE or hierarchical query to split out the multiple occurrences of the pattern you're looking for:
select
id,
col,
level as lvl,
regexp_substr(col, '(NO.)(\S*)(\s?)',1,level,'i',2) as qwe
from test
connect by id = prior id
and prior dbms_random.value is not null
and level <= regexp_count(col, '(NO.)(\S*)(\s?)')
ID | COL | LVL | QWE |
---|---|---|---|
1 | JOHN NO.10A 1234 TEST1 | 1 | 10A |
2 | JOHN NO.10 1234 TEST1 | 1 | 10 |
3 | JOHN NO.10A 1234 TEST1 | 1 | 10A |
4 | JOHN NO.10A TEST1 - JOHN NO.20A TEST2 | 1 | 10A |
4 | JOHN NO.10A TEST1 - JOHN NO.20A TEST2 | 2 | 20A |
5 | JOHN NO.10 TEST1 - JOHN NO.20 TEST2 | 1 | 10 |
5 | JOHN NO.10 TEST1 - JOHN NO.20 TEST2 | 2 | 20 |
6 | JOHN NO.10A TEST1234 | 1 | 10A |
7 | JOHN 10A TEST1234 | 1 | null |
... and then aggregate those back to a single string per ID, with a subquery:
select
id,
col,
listagg(qwe, ' - ') within group (order by lvl) as qwe
from (
select
id,
col,
level as lvl,
regexp_substr(col, '(NO.)(\S*)(\s?)',1,level,'i',2) as qwe
from test
connect by id = prior id
and prior dbms_random.value is not null
and level <= regexp_count(col, '(NO.)(\S*)(\s?)')
)
group by id, col
ID | COL | QWE |
---|---|---|
1 | JOHN NO.10A 1234 TEST1 | 10A |
2 | JOHN NO.10 1234 TEST1 | 10 |
3 | JOHN NO.10A 1234 TEST1 | 10A |
4 | JOHN NO.10A TEST1 - JOHN NO.20A TEST2 | 10A - 20A |
5 | JOHN NO.10 TEST1 - JOHN NO.20 TEST2 | 10 - 20 |
6 | JOHN NO.10A TEST1234 | 10A |
7 | JOHN 10A TEST1234 | null |
or directly:
-- with connect-by to get multiple values and listagg to join them
select
id,
col,
listagg(regexp_substr(col, '(NO.)(\S*)(\s?)',1,level,'i',2), ' - ')
within group (order by level) as qwe
from test
connect by id = prior id
and prior dbms_random.value is not null
and level <= regexp_count(col, '(NO.)(\S*)(\s?)')
group by id, col
ID | COL | QWE |
---|---|---|
1 | JOHN NO.10A 1234 TEST1 | 10A |
2 | JOHN NO.10 1234 TEST1 | 10 |
3 | JOHN NO.10A 1234 TEST1 | 10A |
4 | JOHN NO.10A TEST1 - JOHN NO.20A TEST2 | 10A - 20A |
5 | JOHN NO.10 TEST1 - JOHN NO.20 TEST2 | 10 - 20 |
6 | JOHN NO.10A TEST1234 | 10A |
7 | JOHN 10A TEST1234 | null |
You've tagged your query with multiple database versions; if you are really using 10g (or 11gR1) then listagg()
won't be available, but there are other options for string aggregation. Or if you know you will only have at most two occurrences, you could pull them out separately and concatenate them - that just needs work to only have the ' - '
connector if it's needed.
CodePudding user response:
with test (id, col) as (
select 1, 'JOHN NO.10A 1234 TEST1' from dual union all
select 2, 'JOHN NO.10 1234 TEST1' from dual union all
select 3, 'JOHN NO.10A 1234 TEST1' from dual union all
select 4, 'JOHN NO.10A TEST1 - JOHN NO.20A TEST2' from dual union all
select 5, 'JOHN NO.10 TEST1 - JOHN NO.20 TEST2' from dual union all
select 6, 'JOHN NO.10A TEST1234' from dual union all select 7, 'JOHN 10A TEST1234' from dual
),
ctest as (
select id, col, regexp_count(col, '(NO.)(\S*)(\s?)') as c
from test
)
select id, col, decode(c, 0, null, regexp_substr(col, '(NO.)(\S*)(\s?)', 1 , c, 'i' )) as qwe
from ctest
;