Oracle Regex query-CodePudding

I am working on the below regex. I need to extract the substring after "NO." but I have an issue with 4 and 5 ids below where the expected output is 10A - 20A and 10 - 20 but instead I am getting 10A and 10.

with test (id, col) as (
  select 1, 'JOHN NO.10A 1234 TEST1'  from dual union all
  select 2, 'JOHN NO.10 1234 TEST1'  from dual union all
  select 3, 'JOHN NO.10A 1234 TEST1'  from dual union all
  select 4, 'JOHN NO.10A TEST1 - JOHN NO.20A TEST2'      from dual union all
  select 5, 'JOHN NO.10 TEST1 - JOHN NO.20 TEST2'      from dual union all
  select 6, 'JOHN NO.10A TEST1234'  from dual union all
   select 7, 'JOHN 10A TEST1234'   from dual
)
select
  id,
  col,
  regexp_substr(col, '(NO.)(\S*)(\s?)',1,1,'i',2) as qwe
from test

ID	COL	EXPECTED
1	JOHN NO.10A 1234 TEST1	10A
2	JOHN NO.10 1234 TEST1	10
3	JOHN NO.10A 1234 TEST1	10A
4	JOHN NO.10A TEST1 - JOHN NO.20A TEST2	10A - 20A
5	JOHN NO.10 TEST1 - JOHN NO.20 TEST2	10 - 20
6	JOHN NO.10A TEST1234	10A
7	JOHN 10A TEST1234	Null

db<>fiddle here

CodePudding user response：

You could use a recursive CTE or hierarchical query to split out the multiple occurrences of the pattern you're looking for:

select
  id,
  col,
  level as lvl,
  regexp_substr(col, '(NO.)(\S*)(\s?)',1,level,'i',2) as qwe
from test
connect by id = prior id
and prior dbms_random.value is not null
and level <= regexp_count(col, '(NO.)(\S*)(\s?)')

ID	COL	LVL	QWE
1	JOHN NO.10A 1234 TEST1	1	10A
2	JOHN NO.10 1234 TEST1	1	10
3	JOHN NO.10A 1234 TEST1	1	10A
4	JOHN NO.10A TEST1 - JOHN NO.20A TEST2	1	10A
4	JOHN NO.10A TEST1 - JOHN NO.20A TEST2	2	20A
5	JOHN NO.10 TEST1 - JOHN NO.20 TEST2	1	10
5	JOHN NO.10 TEST1 - JOHN NO.20 TEST2	2	20
6	JOHN NO.10A TEST1234	1	10A
7	JOHN 10A TEST1234	1	null

... and then aggregate those back to a single string per ID, with a subquery:

select
  id,
  col,
  listagg(qwe, ' - ') within group (order by lvl) as qwe
from (
  select
    id,
    col,
    level as lvl,
    regexp_substr(col, '(NO.)(\S*)(\s?)',1,level,'i',2) as qwe
  from test
  connect by id = prior id
  and prior dbms_random.value is not null
  and level <= regexp_count(col, '(NO.)(\S*)(\s?)')
)
group by id, col

ID	COL	QWE
1	JOHN NO.10A 1234 TEST1	10A
2	JOHN NO.10 1234 TEST1	10
3	JOHN NO.10A 1234 TEST1	10A
4	JOHN NO.10A TEST1 - JOHN NO.20A TEST2	10A - 20A
5	JOHN NO.10 TEST1 - JOHN NO.20 TEST2	10 - 20
6	JOHN NO.10A TEST1234	10A
7	JOHN 10A TEST1234	null

or directly:

-- with connect-by to get multiple values and listagg to join them
select
  id,
  col,
  listagg(regexp_substr(col, '(NO.)(\S*)(\s?)',1,level,'i',2), ' - ')
    within group (order by level) as qwe
from test
connect by id = prior id
and prior dbms_random.value is not null
and level <= regexp_count(col, '(NO.)(\S*)(\s?)')
group by id, col

ID	COL	QWE
1	JOHN NO.10A 1234 TEST1	10A
2	JOHN NO.10 1234 TEST1	10
3	JOHN NO.10A 1234 TEST1	10A
4	JOHN NO.10A TEST1 - JOHN NO.20A TEST2	10A - 20A
5	JOHN NO.10 TEST1 - JOHN NO.20 TEST2	10 - 20
6	JOHN NO.10A TEST1234	10A
7	JOHN 10A TEST1234	null

fiddle

You've tagged your query with multiple database versions; if you are really using 10g (or 11gR1) then listagg() won't be available, but there are other options for string aggregation. Or if you know you will only have at most two occurrences, you could pull them out separately and concatenate them - that just needs work to only have the ' - ' connector if it's needed.

CodePudding user response：

with test (id, col) as (
    select 1, 'JOHN NO.10A 1234 TEST1' from dual union all 
    select 2, 'JOHN NO.10 1234 TEST1' from dual union all 
    select 3, 'JOHN NO.10A 1234 TEST1' from dual union all 
    select 4, 'JOHN NO.10A TEST1 - JOHN NO.20A TEST2' from dual union all 
    select 5, 'JOHN NO.10 TEST1 - JOHN NO.20 TEST2' from dual union all 
    select 6, 'JOHN NO.10A TEST1234' from dual union all select 7, 'JOHN 10A TEST1234' from dual 
),
ctest as (
    select id, col, regexp_count(col, '(NO.)(\S*)(\s?)') as c
    from test
)
select id, col, decode(c, 0, null, regexp_substr(col, '(NO.)(\S*)(\s?)', 1 , c, 'i' ))  as qwe
from ctest
;