How can I split by a character without ignoring nulls?-CodePudding

I'm trying to use regexp_subst to split a delimited string. I'm running into an issue when delimited fields are null. The regexp_substr ignores the nulls and moves to the next occurrence of the delimiter. Is there a way to do this with regexp_substr? If not, what alternative do you use?

--Expecting hello, gets hello
select regexp_substr('hello@@world', '[^@] ', 1, 1)
from dual;

--Expecting null, gets world
select regexp_substr('hello@@world', '[^@] ', 1, 2)
from dual;

--Expecting world, gets null
select regexp_substr('hello@@world', '[^@] ', 1, 3)
from dual;

EDIT: tried this, but it works only with | which isn't an option

CodePudding user response：

Answering based on Matbailie's input in above comment

select regexp_substr('hello@@world', '(.*?)(@|$)', 1, 1,NULL,1)
from dual
union all
--Expecting null, gets null
select regexp_substr('hello@@world', '(.*?)(@|$)', 1, 2,NULL,1)
from dual
union all
--Expecting world, gets world
select regexp_substr('hello@@world', '(.*?)(@|$)', 1, 3,NULL,1)
from dual;

CodePudding user response：

You do not need regular expressions. It can be done with simple (and faster) string functions in a recursive sub-query:

WITH data (value) AS (
  SELECT 'hello@@world' FROM DUAL
),
bounds (value, start_pos, end_pos) AS (
  SELECT value,
         1,
         INSTR(value, '@', 1)
  FROM   data
UNION ALL
  SELECT value,
         end_pos   1,
         INSTR(value, '@', end_pos   1)
  FROM   bounds
  WHERE  end_pos > 0
)
SEARCH DEPTH FIRST BY value SET order_id
SELECT CASE end_pos
       WHEN 0
       THEN SUBSTR(value, start_pos)
       ELSE SUBSTR(value, start_pos, end_pos - start_pos)
       END AS item
FROM   bounds;

Which outputs:

ITEM

hello

null

world

ITEM
hello
null
world

Or, if you want the data in columns (rather than rows):

WITH data (value) AS (
  SELECT 'hello@@world' FROM DUAL
),
bounds (value, pos1, pos2) AS (
  SELECT value,
         INSTR(value, '@', 1, 1),
         INSTR(value, '@', 1, 2)
  FROM   data
)
SELECT SUBSTR(value, 1, pos1 - 1) AS item1,
       SUBSTR(value, pos1   1, pos2 - pos1 - 1) AS item2,
       SUBSTR(value, pos2   1) AS item3
FROM   bounds

Which outputs:

ITEM1 ITEM2 ITEM3

hello null world

ITEM1	ITEM2	ITEM3
hello	null	world

If you did want to use (slower) regular expressions then:

WITH data (value) AS (
  SELECT 'hello@@world' FROM DUAL
)
SELECT item
FROM   data d
       CROSS JOIN LATERAL(
         SELECT REGEXP_SUBSTR( d.value, '(.*?)(@|$)', 1, LEVEL, NULL, 1) AS item
         FROM   DUAL
         CONNECT BY LEVEL < REGEXP_COUNT( d.value, '(.*?)(@|$)')
       )

or, for columns:

WITH data (value) AS (
  SELECT 'hello@@world' FROM DUAL
)
SELECT REGEXP_SUBSTR(value, '(.*?)(@|$)', 1, 1, NULL, 1) AS item1,
       REGEXP_SUBSTR(value, '(.*?)(@|$)', 1, 2, NULL, 1) AS item2,
       REGEXP_SUBSTR(value, '(.*?)(@|$)', 1, 3, NULL, 1) AS item3
FROM   data

(Which both have the same output as above)

db<>fiddle here