How to segregate a string into percentage component and normal text in Oracle?-CodePudding

Given a string like

73 % polyester, 20 % modacrylic, 7 % cotton

there should be 3 rows with 2 columns:

Percentage	Component
73	polyester
20	modacrylic
7	cotton

5,5% cotton, 20% modacrylic, 74,5 % polyester, min. 90 % recycled material

Output should have 3 rows:

Percentage	Component
5,5	cotton
20	modacrylic
74,5	polyester

If a string doesn't start with a numeric character then by default the Percentage Column should get value as 100 and component should get all the characters before Comma (',')

e.g.

Polyester fibre , 150 g/sq.m.

Percentage	Component
100	Polyester fibre

I have written following logic but it's not working as expected:

SELECT
       a.item_no,
       a.item_type,
       a.code_sup,
       a.type_sup,
       a.from_dtime,
       a.id,
       a.material_name,
       a.str AS component,
       b.str AS percentage
FROM
       ( SELECT
                item_no,
                item_type,
                code_sup,
                type_sup,
                from_dtime,
                id,
                material_name,
                level rowseq,
                regexp_substr(str,'[^/] ',1,ROWNUM) str
         FROM
                ( SELECT
                         '23456' item_no,
                         'PLASTIC' item_type,
                         '10121' code_sup,
                         'SUP' type_sup,
                         '27-Nov-2020' from_dtime,
                         '1.1' id,
                         '26,5 % polyester, min. 90% recycled, 67 % cotton' material_name,
                         level rowseq,
                         regexp_substr(replace('26,5 % polyester, min. 90% recycled, 67 % cotton','% /','%/'),'[^%] ',1,ROWNUM) str
                   FROM
                        dual
                   CONNECT BY
                        level <= regexp_count('26,5 % polyester, min. 90% recycled, 67 % cotton','[^%] ')
                 )
         WHERE
                 rowseq = 1
         CONNECT BY
                 level <= regexp_count(str,'[^/] ')     
       ) a,
       (
         SELECT
                item_no,
                item_type,
                code_sup,
                type_sup,
                from_dtime,
                id,
                material_name,
                ROWNUM rowseq,
                TRIM(regexp_substr(str,'[^/%] ',1,ROWNUM) ) str
         FROM
                ( SELECT
                         '23456' item_no,
                         'PLASTIC' item_type,
                         '10121' code_sup,
                         'SUP' type_sup,
                         '27-Nov-2020' from_dtime,
                         '1.1' id,
                         '26,5 % polyester, min. 90% recycled, 67 % cotton' material_name,
                         level rowseq,
                         regexp_substr(replace('26,5 % polyester, min. 90% recycled, 67 % cotton','% /','%/'),'[^%] ',1,ROWNUM) str
                  FROM
                         dual
                         CONNECT BY
                         level <= regexp_count('26,5 % polyester, min. 90% recycled, 67 % cotton','[^%] ')
                 )
          WHERE
                 rowseq = 2
                 CONNECT BY
                 level <= regexp_count(str,'[^/%] ')
       ) b
WHERE  a.rowseq = b.rowseq
AND    a.str IS NOT NULL
ORDER BY
       a.item_no,
       a.item_type,
       a.code_sup,
       a.type_sup,
       a.from_dtime,
       a.id ;

Here I am getting only 1 row with Percentage as 26,5 and Component as polyester , min. 90 , although I should get 2 rows

Percentage	Component
26,5	polyester
67	cotton

Please guide.

CodePudding user response：

If you consider your string as a csv then there is a way to split the elements from csv string and extract percentage and component. Comments inside code:

Select
    TRIM(SubStr(COL1, 1, InStr(COL1, '%') - 1)) "PERCENTAGE",
    TRIM(SubStr(COL1, InStr(COL1, '%')   1)) "COMPONENT"
FROM
    (
        SELECT  
            COL1
        FROM    
            (   
                SELECT 
                    INDX, 
                    MY_STR1, 
                    COL1_ELEMENTS, 
                    CASE WHEN SubStr(TRIM(COL1), 1, 1) IN('0','1','2','3','4','5','6','7','8','9') THEN COL1 ELSE '100 % ' || COL1 END "COL1"
                FROM    
                    (   
                        SELECT 
                            0 "INDX", 
                            COL1 "MY_STR1", 
                            COL1_ELEMENTS "COL1_ELEMENTS", 
                            COL1 "COL1"
                        FROM
                            (
                                SELECT
                                  REPLACE(COL1, DELIMITER || ' ', DELIMITER) "COL1",    
                                  Trim(Length(Replace(COL1, DELIMITER || ' ', DELIMITER))) - Trim(Length(Translate(REPLACE(COL1, DELIMITER || ' ', DELIMITER), 'A' || DELIMITER, 'A')))   1 "COL1_ELEMENTS"
                                FROM (SELECT 
                                        '73 % polyester, 20 % modacrylic, something else, 67 % cotton' "COL1", ',' "DELIMITER" -- here comes your string and delimiter, if delimiter is not comma (,) then you should replace ',' with your delimiter in RULES clause
                                      FROM DUAL)        
                            )
                    )
                MODEL       
                    DIMENSION BY(0 as INDX)
                    MEASURES(COL1, COL1_ELEMENTS, CAST('a' as VarChar2(4000)) as MY_STR1)
                    RULES ITERATE (10)      --UNTIL (ITERATION_NUMBER <= COL1_ELEMENTS[ITERATION_NUMBER   1]) -- If you don't know the number of elements this should be bigger then you aproximation. Othewrwise it will split given number of elements
                    (
                        COL1_ELEMENTS[ITERATION_NUMBER   1] = COL1_ELEMENTS[0],
                        MY_STR1[0] = COL1[CV()],
                        MY_STR1[ITERATION_NUMBER   1] = SubStr(MY_STR1[ITERATION_NUMBER], InStr(MY_STR1[ITERATION_NUMBER], ',', 1)   Length(',')),
                        COL1[ITERATION_NUMBER   1] = SubStr(MY_STR1[ITERATION_NUMBER], 1, CASE WHEN InStr(MY_STR1[ITERATION_NUMBER], ',') <> 0 THEN InStr(MY_STR1[ITERATION_NUMBER], ',')-1 ELSE Length(MY_STR1[ITERATION_NUMBER]) END)
                    )
            )
        WHERE INDX > 0 And INDX <= COL1_ELEMENTS
    )
--
-- Result:
--
-- PERCENTAGE   COMPONENT
-- 73           polyester
-- 20           modacrylic
-- 100          something else
-- 67           cotton

CodePudding user response：

You need to do your string transformation step-by-step and thoroughly.

Remove White space after comma and percent

replace(replace('26,5 % polyester, min. 90% recycled, 67 % cotton','%  ','% '),', ',',')

2.Using "," instead of "%" as a string delimiter,

regexp_substr(REGEXP_REPLACE(replace(replace('26,5 % polyester, min. 90% recycled, 67 % cotton','%  ','% '),', ',','), '(\d ),(\d )', '\1.\2'),'[^,] ',1,ROWNUM) str

3.Filter for both row 1 and 3 NOT JUST 1

    WHERE rowseq IN (1, 3)

NOT SURE why you need the nested query at line 13 and the join subquery "b" starting at line 46 (IT'S REDUNDUNT)
Do your component, percent split using the '%'

    REGEXP_REPLACE(a.str, '^(\d .*\d*)(%)([^,]*)$','\3') AS component,
    REGEXP_REPLACE(a.str, '^(\d .*\d*)(%)([^,]*)$','\1') AS percentage

And you will get..

SELECT
                    a.item_no,
                    a.item_type,
                    a.code_sup,
                    a.type_sup,
                    a.from_dtime,
                    a.id,
                    a.material_name,
                    a.rowseq,
                    REGEXP_REPLACE(a.str, '^(\d .*\d*)(%)([^,]*)$','\3') AS component,
                    REGEXP_REPLACE(a.str, '^(\d .*\d*)(%)([^,]*)$','\1') AS percentage                    
                FROM
                    (  SELECT
                    '23456' item_no,
                    'PLASTIC' item_type,
                    '10121' code_sup,
                    'SUP' type_sup,
                    '27-Nov-2020' from_dtime,
                    '1.1' id,
                   '26,5 % polyester, min. 90% recycled, 67 % cotton' material_name,
                    level rowseq,
                    regexp_substr(REGEXP_REPLACE(replace(replace('26,5 % polyester, min. 90% recycled, 67 % cotton','%  ','% '),', ',','), '(\d ),(\d )', '\1.\2'),'[^,] ',1,ROWNUM) str
                    
                FROM
                    dual
                CONNECT BY
                    level <= regexp_count(REGEXP_REPLACE(replace('26,5 % polyester, min. 90% recycled, 67 % cotton','% /','%/'), '(\d ),(\d )', '\1.\2'),'[^,] ')
                         
                    ) a
                WHERE rowseq IN (1, 3)
                ORDER BY
                    a.item_no,
                    a.item_type,
                    a.code_sup,
                    a.type_sup,
                    a.from_dtime,
                    a.id ;

For more complicated scenarios and robust code do the following. Of course you need to specify your transformation RULES carefully inside the WITH Clause, I'm Not a big fan of nested statements but the use case demands it.

var the_specification varchar2(500);
exec :the_specification := '5,5% cotton, 20% modacrylic, 74,5 % polyester, min. 90 % recycled material, Polyester fibre , 150 g/sq.m.';

WITH specstr as (
        SELECT
           REGEXP_REPLACE(
                REGEXP_REPLACE(
                  REGEXP_REPLACE( 
                      REGEXP_REPLACE( 
                          REGEXP_REPLACE(
                            REGEXP_REPLACE(
                              REGEXP_REPLACE(:the_specification,'\s*%','% '), -- remove white space tagging the percentage'%' sign
                            '(\d ),(\d )', '\1.\2'), -- Replace dot'.' as decimal separator
                          '(,*[^,]*recycl[^,]*,*)', ','), -- remove any additonal comment THE_RULE: consider 'recycl' as comment
                        '(^[[:alnum:][:space:]] ),|,([[:alnum:][:space:]] ),|,([[:alnum:][:space:]] $)', ',100% \1\2\3,'), -- Add the default Percentage Column '100%'
                      '\s{2,}', ' '), -- remove double white space
                    ',{2,}|\s,|,\s', ','), -- remove duplicate commas ',' or any trailing and tagging white space from comma
                  '^,*|,*$', '') refined_str -- remove comma from the start and end                
            FROM DUAL
)
SELECT REGEXP_REPLACE(mtrl_ratio.str, '^(\d ,*\d*)(%)([^,]*)$','\1') Percentage,
       REGEXP_REPLACE(mtrl_ratio.str, '^(\d ,*\d*)(%)([^,]*)$','\3') Component
FROM (
  SELECT        
        level rowseq,
        REGEXP_REPLACE(
          regexp_substr(
            specstr.refined_str
           ,'[^,] ',1,ROWNUM), -- split string using comma as delimiter
        '(\d )\.(\d )', '\1,\2') str -- Replace Back comma',' as decimal separator)
    FROM
        specstr
    CONNECT BY
        level <= regexp_count(
                  specstr.refined_str
                  ,'[^,] ')
)mtrl_ratio
WHERE REGEXP_LIKE(mtrl_ratio.str, '%')