PLSQL: How do I count the occurrences of a substring in all values in a column-CodePudding

I have a table like this in an Oracle database:

substring	string
abc	123-def-abc
def	123-def
ghi	123-def-ghi
jkl	123-456-jkl
mno	123-456-jkl-mno

I need to count the occurrences of each substring in all the rows of the string column. An example result would be:

substring	string	count
abc	123-def-abc	1
def	123-def	3
ghi	123-def-ghi	1
jkl	123-456-jkl	2
mno	123-456-jkl-mno	1

How can I achieve this result?

CodePudding user response：

This might be one option.

For sample data

SQL> with test (substring, string) as
  2    (select 'abc', '123-def-abc' from dual union all
  3     select 'def', '123-def'     from dual union all
  4     select 'ghi', '123-def-ghi' from dual union all
  5     select 'jkl', '123-456-jkl' from dual union all
  6     select 'mno', '123-456-jkl-mno' from dual
  7    ),

check whether substring exists in string (and use cross join, as you have to check all combinations)

  8  temp as
  9    (select a.substring,
 10            case when instr(b.string, a.substring) > 1 then 1 else 0 end cnt
 11     from test a cross join test b
 12  --   group by a.substring
 13    )

Finally, return the result by joining the "original" table and the temp CTE:

 14  select a.substring, a.string, sum(b.cnt) cnt
 15  from test a join temp b on a.substring = b.substring
 16  group by a.substring, a.string
 17  order by a.substring;

SUB STRING                 CNT
--- --------------- ----------
abc 123-def-abc              1
def 123-def                  3
ghi 123-def-ghi              1
jkl 123-456-jkl              2
mno 123-456-jkl-mno          1

SQL>

CodePudding user response：

Few solutions: DBFiddle

cross apply (or lateral or cross join):

select *
from t 
    cross apply(
       select count(*) cnt
       from t t2 
       where t2.string like '%'||t.substring||'%'
    ) a
order by substring;

subquery:

select
  t.*
  ,(
       select count(*) cnt
       from t t2 
       where t2.string like '%'||t.substring||'%'
   ) cnt
from t
order by substring;

connect by:

select 
   substring,string,count(*)
from (
  select
     connect_by_root substring as substring
    ,connect_by_root string    as string
  from t
  connect by nocycle
     level<=2
     and string like '%'||(prior t.substring)||'%'
)
group by substring,string
order by substring;

model:

select 
  substring,string,cnt
from t
model
  dimension by (substring,string)
  measures(0 as cnt,string as string2)
  rules(
    cnt[any,any] order by substring = count(*)[any,string like '%'||cv()||'%']
  )
order by substring;

xmlquery xmlagg:

select--  NO_XML_QUERY_REWRITE
  substring,string,
  xmlcast(
         xmlquery(
           'count($D/ROW/VAL[contains(., $X)])'
           passing
              xmlelement("ROW", (xmlagg(xmlelement(VAL, string)) over())) as d, 
              substring as x
           returning content) as number) as cnt
from t
order by substring;

CodePudding user response：

This can be done with a CTE with 2 calls to the original tables, one using distinct to get the substrings and another to get all the occurrences of each string, cross-joined. We then use aggregation to count the number of occurrences of each match.

create table if not exists strings(
   substrin varchar(3),
   strin varchar(20)
   );
delete from strings;
insert into strings values('abc','123-def-abc');
insert into strings values('def','123-def');
insert into strings values('def','123-def');
insert into strings values('ghi','123-def-ghi');
insert into strings values('jkl','123-456-jkl');
insert into strings values('jkl','123-456-jkl');
insert into strings values('mno','123-456-jkl-mno');
with sss as
   ( select s1.substrin sb, s2.strin sg
   from (Select distinct substrin from strings ) s1,
   strings s2)
select
   sb 'substring',
   sg 'string',
   count(*) 'count'
from sss
where sg like concat('%',sb,'%')
group by sg,sb;

Note that this does not give the total number of lines which match the substring but the number of occurrences of each pair.