Please help the query extract the number after "_tid-" from the Title column.
CodePudding user response:
Use the Vertica Regular Expression function collection. Regular expressions in Vertica correspond to the perl regex functionality in their behaviour.
So, with your input, search for the first group that follows _tid-
and consists of consecutive digits (\d
)...
WITH
-- your input, don't use in final query ...
indata(Title,Extract_ID) AS (
SELECT 'sdffsdvprocessortype:lnjklel&text=&textSearch=&pageSize=10&SSid=psj6tcd5b1g7_tid-87945','87945'
UNION ALL SELECT 'https://www.google.com/hk/en/age.html?SSid=ps_c8x4v1r2a3_tid-8952777456','8952777456'
UNION ALL SELECT 'https://www.google.com/hk/en/ge/dhci.html?SSid=ps_7228fk5sbh_tid-5879','5879'
UNION ALL SELECT 'https://www.google.com/fr/fr/c/3328412?q=%3:tSearch=&pageSize=10&SSid=pseydgg8h2_tid-9858867','9858867'
UNION ALL SELECT 'https://www.google.com/fr/fr/seas/1011028701?SSid=ps_yne5j6fmqv_tid-6879582','6879582'
UNION ALL SELECT 'https://www.google.com/il/en/sera/p/1010192786?SSid=ps_gydi5nk673_tid-5577484126','5577484126'
UNION ALL SELECT 'pid=ps_98qcokfh3_tid-548965&q=:relevance:facet_Processorstype','548965'
UNION ALL SELECT 'pid=ps_345ey5na9_tid-95861469rq=%3relevance:facet_Processo','95861469'
UNION ALL SELECT 'npyamjhsgx_tid-002154785 /p/1010192775?SSid=ps_npyamjhsgx_tid-002154785','002154785'
UNION ALL SELECT 'https://www.google.com/us/en/ke.html?ssid=ps_wc998kn__tid-0012889','0012889'
)
-- end of your input, real query starts here ...
SELECT
REGEXP_SUBSTR(
title -- input string
, '_tid-(\d )' -- regular experssion (note the bit in parentheses, that's the first group)
, 1 -- starting point
, 1 -- occurrence ordinal number
, '' -- modifier (case insensitive, etc. check perl docu)
, 1 -- parentheses base grouping expression's ordinal number
) AS calc_extract
, *
FROM indata;
-- out calc_extract | Title | Extract_ID
-- out -------------- ------------------------------------------------------------------------------------------------ ------------
-- out 87945 | sdffsdvprocessortype:lnjklel&text=&textSearch=&pageSize=10&SSid=psj6tcd5b1g7_tid-87945 | 87945
-- out 8952777456 | https://www.google.com/hk/en/age.html?SSid=ps_c8x4v1r2a3_tid-8952777456 | 8952777456
-- out 5879 | https://www.google.com/hk/en/ge/dhci.html?SSid=ps_7228fk5sbh_tid-5879 | 5879
-- out 9858867 | https://www.google.com/fr/fr/c/3328412?q=%3:tSearch=&pageSize=10&SSid=pseydgg8h2_tid-9858867 | 9858867
-- out 6879582 | https://www.google.com/fr/fr/seas/1011028701?SSid=ps_yne5j6fmqv_tid-6879582 | 6879582
-- out 5577484126 | https://www.google.com/il/en/sera/p/1010192786?SSid=ps_gydi5nk673_tid-5577484126 | 5577484126
-- out 548965 | pid=ps_98qcokfh3_tid-548965&q=:relevance:facet_Processorstype | 548965
-- out 95861469 | pid=ps_345ey5na9_tid-95861469rq=%3relevance:facet_Processo | 95861469
-- out 002154785 | npyamjhsgx_tid-002154785 /p/1010192775?SSid=ps_npyamjhsgx_tid-002154785 | 002154785
-- out 0012889 | https://www.google.com/us/en/ke.html?ssid=ps_wc998kn__tid-0012889 | 0012889