I have a column value like:
Input:
This is ssn12345 string dob 12-09-1998 string
This is ssn12345786 stringabc string
Output:
This is ******** string ************** string
This is *********** stringabc string
whenever we found ssn or dob the value should be get masked as shown above example. string of dynamic length.
I am trying below query -
SELECT CASE
WHEN start_pos > 0
THEN SUBSTR( 'This is ssn12345 string dob 12-09-1998 string', 1, start_pos - 1)
|| RPAD('*', end_pos - start_pos, '*')
|| SUBSTR('This is ssn12345 string dob 12-09-1998 string', end_pos)
ELSE 'This is ssn12345 string dob 12-09-1998 string'
END AS masked_value
FROM (
SELECT 'This is ssn12345 string dob 12-09-1998 string',
REGEXP_INSTR( 'This is ssn12345 string dob 12-09-1998 string', 'ssn[0-9. -] ' || 'dob[0-9. -] ', 1, 1, 0, NULL) AS start_pos,
REGEXP_INSTR( 'This is ssn12345 string dob 12-09-1998 string', 'ssn[0-9. -] ' || 'dob[0-9. -] ', 1, 1, 1, NULL) AS end_pos
FROM dual
)
I am trying above query not getting as expected result as above
it is working only when we have -
input like - This is ssn12345 dob 12-09-1998 string
then it masking
output as - This is ******** ************** string
CodePudding user response:
From your previous question, just change the regular expression you are matching:
WITH data ( value, masked_value, end_pos ) AS (
SELECT value,
REGEXP_REPLACE(
value,
'ssn([ #]?\d ) |dob ?(-?\d ) ',
RPAD(
'*',
LENGTH(
REGEXP_SUBSTR(value, 'ssn([ #]?\d ) |dob ?(-?\d ) ', 1, 1)
),
'*'
),
1,
1
),
REGEXP_INSTR(
value,
'ssn([ #]?\d ) |dob ?(-?\d ) ',
1,
1,
1
)
FROM table_name
UNION ALL
SELECT value,
REGEXP_REPLACE(
masked_value,
'ssn([ #]?\d ) |dob ?(-?\d ) ',
RPAD(
'*',
LENGTH(
REGEXP_SUBSTR(masked_value, 'ssn([ #]?\d ) |dob ?(-?\d ) ', end_pos, 1)
),
'*'
),
end_pos,
1
),
REGEXP_INSTR(
masked_value,
'ssn([ #]?\d ) |dob ?(-?\d ) ',
end_pos,
1,
1
)
FROM data
WHERE end_pos > 0
)
SEARCH DEPTH FIRST BY value SET value_order
SELECT value, masked_value
FROM data
WHERE end_pos = 0;
Which, for the sample data:
CREATE TABLE table_name ( value ) AS
SELECT 'This is ssn1 string' FROM DUAL UNION ALL
SELECT 'This is ssn12 string' FROM DUAL UNION ALL
SELECT 'This is ssn12345 string' FROM DUAL UNION ALL
SELECT 'This is ssn123456789 string' FROM DUAL UNION ALL
SELECT 'This is ssn#12345 string ssn 5678 9765' FROM DUAL UNION ALL
SELECT 'This is ssn12345 string dob 12-09-1998 string' FROM DUAL UNION ALL
SELECT 'This is dob 12-09-1998 string' FROM DUAL;
Outputs:
VALUE MASKED_VALUE This is dob 12-09-1998 string This is ************** string This is ssn#12345 string ssn 5678 9765 This is ********* string ************* This is ssn1 string This is **** string This is ssn12 string This is ***** string This is ssn12345 string This is ******** string This is ssn12345 string dob 12-09-1998 string This is ******** string ************** string This is ssn123456789 string This is ************ string
db<>fiddle here
CodePudding user response:
If you want to replace a fixed number of occurrences of the pattern then you can use:
SELECT value,
CASE
WHEN start_pos2 > 0
THEN SUBSTR( value, 1, start_pos1 - 1)
|| RPAD('*', end_pos1 - start_pos1 - 1, '*')
|| SUBSTR(value, end_pos1, start_pos2 - end_pos1)
|| RPAD('*', end_pos2 - start_pos2, '*')
|| SUBSTR(value, end_pos2)
WHEN start_pos1 > 0
THEN SUBSTR( value, 1, start_pos1 - 1)
|| RPAD('*', end_pos1 - start_pos1, '*')
|| SUBSTR(value, end_pos1)
ELSE value
END AS masked_value
FROM (
SELECT value,
REGEXP_INSTR( value, 'ssn([ #]*\d ) |dob([. -]*\d ) ', 1, 1, 0, NULL) AS start_pos1,
REGEXP_INSTR( value, 'ssn([ #]*\d ) |dob([. -]*\d ) ', 1, 1, 1, NULL) AS end_pos1,
REGEXP_INSTR( value, 'ssn([ #]*\d ) |dob([. -]*\d ) ', 1, 2, 0, NULL) AS start_pos2,
REGEXP_INSTR( value, 'ssn([ #]*\d ) |dob([. -]*\d ) ', 1, 2, 1, NULL) AS end_pos2
FROM table_name
);
Note: this will only work for the first two occurrences in the string. If you need to match a dynamic number of occurrences then you will need to use a recursive query.
Which, for the sample data:
CREATE TABLE table_name ( value ) AS
SELECT 'This is ssn1 string' FROM DUAL UNION ALL
SELECT 'This is ssn12 string' FROM DUAL UNION ALL
SELECT 'This is ssn12345 string' FROM DUAL UNION ALL
SELECT 'This is ssn123456789 string' FROM DUAL UNION ALL
SELECT 'This is ssn#12345 string ssn 5678 9765' FROM DUAL UNION ALL
SELECT 'This is ssn12345 string dob 12-09-1998 string' FROM DUAL UNION ALL
SELECT 'This is dob 12-09-1998 string' FROM DUAL;
Outputs:
VALUE MASKED_VALUE This is ssn1 string This is **** string This is ssn12 string This is ***** string This is ssn12345 string This is ******** string This is ssn123456789 string This is ************ string This is ssn#12345 string ssn 5678 9765 This is ******** string ************* This is ssn12345 string dob 12-09-1998 string This is ******* string ************** string This is dob 12-09-1998 string This is ************** string
db<>fiddle here
CodePudding user response:
This works but it replaces any matched string with a fixed number of asterixes.
WITH sample_data (c) AS
(
SELECT 'This is ssn12. 34-5 string dob 12-09-1998 string' FROM DUAL UNION ALL
SELECT '5 Strings: ssn12. 34-5, dob 12-09-1998, nr 3: ssn18898, nr 4: dob 01-01-2000, nr 5:ssn1663' FROM DUAL UNION ALL
SELECT 'This is ssn12345786 stringabc string' FROM DUAL
)
SELECT regexp_replace(c,'((ssn[0-9. -] )|(dob [0-9. -] ))[0-9.-]','*******') FROM sample_data;
This is ******* string ******* string
5 Strings: *******, *******, nr 3: *******, nr 4: *******, nr 5:*******
This is ******* stringabc string