Home > OS >  Multiple columns matching in where clause
Multiple columns matching in where clause

Time:02-17

I have four columns (Col1, Col2, Col3, Col4) in a table with some millions records in SQL Server 2019.

In stored procedure, I have to pass four input parameters @Col1, @Col2, @Col3, @Col4 and it should return success/failure whether all four values found, irrespective of column sequence. For example @Col1 can match with Col2.

Some values can be null in Col2, Col3, Col4 but Col1 will always have some data in it.

I have prepared some sample data and scenarios that I have tested.

CREATE TABLE SampleData(Id INT IDENTITY(1,1), Col1 VARCHAR(20), Col2 VARCHAR(20), Col3 VARCHAR(20), Col4 VARCHAR(20))

INSERT INTO SampleData(Col1, Col2, Col3, Col4)
SELECT 'ABC','DEF','GHI','JKL' UNION 
SELECT '123','456','789','100' UNION 
SELECT 'ABC','XYZ','','' UNION 
SELECT '9898','6565',NULL,NULL UNION 
SELECT '989844','D656555','','' UNION 
SELECT '8888','9999','7777','6666' UNION 
SELECT '1234','5678','4321',NULL UNION
SELECT '465456465',NULL,NULL,NULL   

 

The Stored Procedure

CREATE PROC dbo.ValidateSampleData(
 @Col1 VARCHAR(20) = NULL
,@Col2 VARCHAR(20) = NULL
,@Col3 VARCHAR(20) = NULL
,@Col4 VARCHAR(20) = NULL
)
AS
BEGIN
    
    Declare @a as bit = 0, @Message VARCHAR(50) = 'Data Not Matched'
    if(@Col1 is NULL or @Col2 is NULL or @Col3 is NULL or @Col4 is NULL )
    begin
        set @a = 1
    end
 
    SELECT  @Message = 'Data Matched ' 
    FROM    SampleData SD
    where   (Col1 in (@Col1,@Col2,@Col3,@Col4) or (Col1 is null and @a = 1))
    and     (Col2 in (@Col1,@Col2,@Col3,@Col4) or (Col2 is null and @a = 1))
    and     (Col3 in (@Col1,@Col2,@Col3,@Col4) or (Col3 is null and @a = 1))
    and     (Col4 in (@Col1,@Col2,@Col3,@Col4) or (Col4 is null and @a = 1))

    and     (select sum(    (case when Col1 is null then 1 else 0 end)
                        (case when Col2 is null then 1 else 0 end)
                        (case when Col3 is null then 1 else 0 end)
                        (case when Col4 is null then 1 else 0 end)
                    ) from SampleData where Id = SD.Id) = 
            (select sum(    (case when @Col1 is null then 1 else 0 end)
                    (case when @Col2 is null then 1 else 0 end)
                    (case when @Col3 is null then 1 else 0 end)
                    (case when @Col4 is null then 1 else 0 end))) 

    SELECT @Message
END

Here is some sample set of data that I tried

DECLARE 
@Col1 VARCHAR(20) = NULL
,@Col2 VARCHAR(20) = NULL
,@Col3 VARCHAR(20) = NULL
,@Col4 VARCHAR(20) = NULL 



--Case 0  - Should not matched 
SELECT @Col1 = 'ABC', @Col2 = 'XYZ' , @Col3 = 'testtest' , @Col4 = ''  
EXEC ValidateSampleData @Col1=@Col1, @Col2=@Col2, @Col3=@Col3, @Col4=@Col4  

--Case 1 
SELECT @Col1 = 'ABC', @Col2 = 'DEF' , @Col3 = 'GHI' , @Col4 = 'JKL'  
EXEC ValidateSampleData @Col1=@Col1, @Col2=@Col2, @Col3=@Col3, @Col4=@Col4  

--Case 2
SELECT @Col1 = 'DEF', @Col2 = 'JKL' , @Col3 = 'ABC' , @Col4 = 'GHI' 
EXEC ValidateSampleData @Col1=@Col1, @Col2=@Col2, @Col3=@Col3, @Col4=@Col4  

--Case 3 
SELECT @Col1 = '123', @Col2 = '456' , @Col3 = '789' , @Col4 = '100'  
EXEC ValidateSampleData @Col1=@Col1, @Col2=@Col2, @Col3=@Col3, @Col4=@Col4  

--Case 4 
SELECT @Col1 = '1234', @Col2 = '5678' , @Col3 = '4321' , @Col4 = NULL  
EXEC ValidateSampleData @Col1=@Col1, @Col2=@Col2, @Col3=@Col3, @Col4=@Col4  

--Case 5 
SELECT @Col1 = '465456465', @Col2 = NULL , @Col3 = NULL , @Col4 = NULL  
EXEC ValidateSampleData @Col1=@Col1, @Col2=@Col2, @Col3=@Col3, @Col4=@Col4  

--Case 6 
SELECT @Col1 = '8888', @Col2 = '9999' , @Col3 = '7777' , @Col4 = '6666'  
EXEC ValidateSampleData @Col1=@Col1, @Col2=@Col2, @Col3=@Col3, @Col4=@Col4  

--Case 7 
SELECT @Col1 = '9898', @Col2 = '6565' , @Col3 = NULL , @Col4 = NULL
EXEC ValidateSampleData @Col1=@Col1, @Col2=@Col2, @Col3=@Col3, @Col4=@Col4  

--Case 8 
SELECT @Col1 = '989844', @Col2 = 'D656555' , @Col3 = '' , @Col4 = '' 
EXEC ValidateSampleData @Col1=@Col1, @Col2=@Col2, @Col3=@Col3, @Col4=@Col4  

--Case 9   
SELECT @Col1 = 'ABC', @Col2 = 'XYZ' , @Col3 = '' , @Col4 = ''  
EXEC ValidateSampleData @Col1=@Col1, @Col2=@Col2, @Col3=@Col3, @Col4=@Col4  
 
 
--Case 10  - Should not matched
SELECT @Col1 = 'ABC', @Col2 = 'XYZ' , @Col3 = 'tet' , @Col4 = ''  
EXEC ValidateSampleData @Col1=@Col1, @Col2=@Col2, @Col3=@Col3, @Col4=@Col4  
 

I have achieved desired result for some cases but its incorrect with NULL values in table. Case 0 should not matched

Also it doesn't look optimized and very slow on large set of data.

In simple words, I want to check if all 4 values appear in the table, with NULL matching NULL? If they do, return a match, if not, a non match?

CodePudding user response:

The way I would approach is is by first creating a table with all your possible combinations of the 4 values. This can be done by creating a derived table with 4 rows, then joining it to itself 4 times, with each join ensuring that you are not picking a row that has already been picked, i.e.

DECLARE @Values TABLE
(
    Col1 VARCHAR(20),
    Col2 VARCHAR(20),
    Col3 VARCHAR(20),
    Col4 VARCHAR(20)
    UNIQUE (Col1, Col2, Col3, Col4)
);

WITH Data AS
(   SELECT  Value, v.Ordinal
    FROM    (VALUES (1, @Col1), (2, @Col2), (3, @Col3), (4, @Col4)) AS v (Ordinal, Value)
)
INSERT @Values(Col1, Col2, Col3, Col4)
SELECT  DISTINCT d1.Value, d2.Value, d3.Value, d4.Value
FROM    Data AS d1
        INNER JOIN Data AS d2
            ON d2.Ordinal NOT IN (d1.Ordinal)
        INNER JOIN Data AS d3
            ON d3.Ordinal NOT IN (d1.Ordinal, d2.Ordinal)
        INNER JOIN Data AS d4
            ON d4.Ordinal NOT IN (d1.Ordinal, D2.Ordinal, d3.Ordinal);

With a really simple example where @Col1 = 'A', and all others are NULL you'll end up with something like:

Col1 Col2 Col3 Col4
NULL NULL NULL A
NULL NULL A NULL
NULL A NULL NULL
A NULL NULL NULL

You can then use INTERSECT to check against your table. The benefit of this is that INTERSECT does NULL matching, i.e. SELECT NULL INTERSECT SELECT NULL will return a record, whereas SELECT NULL WHERE NULL = NULL will not.

IF EXISTS
    (   SELECT  Col1, Col2, Col3, Col4
        FROM    SampleData
        INTERSECT
        SELECT  Col1, Col2, Col3, Col4
        FROM    @Values
    )
BEGIN
    SELECT 'Data Matched';
END
ELSE
BEGIN
    SELECT 'Data not Matched';
END

The main advantage of this method (aside from returning your expected results) is that it can take advantage of an index on SampleData, so if you add:

CREATE INDEX IX_SampleData ON dbo.SampleData (Col1, Col2, Col3, Col4);

Then this will be used when using INTERSECT, but isn't with your logic as it stands:

enter image description here

With the small data set as it stands this doesn't make much difference, but with much larger data sets it will do. The above is with 200k rows in SampleData.

So your full procedure will be something like:

CREATE OR ALTER PROC dbo.ValidateSampleData(
 @Col1 VARCHAR(20) = NULL
,@Col2 VARCHAR(20) = NULL
,@Col3 VARCHAR(20) = NULL
,@Col4 VARCHAR(20) = NULL
)
AS
BEGIN

    DECLARE @Values TABLE
    (
        Col1 VARCHAR(20),
        Col2 VARCHAR(20),
        Col3 VARCHAR(20),
        Col4 VARCHAR(20)
        UNIQUE (Col1, Col2, Col3, Col4)
    );

    WITH Data AS
    (   SELECT  Value, v.Ordinal
        FROM    (VALUES (1, @Col1), (2, @Col2), (3, @Col3), (4, @Col4)) AS v (Ordinal, Value)
    )
    INSERT @Values(Col1, Col2, Col3, Col4)
    SELECT  DISTINCT d1.Value, d2.Value, d3.Value, d4.Value
    FROM    Data AS d1
            INNER JOIN Data AS d2
                ON d2.Ordinal NOT IN (d1.Ordinal)
            INNER JOIN Data AS d3
                ON d3.Ordinal NOT IN (d1.Ordinal, d2.Ordinal)
            INNER JOIN Data AS d4
                ON d4.Ordinal NOT IN (d1.Ordinal, D2.Ordinal, d3.Ordinal);

    IF EXISTS
        (   SELECT  Col1, Col2, Col3, Col4
            FROM    SampleData
            INTERSECT
            SELECT  Col1, Col2, Col3, Col4
            FROM    @Values
        )
    BEGIN
        SELECT 'Data Matched';
    END
    ELSE
    BEGIN
        SELECT 'Data not Matched';
    END
END 

CodePudding user response:

One solution is to split the four columns into rows, sort and number them; do the same to the four values. You can then compare the two sets:

with cols as (
    select id, value, row_number() over (partition by id order by value) as rn
    from t
    cross apply (values (col1), (col2), (col3), (col4) ) as x(value)
), vals as (
    select value, row_number() over (order by value) as rn
    from (values (@col1), (@col2), (@col3), (@col4) ) as x(value)
)
select id
from cols
join vals on cols.rn = vals.rn and (cols.value = vals.value or cols.value is null and vals.value is null)
group by id
having count(*) = 4

DB<>Fiddle

  • Related