This SQL gives me the blog comments that contain just the banned words defined in my table. I only get the EXACT matches and it removes duplicate rows. It also eliminates variants of a banned word. Which is what I want.
DECLARE @BlogComment TABLE (
BlogCommentId INT IDENTITY PRIMARY KEY,
BlogCommentContent VARCHAR(MAX),
Id int);
INSERT INTO @BlogComment
(BlogCommentContent,
Id)
VALUES
('There are many of us.' ,1),
('This is the man.', 2),
('I hear you.', 2),
('Your the man.',2);
DECLARE @BannedWords TABLE (
BannedWordsId INT IDENTITY PRIMARY KEY,
Word varchar(250));
INSERT INTO @BannedWords (Word) VALUES
('though'),
('man'),
('hear');
;WITH rs AS
(
SELECT word = REPLACE(REPLACE([value],'.',''),',','')
,Id
FROM @BlogComment
CROSS APPLY STRING_SPLIT(BlogCommentContent, SPACE(1))
)
SELECT DISTINCT bw.Word,
rs.id
FROM rs
INNER JOIN @BannedWords bw ON rs.word = bw.Word;
Results of running this are:
Word id
hear 2
man 2
What I expect.
Now I want to take it 1 step further. Test case: I have more than 1 banned word in the same blog comment.
So I altered the code (the table values) to include the test case. A blog comment with 2 banned words.
('He is the man. I hear ya.',2),
I want only 1 row returned for this case. Either one.
Word id
hear 2
And altered the code to accommodate this by adding 2 more lines of code per the 'accepted answer' from - Get top 1 row of each group
,ROW_NUMBER() OVER(PARTITION by Id ORDER BY BlogCommentContent) AS rn
WHERE rn = 1;
DECLARE @BlogComment TABLE (
BlogCommentId INT IDENTITY PRIMARY KEY,
BlogCommentContent VARCHAR(MAX),
Id int);
INSERT INTO @BlogComment
(BlogCommentContent,
Id)
VALUES
('There are many of us.',1),
('He is the man. I hear ya.',2),
('Your the man.',2);
DECLARE @BannedWords TABLE (
BannedWordsId INT IDENTITY PRIMARY KEY,
Word varchar(250));
INSERT INTO @BannedWords (Word) VALUES
('though'),
('man'),
('hear');
;WITH rs AS
(
SELECT word = REPLACE(REPLACE([value],'.',''),',','')
,Id
,ROW_NUMBER() OVER(PARTITION by Id ORDER BY BlogCommentContent) AS rn
FROM @BlogComment
CROSS APPLY STRING_SPLIT(BlogCommentContent, SPACE(1))
)
SELECT DISTINCT bw.Word,
rs.id
FROM rs
INNER JOIN @BannedWords bw ON rs.word = bw.Word
WHERE rn = 1;
Results of running this are no rows returned:
Word id
So, not sure why the suggestion does not work for me.
CodePudding user response:
If I understand correctly, you can try to use COUNT
window function to find the duplicate words.
Judgement the word count whether equal 1
;WITH rs AS
(
SELECT word = REPLACE(REPLACE([value],'.',''),',','')
,Id
,COUNT(*) OVER(PARTITION by Id,REPLACE(REPLACE([value],'.',''),',','')) cnt
FROM @BlogComment
CROSS APPLY STRING_SPLIT(BlogCommentContent, SPACE(1))
)
SELECT bw.Word,
rs.id
FROM rs
INNER JOIN @BannedWords bw ON rs.word = bw.Word
WHERE cnt = 1
CodePudding user response:
You don't need the row_number ... you only need to join split words from each comment and join banned words.. and then count them for each comment ... either all or unique count ..
DECLARE @BlogComment TABLE (
BlogCommentId INT IDENTITY PRIMARY KEY,
BlogCommentContent VARCHAR(MAX),
Id int);
INSERT INTO @BlogComment
(BlogCommentContent,
Id)
VALUES
('There are many of us.',1),
('He is the man. I hear ya.',2),
('Your the man.',2);
DECLARE @BannedWords TABLE (
BannedWordsId INT IDENTITY PRIMARY KEY,
Word varchar(250));
INSERT INTO @BannedWords (Word) VALUES
('though'),
('man'),
('hear');
SELECT split_words.id as comment_id
, count(bw.Word) as total_banned_words
, count(distinct bw.Word) as total_unique_banned_words
FROM (
SELECT word = REPLACE(REPLACE([value],'.',''),',','')
,Id
,COUNT(*) OVER(PARTITION by Id,REPLACE(REPLACE([value],'.',''),',','')) cnt
FROM @BlogComment
CROSS APPLY STRING_SPLIT(BlogCommentContent, SPACE(1))
) split_words
LEFT JOIN @BannedWords bw
ON bw.Word = split_words.word
GROUP BY split_words.id
ORDER BY split_words.id