I am trying to retrieve all PDF links from a string column which contains HTML.
Example text of one column is:
<p>text here <a href="example.com/abc.pdf">link</a>
some other text <a href="example.com">home</a>
<a href="www.example.com/abc123.pdf">link 2</a></p>
I need all links with .pdf extension.
I already tried function like this
ALTER function [dbo].[GetLinks] (@t nvarchar(max))
returns @Links table (link nvarchar(max))
as
begin
declare @strtpos int
set @strtpos=100
declare @endpos int
declare @lnk nvarchar(max)
while @strtpos > 6
begin
set @strtpos = PATINDEX('%href="%', @t) 6
if @strtpos>6 begin
--set @endpos = CHARINDEX ('"',@t,@strtpos 1)
set @endpos = PATINDEX('%.pdf"%',@t) 4
if @endpos>0 begin
set @lnk = substring(@t ,@strtpos, @endpos - @strtpos)
set @strtpos = PATINDEX('%href="%', @lnk) 6
set @t= RIGHT (@t, len(@t) - @endpos)
insert @Links values(@lnk)
end
end
end
return
end
And calling this function from SQL Server like this:
select top 1 * from dbo.GetLinks(' <p>text here <a href="example.com/abc.pdf">link</a>
some other text <a href="example.com">home</a>
<a href="www.example.com/abc123.pdf">link 2</a></p>')
This returns the first link only when I match CHAR, but when I match string ".pdf" it returns long string. Please let me know if I am doing something wrong or need to change approach for this.
CodePudding user response:
If your html column can be converted to XML like your example suggests, your can parse the href values in T-SQL using XML data type methods:
CREATE FUNCTION dbo.GetLinks (@t xml)
RETURNS @Links TABLE (link nvarchar(max))
AS
BEGIN
INSERT @Links
SELECT
AnchorTag.value('@href', 'nvarchar(MAX)') AS link
FROM @t.nodes('//a') AS AnchorTags(AnchorTag);
RETURN;
END;
GO
The same approach can be used with an inline TVF:
CREATE FUNCTION dbo.GetLinks (@t xml)
RETURNS TABLE
AS
RETURN (
SELECT
AnchorTag.value('@href', 'nvarchar(MAX)') AS link
FROM @t.nodes('//a') AS AnchorTags(AnchorTag)
);
GO
CodePudding user response:
Xquery expression can do it simply
DECLARE @html xml = '<p>text here <a href="example.com/abc.pdf">link<b v="3">ok</b></a>some other text <a href="example.com">home</a><a title="er">kj</a><a href="www.example.com/abc123.pdf">link 2</a></p>'
select [pdfLink] = a.value('@href','varchar(max)')
from @html.nodes('//a[@href[contains(., ".pdf")]]') c(a)
CodePudding user response:
If you can't convert your html into xml for whatever reason, you can still do this with regular string manipluation, though it is not pretty.
This solution (ironically) utilises an xml based string splitter to allow for multi-character delimiters, the output of which is then further filtered to only return the .pdf
links:
create or alter function [dbo].[fn_StringSplitXML]
(
@str varchar(max) = '' -- String to split.
,@Delimiter varchar(10) = ',' -- Delimiting value to split on.
,@num int = null -- Which value to return.
)
returns table
as
return
select rn
,item
from(select rn = row_number() over(order by(select null))
,item = ltrim(rtrim(n.i.value('(./text())[1]','varchar(max)')))
from(select x = cast('<x>' replace(@str,@Delimiter,'</x><x>') '</x>' as xml).query('.')) as s
cross apply s.x.nodes('x') as n(i)
) as a
where rn = @num
or @num is null
;
declare @html varchar(1000) =
'<p>text here <a href="example.com/abc.pdf">link</a>
some other text <a href="example.com">home</a>
<a href="www.example.com/abc123.pdf">link 2</a></p>
<input type="text" name="self closed tag" />
<b>some more text</b>
';
select left(s.item
,patindex('%.pdf%',s.item) 3
) as link
from dbo.fn_StringSplitXML(replace(replace(@html
,'>'
,''
)
,'<'
,''
)
,'href="'
,null
) as s
where patindex('%.pdf%',s.item) > 0;
Output
link |
---|
example.com/abc.pdf |
www.example.com/abc123.pdf |
CodePudding user response:
If you are on SQL Server 2016 you can use STRING_SPLIT.
DECLARE @string VARCHAR(8000) = '
<p>text here <a href="example.com/abc.pdf">link</a>
some other text <a href="example.com">home</a>
<a href="www.example.com/abc123.pdf">link 2</a></p>';
SELECT TheUrl = split.value
FROM STRING_SPLIT(@string,'"') AS split
WHERE split.value LIKE '%.pdf';
Returns:
TheUrl
---------------------------
example.com/abc.pdf
www.example.com/abc123.pdf