Link example:
<img src="https://thumbs.com/thumbs/test.mp4/test1.mp4-3.jpg" alt="This is the description i want to get too" >
And this is the method I'm using to parse the links from the downloaded HTML source file:
/// <summary>
/// Scans <paramref name="message"/> for http/ftp/https URLs and turns every
/// URL that contains "thumbs" and "mp4" into a video link.
/// </summary>
/// <param name="message">The text to scan (e.g. downloaded HTML source).</param>
/// <returns>
/// The distinct video links in the order they were first found. Empty when
/// <paramref name="message"/> is null/empty or contains no matching URL.
/// </returns>
public List<string> GetLinks(string message)
{
    List<string> list = new List<string>();
    if (string.IsNullOrEmpty(message))
    {
        return list;
    }

    // Tracks what is already in the list so de-duplication is O(1) per link
    // instead of a linear list.Contains scan.
    HashSet<string> seen = new HashSet<string>();

    // The '+' quantifiers (and the escaped '\+' inside the character classes)
    // are required: without them the pattern demands literal spaces and never
    // matches a real URL.
    foreach (Match item in Regex.Matches(message, @"(http|ftp|https):\/\/([\w\-_]+(?:(?:\.[\w\-_]+)+))([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?"))
    {
        string url = item.Value;
        if (!url.Contains("thumbs"))
        {
            continue;
        }

        int index1 = url.IndexOf("mp4", StringComparison.Ordinal);
        if (index1 == -1)
        {
            continue;
        }

        // NOTE(review): index1 was measured on the original URL but is applied
        // to the rewritten one. This relies on ReplaceLastOccurrence swapping
        // "thumbs" for the equally long "videos" without shifting any other
        // character - confirm against that helper.
        string news = ReplaceLastOccurrence(url, "thumbs", "videos");

        // Keep everything up to and including the first "mp4".
        string result = news.Substring(0, index1 + 3);
        if (seen.Add(result))
        {
            list.Add(result);
        }
    }

    return list;
}
But this will give only the link. I also want to get the link description, which in this example is:
This is a test
Then using it :
// Gather the distinct links from every file directly under D:\Videos\
// into videosLinks.
string[] files = Directory.GetFiles(@"D:\Videos\");
foreach (string path in files)
{
    foreach (string link in GetLinks(File.ReadAllText(path)))
    {
        // Skip links that were already collected from an earlier file.
        if (videosLinks.Contains(link))
        {
            continue;
        }

        videosLinks.Add(link);
    }
}
and when downloading the links :
/// <summary>
/// Click handler that downloads every link in <c>videosLinks</c>, one at a
/// time, into <c>D:\Videos\videos\</c>. Each file is named after the last
/// path segment of its link.
/// </summary>
/// <param name="sender">The control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private async void btnStartDownload_Click(object sender, EventArgs e)
{
    // An index loop re-reads Count on every pass, so it does not throw if the
    // list changes while a download is being awaited. An empty list simply
    // skips the loop, so no separate Count > 0 guard is needed.
    for (int i = 0; i < videosLinks.Count; i++)
    {
        string fileName = System.IO.Path.GetFileName(videosLinks[i]);
        await DownloadFile(videosLinks[i], @"D:\Videos\videos\" + fileName);
    }
}
But I want the fileName to be the description of each link.
CodePudding user response:
You can use Html Agility Pack, an HTML parser written in C# that reads/writes the DOM and supports plain XPath or XSLT. The example below retrieves the description from the alt
attribute, as well as other attributes such as src.
Implementation:
using HtmlAgilityPack;
using System;
/// <summary>
/// Html Agility Pack demo: reads the src and alt attributes of an img element.
/// </summary>
public class Program
{
    /// <summary>
    /// Parses a hard-coded img tag, prints its src and alt values, then waits
    /// for console input.
    /// </summary>
    public static void Main()
    {
        string markup = "<img src=\"https://thumbs.com/thumbs/test.mp4/test1.mp4-3.jpg\" alt=\"This is the description i want to get too\" >";

        var document = new HtmlDocument();
        document.LoadHtml(markup);

        // "//img" selects the first img element anywhere in the document.
        var imgNode = document.DocumentNode.SelectSingleNode("//img");

        string source = imgNode.Attributes["src"].Value;
        Console.WriteLine("Source: {0}", source);

        string description = imgNode.Attributes["alt"].Value;
        Console.WriteLine("Description: {0}", description);

        Console.Read();
    }
}
Demo:
https://dotnetfiddle.net/nAAZDL
Output:
Source: https://thumbs.com/thumbs/test.mp4/test1.mp4-3.jpg
Description: This is the description i want to get too
CodePudding user response:
Parsing the HTML with a regex costs more CPU cycles and performs slowly. Use a library such as AngleSharp instead.
I rewrote your code with AngleSharp; this is how I did it:
// Sample markup: a single img element carrying src and alt attributes.
string test = "<img src=\"https://thumbs.com/thumbs/test.mp4/test1.mp4-3.jpg\" alt=\"This is the description i want to get too\" >\r\n";
// Build a browsing context from the default configuration with the default loader.
var configuration = Configuration.Default.WithDefaultLoader();
var context = BrowsingContext.New(configuration);
// Open a document whose content is the in-memory markup string.
using var doc = await context.OpenAsync(req => req.Content(test));
// Despite its name, href receives the img element's alt value (the description), not a URL.
string href = doc.QuerySelector("img").Attributes["alt"].Value;