My problem is that this function produces no results: neither my CSV file nor my list contains any values. I traced the problem to this line: if (htmlDocument != null && htmlDocument.DocumentNode.SelectNodes("//a[@href]")?.ToList() != null). It reports that the DocumentNode is empty. Do you have any idea why?
/// <summary>
/// Holds the list of link targets collected from an HTML page.
/// </summary>
public class LiensHtml
{
    /// <summary>
    /// The collected link values. Starts as an empty list so callers can
    /// call <c>Add</c> on a fresh instance without hitting a null reference;
    /// the setter still accepts <c>null</c> for backward compatibility.
    /// </summary>
    public List<string>? Links { get; set; } = new List<string>();
}
/// <summary>
/// Downloads the page at <paramref name="url"/>, collects the <c>href</c> value of
/// every anchor element that has one, writes the values to <c>Tous_les_Liens.csv</c>
/// (one per row, under a single header row) and prints them to the console.
/// Prints <c>C''est vide</c> when the page contains no matching anchor.
/// </summary>
/// <param name="url">Address of the page to download and scan.</param>
/// <returns>A task that completes once the file and console output have been written.</returns>
public async Task StartCrawlerasync(string url)
{
    // Dispose the client so its underlying connection is released when the crawl ends.
    using var httpClient = new HttpClient();
    var html = await httpClient.GetStringAsync(url);

    var htmlDocument = new HtmlDocument();
    htmlDocument.LoadHtml(html);

    // Using declarations guarantee the writers are flushed and closed on every exit
    // path; without that, buffered rows never reach the file.
    using var writer = new StreamWriter("Tous_les_Liens.csv");
    using var csv = new CsvWriter(writer, CultureInfo.InvariantCulture);

    // SelectNodes yields null (not an empty collection) when nothing matches,
    // so the result must be null-checked before it is enumerated.
    HtmlNodeCollection? anchors = htmlDocument.DocumentNode.SelectNodes("//a[@href]");
    if (anchors == null || anchors.Count == 0)
    {
        Console.WriteLine("C''est vide");
        return;
    }

    // Build the list up front so this method does not depend on LiensHtml
    // initializing its own Links property.
    var hrefs = new List<string>(anchors.Count);
    foreach (HtmlNode anchor in anchors)
    {
        hrefs.Add(anchor.Attributes["href"].Value);
    }
    var allLinks = new LiensHtml { Links = hrefs };

    // Header once, then one field per row. Passing a string to WriteRecords would
    // treat it as a sequence of characters rather than a single value.
    csv.WriteField(nameof(LiensHtml.Links));
    csv.NextRecord();
    foreach (string href in allLinks.Links)
    {
        csv.WriteField(href);
        csv.NextRecord();
    }

    foreach (string lien in allLinks.Links)
    {
        Console.WriteLine(lien);
    }
}
CodePudding user response:
It would help if you could give the site you are using, but I think:
htmlDocument.DocumentNode.Descendants("a").ToList()
should do the trick.
Edit: After getting the site, I've rewritten your code:
/// <summary>
/// Holds a list of link values and provides a crawler that extracts link values
/// from a downloaded HTML page.
/// </summary>
public class LiensHtml
{
    /// <summary>
    /// Creates an instance whose <see cref="Links"/> list is empty rather than null.
    /// </summary>
    public LiensHtml()
    {
        Links = new List<string>();
    }

    /// <summary>
    /// The collected link values. Initialized to an empty list by the constructor.
    /// </summary>
    public List<string>? Links { get; set; }

    /// <summary>
    /// Downloads the page at <paramref name="url"/>, reads the <c>href</c> attribute of
    /// every <c>link</c> element (falling back to <c>no link</c> when the attribute is
    /// absent), writes the values to <c>Tous_les_Liens.csv</c> (one per row, under a
    /// single header row) and prints them to the console.
    /// Prints <c>C''est vide</c> when no such element is found.
    /// </summary>
    /// <param name="url">Address of the page to download and scan.</param>
    /// <returns>A task that completes once the file and console output have been written.</returns>
    public async Task StartCrawlerasync(string url)
    {
        // Dispose the client so its underlying connection is released when the crawl ends.
        using var httpClient = new HttpClient();
        var html = await httpClient.GetStringAsync(url);

        var htmlDocument = new HtmlDocument();
        htmlDocument.LoadHtml(html);

        // Using declarations guarantee the writers are flushed and closed on every exit path.
        using var writer = new StreamWriter("Tous_les_Liens.csv");
        using var csv = new CsvWriter(writer, CultureInfo.InvariantCulture);

        // NOTE(review): "link" matches <link> elements, not anchors; if hyperlinks are
        // what is wanted, this should presumably be Descendants("a") - confirm against the target site.
        var links = htmlDocument.DocumentNode.Descendants("link").ToList();
        if (links.Count == 0)
        {
            Console.WriteLine("C''est vide");
            return;
        }

        // The collected values are kept in a separate instance, not in this object's Links.
        var collected = new List<string>(links.Count);
        var linkList = new LiensHtml { Links = collected };

        // Header once, before any data row.
        csv.WriteField(nameof(Links));
        await csv.NextRecordAsync();

        foreach (var link in links)
        {
            var linkValue = link.GetAttributeValue("href", "no link");
            collected.Add(linkValue);

            // One field per row. Passing a string to WriteRecordsAsync would treat it
            // as a sequence of characters rather than a single value.
            csv.WriteField(linkValue);
            await csv.NextRecordAsync();
        }
        await csv.FlushAsync();

        foreach (var lien in linkList.Links)
        {
            Console.WriteLine(lien);
        }
    }
}
CodePudding user response:
After an hour of searching, I found that the problem is in the LoadHtml call: htmlDocument.LoadHtml(html);
I have not found a solution yet.