I'm trying to scrape all that's inside the html tag of this page: https://www.rtlnieuws.nl/zoeken?q=Philips fraude
// Configure the Chrome instance that Selenium will launch.
var options = new ChromeOptions();
//options.AddArguments("--headless");
//options.AddArgument("--disable-gpu");
// Pass the "no-sandbox" switch to Chrome (written here without leading dashes).
options.AddArgument("no-sandbox");
// Start ChromeDriver with the options above. NOTE(review): the first argument is presumably the
// directory that contains the chromedriver executable (here the project's build output) - confirm.
var chromeDriver = new ChromeDriver(@"C:\Users\Sasha\source\repos\debiteurenzeker.businesscheck\src\DebiteurenZeker.BusinessCheck\DZBC.Application\bin\Debug\net6.0", options);
// Navigate to the search results page; the stack trace in this post shows the timeout originating from this call.
chromeDriver.Navigate().GoToUrl("https://www.rtlnieuws.nl/zoeken?q=Philips fraude"); // for now
// Look up the root <html> element and print its text to the console.
var htmlIWebElement = chromeDriver.FindElement(By.TagName("html"));
Console.WriteLine(htmlIWebElement.Text);
// Wait for a line of console input before continuing.
Console.ReadLine();
Basically, it gets to the GoToUrl line and opens the page in the browser, but then it doesn't go any further in the code.
It just times out after 60 seconds.
Here's the error:
fail: Microsoft.AspNetCore.Diagnostics.DeveloperExceptionPageMiddleware[1]
An unhandled exception has occurred while executing the request.
OpenQA.Selenium.WebDriverException: The HTTP request to the remote WebDriver server for URL http://localhost:57661/session/06250c8684ded7772af53ce40bcd3e15/url timed out after 60 seconds.
---> System.Threading.Tasks.TaskCanceledException: The request was canceled due to the configured HttpClient.Timeout of 60 seconds elapsing.
---> System.TimeoutException: The operation was canceled.
---> System.Threading.Tasks.TaskCanceledException: The operation was canceled.
---> System.IO.IOException: Unable to read data from the transport connection: The I/O operation has been aborted because of either a thread exit or an application request..
---> System.Net.Sockets.SocketException (995): The I/O operation has been aborted because of either a thread exit or an application request.
--- End of inner exception stack trace ---
at System.Net.Sockets.Socket.AwaitableSocketAsyncEventArgs.ThrowException(SocketError error, CancellationToken cancellationToken)
at System.Net.Sockets.Socket.AwaitableSocketAsyncEventArgs.System.Threading.Tasks.Sources.IValueTaskSource<System.Int32>.GetResult(Int16 token)
at System.Net.Http.HttpConnection.SendAsyncCore(HttpRequestMessage request, Boolean async, CancellationToken cancellationToken)
--- End of inner exception stack trace ---
at System.Net.Http.HttpConnection.SendAsyncCore(HttpRequestMessage request, Boolean async, CancellationToken cancellationToken)
at System.Net.Http.HttpConnectionPool.SendWithVersionDetectionAndRetryAsync(HttpRequestMessage request, Boolean async, Boolean doRequestAuth, CancellationToken cancellationToken)
at System.Net.Http.DiagnosticsHandler.SendAsyncCore(HttpRequestMessage request, Boolean async, CancellationToken cancellationToken)
at System.Net.Http.RedirectHandler.SendAsync(HttpRequestMessage request, Boolean async, CancellationToken cancellationToken)
at System.Net.Http.HttpClient.<SendAsync>g__Core|83_0(HttpRequestMessage request, HttpCompletionOption completionOption, CancellationTokenSource cts, Boolean disposeCts, CancellationTokenSource pendingRequestsCts, CancellationToken originalCancellationToken)
--- End of inner exception stack trace ---
--- End of inner exception stack trace ---
at System.Net.Http.HttpClient.HandleFailure(Exception e, Boolean telemetryStarted, HttpResponseMessage response, CancellationTokenSource cts, CancellationToken cancellationToken, CancellationTokenSource pendingRequestsCts)
at System.Net.Http.HttpClient.<SendAsync>g__Core|83_0(HttpRequestMessage request, HttpCompletionOption completionOption, CancellationTokenSource cts, Boolean disposeCts, CancellationTokenSource pendingRequestsCts, CancellationToken originalCancellationToken)
at OpenQA.Selenium.Remote.HttpCommandExecutor.MakeHttpRequest(HttpRequestInfo requestInfo)
at OpenQA.Selenium.Remote.HttpCommandExecutor.Execute(Command commandToExecute)
--- End of inner exception stack trace ---
at OpenQA.Selenium.Remote.HttpCommandExecutor.Execute(Command commandToExecute)
at OpenQA.Selenium.Remote.DriverServiceCommandExecutor.Execute(Command commandToExecute)
at OpenQA.Selenium.WebDriver.Execute(String driverCommandToExecute, Dictionary`2 parameters)
at OpenQA.Selenium.WebDriver.set_Url(String value)
at OpenQA.Selenium.Navigator.GoToUrl(String url)
at DZBC.Application.Services.Implementations.ScraperService.ScrapeWebsiteAdverseAsync(CompanyProfile companyProfile) in C:\Users\Sasha\source\repos\debiteurenzeker.businesscheck\src\DebiteurenZeker.BusinessCheck\DZBC.Application\Services\Implementations\ScraperService.cs:line 143
at DZBC.Application.Services.Implementations.CompanyProfileService.CreateCompanyProfileAsync(CompanyProfile companyProfile) in C:\Users\Sasha\source\repos\debiteurenzeker.businesscheck\src\DebiteurenZeker.BusinessCheck\DZBC.Application\Services\Implementations\CompanyProfileService.cs:line 39
at DZBC.API.Controllers.CompanyProfilesv1Controller.PostCompanyProfile(CompanyProfile companyProfile) in C:\Users\Sasha\source\repos\debiteurenzeker.businesscheck\src\DebiteurenZeker.BusinessCheck\DZBC.API\Controllers\v1\CompanyProfilesv1Controller.cs:line 78
at lambda_method5(Closure , Object )
at Microsoft.AspNetCore.Mvc.Infrastructure.ActionMethodExecutor.AwaitableObjectResultExecutor.Execute(IActionResultTypeMapper mapper, ObjectMethodExecutor executor, Object controller, Object[] arguments)
at Microsoft.AspNetCore.Mvc.Infrastructure.ControllerActionInvoker.<InvokeActionMethodAsync>g__Awaited|12_0(ControllerActionInvoker invoker, ValueTask`1 actionResultValueTask)
at Microsoft.AspNetCore.Mvc.Infrastructure.ControllerActionInvoker.<InvokeNextActionFilterAsync>g__Awaited|10_0(ControllerActionInvoker invoker, Task lastTask, State next, Scope scope, Object state, Boolean isCompleted)
at Microsoft.AspNetCore.Mvc.Infrastructure.ControllerActionInvoker.Rethrow(ActionExecutedContextSealed context)
at Microsoft.AspNetCore.Mvc.Infrastructure.ControllerActionInvoker.Next(State& next, Scope& scope, Object& state, Boolean& isCompleted)
at Microsoft.AspNetCore.Mvc.Infrastructure.ControllerActionInvoker.InvokeInnerFilterAsync()
--- End of stack trace from previous location ---
at Microsoft.AspNetCore.Mvc.Infrastructure.ResourceInvoker.<InvokeFilterPipelineAsync>g__Awaited|20_0(ResourceInvoker invoker, Task lastTask, State next, Scope scope, Object state, Boolean isCompleted)
at Microsoft.AspNetCore.Mvc.Infrastructure.ResourceInvoker.<InvokeAsync>g__Awaited|17_0(ResourceInvoker invoker, Task task, IDisposable scope)
at Microsoft.AspNetCore.Mvc.Infrastructure.ResourceInvoker.<InvokeAsync>g__Awaited|17_0(ResourceInvoker invoker, Task task, IDisposable scope)
at Microsoft.AspNetCore.Routing.EndpointMiddleware.<Invoke>g__AwaitRequestTask|6_0(Endpoint endpoint, Task requestTask, ILogger logger)
at Microsoft.AspNetCore.Authorization.AuthorizationMiddleware.Invoke(HttpContext context)
at Swashbuckle.AspNetCore.SwaggerUI.SwaggerUIMiddleware.Invoke(HttpContext httpContext)
at Swashbuckle.AspNetCore.Swagger.SwaggerMiddleware.Invoke(HttpContext httpContext, ISwaggerProvider swaggerProvider)
at Microsoft.AspNetCore.Diagnostics.DeveloperExceptionPageMiddleware.Invoke(HttpContext context)
[17980:2004:0810/153335.421:ERROR:device_event_log_impl.cc(214)] [15:33:35.421] USB: usb_device_handle_win.cc:1048 Failed to read descriptor from node connection: A device attached to the system is not functioning. (0x1F)
[17980:19800:0810/153335.712:ERROR:util.cc(127)] Can't create base directory: C:\Program Files\Google\GoogleUpdater
[17980:2004:0810/153345.058:ERROR:device_event_log_impl.cc(214)] [15:33:45.058] USB: usb_device_handle_win.cc:1048 Failed to read descriptor from node connection: A device attached to the system is not functioning. (0x1F)
[17980:2004:0810/153345.060:ERROR:device_event_log_impl.cc(214)] [15:33:45.060] USB: usb_device_handle_win.cc:1048 Failed to read descriptor from node connection: A device attached to the system is not functioning. (0x1F)
[23420:22580:0810/153408.878:ERROR:gpu_init.cc(486)] Passthrough is not supported, GL is disabled, ANGLE is
Does anyone know what I can do? Some similar questions have been asked before, but their answers didn't help.
I also don't understand why the error says "OpenQA.Selenium.WebDriverException: The HTTP request to the remote WebDriver server for URL http://localhost:57661/session/06250c8684ded7772af53ce40bcd3e15/url timed out after 60 seconds."
It gives a URL with localhost, while in Chrome I see https://www.rtlnieuws.nl/zoeken?q=Philips fraude?
UPDATE: because the page is generated with JS, I also tried waiting for it:
// Second attempt: same navigation, but the element lookup goes through an explicit wait.
// The using block disposes the driver when the block exits.
using (IWebDriver chromeDriver = new ChromeDriver(@"C:\Users\Sasha\source\repos\debiteurenzeker.businesscheck\src\DebiteurenZeker.BusinessCheck\DZBC.Application\bin\Debug\net6.0", options))
{
// Explicit wait with a 5-second timeout.
WebDriverWait wait = new WebDriverWait(chromeDriver, TimeSpan.FromSeconds(5));
chromeDriver.Navigate().GoToUrl("https://www.rtlnieuws.nl/zoeken?q=Philips fraude"); // for now
// Wait until an <html> element exists, then print its text to the console.
var htmlIWebElement = wait.Until(ExpectedConditions.ElementExists(By.TagName("html")));
Console.WriteLine(htmlIWebElement.Text);
}
But the error is the same.
CodePudding user response:
I made an example for your scenario.
Let's say we want to scrape the posts on the home page, so we need a model to store our data:
/// <summary>
/// Plain data holder for one scraped post. All values are stored as the raw strings
/// read from the page; no parsing or validation is performed here.
/// </summary>
public class Post
{
    // Shared, reusable serializer settings. Creating a new JsonSerializerOptions on every
    // ToString() call forces System.Text.Json to rebuild its type metadata each time,
    // so a single cached instance is used instead.
    private static readonly JsonSerializerOptions s_indentedJsonOptions =
        new JsonSerializerOptions { WriteIndented = true };

    /// <summary>Source URL of the post's image.</summary>
    public string ImageSrc { get; set; }

    /// <summary>Category label of the post.</summary>
    public string Category { get; set; }

    /// <summary>Headline of the post.</summary>
    public string Title { get; set; }

    /// <summary>Short description of the post.</summary>
    public string Description { get; set; }

    /// <summary>Date of the post, kept as the text it was read as.</summary>
    public string Date { get; set; }

    /// <summary>Serializes this post as indented JSON.</summary>
    /// <returns>An indented JSON document containing the public properties of this instance.</returns>
    public override string ToString()
    {
        return JsonSerializer.Serialize(this, s_indentedJsonOptions);
    }
}
Next, we need to initialize the Selenium WebDriver:
// Launch Chrome with the sandbox disabled; the driver is disposed when the script ends.
var chromeOptions = new ChromeOptions();
chromeOptions.AddArgument("--no-sandbox");
using var browser = new ChromeDriver(chromeOptions);

// Explicit wait: up to 20 seconds, polling every 250 ms, and ignoring
// "element not found" / "element went stale" errors between polls.
var wait = new WebDriverWait(browser, TimeSpan.FromSeconds(20));
wait.PollingInterval = TimeSpan.FromMilliseconds(250);
wait.IgnoreExceptionTypes(typeof(NoSuchElementException), typeof(StaleElementReferenceException));

// Open the search results page.
browser.Navigate().GoToUrl("https://www.rtlnieuws.nl/zoeken?q=Philips fraude");

// Dismiss the cookie banner as soon as its accept button can be found.
wait.Until(d => d.FindElement(By.Id("onetrust-accept-btn-handler"))).Click();

// Keep scrolling until a scroll step no longer changes the number of article links,
// which is taken to mean the end of the page has been reached.
var lastArticleCount = 0;
await browser.ScrollToEndAsync(d =>
{
    var articleCount = d.FindElements(By.XPath("//a[@class = 'search-item search-item--artikel']")).Count;
    var unchanged = articleCount == lastArticleCount;
    lastArticleCount = articleCount;
    return unchanged;
});

// Collect every search-item link inside the results container.
var postLinks = wait.Until(d =>
    d.FindElements(By.XPath("//div[@class = 'search-items']//a[contains(@class, 'search-item')]")));

// Read each link's fields (relative to the link element) and print the post as JSON.
foreach (var link in postLinks)
{
    var imageSrc = link.FindElement(By.XPath(".//img")).GetAttribute("src");
    var category = link.FindElement(By.XPath(".//div/span")).Text;
    var title = link.FindElement(By.XPath(".//div/h2")).Text;
    var description = link.FindElement(By.XPath(".//div[@class = 'search-item__content']/p[@class = 'search-item__description']")).Text;
    var date = link.FindElement(By.XPath(".//div[@class = 'search-item__content']//span[@class = 'search-item__date']")).Text;

    var post = new Post
    {
        ImageSrc = imageSrc,
        Category = category,
        Title = title,
        Description = description,
        Date = date,
    };
    Console.WriteLine(post);
}

// Sample only: keep the process (and the browser window) alive so the output can be inspected.
Console.ReadLine();
To use ScrollToEndAsync as shown above, you must create an extension method:
/// <summary>
/// Extension helpers for <see cref="IWebDriver"/>.
/// </summary>
public static class WebDriverExtensions
{
    /// <summary>
    /// Repeatedly scrolls the window to the bottom of the document until
    /// <paramref name="pageEnd"/> reports that the end of the page has been reached.
    /// </summary>
    /// <param name="driver">The driver whose page is scrolled. It must also implement <see cref="IJavaScriptExecutor"/>.</param>
    /// <param name="pageEnd">
    /// Predicate evaluated before every scroll step; scrolling stops as soon as it returns <c>true</c>.
    /// </param>
    /// <param name="delay">Pause between scroll steps. Defaults to 200 milliseconds when omitted.</param>
    /// <param name="cancellationToken">
    /// Token that aborts the loop; without it the method only ends when <paramref name="pageEnd"/> returns <c>true</c>.
    /// </param>
    /// <returns>A task that completes once <paramref name="pageEnd"/> has returned <c>true</c>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="driver"/> or <paramref name="pageEnd"/> is null.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
    public static async Task ScrollToEndAsync(
        this IWebDriver driver,
        Func<IWebDriver, bool> pageEnd,
        TimeSpan? delay = null,
        System.Threading.CancellationToken cancellationToken = default)
    {
        if (driver is null)
        {
            throw new ArgumentNullException(nameof(driver));
        }

        if (pageEnd is null)
        {
            throw new ArgumentNullException(nameof(pageEnd));
        }

        // Same pause as the previously hard-coded value unless the caller overrides it.
        var pause = delay ?? TimeSpan.FromMilliseconds(200);

        while (!pageEnd(driver))
        {
            cancellationToken.ThrowIfCancellationRequested();

            // The cast stays inside the loop so a driver without script support only
            // fails if a scroll is actually needed.
            var js = (IJavaScriptExecutor)driver;
            js.ExecuteScript("window.scrollTo(0, document.body.scrollHeight);");

            // Pause between scroll steps before pageEnd is evaluated again.
            await Task.Delay(pause, cancellationToken);
        }
    }
}