I have been trying to build a web-scraping tool that collects all Pastebin URLs using the Bing search engine.
I managed to do that by using a WebBrowser control, letting the JavaScript run, and then scraping the resulting page source.
string attempt = ""
^
I have two problems. The first is that if I don't include the line MessageBox.Show(this.attempt),
the variable is empty for some reason. The second is that I currently get only 9 links, and the other pages are not downloaded as they should be. I think both problems are related to the MessageBox.Show(this.attempt)
call.
I know my code is not the best and there are probably much better ways to do this, but I would like help understanding what is going on here.
Thank you very much.
Here is my code:
/// <summary>
/// Walks result pages 1 through <c>Config.Amount_Of_Pages</c>, extracts every
/// pastebin link found in <c>this.attempt</c>, and appends the links to
/// <c>Config.List_Of_Urls</c>. After each page the list is de-duplicated,
/// the page counter is incremented and shown on <paramref name="pages"/>,
/// and the list is written out through <c>Files.Write_Urls</c>.
/// </summary>
/// <param name="pages">Label that displays how many pages have been scanned so far.</param>
private void Scan(Label pages)
{
    // The quantifier '+' requires at least one alphanumeric character after the
    // slash; the dot in the host name is escaped so it matches only a literal '.'.
    string regex = @"https:\/\/pastebin\.com\/[a-zA-Z0-9]+";
    for (int i = 1; i <= Config.Amount_Of_Pages; i++)
    {
        Parse(i);

        // NOTE(review): Parse only starts the navigation; this.attempt is filled
        // later by Wb_DocumentCompleted. Per the author's report the field stays
        // empty without this dialog - presumably the modal dialog gives the event
        // a chance to fire. Confirm, then replace with a proper wait for completion.
        MessageBox.Show(this.attempt);

        // Read this.attempt only after the dialog has returned. Matching before
        // the dialog would use whatever the field held before this iteration's
        // page could finish loading.
        MatchCollection matches = Regex.Matches(this.attempt, regex);
        foreach (Match match in matches)
        {
            Config.List_Of_Urls.Add(match.Value);
        }

        // De-duplicate once per page instead of after every single insertion.
        Config.List_Of_Urls = Config.List_Of_Urls.Distinct().ToList();

        Config.Amount_Of_Pages_Scanned++;
        pages.Invoke(new MethodInvoker(delegate { pages.Text = Config.Amount_Of_Pages_Scanned.ToString(); }));
        Files.Write_Urls(Config.List_Of_Urls);
    }
    MessageBox.Show("Done");
}
/// <summary>
/// Creates a new <c>WebBrowser</c>, subscribes <c>Wb_DocumentCompleted</c> to its
/// <c>DocumentCompleted</c> event, and starts navigating to the Bing search URL
/// with <paramref name="i"/> appended as the value of the <c>first</c> query parameter.
/// </summary>
/// <param name="i">Value appended to the <c>first</c> query parameter of the search URL.</param>
private void Parse(int i)
{
    WebBrowser wb = new WebBrowser();
    // DocumentCompleted is an event, so the handler must be attached with '+='.
    wb.DocumentCompleted += Wb_DocumentCompleted;
    wb.ScriptErrorsSuppressed = true;
    // NOTE(review): Navigate does not block until the page has loaded, so this
    // method returns before Wb_DocumentCompleted has run.
    // NOTE(review): Bing's "first" parameter looks like a result offset rather
    // than a page number - confirm; consecutive values of i may overlap heavily.
    wb.Navigate("https://www.bing.com/search?q=site:pastebin.com email:password&first=" + i);
}
/// <summary>
/// DocumentCompleted handler: stores the outer HTML of the document's first
/// "HTML" element in <c>this.attempt</c>.
/// </summary>
/// <param name="sender">The <c>WebBrowser</c> that raised the event.</param>
/// <param name="e">Event data (not used in the visible code).</param>
private void Wb_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    var browser = (WebBrowser)sender;
    // The first element returned for the "HTML" tag is taken as the page root.
    var rootElement = browser.Document.GetElementsByTagName("HTML")[0];
    this.attempt = rootElement.OuterHtml.ToString();
    /* ... */
}
CodePudding user response:
- I prefer to use Selenium and suggest it for you.
- If you want to get distinct URLs, you should use a HashSet instead of a List.
- You should add the optional part
(www\.)?
to the regex.
- To handle the retry policy, I prefer to use Polly.
The resulting code is:
// Wait-and-retry policy for any Exception, configured with three delays
// (5, 10 and 30 seconds).
RetryPolicy retryPolicy = Policy.Handle<Exception>()
    .WaitAndRetry(new[]
    {
        TimeSpan.FromSeconds(5),
        TimeSpan.FromSeconds(10),
        TimeSpan.FromSeconds(30)
    });

// Optional "www." prefix; the dot in the host name is escaped so it matches a
// literal '.', and '+' requires at least one alphanumeric character in the id.
string regex = @"https:\/\/(www\.)?pastebin\.com\/[a-zA-Z0-9]+";

// A HashSet keeps each URL only once.
HashSet<string> sites = new HashSet<string>();

retryPolicy.Execute(() =>
{
    // The driver is disposed when the using block ends.
    using (IWebDriver driver = new ChromeDriver())
    {
        driver.Navigate().GoToUrl("https://www.bing.com/search?q=site:pastebin.com email:password&first=1");
        // We have to wait until the page has been downloaded and rendered in the browser.
        Thread.Sleep(1000);
        foreach (Match match in Regex.Matches(driver.PageSource, regex))
        {
            sites.Add(match.Value);
        }
    }
});