I have created a Scrapy spider that has to crawl the whole webpage and extract the URLs. Next I have to remove the social-media URLs, so I want to build a list of all the URLs, but somehow it's not working: when I try to append each URL to the list, the same URL keeps getting appended over and over.
import re
import scrapy

all_urls = []

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = [
        'https://www.wireshark.org/docs/dfref/i/ip.html',
    ]

    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        for r in response.css('a'):
            url = r.css('::attr(href)').get()
            print('all the urls are here', url)
            for i in url:
                all_urls.append(url)
        print(all_urls)
CodePudding user response:
An easier way to grab all of the URLs on a page is to chain your CSS selector and call getall(). The reason your version repeats entries is the inner loop: for i in url iterates over the characters of the href string, so the same URL is appended once per character.
For example:
import scrapy

all_urls = []

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = [
        'https://www.wireshark.org/docs/dfref/i/ip.html',
    ]

    def parse(self, response):
        for url in response.css('a::attr(href)').getall():
            all_urls.append(url)
        print(all_urls)
OUTPUT
['/', '/news/', '#', '/index.html#aboutWS', '/index.html#download', 'https://blog.wireshark.org/', '/code-of-conduct.html', '#', 'https://ask.wireshark.org/', '/faq.html', '/docs/', '/lists/', '/tools/', 'https://gitlab.com/wireshark/wireshark/-/wikis', 'https://gitlab.com/wireshark/wireshark/-/issues', '#', '/develop.html', 'https://www.wireshark.org/docs/wsdg_html_chunked/', 'https://gitlab.com/wireshark/wireshark/-/tree/master', 'https://www.wireshark.org/download/automated', '../', 'https://twitter.com/wiresharknews', 'https://sysdig.com/privacy-policy/', '#', '#']
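Since the original goal was to drop the social-media links, here is a minimal sketch of how the collected list could be filtered afterwards. The SOCIAL_DOMAINS set and the is_social helper are my own illustrative additions (not part of Scrapy), so adjust the domain list to whatever you consider "social media":

from urllib.parse import urlparse

# Assumed set of social-media hosts to exclude; extend as needed.
SOCIAL_DOMAINS = {'twitter.com', 'facebook.com', 'linkedin.com', 'youtube.com'}

def is_social(url):
    # Relative links like '/' or '#' have an empty netloc and are kept.
    host = urlparse(url).netloc.lower()
    # Strip an optional 'www.' prefix so 'www.twitter.com' also matches.
    if host.startswith('www.'):
        host = host[4:]
    return host in SOCIAL_DOMAINS

filtered_urls = [u for u in all_urls if not is_social(u)]
print(filtered_urls)

Run against the example output above, this would remove 'https://twitter.com/wiresharknews' while keeping the relative links and the other absolute URLs.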