I am trying to get the data from a JSON API, but Scrapy gives me the error "HTTP status code is not handled or not allowed".
Is there any solution for handling this error in Scrapy? What is the reason for this error: does it occur because too many requests are being sent? This is the page link: https://www.nationalhardwareshow.com/en-us/attend/exhibitor-list.html
import scrapy
from scrapy import FormRequest
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request
class TestSpider(scrapy.Spider):
    """Spider that POSTs a single form-encoded query to the Algolia index.

    NOTE: this is the request the question reports as failing with
    "HTTP status code is not handled or not allowed".
    """

    name = 'test'

    # Algolia query endpoint for the exhibitor index.
    url="https://xd0u5m6y4r-dsn.algolia.net/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query"

    # Browser-like request headers sent with the query.
    headers = {
        'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8,pt;q=0.7',
        'Connection': 'keep-alive',
        'Origin': 'https://www.nationalhardwareshow.com',
        'Referer': 'https://www.nationalhardwareshow.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'cross-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
        'accept': 'application/json',
        'content-type': 'application/x-www-form-urlencoded',
        'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    # Fields submitted as the form body: the Algolia identifiers plus the
    # search parameters themselves.
    params = {
        'x-algolia-agent': 'Algolia for vanilla JavaScript 3.27.1',
        'x-algolia-application-id': 'XD0U5M6Y4R',
        'x-algolia-api-key': 'd5cd7d4ec26134ff4a34d736a7f9ad47',
        'params':'query=&page=0&facetFilters=&optionalFilters=[]',
    }

    def start_requests(self):
        """Yield the one POST request that starts the crawl."""
        request = FormRequest(
            url=self.url,
            method='POST',
            headers=self.headers,
            formdata=self.params,
            callback=self.parse,
        )
        yield request

    def parse(self, response):
        """Decode the response body as JSON and print it."""
        payload = response.json()
        print(payload)
import scrapy
from scrapy import FormRequest
from scrapy.crawler import CrawlerProcess
from scrapy.http import Request
class TestSpider(scrapy.Spider):
    """Spider that POSTs a single form-encoded query to the Algolia index.

    NOTE: this is the request the question reports as failing with
    "HTTP status code is not handled or not allowed".
    """

    name = 'test'

    # Algolia query endpoint for the exhibitor index.
    url="https://xd0u5m6y4r-dsn.algolia.net/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query"

    # Browser-like request headers sent with the query.
    headers = {
        'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8,pt;q=0.7',
        'Connection': 'keep-alive',
        'Origin': 'https://www.nationalhardwareshow.com',
        'Referer': 'https://www.nationalhardwareshow.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'cross-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
        'accept': 'application/json',
        'content-type': 'application/x-www-form-urlencoded',
        'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }

    # Fields submitted as the form body: the Algolia identifiers plus the
    # search parameters themselves.
    params = {
        'x-algolia-agent': 'Algolia for vanilla JavaScript 3.27.1',
        'x-algolia-application-id': 'XD0U5M6Y4R',
        'x-algolia-api-key': 'd5cd7d4ec26134ff4a34d736a7f9ad47',
        'params':'query=&page=0&facetFilters=&optionalFilters=[]',
    }

    def start_requests(self):
        """Yield the one POST request that starts the crawl."""
        request = FormRequest(
            url=self.url,
            method='POST',
            headers=self.headers,
            formdata=self.params,
            callback=self.parse,
        )
        yield request

    def parse(self, response):
        """Decode the response body as JSON and print it."""
        payload = response.json()
        print(payload)
CodePudding user response:
You are getting "HTTP status code is not handled or not allowed"
because of the superfluous headers and the way the parameters are sent.
import scrapy
import json
from scrapy.crawler import CrawlerProcess
class TestSpider(scrapy.Spider):
    """Spider that queries the Algolia index once and yields company names."""

    name = 'test'

    # Throttle the crawl: one request at a time per domain, one second apart.
    custom_settings = {
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'DOWNLOAD_DELAY': 1
    }

    def start_requests(self):
        """Yield a single POST whose body is the JSON-encoded search params."""
        # The Algolia identifiers travel in the query string; only the search
        # parameters go in the request body.
        endpoint = (
            'https://xd0u5m6y4r-2.algolianet.com/1/indexes/'
            'event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query'
            '?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1'
            '&x-algolia-application-id=XD0U5M6Y4R'
            '&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47'
        )
        request_headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
        }
        payload = {"params": "query=&page=0&facetFilters=&optionalFilters=[]"}

        yield scrapy.Request(
            url=endpoint,
            method='POST',
            headers=request_headers,
            body=json.dumps(payload),
            callback=self.parse
        )

    def parse(self, response):
        """Yield one ``{'Title': companyName}`` item per entry in ``hits``."""
        decoded = json.loads(response.body)
        yield from ({'Title': hit['companyName']} for hit in decoded['hits'])
if __name__ == "__main__":
    # CrawlerProcess takes optional settings in its constructor, not a spider
    # class; the spider to run is handed to crawl() instead.
    process = CrawlerProcess()
    process.crawl(TestSpider)
    # start() blocks until the scheduled crawl has finished.
    process.start()
Output:
{'Title': 'Bug Bite Thing'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'BULA'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Bunnik Creations'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'McCordick Glove & Safety Inc'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Burro Creative Solutions'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Bytech/Case Logic USA'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Cable Lasso'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Caframo Ltd'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'California Air Tools'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Calloway Mills/Home and More'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Camp Chef'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Canadian Spa Company'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'CAPS-LOCK'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Carson LLC'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Cascade Holdings'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Catania Oils'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'CCH Products'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'CedarCraft'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': "Central Garden & Pet/Pennington/Howard Johnson's Enterprises"}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Centrex Plastic LLC./American Plastics'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Chaby International'}
2022-07-10 04:59:07 [scrapy.core.scraper] DEBUG: Scraped from <200 https://xd0u5m6y4r-2.algolianet.com/1/indexes/event-edition-eve-e6b1ae25-5b9f-457b-83b3-335667332366_en-us/query?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1&x-algolia-application-id=XD0U5M6Y4R&x-algolia-api-key=d5cd7d4ec26134ff4a34d736a7f9ad47>
{'Title': 'Changzhou Feiwang Tool Co.,Ltd.'}
2022-07-10 04:59:07 [scrapy.core.engine] INFO: Closing spider (finished)
2022-07-10 04:59:07 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 643,
'downloader/request_count': 1,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 117197,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'elapsed_time_seconds': 3.180524,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2022, 7, 9, 22, 59, 7, 202813),
'httpcompression/response_bytes': 765918,
'httpcompression/response_count': 1,
'item_scraped_count': 100,