I'm having a weird issue where my API request works perfectly with the standard `requests` library, but returns a 403 when I make the same call through Scrapy. The error message does not give me anything useful beyond the "403 error" message. I have removed my API key from this post, but you can easily obtain your own. Let me know if you need help getting an API key.
Working Python code (no Scrapy):
import requests
# Airbnb ExploreSections API endpoint.
url = "https://www.airbnb.ca/api/v3/ExploreSections"

# Query-string parameters. "variables" and "extensions" carry JSON documents
# that are already serialized as strings, so they are passed through as-is.
querystring = {
    "operationName": "ExploreSections",
    "locale": "en-CA",
    "currency": "CAD",
    "_cb": "1db02z70xkcr690n1h3gp0py4nmy",
    "variables": "{\"isInitialLoad\":true,\"hasLoggedIn\":false,\"cdnCacheSafe\":false,\"source\":\"EXPLORE\",\"exploreRequest\":{\"metadataOnly\":false,\"version\":\"1.8.3\",\"itemsPerGrid\":20,\"tabId\":\"home_tab\",\"refinementPaths\":[\"/homes\"],\"flexibleTripDates\":[\"february\",\"march\"],\"flexibleTripLengths\":[\"weekend_trip\"],\"datePickerType\":\"calendar\",\"placeId\":\"ChIJpTvG15DL1IkRd8S0KlBVNTI\",\"checkin\":\"2022-03-15\",\"checkout\":\"2022-03-16\",\"adults\":2,\"source\":\"structured_search_input_header\",\"searchType\":\"autocomplete_click\",\"query\":\"Toronto, ON\",\"cdnCacheSafe\":false,\"treatmentFlags\":[\"flex_destinations_june_2021_launch_web_treatment\",\"new_filter_bar_v2_fm_header\",\"merch_header_breakpoint_expansion_web\",\"flexible_dates_12_month_lead_time\",\"storefronts_nov23_2021_homepage_web_treatment\",\"flexible_dates_options_extend_one_three_seven_days\",\"super_date_flexibility\",\"micro_flex_improvements\",\"micro_flex_show_by_default\",\"search_input_placeholder_phrases\",\"pets_fee_treatment\"],\"screenSize\":\"large\",\"isInitialLoad\":true,\"hasLoggedIn\":false},\"removeDuplicatedParams\":false}",
    "extensions": "{\"persistedQuery\":{\"version\":1,\"sha256Hash\":\"0d0a5c3b44e87ccaecf084cfc3027a175af11955cffa04bb986406e9b4bdfe6e\"}}",
}

# Request headers; replace YOUR_KEY with a real API key before running.
headers = {
    "x-airbnb-api-key": "YOUR_KEY",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
    "content-type": "application/json",
    "accept-language": "en-US,en;q=0.9",
}

# `requests` applies no timeout by default, so an unresponsive server would
# block this script forever; bound the wait explicitly.
response = requests.get(url, headers=headers, params=querystring, timeout=30)
print(response.text)
Scrapy spider code:
import scrapy
import json
from urllib.parse import urlencode
class ListingsSpider(scrapy.Spider):
    """Spider that calls Airbnb's ExploreSections API and yields the JSON reply."""

    name = 'listings'
    allowed_domains = ['airbnb.ca']

    def start_requests(self):
        """Yield the single GET request against the ExploreSections endpoint."""
        # Query-string parameters. "variables" and "extensions" carry JSON
        # documents that are already serialized as strings.
        query = {
            "operationName": "ExploreSections",
            "locale": "en-CA",
            "currency": "CAD",
            "_cb": "1db02z70xkcr690n1h3gp0py4nmy",
            "variables": "{\"isInitialLoad\":true,\"hasLoggedIn\":false,\"cdnCacheSafe\":false,\"source\":\"EXPLORE\",\"exploreRequest\":{\"metadataOnly\":false,\"version\":\"1.8.3\",\"itemsPerGrid\":20,\"tabId\":\"home_tab\",\"refinementPaths\":[\"/homes\"],\"flexibleTripDates\":[\"february\",\"march\"],\"flexibleTripLengths\":[\"weekend_trip\"],\"datePickerType\":\"calendar\",\"placeId\":\"ChIJpTvG15DL1IkRd8S0KlBVNTI\",\"checkin\":\"2022-03-15\",\"checkout\":\"2022-03-16\",\"adults\":2,\"source\":\"structured_search_input_header\",\"searchType\":\"autocomplete_click\",\"query\":\"Toronto, ON\",\"cdnCacheSafe\":false,\"treatmentFlags\":[\"flex_destinations_june_2021_launch_web_treatment\",\"new_filter_bar_v2_fm_header\",\"merch_header_breakpoint_expansion_web\",\"flexible_dates_12_month_lead_time\",\"storefronts_nov23_2021_homepage_web_treatment\",\"flexible_dates_options_extend_one_three_seven_days\",\"super_date_flexibility\",\"micro_flex_improvements\",\"micro_flex_show_by_default\",\"search_input_placeholder_phrases\",\"pets_fee_treatment\"],\"screenSize\":\"large\",\"isInitialLoad\":true,\"hasLoggedIn\":false},\"removeDuplicatedParams\":false}",
            "extensions": "{\"persistedQuery\":{\"version\":1,\"sha256Hash\":\"0d0a5c3b44e87ccaecf084cfc3027a175af11955cffa04bb986406e9b4bdfe6e\"}}",
        }
        endpoint = "https://www.airbnb.ca/api/v3/ExploreSections?" + urlencode(query)

        # NOTE(review): this reduced header set is the one reported to be
        # answered with HTTP 403 when sent through Scrapy.
        request_headers = {
            "x-airbnb-api-key": "YOUR_KEY",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
            "content-type": "application/json",
            "accept-language": "en-US,en;q=0.9",
        }

        yield scrapy.Request(
            url=endpoint,
            method='GET',
            headers=request_headers,
            callback=self.parse_listings,
        )

    def parse_listings(self, response):
        """Decode the raw response body as JSON and yield it as one item."""
        yield json.loads(response.body)
CodePudding user response:
The problem is with the headers. Copy the request headers from the browser and remove `Cookie` and `Content-Length` if they are present.
import scrapy
import json
from urllib.parse import urlencode
class ListingsSpider(scrapy.Spider):
    """Spider that calls Airbnb's ExploreSections API with browser-like headers."""

    name = 'listings'
    allowed_domains = ['airbnb.ca']

    # Per-spider Scrapy settings: pause between consecutive downloads.
    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
    }

    def start_requests(self):
        """Yield the single GET request against the ExploreSections endpoint."""
        # Query-string parameters. "variables" and "extensions" carry JSON
        # documents that are already serialized as strings.
        query = {
            "operationName": "ExploreSections",
            "locale": "en-CA",
            "currency": "CAD",
            "_cb": "1db02z70xkcr690n1h3gp0py4nmy",
            "variables": "{\"isInitialLoad\":true,\"hasLoggedIn\":false,\"cdnCacheSafe\":false,\"source\":\"EXPLORE\",\"exploreRequest\":{\"metadataOnly\":false,\"version\":\"1.8.3\",\"itemsPerGrid\":20,\"tabId\":\"home_tab\",\"refinementPaths\":[\"/homes\"],\"flexibleTripDates\":[\"february\",\"march\"],\"flexibleTripLengths\":[\"weekend_trip\"],\"datePickerType\":\"calendar\",\"placeId\":\"ChIJpTvG15DL1IkRd8S0KlBVNTI\",\"checkin\":\"2022-03-15\",\"checkout\":\"2022-03-16\",\"adults\":2,\"source\":\"structured_search_input_header\",\"searchType\":\"autocomplete_click\",\"query\":\"Toronto, ON\",\"cdnCacheSafe\":false,\"treatmentFlags\":[\"flex_destinations_june_2021_launch_web_treatment\",\"new_filter_bar_v2_fm_header\",\"merch_header_breakpoint_expansion_web\",\"flexible_dates_12_month_lead_time\",\"storefronts_nov23_2021_homepage_web_treatment\",\"flexible_dates_options_extend_one_three_seven_days\",\"super_date_flexibility\",\"micro_flex_improvements\",\"micro_flex_show_by_default\",\"search_input_placeholder_phrases\",\"pets_fee_treatment\"],\"screenSize\":\"large\",\"isInitialLoad\":true,\"hasLoggedIn\":false},\"removeDuplicatedParams\":false}",
            "extensions": "{\"persistedQuery\":{\"version\":1,\"sha256Hash\":\"0d0a5c3b44e87ccaecf084cfc3027a175af11955cffa04bb986406e9b4bdfe6e\"}}",
        }
        endpoint = "https://www.airbnb.ca/api/v3/ExploreSections?" + urlencode(query)

        # Full header set as sent by a browser, minus Cookie and
        # Content-Length. Replace API_KEY with a real key before running.
        request_headers = {
            "Accept": "*/*",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.5",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "application/json",
            "DNT": "1",
            "Host": "www.airbnb.ca",
            "Pragma": "no-cache",
            "Referer": "https://www.airbnb.ca/",
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "Sec-GPC": "1",
            "TE": "trailers",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
            "X-Airbnb-API-Key": "API_KEY",
            "X-Airbnb-GraphQL-Platform": "web",
            "X-Airbnb-GraphQL-Platform-Client": "minimalist-niobe",
            "X-Airbnb-Supports-Airlock-V2": "true",
            "X-CSRF-Token": "null",
            "X-CSRF-Without-Token": "1",
            "X-KL-Ajax-Request": "Ajax_Request",
            "X-Niobe-Short-Circuited": "true",
        }

        yield scrapy.Request(
            url=endpoint,
            method='GET',
            headers=request_headers,
            callback=self.parse_listings,
        )

    def parse_listings(self, response):
        """Parse the response as JSON and yield the resulting object as one item."""
        yield response.json()
Output:
{'data': {'presentation': {'__typename': 'RootPresentationContainer', 'explore': {'__typename': 'ExplorePresentation', 'sections': {'__typename': 'ExploreSections', 'sections': [{'__typename': 'SectionContainer', 'id':
...
...
...
Getting the request headers (example with Chrome):