I want to scrape data from a website that loads its data from a hidden API, but sending the form data from Scrapy is not working. This is the main URL of the website: 'https://www.priceline.com/relax/at/478502/from/20220523/to/20220527/rooms/1/adults/2?vrid=2af9fb11ff31fc1a4170ac6a891116da'
and this is the API URL: 'https://www.priceline.com/pws/v0/pcln-graph/'
I POST the request with the form data, but I do not get any data back; I only get a 403 response code. This is the code:
# packages
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
from scrapy.http import FormRequest
import urllib
import os
import json
import csv
import datetime
# property scraper class
class ResidentialSale(scrapy.Spider):
    """Fetch a Priceline hotel page, then POST a GraphQL query to its API.

    Flow: ``start_requests`` GETs ``start_url``; ``parse`` then POSTs
    ``payload`` as JSON to ``base_url``; ``parse2`` receives the API response.
    """

    # scraper name
    name = 'therapists'

    start_url = 'https://www.priceline.com/relax/at/478502/from/20220523/to/20220527/rooms/1/adults/2?vrid=2af9fb11ff31fc1a4170ac6a891116da'
    base_url = 'https://www.priceline.com/pws/v0/pcln-graph/'

    # headers for the initial page request
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
    }

    # headers for the JSON POST to the GraphQL endpoint
    headers2 = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
        "Content-Type": "application/json",
        'origin': 'https://www.priceline.com',
        'referer': 'https://www.priceline.com/relax/at/478502/from/20220523/to/20220527/rooms/1/adults/2?vrid=2af9fb11ff31fc1a4170ac6a891116da',
    }

    # payload
    # The dict is serialized with json.dumps, so JSON booleans must be Python
    # bools here: `$addErrToResponse` is declared `Boolean` in the query, and
    # json.dumps turns True into the JSON literal `true`.
    # NOTE(review): "isSopqHotel" is still the string "false"; its type is
    # defined by ContentDealType, which is not visible here -- presumably it
    # should be the boolean False as well; confirm against the browser payload.
    # NOTE(review): "cguid" / "visitId" look like values copied from a browser
    # session -- confirm whether the API accepts stale ones.
    payload = {
        "query": (
            "query getHotelContentDeals($deals: [ContentDealType], $cguid: String, $rid: String, "
            "$at: String, $rguid: String, $visitId: String, $appc: String, "
            "$responseOptions: String, $addErrToResponse: Boolean, "
            "$googleMapStatic: GoogleMapStaticArguments) {\n "
            "hotelContent(deals: $deals, rid: $rid, at: $at, rguid: $rguid, cguid: $cguid, "
            "visitId: $visitId, appc: $appc, responseOptions: $responseOptions, "
            "addErrToResponse: $addErrToResponse, googleMapStatic: $googleMapStatic) {\n "
            "rguid\n errorMessage\n hotels {\n name\n starRating\n hotelId\n pclnId\n brandId\n "
            "chainCode\n taxId\n propertyTypeId\n quotes {\n text\n __typename\n }\n "
            "childrenStayFree\n maxChildrenStayFreeAge\n maxChildrenStayFreeNum\n "
            "customDesc {\n paragraphTitle\n text\n __typename\n }\n description\n "
            "hotelThemes {\n hotelThemeId\n hotelThemeName\n __typename\n }\n "
            "guaranteedBrandsIcon {\n icon\n name\n iconName\n __typename\n }\n "
            "policies {\n additionalPolicies\n cardsAccepted\n checkInTime\n checkOutTime\n "
            "parkingPolicy {\n policyText\n freeParking\n __typename\n }\n "
            "internetPolicy {\n policyText\n freeInternet\n __typename\n }\n "
            "childPolicy {\n policyText\n childrenStayFree\n __typename\n }\n "
            "childrenDescription\n importantInfo\n coronaInfoCheck\n coronaImportantInfo\n "
            "petDescription\n __typename\n }\n "
            "location {\n neighborhoodName\n neighborhoodDescription\n neighborhoodId\n lat\n lon\n "
            "address {\n addressLine1\n cityName\n provinceCode\n countryName\n zip\n phone\n "
            "isoCountryCode\n __typename\n }\n googleMapStatic {\n url\n __typename\n }\n "
            "cityID\n zoneId\n __typename\n }\n "
            "hotelFeatures {\n breakfastDetails\n features\n topAmenities\n "
            "hotelAmenities {\n code\n displayable\n filterable\n free\n name\n type\n category\n "
            "categoryId\n globalAmenityName\n relatedImages {\n urls\n __typename\n }\n "
            "__typename\n }\n cleanlinessAmensList\n highlightedAmenities\n "
            "amenityCategories {\n categoryId\n relatedImages {\n urls\n __typename\n }\n "
            "__typename\n }\n __typename\n }\n "
            "hotelOtherInfo {\n hotelOtherInfoData {\n id\n name\n detail\n __typename\n }\n "
            "__typename\n }\n images {\n imageHDUrl\n imageUrl\n __typename\n }\n "
            "__typename\n }\n __typename\n }\n}\n"
        ),
        "variables": {
            "deals": [{"dealId": "478502", "isSopqHotel": "false"}],
            "appc": "DESKTOP",
            "rid": "DTDIRECT",
            "responseOptions": "ALL_AMENITIES,HOTEL_IMAGES,UHD_IMAGES",
            "cguid": "b6a02daf29ebfcd2d3a1f83498e688da",
            "visitId": "2021102715122841282f06-RRLXGQD",
            "addErrToResponse": True,
            "googleMapStatic": {"size": {"x": 320}},
        },
        "operationName": "getHotelContentDeals",
    }

    # Runs once, when the class body is executed: remove a leftover output
    # file if present; a missing file is not an error.
    try:
        os.remove('abx.csv')
    except OSError:
        pass

    # custom settings
    custom_settings = {
        # The setting name is plural ("REQUESTS"); the misspelled singular
        # key used before is not the setting Scrapy documents.
        'CONCURRENT_REQUESTS_PER_DOMAIN': 2,
        'DOWNLOAD_DELAY': 1,
    }

    # general crawler
    def start_requests(self):
        """Yield the initial GET request for the hotel page."""
        # initial HTTP request
        yield scrapy.Request(
            url=self.start_url,
            headers=self.headers,
            callback=self.parse,
        )

    def parse(self, res):
        """Print the page status, then POST the GraphQL payload as JSON.

        A plain ``scrapy.Request`` is used because the body is a JSON
        document, not URL-encoded form fields.
        """
        print(res.status)
        yield scrapy.Request(
            url=self.base_url,
            body=json.dumps(self.payload),
            method="POST",
            headers=self.headers2,
            callback=self.parse2,
        )

    def parse2(self, response):
        """Print the API response object.

        TODO: writing rows to a CSV file is not implemented yet.
        """
        print(response)
if __name__ == '__main__':
    # Script entry point: create a crawler process, register the spider
    # class with it, and start the crawl.
    crawler = CrawlerProcess()
    crawler.crawl(ResidentialSale)
    crawler.start()
All the information is in this script.
The error I am getting is this:
2021-10-27 21:13:32 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2021-10-27 21:13:34 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.priceline.com/relax/at/478502/from/20220523/to/20220527/rooms/1/adults/2?vrid=2af9fb11ff31fc1a4170ac6a891116da> (referer: None)
200
2021-10-27 21:13:34 [scrapy.core.engine] DEBUG: Crawled (403) <POST https://www.priceline.com/pws/v0/pcln-graph/> (referer: https://www.priceline.com/relax/at/478502/from/20220523/to/20220527/rooms/1/adults/2?vrid=2af9fb11ff31fc1a4170ac6a891116da)
2021-10-27 21:13:35 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <403 https://www.priceline.com/pws/v0/pcln-graph/>: HTTP status code is not handled or not allowed
2021-10-27 21:13:35 [scrapy.core.engine] INFO: Closing spider (finished)
2021-10-27 21:13:35 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
CodePudding user response:
Here is a working example. The problem was in the body, i.e. the request payload. The request payload is JSON, where the boolean literals are lowercase (true/false). In the spider, however, the payload is written as a Python dict and passed through json.dumps, so every true/false must be written as Python's True/False; json.dumps then serializes them back to JSON true/false. There were a few true/false values in the request payload; I changed them to True/False in the body manually and got a 200 response status.
Code:
import scrapy
import json
class PriceLine(scrapy.Spider):
    """POST a ``getHotelDetails`` GraphQL query and yield one item per room."""

    name = 'price'

    def start_requests(self):
        """Yield the single JSON POST request to the GraphQL endpoint."""
        query = (
            "query getHotelDetails($hotelID: ID, $allInclusive: Boolean, $checkIn: String, "
            "$checkOut: String, $roomsCount: Int, $cguid: ID, $cugdor: String, "
            "$currencyCode: String, $pclnID: ID, $metaID: ID, $metaHotelId: ID, "
            "$rehabRateKey: ID, $preferredRateID: ID, $rID: ID, $rateDisplayOption: String, "
            "$rguid: ID, $visitId: String, $refClickID: String, $reviewCount: Float, "
            "$paymentRateMerge: Boolean, $multiOccDisplay: Boolean, $multiOccRates: Boolean, "
            "$appCode: String, $adults: Int, $children: [String], $unlockDeals: Boolean, "
            "$authToken: ID, $responseOptions: String, $includePrepaidFeeRates: Boolean, "
            "$addErrToResponse: Boolean, "
            "$packagesDetailsSearchQuery: HotelPsapiDetailsArguments) {\n "
            "details: hotelDetails(hotelID: $hotelID, checkIn: $checkIn, checkOut: $checkOut, "
            "roomsCount: $roomsCount, cguid: $cguid, cugdor: $cugdor, "
            "currencyCode: $currencyCode, pclnID: $pclnID, metaID: $metaID, "
            "metaHotelId: $metaHotelId, rehabRateKey: $rehabRateKey, "
            "preferredRateID: $preferredRateID, rID: $rID, "
            "rateDisplayOption: $rateDisplayOption, rguid: $rguid, visitId: $visitId, "
            "refClickID: $refClickID, reviewCount: $reviewCount, "
            "paymentRateMerge: $paymentRateMerge, multiOccDisplay: $multiOccDisplay, "
            "multiOccRates: $multiOccRates, appCode: $appCode, adults: $adults, "
            "children: $children, allInclusive: $allInclusive, unlockDeals: $unlockDeals, "
            "authToken: $authToken, responseOptions: $responseOptions, "
            "includePrepaidFeeRates: $includePrepaidFeeRates, "
            "addErrToResponse: $addErrToResponse, "
            "packagesDetailsSearchQuery: $packagesDetailsSearchQuery) {\n "
            "rguid\n errorMessage\n hotel {\n pkgComponentIndex\n maxPricedOccupancy\n maxOccupancy\n "
            "merchandisingInfo {\n color\n badgeText\n bannerHeader\n bannerText\n __typename\n }\n "
            "reasonsToBook {\n color\n icon\n header\n substring\n __typename\n }\n "
            "hotelViewCount {\n cumulativeViewCount\n __typename\n }\n "
            "commonRoomAmenities {\n type\n name\n __typename\n }\n "
            "recmdScore\n totalReviewCount\n overallGuestRating\n "
            "rooms {\n isUnlockedMemberDeal\n displayableRates {\n originalRates {\n gid\n "
            "__typename\n }\n __typename\n }\n __typename\n }\n "
            "transformedRooms {\n maxPricedOccupancy\n roomDisplayName\n maxOccupancy\n "
            "isGreatForFamily\n roomId\n longDescription\n roomFacilities\n "
            "cleanliness {\n score\n totalReviews\n __typename\n }\n "
            "beddingOption\n bedCount\n roomThumbnailUrl\n roomSize\n "
            "amenities {\n code\n __typename\n }\n "
            "imageUrls {\n largeUrl\n mediumUrl\n __typename\n }\n "
            "roomOccupancies {\n roomCode\n numberOfAdults\n numberOfChildren\n numberOfBeds\n "
            "numberOfRooms\n __typename\n }\n "
            "roomRates {\n cartToken\n "
            "pkgPriceInformation {\n totalCost\n totalCostPerTraveler\n "
            "totalCostWithHotelMandatoryFees\n totalPayNow\n totalPayLater\n totalSavings\n "
            "originalCostPerTraveler\n totalStrikethrough\n totalHotelMandatoryFees\n "
            "roomMandatoryFees\n __typename\n }\n "
            "preferredRateFlag\n pricedOccupancy\n couponApplicable\n suggestedNumOfRooms\n "
            "mergedRate {\n isFullyUnlocked\n rateIdentifier\n price\n grandTotal\n "
            "currencySymbol\n roomsLeft\n cancellationPolicy\n cancellationPolicyLongText\n "
            "cancellationMsg\n refundPolicy\n debugString\n paymentOptionsText\n feeAmount\n "
            "isPayLater\n isUniversalCartEligible\n isXSellEligible\n __typename\n }\n "
            "isPayLater\n rateIdentifier\n isBestDeal\n price\n grandTotal\n currencySymbol\n "
            "roomsLeft\n strikeThroughPrice\n isFreeCancellation\n cancellationPolicy\n "
            "cancellationPolicyLongText\n cancellationMsg\n ccRequired\n refundPolicy\n "
            "savingPct\n payLaterMessage\n feeAmount\n bannerText\n programName\n "
            "merchandisingFlag\n "
            "rateLevelAmenities {\n name\n isHighlighted\n __typename\n }\n "
            "totalPriceExcludingTaxesAndFeePerStay\n paymentOptionsText\n disclaimerMessage\n "
            "debugString\n "
            "promos {\n promoType\n isVariableMarkupPromo\n title\n desc\n isHighlighted\n "
            "__typename\n }\n "
            "isFullyUnlocked\n incrementalPricingIconName\n isUniversalCartEligible\n "
            "basketPriceKey\n isXSellEligible\n itemDetailsKey\n bundlePriceKey\n rateKey\n "
            "__typename\n }\n "
            "cartToken\n basketPriceKey\n itemDetailsKey\n priceKey\n bundlePriceKey\n token\n "
            "planCode\n rateTypeCode\n gdsName\n __typename\n }\n "
            "guestReviews {\n firstName\n overallScore\n reviewTextGeneral\n "
            "reviewTextNegative\n reviewTextPositive\n sourceCode\n travelerType\n "
            "travelerTypeId\n creationDate\n __typename\n }\n "
            "reviewRatingSummary {\n "
            "ratings {\n description\n label\n score\n summaryCount\n summaryValue\n "
            "__typename\n }\n "
            "travelerType {\n count\n id\n type\n __typename\n }\n __typename\n }\n "
            "signInDealsAvailable\n signInDealsMinRate\n "
            "ratings {\n category\n score\n __typename\n }\n "
            "bookings {\n firstName\n lastNameInitial\n bookedPrice\n bookedCurrencyCode\n "
            "justBookedBadge\n __typename\n }\n "
            "ratesSummary {\n pricedOccupancy\n suggestedNumOfRooms\n freeCancelableRateAvail\n "
            "minPrice\n totalCostPerTraveler\n minStrikePrice\n promptUserToNativeApp\n "
            "savingsClaimStrikePrice\n savingsClaimDisclaimer\n savingsClaimPercentage\n "
            "minCurrencyCodeSymbol\n minCurrencyCode\n roomLeft\n payWhenYouStayAvailable\n "
            "pclnId\n programName\n merchandisingFlag\n preferredRateId\n rateIdentifier\n "
            "showRecommendation\n suggestedNumOfRooms\n status\n __typename\n }\n "
            "hasNodateRooms\n isAllInclusiveHotel\n "
            "location {\n neighborhoodDescription\n __typename\n }\n "
            "hotelFeatures {\n features\n highlightedAmenities\n "
            "hotelAmenities {\n code\n displayable\n free\n name\n type\n __typename\n }\n "
            "topAmenities\n breakfastDetails\n __typename\n }\n "
            "policies {\n checkInTime\n checkOutTime\n petDescription\n childrenDescription\n "
            "importantInfo\n __typename\n }\n "
            "itemKey\n basketItemKey\n componentKey\n "
            "retailPrice {\n pricePerPerson\n displayPricePerPerson\n amount\n displayAmount\n "
            "__typename\n }\n "
            "images {\n imageHDURL\n imageURL\n __typename\n }\n "
            "__typename\n }\n "
            "componentKeyMap\n los\n "
            "signInDealRelatedInfo {\n promptUserToSignIn\n __typename\n }\n "
            "__typename\n }\n}\n"
        )
        # Booleans are Python bools; json.dumps emits them as JSON true/false.
        body = {
            "query": query,
            "variables": {
                "appCode": "DESKTOP",
                "cguid": "0175bb9aa41f22723c5b1eefa03d025c",
                "checkIn": "20220523",
                "checkOut": "20220527",
                "rID": "DTDIRECT",
                "roomsCount": 1,
                "currencyCode": "USD",
                "refClickID": "",
                "unlockDeals": True,
                "includePrepaidFeeRates": True,
                "visitId": "202110271625255708419d-RRLXGQD",
                "addErrToResponse": True,
                "adults": 2,
                "paymentRateMerge": False,
                "multiOccDisplay": True,
                "multiOccRates": True,
                "hotelID": "478502",
                "rateDisplayOption": "S",
                "reviewCount": 5,
                "responseOptions": "POP_COUNT,REVIEWS,CUSTOM_DESC,RATE_SUMMARY,RATINGS,DETAILED_ROOM,HOTEL_IMAGES,RATE_IMPORTANT_INFO,RATE_CHARGES_DETAIL,PROXIMITY,BOOKINGS,NORATEROOMS,REFUND_INFO",
            },
            "operationName": "getHotelDetails",
        }
        yield scrapy.Request(
            url='https://www.priceline.com/pws/v0/pcln-graph/',
            callback=self.parse,
            method="POST",
            body=json.dumps(body),
            # No hard-coded 'content-length': the value copied from the
            # browser ('3452') is not tied to the length of json.dumps(body),
            # so the header is left for the HTTP layer to fill in.
            # NOTE(review): 'accept-encoding' advertises 'br'; confirm that
            # brotli decoding is available in the environment running this.
            headers={
                'accept': ' */*',
                'accept-encoding': 'gzip, deflate, br',
                'accept-language': 'en-US,en;q=0.9,bn;q=0.8,es;q=0.7,ar;q=0.6',
                'apollographql-client-name': 'relax',
                'apollographql-client-version': 'master-1.1.813',
                'content-type': 'application/json',
                'origin': 'https://www.priceline.com',
                'referer': 'https://www.priceline.com/relax/at/478502/from/20220523/to/20220527/rooms/1/adults/2?vrid=c97c644f8dd3411a3a8337ad364d86bc',
                'sec-ch-ua-mobile': '?0',
                'sec-fetch-dest': 'empty',
                'sec-fetch-mode': 'cors',
                'sec-fetch-site': 'same-origin',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36',
            },
        )

    def parse(self, response):
        """Yield ``{'roomDisplayName': ...}`` for every transformed room.

        If the response carries no ``data.details.hotel.transformedRooms``
        (for example an error payload), a warning is logged and nothing is
        yielded instead of raising on a missing or null key.
        """
        resp = json.loads(response.body)
        details = (resp.get('data') or {}).get('details') or {}
        hotel = details.get('hotel') or {}
        rooms = hotel.get('transformedRooms') or []
        if not rooms:
            self.logger.warning(
                'No rooms in response (errorMessage=%r, errors=%r)',
                details.get('errorMessage'),
                resp.get('errors'),
            )
        for room in rooms:
            yield {
                'roomDisplayName': room['roomDisplayName']}
Output:
{'roomDisplayName': 'Standard Room'}
2021-10-28 01:40:26 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.priceline.com/pws/v0/pcln-graph/>
{'roomDisplayName': 'Standard Room'}
2021-10-28 01:40:26 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.priceline.com/pws/v0/pcln-graph/>
{'roomDisplayName': 'Standard Room with River View'}
2021-10-28 01:40:26 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.priceline.com/pws/v0/pcln-graph/>
{'roomDisplayName': 'Superior Room with Cathedral View'}
2021-10-28 01:40:26 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.priceline.com/pws/v0/pcln-graph/>
{'roomDisplayName': 'Family Room'}