Home > other >  I want to scrape data from a website that has a hidden API, but sending form data is also not working
I want to scrape data from a website that has a hidden API, but sending form data is also not working

Time:10-28

I want to scrape data from this website. It has a hidden API that I want to extract data from, but sending the form data does not work in Scrapy either. This is the website's main URL: 'https://www.priceline.com/relax/at/478502/from/20220523/to/20220527/rooms/1/adults/2?vrid=2af9fb11ff31fc1a4170ac6a891116da'

and this is the api url 'https://www.priceline.com/pws/v0/pcln-graph/'

I POST the request with the form data, but I do not get any data back; all I get is a 403 response code. This is the code:

 # packages
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
from scrapy.http import FormRequest
import urllib
import os
import json
import csv
import datetime

# property scraper class
class ResidentialSale(scrapy.Spider):
    # scraper name (the spider's ``name`` attribute)
    name = 'therapists'
    # Hotel page that start_requests() fetches first with a GET.
    start_url = 'https://www.priceline.com/relax/at/478502/from/20220523/to/20220527/rooms/1/adults/2?vrid=2af9fb11ff31fc1a4170ac6a891116da'
    # API endpoint that parse() POSTs the JSON payload to.
    base_url = 'https://www.priceline.com/pws/v0/pcln-graph/'
    # Headers sent with the initial GET of start_url: only a browser-like
    # user agent string.
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
    }


    # Headers sent with the POST to base_url. Only the uncommented entries
    # (User-Agent, Content-Type, origin, referer) are actually sent; the
    # commented-out entries below are disabled.
    headers2 = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
        #"Accept": "*/*",
        #"Accept-Encoding": "gzip, deflate, br",
        #"Accept-Language": "en-US,en;q=0.9,bn;q=0.8,es;q=0.7,ar;q=0.6",
        #"Connection": "keep-alive",
        #"Content-Length": "1843",
        "Content-Type": "application/json",
        #"Host": "apis.airportthai.co.th",
        'origin': 'https://www.priceline.com',
        'referer': 'https://www.priceline.com/relax/at/478502/from/20220523/to/20220527/rooms/1/adults/2?vrid=2af9fb11ff31fc1a4170ac6a891116da',
        #"sec-ch-ua-mobile": "?0"

    }


    # payload
    payload = {"query":"query getHotelContentDeals($deals: [ContentDealType], $cguid: String, $rid: String, $at: String, $rguid: String, $visitId: String, $appc: String, $responseOptions: String, $addErrToResponse: Boolean, $googleMapStatic: GoogleMapStaticArguments) {\n  hotelContent(deals: $deals, rid: $rid, at: $at, rguid: $rguid, cguid: $cguid, visitId: $visitId, appc: $appc, responseOptions: $responseOptions, addErrToResponse: $addErrToResponse, googleMapStatic: $googleMapStatic) {\n    rguid\n    errorMessage\n    hotels {\n      name\n      starRating\n      hotelId\n      pclnId\n      brandId\n      chainCode\n      taxId\n      propertyTypeId\n      quotes {\n        text\n        __typename\n      }\n      childrenStayFree\n      maxChildrenStayFreeAge\n      maxChildrenStayFreeNum\n      customDesc {\n        paragraphTitle\n        text\n        __typename\n      }\n      description\n      hotelThemes {\n        hotelThemeId\n        hotelThemeName\n        __typename\n      }\n      guaranteedBrandsIcon {\n        icon\n        name\n        iconName\n        __typename\n      }\n      policies {\n        additionalPolicies\n        cardsAccepted\n        checkInTime\n        checkOutTime\n        parkingPolicy {\n          policyText\n          freeParking\n          __typename\n        }\n        internetPolicy {\n          policyText\n          freeInternet\n          __typename\n        }\n        childPolicy {\n          policyText\n          childrenStayFree\n          __typename\n        }\n        childrenDescription\n        importantInfo\n        coronaInfoCheck\n        coronaImportantInfo\n        petDescription\n        __typename\n      }\n      location {\n        neighborhoodName\n        neighborhoodDescription\n        neighborhoodId\n        lat\n        lon\n        address {\n          addressLine1\n          cityName\n          provinceCode\n          countryName\n          zip\n          phone\n          isoCountryCode\n        
  __typename\n        }\n        googleMapStatic {\n          url\n          __typename\n        }\n        cityID\n        zoneId\n        __typename\n      }\n      hotelFeatures {\n        breakfastDetails\n        features\n        topAmenities\n        hotelAmenities {\n          code\n          displayable\n          filterable\n          free\n          name\n          type\n          category\n          categoryId\n          globalAmenityName\n          relatedImages {\n            urls\n            __typename\n          }\n          __typename\n        }\n        cleanlinessAmensList\n        highlightedAmenities\n        amenityCategories {\n          categoryId\n          relatedImages {\n            urls\n            __typename\n          }\n          __typename\n        }\n        __typename\n      }\n      hotelOtherInfo {\n        hotelOtherInfoData {\n          id\n          name\n          detail\n          __typename\n        }\n        __typename\n      }\n      images {\n        imageHDUrl\n        imageUrl\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n}\n","variables":{"deals":[{"dealId":"478502","isSopqHotel":"false"}],"appc":"DESKTOP","rid":"DTDIRECT","responseOptions":"ALL_AMENITIES,HOTEL_IMAGES,UHD_IMAGES","cguid":"b6a02daf29ebfcd2d3a1f83498e688da","visitId":"2021102715122841282f06-RRLXGQD","addErrToResponse":"true","googleMapStatic":{"size":{"x":320}}},"operationName":"getHotelContentDeals"}


    # Remove any stale 'abx.csv' from the working directory. NOTE: this runs
    # once, when the class body is executed (i.e. when the module is loaded),
    # not once per crawl.
    try:
        os.remove('abx.csv')
    except OSError:
        # Best effort: a missing file is the normal case on a first run.
        pass

    # custom settings
    custom_settings = {
        # The Scrapy setting is spelled CONCURRENT_REQUESTS_PER_DOMAIN
        # (plural "REQUESTS"); a key without the "S" is not a known setting
        # and would leave the per-domain limit at its default.
        'CONCURRENT_REQUESTS_PER_DOMAIN': 2,
        'DOWNLOAD_DELAY': 1,
    }

    # general crawler
    def start_requests(self):
        """Start the crawl with a plain GET of the hotel page.

        Yields a single request for ``start_url`` that carries ``headers``
        and routes its response to :meth:`parse`.
        """
        landing_request = scrapy.Request(
            self.start_url,
            headers=self.headers,
            callback=self.parse,
        )
        yield landing_request
    def parse(self, res):
        """Handle the hotel page response and query the content API.

        Prints the HTTP status of the page response, then yields a POST to
        ``base_url`` whose body is ``payload`` serialized as JSON. The API
        response is routed to :meth:`parse2`.

        Args:
            res: response obtained for ``start_url``.
        """
        print(res.status)

        # The payload travels as a raw JSON body, so a plain Request is the
        # right type; FormRequest is meant for URL-encoded ``formdata``,
        # which this request does not use.
        yield scrapy.Request(
            url=self.base_url,
            method="POST",
            body=json.dumps(self.payload),
            headers=self.headers2,
            callback=self.parse2,
        )

    def parse2(self, response):
        """Print the API response object; no items are extracted yet."""
        print(response)
        # Disabled CSV export sketch (``items`` is not defined anywhere in
        # this spider, so it cannot simply be re-enabled):
        #   with open('qsranks.csv', 'a') as csv_file:
        #       writer = csv.DictWriter(csv_file, fieldnames=items.keys())
        #       writer.writerow(items)
if __name__ == '__main__':
    # Script entry point: run the ResidentialSale spider via CrawlerProcess.
    crawler_process = CrawlerProcess()
    crawler_process.crawl(ResidentialSale)
    crawler_process.start()

    #ResidentialSale.parse(ResidentialSale, '')

All the information is in this script.

This is the error I am getting:

2021-10-27 21:13:32 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2021-10-27 21:13:34 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.priceline.com/relax/at/478502/from/20220523/to/20220527/rooms/1/adults/2?vrid=2af9fb11ff31fc1a4170ac6a891116da> (referer: None)
200
2021-10-27 21:13:34 [scrapy.core.engine] DEBUG: Crawled (403) <POST https://www.priceline.com/pws/v0/pcln-graph/> (referer: https://www.priceline.com/relax/at/478502/from/20220523/to/20220527/rooms/1/adults/2?vrid=2af9fb11ff31fc1a4170ac6a891116da)
2021-10-27 21:13:35 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <403 https://www.priceline.com/pws/v0/pcln-graph/>: HTTP status code is not handled or not allowed
2021-10-27 21:13:35 [scrapy.core.engine] INFO: Closing spider (finished)
2021-10-27 21:13:35 [scrapy.statscollectors] INFO: Dumping Scrapy stats:

CodePudding user response:

Here is a working example. The problem was in the body, i.e. the request payload. The payload copied from the browser is JSON, where the boolean values true/false are lowercase. When that payload is pasted into Python as a dict literal, every true/false must be written as True/False, because lowercase true/false are not valid Python names; json.dumps then converts them back to true/false when the body is serialized. There are a few true/false values in the request payload; I changed them to True/False by hand and got a 200 response.

Code:

import scrapy
import json

class PriceLine(scrapy.Spider):
    name = 'price'

    def start_requests(self):
        body = {"query":"query getHotelDetails($hotelID: ID, $allInclusive: Boolean, $checkIn: String, $checkOut: String, $roomsCount: Int, $cguid: ID, $cugdor: String, $currencyCode: String, $pclnID: ID, $metaID: ID, $metaHotelId: ID, $rehabRateKey: ID, $preferredRateID: ID, $rID: ID, $rateDisplayOption: String, $rguid: ID, $visitId: String, $refClickID: String, $reviewCount: Float, $paymentRateMerge: Boolean, $multiOccDisplay: Boolean, $multiOccRates: Boolean, $appCode: String, $adults: Int, $children: [String], $unlockDeals: Boolean, $authToken: ID, $responseOptions: String, $includePrepaidFeeRates: Boolean, $addErrToResponse: Boolean, $packagesDetailsSearchQuery: HotelPsapiDetailsArguments) {\n  details: hotelDetails(hotelID: $hotelID, checkIn: $checkIn, checkOut: $checkOut, roomsCount: $roomsCount, cguid: $cguid, cugdor: $cugdor, currencyCode: $currencyCode, pclnID: $pclnID, metaID: $metaID, metaHotelId: $metaHotelId, rehabRateKey: $rehabRateKey, preferredRateID: $preferredRateID, rID: $rID, rateDisplayOption: $rateDisplayOption, rguid: $rguid, visitId: $visitId, refClickID: $refClickID, reviewCount: $reviewCount, paymentRateMerge: $paymentRateMerge, multiOccDisplay: $multiOccDisplay, multiOccRates: $multiOccRates, appCode: $appCode, adults: $adults, children: $children, allInclusive: $allInclusive, unlockDeals: $unlockDeals, authToken: $authToken, responseOptions: $responseOptions, includePrepaidFeeRates: $includePrepaidFeeRates, addErrToResponse: $addErrToResponse, packagesDetailsSearchQuery: $packagesDetailsSearchQuery) {\n    rguid\n    errorMessage\n    hotel {\n      pkgComponentIndex\n      maxPricedOccupancy\n      maxOccupancy\n      merchandisingInfo {\n        color\n        badgeText\n        bannerHeader\n        bannerText\n        __typename\n      }\n      reasonsToBook {\n        color\n        icon\n        header\n        substring\n        __typename\n      }\n      hotelViewCount {\n        cumulativeViewCount\n        __typename\n      
}\n      commonRoomAmenities {\n        type\n        name\n        __typename\n      }\n      recmdScore\n      totalReviewCount\n      overallGuestRating\n      rooms {\n        isUnlockedMemberDeal\n        displayableRates {\n          originalRates {\n            gid\n            __typename\n          }\n          __typename\n        }\n        __typename\n      }\n      transformedRooms {\n        maxPricedOccupancy\n        roomDisplayName\n        maxOccupancy\n        isGreatForFamily\n        roomId\n        longDescription\n        roomFacilities\n        cleanliness {\n          score\n          totalReviews\n          __typename\n        }\n        beddingOption\n        bedCount\n        roomThumbnailUrl\n        roomSize\n        amenities {\n          code\n          __typename\n        }\n        imageUrls {\n          largeUrl\n          mediumUrl\n          __typename\n        }\n        roomOccupancies {\n          roomCode\n          numberOfAdults\n          numberOfChildren\n          numberOfBeds\n          numberOfRooms\n          __typename\n        }\n        roomRates {\n          cartToken\n          pkgPriceInformation {\n            totalCost\n            totalCostPerTraveler\n            totalCostWithHotelMandatoryFees\n            totalPayNow\n            totalPayLater\n            totalSavings\n            originalCostPerTraveler\n            totalStrikethrough\n            totalHotelMandatoryFees\n            roomMandatoryFees\n            __typename\n          }\n          preferredRateFlag\n          pricedOccupancy\n          couponApplicable\n          suggestedNumOfRooms\n          mergedRate {\n            isFullyUnlocked\n            rateIdentifier\n            price\n            grandTotal\n            currencySymbol\n            roomsLeft\n            cancellationPolicy\n            cancellationPolicyLongText\n            cancellationMsg\n            refundPolicy\n            debugString\n            paymentOptionsText\n  
          feeAmount\n            isPayLater\n            isUniversalCartEligible\n            isXSellEligible\n            __typename\n          }\n          isPayLater\n          rateIdentifier\n          isBestDeal\n          price\n          grandTotal\n          currencySymbol\n          roomsLeft\n          strikeThroughPrice\n          isFreeCancellation\n          cancellationPolicy\n          cancellationPolicyLongText\n          cancellationMsg\n          ccRequired\n          refundPolicy\n          savingPct\n          payLaterMessage\n          feeAmount\n          bannerText\n          programName\n          merchandisingFlag\n          rateLevelAmenities {\n            name\n            isHighlighted\n            __typename\n          }\n          totalPriceExcludingTaxesAndFeePerStay\n          paymentOptionsText\n          disclaimerMessage\n          debugString\n          promos {\n            promoType\n            isVariableMarkupPromo\n            title\n            desc\n            isHighlighted\n            __typename\n          }\n          isFullyUnlocked\n          incrementalPricingIconName\n          isUniversalCartEligible\n          basketPriceKey\n          isXSellEligible\n          itemDetailsKey\n          bundlePriceKey\n          rateKey\n          __typename\n        }\n        cartToken\n        basketPriceKey\n        itemDetailsKey\n        priceKey\n        bundlePriceKey\n        token\n        planCode\n        rateTypeCode\n        gdsName\n        __typename\n      }\n      guestReviews {\n        firstName\n        overallScore\n        reviewTextGeneral\n        reviewTextNegative\n        reviewTextPositive\n        sourceCode\n        travelerType\n        travelerTypeId\n        creationDate\n        __typename\n      }\n      reviewRatingSummary {\n        ratings {\n          description\n          label\n          score\n          summaryCount\n          summaryValue\n          __typename\n        }\n        
travelerType {\n          count\n          id\n          type\n          __typename\n        }\n        __typename\n      }\n      signInDealsAvailable\n      signInDealsMinRate\n      ratings {\n        category\n        score\n        __typename\n      }\n      bookings {\n        firstName\n        lastNameInitial\n        bookedPrice\n        bookedCurrencyCode\n        justBookedBadge\n        __typename\n      }\n      ratesSummary {\n        pricedOccupancy\n        suggestedNumOfRooms\n        freeCancelableRateAvail\n        minPrice\n        totalCostPerTraveler\n        minStrikePrice\n        promptUserToNativeApp\n        savingsClaimStrikePrice\n        savingsClaimDisclaimer\n        savingsClaimPercentage\n        minCurrencyCodeSymbol\n        minCurrencyCode\n        roomLeft\n        payWhenYouStayAvailable\n        pclnId\n        programName\n        merchandisingFlag\n        preferredRateId\n        rateIdentifier\n        showRecommendation\n        suggestedNumOfRooms\n        status\n        __typename\n      }\n      hasNodateRooms\n      isAllInclusiveHotel\n      location {\n        neighborhoodDescription\n        __typename\n      }\n      hotelFeatures {\n        features\n        highlightedAmenities\n        hotelAmenities {\n          code\n          displayable\n          free\n          name\n          type\n          __typename\n        }\n        topAmenities\n        breakfastDetails\n        __typename\n      }\n      policies {\n        checkInTime\n        checkOutTime\n        petDescription\n        childrenDescription\n        importantInfo\n        __typename\n      }\n      itemKey\n      basketItemKey\n      componentKey\n      retailPrice {\n        pricePerPerson\n        displayPricePerPerson\n        amount\n        displayAmount\n        __typename\n      }\n      images {\n        imageHDURL\n        imageURL\n        __typename\n      }\n      __typename\n    }\n    componentKeyMap\n    los\n    
signInDealRelatedInfo {\n      promptUserToSignIn\n      __typename\n    }\n    __typename\n  }\n}\n","variables":{"appCode":"DESKTOP","cguid":"0175bb9aa41f22723c5b1eefa03d025c","checkIn":"20220523","checkOut":"20220527","rID":"DTDIRECT","roomsCount":1,"currencyCode":"USD","refClickID":"","unlockDeals":True,"includePrepaidFeeRates":True,"visitId":"202110271625255708419d-RRLXGQD","addErrToResponse":True,"adults":2,"paymentRateMerge":False,"multiOccDisplay":True,"multiOccRates":True,"hotelID":"478502","rateDisplayOption":"S","reviewCount":5,"responseOptions":"POP_COUNT,REVIEWS,CUSTOM_DESC,RATE_SUMMARY,RATINGS,DETAILED_ROOM,HOTEL_IMAGES,RATE_IMPORTANT_INFO,RATE_CHARGES_DETAIL,PROXIMITY,BOOKINGS,NORATEROOMS,REFUND_INFO"},"operationName":"getHotelDetails"}
        
        yield scrapy.Request(
            url='https://www.priceline.com/pws/v0/pcln-graph/',
            callback=self.parse,
            method ="POST",
            body = json.dumps(body),
            headers = {
                'accept':' */*',
                'accept-encoding': 'gzip, deflate, br',
                'accept-language': 'en-US,en;q=0.9,bn;q=0.8,es;q=0.7,ar;q=0.6',
                'apollographql-client-name': 'relax',
                'apollographql-client-version': 'master-1.1.813',
                'content-length': '3452',
                'content-type': 'application/json',
                'origin': 'https://www.priceline.com',
                'referer': 'https://www.priceline.com/relax/at/478502/from/20220523/to/20220527/rooms/1/adults/2?vrid=c97c644f8dd3411a3a8337ad364d86bc',
                'sec-ch-ua-mobile': '?0',
                'sec-fetch-dest': 'empty',
                'sec-fetch-mode': 'cors',
                'sec-fetch-site': 'same-origin',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36'
                }) 
        
    def parse(self, response):
        """Yield one item per room listed in the hotel-details JSON response.

        Args:
            response: object whose ``body`` holds the JSON document returned
                by the API.

        Yields:
            dict: ``{'roomDisplayName': <name>}`` for each entry under
            ``data -> details -> hotel -> transformedRooms``.
        """
        document = json.loads(response.body)
        rooms = document['data']['details']['hotel']['transformedRooms']
        yield from (
            {'roomDisplayName': room['roomDisplayName']} for room in rooms
        )
                

Output:

{'roomDisplayName': 'Standard Room'}
2021-10-28 01:40:26 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.priceline.com/pws/v0/pcln-graph/>
{'roomDisplayName': 'Standard Room'}
2021-10-28 01:40:26 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.priceline.com/pws/v0/pcln-graph/>    
{'roomDisplayName': 'Standard  Room with River View'}
2021-10-28 01:40:26 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.priceline.com/pws/v0/pcln-graph/>    
{'roomDisplayName': 'Superior Room with Cathedral View'}
2021-10-28 01:40:26 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.priceline.com/pws/v0/pcln-graph/>    
{'roomDisplayName': 'Family Room'}
  • Related