Home > Blockchain >  Python BeautifulSoup web-scraping Tripadvisor view a review
Python BeautifulSoup web-scraping Tripadvisor view a review

Time:11-13

I am new to web scraping and am trying to view the list of reviews for a particular hotel. As a first step I am trying to extract a single review by selecting its CSS class, but I get no output at all; even when I try to print the status code of the request, nothing is shown. I believe my code is taking a really long time to run.

Does web scraping take time to run, or is there a problem with my code?

import requests
from bs4 import BeautifulSoup

# Headers intended for the page request below.
# NOTE(review): the Access-Control-* keys are CORS headers that servers put on
# responses; sending them on a request is not expected to help - confirm and drop.
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }

url = "https://www.tripadvisor.ca/Hotel_Review-g154913-d1587398-Reviews-Le_Germain_Hotel_Calgary-Calgary_Alberta.html"
# NOTE(review): `headers` is passed positionally here; per the requests API the
# second positional argument of requests.get() is `params`, so this dict is not
# sent as HTTP headers. No timeout is set either, which may be why the script
# appears to hang - confirm.
req = requests.get(url, headers)

print (req.status_code)
soup = BeautifulSoup(req.content, 'html.parser')

# NOTE(review): find_all() returns a collection of matches rather than a single
# element, so calling .get_text() on its result fails; iterate over the matches.
review = soup.find_all(class_="XllAv H4 _a").get_text()
print(review)

CodePudding user response:

I changed a few header keys and some request parameters. I also got an error on .get_text(), so I replaced it with a loop over the matched elements.

import requests
from bs4 import BeautifulSoup

# Browser-like request headers. The CORS Access-Control-* entries were dropped:
# they are response headers and have no effect on an outgoing request.
headers = {
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'en,mr;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
}

url = "https://www.tripadvisor.ca/Hotel_Review-g154913-d1587398-Reviews-Le_Germain_Hotel_Calgary-Calgary_Alberta.html"
# Headers must be passed by keyword (the second positional argument of
# requests.get() is `params`). The timeout makes a stalled connection fail
# fast instead of hanging. TLS certificate verification stays at its default
# (enabled): verify=False is not needed for this public HTTPS site and would
# accept responses from an impersonating server.
req = requests.get(url, headers=headers, timeout=5)
print(req.status_code)
soup = BeautifulSoup(req.content, 'html.parser')

# find_all() returns a list of matching elements, so print each element's text
# individually. Searching from the document root avoids an AttributeError when
# the response has no <body> element.
for review in soup.find_all(class_="XllAv H4 _a"):
    print(review.text)

CodePudding user response:

Actually, the data you want is loaded from the JSON response of an API call made with a POST request. Here is a working example as a solution.

Code:

import requests

# Payload for the batched GraphQL endpoint: a list with one entry per
# operation, each holding the GraphQL document ("query") and its "variables".
#   body[0] - ReviewListQuery: the paged review list for one location.
#   body[1] - NearbyQuery: nearby places and contact data for the same location.
# The long query documents are written as adjacent string literals inside
# parentheses, which Python concatenates into a single string; a plain
# double-quoted literal cannot continue across physical lines.
body = [
    {
        "query": (
            "query ReviewListQuery($locationId: Int!, $offset: Int, $limit: Int, $filters: [FilterConditionInput!], $prefs: ReviewListPrefsInput, $initialPrefs: ReviewListPrefsInput, $filterCacheKey: String, $prefsCacheKey: String, $keywordVariant: String!, $needKeywords: Boolean = true) {\n  cachedFilters: personalCache(key: $filterCacheKey)\n  cachedPrefs: personalCache(key: $prefsCacheKey)\n  locations(locationIds: [$locationId]) {\n    locationId\n    parentGeoId\n    name\n    placeType\n    reviewSummary {\n      rating\n      count\n    }\n    keywords(variant: $keywordVariant) @include(if: $needKeywords) {\n      keywords {\n        keyword\n      }\n    }\n    ...GoogleTranslateButton_LOCATION\n    ...GoogleTranslateResponseButton_LOCATION\n    ...OwnerResponse_LOCATION\n    ...SocialBar_LOCATION\n    ...ReviewListView_LOCATION\n    reviewListPage(page: {offset: $offset, limit: $limit}, filters: $filters, prefs: $prefs, initialPrefs: $initialPrefs, filterCacheKey: $filterCacheKey, prefsCacheKey: $prefsCacheKey) {\n      totalCount\n      preferredReviewIds\n      reviews {\n        ...LocationReviewHeader_REVIEW\n        ...RatingLine_HOTELS_REVIEW\n        ...ReviewTitle_REVIEW\n        ...GoogleTranslateButton_REVIEW\n        ...GoogleTranslateResponseButton_REVIEW\n        ...InlineRoomTip_REVIEW\n        ...EventDate_REVIEW\n        ...AdditionalRatings_REVIEW\n        ...TripType_REVIEW\n        ...OwnerResponse_REVIEW\n        ...InlineReviewPhotos_REVIEW\n        ...ResponseDisclaimerFoot_REVIEW\n        ...DisclaimerFoot_REVIEW\n        ...AttributionFooter_REVIEW\n        ...SocialBar_REVIEW\n        ...ExpandableReview_REVIEW\n        ...RateMTFooter_REVIEW\n        ...SingleReview_REVIEW\n      }\n    }\n    reviewAggregations {\n      ratingCounts\n      languageCounts\n      alertStatusCount\n    }\n  }\n}\n\nfragment LocationReviewHeader_REVIEW on Review {\n  id\n  url\n  location {\n    locationId\n    name\n  }\n  createdDate\n  "
            "publishedDate\n  provider {\n    isLocalProvider\n  }\n  userProfile {\n    id\n    userId: id\n    isMe\n    isVerified\n    displayName\n    username\n    avatar {\n      id\n      photoSizes {\n        url\n        width\n        height\n      }\n    }\n    hometown {\n      locationId\n      fallbackString\n      location {\n        locationId\n        additionalNames {\n          long\n        }\n        name\n      }\n    }\n    contributionCounts {\n      sumAllUgc\n      helpfulVote\n    }\n    route {\n      url\n    }\n  }\n}\n\nfragment RatingLine_HOTELS_REVIEW on Review {\n  rating\n  publishedDate\n  publishPlatform\n  location {\n    placeType\n  }\n}\n\nfragment ReviewTitle_REVIEW on Review {\n  title\n  language\n  url\n}\n\nfragment GoogleTranslateButton_REVIEW on Review {\n  language\n  translationType\n}\n\nfragment GoogleTranslateButton_LOCATION on LocationInformation {\n  parentGeoId\n}\n\nfragment GoogleTranslateResponseButton_REVIEW on Review {\n  language\n  translationType\n  mgmtResponse {\n    id\n    language\n    translationType\n  }\n}\n\nfragment GoogleTranslateResponseButton_LOCATION on LocationInformation {\n  parentGeoId\n}\n\nfragment InlineRoomTip_REVIEW on Review {\n  roomTip\n}\n\nfragment EventDate_REVIEW on Review {\n  tripInfo {\n    stayDate\n  }\n  location {\n    placeType\n  }\n}\n\nfragment AdditionalRatings_REVIEW on Review {\n  additionalRatings {\n    rating\n    ratingLabel\n  }\n}\n\nfragment TripType_REVIEW on Review {\n  tripInfo {\n    tripType\n  }\n}\n\nfragment OwnerResponse_LOCATION on LocationInformation {\n  name\n  currentUserOwnerStatus {\n    isValid\n  }\n}\n\nfragment OwnerResponse_REVIEW on Review {\n  text\n  publishedDate\n  username\n  connectionToSubject\n  language\n  mgmtResponse {\n    id\n    text\n    language\n    publishedDate\n    username\n    connectionToSubject\n  }\n}\n\nfragment InlineReviewPhotos_REVIEW on Review {\n  id\n  locationId\n  title\n  text\n  rating\n  absoluteUrl\n  "
            "mcid\n  translationType\n  mtProviderId\n  photos {\n    id\n    statuses\n    photoSizes {\n      url\n      width\n      height\n    }\n  }\n  userProfile {\n    id\n    displayName\n    username\n  }\n}\n\nfragment ResponseDisclaimerFoot_REVIEW on Review {\n  mgmtResponse {\n    id\n  }\n  provider {\n    isLocalProvider\n  }\n}\n\nfragment DisclaimerFoot_REVIEW on Review {\n  translationType\n  location {\n    locationId\n    parentGeoId\n  }\n  provider {\n    isLocalProvider\n    isToolsProvider\n  }\n  original {\n    id\n    url\n    locationId\n    userId\n    language\n    submissionDomain\n  }\n}\n\nfragment AttributionFooter_REVIEW on Review {\n  locationId\n  mcid\n  attribution\n}\n\nfragment SocialBar_REVIEW on Review {\n  __typename\n  locationId\n  helpfulVotes\n  photoIds\n  route {\n    url\n  }\n  socialStatistics {\n    followCount\n    isFollowing\n    isLiked\n    isReposted\n    isSaved\n    likeCount\n    repostCount\n    tripCount\n  }\n  status\n  userId\n  userProfile {\n    id\n    displayName\n    isFollowing\n  }\n  location {\n    __typename\n    locationId\n    additionalNames {\n      normal\n      long\n      longOnlyParent\n      longParentAbbreviated\n      longOnlyParentAbbreviated\n      longParentStateAbbreviated\n      longOnlyParentStateAbbreviated\n      geo\n      abbreviated\n      abbreviatedRaw\n      abbreviatedStateTerritory\n      abbreviatedStateTerritoryRaw\n    }\n    parent {\n      locationId\n      additionalNames {\n        normal\n        long\n        longOnlyParent\n        longParentAbbreviated\n        longOnlyParentAbbreviated\n        longParentStateAbbreviated\n        longOnlyParentStateAbbreviated\n        geo\n        abbreviated\n        abbreviatedRaw\n        abbreviatedStateTerritory\n        abbreviatedStateTerritoryRaw\n      }\n    }\n  }\n}\n\nfragment SocialBar_LOCATION on LocationInformation {\n  locationId\n  currentUserOwnerStatus {\n    isValid\n  }\n}\n\nfragment "
            "ExpandableReview_REVIEW on Review {\n  text\n  language\n}\n\nfragment RateMTFooter_REVIEW on Review {\n  locationId\n  absoluteUrl\n  mcid\n  translationType\n  mtProviderId\n  originalLanguage\n  rating\n}\n\nfragment SingleReview_REVIEW on Review {\n  id\n  locationId\n  title\n  labels\n  rating\n  absoluteUrl\n  mcid\n  translationType\n  mtProviderId\n  alertStatus\n}\n\nfragment ReviewListView_LOCATION on LocationInformation {\n  locationId\n  parentGeoId\n  accommodationCategory\n  currentUserOwnerStatus {\n    isValid\n  }\n  url\n}\n"
        ),
        "variables": {
            "locationId": 1587398,
            "offset": 10,
            "filters": [
                {
                    "axis": "LANGUAGE",
                    "selections": [
                        "en"
                    ]
                }
            ],
            "prefs": None,
            "initialPrefs": {},
            "limit": 5,
            "filterCacheKey": "locationReviewFilters_1587398",
            "prefsCacheKey": "locationReviewPrefs",
            "needKeywords": False,
            "keywordVariant": "location_keywords_v2_llr_order_30_en"
        }
    },
    {
        "query": (
            "query NearbyQuery($locationId: Int!, $geoId: Int!, $useGQForAirports: Boolean!, $deviceType: BaAggregation_DeviceType!, $commerceCountryId: Int!, $servletName: String!, $trafficSource: BaAggregation_TrafficSource, $hotelTravelInfo: BaAggregation_HotelTravelInfoInput, $distanceUnit: UnitLengthInput!, $locale: String!, $currency: String!, $getRentalCarPricing: Boolean!) {\n  currentLocation: locations(locationIds: [$locationId]) {\n    name\n    locationId\n    placeType\n    latitude\n    longitude\n    localLanguage\n    localizedNames {\n      locale\n      value\n    }\n    localizedStreetAddresses {\n      locale\n      fullAddress\n    }\n    streetAddress {\n      fullAddress\n    }\n    parent {\n      nearbyAttractionsUrl: attractionOverviewURL\n    }\n    nearbyHotelsUrl: locationsNearUrl(type: HOTEL)\n    nearbyRestaurantsUrl: locationsNearUrl(type: RESTAURANT)\n    businessAdvantageData(deviceType: $deviceType, commerceCountryId: $commerceCountryId, servletName: $servletName, trafficSource: $trafficSource, hotelTravelInfo: $hotelTravelInfo) {\n      ...ContactBaFields\n    }\n    neighborhoods {\n      id\n      name\n      description\n    }\n    nearbyTransit: nearby(radius: 1.0, page: {limit: 2, offset: 0}, locationFilter: {placeType: METRO_STATION}) {\n      locationId\n      locationDescription\n      name\n      distanceFromCenter\n    }\n    detail {\n      ...NearbyWithPreferredPOIs\n    }\n    nearbyAirports: nearestAirports(radius: 50.0, limit: 2) @include(if: $useGQForAirports) {\n      distanceFromCenter\n      airportInfo {\n        parent {\n          flightsUrl\n        }\n        locationName\n        airportLocationId\n        locationId\n      }\n    }\n  }\n  rentalCarPricing: RentalCarsLocation_summaryContent(request: {locationId: $geoId, pointOfSale: $locale, currency: $currency}) @include(if: $getRentalCarPricing) {\n    cheapestRate {\n      dailyRate\n    }\n  }\n  parentGeo: locations(locationIds: [$geoId]) {\n   "
            " name\n    rentalCarsUrl\n  }\n  hotels(locationIds: [$locationId]) {\n    walkScore\n  }\n}\n\nfragment ContactBaFields on BaAggregation_LocationProductData {\n  contactLinks {\n    contactLinkType\n    displayPhone\n    rawPhone\n    emailParts\n    clickTrackingUrl @encode\n    column\n  }\n}\n\nfragment StandardLocationInfo on LocationInformation {\n  locationId\n  name\n  placeType\n  latitude\n  longitude\n  url\n  reviewSummary {\n    rating\n    count\n  }\n}\n\nfragment LocationDetail on LocationInformation {\n  detail {\n    ... on Restaurant {\n      cuisines: tags(tagCategoryTypes: [CUISINES]) {\n        tagId\n        tagNameLocalized\n      }\n    }\n    ... on Attraction {\n      category: tags(tagCategoryTypes: [ATTRACTIONS_L2_CATEGORY]) {\n        tagId\n        tagNameLocalized\n      }\n      type: tags(tagCategoryTypes: [ATTRACTIONS_L3_TYPE]) {\n        tagId\n        tagNameLocalized\n      }\n    }\n  }\n}\n\nfragment NearbyLocationInfo on NearbyLocationInformation {\n  distanceFromCenter\n  locationId\n  placeType\n  location {\n    ...StandardLocationInfo\n    ...LocationDetail\n  }\n}\n\nfragment NearbyWithPreferredPOIs on Hotel {\n  nearbyWithPreferredPOIs(distanceUnit: $distanceUnit) {\n    attractionCount\n    distanceRange\n    mapDistanceRange\n    distanceUnit\n    hotelCount\n    restaurantCount\n    nearbyLocationList {\n      ...NearbyLocationInfo\n    }\n    preferredPOIList {\n      eateries {\n        ...NearbyLocationInfo\n      }\n      attractions {\n        ...NearbyLocationInfo\n      }\n    }\n    topPOI {\n      ...NearbyLocationInfo\n    }\n  }\n}\n"
        ),
        "variables": {
            "locationId": 1587398,
            "geoId": 154913,
            "useGQForAirports": False,
            "deviceType": "DESKTOP",
            "commerceCountryId": 293935,
            "servletName": "Hotel_Review",
            "trafficSource": "ba",
            "hotelTravelInfo": {
                "adultCount": 2,
                "checkInDate": "2022-04-30",
                "checkOutDate": "2022-05-01",
                "childrenCount": 0,
                "roomCount": 1,
                "usedDefaultDates": True
            },
            "distanceUnit": "KILOMETERS",
            "locale": "en_CA",
            "currency": "BDT",
            "getRentalCarPricing": False
        }
    }
]
# Minimal headers for the JSON POST: a JSON content type plus a browser-like
# user agent.
headers = {
    "content-type": "application/json",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
}

url = "https://www.tripadvisor.ca/data/graphql/batched"

# `json=body` lets requests serialize the payload itself. The earlier form,
# data=json.dumps(body), raised NameError because `json` was never imported.
# The timeout keeps a stalled connection from hanging the script.
r = requests.post(url, json=body, headers=headers, timeout=10)
# Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
r.raise_for_status()
# Index 0 of the reply is read here, matching the first operation in `body`
# (ReviewListQuery), whose selection set contains locations -> reviewListPage
# -> reviews.
response = r.json()[0]['data']['locations'][0]['reviewListPage']['reviews']

# Print the title of every review on the requested page.
for resp in response:
    print(resp['title'])

Output:

Amazing hotel
Great staycation!
Not a pleasurable experience !
Stay here
One night in paradise
  • Related