I am new to web scraping and am trying to retrieve the list of reviews for a particular hotel. As a first step, I am trying to extract a single review by selecting its CSS class, but I am not getting any output; even when I try to print the status code of the request, nothing is shown. I believe my code is taking a really long time to run.
Does web scraping normally take this long to run, or is there a problem with my code?
import requests
from bs4 import BeautifulSoup
# Script from the question: fetch a TripAdvisor hotel page and try to print the
# text of the elements carrying the class string "XllAv H4 _a".
# NOTE(review): the Access-Control-* entries below are CORS *response* headers;
# putting them on an outgoing request presumably has no effect — confirm.
headers = {
'Access-Control-Allow-Origin': '*',
'Access-Control-Allow-Methods': 'GET',
'Access-Control-Allow-Headers': 'Content-Type',
'Access-Control-Max-Age': '3600',
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
}
url = "https://www.tripadvisor.ca/Hotel_Review-g154913-d1587398-Reviews-Le_Germain_Hotel_Calgary-Calgary_Alberta.html"
# NOTE(review): the second positional parameter of requests.get() is `params`,
# so this dict is sent as URL query parameters, not as HTTP headers (the
# keyword form is headers=headers). No timeout is passed either, so the call
# can block for as long as the server keeps the connection open.
req = requests.get(url, headers)
print (req.status_code)
soup = BeautifulSoup(req.content, 'html.parser')
# NOTE(review): find_all() returns a ResultSet (a list of tags), which has no
# .get_text() method, so this line raises AttributeError once the request
# above completes.
review = soup.find_all(class_="XllAv H4 _a").get_text()
print(review)
CodePudding user response:
I changed a few of the header keys and some of the `requests` parameters.
I also got an error on `.get_text()`, so I replaced it with a loop that prints the text of each matching element.
import requests
from bs4 import BeautifulSoup
# Fetch the TripAdvisor hotel page and print the text of every element whose
# class attribute is exactly "XllAv H4 _a".

# Browser-like request headers. The Access-Control-* keys were dropped: they
# are CORS response headers set by servers and carry no meaning on a request.
headers = {
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'en,mr;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
}
url = "https://www.tripadvisor.ca/Hotel_Review-g154913-d1587398-Reviews-Le_Germain_Hotel_Calgary-Calgary_Alberta.html"
# headers must be passed by keyword (the second positional argument of
# requests.get is `params`). The timeout keeps the script from hanging when
# the site stalls. TLS certificate verification stays enabled (the requests
# default); verify=False would silently accept a forged certificate.
req = requests.get(url, headers=headers, timeout=5)
print(req.status_code)
soup = BeautifulSoup(req.content, 'html.parser')
# find_all() returns a ResultSet (a list of tags) that has no .get_text(), so
# each matching tag is printed individually. Searching from `soup` instead of
# `soup.body` avoids an AttributeError when the response has no <body>.
for tag in soup.find_all(class_="XllAv H4 _a"):
    print(tag.text)
CodePudding user response:
Actually, the data you want is generated from the JSON response of an API call made with a POST request. Here is a working example as a solution.
Code:
import requests
# Payload for the batched GraphQL POST request: a list of two operations, each
# a dict holding the GraphQL "query" text and the "variables" it is run with.
body = [
{
# Operation 1: ReviewListQuery — selects the location's review summary and a
# page of its reviews (title, text, rating, author profile, ...).
"query": "query ReviewListQuery($locationId: Int!, $offset: Int, $limit: Int, $filters: [FilterConditionInput!], $prefs: ReviewListPrefsInput, $initialPrefs: ReviewListPrefsInput, $filterCacheKey: String, $prefsCacheKey: String, $keywordVariant: String!, $needKeywords: Boolean = true) {\n cachedFilters: personalCache(key: $filterCacheKey)\n cachedPrefs: personalCache(key: $prefsCacheKey)\n locations(locationIds: [$locationId]) {\n locationId\n parentGeoId\n name\n placeType\n reviewSummary {\n rating\n count\n }\n keywords(variant: $keywordVariant) @include(if: $needKeywords) {\n keywords {\n keyword\n }\n }\n ...GoogleTranslateButton_LOCATION\n ...GoogleTranslateResponseButton_LOCATION\n ...OwnerResponse_LOCATION\n ...SocialBar_LOCATION\n ...ReviewListView_LOCATION\n reviewListPage(page: {offset: $offset, limit: $limit}, filters: $filters, prefs: $prefs, initialPrefs: $initialPrefs, filterCacheKey: $filterCacheKey, prefsCacheKey: $prefsCacheKey) {\n totalCount\n preferredReviewIds\n reviews {\n ...LocationReviewHeader_REVIEW\n ...RatingLine_HOTELS_REVIEW\n ...ReviewTitle_REVIEW\n ...GoogleTranslateButton_REVIEW\n ...GoogleTranslateResponseButton_REVIEW\n ...InlineRoomTip_REVIEW\n ...EventDate_REVIEW\n ...AdditionalRatings_REVIEW\n ...TripType_REVIEW\n ...OwnerResponse_REVIEW\n ...InlineReviewPhotos_REVIEW\n ...ResponseDisclaimerFoot_REVIEW\n ...DisclaimerFoot_REVIEW\n ...AttributionFooter_REVIEW\n ...SocialBar_REVIEW\n ...ExpandableReview_REVIEW\n ...RateMTFooter_REVIEW\n ...SingleReview_REVIEW\n }\n }\n reviewAggregations {\n ratingCounts\n languageCounts\n alertStatusCount\n }\n }\n}\n\nfragment LocationReviewHeader_REVIEW on Review {\n id\n url\n location {\n locationId\n name\n }\n createdDate\n publishedDate\n provider {\n isLocalProvider\n }\n userProfile {\n id\n userId: id\n isMe\n isVerified\n displayName\n username\n avatar {\n id\n photoSizes {\n url\n width\n height\n }\n }\n hometown {\n locationId\n fallbackString\n location {\n locationId\n 
additionalNames {\n long\n }\n name\n }\n }\n contributionCounts {\n sumAllUgc\n helpfulVote\n }\n route {\n url\n }\n }\n}\n\nfragment RatingLine_HOTELS_REVIEW on Review {\n rating\n publishedDate\n publishPlatform\n location {\n placeType\n }\n}\n\nfragment ReviewTitle_REVIEW on Review {\n title\n language\n url\n}\n\nfragment GoogleTranslateButton_REVIEW on Review {\n language\n translationType\n}\n\nfragment GoogleTranslateButton_LOCATION on LocationInformation {\n parentGeoId\n}\n\nfragment GoogleTranslateResponseButton_REVIEW on Review {\n language\n translationType\n mgmtResponse {\n id\n language\n translationType\n }\n}\n\nfragment GoogleTranslateResponseButton_LOCATION on LocationInformation {\n parentGeoId\n}\n\nfragment InlineRoomTip_REVIEW on Review {\n roomTip\n}\n\nfragment EventDate_REVIEW on Review {\n tripInfo {\n stayDate\n }\n location {\n placeType\n }\n}\n\nfragment AdditionalRatings_REVIEW on Review {\n additionalRatings {\n rating\n ratingLabel\n }\n}\n\nfragment TripType_REVIEW on Review {\n tripInfo {\n tripType\n }\n}\n\nfragment OwnerResponse_LOCATION on LocationInformation {\n name\n currentUserOwnerStatus {\n isValid\n }\n}\n\nfragment OwnerResponse_REVIEW on Review {\n text\n publishedDate\n username\n connectionToSubject\n language\n mgmtResponse {\n id\n text\n language\n publishedDate\n username\n connectionToSubject\n }\n}\n\nfragment InlineReviewPhotos_REVIEW on Review {\n id\n locationId\n title\n text\n rating\n absoluteUrl\n mcid\n translationType\n mtProviderId\n photos {\n id\n statuses\n photoSizes {\n url\n width\n height\n }\n }\n userProfile {\n id\n displayName\n username\n }\n}\n\nfragment ResponseDisclaimerFoot_REVIEW on Review {\n mgmtResponse {\n id\n }\n provider {\n isLocalProvider\n }\n}\n\nfragment DisclaimerFoot_REVIEW on Review {\n translationType\n location {\n locationId\n parentGeoId\n }\n provider {\n isLocalProvider\n isToolsProvider\n }\n original {\n id\n url\n locationId\n userId\n language\n 
submissionDomain\n }\n}\n\nfragment AttributionFooter_REVIEW on Review {\n locationId\n mcid\n attribution\n}\n\nfragment SocialBar_REVIEW on Review {\n __typename\n locationId\n helpfulVotes\n photoIds\n route {\n url\n }\n socialStatistics {\n followCount\n isFollowing\n isLiked\n isReposted\n isSaved\n likeCount\n repostCount\n tripCount\n }\n status\n userId\n userProfile {\n id\n displayName\n isFollowing\n }\n location {\n __typename\n locationId\n additionalNames {\n normal\n long\n longOnlyParent\n longParentAbbreviated\n longOnlyParentAbbreviated\n longParentStateAbbreviated\n longOnlyParentStateAbbreviated\n geo\n abbreviated\n abbreviatedRaw\n abbreviatedStateTerritory\n abbreviatedStateTerritoryRaw\n }\n parent {\n locationId\n additionalNames {\n normal\n long\n longOnlyParent\n longParentAbbreviated\n longOnlyParentAbbreviated\n longParentStateAbbreviated\n longOnlyParentStateAbbreviated\n geo\n abbreviated\n abbreviatedRaw\n abbreviatedStateTerritory\n abbreviatedStateTerritoryRaw\n }\n }\n }\n}\n\nfragment SocialBar_LOCATION on LocationInformation {\n locationId\n currentUserOwnerStatus {\n isValid\n }\n}\n\nfragment ExpandableReview_REVIEW on Review {\n text\n language\n}\n\nfragment RateMTFooter_REVIEW on Review {\n locationId\n absoluteUrl\n mcid\n translationType\n mtProviderId\n originalLanguage\n rating\n}\n\nfragment SingleReview_REVIEW on Review {\n id\n locationId\n title\n labels\n rating\n absoluteUrl\n mcid\n translationType\n mtProviderId\n alertStatus\n}\n\nfragment ReviewListView_LOCATION on LocationInformation {\n locationId\n parentGeoId\n accommodationCategory\n currentUserOwnerStatus {\n isValid\n }\n url\n}\n",
# "offset" and "limit" are fed to reviewListPage(page: {offset, limit}) in the
# query above; "filters" restricts the LANGUAGE axis to "en".
"variables": {
"locationId": 1587398,
"offset": 10,
"filters": [
{
"axis": "LANGUAGE",
"selections": [
"en"
]
}
],
"prefs":None,
"initialPrefs":{
},
"limit": 5,
"filterCacheKey": "locationReviewFilters_1587398",
"prefsCacheKey": "locationReviewPrefs",
"needKeywords": False,
"keywordVariant": "location_keywords_v2_llr_order_30_en"
}
},
{
# Operation 2: NearbyQuery — selects address, neighbourhood, nearby transit,
# airport and nearby-POI data for the same location.
# NOTE(review): only element [0] of the response is read later in this script,
# so this operation's result appears to be unused — confirm whether it is needed.
"query": "query NearbyQuery($locationId: Int!, $geoId: Int!, $useGQForAirports: Boolean!, $deviceType: BaAggregation_DeviceType!, $commerceCountryId: Int!, $servletName: String!, $trafficSource: BaAggregation_TrafficSource, $hotelTravelInfo: BaAggregation_HotelTravelInfoInput, $distanceUnit: UnitLengthInput!, $locale: String!, $currency: String!, $getRentalCarPricing: Boolean!) {\n currentLocation: locations(locationIds: [$locationId]) {\n name\n locationId\n placeType\n latitude\n longitude\n localLanguage\n localizedNames {\n locale\n value\n }\n localizedStreetAddresses {\n locale\n fullAddress\n }\n streetAddress {\n fullAddress\n }\n parent {\n nearbyAttractionsUrl: attractionOverviewURL\n }\n nearbyHotelsUrl: locationsNearUrl(type: HOTEL)\n nearbyRestaurantsUrl: locationsNearUrl(type: RESTAURANT)\n businessAdvantageData(deviceType: $deviceType, commerceCountryId: $commerceCountryId, servletName: $servletName, trafficSource: $trafficSource, hotelTravelInfo: $hotelTravelInfo) {\n ...ContactBaFields\n }\n neighborhoods {\n id\n name\n description\n }\n nearbyTransit: nearby(radius: 1.0, page: {limit: 2, offset: 0}, locationFilter: {placeType: METRO_STATION}) {\n locationId\n locationDescription\n name\n distanceFromCenter\n }\n detail {\n ...NearbyWithPreferredPOIs\n }\n nearbyAirports: nearestAirports(radius: 50.0, limit: 2) @include(if: $useGQForAirports) {\n distanceFromCenter\n airportInfo {\n parent {\n flightsUrl\n }\n locationName\n airportLocationId\n locationId\n }\n }\n }\n rentalCarPricing: RentalCarsLocation_summaryContent(request: {locationId: $geoId, pointOfSale: $locale, currency: $currency}) @include(if: $getRentalCarPricing) {\n cheapestRate {\n dailyRate\n }\n }\n parentGeo: locations(locationIds: [$geoId]) {\n name\n rentalCarsUrl\n }\n hotels(locationIds: [$locationId]) {\n walkScore\n }\n}\n\nfragment ContactBaFields on BaAggregation_LocationProductData {\n contactLinks {\n contactLinkType\n displayPhone\n rawPhone\n emailParts\n 
clickTrackingUrl @encode\n column\n }\n}\n\nfragment StandardLocationInfo on LocationInformation {\n locationId\n name\n placeType\n latitude\n longitude\n url\n reviewSummary {\n rating\n count\n }\n}\n\nfragment LocationDetail on LocationInformation {\n detail {\n ... on Restaurant {\n cuisines: tags(tagCategoryTypes: [CUISINES]) {\n tagId\n tagNameLocalized\n }\n }\n ... on Attraction {\n category: tags(tagCategoryTypes: [ATTRACTIONS_L2_CATEGORY]) {\n tagId\n tagNameLocalized\n }\n type: tags(tagCategoryTypes: [ATTRACTIONS_L3_TYPE]) {\n tagId\n tagNameLocalized\n }\n }\n }\n}\n\nfragment NearbyLocationInfo on NearbyLocationInformation {\n distanceFromCenter\n locationId\n placeType\n location {\n ...StandardLocationInfo\n ...LocationDetail\n }\n}\n\nfragment NearbyWithPreferredPOIs on Hotel {\n nearbyWithPreferredPOIs(distanceUnit: $distanceUnit) {\n attractionCount\n distanceRange\n mapDistanceRange\n distanceUnit\n hotelCount\n restaurantCount\n nearbyLocationList {\n ...NearbyLocationInfo\n }\n preferredPOIList {\n eateries {\n ...NearbyLocationInfo\n }\n attractions {\n ...NearbyLocationInfo\n }\n }\n topPOI {\n ...NearbyLocationInfo\n }\n }\n}\n",
"variables": {
"locationId": 1587398,
"geoId": 154913,
"useGQForAirports": False,
"deviceType": "DESKTOP",
"commerceCountryId": 293935,
"servletName": "Hotel_Review",
"trafficSource": "ba",
"hotelTravelInfo": {
"adultCount": 2,
"checkInDate": "2022-04-30",
"checkOutDate": "2022-05-01",
"childrenCount": 0,
"roomCount": 1,
"usedDefaultDates": True
},
"distanceUnit": "KILOMETERS",
"locale": "en_CA",
"currency": "BDT",
"getRentalCarPricing": False
}
}
]
# POST the batched GraphQL payload in `body` and print the title of every
# review returned for the first operation.
headers = {
    "content-type": "application/json",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36",
}
url = "https://www.tripadvisor.ca/data/graphql/batched"
# json=body lets requests serialise the payload itself. The previous
# data=json.dumps(body) raised NameError because `json` was never imported.
# The timeout keeps the script from hanging if the endpoint never answers.
r = requests.post(url, json=body, headers=headers, timeout=10)
# Fail with a clear HTTPError on a 4xx/5xx reply instead of an obscure
# JSON-decoding or KeyError further down.
r.raise_for_status()
# Element [0] of the reply is read here; presumably it corresponds to the
# first operation in `body` (the review list query) — confirm against the API.
response = r.json()[0]['data']['locations'][0]['reviewListPage']['reviews']
for resp in response:
    print(resp['title'])
Output:
Amazing hotel
Great staycation!
Not a pleasurable experience !
Stay here
One night in paradise