Find span element based on text written inside li Bs4 scraping-CodePudding

I want to look for a given text inside the <li> elements. If the text exists, I want to scrape the text of the matching <span>; if it does not exist, I want to raise an exception. For example:

if 'Floor' found then scrape the span

This is my code. It works perfectly, but it scrapes everything without any condition:

import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

# Set base url & headers :
baseurl = 'https://aqarmap.com.eg'
# The HTTP header name is spelled with a hyphen. A 'User_Agent' key would be
# sent as an unrelated header and the browser string would never replace the
# default User-Agent that requests sends.
headers = {
    'User-Agent' :
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
    }
# Download the listing page and parse it with the lxml parser.
test_link = 'https://aqarmap.com.eg/en/listing/3138984-for-rent-cairo-new-cairo-el-narges-el-narges-omarat'
r = requests.get(test_link , headers=headers)
soup = bs(r.content,'lxml')


def _hop(tag, count):
    """Return the tag reached by calling find_next('span') `count` times."""
    for _ in range(count):
        tag = tag.find_next('span')
    return tag


def _flat(tag):
    """Return the tag's text with every newline character removed."""
    return tag.text.replace('\n','')


title = _flat(soup.find('h1'))

# Location fields are read positionally, counting spans after the first
# span that carries property="name".
name_span = soup.find('span', {'property':'name'})
loc = _flat(_hop(name_span, 5))
sub_loc = _flat(_hop(name_span, 6))

# Property details are read positionally, counting spans after the first
# "badge badge-default" span.
badge = soup.find('span' , class_='badge badge-default')
floor = _flat(badge)
room = _flat(_hop(badge, 1))
# NOTE(review): same lookup as `room` (one hop) - possibly a copy-paste
# slip; confirm against the page markup before changing.
baths = _flat(_hop(badge, 1))
finish = _flat(_hop(badge, 3))
view = _flat(_hop(badge, 4))
area = _flat(_hop(badge, 5))
date = _flat(_hop(badge, 7))

price = soup.find('div' , class_='listing-price-content').find_next('span').text
print(title,loc,sub_loc,floor,room,baths,finish,view,area,date,price)

CodePudding user response：

In general, it is a good idea to check that the tag you are looking for exists before accessing its .text attribute:

# Look the heading up once; fall back to None when the page has no <h1>.
heading = soup.find('h1')
title = heading.text.strip() if heading else None

To select tag by text and check if it exists, you can go with css selectors and -soup-contains():

# Pick the <span> inside the <li> (under ul.list-group) whose text contains
# "Floor"; fall back to None when nothing matches.
tag = soup.select_one('ul.list-group li:-soup-contains("Floor") span')
floor = tag.text.strip() if tag else None

The above works well for some tags, but to make it generic and get rid of these confusing positional selections, I would suggest the following: store each listing's information in a dict and collect these in a list of dicts. That way you are safe if you create a DataFrame from it and a property is missing - pandas will automatically fill the gap with NaN.

# Collect the listing's fields in one dict.
data = {}
data['title'] = soup.find('h1').text.strip()

# Walk forward from the first span carrying property="name": the fifth
# following span is read as `loc`, the one right after it as `sub_loc`.
loc_span = soup.find('span', {'property':'name'})
for _ in range(5):
    loc_span = loc_span.find_next('span')
data['loc'] = loc_span.text.replace('\n','')
data['sub_loc'] = loc_span.find_next('span').text.replace('\n','')

# Each <li> is expected to yield exactly two strings (label, value), which
# dict() turns into one key/value pair.
data.update(dict(li.stripped_strings for li in soup.select('ul.list-group li')))

Benefits - You can make adjustments easily, filter if you like, and export the results in a structured way.

Example

import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
    
baseurl = 'https://aqarmap.com.eg'
# The HTTP header name is spelled with a hyphen. A 'User_Agent' key would be
# sent as an unrelated header and the browser string would never replace the
# default User-Agent that requests sends.
headers = {
    'User-Agent' :
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
    }

# Accumulates one dict per scraped listing.
data = []

def _nth_next_span(tag, hops):
    """Follow find_next('span') `hops` times; return None if the chain ends early."""
    for _ in range(hops):
        if tag is None:
            return None
        tag = tag.find_next('span')
    return tag


def scrape(test_link):
    """Download one listing page and return its details as a flat dict.

    The result always contains the keys ``title``, ``loc`` and ``sub_loc``;
    each is ``None`` when the corresponding tag is not found on the page.
    Every ``ul.list-group li`` row that consists of exactly two strings
    (label, value) adds one more ``label: value`` entry.

    Args:
        test_link: URL of the listing page to fetch.

    Returns:
        dict mapping field names to the scraped strings (or ``None``).
    """
    r = requests.get(test_link , headers=headers)
    soup = bs(r.content,'lxml')

    heading = soup.find('h1')
    # Location fields are positional: the 5th and 6th spans after the first
    # span carrying property="name".
    loc_tag = _nth_next_span(soup.find('span', {'property':'name'}), 5)
    sub_loc_tag = _nth_next_span(loc_tag, 1)

    data = {}
    # Missing tags become None so one incomplete page cannot abort a whole
    # run; pandas renders them as missing values.
    data['title'] = heading.text.strip() if heading is not None else None
    data['loc'] = loc_tag.text.replace('\n','') if loc_tag is not None else None
    data['sub_loc'] = sub_loc_tag.text.replace('\n','') if sub_loc_tag is not None else None

    for li in soup.select('ul.list-group li'):
        parts = list(li.stripped_strings)
        # dict() on a row without exactly two strings raises ValueError, so
        # only well-formed label/value rows are kept.
        if len(parts) == 2:
            label, value = parts
            data[label] = value
    return data

# Listing pages to scrape.
urlList = [
    'https://aqarmap.com.eg/en/listing/3138984-for-rent-cairo-new-cairo-el-narges-el-narges-omarat',
    'https://aqarmap.com.eg/en/listing/3124476-for-rent-cairo-new-cairo-el-narges-el-narges-omarat?source=related-listing-source',
]

# Scrape the pages in order, adding one dict per listing to `data`.
data.extend(scrape(url) for url in urlList)

# Keys that are absent from a listing's dict show up as NaN in the frame.
pd.DataFrame(data)

Output

title	loc	sub_loc	Floor	Room	Baths	Finish Type	Size (in meters)	Listing ID	Publish Date	Price	Seller Role	Payment Method	Price Per Meter	View
Furnished Apartment For rent in El Narges Omarat	El Narges	El Narges Omarat	3	3	2	SUPER_LUX	180 M²	EG-3138984	09/01/2022	19,000 EGP	Agent	Cash	106 EGP/M²	nan
Furnished Apartment For rent in El Narges Omarat	El Narges	El Narges Omarat	2	2	2	SUPER_LUX	180 M²	EG-3124476	30/12/2021	19,000 EGP	Agent	Cash	106 EGP/M²	Garden