Home > Software engineering >  Python web scraping rightmove
Python web scraping rightmove

Time:06-27

I have a dataset called "data" that looks like this:

postcode    location_id url_name
NE30-2BG    1159502     https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=POSTCODE^1159502

I'm using the code below to take the url from the data above and retrieve property details from Rightmove. I want to be able to output the postcode (from data) alongside the output below. As the code stands, I'm unable to link the data retrieved from my RightmoveScraper to the original postcode. Any ideas appreciated!

class RightmoveScraper:
    """Scrape Rightmove search-result pages and collect property details.

    Listings accumulate in ``self.results`` (one dict per property card)
    and are exported by :meth:`to_csv`.
    """

    def __init__(self):
        # Per-instance list. The original used a class-level ``results = []``,
        # which is shared by every instance of the class — two scrapers would
        # silently append into the same list.
        self.results = []

    def fetch(self, url):
        """HTTP GET *url* and return the ``requests`` response object."""
        print('HTTP GET request to URL: %s' % url, end='')
        response = requests.get(url)
        print(' | Status code: %s' % response.status_code)

        return response

    def parse(self, html):
        """Parse one results page and append a dict per property card."""
        content = BeautifulSoup(html, 'html.parser')   # lxml also works

        titles = [title.text.strip() for title in content.findAll('h2', {'class': 'propertyCard-title'})]
        # Bedroom count is the leading token of the card title,
        # e.g. "2 bedroom flat for sale" -> "2". Reuse the stripped titles
        # instead of re-scanning the soup a second time.
        bedrooms = [title.split('bedroom')[0].strip() for title in titles]
        addresses = [address['content'] for address in content.findAll('meta', {'itemprop': 'streetAddress'})]
        descriptions = [description.text for description in content.findAll('span', {'data-test': 'property-description'})]
        prices = [price.text.strip() for price in content.findAll('div', {'class': 'propertyCard-priceValue'})]
        under_over = [underover.text.strip() for underover in content.findAll('div', {'class': 'propertyCard-priceQualifier'})]
        dates = [date.text for date in content.findAll('span', {'class': 'propertyCard-branchSummary-addedOrReduced'})]
        sellers = [seller.text.split('by')[-1].strip() for seller in content.findAll('span', {'class': 'propertyCard-branchSummary-branchName'})]

        for index in range(len(titles)):
            self.results.append({
                'title': titles[index],
                'no_of_bedrooms': bedrooms[index],
                'address': addresses[index],
                'description': descriptions[index],
                'price': prices[index],
                'under_over': under_over[index],
                'date': dates[index],
                'seller': sellers[index]})

    def to_csv(self):
        """Write all collected results to ``rightmove_data.csv``."""
        # Guard: the original indexed results[0] and crashed with IndexError
        # when a run produced no listings.
        if not self.results:
            print('No results to store')
            return

        # newline='' is required by the csv module to avoid blank rows on
        # Windows; utf-8 keeps '£' in prices intact.
        with open('rightmove_data.csv', 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())
            writer.writeheader()
            writer.writerows(self.results)

        print('Stored results to "rightmove_data.csv"')

    def run(self):
        """Fetch and parse every URL in ``data['url_name']``, then export."""
        # NOTE(review): ``data`` is a module-level table; the postcode is not
        # threaded through to parse() here — see the accepted answer below.
        for url in data['url_name']:
            response = self.fetch(url)
            self.parse(response.text)

        self.to_csv()
        
        
          
if __name__ == '__main__':
    # Entry point: build one scraper and run the full crawl + CSV export.
    bot = RightmoveScraper()
    bot.run()

CodePudding user response:

It seems you are using a DataFrame, so you could use .iterrows() like this:

import pandas as pd

# Minimal demo frame: three postcode/url rows.
data = dict(
    postcode=list('ABC'),
    url_name=['www1', 'www2', 'www3'],
    other=list('XYZ'),
)

df = pd.DataFrame(data)

def run():
    """Print every row of the module-level ``df``, one field per line."""
    for index, row in df.iterrows():
        # Fixed: the original line was missing its closing parenthesis,
        # which made the whole snippet a SyntaxError.
        print('index   :', index)
        print('postcode:', row['postcode'])
        print('url_name:', row['url_name'])
        print('other   :', row['other'])
        print('----')
        # In the real scraper these would become:
        #response = self.fetch(row['url_name'])
        #self.parse(response.text, row['postcode'])


run()

Result:

index   : 0
postcode: A
url_name: www1
other   : X
----
index   : 1
postcode: B
url_name: www2
other   : Y
----
index   : 2
postcode: C
url_name: www3
other   : Z

Or you could use .apply() to execute a function on every row.

import pandas as pd

def process(row):
    """Print the fields of one DataFrame row (scraper calls left commented)."""
    for label in ('postcode', 'url_name', 'other'):
        # ljust(8) reproduces the original column-aligned labels.
        print(label.ljust(8) + ':', row[label])
    print('----')
    # In the real scraper these would become:
    #response = self.fetch(row['url_name'])
    #self.parse(response.text, row['postcode'])
    
# Same three-row demo frame as above.
data = dict(
    postcode=list('ABC'),
    url_name=['www1', 'www2', 'www3'],
    other=list('XYZ'),
)

df = pd.DataFrame(data)

def run():
    """Apply ``process`` to every row of the module-level ``df``."""
    # axis="columns" is the named equivalent of axis=1 (row-wise apply).
    df.apply(process, axis="columns")


run()

CodePudding user response:

With thanks to furas, this works a treat! Thank you!

 import pandas as pd
 import requests
 from bs4 import BeautifulSoup
 import csv

# Load the postcode lookup table (columns: postcode, location_id, url_name
# — see the sample at the top of the question).
data = pd.read_csv('postcode data.csv')
# NOTE(review): read_csv already returns a DataFrame, so this wrap is a
# redundant shallow copy — kept as written in the original answer.
df = pd.DataFrame(data)

class RightmoveScraper:
    """Scrape Rightmove result pages, tagging each listing with its postcode.

    ``run()`` walks the module-level ``df`` (postcode + url_name columns),
    accumulates one dict per property card in ``self.results``, and exports
    everything via :meth:`to_csv`.
    """

    def __init__(self):
        # Per-instance list. The original used a class-level ``results = []``,
        # which every instance of the class would share.
        self.results = []

    def fetch(self, url):
        """HTTP GET *url* and return the ``requests`` response object."""
        print('HTTP GET request to URL: %s' % url, end='')

        response = requests.get(url)
        print(' | Status code: %s' % response.status_code)

        return response

    def parse(self, html, pp):
        """Parse one results page; *pp* is the postcode stored with each card."""
        content = BeautifulSoup(html, 'html.parser')   # lxml also works

        titles = [title.text.strip() for title in content.findAll('h2', {'class': 'propertyCard-title'})]
        # Bedroom count is the leading token of the card title,
        # e.g. "2 bedroom flat for sale" -> "2".
        bedrooms = [title.split('bedroom')[0].strip() for title in titles]
        addresses = [address['content'] for address in content.findAll('meta', {'itemprop': 'streetAddress'})]
        descriptions = [description.text for description in content.findAll('span', {'data-test': 'property-description'})]
        prices = [price.text.strip() for price in content.findAll('div', {'class': 'propertyCard-priceValue'})]
        under_over = [underover.text.strip() for underover in content.findAll('div', {'class': 'propertyCard-priceQualifier'})]
        dates = [date.text for date in content.findAll('span', {'class': 'propertyCard-branchSummary-addedOrReduced'})]
        sellers = [seller.text.split('by')[-1].strip() for seller in content.findAll('span', {'class': 'propertyCard-branchSummary-branchName'})]

        for index in range(len(titles)):
            self.results.append({
                'postcode': pp,
                'title': titles[index],
                'no_of_bedrooms': bedrooms[index],
                'address': addresses[index],
                'description': descriptions[index],
                'price': prices[index],
                'under_over': under_over[index],
                'date': dates[index],
                'seller': sellers[index]})

    def to_csv(self):
        """Write all collected results to ``output.csv``."""
        # Guard: the original indexed results[0] and crashed with IndexError
        # when no listings were collected.
        if not self.results:
            print('No results to store')
            return

        # newline='' is required by the csv module to avoid blank rows on
        # Windows; utf-8 keeps '£' in prices intact.
        with open('output.csv', 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())
            writer.writeheader()
            writer.writerows(self.results)

        print('Stored results to "output.csv"')

    def run(self):
        """Scrape every (postcode, url_name) row of the module-level ``df``."""
        for index, row in df.iterrows():
            response = self.fetch(row['url_name'])
            # Pass the row's postcode through so it lands in the output.
            self.parse(response.text, row['postcode'])

        self.to_csv()

        
        
          
if __name__ == '__main__':
    # Entry point: build one scraper and run the full crawl + CSV export.
    bot = RightmoveScraper()
    bot.run()
  • Related