I have a dataset called "data" that looks like this:
postcode   location_id   url_name
NE30-2BG   1159502       https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=POSTCODE^1159502
I'm using the code below to take the URL from the data above and retrieve property details from Rightmove. I want to output the postcode (from data) alongside the scraped results. As the code stands, I'm unable to link the data retrieved by my RightmoveScraper back to the original postcode. Any ideas appreciated!
import requests
import csv
from bs4 import BeautifulSoup

# "data" is the DataFrame shown above

class RightmoveScraper:
    results = []

    def fetch(self, url):
        print('HTTP GET request to URL: %s' % url, end='')
        response = requests.get(url)
        print(' | Status code: %s' % response.status_code)
        return response

    def parse(self, html):
        content = BeautifulSoup(html, 'html.parser')  # or 'lxml'
        titles = [title.text.strip() for title in content.findAll('h2', {'class': 'propertyCard-title'})]
        bedrooms = [title.text.split('bedroom')[0].strip() for title in content.findAll('h2', {'class': 'propertyCard-title'})]
        addresses = [address['content'] for address in content.findAll('meta', {'itemprop': 'streetAddress'})]
        descriptions = [description.text for description in content.findAll('span', {'data-test': 'property-description'})]
        prices = [price.text.strip() for price in content.findAll('div', {'class': 'propertyCard-priceValue'})]
        under_over = [underover.text.strip() for underover in content.findAll('div', {'class': 'propertyCard-priceQualifier'})]
        #code1 = [price.text.strip() for price in content.findAll('type', {'hidden': 'value'})]
        dates = [date.text for date in content.findAll('span', {'class': 'propertyCard-branchSummary-addedOrReduced'})]
        sellers = [seller.text.split('by')[-1].strip() for seller in content.findAll('span', {'class': 'propertyCard-branchSummary-branchName'})]
        for index in range(0, len(titles)):
            self.results.append({
                'title': titles[index],
                'no_of_bedrooms': bedrooms[index],
                'address': addresses[index],
                'description': descriptions[index],
                'price': prices[index],
                'under_over': under_over[index],
                #'code1': code1[index],
                'date': dates[index],
                'seller': sellers[index]})

    def to_csv(self):
        with open('rightmove_data.csv', 'w') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())
            writer.writeheader()
            for row in self.results:
                writer.writerow(row)
        print('Stored results to "rightmove_data.csv"')

    def run(self):
        for url_name in data['url_name']:
            #postcode = data['postcode']
            url = url_name
            response = self.fetch(url)
            self.parse(response.text)
        self.to_csv()

if __name__ == '__main__':
    scraper = RightmoveScraper()
    scraper.run()
CodePudding user response:
It seems you use a DataFrame, so you could use .iterrows() like this:
import pandas as pd

data = {
    'postcode': ['A', 'B', 'C'],
    'url_name': ['www1', 'www2', 'www3'],
    'other': ['X', 'Y', 'Z']
}

df = pd.DataFrame(data)

def run():
    for index, row in df.iterrows():
        print('index :', index)
        print('postcode:', row['postcode'])
        print('url_name:', row['url_name'])
        print('other :', row['other'])
        print('----')
        #response = self.fetch(row['url_name'])
        #self.parse(response.text, row['postcode'])

run()
Result:
index : 0
postcode: A
url_name: www1
other : X
----
index : 1
postcode: B
url_name: www2
other : Y
----
index : 2
postcode: C
url_name: www3
other : Z
----
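Inside your scraper the same loop slots straight into run(); parse() would then need an extra parameter to receive the postcode, matching the commented lines above. A sketch of just that method:

def run(self):
    for index, row in df.iterrows():
        response = self.fetch(row['url_name'])
        self.parse(response.text, row['postcode'])  # pass the postcode through
    self.to_csv()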
Or you could use .apply() to execute a function on all rows:
import pandas as pd

def process(row):
    print('postcode:', row['postcode'])
    print('url_name:', row['url_name'])
    print('other :', row['other'])
    print('----')
    #response = self.fetch(row['url_name'])
    #self.parse(response.text, row['postcode'])

data = {
    'postcode': ['A', 'B', 'C'],
    'url_name': ['www1', 'www2', 'www3'],
    'other': ['X', 'Y', 'Z']
}

df = pd.DataFrame(data)

def run():
    df.apply(process, axis=1)

run()
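Note that df.apply(process, axis=1) calls process once per row and collects the return values into a Series; since process here only prints and returns None, that result is simply discarded. For row-wise side effects like HTTP requests, the .iterrows() loop is usually the more natural fit, while .apply() shines when you want a new column or Series back.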
CodePudding user response:
With thanks to furas, this works a treat! Thank you!
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv

data = pd.read_csv('postcode data.csv')
df = pd.DataFrame(data)

class RightmoveScraper:
    results = []

    def fetch(self, url):
        print('HTTP GET request to URL: %s' % url, end='')
        response = requests.get(url)
        print(' | Status code: %s' % response.status_code)
        return response

    def parse(self, html, pp):
        content = BeautifulSoup(html, 'html.parser')  # or 'lxml'
        titles = [title.text.strip() for title in content.findAll('h2', {'class': 'propertyCard-title'})]
        bedrooms = [title.text.split('bedroom')[0].strip() for title in content.findAll('h2', {'class': 'propertyCard-title'})]
        addresses = [address['content'] for address in content.findAll('meta', {'itemprop': 'streetAddress'})]
        descriptions = [description.text for description in content.findAll('span', {'data-test': 'property-description'})]
        prices = [price.text.strip() for price in content.findAll('div', {'class': 'propertyCard-priceValue'})]
        under_over = [underover.text.strip() for underover in content.findAll('div', {'class': 'propertyCard-priceQualifier'})]
        dates = [date.text for date in content.findAll('span', {'class': 'propertyCard-branchSummary-addedOrReduced'})]
        sellers = [seller.text.split('by')[-1].strip() for seller in content.findAll('span', {'class': 'propertyCard-branchSummary-branchName'})]
        for index in range(0, len(titles)):
            self.results.append({
                'postcode': pp,
                'title': titles[index],
                'no_of_bedrooms': bedrooms[index],
                'address': addresses[index],
                'description': descriptions[index],
                'price': prices[index],
                'under_over': under_over[index],
                'date': dates[index],
                'seller': sellers[index]})

    def to_csv(self):
        with open('output.csv', 'w') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())
            writer.writeheader()
            for row in self.results:
                writer.writerow(row)
        print('Stored results to "output.csv"')

    def run(self):
        for index, row in df.iterrows():
            pp = row['postcode']
            url = row['url_name']
            response = self.fetch(url)
            self.parse(response.text, pp)
        self.to_csv()

if __name__ == '__main__':
    scraper = RightmoveScraper()
    scraper.run()
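One small refinement worth considering: results = [] is a class attribute, so it is shared by every RightmoveScraper instance and keeps growing if run() is called more than once. Moving it into __init__, and opening the CSV with newline='' (which stops the csv module writing extra blank lines on Windows), would look roughly like this, a minimal sketch of just the changed parts:

class RightmoveScraper:
    def __init__(self):
        # instance attribute: each scraper starts with its own empty list
        self.results = []

    def to_csv(self):
        # newline='' prevents blank lines between CSV rows on Windows
        with open('output.csv', 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())
            writer.writeheader()
            writer.writerows(self.results)
        print('Stored results to "output.csv"')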