I have a data-mining script that returns my data in arrays like these:
# Sample scraper output: one price-per-square-metre value per listing (truncated for the post).
price_per_m2 = [742.0, 1210.0, 954.0, 1078.0, 910.0, 1553.0, 0, 1.0, 417.0, 553.0, 41.0, 550.0, 367.0, 11.0, 533.0, 2.0, 1139.0, 1466.0, 1042.0, 800.0, 906.0, 60.0, 91.0, 812.0, 412.0, 1000.0, 64.0, 778.0, 63.0, 1043.0, 899.0, 951.0]
# Sample scraper output: one property-type label per listing (truncated for the post).
type_of_property = ['Магазин', 'Двустаен апартамент', 'Тристаен апартамент', 'Тристаен апартамент', 'Тристаен апартамент', 'Тристаен апартамент', 'Парцел', 'Парцел', 'Гараж', 'Офис', 'Заведение', 'Офис', 'Гараж', 'Парцел', 'Офис', 'Парцел', 'Офис', 'Офис', 'Магазин', 'Магазин', 'Гараж', 'Земеделски имот', 'Парцел', 'Магазин', 'Офис', 'Двустаен апартамент', 'Парцел', 'Магазин', 'Парцел', 'Двустаен апартамент', 'Едностаен апартамент', 'Двустаен апартамент', 'Офис', 'Едностаен апартамент', 'Земеделски имот', 'Офис', 'Едностаен апартамент', 'Едностаен апартамент', 'Магазин', 'Двустаен апартамент', 'Офис', 'Двустаен апартамент', 'Едностаен апартамент', 'Двустаен апартамент']
- Note that the two arrays shown here may not be of equal length, because I pasted only part of each one; the complete arrays are too long to include.
The ultimate goal is to create, every day, one Excel file out of all the arrays extracted that day.
However the goal for now is:
- create a pandas array out of one of the above mentioned arrays
- save that array to an Excel file.
What I have done so far:
# Build one single-column DataFrame per list (with and without the
# explicit ``data=`` keyword; both forms are equivalent).
df_price_per_m2 = pd.DataFrame(data=price_per_m2)
df_type_of_property = pd.DataFrame(type_of_property)
# Write both frames into the same workbook as separate sheets. Calling
# to_excel('sqm.xlsx') once per frame would make the second call
# overwrite the file produced by the first.
with pd.ExcelWriter('sqm.xlsx') as writer:
    df_price_per_m2.to_excel(writer, sheet_name='price_per_m2')
    df_type_of_property.to_excel(writer, sheet_name='type_of_property')
As you will notice, I have tried both with and without the 'data=' keyword. My program raises an error on the first line of this code.
Full program:
from requests_html import HTMLSession
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
# One HTTP session, created once and reused for every request in the script.
s = HTMLSession()
# First page of the search results; fetched up front so that the
# pagination links can be read from it.
url = 'https://www.imoti.net/bg/obiavi/r/prodava/sofia/?page=1&sid=fSNNpb'
r = s.get(url)
# Parsed first page, used to look up the number of the last results page.
soup_for_last_page = BeautifulSoup(r.text, 'html.parser')
# Get all the data from the page
def getdata(url):
    """Download ``url`` with the shared session and return the parsed HTML.

    The response text is parsed with BeautifulSoup using the built-in
    ``html.parser`` backend.
    """
    response = s.get(url)
    return BeautifulSoup(response.text, 'html.parser')
def getnextpage(soup):
    """Return the URL of the next results page, or ``None`` if there is none.

    Args:
        soup: Parsed page exposing ``find`` (e.g. the result of ``getdata``).

    Returns:
        The ``href`` of the ``a`` element with class ``next-page-btn`` found
        inside the ``nav`` element with class ``paginator``, as a string.
        ``None`` when the page has no paginator or no next-page link.
    """
    paginator = soup.find('nav', {'class': 'paginator'})
    if paginator is None:
        # No paginator means no next page; without this guard the lookup
        # below would raise AttributeError on None.
        return None
    # Look the link up once and reuse it for both the test and the href.
    next_link = paginator.find('a', {'class': 'next-page-btn'})
    if not next_link:
        return None
    return str(next_link['href'])
# The text of the "last-page" link is read as the number of result pages.
last_page = soup_for_last_page.find('a', {'class': 'last-page'})
last_page_number = int(last_page.get_text())
# One URL per results page. range() excludes its stop value, so the stop
# is last_page_number + 1 to include the last page as well.
urls = []
for page in range(1, last_page_number + 1):
    url = f'https://www.imoti.net/bg/obiavi/r/prodava/sofia/?page={page}&sid=fSNNpb'
    urls.append(url)
# while True:
# soup = getdata(url)
# url = getnextpage(soup)
# if not url:
# break
# urls.append(url)
# #print(url)
# Module-level result lists, one per scraped field. Each starts empty and
# is appended to in place by the scraping function that owns it.
sqm_area = []          # filled by get_sqm
locations = []         # filled by get_location
type_of_property = []  # filled by get_type
publisher = []         # filled by get_publisher
price_per_m2 = []      # filled by get_price_per_m2
prices = []            # filled by total_price
def price_per_m2_0(x):
    """Extract the price per square metre from an element's text.

    Args:
        x: Object exposing ``get_text()`` (e.g. a BeautifulSoup tag).

    Returns:
        The number that follows the first ``'/:'`` marker in the text, as a
        float with ``EUR`` and spaces removed, or ``0`` when the text
        contains no ``'/:'`` marker.

    Raises:
        ValueError: If the text after the marker is not a valid number.
    """
    # Read and strip the text once instead of once per use.
    text = x.get_text().strip()
    if '/:' not in text:
        return 0
    # Index 1 is the segment between the first marker and the next one
    # (or the end of the text when there is only one marker).
    raw_price = text.split('/:')[1]
    return float(raw_price.strip().replace('EUR', '').strip().replace(' ', ''))
def get_sqm(links):
    """Collect the area value of every listing on the given result pages.

    Each URL in ``links`` is downloaded with ``getdata``. For every ``div``
    with class ``inline-group`` inside the ``ul`` with class
    ``list-view real-estates``, the first whitespace-separated token after
    the first comma of the element's text is appended to the module-level
    ``sqm_area`` list, which is then returned.
    """
    for page_url in links:
        listing = getdata(page_url).find('ul', {'class': 'list-view real-estates'})
        for entry in listing.find_all('div', {'class': 'inline-group'}):
            after_first_comma = entry.get_text().split(',')[1]
            sqm_area.append(after_first_comma.split()[0])
    return sqm_area
def get_location(links):
    """Collect the location text of every listing on the given result pages.

    Each URL in ``links`` is downloaded with ``getdata``. For every ``div``
    with class ``inline-group`` inside the ``ul`` with class
    ``list-view real-estates``, the stripped text after the last comma is
    appended to the module-level ``locations`` list, which is then returned.
    """
    for page_url in links:
        listing = getdata(page_url).find('ul', {'class': 'list-view real-estates'})
        for entry in listing.find_all('div', {'class': 'inline-group'}):
            comma_parts = entry.get_text().split(',')
            locations.append(comma_parts[-1].strip())
    return locations
def get_type(links):
    """Collect the property type of every listing on the given result pages.

    Each URL in ``links`` is downloaded with ``getdata``. For every ``div``
    with class ``inline-group`` inside the ``ul`` with class
    ``list-view real-estates``, the second and third words of the text
    before the first comma are joined with a space and appended to the
    module-level ``type_of_property`` list, which is then returned.
    """
    for page_url in links:
        listing = getdata(page_url).find('ul', {'class': 'list-view real-estates'})
        for entry in listing.find_all('div', {'class': 'inline-group'}):
            words = entry.get_text().split(',')[0].split()
            type_of_property.append(' '.join(words[1:3]))
    return type_of_property
def get_publisher(links):
    """Collect the publisher label of every listing on the given result pages.

    Each URL in ``links`` is downloaded with ``getdata``. Of the ``span``
    elements with class ``re-offer-type`` inside the ``ul`` with class
    ``list-view real-estates``, every second one (starting with the second)
    has its stripped text appended to the module-level ``publisher`` list,
    which is then returned.
    """
    for page_url in links:
        listing = getdata(page_url).find('ul', {'class': 'list-view real-estates'})
        offer_spans = listing.find_all('span', {'class': 're-offer-type'})
        for span in offer_spans[1::2]:
            publisher.append(span.get_text().strip())
    return publisher
def get_price_per_m2(links):
    """Collect the price per square metre of every listing on the given pages.

    Each URL in ``links`` is downloaded with ``getdata``. Every ``ul`` with
    class ``parameters`` inside the ``ul`` with class
    ``list-view real-estates`` is converted with ``price_per_m2_0`` and the
    result appended to the module-level ``price_per_m2`` list, which is
    then returned.
    """
    for page_url in links:
        listing = getdata(page_url).find('ul', {'class': 'list-view real-estates'})
        for parameters in listing.find_all('ul', {'class': 'parameters'}):
            price_per_m2.append(price_per_m2_0(parameters))
    return price_per_m2
def total_price(links):
    """Collect the total price of every listing on the given result pages.

    Args:
        links: Iterable of result-page URLs, each downloaded with ``getdata``.

    Returns:
        The module-level ``prices`` list, extended with one string per
        ``strong`` element with class ``price`` found inside the ``ul`` with
        class ``list-view real-estates``: all digits of the element's text
        joined together (``''`` when the text contains no digits).
    """
    for link in links:
        soup = getdata(link)
        listing = soup.find('ul', {'class': 'list-view real-estates'})
        for price in listing.find_all('strong', {'class': 'price'}):
            # Match whole runs of digits and join all of them, so a price
            # written with separators (e.g. '125 000') becomes '125000'
            # instead of keeping only one fragment.
            digit_groups = re.findall(r'[0-9]+', price.get_text())
            prices.append(''.join(digit_groups))
    return prices
# Run each scraper exactly once and keep its result: every call downloads
# all pages again and appends to its module-level list, so calling a
# scraper a second time would duplicate the rows.
sqm_values = get_sqm(urls)
print(sqm_values)
location_values = get_location(urls)
print(location_values)
type_values = get_type(urls)
print(type_values)
publisher_values = get_publisher(urls)
print(publisher_values)
price_per_m2_values = get_price_per_m2(urls)
print(price_per_m2_values)
total_price_values = total_price(urls)
print(total_price_values)

# pd.DataFrame needs the scraped lists, not the function objects: passing
# e.g. ``get_sqm`` itself raises
# "ValueError: DataFrame constructor not properly called!".
df_get_sqm = pd.DataFrame(data=sqm_values)
df_get_location = pd.DataFrame(location_values)
df_get_type = pd.DataFrame(type_values)
df_get_publisher = pd.DataFrame(publisher_values)
df_get_price_per_m2 = pd.DataFrame(price_per_m2_values)
df_total_price = pd.DataFrame(total_price_values)
df_get_sqm.to_excel('sqm.xlsx')
EDIT: Error message that I get:
Traceback (most recent call last):
File "/Users/tdonov/Desktop/Python/Realestate Scraper/real_estate_test.py", line 130, in <module>
df_get_sqm = pd.DataFrame(data=get_sqm)
File "/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py", line 590, in __init__
raise ValueError("DataFrame constructor not properly called!")
ValueError: DataFrame constructor not properly called!
[Finished in 70.198s]
CodePudding user response:
Try:
# The dict key 'price' becomes the column name; the list supplies its values.
df_price_per_m2 = pd.DataFrame({'price': price_per_m2})
CodePudding user response:
create a pandas array out of one of the above mentioned arrays
Please note that things like [1,2,3] are generally called lists, not arrays, in Python. If you have a single flat list (like your price_per_m2), then a pandas.Series should suffice; please try the following:
import pandas as pd
# Sample price-per-square-metre values as a single flat list.
price_per_m2 = [742.0, 1210.0, 954.0, 1078.0, 910.0, 1553.0, 0, 1.0, 417.0, 553.0, 41.0, 550.0, 367.0, 11.0, 533.0, 2.0, 1139.0, 1466.0, 1042.0, 800.0, 906.0, 60.0, 91.0, 812.0, 412.0, 1000.0, 64.0, 778.0, 63.0, 1043.0, 899.0, 951.0]
# A flat list maps directly onto a one-dimensional Series.
s = pd.Series(price_per_m2)
# Write the Series to an Excel workbook named 'sqm.xlsx'.
s.to_excel('sqm.xlsx')
Read the pandas.Series.to_excel docs if you want to know more about writing a pandas.Series to an Excel file.