Web-Scraping using BeautifulSoup (missing values when scraping)


I have been trying to scrape a realtor website using BeautifulSoup and have run into two difficulties that I cannot seem to fix.

Difficulties:

  1. When I run my code below, I am missing some date values. The dataframe should hold 68 rows of data scraped from the first page. The description and title scrapes return 68 rows each, but the date scrape returns only 66, and no 'N/A' values are returned for the missing ones either. Does anyone have an idea why this is? When I inspected the website elements, the blocks had the same tags, except that those listings are marked as VIP or Special (promotion) apartments.
  2. Secondly, I cannot seem to figure out how to scrape meta itemprop tags. I keep getting blank values when I use:

for tag in soup.findAll('div', attrs={'class':'announcement-block-text-container announcement-block__text-container'}):
    for tag2 in tag.findAll('div', attrs={'class':'announcement-block__date'}):

Thank you in advance for any assistance you could provide.

Python Code:

from urllib.request import urlopen,Request
from bs4 import BeautifulSoup as bsoup
import ssl
import pandas as pd

def get_headers():
   #Headers
   headers={'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
           'accept-language':'en-US,en;q=0.9',
           'cache-control':'max-age=0',
           'upgrade-insecure-requests':'1',
           'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
   return headers

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
count = 1 # for pagination

#Make list holder
title = []
description = []
date = []

urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/']

for x in urls:
   count=1
   y=x
   while(count < 2):  # will get only 1st page
       print(x)
       req = Request(x, headers=get_headers())  #req all headers
       htmlfile = urlopen(req)
       htmltext = htmlfile.read()
       soup = bsoup(htmltext,'html.parser')
       
       for tag in soup.findAll('div',attrs={'class':'announcement-block-text-container announcement-block__text-container'}):
           for tag2 in tag.findAll('a', attrs={'class':'announcement-block__title'}):
               text = tag2.get_text().strip()
               if len(text) > 0:
                   title.append(text)
               else:
                   title.append('N/A')
               
       for tag in soup.findAll('div',attrs={'class':'announcement-block-text-container announcement-block__text-container'}):
           for tag2 in tag.findAll('div', attrs={'class':'announcement-block__description'}):
               text = tag2.get_text().strip()
               if len(text) > 0:
                   description.append(text)
               else:
                   description.append('N/A')
               
       for tag in soup.findAll('div',attrs={'class':'announcement-block-text-container announcement-block__text-container'}):
           for tag2 in tag.findAll('div', attrs={'class':'announcement-block__date'}):
               text = tag2.get_text().strip()
               if len(text) > 0:
                   date.append(text)
               else:
                   date.append('N/A')

       # Go to next page
       count = count + 1
       page = '?page=' + str(count)
       x = y + page

data_frame = pd.DataFrame(list(zip(title,description,date)),columns=['Title', 'Description', 'Date'])

CodePudding user response:

You get only 66 dates because a couple of listing blocks (the VIP/Special promotions) contain no announcement-block__date div at all: when the div is absent, the inner loop body never runs, so your if/else is never reached and nothing, not even 'N/A', gets appended. The else branch does nothing in any case, since there are no announcement-block__date divs with empty content on the page. Instead, iterate over each listing block once and check all three fields together, appending 'N/A' whenever one is missing:

from urllib.request import urlopen,Request
from bs4 import BeautifulSoup as bsoup
import ssl
import pandas as pd

def get_headers():
   #Headers
   headers={'accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
           'accept-language':'en-US,en;q=0.9',
           'cache-control':'max-age=0',
           'upgrade-insecure-requests':'1',
           'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'}
   return headers

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
count = 1 # for pagination

#Make list holder
info = {
    'title': [],
    'description': [],
    'date': []
}

urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/']

for x in urls:
   count=1
   y=x
   while(count < 2):  # will get only 1st page
       print(x)
       req = Request(x, headers=get_headers())  #req all headers
       htmlfile = urlopen(req, context=ctx)  # pass the relaxed SSL context created above
       htmltext = htmlfile.read()
       soup = bsoup(htmltext,'html.parser')
       for tag in soup.findAll('div',attrs={'class':'announcement-block-text-container announcement-block__text-container'}):
            title = tag.find('a', attrs={'class':'announcement-block__title'})
            description = tag.find('div', attrs={'class':'announcement-block__description'})
            date = tag.find('div', attrs={'class':'announcement-block__date'})
            info['title'].append(title.get_text().strip() if title else 'N/A')
            info['description'].append(description.get_text().strip() if description else 'N/A')
            info['date'].append(date.get_text().strip() if date else 'N/A')
       # Go to next page
       count = count + 1
       page = '?page=' + str(count)
       x = y + page

data_frame = pd.DataFrame(list(zip(info['title'], info['description'], info['date'])),columns=['Title', 'Description', 'Date'])
print(len(info['title']), len(info['description']), len(info['date']))
print(data_frame)
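
As a side note, zip() stops at the shortest input, which is why the original three-list approach silently dropped and misaligned rows instead of raising an error. A toy illustration with made-up values:

titles = ['apt A', 'apt B', 'apt C']
dates = ['10-19', '10-20']      # one listing had no date div
print(list(zip(titles, dates))) # [('apt A', '10-19'), ('apt B', '10-20')]; 'apt C' is dropped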

As for your second question, a similar question has already been answered here.
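
In short, meta tags keep their value in the content attribute rather than as text between tags, so get_text() returns an empty string for them; read the attribute instead. A minimal sketch reusing the soup object from the code above (the specific itemprop names on the page are assumptions; 'price' is just a placeholder):

for meta in soup.findAll('meta', attrs={'itemprop': True}):
    # meta tags have no text content; the value lives in the 'content' attribute
    print(meta.get('itemprop'), meta.get('content', 'N/A'))

price_tag = soup.find('meta', attrs={'itemprop': 'price'})  # placeholder property name
price = price_tag['content'] if price_tag else 'N/A'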
