Home > Net >  How to get information in div tag with html.parser?
How to get information in div tag with html.parser?

Time:01-18

I have an HTML tree where the needed price is inside a div tag, below a span tag. My code prints an empty value. How do I get the needed price?

<div class='card-product__price'>
    <span class='card-product__price-old'>Old price is here</span>
        'Needed price is here'
</div>
class Parser(HTMLParser):
    """Record the first text chunk seen after a product-details ``<div>``.

    The handlers read and write ``price_is_found``, ``is_price_field``,
    ``_product_info`` and ``lst`` on the instance; those attributes are
    initialised outside this excerpt and must exist before ``feed()``.
    """

    def handle_starttag(self, tag, attrs):
        """Flag the price field when the details ``<div>`` is opened.

        Args:
            tag: Lower-cased tag name reported by ``HTMLParser``.
            attrs: List of ``(name, value)`` attribute pairs.
        """
        if (not self.price_is_found and
                'class' not in self._product_info and
                tag == 'div'):
            attrs = dict(attrs)
            if attrs.get('class') == 'card-product__details':
                self.is_price_field = True

    def handle_data(self, data):
        """Store the first text chunk delivered once the field is flagged.

        Args:
            data: Raw text between two tags, whitespace included.
        """
        if (not self.price_is_found and
                self.is_price_field and
                'class' not in self._product_info):
            self.lst.append(data)
            # Keep digits and spaces only. Raw string, because ``\d`` in a
            # plain literal is an invalid escape sequence.
            # NOTE(review): the first chunk after the opening tag is usually
            # the whitespace before a nested element, which leaves the price
            # blank - confirm against the real markup.
            self._product_info['price'] = re.sub(r'[^\d ]', '', data)
            self.price_is_found = True

CodePudding user response:

Updated version :

from bs4 import BeautifulSoup
# The sample markup has to carry the class attributes the lookup relies on;
# without them find() returns None and the attribute access below raises.
html = (
    '<div class="card-product__price"> '
    '<span class="card-product__price-old">Old price is here</span> '
    '"Needed price is here" </div>'
)
soup = BeautifulSoup(html, 'html.parser')
price_div = soup.find('div', {'class': 'card-product__price'})
# `.text` would also return the nested <span> (the old price), so join only
# the text nodes that are direct children of the div.
price = ''.join(price_div.find_all(string=True, recursive=False)).strip()
print(price)

This will print out the string "Needed price is here". You can also use a CSS selector to get the desired value in a more elegant way.

# select_one() takes a CSS selector. Only the div's own text nodes are joined,
# so the old price inside the nested <span> is left out.
price_div = soup.select_one('.card-product__price')
price = ''.join(price_div.find_all(string=True, recursive=False)).strip()

You can also use the PyQuery library, which uses jQuery-like syntax to get the desired value.

from pyquery import PyQuery as pq

# Wrap the markup in a PyQuery object so it can be queried with selectors.
doc = pq(html)
# Select the element by CSS class and read its text.
# NOTE(review): .text() presumably also returns the nested <span> text (the
# old price), not only the needed price - confirm against the PyQuery docs.
price = doc('.card-product__price').text()

If you want to use html.parser then try this out.

from html.parser import HTMLParser

class PriceParser(HTMLParser):
    """Extract the text that sits directly inside the price ``<div>``.

    Text inside child elements (such as the old-price ``<span>``) and
    whitespace-only chunks are ignored. After ``feed()``, ``price`` holds
    the last non-blank direct text chunk, or ``""`` if none was found.
    """

    # Elements that never get a closing tag; they must not change nesting.
    _VOID_TAGS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'source', 'track', 'wbr',
    })

    def __init__(self):
        super().__init__()
        self.in_price_tag = False
        self.price = ""
        # Number of child elements currently open inside the price div.
        self._child_depth = 0

    def handle_starttag(self, tag, attrs):
        """Enter the price div, or count a child element opened inside it."""
        if self.in_price_tag:
            if tag not in self._VOID_TAGS:
                self._child_depth += 1
            return
        if tag == 'div' and dict(attrs).get('class') == 'card-product__price':
            self.in_price_tag = True
            self._child_depth = 0

    def handle_endtag(self, tag):
        """Close a child element, or leave the price div itself."""
        if not self.in_price_tag or tag in self._VOID_TAGS:
            return
        if self._child_depth:
            self._child_depth -= 1
        else:
            self.in_price_tag = False

    def handle_data(self, data):
        """Keep non-blank text found directly inside the price div."""
        text = data.strip()
        # The first chunk after the opening tag is usually just whitespace,
        # and the old price lives one level deeper - skip both.
        if self.in_price_tag and not self._child_depth and text:
            self.price = text

# The sample markup has to carry the class attribute the parser matches on;
# without it the price div is never recognised and nothing is extracted.
html = (
    '<div class="card-product__price"> '
    '<span class="card-product__price-old">Old price is here</span> '
    '"Needed price is here" </div>'
)
parser = PriceParser()
parser.feed(html)
print(parser.price)

CodePudding user response:

Try:

from html.parser import HTMLParser

class PriceParser(HTMLParser):
    """Collect every non-blank text chunk found inside the price ``<div>``.

    Chunks are stripped and appended to ``prices`` in document order, text
    of nested elements included. Collection stops at the first ``</div>``.
    """

    def __init__(self):
        super().__init__()
        self.in_price_tag = False
        self.prices = []

    def handle_starttag(self, tag, attrs):
        """Start collecting when the price div is opened."""
        if tag == 'div' and dict(attrs).get('class') == 'card-product__price':
            self.in_price_tag = True

    def handle_endtag(self, tag):
        """Stop collecting at a closing ``</div>``."""
        if tag == 'div':
            self.in_price_tag = False

    def handle_data(self, data):
        """Append the stripped chunk unless it is blank."""
        if not self.in_price_tag:
            return
        text = data.strip()
        # Whitespace between tags would otherwise land in the list as "" and
        # could end up as the last element.
        if text:
            self.prices.append(text)

# The sample markup has to carry the class attribute the parser matches on;
# without it `prices` stays empty and indexing the last element raises.
html = (
    '<div class="card-product__price"> '
    '<span class="card-product__price-old">Old price is here</span> '
    '"Needed price is here" </div>'
)
parser = PriceParser()
parser.feed(html)
print(parser.prices[-1])  # <-- the needed price is the last element of the list

Prints:

"Needed price is here"
  • Related