I have an HTML page and I want to get the links from this page and then convert them into JSON format. This is the link to searchpage
Here is what I have tried.
class HtmltoJsonParser(HTMLParser):
    """HTML parser that records absolute anchor links in a plain dict.

    NOTE: every matching link is written under the attribute name
    (``"href"``), so each new link overwrites the previous one and the
    resulting dict only ever holds the last absolute link of the document.
    """

    def __init__(self, raise_exception=True):
        """Create a parser with an empty result document.

        Args:
            raise_exception: Stored on the instance as ``raise_exception``;
                the parser itself does not consult it.
        """
        super().__init__()
        self.doc = {}
        self.path = []
        # ``cur`` aliases ``doc``: anything written to ``cur`` lands in ``doc``.
        self.cur = self.doc
        self.line = 0
        self.raise_exception = raise_exception

    @property
    def json(self):
        """Return the dict built so far."""
        return self.doc

    @staticmethod
    def to_json(content, raise_exception=True):
        """Parse the HTML string *content* and return the collected dict."""
        parser = HtmltoJsonParser(raise_exception=raise_exception)
        parser.feed(content)
        return parser.json

    def handle_starttag(self, tag, attrs):
        """Store the ``href`` of an ``<a>`` tag when it starts with ``http``.

        Args:
            tag: Name of the start tag being processed.
            attrs: List of ``(name, value)`` attribute pairs for the tag.
        """
        # Only parse the 'anchor' tag.
        if tag != "a":
            return
        for name, link in attrs:
            # A valueless attribute (``<a href>``) is reported with value None.
            if name == "href" and link and link.startswith("http"):
                # The key is always "href", so this assignment replaces the
                # previously stored link instead of accumulating links.
                self.cur["" + name] = link
I based this on a blog post. I want to get output like this:
{
"ads": [
{
"position": 1,
"link": "https://www.googleadservices.com/pagead/aclk?sa=L&ai=DChcSEwitk5Ou2qX6AhVK07IKHdyyCwQYABADGgJscg&ohost=www.google.com&cid=CAASJeRoa3Q-GtJJqeqbQ0EjhhL22QNYj4Sg_79Man_cWa0tjzSi8Ho&sig=AOD64_3-qhJH4tfcxt1VMfxwOTF8BKeFXA&q&adurl&ved=2ahUKEwikz4uu2qX6AhVXAxAIHfwECwoQ0Qx6BAgFEAM",
},
{
"position": 2,
"link": "https://www.googleadservices.com/pagead/aclk?sa=L&ai=DChcSEwitk5Ou2qX6AhVK07IKHdyyCwQYABAAGgJscg&ohost=www.google.com&cid=CAASJeRoa3Q-GtJJqeqbQ0EjhhL22QNYj4Sg_79Man_cWa0tjzSi8Ho&sig=AOD64_1ZUcXQhcCFUYnBHo3jqlckXL2agg&q&adurl&ved=2ahUKEwikz4uu2qX6AhVXAxAIHfwECwoQ0Qx6BAgCEAE",
} ] }
but I'm getting this:
{'href': 'https://policies.google.com/terms?hl=en-PL&fg=1'}
Why is the link not being appended to the JSON in self.cur? I have tried appending it, but I got a KeyError every time.
CodePudding user response:
The problem is here:
self.cur["" + name] = link
Because name == 'href' is true for every matching attribute, this always overwrites
the value stored under that same key
instead of appending it to a list. Try this:
def handle_starttag(self, tag, attrs):
    """Append each absolute anchor link to ``self.doc["ads"]``.

    Every ``href`` of an ``<a>`` tag that starts with ``http`` becomes a
    ``{"position": n, "link": href}`` entry, where ``n`` counts the links
    recorded so far (the first recorded link gets ``self.line + 1``).

    Args:
        tag: Name of the start tag being processed.
        attrs: List of ``(name, value)`` attribute pairs for the tag.
    """
    # Only parse the 'anchor' tag.
    if tag != "a":
        return
    for name, link in attrs:
        # A valueless attribute (``<a href>``) is reported with value None.
        if name == "href" and link and link.startswith("http"):
            # Advance the counter before use so positions count upwards
            # instead of staying pinned to a single value.
            self.line += 1
            # ``doc`` may not have an "ads" list yet; create it on first use
            # rather than raising KeyError.
            self.doc.setdefault("ads", []).append(
                {"position": self.line, "link": link}
            )
However, given the links you mentioned, you should change link.startswith("http")
to link.startswith("/url?q=").