How to format the dictionary object in the list of dictionaries?-CodePudding

I got the following list after scraping a website (let's call it random.com):

# Collect one {tag name: URL} dict per external script and per stylesheet link.
# `soup` is assumed to be an already-parsed BeautifulSoup document.
# Only tags that actually carry the attribute are selected: an inline
# <script> without `src` would otherwise raise KeyError on tag['src'].
tags1 = [{tag.name: tag['src']} for tag in soup.find_all('script', src=True)]
tags2 = [
    {tag.name: tag['href']}
    for tag in soup.find_all(name="link", attrs={'rel': 'stylesheet', 'href': True})
]
tag_list = tags1 + tags2

print(tag_list)

[{'script': 'js/custom.js'},
 {'script': 'https:cdnjs.cloudflare.com/ajax/libs/fancybox/2.1.5/jquery.fancybox.min.js'},
 {'link': 'css/bootstrap.min.css'},
 {'link': 'css/style.css'},
 {'link': 'css/responsive.css'},
 {'link': 'css/jquery.mCustomScrollbar.min.css'},
 {'link': 'https://netdna.bootstrapcdn.com/font-awesome/4.0.3/css/font-awesome.css'}]

I want to modify this list according to the conditions:

remove https:// from the values
Separate the values into two parts: domain and path.
If there are no domain names, add the domain name as random.com

The expected output will be like:

[{'script': [{'domain': 'random.com', 'path': 'js/custom.js'}]},
 {'script': [{'domain': 'cdnjs.cloudflare.com', 'path': 'ajax/libs/fancybox/2.1.5/jquery.fancybox.min.js'}]},
 {'link': [{'domain': 'random.com', 'path': 'css/bootstrap.min.css'}]},
 {'link': [{'domain': 'random.com', 'path': 'css/style.css'}]},
 {'link': [{'domain': 'random.com', 'path': 'css/responsive.css'}]},
 {'link': [{'domain': 'random.com', 'path': 'css/jquery.mCustomScrollbar.min.css'}]},
 {'link': [{'domain': 'netdna.bootstrapcdn.com', 'path': 'font-awesome/4.0.3/css/font-awesome.css'}]}]

Something like this.

CodePudding user response：

Here is a solution you can try, using urllib.parse:

import re
from urllib.parse import urlparse

# Sample input: the scraped list from the question (its `tag_list`).
tags = [
    {'script': 'js/custom.js'},
    {'script': 'https:cdnjs.cloudflare.com/ajax/libs/fancybox/2.1.5/jquery.fancybox.min.js'},
    {'link': 'css/bootstrap.min.css'},
    {'link': 'css/style.css'},
    {'link': 'css/responsive.css'},
    {'link': 'css/jquery.mCustomScrollbar.min.css'},
    {'link': 'https://netdna.bootstrapcdn.com/font-awesome/4.0.3/css/font-awesome.css'},
]

output_ = []

# Captures everything after the first ":" so the scheme part can be rebuilt.
# This is what lets a malformed "https:host/path" (no slashes) be repaired.
extract_uri = re.compile(r":(.+)")

for tag in tags:
    for k, v in tag.items():

        extract_ = extract_uri.search(v)

        # A ":" means the value carries a scheme: re-prefix it with a
        # well-formed "https://". Only the slashes directly after the colon
        # are dropped, so a "//" later in the path is left untouched.
        if extract_:
            v = "https://" + extract_.group(1).lstrip("/")

        parse_ = urlparse(v)  # split into netloc (domain) and path
        output_.append({
            k: [{
                # Relative URLs have an empty netloc; they belong to the site itself.
                "domain": parse_.netloc or "random.com",
                # For absolute URLs urlparse keeps the leading "/" on the path.
                "path": parse_.path,
            }]
        })

print(output_)

[{'script': [{'domain': 'random.com', 'path': 'js/custom.js'}]},
 {'script': [{'domain': 'cdnjs.cloudflare.com',
              'path': '/ajax/libs/fancybox/2.1.5/jquery.fancybox.min.js'}]},
 {'link': [{'domain': 'random.com', 'path': 'css/bootstrap.min.css'}]},
 {'link': [{'domain': 'random.com', 'path': 'css/style.css'}]},
 {'link': [{'domain': 'random.com', 'path': 'css/responsive.css'}]},
 {'link': [{'domain': 'random.com',
            'path': 'css/jquery.mCustomScrollbar.min.css'}]},
 {'link': [{'domain': 'netdna.bootstrapcdn.com',
            'path': '/font-awesome/4.0.3/css/font-awesome.css'}]}]

CodePudding user response：

Try this -

import re

# Sample input: one {tag name: URL} dict per scraped tag.
tagList = [
    {'script': 'js/custom.js'},
    {'script': 'https:cdnjs.cloudflare.com/ajax/libs/fancybox/2.1.5/jquery.fancybox.min.js'},
    {'link': 'css/bootstrap.min.css'},
    {'link': 'css/style.css'},
    {'link': 'css/responsive.css'},
    {'link': 'css/jquery.mCustomScrollbar.min.css'},
    {'link': 'https://netdna.bootstrapcdn.com/font-awesome/4.0.3/css/font-awesome.css'},
]

print(tagList)

# "https" followed by any run of non-word characters, so both the well-formed
# "https://" prefix and the malformed "https:" one are consumed.
scheme_prefix = re.compile(r"https\W*")

reqTagList = []
for entry in tagList:
    for tag_name, raw_value in entry.items():
        prefix = scheme_prefix.match(raw_value)
        if prefix is None:
            # No scheme: a relative URL, which belongs to the scraped site.
            domain, path = 'random.com', raw_value
        else:
            # Everything up to the first "/" is the domain, the rest is the path.
            domain, _, path = raw_value[prefix.end():].partition('/')
        reqTagList.append({tag_name: [{'domain': domain, 'path': path}]})

print(reqTagList)