How to filter from a list of dictionaries and write to a file?-CodePudding

This is a follow-up to my previous question: How to format the dictionary object in the list of dictionaries?

print(tag_list)

[[{'script': [{'domain': 'random.com', 'path': 'js/custom.js'}]},
 {'script': [{'domain': 'cdnjs.cloudflare.com',
              'path': '/ajax/libs/fancybox/2.1.5/jquery.fancybox.min.js'}]},
 {'link': [{'domain': 'random.com', 'path': 'css/bootstrap.min.css'}]},
 {'link': [{'domain': 'random.com', 'path': 'css/style.css'}]},
 {'link': [{'domain': 'random.com', 'path': 'css/responsive.css'}]},
 {'link': [{'domain': 'random.com',
            'path': 'css/jquery.mCustomScrollbar.min.css'}]},
 {'link': [{'domain': 'netdna.bootstrapcdn.com',
            'path': '/font-awesome/4.0.3/css/font-awesome.css'}]}]]

I want to collect every value stored under the 'domain' key and write these values to a new file, domain.txt, one per line.

domain.txt

random.com
cdnjs.cloudflare.com
netdna.bootstrapcdn.com

Duplicates should be avoided: each domain should appear in the file only once.

CodePudding user response：

One approach:

data = [[{'script': [{'domain': 'random.com', 'path': 'js/custom.js'}]},
         {'script': [{'domain': 'cdnjs.cloudflare.com',
                      'path': '/ajax/libs/fancybox/2.1.5/jquery.fancybox.min.js'}]},
         {'link': [{'domain': 'random.com', 'path': 'css/bootstrap.min.css'}]},
         {'link': [{'domain': 'random.com', 'path': 'css/style.css'}]},
         {'link': [{'domain': 'random.com', 'path': 'css/responsive.css'}]},
         {'link': [{'domain': 'random.com',
                    'path': 'css/jquery.mCustomScrollbar.min.css'}]},
         {'link': [{'domain': 'netdna.bootstrapcdn.com',
                    'path': '/font-awesome/4.0.3/css/font-awesome.css'}]}]]


def unique_domains(groups):
    """Return the distinct 'domain' values found in *groups*, in first-seen order.

    *groups* is a list of lists of single-key dicts shaped like ``data``
    above: each dict maps 'script' or 'link' to a list of dicts that may
    carry a 'domain' key. Dicts with neither key, empty lists, and entries
    without a 'domain' are skipped instead of raising.
    """
    # the set gives fast duplicate checks; the list remembers the order
    seen = set()
    ordered = []
    for group in groups:
        for tag in group:
            # get domain data either from script or link, else nothing
            entries = tag.get("script") or tag.get("link") or []

            # look at every entry, not only the first one
            for entry in entries:
                domain = entry.get("domain")
                if domain and domain not in seen:
                    seen.add(domain)
                    ordered.append(domain)
    return ordered


# open file for writing and emit one domain per line
with open("domain.txt", "w") as outfile:
    outfile.writelines(f"{domain}\n" for domain in unique_domains(data))

Output

random.com
cdnjs.cloudflare.com
netdna.bootstrapcdn.com

CodePudding user response：

It does rather look as though the abundance of inner lists is unnecessary but in case you really need them then this should handle all eventualities:

taglist = [[{'script': [{'domain': 'random.com', 'path': 'js/custom.js'}]},
 {'script': [{'domain': 'cdnjs.cloudflare.com',
              'path': '/ajax/libs/fancybox/2.1.5/jquery.fancybox.min.js'}]},
 {'link': [{'domain': 'random.com', 'path': 'css/bootstrap.min.css'}]},
 {'link': [{'domain': 'random.com', 'path': 'css/style.css'}]},
 {'link': [{'domain': 'random.com', 'path': 'css/responsive.css'}]},
 {'link': [{'domain': 'random.com',
            'path': 'css/jquery.mCustomScrollbar.min.css'}]},
 {'link': [{'domain': 'netdna.bootstrapcdn.com',
            'path': '/font-awesome/4.0.3/css/font-awesome.css'}]}]]
# Use a dict rather than a set: its keys are unique and keep insertion
# order, so the file lists each domain once, in the order first seen.
# Iterating a set of strings can give a different order on every run.
domains = {}
for tag in taglist:
    for subtag in tag:
        # Prefer the 'script' entries; fall back to 'link'; skip anything else.
        if (entries := subtag.get('script')) is None:
            if (entries := subtag.get('link')) is None:
                continue
        for entry in entries:
            # Ignore entries with a missing or empty 'domain'.
            if (domain := entry.get('domain')):
                domains[domain] = None
# Open the file only once the domains are known.
with open('domain.txt', 'w') as dfile:
    for domain in domains:
        print(domain, file=dfile)

[ Note: You'll need Python 3.8 or later for this, because it uses the := (walrus) assignment operator ]