I got the following list after scraping from the website, let's suppose random.com
# Collect the external resources referenced by the scraped page as a list of
# single-entry dicts of the form {tag name: URL}.
# NOTE(review): `soup` is presumably the BeautifulSoup document parsed from the
# page; it is created outside this snippet.

# src=True keeps only <script> tags that carry a `src` attribute; inline
# scripts have none and tag['src'] would raise KeyError on them.
tags1 = [{tag.name: tag['src']} for tag in soup.find_all('script', src=True)]

# Same guard for stylesheet <link> tags: require `href` to be present.
tags2 = [
    {tag.name: tag['href']}
    for tag in soup.find_all(name="link", attrs={'rel': 'stylesheet', 'href': True})
]

# Scripts first, then stylesheets.
tag_list = tags1 + tags2
print(tag_list)
[{'script': 'js/custom.js'},
 {'script': 'https:cdnjs.cloudflare.com/ajax/libs/fancybox/2.1.5/jquery.fancybox.min.js'},
 {'link': 'css/bootstrap.min.css'},
 {'link': 'css/style.css'},
 {'link': 'css/responsive.css'},
 {'link': 'css/jquery.mCustomScrollbar.min.css'},
 {'link': 'https://netdna.bootstrapcdn.com/font-awesome/4.0.3/css/font-awesome.css'}]
I want to modify this list according to the conditions:
- Remove `https://` from the values.
- Separate the values into two parts: domain and path.
- If a value has no domain name, use `random.com` as the domain.
The expected output will be like:
[{'script': [{'domain': 'random.com', 'path': 'js/custom.js'}]},
 {'script': [{'domain': 'cdnjs.cloudflare.com', 'path': 'ajax/libs/fancybox/2.1.5/jquery.fancybox.min.js'}]},
 {'link': [{'domain': 'random.com', 'path': 'css/bootstrap.min.css'}]},
 {'link': [{'domain': 'random.com', 'path': 'css/style.css'}]},
 {'link': [{'domain': 'random.com', 'path': 'css/responsive.css'}]},
 {'link': [{'domain': 'random.com', 'path': 'css/jquery.mCustomScrollbar.min.css'}]},
 {'link': [{'domain': 'netdna.bootstrapcdn.com', 'path': 'font-awesome/4.0.3/css/font-awesome.css'}]}]
Something like this.
CodePudding user response:
Here is a solution you can try, using `urllib.parse`:
import re
from urllib.parse import urlparse

# Split every scraped URL into a domain and a path. Values without a domain
# (relative paths) are attributed to the scraped site, "random.com".

# Captures everything after the first ":" so that a value with a malformed
# scheme separator (e.g. "https:host/path") can be rebuilt as a well-formed
# "https://host/path" before it is handed to urlparse.
# NOTE(review): any value containing ":" is treated as an absolute URL, so a
# relative path with a colon in it would be misclassified.
extract_uri = re.compile(r":(.+)")

output_ = []
# `tag_list` is the list of {tag name: URL} dicts built while scraping.
for tag in tag_list:
    for k, v in tag.items():
        extract_ = extract_uri.search(v)
        if extract_:
            # Drop only the slashes that directly follow the scheme; a blanket
            # replace("//", "") would also delete "//" inside the path.
            v = "https://" + extract_.group(1).lstrip("/")
        parse_ = urlparse(v)
        output_.append({
            k: [{
                # urlparse leaves netloc empty when the value has no scheme,
                # i.e. for paths relative to the scraped site.
                "domain": parse_.netloc or "random.com",
                # For absolute URLs urlparse keeps the leading "/" on the path.
                "path": parse_.path,
            }]
        })
print(output_)
[{'script': [{'domain': 'random.com', 'path': 'js/custom.js'}]},
{'script': [{'domain': 'cdnjs.cloudflare.com',
'path': '/ajax/libs/fancybox/2.1.5/jquery.fancybox.min.js'}]},
{'link': [{'domain': 'random.com', 'path': 'css/bootstrap.min.css'}]},
{'link': [{'domain': 'random.com', 'path': 'css/style.css'}]},
{'link': [{'domain': 'random.com', 'path': 'css/responsive.css'}]},
{'link': [{'domain': 'random.com',
'path': 'css/jquery.mCustomScrollbar.min.css'}]},
{'link': [{'domain': 'netdna.bootstrapcdn.com',
'path': '/font-awesome/4.0.3/css/font-awesome.css'}]}]
CodePudding user response:
Try this -
import re

# Scheme prefix of an absolute URL: "http:" or "https:" followed by any number
# of slashes, so both the well-formed "https://" and the malformed "https:"
# seen in the scraped data are recognised. Requiring the colon keeps relative
# paths that merely start with the letters "https" from being misread as URLs.
SCHEME_PREFIX = re.compile(r"https?:/*", re.IGNORECASE)


def split_tag_url(url, default_domain='random.com'):
    """Split a scraped URL into its domain and path.

    Args:
        url: The ``src``/``href`` value, either absolute
            ("https://host/path", "https:host/path") or relative
            ("css/style.css").
        default_domain: Domain assigned to relative values.

    Returns:
        A dict with the keys ``'domain'`` and ``'path'``. The path never
        starts with the "/" that separated it from the domain.
    """
    scheme = SCHEME_PREFIX.match(url)
    if scheme is None:
        # No scheme: the whole value is a path on the scraped site.
        return {'domain': default_domain, 'path': url}
    # Everything up to the first "/" is the domain, the rest is the path.
    domain, _, path = url[scheme.end():].partition('/')
    return {'domain': domain, 'path': path}


tagList = [{'script': 'js/custom.js'},
           {'script': 'https:cdnjs.cloudflare.com/ajax/libs/fancybox/2.1.5/jquery.fancybox.min.js'},
           {'link': 'css/bootstrap.min.css'}, {'link': 'css/style.css'},
           {'link': 'css/responsive.css'}, {'link': 'css/jquery.mCustomScrollbar.min.css'},
           {'link': 'https://netdna.bootstrapcdn.com/font-awesome/4.0.3/css/font-awesome.css'}]
print(tagList)

# Rebuild each {tag name: URL} entry as {tag name: [{'domain': ..., 'path': ...}]}.
reqTagList = [
    {name: [split_tag_url(url)]}
    for tag in tagList
    for name, url in tag.items()
]
print(reqTagList)