Hello, I am trying to extract the name "beer.master.121" from input_url using a regex, and I am looking for a better regex than the one I currently have.
My current function and its results are the following:
import urllib
from urllib.parse import urlparse, urlsplit
input_url = 'https://www.pizza.com/beer.master.121/margaretha/98799csduu99003/'


def get_url_data(input_url):
    """Return the first run of word characters in the URL path that precedes an inner '/'.

    The URL is only examined when it contains the text ``'margaretha/'``.
    The match is the first run of ``\\w`` characters in the path that is
    followed (after optional whitespace) by a ``/`` and then a character
    that is not ``/``.  The matched text is also printed.

    Args:
        input_url: The URL string to inspect.

    Returns:
        The matched text, or ``None`` when the URL does not contain
        ``'margaretha/'`` or the pattern finds nothing in the path.
    """
    # Local import: the import lines of this snippet do not bring `re` into scope.
    import re

    if 'margaretha/' not in input_url:
        return None

    path_ = urlsplit(input_url).path
    # `\w` does not match '.', so for a dotted segment such as
    # 'beer.master.121' only the last dot-separated piece ('121') sits
    # directly in front of the '/' and is therefore what gets returned.
    match = re.search(r'\w+(?=\s*/[^/])', path_)
    if match is None:
        return None

    publisher = match.group(0)
    print(publisher)
    return publisher
When I run the code, I get only the last word:
get_url_data(input_url)
'121'
Desired outputs:
input_url = 'https://www.pizza.com/beer.master.121/margaretha/98799csduu99003/'
get_url_data(input_url)
'beer.master.121'
input_url = 'https://www.pizza.com/beer.master/margaretha/98799csduuppP000/'
get_url_data(input_url)
'beer.master'
input_url = 'https://www.pizza.com/beer/margaretha/98799csduuppP000/'
get_url_data(input_url)
'beer'
input_url = 'https://www.pizza.com/lovely/10022648/margaretha/939520'
get_url_data(input_url)
'10022648'
input_url = 'https://www.pizza.com/lovely/jhonson.1002278/margaretha/939520'
get_url_data(input_url)
'jhonson.1002278'
CodePudding user response:
Try this:
from urllib.parse import urlsplit
def get_url_data(input_url):
    """Return the path segment that comes directly before ``margaretha``.

    The URL path is split on ``/`` and searched for a segment that is
    exactly ``'margaretha'``; segments that merely contain that text
    (for example ``'notmargaretha'``) are not treated as a match.

    Args:
        input_url: The URL string to inspect.

    Returns:
        The segment preceding the first ``'margaretha'`` segment, or
        ``None`` when the path has no such segment or when nothing
        non-empty precedes it.
    """
    segments = urlsplit(input_url).path.split('/')
    try:
        idx = segments.index('margaretha')
    except ValueError:
        # list.index signals "not found" with ValueError only.
        return None
    if idx == 0:
        # No preceding segment; idx - 1 would wrap around to the last one.
        return None
    # For a path starting with '/', segments[0] is the empty string; treat an
    # empty predecessor the same as a missing one.
    return segments[idx - 1] or None
CodePudding user response:
Another method, which also prints some additional information about each URL.
Output
url: https://www.pizza.com/beer.master.121/margaretha/98799csduu99003/
network location: www.pizza.com
directories: ['beer.master.121', 'margaretha', '98799csduu99003']
target: beer.master.121
url: https://www.pizza.com/beer.master/margaretha/98799csduuppP000/
network location: www.pizza.com
directories: ['beer.master', 'margaretha', '98799csduuppP000']
target: beer.master
url: https://www.pizza.com/beer/margaretha/98799csduuppP000/
network location: www.pizza.com
directories: ['beer', 'margaretha', '98799csduuppP000']
target: beer
url: https://www.pizza.com/lovely/10022648/margaretha/939520
network location: www.pizza.com
directories: ['lovely', '10022648', 'margaretha', '939520']
target: 10022648
url: https://www.pizza.com/lovely/jhonson.1002278/margaretha/939520
network location: www.pizza.com
directories: ['lovely', 'jhonson.1002278', 'margaretha', '939520']
target: jhonson.1002278
Code
from urllib.parse import urlparse
# Sample URLs used to demonstrate the extraction.
urls = [
    'https://www.pizza.com/beer.master.121/margaretha/98799csduu99003/',
    'https://www.pizza.com/beer.master/margaretha/98799csduuppP000/',
    'https://www.pizza.com/beer/margaretha/98799csduuppP000/',
    'https://www.pizza.com/lovely/10022648/margaretha/939520',
    'https://www.pizza.com/lovely/jhonson.1002278/margaretha/939520'
]

# For each URL, report its network location, its path split into
# directories, and the directory located just before 'margaretha'.
for url in urls:
    parsed = urlparse(url)
    segments = parsed.path.strip('/').split('/')
    target = segments[segments.index('margaretha') - 1]

    print()
    print(f'url: {url}')
    print(f'network location: {parsed.netloc}')
    print(f'directories: {segments}')
    print(f'target: {target}')
def get_url_data(url):
    """Return the path directory located directly before ``margaretha``.

    The URL path is stripped of leading and trailing slashes and split
    on ``/``; the entry preceding the first ``'margaretha'`` entry is
    returned.

    Args:
        url: The URL string to inspect.

    Returns:
        The directory name that precedes ``'margaretha'`` in the path.

    Raises:
        ValueError: If the path has no ``'margaretha'`` directory, or if
            ``'margaretha'`` is the first directory so nothing precedes it.
    """
    directories = urlparse(url).path.strip('/').split('/')
    try:
        margaretha_index = directories.index('margaretha')
    except ValueError:
        raise ValueError(
            f"no 'margaretha' directory in URL path: {url!r}"
        ) from None
    if margaretha_index == 0:
        # Index -1 would silently wrap around to the last directory.
        raise ValueError(
            f"no directory precedes 'margaretha' in URL path: {url!r}"
        )
    return directories[margaretha_index - 1]
Reference
https://practicaldatascience.co.uk/data-science/how-to-parse-url-structures-using-python