I am trying to scrape the "Estadístiques" section, including every subsection inside it, and build a list with the URLs of all the subsections.
I thought this code ran correctly, but I noticed that, for example, the scraping of "Estadística de l'ensenyament 2021-2022" does not pick up all of the subsections inside it.
import requests
from bs4 import BeautifulSoup
import pandas as pd

education_statistic_section = "https://educacio.gencat.cat/ca/departament/estadistiques/"  # Main page with the subsections of the education statistics section -- level 0
html_eduaction_section_levels = "distribuidora-item grey"  # Class that marks each education section level
web_education = "https://educacio.gencat.cat"

list_title_level_1 = []
list_web_level_1 = []
list_all_web = []
list_title_subsecction = []
list_web_subsecction = []
list_title_document = []
list_web_document = []

def parse_url(url):
    response = requests.get(url)
    content = response.content
    parsed_response = BeautifulSoup(content, "lxml")
    return parsed_response

def first_secction_statistic():  # Scrape the title and URL of every div with class "distribuidora-item grey" on education_statistic_section
    soup = parse_url(education_statistic_section)  # Parse the main page
    html_div_level_1 = soup.find_all('div', {'class': html_eduaction_section_levels})  # All divs with class "distribuidora-item grey"
    for html_elements_level_1 in html_div_level_1:  # For each element found
        list_title_level_1.append(html_elements_level_1.text.strip())  # Extract the text and append it to the list
        html_tags_as_level_1 = html_elements_level_1.find('a')  # Get the <a> tag inside the div
        list_web_level_1.append(web_education + html_tags_as_level_1.get('href'))  # Build the absolute URL and append it to the list

first_secction_statistic()
# csv = pd.DataFrame({'Títols nivell 1': pd.Series(list_title_level_1), 'Web nivell 1': pd.Series(list_web_level_1)})

def all_web_subsecction_statistic():  # List every subsection of the education statistics section
    for i in list_web_level_1:  # For each level-1 URL
        soup = parse_url(i)  # Parse the level-1 page
        html_tags_a = soup.find_all('a')  # Find every <a> tag -- these hold the URLs of documents and links of interest
        for element in html_tags_a:
            str_element = str(element.get('href'))  # The href attribute as a string
            if str_element.startswith('/ca/departament/estadistiques/'):  # Keep only statistics subsections
                subsecction_web_statistic = web_education + str_element  # Absolute URL: web_education plus the relative href
                if subsecction_web_statistic not in list_all_web:  # Add it if it is not already in the list
                    list_all_web.append(subsecction_web_statistic)

all_web_subsecction_statistic()
list_all_web
CodePudding user response:
Finally, I iterated two more levels down and got it:
import requests
from bs4 import BeautifulSoup
import pandas as pd

education_statistic_section = "https://educacio.gencat.cat/ca/departament/estadistiques/"  # Main page with the subsections of the education statistics section -- level 0
html_eduaction_section_levels = "distribuidora-item grey"  # Class that marks each education section level
web_education = "https://educacio.gencat.cat"

list_title_level_1 = []
list_web_level_1 = []
list_all_web = []
list_title_subsecction = []
list_web_subsecction = []
list_title_document = []
list_web_document = []

def parse_url(url):
    response = requests.get(url)
    content = response.content
    parsed_response = BeautifulSoup(content, "lxml")
    return parsed_response

def first_secction_statistic():  # Scrape the title and URL of every div with class "distribuidora-item grey" on education_statistic_section
    soup = parse_url(education_statistic_section)  # Parse the main page
    html_div_level_1 = soup.find_all('div', {'class': html_eduaction_section_levels})  # All divs with class "distribuidora-item grey"
    for html_elements_level_1 in html_div_level_1:  # For each element found
        list_title_level_1.append(html_elements_level_1.text.strip())  # Extract the text and append it to the list
        html_tags_as_level_1 = html_elements_level_1.find('a')  # Get the <a> tag inside the div
        list_web_level_1.append(web_education + html_tags_as_level_1.get('href'))  # Build the absolute URL and append it to the list

first_secction_statistic()
# csv = pd.DataFrame({'Títols nivell 1': pd.Series(list_title_level_1), 'Web nivell 1': pd.Series(list_web_level_1)})

def all_web_subsecction_statistic():  # List every subsection of the education statistics section
    for i in list_web_level_1:  # For each level-1 URL
        soup = parse_url(i)  # Parse the level-1 page
        html_tags_a = soup.find_all('a')  # Find every <a> tag -- these hold the URLs of documents and links of interest
        for element in html_tags_a:
            str_element = str(element.get('href'))  # The href attribute as a string
            if str_element.startswith('/ca/departament/estadistiques/'):  # Keep only statistics subsections
                subsecction_web_statistic = web_education + str_element  # Absolute URL: web_education plus the relative href
                # Go one level deeper: if this subsection page links to further statistics pages, make sure its URL is recorded
                soup = parse_url(subsecction_web_statistic)
                html_tags_a = soup.find_all('a')
                for element in html_tags_a:
                    str_element = str(element.get('href'))
                    if str_element.startswith('/ca/departament/estadistiques/'):
                        if subsecction_web_statistic not in list_all_web:
                            list_all_web.append(subsecction_web_statistic)
                if subsecction_web_statistic not in list_all_web:  # Add it if it is not already in the list
                    list_all_web.append(subsecction_web_statistic)

all_web_subsecction_statistic()

# Second pass: repeat the same scan over every URL collected so far.
# Because list_all_web grows while it is being iterated, newly discovered pages also get scanned.
for i in list_all_web:
    soup = parse_url(i)  # Parse the page
    html_tags_a = soup.find_all('a')  # Find every <a> tag
    for element in html_tags_a:
        str_element = str(element.get('href'))  # The href attribute as a string
        if str_element.startswith('/ca/departament/estadistiques/'):  # Keep only statistics subsections
            subsecction_web_statistic = web_education + str_element  # Absolute URL
            # Go one level deeper again
            soup = parse_url(subsecction_web_statistic)
            html_tags_a = soup.find_all('a')
            for element in html_tags_a:
                str_element = str(element.get('href'))
                if str_element.startswith('/ca/departament/estadistiques/'):
                    if subsecction_web_statistic not in list_all_web:
                        list_all_web.append(subsecction_web_statistic)
            if subsecction_web_statistic not in list_all_web:  # Add it if it is not already in the list
                list_all_web.append(subsecction_web_statistic)
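For reference, a minimal alternative sketch (not part of the answer above) that keeps crawling until no new subsection URLs turn up, so the depth of the section tree does not have to be hard-coded. It reuses the same parse_url idea, the web_education base URL and the '/ca/departament/estadistiques/' prefix from the code above; the crawl_statistic_sections name, the statistic_prefix variable and the max_pages safety limit are hypothetical.

import requests
from bs4 import BeautifulSoup

web_education = "https://educacio.gencat.cat"
statistic_prefix = "/ca/departament/estadistiques/"

def parse_url(url):
    response = requests.get(url)
    return BeautifulSoup(response.content, "lxml")

def crawl_statistic_sections(start_url, max_pages=500):  # hypothetical helper name and safety limit
    # Breadth-first crawl: visit every statistics page once and collect every
    # statistics URL found, however deeply the subsections are nested.
    to_visit = [start_url]
    visited = set()
    found = []
    while to_visit and len(visited) < max_pages:
        url = to_visit.pop(0)
        if url in visited:
            continue
        visited.add(url)
        soup = parse_url(url)
        for tag in soup.find_all('a'):
            href = str(tag.get('href'))
            if href.startswith(statistic_prefix):
                full_url = web_education + href
                if full_url not in found:
                    found.append(full_url)
                    to_visit.append(full_url)  # queue it so its own subsections get scanned too
    return found

list_all_web = crawl_statistic_sections(web_education + statistic_prefix)

With this version the number of passes does not matter: the queue keeps growing until no new '/ca/departament/estadistiques/' links remain, or until the max_pages limit stops the crawl.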