Scraping a list of subsection URLs from a website


I am trying to scrape the “Estadístiques” section with all of the subsections inside it and build a list with the URL of every subsection.

I thought this code ran correctly, but I see that, for example, scraping “Estadística de l’ensenyament 2021-2022” does not pick up all of the subsections inside it.

import requests
from bs4 import BeautifulSoup

education_statistic_section = "https://educacio.gencat.cat/ca/departament/estadistiques/" # Main page of the education statistics section -- level 0

html_eduaction_section_levels = "distribuidora-item grey" # CSS class of each education section level block

web_education = "https://educacio.gencat.cat"

list_title_level_1 = []
list_web_level_1 = []
list_all_web=[]


list_title_subsecction = []
list_web_subsecction = []
list_title_document = []
list_web_document = []


def parse_url(url):
    response = requests.get(url)
    content = response.content
    parsed_response = BeautifulSoup(content, "lxml")
    return parsed_response
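parse_url will raise on connection errors and can hang indefinitely without a timeout. A slightly more defensive variant is sketched below; the timeout value and the fallback to None are my own assumptions, not part of the original code:

import requests
from bs4 import BeautifulSoup

def parse_url_safe(url): # Hypothetical variant of parse_url with basic error handling
    try:
        response = requests.get(url, timeout=10) # Assumed 10-second timeout
        response.raise_for_status() # Raise if the server answered with an error status
    except requests.RequestException as error:
        print(f"Could not fetch {url}: {error}")
        return None # Caller has to handle the missing page
    return BeautifulSoup(response.content, "lxml") # Same parser as parse_url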



def first_secction_statistic(): # Scrape the title and URL of every div with class "distribuidora-item grey" on education_statistic_section
    soup = parse_url(education_statistic_section) # Parse the main page
    html_div_level_1 = soup.find_all('div', {'class': html_eduaction_section_levels}) # All divs with class "distribuidora-item grey"
    for html_elements_level_1 in html_div_level_1: # For each element found
        list_title_level_1.append(html_elements_level_1.text.strip()) # Extract the text and add it to the title list
        html_tags_as_level_1 = html_elements_level_1.find('a') # Get the "a" tag inside the div
        list_web_level_1.append(web_education + html_tags_as_level_1.get('href')) # Build the absolute URL and add it to the list



first_secction_statistic()
# csv = pd.DataFrame({'Títols nivell 1': pd.Series(list_title_level_1), 'Web nivell 1': pd.Series(list_web_level_1)})
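Instead of concatenating web_education with each href by hand, urllib.parse.urljoin from the standard library builds absolute URLs and copes with hrefs that are already absolute. A minimal sketch (the helper name absolute_links is mine, not part of the original code):

from urllib.parse import urljoin

def absolute_links(soup, base_url): # Hypothetical helper: absolute URLs of every 'a' tag in a parsed page
    links = []
    for a_tag in soup.find_all('a', href=True): # Only 'a' tags that actually carry an href
        links.append(urljoin(base_url, a_tag['href'])) # urljoin handles relative and absolute hrefs
    return links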

def all_web_subsecction_statistic(): # Function to list every subsection URL of the statistics section
    for i in list_web_level_1: # For each level-1 URL
        soup = None # Reset the variable
        soup = parse_url(i) # Parse the level-1 page
        html_tags_a = soup.find_all('a') # Find all 'a' tags --> these hold the URLs of documents and links of interest
        for element in html_tags_a: # For each 'a' tag
            str_element = str(element.get('href')) # The 'href' attribute as a string
            if str_element.startswith('/ca/departament/estadistiques/'): # If it points to a statistics subsection
                subsecction_web_statistic = web_education + str_element # Subsection URL: web_education plus str_element
                if subsecction_web_statistic not in list_all_web: # Add it if it is not already in the list
                    list_all_web.append(subsecction_web_statistic)


all_web_subsecction_statistic()
list_all_web

CodePudding user response:

Finally, I iterated two more levels and got it working:

import requests
from bs4 import BeautifulSoup

education_statistic_section = "https://educacio.gencat.cat/ca/departament/estadistiques/" # Main page of the education statistics section -- level 0

html_eduaction_section_levels = "distribuidora-item grey" # CSS class of each education section level block

web_education = "https://educacio.gencat.cat"

list_title_level_1 = []
list_web_level_1 = []
list_all_web=[]


list_title_subsecction = []
list_web_subsecction = []
list_title_document = []
list_web_document = []


def parse_url(url):
    response = requests.get(url)
    content = response.content
    parsed_response = BeautifulSoup(content, "lxml")
    return parsed_response



def first_secction_statistic(): # Scrape the title and URL of every div with class "distribuidora-item grey" on education_statistic_section
    soup = parse_url(education_statistic_section) # Parse the main page
    html_div_level_1 = soup.find_all('div', {'class': html_eduaction_section_levels}) # All divs with class "distribuidora-item grey"
    for html_elements_level_1 in html_div_level_1: # For each element found
        list_title_level_1.append(html_elements_level_1.text.strip()) # Extract the text and add it to the title list
        html_tags_as_level_1 = html_elements_level_1.find('a') # Get the "a" tag inside the div
        list_web_level_1.append(web_education + html_tags_as_level_1.get('href')) # Build the absolute URL and add it to the list



first_secction_statistic()
# csv = pd.DataFrame({'Títols nivell 1': pd.Series(list_title_level_1), 'Web nivell 1': pd.Series(list_web_level_1)})
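The commented-out line above hints at exporting the level-1 titles and URLs with pandas. A minimal sketch of that export, assuming pandas is installed (the output file name is my own choice):

import pandas as pd

df_level_1 = pd.DataFrame({'Títols nivell 1': pd.Series(list_title_level_1),
                           'Web nivell 1': pd.Series(list_web_level_1)})
df_level_1.to_csv('nivell_1.csv', index=False) # Hypothetical output file name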

def all_web_subsecction_statistic(): # Function to list every subsection URL of the statistics section
    for i in list_web_level_1: # For each level-1 URL
        soup = None # Reset the variable
        soup = parse_url(i) # Parse the level-1 page
        html_tags_a = soup.find_all('a') # Find all 'a' tags --> these hold the URLs of documents and links of interest
        for element in html_tags_a: # For each 'a' tag
            str_element = str(element.get('href')) # The 'href' attribute as a string
            if str_element.startswith('/ca/departament/estadistiques/'): # If it points to a statistics subsection
                subsecction_web_statistic = web_education + str_element # Subsection URL: web_education plus str_element
                soup = None
                soup = parse_url(subsecction_web_statistic)
                html_tags_a = None
                html_tags_a = soup.find_all('a')
                for element in html_tags_a:
                    str_element = None
                    str_element = str(element.get('href'))
                    if str_element.startswith('/ca/departament/estadistiques/'):
                        if subsecction_web_statistic not in list_all_web:
                            list_all_web.append(subsecction_web_statistic)

                if subsecction_web_statistic not in list_all_web: # Add it if it is not already in the list
                    list_all_web.append(subsecction_web_statistic)


all_web_subsecction_statistic()

for i in list_all_web:
    soup = None # Reset the variable
    soup = parse_url(i) # Parse the subsection page
    html_tags_a = soup.find_all('a') # Find all 'a' tags --> these hold the URLs of documents and links of interest
    for element in html_tags_a: # For each 'a' tag
        str_element = str(element.get('href')) # The 'href' attribute as a string
        if str_element.startswith('/ca/departament/estadistiques/'): # If it points to a statistics subsection
            subsecction_web_statistic = web_education + str_element # Subsection URL: web_education plus str_element
            soup = None
            soup = parse_url(subsecction_web_statistic)
            html_tags_a = None
            html_tags_a = soup.find_all('a')
            for element in html_tags_a:
                str_element = None
                str_element = str(element.get('href'))
                if str_element.startswith('/ca/departament/estadistiques/'):
                    if subsecction_web_statistic not in list_all_web:
                        list_all_web.append(subsecction_web_statistic)

            if subsecction_web_statistic not in list_all_web: # Add it if it is not already in the list
                list_all_web.append(subsecction_web_statistic)
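Repeating the loop one level at a time works, but the same idea generalises to any depth: keep a queue of pages still to visit and a set of URLs already seen, and follow every link whose path starts with '/ca/departament/estadistiques/'. A sketch under those assumptions (the function and variable names are mine, not from the original code, and it reuses the parse_url helper defined above):

from collections import deque
from urllib.parse import urljoin

def crawl_statistic_sections(start_url, base_url, prefix='/ca/departament/estadistiques/'):
    # Breadth-first crawl: visit every page whose path starts with the statistics prefix
    seen = set()                # Absolute URLs already discovered
    queue = deque([start_url])  # Pages still to parse
    while queue:
        page = queue.popleft()
        soup = parse_url(page)  # Reuses the parse_url helper defined above
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            if href.startswith(prefix):  # Only follow statistics subsections
                absolute = urljoin(base_url, href)
                if absolute not in seen:
                    seen.add(absolute)
                    queue.append(absolute)
    return sorted(seen)

# Example usage with the URLs from the post:
# list_all_web = crawl_statistic_sections(education_statistic_section, web_education)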