Home > Net >  How to save all links from all pages to csv using python beautiful soup
How to save all links from all pages to csv using python beautiful soup


I am trying to save all the links collected from multiple paginated pages to a CSV file. From `print(links)` I can see all the links I want to save from the multiple pages, but when I open the CSV file I find only one URL saved. How can I save all the URLs I see in the terminal (from `print(links)`) to the CSV file?

Below is my code:

def scrape_pages(url) -> None:
    """Scrape paginated job listings under ``url`` and save them to ``output.csv``.

    Walks the listing pages ``{url}page/1`` .. ``{url}page/{max_pages}``,
    follows every job link found in an ``li.mag-b`` item, and writes one CSV
    row per job, including the job URL.

    The CSV file is opened exactly once, before the page loop. Opening it in
    ``'w'`` mode for every link truncates the file each time, which leaves
    only the last job (a single URL) in the output.

    Args:
        url: Site root ending in ``/``; the ``page/<n>`` suffix is appended.
    """
    max_pages = 5  # number of listing pages to scrape
    root = 'https://www.myjobmag.com'

    def text_of(soup, tag, css_class):
        """Return the stripped text of the first match, or '' when absent."""
        node = soup.find(tag, class_=css_class)
        return node.text.strip() if node is not None else ''

    with open('output.csv', 'w', encoding='utf8', newline='') as f_output:
        csv_output = csv.writer(f_output)
        # One header name per value written in each row below.
        csv_output.writerow(
            ['Title', 'Company', 'Info', 'Desc', 'Application', 'URL']
        )

        # range() advances the page number itself, so the loop always ends.
        for current_page in range(1, max_pages + 1):
            # Get each page's html
            raw_html1 = requests.get(f'{url}page/{current_page}')
            soup1 = BeautifulSoup(raw_html1.text, 'html.parser')

            # Collect every job link on this page; the list is created once
            # per page, not once per link, so no link is lost.
            links = []
            for item in soup1.find_all('li', {'class': 'mag-b'}):
                anchor = item.find('a', href=True)
                if anchor is not None:  # skip list items without a link
                    links.append(root + anchor['href'])
            print(links)

            # Fetch each job page using the loop variable itself, so every
            # collected link is visited rather than the last one repeatedly.
            for link in links:
                raw_html = requests.get(link)
                soup = BeautifulSoup(raw_html.text, 'html.parser')
                csv_output.writerow([
                    text_of(soup, 'h2', 'mag-b'),
                    text_of(soup, 'li', 'job-industry'),
                    text_of(soup, 'ul', 'job-key-info'),
                    text_of(soup, 'div', 'job-details'),
                    text_of(soup, 'div', 'mag-b bm-b-30'),
                    link,
                ])

            # Sleep before scraping the next page to not send too many
            # requests at once.
            time.sleep(5)
            print('\n\n')  # Clearing console up

def main() -> int:
    """Run the scraper against the site root and return a process exit code.

    Returns:
        0 once ``scrape_pages`` has finished.
    """
    URL = 'https://www.myjobmag.com/'
    # Without this call the URL is assigned but nothing is ever scraped.
    scrape_pages(URL)
    return 0


# The dunder names are required: a bare ``name`` is undefined (NameError).
if __name__ == '__main__':
    raise SystemExit(main())

CodePudding user response:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time as t

# Browser-like User-Agent string sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}

# A single session is reused for all requests; attaching the headers here
# makes them apply to every s.get() call below.
s = requests.Session()
s.headers.update(headers)

# One (title, key_info, description, application_method, url) tuple per job.
links_list = []
for x in range(1, 3):  # listing pages 1 and 2
    r = s.get(f'https://www.myjobmag.com/page/{x}')
    soup = BeautifulSoup(r.text, 'html.parser')
    job_list = soup.select_one('ul.job-list')
    if job_list is None:
        # No job list in the response for this page: nothing to parse.
        print(f'no job list found on page {x}')
        continue
    for link in job_list.select('li.job-list-li'):
        try:
            title = link.select_one('h2').text.strip()
            url = link.select_one('h2').select_one('a').get('href')
            # Separate names for the detail page, so the listing page's
            # response and soup are not overwritten inside the loop.
            detail_r = s.get(f'https://www.myjobmag.com{url}')
            detail_soup = BeautifulSoup(detail_r.text, 'html.parser')
            key_info = detail_soup.select_one('ul.job-key-info').text.strip()
            description = detail_soup.select_one('div.job-details').text.strip()
            application_method = detail_soup.select_one('div.mag-b.bm-b-30').text.strip()
            links_list.append((title, key_info, description, application_method, url))
            print(f'done {title} -- {url}')
        except Exception as e:
            # Best effort: an entry missing any expected element raises here;
            # report it and continue with the next entry.
            print(e)

df = pd.DataFrame(links_list, columns=['title', 'key_info', 'description', 'application_method', 'url'])
# Write the collected rows to disk; index=False omits the row-number column.
df.to_csv('output.csv', index=False)

This will produce a CSV file containing the job title, key info, description, application method, and URL of each job.

  • Related