Failed to parse content from a webpage using requests


I'm trying to create a script using the requests module (without using a Session) to parse two fields from a webpage, but the script fails miserably. However, when I created another script using a Session, I could fetch the content from that site flawlessly.

Here are the manual steps to reach the content:

  1. Choose the first item from dropdown.
  2. Get the links to the detail page.
  3. Grab these two fields from detail page.

While creating the script with plain requests, I tried to make use of cookies, but I ended up getting an AttributeError.

Script without session:

import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base = 'https://compranet.hacienda.gob.mx'
link = 'https://compranet.hacienda.gob.mx/web/login.html'
vigen_detail_page = 'https://compranet.hacienda.gob.mx/esop/toolkit/opportunity/current/{}/detail.si'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

def grab_first_link_from_dropdown(link):
    r = requests.get(link,headers=headers)
    soup = BeautifulSoup(r.text,"html.parser")
    category_link = urljoin(base,soup.select_one('ul.dropdown-menu > li > a:contains("Vigentes")').get("href"))
    return category_link

def fetch_detail_page_link(cat_link):
    res = requests.get(cat_link,headers=headers)
    str_cookie = f"JSESSIONID={res.cookies['JSESSIONID']}"
    soup = BeautifulSoup(res.text,"html.parser")
    for items in soup.select("table.list-table > tbody.list-tbody > tr"):
        target_link = items.select_one("a.detailLink").get("onclick")
        detail_num = re.findall(r"goToDetail\(\'(\d+?)\'",target_link)[0]
        inner_link = vigen_detail_page.format(detail_num)
        yield str_cookie,inner_link

def get_content(str_cookie,inner_link):
    headers['Cookie'] = str_cookie
    res = requests.get(inner_link,headers=headers)
    soup = BeautifulSoup(res.text,"html.parser")
    try:
        expediente = soup.select_one(".form_question:contains('Código del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError: expediente = ""
    try:
        descripcion = soup.select_one(".form_question:contains('Descripción del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError: descripcion = ""
    return expediente,descripcion

if __name__ == '__main__':
    category_link = grab_first_link_from_dropdown(link)
    for cookie,detail_page_link in fetch_detail_page_link(category_link):
        print(get_content(cookie,detail_page_link))

What change should I make to get the script working?

CodePudding user response:

There's a redirect that occurs in fetch_detail_page_link. Python Requests follows redirects by default, but res.cookies only contains the cookies set by the final response in the chain. To see the cookies set along the way, you have to walk the history attribute of the response, which lists the intermediate redirect responses. Doing this with a Session object worked because the Session was preserving those cookies for you.
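As a minimal sketch of what that looks like (assuming the login URL above still redirects), you can print the cookies each hop sets and compare them with what the final response carries:

import requests

# Illustrative only: any URL that answers with a redirect will do.
res = requests.get('https://compranet.hacienda.gob.mx/web/login.html')

# res.history lists the intermediate responses, oldest first;
# each one may have set cookies that res.cookies does not contain.
for hop in res.history:
    print(hop.status_code, hop.url, hop.cookies.get_dict())

# Cookies set by the final response only -- this is all that
# res.cookies['JSESSIONID'] in the original script could see.
print(res.status_code, res.url, res.cookies.get_dict())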

I must agree with others who have commented that it really would be a good idea to use a Session object for this. However, if you insist on not using a Session, your script would look like this (a Session-based sketch follows at the end for comparison):

import re
import requests
from requests.cookies import RequestsCookieJar
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base = 'https://compranet.hacienda.gob.mx'
link = 'https://compranet.hacienda.gob.mx/web/login.html'
vigen_detail_page = 'https://compranet.hacienda.gob.mx/esop/toolkit/opportunity/current/{}/detail.si'

headers = {
    'User-Agent': "Scraping Your Vigentes 1.0",
}


def grab_first_link_from_dropdown(link):
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.text, "html.parser")
    category_link = urljoin(base, soup.select_one('ul.dropdown-menu > li > a:contains("Vigentes")').get("href"))
    return category_link


def fetch_detail_page_link(cat_link):
    res = requests.get(cat_link, headers=headers)
    cookies = RequestsCookieJar()  # create empty cookie jar
    for r in res.history:
        cookies.update(r.cookies)  # merge in cookies from each redirect response
    cookies.update(res.cookies)  # merge in cookies from the final response

    soup = BeautifulSoup(res.text, "html.parser")
    for items in soup.select("table.list-table > tbody.list-tbody > tr"):
        target_link = items.select_one("a.detailLink").get("onclick")
        detail_num = re.findall(r"goToDetail\(\'(\d+?)\'", target_link)[0]
        inner_link = vigen_detail_page.format(detail_num)
        yield cookies, inner_link


def get_content(cookies, inner_link):
    res = requests.get(inner_link, headers=headers, cookies=cookies)
    if not res.ok:
        print("Got bad response %s :(" % res.status_code)
        return "", ""
    soup = BeautifulSoup(res.text, "html.parser")
    try:
        expediente = soup.select_one(".form_question:contains('Código del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        expediente = ""
    try:
        descripcion = soup.select_one(".form_question:contains('Descripción del Expediente') + .form_answer").get_text(strip=True)
    except AttributeError:
        descripcion = ""
    return expediente, descripcion


if __name__ == '__main__':
    category_link = grab_first_link_from_dropdown(link)
    for cookie, detail_page_link in fetch_detail_page_link(category_link):
        print(get_content(cookie, detail_page_link))
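
And for comparison, here is a rough sketch of the Session-based variant; it reuses the same selectors, and the Session carries cookies across redirects automatically, which is why no cookie juggling is needed:

import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

base = 'https://compranet.hacienda.gob.mx'
link = 'https://compranet.hacienda.gob.mx/web/login.html'
vigen_detail_page = 'https://compranet.hacienda.gob.mx/esop/toolkit/opportunity/current/{}/detail.si'

with requests.Session() as s:
    s.headers['User-Agent'] = "Scraping Your Vigentes 1.0"

    # The Session keeps every cookie it sees, including the ones
    # set by intermediate redirect responses.
    r = s.get(link)
    soup = BeautifulSoup(r.text, "html.parser")
    category_link = urljoin(base, soup.select_one('ul.dropdown-menu > li > a:contains("Vigentes")').get("href"))

    res = s.get(category_link)
    soup = BeautifulSoup(res.text, "html.parser")
    for items in soup.select("table.list-table > tbody.list-tbody > tr"):
        onclick = items.select_one("a.detailLink").get("onclick")
        detail_num = re.findall(r"goToDetail\(\'(\d+?)\'", onclick)[0]
        detail_res = s.get(vigen_detail_page.format(detail_num))
        # ...parse detail_res.text with the same selectors as in get_content...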
