Following links and crawling them


I was trying to make a crawler that follows links, using this code:

import scrapy
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import json


class DicionarioSpider(scrapy.Spider):
    name = 'dicionario'
    allowed_domains = ['www.mediktor.com']
    start_urls = ['http://www.mediktor.com/']

    def start_requests(self):
        url = "https://www.mediktor.com/pt-br/glossario"
        options = Options()
        options.headless = True
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        time.sleep(10)

        doencas = driver.find_elements(
            By.XPATH, "//a[@class='mdk-dictionary-list__glossary-item']")
        for doenca in doencas:
            url = doenca.get_attribute('href')
            yield scrapy.Request(url)
        driver.quit()

    def parse(self, response):
        urls = response.css(
            '.mdk-dictionary-list__glossary-item a::attr(href)')
        for url in urls:
            yield response.follow(url.get(), callback=self.parse_info)

    def parse_info(self, response):
        contents = response.css('div.page-glossary-detail__main-content')
        for desc in response.css('div.mdk-conclusion-detail__main-description'):
            desc = response.css('p ::text').getall()
        yield {
            'desc': desc
        }
        for content in contents:
            yield{
                'name': content.css(
                    'div.mdk-conclusion-detail__main-title ::text').get().strip(),
                'espec': content.css(
                    'div.mdk-ui-list-item__text mdc-list-item__text span::text').strip()
            }

I was able to get the links, but the part that enters each link and extracts the information I need was not working, so a friend helped me come up with this code:

import pandas as pd
import requests
from bs4 import BeautifulSoup


def get_auth_code():
    url = "https://www.mediktor.com/vendor.js"
    response = requests.get(url)
    start_index = response.text.index('APP_API_AUTH_CODE:"', 0) + len('APP_API_AUTH_CODE:"')
    end_index = response.text.index('"', start_index)
    return response.text[start_index:end_index]


def get_auth_token_and_device_id():
    url = "https://euapi01.mediktor.com/backoffice/services/login"
    payload = "{\"useCache\":0,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"," \
              "\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"pt_BR\"," \
              "\"timezoneRaw\":180,\"authTokenRefreshExpiresIn\":null}"
    headers = {
        'authorization': f'Basic {get_auth_code()}',
        'Content-Type': 'text/plain'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    return response.json()['authToken'], response.json()['deviceId']


def get_conclusion_list(auth_token, device_id):
    url = "https://euapi01.mediktor.com/backoffice/services/conclusionList"
    payload = "{\"useCache\":168,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"" \
              ",\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"pt_BR\"," \
              "\"timezoneRaw\":180,\"deviceId\":\""   device_id   "\"}"
    headers = {
        'accept': 'application/json, text/plain, */*',
        'authorization': f'Bearer {auth_token}',
        'content-type': 'application/json;charset=UTF-8'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    return [conclusionId['conclusionId'] for conclusionId in response.json()['conclusions']]


def get_details(conclusionId, auth_token, device_id):
    url = "https://euapi01.mediktor.com/backoffice/services/conclusionDetail"
    payload = "{\"useCache\":0,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"," \
              "\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"en_EN\"," \
              "\"timezoneRaw\":180,\"deviceId\":\""   device_id   "\"," \
              "\"conclusionId\":\""   conclusionId   "\"," \
              "\"conclusionTemplate\":\"conclusion_description_body\",\"includeActions\":true}"
    headers = {
        'authorization': f'Bearer {auth_token}',
        'content-type': 'application/json;charset=UTF-8'
    }
    response = requests.request("POST", url, headers=headers, data=payload)
    return response.text


auth_token, device_id = get_auth_token_and_device_id()
conclusion_list = get_conclusion_list(auth_token, device_id)
for conclusion in conclusion_list:
    print(get_details(conclusion, auth_token, device_id))

It gets the JSON with the page items, but around iteration 230 it starts returning the following error and keeps returning it for every remaining item:

{"error":{"code":"ME667","description":"Expired user identification token.","retry":true}}

What I'd like to do is write all of this to a file so I can check whether it's getting every item I need from the page, and then end up with a JSON containing only the information I need, rather than everything the site returns as it does now.
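
To make that concrete, this is roughly what I have in mind, reusing the functions above (just a sketch: the 'name' and 'description' keys I pull out of the detail JSON are guesses at the response structure, and refreshing the token when the ME667 error shows up is my assumption about how to recover):

import json

auth_token, device_id = get_auth_token_and_device_id()
results = []
for conclusion in get_conclusion_list(auth_token, device_id):
    detail = json.loads(get_details(conclusion, auth_token, device_id))
    if detail.get('error', {}).get('code') == 'ME667':
        # token expired mid-run: fetch a fresh one and retry this conclusion
        auth_token, device_id = get_auth_token_and_device_id()
        detail = json.loads(get_details(conclusion, auth_token, device_id))
    results.append({
        'name': detail.get('name'),          # guessed key
        'desc': detail.get('description'),   # guessed key
    })

with open('conclusions.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)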

CodePudding user response:

After many sleepless nights I solved my problem; I'll leave it here in case it helps someone.

import scrapy
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import json


class DicionarioSpider(scrapy.Spider):
    name = 'dicionario'
    allowed_domains = ['www.mediktor.com']
    start_urls = ['http://www.mediktor.com/']

    def parse(self, response):
        url = "https://www.mediktor.com/pt-br/glossario"
        option = Options()
        option.headless = True
        driver = webdriver.Chrome(options=option)
        driver.get(url)
        time.sleep(10)

        el_links = driver.find_elements(
            By.XPATH, "//a[@class='mdk-dictionary-list__glossary-item']")
        urls = []
        nome_doenca = []

        for i in range(len(el_links)):
            urls.append(el_links[i].get_attribute('href'))

        for link in urls:
            driver.get(link)

            myElem = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH,
                                                "//div[@class='mdk-conclusion-detail__main-title']"
                                                )))
            nome_source = driver.find_element(By.XPATH,
                                              "//div[@class='mdk-conclusion-detail__main-title']"
                                              ).text

            nome_doenca.append(nome_source)

            driver.back()
        print(nome_doenca)
        driver.quit()

I just modified my code and stopped using the Scrapy selectors, relying only on the Selenium selectors.
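
If you also want to save the list to a JSON file instead of just printing it, replacing the print(nome_doenca) line with something like this should work (just a sketch; the file name is arbitrary, and json is already imported at the top):

        # write the collected names to a file instead of printing them
        with open('doencas.json', 'w', encoding='utf-8') as f:
            json.dump(nome_doenca, f, ensure_ascii=False, indent=2)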
