I was trying to make a crawler that follows links, using this code:
import scrapy
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import json
class DicionarioSpider(scrapy.Spider):
    """Crawl the Mediktor glossary and scrape each entry's name and description.

    A headless Chrome instance renders the JavaScript-built glossary index so
    the entry links can be collected; each entry page is then handed back to
    Scrapy for the actual extraction.
    """

    name = 'dicionario'
    allowed_domains = ['www.mediktor.com']
    start_urls = ['http://www.mediktor.com/']

    def start_requests(self):
        """Render the glossary index with Selenium and yield one Request per entry."""
        url = "https://www.mediktor.com/pt-br/glossario"
        options = Options()
        options.headless = True
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(url)
            time.sleep(10)  # crude wait for the JS-rendered list; an explicit wait would be sturdier
            doencas = driver.find_elements(
                By.XPATH, "//a[@class='mdk-dictionary-list__glossary-item']")
            for doenca in doencas:
                link = doenca.get_attribute('href')
                # Route straight to parse_info: the linked pages are the entry
                # detail pages, not another copy of the glossary index.
                yield scrapy.Request(link, callback=self.parse_info)
        finally:
            # Always release the browser, even if link extraction fails mid-way.
            driver.quit()

    def parse(self, response):
        """Fallback: extract glossary links from an index page and follow them."""
        # The glossary items ARE the <a> elements (see the XPath above), so
        # select the anchors directly rather than anchors inside them.
        for url in response.css('a.mdk-dictionary-list__glossary-item::attr(href)'):
            yield response.follow(url.get(), callback=self.parse_info)

    def parse_info(self, response):
        """Yield the description paragraphs, then the name/speciality of the entry."""
        for desc in response.css('div.mdk-conclusion-detail__main-description'):
            # Scope the text query to the current description block; querying
            # `response` here would yield every <p> on the page each iteration.
            yield {'desc': desc.css('p ::text').getall()}
        for content in response.css('div.page-glossary-detail__main-content'):
            name = content.css('div.mdk-conclusion-detail__main-title ::text').get()
            # Both classes sit on the same element, so join them with '.';
            # .get() is required before .strip() (css() returns a SelectorList).
            espec = content.css(
                'div.mdk-ui-list-item__text.mdc-list-item__text span::text').get()
            yield {
                'name': name.strip() if name else None,
                'espec': espec.strip() if espec else None,
            }
I was able to get the links, but the part where the crawler enters each link and extracts the information I need was not working, so a friend helped me come up with this code:
import pandas as pd
import requests
from bs4 import BeautifulSoup
def get_auth_code():
    """Scrape the static API auth code out of Mediktor's vendor.js bundle.

    Returns:
        str: the value of ``APP_API_AUTH_CODE``, used as the Basic-auth
        credential for the login endpoint.

    Raises:
        ValueError: if the marker is not found in the downloaded script.
    """
    url = "https://www.mediktor.com/vendor.js"
    response = requests.get(url)
    marker = 'APP_API_AUTH_CODE:"'
    # Skip past the marker itself; the original line was missing the '+'
    # between .index(...) and len(...), which is a SyntaxError.
    start_index = response.text.index(marker) + len(marker)
    end_index = response.text.index('"', start_index)
    return response.text[start_index:end_index]
def get_auth_token_and_device_id():
    """Log in anonymously against the Mediktor backoffice API.

    Returns:
        tuple: ``(auth_token, device_id)`` to use as Bearer credentials in
        subsequent API calls.
    """
    url = "https://euapi01.mediktor.com/backoffice/services/login"
    payload = ('{"useCache":0,"apiVersion":"4.1.1","appVersion":"8.7.0",'
               '"appId":null,"deviceType":"WEB","deviceToken":null,"language":"pt_BR",'
               '"timezoneRaw":180,"authTokenRefreshExpiresIn":null}')
    headers = {
        'authorization': f'Basic {get_auth_code()}',
        'Content-Type': 'text/plain'
    }
    response = requests.post(url, headers=headers, data=payload)
    # Parse the body once instead of calling .json() twice.
    data = response.json()
    return data['authToken'], data['deviceId']
def get_conclusion_list(auth_token, device_id):
    """Fetch every conclusion (glossary entry) id known to the API.

    Args:
        auth_token: Bearer token from :func:`get_auth_token_and_device_id`.
        device_id: device id paired with that token.

    Returns:
        list: every ``conclusionId`` in the API's conclusion list.
    """
    url = "https://euapi01.mediktor.com/backoffice/services/conclusionList"
    # f-string interpolation replaces the original's missing '+' operators
    # between the payload fragments (a SyntaxError as written).
    payload = (
        '{"useCache":168,"apiVersion":"4.1.1","appVersion":"8.7.0",'
        '"appId":null,"deviceType":"WEB","deviceToken":null,"language":"pt_BR",'
        f'"timezoneRaw":180,"deviceId":"{device_id}"}}'
    )
    headers = {
        'accept': 'application/json, text/plain, */*',
        'authorization': f'Bearer {auth_token}',
        'content-type': 'application/json;charset=UTF-8'
    }
    response = requests.post(url, headers=headers, data=payload)
    return [item['conclusionId'] for item in response.json()['conclusions']]
def get_details(conclusionId, auth_token, device_id):
    """Fetch the detail payload for a single conclusion.

    Args:
        conclusionId: id from :func:`get_conclusion_list`.
        auth_token: Bearer token from :func:`get_auth_token_and_device_id`.
        device_id: device id paired with that token.

    Returns:
        str: the raw JSON response body (an error document such as ME667 if
        the token has expired).
    """
    url = "https://euapi01.mediktor.com/backoffice/services/conclusionDetail"
    # f-string interpolation replaces the original's missing '+' operators
    # around device_id and conclusionId (a SyntaxError as written).
    payload = (
        '{"useCache":0,"apiVersion":"4.1.1","appVersion":"8.7.0",'
        '"appId":null,"deviceType":"WEB","deviceToken":null,"language":"en_EN",'
        f'"timezoneRaw":180,"deviceId":"{device_id}",'
        f'"conclusionId":"{conclusionId}",'
        '"conclusionTemplate":"conclusion_description_body","includeActions":true}'
    )
    headers = {
        'authorization': f'Bearer {auth_token}',
        'content-type': 'application/json;charset=UTF-8'
    }
    response = requests.post(url, headers=headers, data=payload)
    return response.text
# Log in once, then walk every conclusion. The token expires partway through
# a long run (the API answers with error code ME667); when that happens,
# obtain fresh credentials and retry the same conclusion instead of printing
# the error document forever.
auth_token, device_id = get_auth_token_and_device_id()
conclusion_list = get_conclusion_list(auth_token, device_id)
for conclusion in conclusion_list:
    details = get_details(conclusion, auth_token, device_id)
    if '"ME667"' in details:
        # Expired user identification token: re-login and retry once.
        auth_token, device_id = get_auth_token_and_device_id()
        details = get_details(conclusion, auth_token, device_id)
    print(details)
It gets the JSON with the page items, but around iteration 230 it starts returning the following error and never leaves the loop:
{"error":{"code":"ME667","description":"Expired user identification token.","retry":true}}
What I'd like to do is write all of this to a file so I can check whether it is getting every item on the page I need, and then produce a JSON containing only the information I need — not everything from the site, as it returns now.
CodePudding user response:
After many sleepless nights I solved my problem; I will leave it here in case it helps someone.
import json
import time

import pandas as pd
import requests
import scrapy
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class DicionarioSpider(scrapy.Spider):
    """Scrape disease names from the Mediktor glossary using Selenium only.

    Scrapy's crawling machinery is bypassed: parse() drives a headless Chrome
    instance itself, first collecting the glossary links and then visiting
    each one to read the entry title.
    """

    name = 'dicionario'
    allowed_domains = ['www.mediktor.com']
    start_urls = ['http://www.mediktor.com/']

    def parse(self, response):
        """Collect every glossary link, visit each, and print the entry titles."""
        url = "https://www.mediktor.com/pt-br/glossario"
        option = Options()
        option.headless = True
        driver = webdriver.Chrome(options=option)
        try:
            driver.get(url)
            time.sleep(10)  # allow the JS-rendered glossary list to appear
            el_links = driver.find_elements(
                By.XPATH, "//a[@class='mdk-dictionary-list__glossary-item']")
            # Capture all hrefs up front: the elements go stale once we navigate.
            urls = [el.get_attribute('href') for el in el_links]
            nome_doenca = []
            title_xpath = "//div[@class='mdk-conclusion-detail__main-title']"
            for link in urls:
                driver.get(link)
                # Wait (up to 5 s) for the title element instead of sleeping;
                # the waited-for element itself is what we read next.
                WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.XPATH, title_xpath)))
                nome_doenca.append(driver.find_element(By.XPATH, title_xpath).text)
                # No driver.back() needed: the next iteration navigates with
                # driver.get(), so going back first was redundant work.
            print(nome_doenca)
        finally:
            # Quit even if a wait times out, so Chrome processes don't leak.
            driver.quit()
I just modified my code and stopped using Scrapy's machinery — only the Selenium selectors.