Failed attempt at web scraping with Selenium


After a failed attempt using just BeautifulSoup, I decided to try Selenium. The script attempts to get the subtitles for a specific TV show or movie. If you look at the code, you'll see there are quite a few wait.until calls with what seems like enough of a pause for Selenium to do its work. I'm still getting this error message:

selenium.common.exceptions.TimeoutException: Message: 
Stacktrace:
0   chromedriver                        0x0000000104652fa8 chromedriver   4886440
1   chromedriver                        0x00000001045d0643 chromedriver   4351555
....
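
For context, WebDriverWait.until raises TimeoutException when the expected condition never becomes true within the timeout, i.e. the locator never matches anything on the page; it is not about the page loading slowly. A quick diagnostic sketch, using the names from the script below (the try/except wrapper is my addition, not part of the original):

from selenium.common.exceptions import TimeoutException

# Drop-in replacement for the first wait: if the locator never matches,
# dump what the browser actually loaded before re-raising
try:
    wait.until(EC.presence_of_element_located((By.NAME, 'q')))
except TimeoutException:
    print(browser.page_source[:2000])  # inspect the real field names/ids here
    raise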

Here's the code I'm using:

# Import the necessary modules
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re


options = webdriver.ChromeOptions()
options.binary_location = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
# Newer Selenium versions take the driver path via a Service object
service = Service("/usr/local/bin/chromedriver")
browser = webdriver.Chrome(service=service, options=options)


# Go to the website
browser.get('https://english-subtitles.org/')

# Wait for the page to fully load before trying to interact with it
wait = WebDriverWait(browser, 35)
wait.until(EC.presence_of_element_located((By.NAME, 'q')))


# Enter the name of the show in the search bar
search_input = browser.find_element(By.NAME, "q")
search_input.send_keys("Babylon Berlin")


# Click the search button
wait = WebDriverWait(browser, 30)
search_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[@type="submit"]')))
search_button.click()


# Parse the HTML content of the search results page
soup = BeautifulSoup(browser.page_source, 'html.parser')

# Find the download link of all episodes
episodes = soup.find_all('a', href=re.compile(r"download/\d+"))

for item in episodes:
    # Get the URL of the item; resolve it against the site root in case the href is relative
    link = urljoin("https://english-subtitles.org/", item["href"])
    # normalize episode name
    name = item.text.lower().replace(" ", "_")

    print(f"Downloading {name} from {link} url ...")

    # Download the subtitle file
    subtitle_file = requests.get(link, allow_redirects=True)
        
    # Save the file
    with open(f"{name}.srt", 'wb') as file:
        file.write(subtitle_file.content)

# Close the webdriver instance
browser.quit()

What am I doing wrong? Is it impossible to scrape this kind of site? I need the subtitles so I can analyze them, and downloading them manually is very tedious. Can anybody tell me the best way to do this? Thanks in advance.

JM

EDIT: Here's a modified version of the script @Jurakin provided, attempting a batch download of all the subtitles I'm interested in. It doesn't fully work: it gets interrupted after the second download completes, no matter how many files you tell it to download.


from selenium import webdriver
from selenium.webdriver.common.by import By
import requests
from bs4 import BeautifulSoup
import time
import os.path
import re

session = requests.Session()

query = input("Enter your search: ")

page = session.post("https://english-subtitles.org/index.php?do=search", data={"do": "search", "subaction": "search", "story": query})
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")

results = soup.find_all("div", class_="tbl")
if results:
    for index, result in enumerate(results):
        title = result.find("h2").text
        subtitle = result.find("h3").text
        info = result.find("h4").text
        mark = result.find("div", class_="mark").text
        
        print(f"{index}: {title}")
        print(f" - {subtitle}")
        print(f" - {info}")
        print(f" - {mark}")
else:
    raise RuntimeError("no result")

# Parse the user input string to a list of numbers
input_num_list = [int(x) for x in re.findall(r'\d+', input("Select result: "))]

options = webdriver.ChromeOptions()
prefs = {"download.default_directory": os.path.abspath("."), "download.prompt_for_download": False}
options.add_experimental_option("prefs",prefs)

driver = webdriver.Chrome(options=options)

# Iterate over the numbers in the list and download the corresponding subtitles
for query in input_num_list:
    if query < 0 or query >= len(results):
        raise ValueError("invalid select")

    link = results[query].find("a", href=True)
    assert link, "link not found"

    # Go to the download page for the selected result
    driver.get(link["href"])

    element = driver.find_element(By.XPATH, """//*[@id="content"]/section/div[2]/div[2]/form/input[@type="submit"]""")
    assert element, "no download button found"
    element.click()

    element = driver.find_element(By.XPATH, """//*[@id="content"]/section/div[2]/div/form/input[@type="submit"]""")
    assert element, "no download button2 found"
    element.click()

    # wait until file downloads
    time.sleep(10)

driver.close()
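
One hedged guess at why the batch run dies after the second file: the fixed time.sleep(10) assumes every download finishes within ten seconds, and driver.close() only closes the window rather than ending the session (driver.quit() is the cleaner shutdown). A minimal sketch that polls the download directory instead of sleeping; Chrome marks in-progress downloads with a .crdownload suffix, and the helper name and timeout here are mine:

import glob
import os.path
import time

def wait_for_downloads(download_dir, timeout=60):
    """Block until Chrome has no in-progress (.crdownload) files left."""
    time.sleep(1)  # give Chrome a moment to create the temp file
    deadline = time.time() + timeout
    while time.time() < deadline:
        if not glob.glob(os.path.join(download_dir, "*.crdownload")):
            return True
        time.sleep(0.5)
    return False

# Inside the loop, instead of the fixed sleep:
# wait_for_downloads(os.path.abspath("."))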

CodePudding user response:

Try this code:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time

options = Options()

prefs = {"download.default_directory": "..\\Downloads\\",
         "download.prompt_for_download": False,
         "download.directory_upgrade": True,
         "profile.default_content_settings.popups": 0,
         "profile.default_content_settings_values.automatic_downloads": 1,
         "plugins.plugins_disabled": False,
         "safebrowsing_for_trusted_sources_enabled": False,
         "safebrowsing.enabled": False,
         }
options.add_experimental_option("detach", True)
options.add_experimental_option("prefs", prefs)

webdriver_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=webdriver_service, options=options)
driver.maximize_window()
driver.implicitly_wait(15)
driver.get("https://english-subtitles.org/")
time.sleep(1)

driver.find_element(By.ID, "story").send_keys("Babylon Berlin")
driver.find_element(By.CSS_SELECTOR, ".ser-but").click()
links = driver.find_elements(By.CSS_SELECTOR, ".sub a[href]")

urls = []
for link in links:
    urls.append(link.get_attribute("href"))

for url in urls:
    driver.get(url)
    driver.find_element(By.CSS_SELECTOR, ".downloadlink").click()
    time.sleep(1)
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    driver.find_element(By.CSS_SELECTOR, ".downloadlink").click()
    time.sleep(2)
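
Note that this locates the search box by id "story" and the search button by its ser-but class, which would also explain the TimeoutException in the question: the page apparently has no element with name="q", so that first wait could never succeed. If the fixed sleeps around the two .downloadlink clicks ever prove flaky, the same flow with explicit waits might look like this (a sketch reusing the selectors above; WebDriverWait and expected_conditions are standard Selenium):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 15)
for url in urls:
    driver.get(url)
    # Click the first download button, scroll, then click the second one
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".downloadlink"))).click()
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".downloadlink"))).click()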

CodePudding user response:

I created a Selenium + BeautifulSoup program to search for subtitles and download them into the current folder.

from selenium import webdriver
from selenium.webdriver.common.by import By
import requests
from bs4 import BeautifulSoup
import time
import os.path

session = requests.Session()

query = input("Enter your search: ")

page = session.post("https://english-subtitles.org/index.php?do=search", data={"do": "search", "subaction": "search", "story": query})
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")

results = soup.find_all("div", class_="tbl")
if results:
    for index, result in enumerate(results):
        title = result.find("h2").text
        subtitle = result.find("h3").text
        info = result.find("h4").text
        mark = result.find("div", class_="mark").text
        
        print(f"{index}: {title}")
        print(f" - {subtitle}")
        print(f" - {info}")
        print(f" - {mark}")
else:
    raise RuntimeError("no result")

query = int(input("Select result: "))

if query < 0 or query >= len(results):
    raise ValueError("invalid select")

link = results[query].find("a", href=True)
assert link, "link not found"

options = webdriver.ChromeOptions()
prefs = {"download.default_directory": os.path.abspath("."), "download.prompt_for_download": False}
options.add_experimental_option("prefs",prefs)

driver = webdriver.Chrome(options=options)

# Go to the download page for the selected result
driver.get(link["href"])

element = driver.find_element(By.XPATH, """//*[@id="content"]/section/div[2]/div[2]/form/input[@type="submit"]""")
assert element, "no download button found"
element.click()

element = driver.find_element(By.XPATH, """//*[@id="content"]/section/div[2]/div/form/input[@type="submit"]""")
assert element, "no download button2 found"
element.click()

# wait until file downloads
time.sleep(10)

driver.close()
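
A couple of usage notes: the first prompt takes the search string and the second a single result index; results[query] then yields the detail-page link the driver opens. The assert element checks never actually fire, because find_element raises NoSuchElementException itself when nothing matches, and driver.quit() would end the session more cleanly than driver.close().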
