After a failed attempt using just BeautifulSoup, I decided to try this with Selenium. The script is an attempt to get the subtitles for a specific TV show or movie. If you look at the code, you'll see that there are quite a few wait.until calls
with what seems like enough of a pause to let Selenium do its work. I'm still getting this error message:
selenium.common.exceptions.TimeoutException: Message:
Stacktrace:
0 chromedriver 0x0000000104652fa8 chromedriver 4886440
1 chromedriver 0x00000001045d0643 chromedriver 4351555
....
Here's the code I'm using:
# Scrape subtitle download links for a specific show from english-subtitles.org
# and save each .srt file into the current directory.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin

BASE_URL = "https://english-subtitles.org/"

options = webdriver.ChromeOptions()
options.binary_location = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
# Selenium 4 removed the positional executable_path / chrome_options arguments;
# the driver path must be wrapped in a Service object.
service = Service("/usr/local/bin/chromedriver")
browser = webdriver.Chrome(service=service, options=options)

browser.get(BASE_URL)

# Wait for the search box and use the element the wait returns.
# NOTE(review): the site's search field is id="story" and the submit button is
# ".ser-but" (see the working answer below) — there is no element named "q",
# which is why the original wait always timed out. Confirm against the live page.
wait = WebDriverWait(browser, 35)
search_input = wait.until(EC.presence_of_element_located((By.ID, "story")))
search_input.send_keys("Babylon Berlin")

# Wait until the search button is actually clickable before clicking.
# (Fix: the original passed the typo `e0` as a timeout, a NameError.)
search_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".ser-but")))
search_button.click()

# Parse the HTML content of the search results page.
soup = BeautifulSoup(browser.page_source, "html.parser")

# Find the download link of all episodes.
# Fix: the original pattern r"download/\d " (digit + space) can never match a
# URL; the `+` quantifier was lost — r"download/\d+" is intended.
episodes = soup.find_all("a", href=re.compile(r"download/\d+"))
for item in episodes:
    # Resolve possibly-relative hrefs against the site root.
    link = urljoin(BASE_URL, item["href"])
    # Normalize the episode name into a filesystem-friendly filename.
    name = item.text.lower().replace(" ", "_")
    print(f"Downloading {name} from {link} url ...")
    # Download the subtitle file and save it.
    subtitle_file = requests.get(link, allow_redirects=True)
    with open(f"{name}.srt", 'wb') as file:
        file.write(subtitle_file.content)

# Close the webdriver instance.
browser.quit()
What is it that I'm doing wrong? Is it impossible to scrape this kind of site? I need the subtitles to analyze them and it is very tedious to have to download them manually. Can anybody tell me the best way to do this? Thanks in advance.
JM
EDIT: Here's a modified version of the script @Jurakin provided. This is an attempt to do a batch download of all the subtitles I'm interested in. It doesn't work completely. It gets interrupted after the second download is completed no matter what the amount of files you tell it to download.
# Batch-download subtitles from english-subtitles.org: run the search with a
# plain HTTP POST, show an indexed menu, then drive Chrome through the two
# download-button pages for every result index the user selects.
from selenium import webdriver
from selenium.webdriver.common.by import By
import requests
from bs4 import BeautifulSoup
import time
import os.path
import re

session = requests.Session()
query = input("Enter your search: ")
page = session.post(
    "https://english-subtitles.org/index.php?do=search",
    data={"do": "search", "subaction": "search", "story": query},
)
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")

results = soup.find_all("div", class_="tbl")
if results:
    for index, result in enumerate(results):
        title = result.find("h2").text
        subtitle = result.find("h3").text
        info = result.find("h4").text
        mark = result.find("div", class_="mark").text
        print(f"{index}: {title}")
        print(f" - {subtitle}")
        print(f" - {info}")
        print(f" - {mark}")
else:
    raise RuntimeError("no result")

# Parse the user input into a list of result indices.
# Fix: the original pattern r'\d ' (digit + trailing space) silently drops the
# last number typed; r'\d+' matches every integer in the input.
input_num_list = [int(x) for x in re.findall(r"\d+", input("Select result: "))]

options = webdriver.ChromeOptions()
prefs = {
    "download.default_directory": os.path.abspath("."),
    "download.prompt_for_download": False,
}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)

# Iterate over the selected indices and download the corresponding subtitles.
for query in input_num_list:
    if query < 0 or query >= len(results):
        raise ValueError("invalid select")
    link = results[query].find("a", href=True)
    if link is None:
        raise RuntimeError("link not found")
    driver.get(link["href"])
    # find_element raises NoSuchElementException when the locator misses, so no
    # extra None-checks are needed after these calls.
    driver.find_element(
        By.XPATH,
        """//*[@id="content"]/section/div[2]/div[2]/form/input[@type="submit"]""",
    ).click()
    driver.find_element(
        By.XPATH,
        """//*[@id="content"]/section/div[2]/div/form/input[@type="submit"]""",
    ).click()
    # Give Chrome time to finish writing the file before navigating to the next
    # link — navigating (or quitting) mid-download cancels the partial
    # .crdownload file, which is the likely cause of the "stops after the
    # second download" symptom. Increase if files are large.
    time.sleep(10)

driver.close()
CodePudding user response:
Try this code:
# Search english-subtitles.org for "Babylon Berlin" using Selenium alone,
# collect every result link up front, then click through each download page.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time

options = Options()
# Chrome preferences: download silently into ..\Downloads\ with no prompts,
# pop-ups, or Safe Browsing interference.
prefs = {"download.default_directory": "..\\Downloads\\",
         "download.prompt_for_download": False,
         "download.directory_upgrade": True,
         "profile.default_content_settings.popups": 0,
         "profile.default_content_settings_values.automatic_downloads": 1,
         "plugins.plugins_disabled": False,
         "safebrowsing_for_trusted_sources_enabled": False,
         "safebrowsing.enabled": False,
         }
options.add_experimental_option("detach", True)
options.add_experimental_option("prefs", prefs)

# webdriver-manager fetches a matching chromedriver automatically.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)
driver.maximize_window()
driver.implicitly_wait(15)

driver.get("https://english-subtitles.org/")
time.sleep(1)

# Fill the site's search box and submit.
driver.find_element(By.ID, "story").send_keys("Babylon Berlin")
driver.find_element(By.CSS_SELECTOR, ".ser-but").click()

# Grab the hrefs before navigating — leaving the page invalidates the elements.
anchors = driver.find_elements(By.CSS_SELECTOR, ".sub a[href]")
urls = [anchor.get_attribute("href") for anchor in anchors]

for url in urls:
    driver.get(url)
    # The download flow needs two clicks on .downloadlink, with a scroll to the
    # bottom of the page in between.
    driver.find_element(By.CSS_SELECTOR, ".downloadlink").click()
    time.sleep(1)
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    driver.find_element(By.CSS_SELECTOR, ".downloadlink").click()
    time.sleep(2)
CodePudding user response:
I created a Selenium–BeautifulSoup program that searches for subtitles and downloads them into the current folder.
# Search english-subtitles.org via a requests POST, let the user pick one
# result from an indexed menu, then use Selenium to click through the two
# download pages so the subtitle file lands in the current folder.
from selenium import webdriver
from selenium.webdriver.common.by import By
import requests
from bs4 import BeautifulSoup
import time
import os.path

session = requests.Session()
query = input("Enter your search: ")
page = session.post(
    "https://english-subtitles.org/index.php?do=search",
    data={"do": "search", "subaction": "search", "story": query},
)
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")
results = soup.find_all("div", class_="tbl")
if not results:
    raise RuntimeError("no result")

# Print an indexed menu of every hit: title, subtitle line, info line, rating.
for index, result in enumerate(results):
    print(f"{index}: {result.find('h2').text}")
    print(f" - {result.find('h3').text}")
    print(f" - {result.find('h4').text}")
    print(f" - {result.find('div', class_='mark').text}")

choice = int(input("Select result: "))
if choice < 0 or choice >= len(results):
    raise ValueError("invalid select")
link = results[choice].find("a", href=True)
assert link, "link not found"

options = webdriver.ChromeOptions()
# Download into the current directory without a save-as prompt.
prefs = {"download.default_directory": os.path.abspath("."), "download.prompt_for_download": False}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)

# Open the detail page and press both "download" submit buttons in turn.
driver.get(link["href"])
button = driver.find_element(By.XPATH, """//*[@id="content"]/section/div[2]/div[2]/form/input[@type="submit"]""")
assert button, "no download button found"
button.click()
button = driver.find_element(By.XPATH, """//*[@id="content"]/section/div[2]/div/form/input[@type="submit"]""")
assert button, "no download button2 found"
button.click()

# Leave the browser open long enough for the download to complete.
time.sleep(10)
driver.close()