I am trying to scrape multiple imsdb pages to get the movie scripts to create a dataset of movie scripts. I wrote this code
import pandas as pd
import numpy as np
#import seaborn as sns
import matplotlib.pyplot as plt
import requests #to send the request to the URL
from bs4 import BeautifulSoup
import numpy as np # to count the values (in our case)
import selenium
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep
from random import randint
driver = webdriver.Chrome(ChromeDriverManager().install())
scriptsList=[]
newScript=[]
titles=[]
movie_titles = pd.read_csv("movies.csv")
l=0;
url_list=[]
for index,row in movie_titles.iterrows():
movieString=movie_titles.loc[index]["title"]
count=0
#print(row)
#assigning the URL with variable name url
movieString=movie_titles.loc[index]["title"]
movieString=str(movieString)
titles.append(movieString)
movieString=movieString.replace(" ", "")
url = 'https://imsdb.com/scripts/' movieString '.html'
url_list.append(url)
for i in url_list:
# Target URL
driver.get(url)
# print(driver.title)
# Printing the whole body text
jt=driver.find_element_by_xpath("/html/body").text
jt = jt.strip('\n')
jt = jt.strip('\t')
print(jt)
scriptsList.append(jt)
# Closing the driver
driver.close()
scripts_DF = pd.DataFrame({'title': titles, 'Script': scriptsList})
scripts_DF.to_csv('NewScripts6.csv')'''
but the code doesn't print all the text it only prints this
ALL SCRIPTS
Writers :
Genres :
User Comments
Back to IMSDb
Index | Submit | Link to IMSDb | Disclaimer | Privacy policy | Contact
The Internet Movie Script Database (IMSDb)
The web's largest
movie script resource!
Search IMSDb
Alphabetical
# A B C D E F G H
I J K L M N O P Q
R S T U V W X Y Z
Genre
Action Adventure Animation
Comedy Crime Drama
Family Fantasy Film-Noir
Horror Musical Mystery
Romance Sci-Fi Short
Thriller War Western
I also wrote this code
import pandas as pd
import numpy as np
#import seaborn as sns
import matplotlib.pyplot as plt
import requests #to send the request to the URL
from bs4 import BeautifulSoup
import numpy as np # to count the values (in our case)
import selenium
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(ChromeDriverManager().install())
# Importing necessary modules
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
# WebDriver Chrome
driver = webdriver.Chrome(ChromeDriverManager().install())
# Target URL
#driver.get("https://www.geeksforgeeks.org/competitive-programming-a-complete-guide/")
driver.get("https://imsdb.com/scripts/Toy-Story.html")
# print(driver.title)
# Printing the whole body text
print(driver.find_element_by_xpath("/html/body").text)
# Closing the driver
driver.close()
this code prints all the text of the website can anyone help me to scrape multiple pages and get all the text from them.I think i need to add time delays to the program because the site can not handle so many requests
CodePudding user response:
With this code to me print all text
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://imsdb.com/Movie Scripts/Joker Script.html")
jt = driver.find_element(by=By.XPATH, value="/html/body").text
print(jt)
CodePudding user response:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
list=["Cable Guy Script","Joker Script","Capote Script"]
for film in list:
driver.get("https://imsdb.com/Movie Scripts/" film ".html")
jt = driver.find_element(by=By.XPATH, value="/html/body").text
print(jt.strip('\n').strip('\t'))
the output is so long, so I attach a piece
Capote Script
IMSDb opinion
None available
IMSDb rating
Not available
Average user rating
None available
Writers
Dan Futterman
Genres
Crime
Drama
Movie Release Date : February 2006
Read "Capote" Script
User Comments for Capote
Add your own comment *Name: E-mail: