All images downloaded from the image scraper have the same file size of 130 kb and are corrupted and cannot be seen in the image viewer.
I really have no idea what the problem is.
Anyone please give me some advice on this matter.
import requests
import parsel
import os
import time
url = 'https://movie-screencaps.com/movie-directory/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
response = requests.get(url, headers=headers)
selector = parsel.Selector(response.text)
movie_list = selector.xpath('//div[@]/ul/li')
for li in movie_list:
movie_name = li.xpath('.//a/text()').get().strip()
movie_url = li.xpath('.//a/@href').get()
print(movie_name, movie_url)
# dir = f'download/{movie_name}'
dir = f'{movie_name}'
if not os.path.exists(dir):
os.makedirs(dir)
page_response = requests.get(movie_url, headers=headers)
page_selector = parsel.Selector(page_response.text)
page_text = page_selector.xpath('//div[@]/text()').get()
last_page = int(page_text.split(' ')[-1])
for page in range(1, last_page 1):
page_url = f'{movie_url}/page/{page}'
print(f'===== Downloading from page {page} =====')
image_response = requests.get(url=page_url, headers=headers)
image_selector = parsel.Selector(image_response.text)
images_url_list = image_selector.xpath('//div[@align="center"]/a/@href').getall()
for image_url in images_url_list:
image_data = requests.get(url=page_url, headers=headers).content
# print(image_data)
file_name = image_url.split('/')[-1]
with open(f'{dir}/{file_name}', mode='wb') as f:
f.write(image_data)
print(file_name)
time.sleep(2)
CodePudding user response:
The problem is a typo where you are fetching the page_url
for each image_url instead of fetching the image_url
:
...
for image_url in images_url_list:
image_data = requests.get(url=page_url, headers=headers).content
file_name = image_url.split('/')[-1]
...
Should be:
...
for image_url in images_url_list:
# Typo is here...
image_data = requests.get(url=image_url, headers=headers).content
file_name = image_url.split('/')[-1]
...
CodePudding user response:
I tested your code and you just got a little mistake
change:
image_data = requests.get(url=page_url, headers=headers).content
to:
image_data = requests.get(url=image_url, headers=headers).content
tested and works just fine :)