I want to store the image URL in a CSV file (for Excel), but the spider gives me this "data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw=="
instead of the image URL.
# Scrapy spider from the question. Indentation was lost in this excerpt, so the
# nesting of the statements below is not visible here.
class NewsSpider(scrapy.Spider):
name = "articles"
def start_requests(self):
# Prompts on stdin for the article URL and issues one request for it,
# handled by parse_dir_contents.
url = input("Enter the article url: ")
yield scrapy.Request(url, callback=self.parse_dir_contents)
def parse_dir_contents(self, response):
# Reads a src attribute from the <img> located under #article-wrapper.
# NOTE(review): according to the question, this is the value that comes back
# as a base64 "data:image/gif" string instead of the real image URL.
Feature_Image =response.xpath('//*[@id="article-wrapper"]/article/section[2]/div/div/div/img//@src').get()
# Passes the extracted value through response.urljoin().
Feature_Image = response.urljoin(Feature_Image)
yield{
# NOTE(review): Published_Date and Content are not assigned anywhere in this
# excerpt; presumably they are extracted in code that was cut out.
'Publication Date': Published_Date,
'Feature_Image': Feature_Image,
'Article Content': Content
}
# =============== Data Store
# NOTE(review): Category, Headlines, Author, Source and url are likewise not
# assigned in this excerpt; confirm where they come from.
Data = [[Category,Headlines,Author,Source,Published_Date,Feature_Image,Content,url]]
try:
# Builds a one-row DataFrame and appends it to the CSV without a header row.
df = pd.DataFrame (Data, columns = ['Category','Headlines','Author','Source','Published_Date','Feature_Image','Content','URL'])
print(df)
with open('C:/Users/Public/pagedata.csv', 'a') as f:
df.to_csv(f, header=False)
# NOTE(review): the bare except catches every error and then repeats the same
# DataFrame construction; if that construction is what failed, it fails again.
except:
df = pd.DataFrame (Data, columns = ['Category','Headlines','Author','Source','Published_Date','Feature_Image','Content','URL'])
print(df)
# Unlike the try branch, this call does not pass header=False.
df.to_csv('C:/Users/Public/pagedata.csv', mode='a')
CodePudding user response:
The image URL is already an absolute URL, so there is no need to make it absolute again with the
urljoin()
method; that extra call is not needed to grab the original image URL. Your XPath expression selects only a single image, so get rid of the extra forward slash before @src.
You aren't getting the right image URL because @src selects the placeholder value that appears in your output, whereas the original image URL is stored in the attribute
@data-src
Try:
import scrapy
class NewsSpider(scrapy.Spider):
    """Spider that yields the feature-image URL of one article page.

    The article URL is read interactively from stdin when the crawl starts.
    """

    name = "articles"

    def start_requests(self):
        """Prompt for the article URL and schedule a single request for it."""
        # Example:
        # https://skift.com/2022/10/08/american-express-travels-rebound-and-other-top-stories-this-week/
        url = input("Enter the article url: ")
        yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Yield a dict with the article's feature-image URL.

        The page lazy-loads its images: ``src`` holds an inline base64
        placeholder while the real URL is kept in ``data-src``. ``data-src``
        is therefore preferred. ``src`` is used only as a fallback for images
        that are not lazy-loaded, and only when it is not a ``data:`` URI.

        Yields:
            ``{'Feature_Image': <url or None>}``; the value is ``None`` when
            no usable image URL is found.
        """
        image = response.xpath(
            '//*[@id="article-wrapper"]/article/section[2]/div/div/div/img'
        )
        feature_image = image.xpath("@data-src").get()
        if feature_image is None:
            # No data-src attribute: accept src unless it is the inline
            # placeholder that the original question was running into.
            src = image.xpath("@src").get()
            if src and not src.startswith("data:"):
                feature_image = src

        # Other fields (publication date, article content) are left out of
        # this minimal example.
        yield {
            'Feature_Image': feature_image,
        }
Output:
{'Feature_Image': 'https://skift.com/wp-content/uploads/2022/10/American_Express_office_in_Rome-1-e1665181357253-1024x682.jpg'}