# I am trying to extract book data from Amazon.com using Selenium with Python only
The last part of the output looks something like this:

```
{'book title: ': 'Explore Atoms and Molecules!: With 25 Great Projects', 'authors: ': 'Part of: Explore Your World (59 books)', 'rating: ': '', 'price: ': ''}   # repeated 10 times
...
{'book title: ': 'Patience is my Superpower: A Kid’s Book about Learning How to Wait (My Superpower Books)', 'authors: ': 'Book 7 of 7: My Superpower Books', 'rating: ': '', 'price: ': ''}
# and so on until the end (output truncated for brevity)
```
When I run the program, the first book's elements are repeated for the whole first page iteration, then the same thing happens for the 2nd, 3rd, and every page up to the last: some elements from each page are repeated 10 times in a row, then another book is shown 10 times in a row. I tried a plain for loop and a for loop with range(); both have the same issue, just with slightly different output.

From the search-results pages, without going into each book's detail page, I am trying to extract specific elements of each book (title, price, rating, authors, etc.) and ignore the rest. My approach: first I collect the list of results from the search page via `resultsXpath` into a variable called `results`. Then I run two for loops: an outer loop for pagination, and a nested loop to extract the results one by one.

Within this nested loop I try to get each result's specific elements (not all of them). I think the issue is with the loops, but I don't know how to solve it. Any help would be highly appreciated; I am new to Selenium.

Thank you all. This is how I tried:
```python
from selenium import webdriver
from selenium.common import NoSuchElementException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get("https://amazon.com")
driver.maximize_window()

# searching for children books in the search bar
driver.implicitly_wait(2)
searchBox = driver.find_element("id", "twotabsearchtextbox")
searchBox.send_keys("children Books")

# click the search button
searchBtn = driver.find_element("id", "nav-search-submit-button")
searchBtn.click()

# collecting data of all listed books
booksList = []
resultsXpath = "//div[@data-component-type='s-search-result']"

# total pages to search = 7
for i in range(1, 8):
    WebDriverWait(driver, 25).until(EC.visibility_of_all_elements_located((By.XPATH, resultsXpath)))
    results = driver.find_elements(By.XPATH, resultsXpath)
    for result in results:
        title = result.find_element(By.XPATH, "//h2/a/span[@class='a-size-base-plus a-color-base a-text-normal']").text.strip()
        price = result.find_element(By.XPATH, "//span[@class='a-price']/span[@class='a-offscreen']").text
        rating = result.find_element(By.XPATH, "//span[@class='a-declarative']/a[@role='button']/i/span").text
        auth = result.find_element(By.XPATH, "//h2/following-sibling::div[@class='a-row a-size-base a-color-secondary']").text.strip()
        booksList.append({'book title: ': title, 'authors: ': auth, 'rating: ': rating, 'price: ': price})
    print()
    try:
        nextBtn = driver.find_element("xpath", "//a[@class='s-pagination-item s-pagination-next s-pagination-button s-pagination-separator']")
        nextBtn.click()
    except NoSuchElementException:
        pass

for book in booksList:
    print(book)
print()
print(len(booksList))
driver.quit()
```
CodePudding user response:
To avoid duplicates in general, you can use the `data-asin` attribute, which should be unique to each result. First initialize a list of added IDs before the `for i in range...` loop (like `scrapedIds = []`), and then, inside the `for result...` loop, skip results whose ID has already been added:
```python
for result in results:
    dasin = result.get_attribute('data-asin')
    if dasin in scrapedIds:
        continue
    scrapedIds.append(dasin)
    ## REST OF CODE ##
```
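As a small aside (optional, not required for correctness): a Python set works just as well for the ID bookkeeping and gives O(1) membership checks. A minimal variant of the same loop:

```python
scrapedIds = set()  # set membership tests are O(1)

for result in results:
    dasin = result.get_attribute('data-asin')
    if dasin in scrapedIds:
        continue
    scrapedIds.add(dasin)  # add() instead of append()
    ## REST OF CODE ##
```

At around a hundred results either container is fine; it only starts to matter for much larger scrapes.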
As for why you only get the first result on each page: I don't know the reason for it, and what I'm suggesting is probably more of a workaround than anything, but I've come across a similar duplicates issue with Selenium before, and you might get around it with bs4:
```python
# from bs4 import BeautifulSoup
# scrapedIds = []
# for i in range...
for result in results:
    print('', end=f'\rpage {i} of 7 - {len(results)} results left')
    dasin = result.get_attribute('data-asin')
    if dasin in scrapedIds:
        continue
    scrapedIds.append(dasin)
    rSoup = BeautifulSoup(result.get_attribute('outerHTML').encode('utf-8'), 'html.parser')
    title = rSoup.select_one("h2 > a > span[class='a-size-base-plus a-color-base a-text-normal']")
    price = rSoup.select_one("span[class='a-price'] > span[class='a-offscreen']")
    rating = rSoup.select_one("span[class='a-declarative'] > a[role='button'] > i > span")
    auth = rSoup.select_one("h2 ~ div[class='a-row a-size-base a-color-secondary']")
    booksList.append({k: v if k in ['pg', 'asin'] else (
        '' if v is None else v.get_text(' ').strip()
    ) for k, v in {
        'pg': i, 'asin': dasin,  ## remove if you don't want
        'book title': title, 'authors': auth, 'rating': rating, 'price': price
    }.items()})
## REST OF YOUR CODE ##
```
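(Note that `select_one` returns `None` when nothing matches, rather than raising the way `find_element` does; that is what the `'' if v is None else ...` fallback in the dict comprehension is handling, so results with no rating or price come out as empty strings.)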
Btw, why don't you use

```python
        nextBtn.click()
    except NoSuchElementException:
        break
```

instead of `except NoSuchElementException: pass`? That way, the code won't unnecessarily re-scrape the last page multiple times.
With the above changes, I got 115 unique entries, and the data collected in `booksList` looked like:
| pg | asin | book title | authors | rating | price |
|-----:|:-----------|:---------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------|:-------------------|:--------|
| 1 | 9388369882 | My First Complete Learning Library: Boxset of 20 Board Books for Kids | by Wonder House Books | 4.7 out of 5 stars | $26.88 |
| 1 | 031077702X | I'm So Glad You Were Born: Celebrating Who You Are | by Ainsley Earhardt and Kim Barnes | 4.9 out of 5 stars | $14.16 |
|    1 | 9389432006 | Best of Children's Classics (Set of 5 Books) | by Various | 4.6 out of 5 stars | $22.49 |
| 1 | 0545261244 | The Wonky Donkey | Part of: Wonky Donkey (4 books) | 4.8 out of 5 stars | $5.28 |
| 1 | 0375832416 | Dancing Dinos Go to School (Step into Reading) | Part of: Step into Reading (233 books) | 4.8 out of 5 stars | $3.99 |
| 1 | 0062075535 | The Berenstain Bears' Night Before Christmas: A Christmas Holiday Book for Kids | Part of: Berenstain Bears (90 books) | 4.9 out of 5 stars | $4.99 |
| 1 | 054521579X | Clifford's Good Deeds (Classic Storybook) | Part of: Clifford (37 books) | 4.8 out of 5 stars | $4.99 |
| 1 | 0060254920 | Where the Wild Things Are | Part of: Caldecott Collection (8 books) | 4.9 out of 5 stars | $13.20 |
| 1 | 0736431411 | The Never Girls Collection #1 (Disney: The Never Girls): Books 1-4 | Book 1 of 3: The Never Girls Collection | 4.8 out of 5 stars | $14.39 |
| 1 | 1797213873 | Construction Site: Farming Strong, All Year Long (Goodnight, Goodnight, Construc) | by Sherri Duskey Rinker and AG Ford | 4.7 out of 5 stars | $14.99 |
| 1 | 1665920580 | Angelina Ballerina 5-Minute Stories | Part of: Angelina Ballerina (42 books) | 4.5 out of 5 stars | $11.00 |
| 1 | 1452170339 | Tiny T. Rex and the Impossible Hug | Book 1 of 4: Tiny T. Rex | 4.8 out of 5 stars | $9.26 |
| 2 | 006267529X | Pete the Cat Goes Camping (I Can Read Level 1) | Part of: I Can Read Level 1 (375 books) | 4.8 out of 5 stars | $4.74 |
| 3 | 1510756299 | World Cup Women: Megan, Alex, and the Team USA Soccer Champs | by Meg Walters and Nikkolas Smith | 4.9 out of 5 stars | $16.99 |
| 3 | 1496454839 | How Much Is a Little Boy Worth? | by Rachael Denhollander , Jacob Denhollander , et al. | 4.8 out of 5 stars | $13.49 |
| 3 | 0312527594 | See, Touch, Feel: A First Sensory Book | by Roger Priddy | 4.8 out of 5 stars | $7.58 |
| 4 | 1680105469 | Grandma Loves You! | by Danielle McLean and Alison Edgson | 4.9 out of 5 stars | $6.49 |
| 4 | 1452111731 | Goodnight, Goodnight Construction Site (Board Book for Toddlers, Children's Board Book) | Part of: Goodnight, Goodnight, Construction Site (6 books) | 4.9 out of 5 stars | $4.28 |
| 5 | 1644725088 | Best Hidden Pictures Puzzles EVER: The Ultimate Collection of America's Favorite Puzzle (Highlights Hidden Pictures) | Part of: Highlights Hidden Pictures (17 books) | 4.7 out of 5 stars | $7.29 |
| 6 | 1664350292 | Early Learning Library: 10 Books! (My First Home Learning) | by Tiger Tales | nan | $19.35 |
| 6 | 0823425479 | The Story of Hanukkah | by David A. Adler and Jill Weber | 4.7 out of 5 stars | $7.99 |
| 7 | 1728280117 | TummyTime Fold-Out Book Set: 4 high-contrast books to help develop strength and eye coordination for your baby | by duopress labs | nan | $34.84 |
| 7 | 0593406265 | Little Black Boy: Oh, the Things You Will Do! | by Kirby Howell-Baptiste , Larry C. Fields III , et al. | 4.3 out of 5 stars | $17.94 |
(The table above was processed with pandas and is the output of `print(pandas.DataFrame(booksList).loc[::5].to_markdown(index=False))`, i.e. only every 5th row.)
EDIT: For a Selenium-only approach, try:

```python
scrapedIds = []
for i in range(1, 8):
    WebDriverWait(driver, 25).until(EC.visibility_of_all_elements_located((By.XPATH, resultsXpath)))
    resLen = len(driver.find_elements(By.XPATH, resultsXpath))
    for ri in range(1, resLen + 1):
        print('', end=f'\rpage {i} of 7 - result {ri} of {resLen}')
        dasin = driver.find_element(By.XPATH, f"{resultsXpath}[{ri}]").get_attribute('data-asin')
        if dasin in scrapedIds:
            continue
        scrapedIds.append(dasin)
        title = driver.find_elements(By.XPATH, f"{resultsXpath}[{ri}]//h2/a/span[@class='a-size-base-plus a-color-base a-text-normal']")
        price = driver.find_elements(By.XPATH, f"{resultsXpath}[{ri}]//span[@class='a-price']/span[@class='a-offscreen']")
        rating = driver.find_elements(By.XPATH, f"{resultsXpath}[{ri}]//span[@class='a-declarative']/a[@role='button']/i/span")
        auth = driver.find_elements(By.XPATH, f"{resultsXpath}[{ri}]//h2/following-sibling::div[@class='a-row a-size-base a-color-secondary']")
        booksList.append({k: v if k in ['pg', 'asin'] else (
            v[0].get_attribute('innerText').strip() if v else ''
        ) for k, v in {
            'pg': i, 'asin': dasin,  ## remove if you don't want
            'book title': title, 'authors': auth, 'rating': rating, 'price': price
        }.items()})
    print()
```
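One more thought on the likely root cause, in case it helps: an XPath that starts with `//` is evaluated from the document root even when `find_element` is called on a `WebElement`, so `result.find_element(By.XPATH, "//h2/...")` always returns the page's first match, which would produce exactly the repetition described in the question. Prefixing each expression with `.` scopes it to the result element. A minimal sketch of the nested loop with relative XPaths, reusing the question's selectors, and using `find_elements` (plural) so missing fields become empty strings instead of raising:

```python
for result in results:
    # the leading '.' anchors each XPath to this result element;
    # a bare '//' would search the whole document and always hit
    # the first result on the page
    title = result.find_elements(By.XPATH, ".//h2/a/span[@class='a-size-base-plus a-color-base a-text-normal']")
    price = result.find_elements(By.XPATH, ".//span[@class='a-price']/span[@class='a-offscreen']")
    rating = result.find_elements(By.XPATH, ".//span[@class='a-declarative']/a[@role='button']/i/span")
    auth = result.find_elements(By.XPATH, ".//h2/following-sibling::div[@class='a-row a-size-base a-color-secondary']")
    booksList.append({
        'book title': title[0].text.strip() if title else '',
        'authors': auth[0].text.strip() if auth else '',
        # innerText still reads the visually hidden rating/price spans
        'rating': rating[0].get_attribute('innerText').strip() if rating else '',
        'price': price[0].get_attribute('innerText').strip() if price else '',
    })
```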