Home > Back-end > Puppeteer does not return all results after infinite scroll finishes
Puppeteer does not return all results after infinite scroll finishes

Time:07-11

Here is the code in my data scraping file:

const puppeteer = require('puppeteer');
const db = require('../db');
const Job = require('../models/job');

// Question version of the scraper: launches a visible browser, opens the
// LinkedIn job-search URL, collects the job cards, then starts the scroll loop.
// The statement order below is exactly what the question is about.
(async() => {
  try {
    const browser = await puppeteer.launch({
      headless: false,
      defaultViewport: null,
      // args: ['--no-zygote', '--no-sandbox']
    });
    const url = 'https://www.linkedin.com/jobs/search?keywords=Junior Software Developer&location=Indianapolis, IN&geoId=&trk=homepage-jobseeker_jobs-search-bar_search-submit&position=1&pageNum=0';

    // Open browser instance
    // NOTE(review): `waitUntil` is handed to browser.newPage() here rather than
    // to page.goto() below — confirm against the Puppeteer API that newPage()
    // actually honours this option.
    const page = await browser.newPage({
      waitUntil: 'networkidle0'
    });
    console.log(`Navigating to ${url}`);
    await page.goto(url);

    // Scroll to bottom of page, click on 'See More Jobs' and repeat
    // `lastHeight` starts as the body height measured right after navigation.
    let lastHeight = await page.evaluate('document.body.scrollHeight');
    const scroll = async() => {
      while (true) {
        await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
        // Fixed 2-second pause before re-measuring the page height.
        await page.waitForTimeout(2000);
        let newHeight = await page.evaluate('document.body.scrollHeight');
        // An unchanged height is treated as "nothing more was loaded".
        if (newHeight === lastHeight) {
          console.log('Done scrolling!');
          break;
        }
        lastHeight = newHeight;
        // Called without `await`: the loop continues without waiting for the
        // click to finish, and a rejection from it is not handled here.
        seeMoreJobs();
      }
      // Logs the outer `data` constant, which was computed before scroll()
      // was called (see "Collect data" below), not a fresh read of the page.
      console.log(data);
    }
    // Click on 'See More Jobs'
    // querySelector returns null when no element matches, in which case
    // `.click()` throws inside the page.
    const seeMoreJobs = async() => {
      await page.evaluate(() => {
        document.querySelector('button[data-tracking-control-name="infinite-scroller_show-more"]').click();
      });
    }
    // Collect data
    // This evaluate is awaited here, before scroll() is invoked below, so it
    // reads the job cards present in the DOM at this point only.
    const data = await page.evaluate(() => {
      const allJobsArr = Array.from(document.querySelectorAll('a[data-tracking-control-name="public_jobs_jserp-result_search-card"]'));
      const namesAndUrls = allJobsArr.map(job => {
        return {
          name: job.innerText,
          url: job.href,
          path: job.pathname
        }
      });
      return namesAndUrls;
    });
    // Not awaited: the try block finishes as soon as scroll() returns its
    // promise, so a later rejection from scroll() does not reach the catch.
    scroll();
  } catch (err) {
    console.log(err);
  }
})();

So the above code is designed to navigate to the variable url and then to scroll until the scroll function "breaks"/finishes, i.e., to the very bottom of the page. Once these actions have finished, I want to then log some data in the form of an array with three properties from each job posting: name, href, and path. When I run the IIFE as shown I am able to grab the first 24-25 job postings with my data function, which are the first to be displayed on page load (before any of the scrolling takes place).

  • For whatever reason, this data function is unable to evaluate the entire page or document after all the scrolling has occurred.

I have tried various things and have really analyzed what the code is doing, but alas, I am at a loss for a solution. My end goal here is to comb through every job posting that has displayed with my scrolling function and then to log everything (not just the first 24-25 results) returned with the desired data properties to the console.

Thanks, all.

CodePudding user response:

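Ok, I have now figured out the reason why it was only pulling out the first 25 results, and I believe it was a problem of scope, sort of how I had outlined in the original question. I ended up housing the 'data' functional expression within the scroll() function, so that the same 'page' was being 'evaluated', otherwise I believe the two were looking at two different instances of the 'page'. I know this might not be the most accurate explanation, so if someone would like to better articulate this for me, that would be awesome. (More precisely: both versions use the same page object, but `await page.evaluate(...)` returns a snapshot of the DOM at the moment it runs. In the original code that call ran before scroll() was even invoked, so `data` only ever held the postings present on initial load; moving it after the scrolling loop makes it read the DOM once everything has loaded.) Here is the simple solution to the simple problem that I was having. Thanks.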

const puppeteer = require('puppeteer');
const db = require('../db');
const Job = require('../models/job');

/**
 * Decide whether a scraped job posting looks like a junior-level role.
 *
 * A posting qualifies when its title contains 'Junior', 'Jr' or 'Entry'
 * AND it carries both a URL and a path. The earlier form
 * `a || b || c && url && path` only applied the url/path check to the
 * 'Entry' case, because `&&` binds tighter than `||`.
 *
 * @param {{name: string, url: string, path: string}} job - Scraped posting.
 * @returns {boolean} True when the posting should be kept.
 */
function isJuniorJob(job) {
  const juniorKeywords = ['Junior', 'Jr', 'Entry'];
  const hasJuniorKeyword = juniorKeywords.some((keyword) => job.name.includes(keyword));
  return hasJuniorKeyword && Boolean(job.url) && Boolean(job.path);
}

/**
 * Resolve after the given delay. Used instead of `page.waitForTimeout`,
 * which is no longer available in recent Puppeteer releases.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

// Scrape the LinkedIn job search page: scroll until the page stops growing,
// then read every job card and log the junior-level ones.
(async() => {
  let browser;
  try {
    browser = await puppeteer.launch({
      headless: false,
      defaultViewport: null,
      // args: ['--no-zygote', '--no-sandbox']
    });
    const url = 'https://www.linkedin.com/jobs/search?keywords=Junior Software Developer&location=Indianapolis, IN&geoId=&trk=homepage-jobseeker_jobs-search-bar_search-submit&position=1&pageNum=0';

    // Open browser instance
    const page = await browser.newPage();
    console.log(`Navigating to ${url}`);
    // `waitUntil` is a navigation option, so it belongs on goto(), not newPage().
    await page.goto(url, { waitUntil: 'networkidle0' });

    // Click on 'See More Jobs' if the button is currently in the DOM.
    // The button is not always present, so a missing match must not throw.
    const seeMoreJobs = async() => {
      await page.evaluate(() => {
        const button = document.querySelector('button[data-tracking-control-name="infinite-scroller_show-more"]');
        if (button) {
          button.click();
        }
      });
    };

    // Scroll to bottom of page, click on 'See More Jobs' and repeat
    // until the document height stops changing, then collect the results.
    const scroll = async() => {
      let lastHeight = await page.evaluate('document.body.scrollHeight');
      while (true) {
        await page.evaluate('window.scrollTo(0, document.body.scrollHeight)');
        // Give lazily-loaded results time to render before measuring again.
        await sleep(2000);
        const newHeight = await page.evaluate('document.body.scrollHeight');
        if (newHeight === lastHeight) {
          break;
        }
        lastHeight = newHeight;
        // Awaited so the click completes (and any error surfaces) before the
        // next scroll iteration.
        await seeMoreJobs();
      }

      // Read the job cards only now, after scrolling has finished, so the
      // snapshot contains everything that was loaded.
      const allJobs = await page.evaluate(() => {
        const allJobsArr = Array.from(document.querySelectorAll('a[data-tracking-control-name="public_jobs_jserp-result_search-card"]'));
        return allJobsArr.map((job) => ({
          name: job.innerText,
          url: job.href,
          path: job.pathname,
        }));
      });

      // Scrape all junior job titles
      const data = allJobs.filter(isJuniorJob);
      console.log(data);
    };

    // Awaited so failures inside scroll() reach the catch below.
    await scroll();
  } catch (err) {
    console.error(err);
  } finally {
    // Always release the browser process, even when scraping failed.
    if (browser) {
      await browser.close();
    }
  }
})();

  • Related