Web scraping using Node.js


I need to display the images of a whole website. First I got the hrefs of all anchor tags on the site, then tried to load the images from those linked pages, but it failed because of a loop error. A result is displayed, but it does not match the images on the website.



loginrouter.get('/images', (req, res) => {
  var WriteStream = fs.createWriteStream("ImagesLink.txt", "UTF-8");
  request('https://nu.edu.pk/', (err, resp, html) => {

      if (!err && resp.statusCode == 200) {
          console.log("Request was success ");

          const $ = cherio.load(html);

          $("a").each((index, datalinks) => {
              var Links = [];

              var anch = $(datalinks).attr('href');
              var baseUrl = 'https://nu.edu.pk';
              Links = baseUrl + anch;

              Array.of(Links).forEach(Links => {
                  request(Links, (err, resp, html1) => {
                      console.log("Links areeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee/////////////////////////", Links)
                      $("img").each((index2, imglinks) => {
                          var img2 = $(imglinks).attr('src');
                          var url = 'https://nu.edu.pk';
                          const Links2 = url + img2;
                          console.log(Links2)
                      })
                  })
              });

          });
      }
      else {
          console.log("Request Failed ");
      }
  });

})

`Links` here represents the anchor tag hrefs, and the plain links print the image links, but the image links are incomplete and do not correspond to the main images of the linked (href) pages.
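
One likely reason the links come out incomplete is that concatenating the base URL with a relative href or src does not handle paths like ./, ../ or links that are already absolute. A small sketch (not part of the original code) using Node's built-in URL class to resolve a link against the page it was found on; the file names are only placeholders:

    const { URL } = require('url'); // URL is also available as a global in modern Node

    const base = 'https://nu.edu.pk/';

    // Relative paths are resolved against the base URL.
    console.log(new URL('/images/logo.png', base).href); // https://nu.edu.pk/images/logo.png
    console.log(new URL('campus.jpg', base).href);       // https://nu.edu.pk/campus.jpg

    // Already-absolute URLs are left unchanged instead of being double-prefixed.
    console.log(new URL('https://cdn.example.com/x.png', base).href);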

CodePudding user response:

For scraping, I believe it is best to use a headless browser such as Puppeteer; some websites block plain AJAX/HTTP requests.

Here is an example of how to use Puppeteer to do what you wanted:

  1. Get all a-tags and their corresponding hrefs

  2. Go to each link you retrieved and collect all the image links

    const puppeteer = require("puppeteer");
    
    (async () => {
    let browser;
    
    async function initialisePage(link) {
        const page = await browser.newPage();
        // page.setDefaultNavigationTimeout(10000);
        // page.setDefaultTimeout(10000);
        await page.setExtraHTTPHeaders({
            'Accept-Language': 'en'
        });
        await page.setGeolocation({
            latitude: 40.75073264981463,
            longitude: -73.9859968851446
        });
        await page.setViewport({ width: 1920, height: 1080});
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36');
        // register before navigation so the language spoof applies to the page being loaded
        await page.evaluateOnNewDocument(() => {
            Object.defineProperty(navigator, "language", {
                get: function() {
                    return "en";
                }
            });
            Object.defineProperty(navigator, "languages", {
                get: function() {
                    return ["en"];
                }
            });
        });

        await page.goto(link);
    
        const viewPortHeight = await page.evaluate(() => {
            return window.document.documentElement.scrollHeight;
        });
    
        await page.setViewport({
            width: 1920,
            height: Math.round(viewPortHeight)
        });
    
        try {
            await page.waitForNetworkIdle(); // wait till all requests have been finished, sometimes websites might fetch data using fetch, ajax and what not
        }
        catch (e) {
            console.error(e);
        }
        //accept cookies
        await page.evaluate(_ => { // script to try to accept cookies, maybe this accept cookies blocks the content
            const iframes = [...document.getElementsByTagName("iframe")];
            for (let iframe of iframes) {
                iframe.remove();
            }
            function xcc_contains(selector, text) {
                const elements = document.querySelectorAll(selector);
                return Array.prototype.filter.call(elements, function(element){
                    return RegExp(text, "i").test(element.textContent.trim());
                });
            }
            const _xcc = xcc_contains('a, button', '^(Accept All|Accept|I understand|Agree|Okay|Agree & continue|OK)$');
            if (_xcc != null && _xcc.length !== 0) { _xcc[0].click(); }
        });
    
        /*try { // remove this block if you are sure there are no redirects after accepting cookies
            await page.waitForNavigation({
                timeout: 1000 // only wait 1 seconds because it might not redirect you
            }); // because the accept cookies click might redirect you
        }
        catch (e) {
            console.error("wait nav error", e);
        }*/
    
        return page;
    }
    
    try {
        browser = await puppeteer.launch({
            headless: true,
            args: ['--lang=en', '--accept-lang=en', '--no-sandbox']
        });
    
        const page = await initialisePage('https://nu.edu.pk/');
    
        const links = await page.evaluate(() => { // get all hrefs from all a tags
            return [...new Set([...document.getElementsByTagName("a")].filter(value => value.href).map(value => value.href))]; // new set because you have to make sure it's unique
        });
    
        let images = [];
    
        console.log("Getting images for ", links.length, " links...");
    
        for (let link of links) {
            console.log("Getting images for ", link);
            try {
                const page = await initialisePage(link);
                const imageLinks = await page.evaluate(() => { // get all image links for the current a tag
                    return [...document.getElementsByTagName("img")].filter(value => value.src).map(value => value.src);
                });
                images.push(...imageLinks);
                console.log(imageLinks);
    
                await page.close();
            }
            catch (e) {
                console.error(e);
            }
    
        }
    
        images = new Set(images); // make the imageLinks unique
        console.log(images)
        await page.close();
    }
    catch (e) {
        console.error(e);
        throw e;
    }
    finally {
        browser && await browser.close();
    }
    })();
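
If you also want to write the collected links to a file, the way the original snippet did with ImagesLink.txt, a minimal sketch (assuming the images Set built by the loop above, and that the whole script is saved as, say, script.js) would be:

    const fs = require('fs');

    // `images` is the Set of image URLs collected above; write one URL per line.
    fs.writeFileSync('ImagesLink.txt', [...images].join('\n'), 'utf-8');

Install Puppeteer with npm install puppeteer and run the script with node script.js.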
    

CodePudding user response:

The following code works, but there are issues:

  1. the .jpg extension is hardcoded
  2. it doesn't handle cyclic links
  3. it will crash on invalid links
  4. you will need to async/await or promisify the code
  5. you will have to add extra logic to clean up URLs

If you run the code below, it will break after a while; a sketch addressing points 1 and 2 follows the code.

const cheerio = require('cheerio');
const request = require('request');
const fs = require('fs');

const url = 'https://nu.edu.pk/';

async function downloadImages(url) {
  request(url, (error, response, html) => {
    if (!error && response.statusCode == 200) {
      const $ = cheerio.load(html);

      const images = $('img');

      images.each((i, image) => {
        const src = $(image).attr('src');
        console.log({ src });
        const fileName = `IMAGE_NAME-${i}.jpg`; // issue 1: the .jpg extension is hardcoded
        const imgPath = `${url}${src}`;         // issue 5: naive concatenation, no URL clean-up
        console.log({ imgPath });
        request(imgPath).pipe(fs.createWriteStream(fileName));
      });

      const links = $('a');

      links.each((i, link) => {
        const href = $(link).attr('href');
        downloadImages(href); // issues 2 & 3: no cycle detection, will crash on invalid hrefs
      });
    }
  });
}

downloadImages(url);
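
One way to address points 1 and 2 from the list above, as a sketch only (it reuses the url constant and the structure of the snippet): keep a visited set so cyclic links are skipped, and derive the file extension from the image URL instead of hardcoding .jpg.

    const path = require('path');

    const visited = new Set();

    // Skip non-http links and links that were already crawled (point 2; the
    // protocol check also filters out some of the invalid links from point 3).
    function shouldCrawl(link) {
      if (!link || !link.startsWith('http')) return false;
      if (visited.has(link)) return false;
      visited.add(link);
      return true;
    }

    // Build a file name whose extension comes from the image URL (point 1),
    // falling back to .jpg when the URL has no extension.
    function fileNameFor(src, index) {
      const ext = path.extname(new URL(src, url).pathname) || '.jpg';
      return `IMAGE_NAME-${index}${ext}`;
    }

    // Inside links.each(...), guard the recursion:
    //   if (shouldCrawl(href)) downloadImages(href);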