Web scraping using Node.js


I need to display the images of a whole website. First I got the hrefs of all anchor tags on the site, then tried to load the images from those linked pages, but it failed because of a loop error. A result is displayed, but it does not match the images on the website.



loginrouter.get('/images', (req, res) => {
  var WriteStream = fs.createWriteStream("ImagesLink.txt", "UTF-8");
  request('https://nu.edu.pk/', (err, resp, html) => {

      if (!err && resp.statusCode == 200) {
          console.log("Request was success ");

          const $ = cherio.load(html);

          $("a").each((index, datalinks) => {
              var Links = [];

              var anch = $(datalinks).attr('href');
              var baseUrl = 'https://nu.edu.pk';
              Links = baseUrl + anch;

              Array.of(Links).forEach(Links => {
                  request(Links, (err, resp, html1) => {
                      console.log("Links areeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee/////////////////////////", Links)
                      $("img").each((index2, imglinks) => {
                          var img2 = $(imglinks).attr('src');
                          var url = 'https://nu.edu.pk';
                          const Links2 = url + img2;
                          console.log(Links2)
                      })
                  })
              });

          });
      }
      else {
          console.log("Request Failed ");
      }
  });

})

`Links` here represents the anchor tag hrefs, and the plain links print the image links, but the image links are incomplete and do not correspond to the main images of the linked (href) pages.
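
One likely reason the links come out incomplete is that concatenating the base URL with a relative href or src does not handle paths like ./, ../ or links that are already absolute. A small sketch (not part of the original code) using Node's built-in URL class to resolve a link against the page it was found on; the file names are only placeholders:

    const { URL } = require('url'); // URL is also available as a global in modern Node

    const base = 'https://nu.edu.pk/';

    // Relative paths are resolved against the base URL.
    console.log(new URL('/images/logo.png', base).href); // https://nu.edu.pk/images/logo.png
    console.log(new URL('campus.jpg', base).href);       // https://nu.edu.pk/campus.jpg

    // Already-absolute URLs are left unchanged instead of being double-prefixed.
    console.log(new URL('https://cdn.example.com/x.png', base).href);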

CodePudding user response:

For scraping, I believe it is best to use a headless browser such as Puppeteer; some websites block plain AJAX/HTTP requests.

Here is an example of how to use Puppeteer to do what you wanted:

  1. Get all a-tags and their corresponding hrefs

  2. Go to each link you retrieved and collect all the image links

    const puppeteer = require("puppeteer");
    
    (async () => {
    let browser;
    
    async function initialisePage(link) {
        const page = await browser.newPage();
        // page.setDefaultNavigationTimeout(10000);
        // page.setDefaultTimeout(10000);
        await page.setExtraHTTPHeaders({
            'Accept-Language': 'en'
        });
        await page.setGeolocation({
            latitude: 40.75073264981463,
            longitude: -73.9859968851446
        });
        await page.setViewport({ width: 1920, height: 1080});
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36');
        // register before navigation so the language spoof applies to the page being loaded
        await page.evaluateOnNewDocument(() => {
            Object.defineProperty(navigator, "language", {
                get: function() {
                    return "en";
                }
            });
            Object.defineProperty(navigator, "languages", {
                get: function() {
                    return ["en"];
                }
            });
        });

        await page.goto(link);
    
        const viewPortHeight = await page.evaluate(() => {
            return window.document.documentElement.scrollHeight;
        });
    
        await page.setViewport({
            width: 1920,
            height: Math.round(viewPortHeight)
        });
    
        try {
            await page.waitForNetworkIdle(); // wait till all requests have been finished, sometimes websites might fetch data using fetch, ajax and what not
        }
        catch (e) {
            console.error(e);
        }
        //accept cookies
        await page.evaluate(_ => { // script to try to accept cookies, maybe this accept cookies blocks the content
            const iframes = [...document.getElementsByTagName("iframe")];
            for (let iframe of iframes) {
                iframe.remove();
            }
            function xcc_contains(selector, text) {
                const elements = document.querySelectorAll(selector);
                return Array.prototype.filter.call(elements, function(element){
                    return RegExp(text, "i").test(element.textContent.trim());
                });
            }
            const _xcc = xcc_contains('a, button', '^(Accept All|Accept|I understand|Agree|Okay|Agree & continue|OK)$');
            if (_xcc != null && _xcc.length !== 0) { _xcc[0].click(); }
        });
    
        /*try { // remove this block if you are sure there are no redirects after accepting cookies
            await page.waitForNavigation({
                timeout: 1000 // only wait 1 seconds because it might not redirect you
            }); // because the accept cookies click might redirect you
        }
        catch (e) {
            console.error("wait nav error", e);
        }*/
    
        return page;
    }
    
    try {
        browser = await puppeteer.launch({
            headless: true,
            args: ['--lang=en', '--accept-lang=en', '--no-sandbox']
        });
    
        const page = await initialisePage('https://nu.edu.pk/');
    
        const links = await page.evaluate(() => { // get all hrefs from all a tags
            return [...new Set([...document.getElementsByTagName("a")].filter(value => value.href).map(value => value.href))]; // new set because you have to make sure it's unique
        });
    
        let images = [];
    
        console.log("Getting images for ", links.length, " links...");
    
        for (let link of links) {
            console.log("Getting images for ", link);
            try {
                const page = await initialisePage(link);
                const imageLinks = await page.evaluate(() => { // get all image links for the current a tag
                    return [...document.getElementsByTagName("img")].filter(value => value.src).map(value => value.src);
                });
                images.push(...imageLinks);
                console.log(imageLinks);
    
                await page.close();
            }
            catch (e) {
                console.error(e);
            }
    
        }
    
        images = new Set(images); // make the imageLinks unique
        console.log(images)
        await page.close();
    }
    catch (e) {
        console.error(e);
        throw e;
    }
    finally {
        browser && await browser.close();
    }
    })();
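
If you also want to write the collected links to a file, the way the original snippet did with ImagesLink.txt, a minimal sketch (assuming the images Set built by the loop above, and that the whole script is saved as, say, script.js) would be:

    const fs = require('fs');

    // `images` is the Set of image URLs collected above; write one URL per line.
    fs.writeFileSync('ImagesLink.txt', [...images].join('\n'), 'utf-8');

Install Puppeteer with npm install puppeteer and run the script with node script.js.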
    

CodePudding user response:

The following code works, but there are issues:

  1. the .jpg extension is hardcoded
  2. it doesn't handle cyclic links
  3. it will crash on invalid links
  4. you will need to async/await or promisify the code
  5. you will have to add extra logic to clean up URLs

If you run the code below, it will break after a while; a sketch addressing points 1 and 2 follows the code.

const cheerio = require('cheerio');
const request = require('request');
const fs = require('fs');

const url = 'https://nu.edu.pk/';

async function downloadImages(url) {
  request(url, (error, response, html) => {
    if (!error && response.statusCode == 200) {
      const $ = cheerio.load(html);

      const images = $('img');

      images.each((i, image) => {
        const src = $(image).attr('src');
        console.log({ src });
        const fileName = `IMAGE_NAME-${i}.jpg`; // issue 1: the .jpg extension is hardcoded
        const imgPath = `${url}${src}`;         // issue 5: naive concatenation, no URL clean-up
        console.log({ imgPath });
        request(imgPath).pipe(fs.createWriteStream(fileName));
      });

      const links = $('a');

      links.each((i, link) => {
        const href = $(link).attr('href');
        downloadImages(href); // issues 2 & 3: no cycle detection, will crash on invalid hrefs
      });
    }
  });
}

downloadImages(url);
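
One way to address points 1 and 2 from the list above, as a sketch only (it reuses the url constant and the structure of the snippet): keep a visited set so cyclic links are skipped, and derive the file extension from the image URL instead of hardcoding .jpg.

    const path = require('path');

    const visited = new Set();

    // Skip non-http links and links that were already crawled (point 2; the
    // protocol check also filters out some of the invalid links from point 3).
    function shouldCrawl(link) {
      if (!link || !link.startsWith('http')) return false;
      if (visited.has(link)) return false;
      visited.add(link);
      return true;
    }

    // Build a file name whose extension comes from the image URL (point 1),
    // falling back to .jpg when the URL has no extension.
    function fileNameFor(src, index) {
      const ext = path.extname(new URL(src, url).pathname) || '.jpg';
      return `IMAGE_NAME-${index}${ext}`;
    }

    // Inside links.each(...), guard the recursion:
    //   if (shouldCrawl(href)) downloadImages(href);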