Home > Software design >  I'm trying to scrape data from a website and getting back basic HTML with JS functions in the body
I'm trying to scrape data from a website and getting back basic HTML with JS functions in the body

Time:12-26

Hi everyone,

I'm playing around with Node.js and the cheerio package as part of my Node.js learning, and I'm trying to build a web scraper that will get the title and the price of an item from a shopping site. However, when I console.log the html variable, it contains only a basic HTML structure with some JS functions that seem to be trying to prevent the scraping.

my code:

const needle = require('needle')
const http = require('http')
const cheerio = require("cheerio")

// Download the product page and, on success, print the raw HTML and re-serve
// it on http://localhost:3000 so it can be inspected in a browser.
//
// Note: needle returns the document exactly as the server sent it. Anything
// the page builds later with client-side JavaScript is not present in `html`.
needle.get('https://ksp.co.il/web/item/130984', (error, response, html) => {
    // Report failures explicitly instead of silently doing nothing.
    if (error) {
        console.error('Request failed:', error);
        return;
    }
    if (response.statusCode !== 200) {
        console.error(`Unexpected status code: ${response.statusCode}`);
        return;
    }

    // Parsed document; not queried yet - kept for the title/price selectors
    // that will be added once the markup actually contains them.
    const $ = cheerio.load(html);

    console.log(html);

    // Serve the downloaded markup unchanged for every incoming request.
    http.createServer((req, res) => {
        res.writeHead(200, { 'Content-Type': 'text/html' });
        res.end(html);
    }).listen(3000);
});

I guess it's some kind of protection layer against scrapers, but this is what I get as a result:

<html lang="he">

<head>
    <meta charset="utf-8" />
    <link rel="icon" id="header-icon" href="/web/favicon.ico">
    <link rel="canonical" id="header-canonical">
    <meta name="viewport" content="width=device-width,initial-scale=1" />
    <meta name="description" content="מעל 38,000 מוצרים: מחשבים סלולר, בשמים, למטבח, למשרד טיפוח, פארם, צעצועים, נעלים ומיזוג" />
    <link rel="manifest" href="/web/manifest.json" />
    <script src='https://ksp.co.il/_cache/dictionary_site_only/all.js?ts=1640275687'></script>
    <script src="/web/encoding.min.js"></script>
    <script>
        ! function() {
            if ("function" == typeof window.CustomEvent) return !1;

            function t(t, e) {
                e = e || {
                    bubbles: !1,
                    cancelable: !1,
                    detail: void 0
                };
                var n = document.createEvent("CustomEvent");
                return n.initCustomEvent(t, e.bubbles, e.cancelable, e.detail), n
            }
            t.prototype = window.Event.prototype, window.CustomEvent = t
        }()
        
        

    </script>
    <script src='https://ksp.co.il/_cache/menu_dev/menu.js?ts=1640280140'></script>
    <style>
        .lang_he svg.revert {
            transform: scaleX(-1)
        }

        * {
            box-sizing: border-box
        }

        a:visited {
            color: inherit
        }

    </style>
    <title>KSP</title>
    <link rel="preload" href="/web/fonts/Assistant/w300en.woff2" as="font" crossorigin="anonymous">
    <link rel="preload" href="/web/fonts/Assistant/w400en.woff2" as="font" crossorigin="anonymous">
    <link rel="preload" href="/web/fonts/Assistant/w600en.woff2" as="font" crossorigin="anonymous">
    <link rel="preload" href="/web/fonts/Assistant/w700en.woff2" as="font" crossorigin="anonymous">
    <link rel="preload" href="/web/fonts/Assistant/w800en.woff2" as="font" crossorigin="anonymous">
    <link rel="preload" href="/web/fonts/Assistant/w300he.woff2" as="font" crossorigin="anonymous">
    <link rel="preload" href="/web/fonts/Assistant/w400he.woff2" as="font" crossorigin="anonymous">
    <link rel="preload" href="/web/fonts/Assistant/w600he.woff2" as="font" crossorigin="anonymous">
    <link rel="preload" href="/web/fonts/Assistant/w700he.woff2" as="font" crossorigin="anonymous">
    <link rel="preload" href="/web/fonts/Assistant/w800he.woff2" as="font" crossorigin="anonymous">
    <link rel="stylesheet" href="/web/fonts/Assistant/index.css">
    <script>
        ! function(e, t, a, n, g) {
            e[n] = e[n] || [], e[n].push({
                "gtm.start": (new Date).getTime(),
                event: "gtm.js"
            });
            var m = t.getElementsByTagName(a)[0],
                r = t.createElement(a);
            r.async = !0, r.src = "https://www.googletagmanager.com/gtm.js?id=GTM-59D9ZCV", m.parentNode.insertBefore(r, m)
        }(window, document, "script", "dataLayer")

    </script>
    <link href="/web/static/css/2.f825bd5a.chunk.css" rel="stylesheet">
    <link href="/web/static/css/main.4e4460ee.chunk.css" rel="stylesheet">
</head>

<body style="overflow-x:hidden"><noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-59D9ZCV" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript><noscript>You need to enable JavaScript to run this app.</noscript>
    <div>
        <div id="root"></div>
    </div>
    <script async src="https://www.googletagmanager.com/gtag/js?id=UA-109261-1"></script>
    <script async src="https://www.googletagmanager.com/gtag/js?id=AW-1032006858"></script>
    <script>
        function gtag() {
            dataLayer.push(arguments)
        }
        window.dataLayer = window.dataLayer || [], gtag("js", new Date)

    </script>
    <script>
        ! function(e, t, n, c, o, a, f) {
            e.fbq || (o = e.fbq = function() {
                o.callMethod ? o.callMethod.apply(o, arguments) : o.queue.push(arguments)
            }, e._fbq || (e._fbq = o), o.push = o, o.loaded = !0, o.version = "2.0", o.queue = [], (a = t.createElement(n)).async = !0, a.src = "https://connect.facebook.net/en_US/fbevents.js", (f = t.getElementsByTagName(n)[0]).parentNode.insertBefore(a, f))
        }(window, document, "script"), fbq("init", "1179615532183839"), fbq("track", "PageView")

    </script><noscript><img height="1" width="1" style="display:none" src="https://www.facebook.com/tr?id=1179615532183839&ev=PageView&noscript=1" /></noscript>
    <script>
        ! function(e) {
            function r(r) {
                for (var n, l, f = r[0], i = r[1], a = r[2], p = 0, s = []; p < f.length; p++) l = f[p], Object.prototype.hasOwnProperty.call(o, l) && o[l] && s.push(o[l][0]), o[l] = 0;
                for (n in i) Object.prototype.hasOwnProperty.call(i, n) && (e[n] = i[n]);
                for (c && c(r); s.length;) s.shift()();
                return u.push.apply(u, a || []), t()
            }

            function t() {
                for (var e, r = 0; r < u.length; r++) {
                    for (var t = u[r], n = !0, f = 1; f < t.length; f++) {
                        var i = t[f];
                        0 !== o[i] && (n = !1)
                    }
                    n && (u.splice(r--, 1), e = l(l.s = t[0]))
                }
                return e
            }
            var n = {},
                o = {
                    1: 0
                },
                u = [];

            function l(r) {
                if (n[r]) return n[r].exports;
                var t = n[r] = {
                    i: r,
                    l: !1,
                    exports: {}
                };
                return e[r].call(t.exports, t, t.exports, l), t.l = !0, t.exports
            }
            l.m = e, l.c = n, l.d = function(e, r, t) {
                l.o(e, r) || Object.defineProperty(e, r, {
                    enumerable: !0,
                    get: t
                })
            }, l.r = function(e) {
                "undefined" != typeof Symbol && Symbol.toStringTag && Object.defineProperty(e, Symbol.toStringTag, {
                    value: "Module"
                }), Object.defineProperty(e, "__esModule", {
                    value: !0
                })
            }, l.t = function(e, r) {
                if (1 & r && (e = l(e)), 8 & r) return e;
                if (4 & r && "object" == typeof e && e && e.__esModule) return e;
                var t = Object.create(null);
                if (l.r(t), Object.defineProperty(t, "default", {
                        enumerable: !0,
                        value: e
                    }), 2 & r && "string" != typeof e)
                    for (var n in e) l.d(t, n, function(r) {
                        return e[r]
                    }.bind(null, n));
                return t
            }, l.n = function(e) {
                var r = e && e.__esModule ? function() {
                    return e.default
                } : function() {
                    return e
                };
                return l.d(r, "a", r), r
            }, l.o = function(e, r) {
                return Object.prototype.hasOwnProperty.call(e, r)
            }, l.p = "/web/";
            var f = this.webpackJsonpcode = this.webpackJsonpcode || [],
                i = f.push.bind(f);
            f.push = r, f = f.slice();
            for (var a = 0; a < f.length; a++) r(f[a]);
            var c = i;
            t()
        }([])

    </script>
    <script src="/web/static/js/2.159a3d73.chunk.js"></script>
    <script src="/web/static/js/main.1ab08410.chunk.js"></script>
</body>

</html>

Any idea how I can overcome this? Thanks, everyone.

CodePudding user response:

This is likely not scraper protection. Instead, the site is probably using a web framework that loads the viewable data and DOM elements only after the JS has run: note that the HTML you received contains just an empty <div id="root"></div> and a "You need to enable JavaScript to run this app." message. The easiest way to get past this is to use a library like puppeteer, which loads the site and processes it the way a real browser would. Here is a basic example of what you might want:

const puppeteer = require('puppeteer');

// Open the page in a headless browser so its client-side JavaScript runs,
// then read data from the rendered DOM.
(async () => {
  const browser = await puppeteer.launch();

  // try/finally guarantees the browser process is closed even if navigation
  // or one of the queries below throws.
  try {
    const page = await browser.newPage();

    // Wait until the network is idle so content rendered by the page's
    // scripts has had a chance to appear before querying it.
    await page.goto('[the full URL you want to scrape]', {
      waitUntil: 'networkidle0',
    });

    // once the page has loaded, you can find data in a few ways:

    // 1: querying
    const elements = await page.$$("[any JS selector]");

    // 2: evaluate
    const elements1 = await page.evaluate(() => {
      // run any code on the site and have its result returned to you
    });

    // 3: text
    const wholePage = await page.evaluate(() => document.querySelector("*").outerHTML);

    // this gives you the rendered HTML of the whole page as a string,
    // which you can then put into cheerio or any parser
    // and use the way you were before
  } finally {
    await browser.close();
  }
})().catch((err) => {
  // Surface failures instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});

You can read more in the puppeteer documentation, both about puppeteer in general and about method 1 (page.$$), method 2 and method 3 (both page.evaluate).

  • Related