I use the `crawler` package, which has a built-in Cheerio parser, inside my Node.js project.
My crawler starts with a function like this (following the example in the docs):
// Crawl a page with node-crawler and save two snapshots of the HTML so they
// can be compared: one re-serialized through Cheerio, one as the raw body.
const c = new Crawler({
  maxConnections: 10,
  callback: (error, res, done) => {
    if (error) {
      // BUG FIX: the original threw `err`, which is undefined here —
      // the callback parameter is named `error`.
      throw error;
    } else {
      const $ = res.$; // Cheerio instance bound to the fetched document

      // NOTE: $("*") matches EVERY element, including nested ones, so
      // serializing the whole match set repeats inner markup once per
      // enclosing ancestor — that is why this output is several times
      // larger than the raw body. Use $.html() to serialize the document once.
      const content_test1 = $("*").toString();

      // Raw response body as received over the wire.
      const content_test2 = res.body;

      // BUG FIX: write to two DISTINCT files — the original wrote both
      // payloads to the same `path`, so the second async write clobbered
      // the first and the files could not be compared.
      fs.writeFile(`${path}.test1.html`, content_test1, (err) => {
        err ? console.log(err) : console.log("done");
      });
      fs.writeFile(`${path}.test2.html`, content_test2, (err) => {
        err ? console.log(err) : console.log("done");
      });
    }
    done(); // signal node-crawler that this task is finished
  },
});

// queue() enqueues the URL for crawling (its return value is not a browser).
c.queue('https://someUrl.com');
The problem is that the file written from "content_test1" is about 4x larger than the file written from "content_test2", and most of the HTML seems to be duplicated. So where am I going wrong?
regards.
Charly
CodePudding user response:
$("*")
selects all elements in the document. That includes the nested ones. Here
<div>Outer <div>Middle <div>Inner</div></div></div>
it will select
<div>Outer <div>Middle <div>Inner</div></div></div>
<div>Middle <div>Inner</div></div>
<div>Inner</div>
Serializing every match therefore repeats nested markup once per enclosing ancestor. To get the document exactly once, use `$.html()` (or just `res.body`) instead of `$("*").toString()`.