I am trying to download around 20,000 images from a website by scraping it.
(I am authorized by the owner)
The path of the image is composed like this:
VolumeId/ChapterId/PageId.jpg
There are around 100 volumes, and every volume has x chapters, and every chapter has y pages.
In the database I have stored the number of chapters for every volume, but I don't have the number of pages, so I have to request each image URL and check whether it exists.
I know for sure that there are fewer than 30 pages per chapter, so I did something like this:
// Probe pages 0..29 of a chapter and save every image that exists.
// NOTE: fetch() is not awaited, so the loop starts all 30 requests before any
// response arrives; `exists` is only flipped later, inside the callbacks, and
// therefore cannot stop the loop early.
let exists = true;
for (let i = 0; i < 30 && exists; i++) {
  fetch(`imgUrl/${i}.jpg`)
    .then(data => {
      if (data.ok)
        return data.arrayBuffer();
      else
        exists = false;
    })
    .then(arrayBuffer => {
      if (exists) {
        let buffer = Buffer.from(arrayBuffer);
        // Make sure the target directory exists before writing into it.
        if (!fs.existsSync(path.join(__dirname, imgPath))) {
          fs.mkdirSync(path.join(__dirname, imgPath), {
            recursive: true,
          });
        }
        fs.writeFile(
          path.join(__dirname, imgPath + "/" + img + ".jpg"),
          buffer,
          (err) => {
            if (err) throw err;
          }
        );
      }
    });
}
The problem:
The problem is that the loop does not wait for the image to be fetched and saved locally.
I have tried with async/await and promises (I think I have implemented them wrong)
Is there a better way to download a large quantity of data? Maybe with streams?
CodePudding user response:
Implementing this with plain async/await while also enforcing the "exists" condition between iterations can be a little complicated. I suggest using a class that implements an async iterator (refer to the official documentation for more details). The following code achieves what you are looking for. Note: the snippet you provided doesn't show where "imgPath" comes from, so adjust my code accordingly:
/**
 * Async-iterable image downloader.
 *
 * Iterating an instance with `for await` fetches the URLs strictly one after
 * another and writes each response body to disk. Iteration ends when the URLs
 * run out, when `maxPages` attempts have been made, or at the first response
 * whose status is not 200 (the "page does not exist" signal).
 *
 * Each step yields a string: the path of the file that was written, or, if the
 * fetch or the write threw, the error message (iteration then continues with
 * the next URL).
 */
class FetchImages {
  /**
   * @param {Iterable<string>} urls - Image URLs, in the order they are tried.
   * @param {object} [options]
   * @param {string} [options.outputDir] - Directory the images are saved in.
   *   When omitted, `path.join(__dirname, imgPath)` is used, so `imgPath` must
   *   then be defined by the surrounding script.
   * @param {number} [options.maxPages=30] - Maximum number of download
   *   attempts per iteration.
   */
  constructor(urls, { outputDir, maxPages = 30 } = {}) {
    this.urls = urls;
    this.outputDir = outputDir;
    this.maxPages = maxPages;
  }

  /**
   * @returns {{next: function(): Promise<{done: boolean, value: (string|undefined)}>}}
   *   A fresh async iterator; each iterator keeps its own attempt counter.
   */
  [Symbol.asyncIterator]() {
    const urlsIterator = this.urls[Symbol.iterator]();
    const { maxPages } = this;
    // Resolved once up front so a missing `imgPath` fails immediately instead
    // of being reported as a per-URL error message.
    const targetDir = this.outputDir === undefined
      ? path.join(__dirname, imgPath)
      : this.outputDir;
    // Kept in the closure: inside next(), `this` is the iterator object, not
    // the FetchImages instance.
    let attempts = 0;

    return {
      async next() {
        if (attempts >= maxPages) {
          return { done: true, value: undefined };
        }
        const { done, value: url } = urlsIterator.next();
        if (done) {
          return { done: true, value: undefined };
        }
        attempts += 1;

        try {
          const response = await fetch(url);
          if (response.status !== 200) {
            // Equivalent to "exists = false": stop at the first missing page.
            return { done: true, value: undefined };
          }
          const buffer = Buffer.from(await response.arrayBuffer());

          // Name the file after the last URL path segment (e.g. "12.jpg") so
          // every page gets its own file inside targetDir.
          const { pathname } = new URL(url, 'http://localhost');
          const fileName = path.posix.basename(pathname);
          if (fileName === '') {
            throw new Error(`Cannot derive a file name from ${url}`);
          }
          const filePath = path.join(targetDir, fileName);

          // `recursive: true` makes mkdir a no-op when the directory exists.
          await fs.promises.mkdir(targetDir, { recursive: true });
          await fs.promises.writeFile(filePath, buffer);
          return { done: false, value: filePath };
        } catch (err) {
          // Best effort: report the failure and move on to the next URL.
          return { done: false, value: err.message };
        }
      },
    };
  }
}
// Entry point: walk the downloader with `for await`, so each step is awaited
// before the next one starts, and log whatever each step yields.
(async () => {
  const downloader = new FetchImages(urls);
  for await (const outcome of downloader) {
    // status of the individual fetch
    console.log(outcome);
  }
})();