I'm trying to download a .xlsx file from a website through web scraping, I've done the entire process until I access the temporary url that the website generates for the download.
When I open the file url in the browser, the download starts automatically (as shown in the image). excel file download
The problem is that I need to parse this file to send later to my front-end. When I try to create the file using fs.createWriteStream('result.xlsx') and later populate it with res.pipe(fileStream); the file is always generated empty.
Here's my full code:
const https = require("https");
const fs = require("fs");
const path = require("path");
const xlsx = require("node-xlsx");
function download(url, callback) {
const filename = path.basename(url);
const req = https.get(url, function (res) {
const fileStream = fs.createWriteStream("result.xlsx");
res.pipe(fileStream);
const obj = xlsx.parse('result.xlsx');
callback(obj[0]);
fileStream.on("error", function (err) {
console.log("Error writting to the stream.");
console.log(err);
});
fileStream.on("close", function () {
callback(filename);
});
fileStream.on("finish", function () {
fileStream.close();
});
});
req.on("error", function (err) {
console.log("error downloading the file");
console.log(err);
});
}
module.exports.download = download;
<iframe name="sif1" sandbox="allow-forms allow-modals allow-scripts" frameborder="0"></iframe>
My questions are:
- Is it possible to parse this data into an array WITHOUT needing to save to a physical file? If yes, how?
- If I can't parse the data without needing to populate a physical file, how can I download the spreadsheet and then read and parse the data later.
NOTE: I have already tested the rest of my download function with a valid file entered manually, everything is working perfectly. The only thing that isn't working is the data downloading and reading part of the spreadsheet.
CodePudding user response:
Is it possible to parse this data into an array WITHOUT needing to save to a physical file? Basically No (file from remote server). Except the server allowed you to do it live.
Your code is nearly right, except the order is wrong. You must callback after the writing is done, it will fix your empty file issue.
Here is how:
const https = require("https");
const fs = require("fs");
const path = require("path");
const xlsx = require("node-xlsx");
function download(url, callback) {
const filename = path.basename(url);
const req = https.get(url, function (res) {
const fileStream = fs.createWriteStream("result.xlsx");
res.pipe(fileStream);
fileStream.on("error", function (err) {
console.log("Error writting to the stream.");
console.log(err);
});
fileStream.on("close", function () {
const obj = xlsx.parse('result.xlsx');// or whatever you named it
callback(obj[0]);
});
fileStream.on("finish", function () {
fileStream.close();
});
});
req.on("error", function (err) {
console.log("error downloading the file");
console.log(err);
});
}
module.exports.download = download;
<iframe name="sif2" sandbox="allow-forms allow-modals allow-scripts" frameborder="0"></iframe>