I have parsed two .pdf files into two .json files to be able to read their content. For this I have used pdf2json (I've parsed each of the files separately):
const fs = require("fs")
const PDFParser = require("pdf2json")
// Parse ./pdf/first.pdf and dump the parsed structure to ./json/first.json.
const pdfParser = new PDFParser();

// Report parse failures; the failure detail is read from `parserError`.
pdfParser.on("pdfParser_dataError", (errData) => console.error(errData.parserError));

pdfParser.on("pdfParser_dataReady", (pdfData) => {
  fs.writeFile("./json/first.json", JSON.stringify(pdfData), (writeErr) => {
    // Surface write failures instead of silently dropping them.
    if (writeErr) {
      console.error(writeErr);
    }
  });
});

// Listeners are attached before loading so no event can be missed.
pdfParser.loadPDF("./pdf/first.pdf");
The structure of the first.json is like:
{
"Transcoder": "[email protected] [https://github.com/modesty/pdf2json]",
"Meta": {...}
"Pages": [
{
"Texts": [
{
"x": 3.95,
"y": 4.102,
"w": 3000.893,
"sw": 0.3606875,
"A": "left",
"R": [
{
"T": "Some Awesome Text",
"S": -1,
"TS": [
0,
330.679993,
0,
0
]
}
]
},
{
"x": 22.895,
"y": 4.05,
"w": 2782.003,
"sw": 0.3606875,
"A": "left",
"R": [
{
"T": "Some more important awesome text",
"S": -1,
"TS": [
0,
330.679993,
0,
0
]
}
]
},
{
"x": 38.755,
"y": 6.02,
"w": 2782.003,
"sw": 0.3606875,
"A": "left",
"R": [
{
"T": "",
"S": -1,
"TS": [
0,
330.679993,
0,
0
]
}
]
},
{
"x": 69.6868,
"y": 69.6868,
"w": 2782.003,
"sw": 0.3606875,
"A": "left",
"R": [
{
"T": "text I do not want",
"S": -1,
"TS": [
0,
330.679993,
0,
0
]
}
]
}
]
}
]
}
And the second, similar file second.json:
{
"Transcoder": "[email protected] [https://github.com/modesty/pdf2json]",
"Meta": {...}
"Pages": [
{
"Texts": [
{
"x": 3.85,
"y": 4.052,
"w": 3000.893,
"sw": 0.3606875,
"A": "left",
"R": [
{
"T": "Some Awesome Text",
"S": -1,
"TS": [
0,
330.679993,
0,
0
]
}
]
},
{
"x": 22.855,
"y": 4.12,
"w": 2782.003,
"sw": 0.3606875,
"A": "left",
"R": [
{
"T": "Some more important awesome text",
"S": -1,
"TS": [
0,
330.679993,
0,
0
]
}
]
},
{
"x": 38.855,
"y": 6.12,
"w": 2782.003,
"sw": 0.3606875,
"A": "left",
"R": [
{
"T": "",
"S": -1,
"TS": [
0,
330.679993,
0,
0
]
}
]
},
{
"x": 69.6969,
"y": 69.6969,
"w": 2782.003,
"sw": 0.3606875,
"A": "left",
"R": [
{
"T": "text I do not want",
"S": -1,
"TS": [
0,
330.679993,
0,
0
]
}
]
}
]
}
]
}
Please note that the "x" and "y" values differ by about 0.1 between the two files.
This is important because I would like to get the content of "T" from both files: "Some Awesome Text" and "Some more important awesome text".
The content of "T" can vary, so I cannot rely on it. I can only locate the right entry by checking whether its "x" and "y" fall within a given range.
Now my question is, how to achieve it?
My idea would be to write a nested for loop that checks whether "y" and "x" are in range, but how should I then get the "T"?
Solution (thanks to @timpa, and the rest of SO users!):
const PDFParser = require("pdf2json")
/**
 * Parse a PDF with pdf2json and hand back the parsed document.
 *
 * @param {string} [pdfPath="./pdf/first.pdf"] - Path of the PDF to load.
 * @returns {Promise<object>} Resolves with the payload of the
 *   "pdfParser_dataReady" event; rejects with an Error when the parser
 *   emits "pdfParser_dataError", so an awaiting caller never hangs.
 */
const extractPDF = (pdfPath = "./pdf/first.pdf") => {
  return new Promise((resolve, reject) => {
    const pdfParser = new PDFParser();

    pdfParser.on("pdfParser_dataError", (errData) => {
      // Prefer the `parserError` detail when present; always reject with an Error.
      const reason =
        errData && errData.parserError !== undefined ? errData.parserError : errData;
      reject(reason instanceof Error ? reason : new Error(String(reason)));
    });

    pdfParser.on("pdfParser_dataReady", (pdfData) => resolve(pdfData));

    // Start loading only after both listeners are attached.
    pdfParser.loadPDF(pdfPath);
  });
};
/**
 * Pick the text on the first page whose position falls inside a bounding box.
 *
 * Bounds are inclusive, so an entry sitting exactly on an edge (e.g. x = 3.85
 * with the default box) is matched. If several entries match, the last one
 * in document order wins.
 *
 * @param {object} jsonContent - Parsed pdf2json output; `Pages[0].Texts` is scanned.
 * @param {{xMin: number, xMax: number, yMin: number, yMax: number}} [range]
 *   Bounding box to match against `text.x` / `text.y`; all four bounds are
 *   required when supplied. Defaults to x in [3.85, 4.05], y in [4.002, 4.202].
 * @returns {{content?: string}} Object whose `content` is the decoded text of
 *   the matching entry, or an empty object when nothing matches.
 */
const getTargetContent = (
  jsonContent,
  range = { xMin: 3.85, xMax: 4.05, yMin: 4.002, yMax: 4.202 }
) => {
  const data = {};
  const firstPage = jsonContent && jsonContent.Pages && jsonContent.Pages[0];
  const texts = (firstPage && firstPage.Texts) || [];

  for (const text of texts) {
    const isInRange =
      range.xMin <= text.x &&
      text.x <= range.xMax &&
      range.yMin <= text.y &&
      text.y <= range.yMax;
    const firstRun = text.R && text.R[0];

    if (isInRange && firstRun) {
      // decodeURIComponent (unlike decodeURI) also decodes reserved
      // characters such as %3A, %2C and %26.
      data.content = decodeURIComponent(firstRun.T);
    }
  }

  return data;
};
/**
 * Entry point: parse the PDF, extract the targeted text and print it.
 *
 * @returns {Promise<void>} Settles once the extracted content has been logged.
 */
const main = async () => {
  const pdfContent = await extractPDF();
  const targetContent = getTargetContent(pdfContent);
  console.log(targetContent);
};

// Never leave the promise floating: report any failure and signal it via the exit code.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
CodePudding user response:
I don't think you even need a nested loop. Just do something like
// Visit each text item of the first page once.
for (const text of obj.Pages[0].Texts) {
  // check text.x, text.y, and text.R[0].T
}