Home > Enterprise >  Way to find content in a json file with node
Way to find content in a json file with node

Time:04-28

I have parsed two .pdf files into two .json files to be able to read their content. For this I have used pdf2json (I've parsed each of the files separately):

const fs = require("fs")
const PDFParser = require("pdf2json")

// Convert ./pdf/first.pdf into ./json/first.json using pdf2json.
const pdfParser = new PDFParser();

// Parsing failures are reported on stderr.
pdfParser.on("pdfParser_dataError", errData => console.error(errData.parserError));

// Once parsing has finished, persist the parsed document as JSON.
pdfParser.on("pdfParser_dataReady", pdfData => {
  fs.writeFile("./json/first.json", JSON.stringify(pdfData), err => {
    // Report a failed write (e.g. missing ./json directory) instead of ignoring it.
    if (err) console.error(err);
  });
});

pdfParser.loadPDF("./pdf/first.pdf");

The structure of the first.json is like:

{
  "Transcoder": "[email protected] [https://github.com/modesty/pdf2json]",
  "Meta": {...}
  "Pages": [
    {
      "Texts": [
        {
          "x": 3.95,
          "y": 4.102,
          "w": 3000.893,
          "sw": 0.3606875,
          "A": "left",
          "R": [
            {
              "T": "Some Awesome Text",
              "S": -1,
              "TS": [
                0,
                330.679993,
                0,
                0
              ]
            }
          ]
        },
        {
          "x": 22.895,
          "y": 4.05,
          "w": 2782.003,
          "sw": 0.3606875,
          "A": "left",
          "R": [
            {
              "T": "Some more important awesome text",
              "S": -1,
              "TS": [
                0,
                330.679993,
                0,
                0
              ]
            }
          ]
        },
        {
          "x": 38.755,
          "y": 6.02,
          "w": 2782.003,
          "sw": 0.3606875,
          "A": "left",
          "R": [
            {
              "T": "",
              "S": -1,
              "TS": [
                0,
                330.679993,
                0,
                0
              ]
            }
          ]
        },
        {
          "x": 69.6868,
          "y": 69.6868,
          "w": 2782.003,
          "sw": 0.3606875,
          "A": "left",
          "R": [
            {
              "T": "text I do not want",
              "S": -1,
              "TS": [
                0,
                330.679993,
                0,
                0
              ]
            }
          ]
        }
      ]
    }
  ]
}

And the second, similar file second.json:

{
  "Transcoder": "[email protected] [https://github.com/modesty/pdf2json]",
  "Meta": {...}
  "Pages": [
    {
      "Texts": [
        {
          "x": 3.85,
          "y": 4.052,
          "w": 3000.893,
          "sw": 0.3606875,
          "A": "left",
          "R": [
            {
              "T": "Some Awesome Text",
              "S": -1,
              "TS": [
                0,
                330.679993,
                0,
                0
              ]
            }
          ]
        },
        {
          "x": 22.855,
          "y": 4.12,
          "w": 2782.003,
          "sw": 0.3606875,
          "A": "left",
          "R": [
            {
              "T": "Some more important awesome text",
              "S": -1,
              "TS": [
                0,
                330.679993,
                0,
                0
              ]
            }
          ]
        },
        {
          "x": 38.855,
          "y": 6.12,
          "w": 2782.003,
          "sw": 0.3606875,
          "A": "left",
          "R": [
            {
              "T": "",
              "S": -1,
              "TS": [
                0,
                330.679993,
                0,
                0
              ]
            }
          ]
        },
        {
          "x": 69.6969,
          "y": 69.6969,
          "w": 2782.003,
          "sw": 0.3606875,
          "A": "left",
          "R": [
            {
              "T": "text I do not want",
              "S": -1,
              "TS": [
                0,
                330.679993,
                0,
                0
              ]
            }
          ]
        }
      ]
    }
  ]
}

Please note that the "x" and "y" values differ by about 0.1 between those files.
This is important since I would like to get the content of "T" from both files:
"Some Awesome Text" and "Some more important awesome text". The content of "T" can vary, so I cannot rely on it. I can only find the right entries by checking whether "x" and "y" are within range.

Now my question is, how to achieve it?
My idea would be to write a nested for loop that checks whether "y" and "x" are in range, but how would I then get the "T"?


Solution (thanks to @timpa, and the rest of SO users!):

const PDFParser = require("pdf2json")

/**
 * Parse a PDF with pdf2json and hand back the parsed document.
 *
 * @param {string} [pdfPath="./pdf/first.pdf"] - Path of the PDF file to load.
 * @returns {Promise<object>} Resolves with the payload of the
 *   "pdfParser_dataReady" event. Rejects with an Error when the parser emits
 *   "pdfParser_dataError", so an awaiting caller is never left hanging.
 */
const extractPDF = (pdfPath = "./pdf/first.pdf") => {
  return new Promise((resolve, reject) => {
    const pdfParser = new PDFParser();

    // Always reject with an Error instance, whatever shape the parser reports.
    const fail = reason => {
      reject(reason instanceof Error ? reason : new Error(String(reason)));
    };

    // Attach the listeners before loading starts so no event can be missed.
    pdfParser.on("pdfParser_dataError", errData => {
      fail(errData && errData.parserError ? errData.parserError : errData);
    });
    pdfParser.on("pdfParser_dataReady", (pdfData, err) => {
      if (err) return fail(err);
      resolve(pdfData);
    });

    pdfParser.loadPDF(pdfPath);
  });
};

/**
 * Find the text on the first page whose position lies inside a bounding box.
 *
 * Each entry of `Pages[0].Texts` is tested against the box; when several
 * entries match, the last one wins. The matched run's "T" value is
 * URI-decoded before it is returned.
 *
 * @param {object} jsonContent - Parsed document with a `Pages[0].Texts` array
 *   whose entries carry `x`, `y` and `R[0].T`.
 * @param {object} [bounds] - Inclusive box limits; any omitted limit falls
 *   back to its default.
 * @param {number} [bounds.xMin=3.85]
 * @param {number} [bounds.xMax=4.05]
 * @param {number} [bounds.yMin=4.002]
 * @param {number} [bounds.yMax=4.202]
 * @returns {{content?: string}} `{ content }` for a match, otherwise `{}`.
 */
const getTargetContent = (jsonContent, bounds = {}) => {
  const { xMin = 3.85, xMax = 4.05, yMin = 4.002, yMax = 4.202 } = bounds;
  const data = {};

  // Tolerate documents without pages or without a Texts array.
  const firstPage = jsonContent && jsonContent.Pages ? jsonContent.Pages[0] : undefined;
  const texts = firstPage && Array.isArray(firstPage.Texts) ? firstPage.Texts : [];

  for (const text of texts) {
    // Inclusive limits: a coordinate sitting exactly on the edge (x = 3.85) must match.
    const inBox =
      xMin <= text.x && text.x <= xMax &&
      yMin <= text.y && text.y <= yMax;
    const run = Array.isArray(text.R) ? text.R[0] : undefined;
    if (!inBox || !run || typeof run.T !== "string") continue;

    // decodeURIComponent also decodes reserved characters such as "%2C" and
    // "%3A", which decodeURI would leave escaped in the result.
    data.content = decodeURIComponent(run.T);
  }

  return data;
};

/**
 * Entry point: parse the PDF, pick the target text, and print it.
 *
 * @returns {Promise<void>}
 */
const main = async () => {
  const pdfContent = await extractPDF();
  const targetContent = getTargetContent(pdfContent);
  console.log(targetContent);
};

// Do not leave the promise floating: report any failure and signal it
// through the process exit code.
main().catch(err => {
  console.error(err);
  process.exitCode = 1;
});

CodePudding user response:

I don't think you even need a nested loop. Just do something like

// Visit every text entry of the first page in a single pass.
for (const text of obj.Pages[0].Texts) {
  // check text.x, text.y, and text.R[0].T
}
  • Related