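I am trying to use the iText7 library to extract text from a PDF, but for some reason I cannot split the pages into a list of strings with one string per page.
Instead, each entry also contains the text of all the previous pages, so I am getting a list like this: 1, 1 2, 1 2 3, 1 2 3 4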
/// <summary>
/// Text extracted from the PDF; filled by <c>ExtractTextFromPages</c> during construction.
/// </summary>
public List<string> PdfPages;

/// <summary>
/// Extraction strategy chosen at construction time: the caller's, or a
/// <see cref="SimpleTextExtractionStrategy"/> when none was supplied.
/// </summary>
private ITextExtractionStrategy _Strategy;

/// <summary>
/// Creates the extractor and immediately extracts the text of <paramref name="pdf"/>.
/// </summary>
/// <param name="pdf">Uploaded PDF file to read.</param>
/// <param name="strategy">
/// Optional text extraction strategy; when <c>null</c>, a
/// <see cref="SimpleTextExtractionStrategy"/> is used.
/// </param>
public PdfExtractor(IFormFile pdf, ITextExtractionStrategy? strategy = default)
{
    PdfPages = new List<string>();

    if (strategy is null)
    {
        _Strategy = new SimpleTextExtractionStrategy();
    }
    else
    {
        _Strategy = strategy;
    }

    // All extraction work happens here, so PdfPages is populated once the constructor returns.
    ExtractTextFromPages(pdf);
}
/// <summary>
/// Reads every page of the uploaded PDF and appends its text to <see cref="PdfPages"/>,
/// one entry per page, in page order.
/// </summary>
/// <param name="pdf">Uploaded PDF; its read stream is opened and disposed by this method.</param>
/// <remarks>
/// A new strategy instance of the same type as <c>_Strategy</c> is created for each page,
/// so that type must expose a public parameterless constructor
/// (otherwise <see cref="System.MissingMethodException"/> is thrown).
/// </remarks>
private void ExtractTextFromPages(IFormFile pdf)
{
    using (var stream = pdf.OpenReadStream())
    using (var reader = new PdfReader(stream))
    using (var pdfDoc = new PdfDocument(reader))
    {
        // Reusing one strategy object across pages makes every result also contain the
        // text of the pages processed before it ("1", "1 2", "1 2 3", ...), so only the
        // configured strategy's type is reused, never the instance.
        System.Type strategyType = _Strategy.GetType();

        // iText page numbers are 1-based: valid indices are 1..GetNumberOfPages() inclusive.
        int pageCount = pdfDoc.GetNumberOfPages();
        for (int index = 1; index <= pageCount; index++)
        {
            var pageStrategy = (ITextExtractionStrategy)System.Activator.CreateInstance(strategyType)!;
            string pageText = PdfTextExtractor.GetTextFromPage(pdfDoc.GetPage(index), pageStrategy);
            PdfPages.Add(pageText);
        }
    }
}
Does anyone know how to correct that?
CodePudding user response:
The problem, as @mkl mentioned in the comment below, was that I did not create a new ITextExtractionStrategy object for each page. Once I did that, everything worked like a charm, without the need to save files anywhere.
// The using scopes close the document, reader and stream even if extraction throws,
// so no explicit Close() calls are needed.
using (var stream = pdf.OpenReadStream())
using (var reader = new PdfReader(stream))
using (var pdfDoc = new PdfDocument(reader))
{
    // iText page numbers are 1-based, so the last valid index equals the page count.
    int pageCount = pdfDoc.GetNumberOfPages();
    for (int index = 1; index <= pageCount; index++)
    {
        // A fresh strategy per page: a reused instance would carry over the text of
        // the pages already processed into every later result.
        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
        string pageText = PdfTextExtractor.GetTextFromPage(pdfDoc.GetPage(index), strategy);
        PdfPages.Add(pageText);
    }
}