How to divide a DocumentFragment based on character offset-CodePudding

I have a string that (potentially) contains HTML tags.

I want to split it into smaller valid HTML strings based on (text) character length. The use case is essentially pagination. I know the length of text that can fit on a single page. So I want to divide the target string into "chunks" or pages based on that character length. But I need each of the resulting pages to contain valid HTML without unclosed tags, etc.

So for example:

// Illustrative example: the number of text characters that fit on one page.
const pageCharacterSize = 10;
const testString = 'some <strong>text with HTML</strong> tags';
// Placeholder only; implementing this function is what the question asks for.
function paginate(string, pageSize) { /* @TODO */ }
const pages = paginate(testString, pageCharacterSize);
console.log(pages);
// Desired output:
// ['some <strong>text </strong>', '<strong>with HTML</strong> ', 'tags']

I think this is possible to do with a DocumentFragment or Range, but I can't figure out how to slice the pages based on character offsets.

This MDN page has a demo that does something close to what I need. But it uses caretPositionFromPoint() which takes X, Y coordinates as arguments.

Update

For the purposes of clarity, here are the tests I'm working with:

import { expect, test } from 'vitest'
import paginate from './paginate'

// 1
test('it should chunk plain text', () => {
  // Each row is [case label, input string, page size, expected pages].
  // Rows are checked in order, so the first mismatch is the one reported.
  const cases = [
    ['a', 'aa bb cc dd ee', 2, ['aa', 'bb', 'cc', 'dd', 'ee']],
    ['b', 'a a b b c c', 3, ['a a', 'b b', 'c c']],
    ['c', 'aa aa bb bb cc cc', 5, ['aa aa', 'bb bb', 'cc cc']],
    ['d', 'aa bb cc', 4, ['aa', 'bb', 'cc']],
    ['e', 'a b c d e f g', 5, ['a b c', 'd e f', 'g']],
    ['f', 'aa bb cc', 7, ['aa bb', 'cc']],
  ];

  for (const [, input, size, wanted] of cases) {
    expect(paginate(input, size)).toStrictEqual(wanted);
  }
})

// 2
test('it should chunk an HTML string without stranding tags', () => {
  // Every page must carry its own balanced tags, even when an element
  // (<em> here) is spread over two pages.
  const actual = paginate('aa <strong>bb</strong> <em>cc dd</em>', 3);
  expect(actual).toStrictEqual([
    'aa',
    '<strong>bb</strong>',
    '<em>cc</em>',
    '<em>dd</em>',
  ]);
})

// 3
test('it should handle tags that straddle pages', () => {
  // A single element wrapping all the text is re-opened and re-closed on
  // each page.
  const actual = paginate('<strong>aa bb cc</strong>', 2);
  expect(actual).toStrictEqual([
    '<strong>aa</strong>',
    '<strong>bb</strong>',
    '<strong>cc</strong>',
  ]);
})

CodePudding user response：

Here is a solution that assumes and supports the following:

tags without attributes (you could tweak the regex to support that)
well-formed tags are assumed, i.e. no wrong nesting, no missing end tags, and no missing start tags
tags may be nested
tags are removed & later restored for proper characters per page count
page split is done by looking backwards for first space

/**
 * Splits an HTML string into pages of at most `pageSize` text characters,
 * keeping every page a balanced HTML fragment.
 *
 * Tags are stripped first so that only text counts towards the page size,
 * the text is split at whitespace, and the tags are then put back. Tags still
 * open at the end of a page are closed there and re-opened at the start of
 * the next page.
 *
 * Behaviour worth knowing:
 * - Only attribute-less tags such as `<b>` / `</b>` are recognised; anything
 *   else stays in the text and counts towards the page size.
 * - Tags are assumed to be well formed and properly nested.
 * - A word longer than `pageSize` is hard-split instead of being dropped.
 * - Whitespace around each page's text is removed before the tags are
 *   restored, so a page reads `<em>cc</em>` rather than ` <em>cc</em>`.
 * - Element pairs left without content (e.g. `<b></b>`) are removed, and
 *   pages that end up empty are omitted.
 *
 * @param {string} html - HTML string to paginate.
 * @param {number} pageSize - Maximum number of text characters per page.
 * @returns {string[]} The pages, in order; empty when there is no text.
 * @throws {RangeError} If `pageSize` is not a positive integer.
 */
function paginate(html, pageSize) {
  const size = Number(pageSize);
  if (!Number.isInteger(size) || size < 1) {
    throw new RangeError(`pageSize must be a positive integer, got: ${pageSize}`);
  }

  // Strip the tags, remembering each with its offset in the tag-free text.
  const tagsInfo = [];
  let tagOffset = 0; // total length of the tags removed so far
  const plainText = html.replace(/<\/?[a-z][a-z0-9]*>/gi, (tag, pos) => {
    tagsInfo.push({ tag, pos: pos - tagOffset });
    tagOffset += tag.length;
    return '';
  });

  // First alternative: up to `size` characters ending at a word boundary.
  // Second alternative: hard split, used only when a single word is longer
  // than `size`. Together they match at every position, so the chunks are
  // contiguous and the running offsets below stay aligned with `tagsInfo`.
  const splitRegex = new RegExp(
    `\\s*(?:[\\s\\S]{1,${size}}(?!\\S)|[\\s\\S]{1,${size}})`,
    'g'
  );
  // match() yields null when there is no text at all.
  const chunks = plainText.match(splitRegex) || [];

  const emptyPair = /<(\w+)><\/\1>/g;
  const openTags = []; // tags still open, carried over to the next page
  const pages = [];
  let pageOffset = 0; // offset of the current chunk in the tag-free text

  for (const chunk of chunks) {
    const nextOffset = pageOffset + chunk.length;
    const prefix = openTags.join('');
    const leading = chunk.search(/\S|$/); // count of leading whitespace chars
    let page = chunk.trim();
    const textLength = page.length;
    const pageTags = tagsInfo.filter(
      (info) => info.pos >= pageOffset && info.pos < nextOffset
    );

    // Restore tags right-to-left so earlier insert positions stay valid.
    // Positions inside the trimmed whitespace are clamped onto the text.
    for (let i = pageTags.length - 1; i >= 0; i -= 1) {
      const { tag, pos } = pageTags[i];
      const at = Math.min(Math.max(pos - pageOffset - leading, 0), textLength);
      page = page.substring(0, at) + tag + page.substring(at);
    }

    for (const { tag } of pageTags) {
      if (tag.startsWith('</')) {
        // A closing tag ends the innermost matching open tag.
        const index = openTags.lastIndexOf(tag.replace('</', '<'));
        if (index >= 0) {
          openTags.splice(index, 1);
        }
      } else {
        openTags.push(tag);
      }
    }
    pageOffset = nextOffset;

    const postfix = openTags
      .slice()
      .reverse()
      .map((tag) => tag.replace('<', '</'))
      .join('');

    // Removing an inner empty pair can expose an outer one
    // (<i><b></b></i>), so repeat until nothing changes.
    let result = prefix + page + postfix;
    let previous;
    do {
      previous = result;
      result = result.replace(emptyPair, '');
    } while (result !== previous);

    if (result !== '') {
      pages.push(result);
    }
  }
  return pages;
}

// Run paginate over a handful of sample inputs and print the pages for each.
for (const { str, size } of [
  { str: 'some <strong>text <i>with</i> HTML</strong> tags, and <i>some <b>nested tags</b> sould be <b>supported</b> as well</i>.', size: 16 },
  { str: 'a a b b c c', size: 3 },
  { str: 'aa aa bb bb cc cc', size: 5 },
  { str: 'aa bb cc', size: 4 },
  { str: 'aa <strong>bb</strong> <em>cc dd</em>', size: 3 },
  { str: '<strong>aa bb cc</strong>', size: 2 }
]) {
  console.log(paginate(str, size));
}

Output:

[
  "some <strong>text <i>with</i></strong>",
  "<strong> HTML</strong> tags, and",
  "<i>some <b>nested tags</b></i>",
  "<i> sould be</i>",
  "<i><b>supported</b> as</i>",
  "<i>well</i>."
]
[
  "a a",
  "b b",
  "c c"
]
[
  "aa aa",
  "bb bb",
  "cc cc"
]
[
  "aa",
  "bb",
  "cc"
]
[
  "aa",
  "<strong>bb</strong>",
  " <em>cc</em>",
  "<em>dd</em>"
]
[
  "<strong>aa</strong>",
  "<strong>bb</strong>",
  "<strong>cc</strong>"
]

Update

Based on a new request in the comments, I changed the split regex from '[\\s\\S]{1,' + pageSize + '}(?!\\S)' to '\\s*[\\s\\S]{1,' + pageSize + '}(?!\\S)', i.e. I added \\s* to catch leading spaces. I also added a page.trim() to remove leading spaces. Finally, I added a few of the OP's examples.