How to get all the characters not contained in anchor tags using RegEx in Javascript?-CodePudding

How can I use regex to get an array of all the individual characters not contained within anchor tags? So for example, with this text:

DOWNLOAD <a href="https://this.com/" target="_blank">THIS</a> OR <a href="https://that.io/" target="_blank">THAT</a>

I want an array of the indices for the characters D,O,W,N,L,O,A,D, ,T,H,I,S, , ... etc.

I managed to figure out how to get everything I don't want selected, using this: /(?:<.*?>)/

But I don't know how to use that to get all the characters outside of that group.

CodePudding user response：

As already pointed out by @Cid, don't do this with regular expressions. Instead, use something like the code below, which reads the input character by character:

/**
 * Creates a forward-only cursor over an element's serialized HTML.
 *
 * The `innerHTML` string is read once, when the reader is created, and is
 * then walked one UTF-16 code unit at a time.
 *
 * @param {{innerHTML: string}} el - Element (or element-like object) whose
 *   `innerHTML` is the text to read.
 * @returns {{done: function(): boolean, advance: function(): void,
 *   char: function(): string, peek: function(): string}} The cursor API.
 */
function reader(el) {
  let i = 0;
  const src = el.innerHTML;

  const r = {
    /** @returns {boolean} True once the cursor is past the last position. */
    done() {
      return i >= src.length;
    },
    /** Moves the cursor one position forward. */
    advance() {
      // This has to be an increment: assigning a constant would pin the
      // cursor in place, so done() would never become true and every
      // consuming loop would spin forever.
      i += 1;
    },
    /**
     * Returns the current character and then advances past it.
     * @returns {string} The character under the cursor, or '' at the end.
     */
    char() {
      const c = r.peek();
      r.advance();
      return c;
    },
    /**
     * Returns the current character without moving the cursor.
     * @returns {string} The character under the cursor, or '' at the end.
     */
    peek() {
      return r.done() ? '' : src[i];
    },
  };
  return r;
}

/**
 * Builds a collector that gathers the characters of an element's HTML that
 * lie outside of tags.
 *
 * Everything from a '<' up to and including the next '>' is skipped, and
 * newline characters are dropped. The underlying reader is created once
 * here, so the input is consumed by the first `collect()` call; a second
 * call on the same collector returns an empty array.
 *
 * @param {{innerHTML: string}} el - Element (or element-like object) whose
 *   `innerHTML` is scanned.
 * @returns {{collect: function(): string[]}} Object exposing `collect()`.
 */
function collector(el) {
  const r = reader(el);

  // Consumes input up to and including the next `char`. The done() check
  // matters: peek() yields '' at the end of input, which never equals
  // `char`, so without it an unterminated tag would loop forever.
  const skipUntil = (char) => {
    while (!r.done() && r.peek() !== char) {
      r.advance();
    }
    r.advance();
  };

  return {
    /**
     * Reads the remaining input and returns the characters found outside
     * of tags, in order.
     * @returns {string[]} One array entry per collected character.
     */
    collect() {
      const v = [];
      while (!r.done()) {
        const next = r.peek();
        if (next === '<') {
          skipUntil('>');
        } else if (next === '\n') {
          r.advance();
        } else {
          v.push(r.char());
        }
      }
      return v;
    },
  };
}



/* --- */

// Demo: scan the #source element and log the characters found outside tags.
const el = document.querySelector('#source'),
  cl = collector(el);
const collected = cl.collect();
console.log(collected);

<div id="source">
DOWNLOAD <a href="#noop">THIS</a> OR <a href="#noop2">THAT</a>
</div>