Home > Enterprise >  How to get all the characters not contained in anchor tags using RegEx in Javascript?
How to get all the characters not contained in anchor tags using RegEx in Javascript?

Time:08-16

How can I use regex to get an array of all the individual characters not contained within anchor tags? So for example, with this text:

DOWNLOAD <a href="https://this.com/" target="_blank">THIS</a> OR <a href="https://that.io/" target="_blank">THAT</a>

I want an array of the indices for the characters D,O,W,N,L,O,A,D, ,T,H,I,S, , ... etc.

I managed to figure out how to match everything I don't want selected, using this: /(?:<.*?>)/

But I don't know how to use that to get all the characters outside of that group.

CodePudding user response:

As already pointed out by @Cid, don't do this with regular expressions. Instead, use something like the code below, which reads the input character by character:

/**
 * Creates a forward-only, character-by-character cursor over an element's
 * markup.
 *
 * The markup is read from `el.innerHTML` once, when the reader is created;
 * later changes to the element are not seen by the returned cursor.
 *
 * @param {{innerHTML: string}} el - Object exposing the markup as `innerHTML`.
 * @returns {{done: function(): boolean, advance: function(): void,
 *   char: function(): string, peek: function(): string}} Cursor object.
 */
function reader(el) {
  const src = el.innerHTML;
  let i = 0;

  const r = {
    /** @returns {boolean} True once the cursor is at or past the end of the input. */
    done() {
      return i >= src.length;
    },
    /** Moves the cursor forward by exactly one position. */
    advance() {
      // Must increment: assigning a constant here would pin the cursor in
      // place and every caller looping on done() would never terminate.
      i += 1;
    },
    /**
     * Consumes one character.
     * @returns {string} The character under the cursor, or '' at end of input.
     */
    char() {
      const c = r.peek();
      r.advance();
      return c;
    },
    /**
     * Looks at the current character without consuming it.
     * @returns {string} The character under the cursor, or '' at end of input.
     */
    peek() {
      return r.done() ? '' : src[i];
    },
  };
  return r;
}

/**
 * Builds a collector that gathers the characters of an element's markup that
 * lie outside of tags.
 *
 * Everything from a '<' up to and including the next '>' is skipped, and
 * newline characters are dropped. The underlying cursor is created once per
 * collector, so `collect()` consumes it: a second call on the same collector
 * returns an empty array.
 *
 * @param {{innerHTML: string}} el - Object handed to `reader` as the markup source.
 * @returns {{collect: function(): string[]}} Object exposing `collect()`.
 */
function collector(el) {
  const r = reader(el);

  // Skips forward past the next occurrence of `char`. The done() guard makes
  // an unterminated tag (no closing `char` before end of input) stop at the
  // end instead of spinning forever: at the end peek() yields '', which can
  // never equal `char`.
  const skipUntil = (char) => {
    while (!r.done() && r.peek() !== char) {
      r.advance();
    }
    // Step over the terminator itself (harmless when already at the end).
    r.advance();
  };

  return {
    /**
     * Scans the remaining input and returns the characters outside of tags.
     * @returns {string[]} One array entry per collected character, in order.
     */
    collect() {
      const v = [];
      while (!r.done()) {
        const next = r.peek();
        if (next === '<') {
          skipUntil('>');
        } else if (next === '\n') {
          r.advance();
        } else {
          v.push(r.char());
        }
      }
      return v;
    },
  };
}



/* --- */

// Demo: look up the element with id "source", build a collector over it,
// and log the array of characters that collect() returns.
const el = document.querySelector('#source');
const cl = collector(el);
console.log(cl.collect());
<div id="source">
DOWNLOAD <a href="#noop">THIS</a> OR <a href="#noop2">THAT</a>
</div>

  • Related