Home > Software design >  How to find all instances of a string in JavaScript between parts of URLs in HTML?
How to find all instances of a string in JavaScript between parts of URLs in HTML?

Time:07-26

I have a huge HTML file (~50MB) and I want to extract all instances of strings between two other strings (they contain forward slashes), which are parts of URLs.

var content = '<div><a href="https://sample.org/something/here/091209283/?param=xoxhwu">Link</a></div><div><a href="https://sample.org/something/here/091209284/?param=xoxhwu">Link</a></div>';
var startingString = '/something/here/';
var endingString = '/';

Desired output:

['091209283', '091209284']

I was trying to do that using RegEx, but no luck so far.

function getBetweenText(fromString, ignoreStart, ignoreEnd){
    var s =  fromString.split(new RegExp(ignoreStart '|' ignoreEnd)), r = [];
    for(var i=1,l=s.length; i<l; i =2){
      r.push(s[i]);
    }
    return r;
}

var result = getBetweenText(html, "something\/here\/", "\/");

This outputs the following:

[
  '',          'something',
  '091209283', 'a><',
  '',          'something',
  '091209284', 'a><'
]
[
  '',          'something',
  '091209283', 'a><',
  '',          'something',
  '091209284', 'a><'
]

I can't find the right method of escaping the slashes. What is the most memory-effective way of doing that?

I tried various getStringBetween functions found online, but they seem to consume gigabytes of RAM for a 50MB HTML string. Not good in my particular case.

Example of such function:

var getFromBetween = {
  results:[],
  string:"",
  getFromBetween:function (sub1,sub2) {
      if(this.string.indexOf(sub1) < 0 || this.string.indexOf(sub2) < 0) return false;
      var SP = this.string.indexOf(sub1) sub1.length;
      var string1 = this.string.substr(0,SP);
      var string2 = this.string.substr(SP);
      var TP = string1.length   string2.indexOf(sub2);
      return this.string.substring(SP,TP);
  },
  removeFromBetween:function (sub1,sub2) {
      if(this.string.indexOf(sub1) < 0 || this.string.indexOf(sub2) < 0) return false;
      var removal = sub1 this.getFromBetween(sub1,sub2) sub2;
      this.string = this.string.replace(removal,"");
  },
  getAllResults:function (sub1,sub2) {
      // first check to see if we do have both substrings
      if(this.string.indexOf(sub1) < 0 || this.string.indexOf(sub2) < 0) return;

      // find one result
      var result = this.getFromBetween(sub1,sub2);
      // push it to the results array
      this.results.push(result);
      // remove the most recently found one from the string
      this.removeFromBetween(sub1,sub2);

      // if there's more substrings
      if(this.string.indexOf(sub1) > -1 && this.string.indexOf(sub2) > -1) {
          this.getAllResults(sub1,sub2);
      }
      else return;
  },
  get:function (string,sub1,sub2) {
      this.results = [];
      this.string = string;
      this.getAllResults(sub1,sub2);
      return this.results;
  }
};

CodePudding user response:

This works for the example above but won't work for case where the end isn't a single character. For that you would have to compare the whole string like I did with the before portion.
If you are parsing HTML you can make the process simpler by iterating the children of the node and extracting the href directly.

const line = '<div><a href="https://sample.org/something/here/091209283/?param=xoxhwu">Link</a></div><div><a href="https://sample.org/something/here/091209284/?param=xoxhwu">Link</a></div>'

function findIds(line, before, end) {
  const bef = [...before]
    const lista = [...line]
  
  let acc = []
  for (let i = 0; i < lista.length; i  ) {
    const found = bef.every((el, idx) => el === lista[i idx])

    if (found) {
      let save = []
      let start = 0
      for (start = i before.length; lista[start] !== end; start  ) {
        save.push(lista[start])
      }
      acc.push(save.join(''))
      i = start-1
    }
  }
  
  return acc
}

findIds(line, '/something/here/', "/")
  • Related