I have a huge HTML file (~50MB) and I want to extract all instances of strings between two other strings (they contain forward slashes), which are parts of URLs.
var content = '<div><a href="https://sample.org/something/here/091209283/?param=xoxhwu">Link</a></div><div><a href="https://sample.org/something/here/091209284/?param=xoxhwu">Link</a></div>';
var startingString = '/something/here/';
var endingString = '/';
Desired output:
['091209283', '091209284']
I was trying to do that using RegEx, but no luck so far.
function getBetweenText(fromString, ignoreStart, ignoreEnd){
var s = fromString.split(new RegExp(ignoreStart '|' ignoreEnd)), r = [];
for(var i=1,l=s.length; i<l; i =2){
r.push(s[i]);
}
return r;
}
var result = getBetweenText(html, "something\/here\/", "\/");
This outputs the following:
[
'', 'something',
'091209283', 'a><',
'', 'something',
'091209284', 'a><'
]
[
'', 'something',
'091209283', 'a><',
'', 'something',
'091209284', 'a><'
]
I can't find the right method of escaping the slashes. What is the most memory-effective way of doing that?
I tried various getStringBetween
functions found online, but they seem to consume gigabytes of RAM for a 50MB HTML string. Not good in my particular case.
Example of such function:
var getFromBetween = {
results:[],
string:"",
getFromBetween:function (sub1,sub2) {
if(this.string.indexOf(sub1) < 0 || this.string.indexOf(sub2) < 0) return false;
var SP = this.string.indexOf(sub1) sub1.length;
var string1 = this.string.substr(0,SP);
var string2 = this.string.substr(SP);
var TP = string1.length string2.indexOf(sub2);
return this.string.substring(SP,TP);
},
removeFromBetween:function (sub1,sub2) {
if(this.string.indexOf(sub1) < 0 || this.string.indexOf(sub2) < 0) return false;
var removal = sub1 this.getFromBetween(sub1,sub2) sub2;
this.string = this.string.replace(removal,"");
},
getAllResults:function (sub1,sub2) {
// first check to see if we do have both substrings
if(this.string.indexOf(sub1) < 0 || this.string.indexOf(sub2) < 0) return;
// find one result
var result = this.getFromBetween(sub1,sub2);
// push it to the results array
this.results.push(result);
// remove the most recently found one from the string
this.removeFromBetween(sub1,sub2);
// if there's more substrings
if(this.string.indexOf(sub1) > -1 && this.string.indexOf(sub2) > -1) {
this.getAllResults(sub1,sub2);
}
else return;
},
get:function (string,sub1,sub2) {
this.results = [];
this.string = string;
this.getAllResults(sub1,sub2);
return this.results;
}
};
CodePudding user response:
This works for the example above but won't work for case where the end isn't a single character. For that you would have to compare the whole string like I did with the before portion.
If you are parsing HTML you can make the process simpler by iterating the children of the node and extracting the href directly.
const line = '<div><a href="https://sample.org/something/here/091209283/?param=xoxhwu">Link</a></div><div><a href="https://sample.org/something/here/091209284/?param=xoxhwu">Link</a></div>'
function findIds(line, before, end) {
const bef = [...before]
const lista = [...line]
let acc = []
for (let i = 0; i < lista.length; i ) {
const found = bef.every((el, idx) => el === lista[i idx])
if (found) {
let save = []
let start = 0
for (start = i before.length; lista[start] !== end; start ) {
save.push(lista[start])
}
acc.push(save.join(''))
i = start-1
}
}
return acc
}
findIds(line, '/something/here/', "/")