I am trying to parse some HTML from a website.
The HTML may contain some invalid markup, which prevents the parser from parsing it.
This is the regex that I wrote:
/(\[class\]((=)("|')?.*("|')))|(\[class\])|((\[id\]((=)("|')?.*("|')))|(\[id\]))/
This is meant to remove all [class] and [id] attributes.
My regex above works fine with some HTML, but not all. Example 1, which works:
<div [class]="'par fontsize-' fontsize"><p>the two of them left that everyone came back to their senses.</p>
But it does not work with:
</div><span id="saved" hidden>Settings saved..</span><div ></div><div [class]="'par fontsize-' fontsize"><p>It wasn't " until the two of them left that everyone came back to their senses.</p>
This is caused by the string It wasn't "
which gets removed as well, because the greedy .* in the regex runs on to the last quote it can find.
I only want to remove the attribute and its value, not the tag's content.
Is this possible?
Final solution
Thanks to It goldman, I ended up with a solution. I am posting it in case someone needs it.
/**
 * Strips every occurrence of the given attribute names from an HTML string.
 *
 * For each occurrence of a name:
 * - a bare name (`[class]`) is removed on its own;
 * - a name directly followed by `=` and a quoted value (`[class]="..."` or
 *   `[class]='...'`) is removed together with the value;
 * - a name followed by `=` and anything else (an unquoted value, or a quote
 *   that is never closed) loses only the name and the `=`; the rest is left
 *   untouched so that tag content is never cut away.
 *
 * Whitespace between the name, `=` and the value is not tolerated: the `=`
 * must follow the name immediately and the quote must follow the `=`.
 *
 * @param html  Markup to clean.
 * @param attrs Literal attribute names to remove, e.g. `"[class]"`, `"[id]"`.
 * @returns The markup with all matching attributes removed.
 */
cleanHTML(html: string, ...attrs: string[]): string {
  let result = html;
  for (const attr of attrs) {
    // An empty name is found at index 0 of every string, so the loop below
    // would never terminate.
    if (attr.length === 0) {
      continue;
    }
    let pos: number;
    while ((pos = result.indexOf(attr)) > -1) {
      // `end` is the index just past the text to delete.
      let end = pos + attr.length;
      if (result.charAt(end) === "=") {
        end += 1; // the "=" always goes with the name
        const quote = result.charAt(end);
        if (quote === '"' || quote === "'") {
          const closing = result.indexOf(quote, end + 1);
          // Without a closing quote the value's extent is unknown; deleting
          // up to "index -1" would duplicate the input and loop forever.
          if (closing > -1) {
            end = closing + 1;
          }
        }
      }
      result = result.substring(0, pos) + result.substring(end);
    }
  }
  return result;
}
CodePudding user response:
// Sample markup used to exercise the cleaner: bare and back-to-back attributes,
// double- and single-quoted values, and a stray double quote inside text content.
var src = `</div><span [class] [class][class] id="saved" [id]hidden>Settings saved..</span><div ></div><div [class]="'par fontsize-' fontsize"><p>It wasn't " until the two of them left that everyone came back to their senses.</p><a [class]='another'>sasportas</a>`
/**
 * Removes every occurrence of the given attribute names from an HTML string,
 * together with the value attached to each of them.
 *
 * Whitespace around the `=` is tolerated. Per occurrence:
 * - bare name (`[class]`): only the name is removed;
 * - quoted value (`[class]="..."`, `[class] = '...'`): everything through the
 *   closing quote is removed;
 * - unquoted value (`[class]=foo`): the value is taken to run up to the next
 *   whitespace character or `>`;
 * - opening quote that is never closed: only the name through the `=` is
 *   removed, so text content is never cut away.
 *
 * @param {string} src - Markup to clean.
 * @param {string[]} attributes_to_remove - Literal attribute names, e.g. "[class]".
 * @returns {string} The markup with all matching attributes removed.
 */
function clean_str(src, attributes_to_remove) {
  // Index of the first non-whitespace character at or after `from`
  // (or text.length when there is none).
  function skipWhitespace(text, from) {
    var i = from;
    while (i < text.length && text.charAt(i).trim() === '') {
      i++;
    }
    return i;
  }

  attributes_to_remove.forEach(function (attr) {
    // An empty name is found at index 0 of every string, so the loop below
    // would never terminate.
    if (!attr) {
      return;
    }
    var pos;
    while ((pos = src.indexOf(attr)) > -1) {
      // `end` is the index just past the text to delete.
      var end = pos + attr.length;
      var eq = skipWhitespace(src, end);
      if (src.charAt(eq) === '=') {
        var valueStart = skipWhitespace(src, eq + 1);
        var first = src.charAt(valueStart);
        if (first === '"' || first === "'") {
          var closing = src.indexOf(first, valueStart + 1);
          // A missing closing quote must not be treated as index -1: that
          // would re-append the whole string and loop forever.
          end = closing > -1 ? closing + 1 : eq + 1;
        } else {
          // Unquoted value: ends at whitespace or at the end of the tag.
          end = valueStart;
          while (
            end < src.length &&
            src.charAt(end) !== '>' &&
            src.charAt(end).trim() !== ''
          ) {
            end++;
          }
        }
      }
      src = src.substring(0, pos) + src.substring(end);
    }
  });
  return src;
}
// Strip the [class] and [id] attributes from the sample markup and print the result.
console.log(clean_str(src, ["[class]", "[id]"]))