I found a very nice piece of code for CKEditor that counts and ranks the most-used words in a textarea, which is very useful for SEO keyword suggestions when writing articles. The problem is that non-English characters such as öäåÖÄÅ do not get through some kind of filter, and the same happens with diacritics such as the é in léjonet, or ñ.
Here is the code and a working Jsfiddle demo
The HTML is:
<!-- Textarea -->
<div >
<label for="editor1">HTML </label>
<div >
<textarea id="editor1" name="editor1"><p>text example with ahöäåra</p></textarea>
</div>
</div>
<!-- KW density result -->
<div >
<label for="editor1">Words Repeat</label>
<div >
<div id="KWdensity" ></div>
</div>
</div>
And the javascript code is:
<script type="text/javascript">
$(document).ready(function () {
//----------------------------------------------------------------------
// Editor init
//----------------------------------------------------------------------
// Turn the #editor1 textarea into a CKEditor instance.
CKEDITOR.replace('editor1');

// Hand initKW to jQuery so the keyword ranking is rendered for the initial content.
$(initKW);

// When the editor fires 'contentDom', attach a keyup listener to its document
// so the ranking is rebuilt each time a key is released inside the editor.
CKEDITOR.instances.editor1.on('contentDom', function () {
  var editingDocument = CKEDITOR.instances.editor1.document;
  editingDocument.on('keyup', function () {
    $(initKW);
  });
});
/**
 * Replace accented letters with their unaccented base letter.
 *
 * Only the characters listed in defaultDiacriticsRemovalMap are converted; in
 * this listing that is the "A" family only, so every other non-ASCII
 * character (for example "ö") is returned unchanged.
 *
 * @param {string} str Text to convert.
 * @returns {string} The text with every mapped character replaced by its base letter.
 */
function removeDiacritics(str) {
  var defaultDiacriticsRemovalMap = [
    {'base': 'A', 'letters': '\u0041\u24B6\uFF21\u00C0\u00C1\u00C2\u1EA6\u1EA4\u1EAA\u1EA8\u00C3\u0100\u0102\u1EB0\u1EAE\u1EB4\u1EB2\u0226\u01E0\u00C4\u01DE\u1EA2\u00C5\u01FA\u01CD\u0200\u0202\u1EA0\u1EAC\u1EB6\u1E00\u0104\u023A\u2C6F'}
  ];

  // Flatten the table into a single-character lookup: accented letter -> base letter.
  var diacriticsMap = {};
  for (var i = 0; i < defaultDiacriticsRemovalMap.length; i++) {
    var letters = defaultDiacriticsRemovalMap[i].letters;
    for (var j = 0; j < letters.length; j++) {
      diacriticsMap[letters[j]] = defaultDiacriticsRemovalMap[i].base;
    }
  }

  // Only characters outside the ASCII range can carry a diacritic, so only
  // those are looked up; unmapped characters fall through untouched.
  return str.replace(/[^\u0000-\u007E]/g, function (a) {
    return diacriticsMap[a] || a;
  });
}
/**
 * Rank the words of a text by how often they occur.
 *
 * The text is lower-cased and split on runs of non-word characters. \W is
 * ASCII-only, so every character outside [A-Za-z0-9_] acts as a separator.
 *
 * @param {string} srctext Plain text to analyse.
 * @param {number} MaxKeyOut Maximum number of entries to return.
 * @param {number} keylenMin Minimum word length; shorter words are ignored.
 * @returns {Array<Array>} Up to MaxKeyOut [word, count] pairs, most frequent
 *     first; words with equal counts keep their order of first appearance.
 */
function KeyDensityShow(srctext, MaxKeyOut, keylenMin) {
  // Split text on non word characters
  var words = srctext.toLowerCase().split(/\W+/);

  // word -> index into word_counts. A prototype-less object is used so that
  // words such as "length" or "constructor" cannot collide with built-in
  // properties (with an Array as the index, "length" made the lookup fail).
  var positions = Object.create(null);
  var word_counts = [];

  for (var i = 0; i < words.length; i++) {
    var word = words[i];
    if (!word || word.length < keylenMin) {
      continue;
    }
    if (!(word in positions)) {
      positions[word] = word_counts.length;
      word_counts.push([word, 1]);
    } else {
      word_counts[positions[word]][1]++;
    }
  }

  // Put most frequent words at the beginning.
  word_counts.sort(function (a, b) {
    return b[1] - a[1];
  });

  // Return the first MaxKeyOut items
  return word_counts.slice(0, MaxKeyOut);
}
/**
 * Remove stop words from a text.
 *
 * Every run of one or more stop words, together with the single separator
 * character on each side of it, is replaced by one space. A stop word is only
 * matched as a whole word: it must be delimited by the start or end of the
 * text or by a character that is not a letter, combining mark, digit or
 * underscore. Matching is case-insensitive.
 *
 * @param {string} input Text to filter.
 * @returns {string} The text without stop words.
 */
function removeStopWords(input) {
  var stopwords = ['test'];
  if (stopwords.length === 0) {
    return input.toString();
  }

  // Escape regex metacharacters so a stop word is always matched literally.
  var alternatives = stopwords.map(function (word) {
    return word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }).join('|');

  // Unicode-aware counterpart of \W: with plain \W a letter such as "ö" counts
  // as a separator and would be deleted along with the neighbouring stop word.
  var boundary = '[^\\p{L}\\p{M}\\p{N}_]';

  // The repeated group lets consecutive stop words share their separators, and
  // the ^ / $ alternatives cover stop words at the very start or end.
  var reg = new RegExp(
    '(?:^|' + boundary + ')(?:(?:' + alternatives + ')(?:' + boundary + '|$))+',
    'giu'
  );
  return input.toString().replace(reg, ' ');
}
/**
 * Rebuild the keyword ranking shown in #KWdensity from the editor content.
 *
 * Reads the HTML of the 'editor1' CKEditor instance, reduces it to plain
 * ASCII letters and spaces, and renders one button per ranked word showing
 * its count and the word. The output is left empty when no text remains.
 */
function initKW() {
  var $output = $('#KWdensity');
  $output.html('');

  var TextGrab = CKEDITOR.instances['editor1'].getData();
  // html to text. The markup is parsed inside a detached <div>: passing it
  // straight to $() would treat content that does not look like markup as a
  // selector.
  TextGrab = $('<div>').html(TextGrab).text();
  TextGrab = removeDiacritics(TextGrab);
  TextGrab = removeStopWords(TextGrab);
  TextGrab = TextGrab.replace(/\r?\n|\r/gm, " ").trim(); // remove line breaks
  TextGrab = TextGrab.replace(/\s\s+/g, " ").trim(); // remove double spaces
  TextGrab = TextGrab.replace(/[^a-zA-Z ]+/g, "").trim(); // only letters and space
  if (TextGrab === "") {
    return;
  }

  // Top 11 words of at least 3 characters.
  var ranked = KeyDensityShow(TextGrab, 11, 3);

  // Iterate the [word, count] pairs directly; an empty ranking (for example
  // when every word is too short) then renders nothing. TextGrab holds only
  // ASCII letters and spaces at this point, so the words cannot contain
  // markup characters.
  var items = '';
  for (var i = 0; i < ranked.length; i++) {
    items += '<button type="button"><span >' + ranked[i][1] + "</span> " +
      '<b>' + ranked[i][0] + "</b></button> ";
  }
  $output.html(items);
}
});
</script>
As you can see, there is a function to remove diacritics (removeDiacritics, built around "defaultDiacriticsRemovalMap"). It originally had entries from A to Z; I shortened it to see whether Ö would then be displayed, but it is not, and neither are other characters with diacritics. I also tried deleting many parts of this code, and nothing works.
I am not good with Javascript but I have some knowledge anyway to understand the functions and I cannot really see where is the problem in this code.
Any idea how to delete the function of diacritics removal and to make characters like öäåñ etc work without the script changing it automatically to oaan ?
Here is the Fiddle demo:
https://jsfiddle.net/rg0myntj/1/
CodePudding user response:
This line TextGrab.replace(/[^a-zA-Z ]+/g, "").trim()
removes everything except the letters a to z (in both lower and upper case) and the space character. This means every non-ASCII letter, such as ö, ä, å, é or ñ, is removed. So delete or comment out this line of code.
This line srctext.toLowerCase().split(/\W+/)
splits the text on anything that is not a to z, 0 to 9 or an underscore. This means the word ahöäåra is split into ah and ra.
Replace this with srctext.toLowerCase().split(/[^\p{L}\p{M}\p{N}]+/u), which splits only on characters that are not Unicode letters, combining marks or numbers.
And to remove the function removeDiacritics(), just delete it from the code, together with the line that calls it.
$(document).ready(function() {
//----------------------------------------------------------------------
// Editor init
//----------------------------------------------------------------------
// Create the CKEditor instance on top of the #editor1 textarea.
CKEDITOR.replace('editor1');

// Pass initKW to jQuery so the ranking is drawn for the starting content.
$(initKW);

// Once the editor fires 'contentDom', listen for keyup on its document and
// redraw the ranking on every key release.
CKEDITOR.instances.editor1.on('contentDom', function () {
  var rerank = function () {
    $(initKW);
  };
  CKEDITOR.instances.editor1.document.on('keyup', rerank);
});
/*function removeDiacritics(str) {
var defaultDiacriticsRemovalMap = [{
'base': 'A',
'letters': '\u0041\u24B6\uFF21\u00C0\u00C1\u00C2\u1EA6\u1EA4\u1EAA\u1EA8\u00C3\u0100\u0102\u1EB0\u1EAE\u1EB4\u1EB2\u0226\u01E0\u00C4\u01DE\u1EA2\u00C5\u01FA\u01CD\u0200\u0202\u1EA0\u1EAC\u1EB6\u1E00\u0104\u023A\u2C6F'
}];
var diacriticsMap = {};
for (var i = 0; i < defaultDiacriticsRemovalMap.length; i++) {
var letters = defaultDiacriticsRemovalMap[i].letters;
for (var j = 0; j < letters.length; j++) {
diacriticsMap[letters[j]] = defaultDiacriticsRemovalMap[i].base;
}
}
return str.replace(/[^\u0000-\u007E]/g, function(a) {
return diacriticsMap[a] || a;
});
}*/ // this is comment code block, you can delete this function from your code.
/**
 * Rank the words of a text by how often they occur, with Unicode support.
 *
 * The text is normalised to NFC, lower-cased and split on runs of characters
 * that are not Unicode letters, combining marks or numbers, so words such as
 * "ahöäåra" or "léjonet" stay in one piece.
 *
 * @param {string} srctext Plain text to analyse.
 * @param {number} MaxKeyOut Maximum number of entries to return.
 * @param {number} keylenMin Minimum word length; shorter words are ignored.
 * @returns {Array<Array>} Up to MaxKeyOut [word, count] pairs, most frequent
 *     first; words with equal counts keep their order of first appearance.
 */
function KeyDensityShow(srctext, MaxKeyOut, keylenMin) {
  // NFC makes "e" + combining accent and the precomposed "é" compare equal,
  // so both spellings of a word are counted together.
  var normalized = srctext.normalize('NFC').toLowerCase();

  // Split text on non word characters
  var words = normalized.split(/[^\p{L}\p{M}\p{N}]+/u);

  // word -> index into word_counts. A prototype-less object is used so that
  // words such as "length" or "constructor" cannot collide with built-in
  // properties (with an Array as the index, "length" made the lookup fail).
  var positions = Object.create(null);
  var word_counts = [];

  for (var i = 0; i < words.length; i++) {
    var word = words[i];
    if (!word || word.length < keylenMin) {
      continue;
    }
    if (!(word in positions)) {
      positions[word] = word_counts.length;
      word_counts.push([word, 1]);
    } else {
      word_counts[positions[word]][1]++;
    }
  }

  // Put most frequent words at the beginning.
  word_counts.sort(function (a, b) {
    return b[1] - a[1];
  });

  // Return the first MaxKeyOut items
  return word_counts.slice(0, MaxKeyOut);
}
/**
 * Remove stop words from a text without damaging non-English words.
 *
 * Every run of one or more stop words, together with the single separator
 * character on each side of it, is replaced by one space. A stop word is only
 * matched as a whole word: it must be delimited by the start or end of the
 * text or by a character that is not a Unicode letter, combining mark or
 * number. Matching is case-insensitive.
 *
 * @param {string} input Text to filter.
 * @returns {string} The text without stop words.
 */
function removeStopWords(input) {
  var stopwords = ['test'];
  if (stopwords.length === 0) {
    return input.toString();
  }

  // Escape regex metacharacters so a stop word is always matched literally.
  var escaped = stopwords.map(function (word) {
    return word.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  });

  // Separator class: anything that is not a letter, mark or number. Plain \W
  // would treat "ö" as a separator and delete it next to a stop word.
  var separator = '[^\\p{L}\\p{M}\\p{N}]';

  // The repeated group lets consecutive stop words share their separators, and
  // the ^ / $ alternatives cover stop words at the very start or end.
  var reg = new RegExp(
    '(?:^|' + separator + ')(?:(?:' + escaped.join('|') + ')(?:' + separator + '|$))+',
    'giu'
  );
  return input.toString().replace(reg, ' ');
}
/**
 * Rebuild the keyword ranking shown in #KWdensity from the editor content.
 *
 * Reads the HTML of the 'editor1' CKEditor instance, converts it to plain
 * text, and renders one button per ranked word showing its count and the
 * word. No diacritics removal and no ASCII-only filter is applied, so
 * characters such as ö, ä, å, é and ñ reach the ranking unchanged. The output
 * is left empty when no text remains.
 */
function initKW() {
  var $output = $('#KWdensity');
  $output.html('');

  var TextGrab = CKEDITOR.instances['editor1'].getData();
  // html to text. The markup is parsed inside a detached <div>: passing it
  // straight to $() would treat content that does not look like markup as a
  // selector.
  TextGrab = $('<div>').html(TextGrab).text();
  TextGrab = removeStopWords(TextGrab);
  TextGrab = TextGrab.replace(/\r?\n|\r/gm, " ").trim(); // remove line breaks
  TextGrab = TextGrab.replace(/\s\s+/g, " ").trim(); // remove double spaces
  if (TextGrab === "") {
    return;
  }

  // Top 11 words of at least 3 characters.
  var ranked = KeyDensityShow(TextGrab, 11, 3);

  // Iterate the [word, count] pairs directly; an empty ranking (for example
  // when every word is too short) then renders nothing. Count and word are
  // inserted with .text() so they are never interpreted as markup.
  for (var i = 0; i < ranked.length; i++) {
    var $button = $('<button type="button"></button>');
    $button.append(
      $('<span></span>').text(ranked[i][1]),
      ' ',
      $('<b></b>').text(ranked[i][0])
    );
    $output.append($button, ' ');
  }
}
});
<link href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/css/bootstrap.min.css" rel="stylesheet"/>
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/2.2.4/jquery.min.js"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js"></script>
<script src="https://cdn.ckeditor.com/4.6.1/standard/ckeditor.js"></script>
<!-- Textarea -->
<div >
<label for="editor1">HTML </label>
<div >
<textarea id="editor1" name="editor1"><p>text example with ahöäåra</p></textarea>
</div>
</div>
<!-- KW density result -->
<div >
<label for="editor1">Words Repeat</label>
<div >
<div id="KWdensity"></div>
</div>
</div>
The code above cannot run properly on Stack Overflow due to iframe permission. See it in action on jsFiddle instead.