Home > Blockchain >  Find and change cyrillic word with boundary in google scripts
Find and change cyrillic word with boundary in google scripts

Time:10-10

The problem is that \b doesn't work with Russian and Ukrainian letters.

Here I try to find all matches of the word 'февраля' in the text, replace them with a temporary word (tempword), turn that into a link, and then change it back to 'февраля'.

/**
 * Turns occurrences of `word` in a Google Doc into links to `siteurl`.
 *
 * Each match of `\bword\b` is swapped for a Latin placeholder, the
 * placeholder range is linked, and finally every placeholder is swapped
 * back to `word`.
 *
 * As the surrounding question reports, `\b` does not match next to
 * Russian/Ukrainian letters, so this version only finds Latin words.
 *
 * @param {string} word - Word to link; it is inserted into the search pattern as-is.
 * @param {string} siteurl - URL assigned to each linked range.
 */
function addLinks(word, siteurl) {
  var id = 'doc\'s ID';
  var doc = DocumentApp.openById(id);
  var body = doc.getBody();
  var tempword = 'ASDFDSGDDKDSL2';
  // The pattern is passed as a string, hence the doubled backslashes.
  var searchText = "\\b" + word + "\\b";
  var element = body.findText(searchText);
  console.log(element);
  while (element) {
    var start = element.getStartOffset();
    var text = element.getElement().asText();
    // NOTE(review): this replaces every match inside the text element, while
    // only the first one is linked below - confirm that is acceptable.
    text.replaceText(searchText, tempword);
    // setLinkUrl takes an inclusive end offset, hence the "- 1".
    text.setLinkUrl(start, start + tempword.length - 1, siteurl);
    // Search again from the top; already-replaced matches no longer match.
    element = body.findText(searchText);
  }
  body.replaceText(tempword, word);
}

// Example call with a Cyrillic word.
addLinks('февраля', 'example.com');

It works as it should, if I change Russian word 'февраля' to English 'february'.

// Example call with an English word.
addLinks('february', 'example.com');

I need a regular expression because, if I just search for 'февраля', the script will also apply the link to other words such as 'февралям', 'февралями', etc. So the question is how to make this work. The error "Exception: Invalid regular expression pattern" occurs with this code:

// Lookbehind/lookahead attempt; the question reports that it fails with
// "Exception: Invalid regular expression pattern".
var searchText = "(?<=[\\s,.:;\"']|^)" + word + "(?=[\\s,.:;\"']|$)";

or this:

// Group-plus-lookahead attempt; the question reports that it also fails with
// "Exception: Invalid regular expression pattern".
var searchText = "(^|\s)" + word + "(?=\s|$)";

and some other.

CodePudding user response:

I think the following code does what is needed... At least in this situation.

/**
 * Links every whole-word occurrence of a Cyrillic `word` in a Google Doc to
 * `siteurl`.
 *
 * Every match of `word` is first swapped for a Latin placeholder. Each
 * placeholder that is not directly adjacent to a Cyrillic letter is then
 * linked, and finally all placeholders are swapped back to `word`.
 *
 * The neighbouring characters are inspected in JavaScript instead of inside
 * the search pattern, so that:
 *   - occurrences at the very start or end of a text element are linked,
 *   - the link covers exactly the word and not a boundary character,
 *   - two occurrences separated by a single character are both linked,
 *   - "ё" and Ukrainian letters count as letters, not as boundaries.
 *
 * @param {string} word - Word to link; it is used as the search pattern as-is.
 * @param {string} siteurl - URL assigned to each linked occurrence.
 */
function addLinks(word, siteurl) {
  const id = 'doc\'s ID';
  const doc = DocumentApp.openById(id);
  const body = doc.getBody();
  const tempword = 'ASDFGFDSA';
  // Any character of the Unicode Cyrillic block (Russian, Ukrainian, ...).
  const cyrillicLetter = /[\u0400-\u04FF]/;

  // Replace all matches of the Cyrillic word with the Latin placeholder,
  // regardless of boundaries.
  body.replaceText(word, tempword);

  let element = body.findText(tempword);
  console.log(element);
  while (element) {
    const start = element.getStartOffset();
    const end = element.getEndOffsetInclusive();
    const text = element.getElement().asText();
    const content = text.getText();
    // charAt() yields '' outside the string, so the start and end of the
    // text count as boundaries.
    const touchesCyrillic =
      cyrillicLetter.test(content.charAt(start - 1)) ||
      cyrillicLetter.test(content.charAt(end + 1));
    if (!touchesCyrillic) {
      text.setLinkUrl(start, end, siteurl); // make it a clickable url
    }
    element = body.findText(tempword, element); // find next
  }
  body.replaceText(tempword, word); // change all placeholders back to the word
}
// Example call with a Cyrillic word.
addLinks('февраля', 'example.com');

CodePudding user response:

Here is my solution:

/** Entry point: runs addLinks with a sample word and URL. */
function main() {
  const targetWord = 'февралями';
  const linkUrl = 'example.com';
  addLinks(targetWord, linkUrl);
}

/**
 * Links every whole-word occurrence of a Cyrillic `word` in the active
 * Google Doc to `url`.
 *
 * Each paragraph is searched for `word`. A match is linked only when the
 * characters directly before and after it are not Cyrillic letters; the
 * start and end of the text count as boundaries.
 *
 * Checking the neighbours in JavaScript, rather than consuming them in the
 * search pattern, means two occurrences separated by a single character are
 * both linked, and Ukrainian letters count as letters.
 *
 * @param {string} word - Word to link; it is used as the search pattern as-is.
 * @param {string} url - URL assigned to each linked occurrence.
 */
function addLinks(word, url) {
  const doc = DocumentApp.getActiveDocument();
  const paragraphs = doc.getBody().getParagraphs();
  // Any character of the Unicode Cyrillic block (Russian, Ukrainian, ...).
  const cyrillicLetter = /[\u0400-\u04FF]/;

  for (const paragraph of paragraphs) {
    let location = paragraph.findText(word);
    while (location) {
      const start = location.getStartOffset();
      const end = location.getEndOffsetInclusive();
      // Offsets are relative to the matched text element, so link through it.
      const text = location.getElement().asText();
      const content = text.getText();
      // charAt() yields '' outside the string, which never tests as a letter.
      const isWholeWord =
        !cyrillicLetter.test(content.charAt(start - 1)) &&
        !cyrillicLetter.test(content.charAt(end + 1));
      if (isWholeWord) {
        text.setLinkUrl(start, end, url);
      }
      location = paragraph.findText(word, location);
    }
  }
}

Test output:

enter image description here

  • Related