Home > Blockchain >  Find and change cyrillic word with boundary in google scripts
Find and change cyrillic word with boundary in google scripts


The problem is that \b doesn't work with Russian and Ukrainian letters.

Here I try to find all matches of the word 'февраля' in the text, change them to a temporary word (tempword), make that a link, and then change it back to 'февраля'.

/**
 * Links every match of `\b<word>\b` in the document body to `siteurl`.
 *
 * Strategy: each match is temporarily swapped for a Latin placeholder
 * (`tempword`), the placeholder is linked, and at the end every placeholder
 * is swapped back to `word`.
 *
 * NOTE: the pattern relies on `\b`. As the question states, `\b` does not work
 * with Russian and Ukrainian letters, so this version only links words written
 * in Latin script.
 *
 * @param {string} word - Word to link; it is inserted into the regex as-is.
 * @param {string} siteurl - URL the linked text should point to.
 */
function addLinks(word, siteurl) {
  const id = 'doc\'s ID';
  const doc = DocumentApp.openById(id);
  const body = doc.getBody();
  const tempword = 'ASDFDSGDDKDSL2';
  const searchText = "\\b" + word + "\\b";

  try {
    let element = body.findText(searchText);
    while (element) {
      const start = element.getStartOffset();
      const text = element.getElement().asText();

      // replaceText swaps EVERY match inside this text element, so every
      // placeholder from the first match onwards has to be linked here;
      // linking only the first one would leave the others without a link.
      text.replaceText(searchText, tempword);
      const content = text.getText();
      let at = content.indexOf(tempword, start);
      while (at !== -1) {
        // setLinkUrl takes an inclusive end offset.
        text.setLinkUrl(at, at + tempword.length - 1, siteurl);
        at = content.indexOf(tempword, at + tempword.length);
      }

      // Already processed matches are placeholders now, so searching from the
      // top of the body again only finds matches that are still untouched.
      element = body.findText(searchText);
    }
  } finally {
    // Always restore the original word, even if linking failed half-way.
    body.replaceText(tempword, word);
  }
}

// Example invocation with the Cyrillic word 'февраля' and the target URL 'example.com'.
addLinks('февраля', 'example.com');

It works as it should if I change the Russian word 'февраля' to the English word 'february'.

// Same invocation with the Latin-script word 'february'; per the question, this variant works.
addLinks('february', 'example.com');

I need a regular expression, because if I just search for 'февраля' the script will also apply the link to other words such as 'февралям', 'февралями', etc. So the question is how to make it work. The error "Exception: Invalid regular expression pattern" occurs with this code:

// Attempt 1: lookbehind/lookahead around the word. This is reported to fail with
// "Invalid regular expression pattern" - presumably because the regex engine
// behind findText/replaceText (RE2) has no lookaround support; TODO confirm.
var searchText = "(?<=[\\s,.:;\"']|^)" + word + "(?=[\\s,.:;\"']|$)";

or this:

// Attempt 2: consume the leading boundary, look ahead for the trailing one.
// Reported to fail with the same error - presumably because of the `(?=...)`
// lookahead; TODO confirm. NOTE(review): inside a JavaScript string literal
// "\s" is just "s"; a regex whitespace class would have to be written "\\s".
var searchText = "(^|\s)" + word + "(?=\s|$)";

and some other.

CodePudding user response:

I think the following code does what is needed... At least in this situation.

/**
 * Links every standalone occurrence of a Cyrillic `word` to `siteurl`.
 *
 * Strategy: every occurrence of `word` (including those inside longer words)
 * is first replaced by a Latin placeholder. Each placeholder whose neighbouring
 * characters are not Cyrillic letters is then linked, and finally every
 * placeholder is replaced by `word` again.
 *
 * The neighbours are checked in JavaScript rather than inside the search
 * pattern, so occurrences at the very start or end of a text element are
 * handled and the linked range covers exactly the word (no boundary character).
 *
 * @param {string} word - Word to link; it is used as the search pattern as-is.
 * @param {string} siteurl - URL the linked text should point to.
 */
function addLinks(word, siteurl) {
  const id = 'doc\'s ID';
  const doc = DocumentApp.openById(id);
  const body = doc.getBody();
  const tempword = 'ASDFGFDSA';
  // Russian letters (incl. Ё/ё) plus the Ukrainian-only letters.
  const cyrillicLetter = /[А-Яа-яЁёІіЇїЄєҐґ]/;

  // Replace all matches of the Cyrillic word with the Latin placeholder,
  // without looking at word boundaries yet.
  body.replaceText(word, tempword);

  try {
    let element = body.findText(tempword);
    while (element) {
      const text = element.getElement().asText();
      const content = text.getText();
      const start = element.getStartOffset();
      const end = element.getEndOffsetInclusive();

      // charAt() yields '' outside the string, which counts as a boundary.
      const before = content.charAt(start - 1);
      const after = content.charAt(end + 1);
      if (!cyrillicLetter.test(before) && !cyrillicLetter.test(after)) {
        text.setLinkUrl(start, end, siteurl); // make it a clickable url
      }

      element = body.findText(tempword, element); // find next
    }
  } finally {
    // Change every placeholder back to the word, even if linking failed.
    body.replaceText(tempword, word);
  }
}
// Example invocation with the Cyrillic word 'февраля' and the target URL 'example.com'.
addLinks('февраля', 'example.com');

CodePudding user response:

Here is my solution:

/**
 * Entry point: calls addLinks for the word 'февралями' with the URL 'example.com'.
 */
function main() {
  addLinks('февралями', 'example.com');
}

/**
 * Links every standalone occurrence of `word` in the active document to `url`.
 *
 * Each paragraph is searched with four patterns that cover the word being
 * surrounded by boundary characters, sitting at the start of the paragraph,
 * sitting at the end of it, or being the whole paragraph. `start` / `end` say
 * how many boundary characters the match contains on each side, so they can be
 * excluded from the linked range.
 *
 * NOTE(review): the boundary characters are part of the match, so two
 * occurrences separated by a single character may not both be found - verify.
 *
 * @param {string} word - Word to link; it is inserted into the patterns as-is.
 * @param {string} url - URL the linked text should point to.
 */
function addLinks(word, url) {
  const doc = DocumentApp.getActiveDocument();
  const pgfs = doc.getBody().getParagraphs();
  // Any character that is not a Russian or Ukrainian letter.
  const bound = '[^А-яЁёІіЇїЄєҐґ]';

  const patterns = [
    {regex: bound + word + bound, start: 1, end: 1},
    {regex: '^'   + word + bound, start: 0, end: 1},
    {regex: bound + word + '$',   start: 1, end: 0},
    {regex: '^'   + word + '$',   start: 0, end: 0},
  ];

  for (const pgf of pgfs) {
    for (const pattern of patterns) {
      let location = pgf.findText(pattern.regex);
      while (location) {
        // Trim the matched boundary characters off the linked range.
        const start = location.getStartOffset() + pattern.start;
        const end = location.getEndOffsetInclusive() - pattern.end;
        pgf.editAsText().setLinkUrl(start, end, url);
        location = pgf.findText(pattern.regex, location);
      }
    }
  }
}

Test output:

enter image description here

  • Related