Correct truncation of URL by length-CodePudding

How can I truncate a percent-encoded URL to a maximum length without making it invalid?

/**
 * Truncates a percent-encoded URL to at most `maxlength` characters without
 * leaving a cut-off escape or a partial UTF-8 sequence at the end.
 *
 * The result is always a prefix of `url`. Provided `url` itself is validly
 * encoded, the result can be passed to `decodeURI` without throwing.
 *
 * @param {string} url - Percent-encoded URL.
 * @param {number} maxlength - Maximum length of the result, in characters.
 * @returns {string} The longest safe prefix of `url` that fits in `maxlength`.
 */
function truncateUrl(url, maxlength) {
  // Cutting through an escape leaves a dangling "%" or "%X"; drop it first.
  let truncated = url.substring(0, maxlength).replace(/%[0-9A-F]?$/i, '');

  // Look at the last UTF-8 lead byte (0xC0-0xFF) and the continuation bytes
  // (0x80-0xBF) that follow it up to the end of the string.
  const tail = /%([C-F][0-9A-F])((?:%[89AB][0-9A-F]){0,3})$/i.exec(truncated);
  if (tail !== null) {
    const leadByte = Number.parseInt(tail[1], 16);
    let expected = 1;
    if (leadByte >= 0xf0) {
      expected = 3;
    } else if (leadByte >= 0xe0) {
      expected = 2;
    }
    // Each continuation byte is written as three characters ("%XX").
    const present = tail[2].length / 3;
    if (present < expected) {
      // The character was cut in half: remove what is left of it.
      truncated = truncated.slice(0, tail.index);
    }
  }
  return truncated;
}

/**
 * Reports whether `url` can be decoded by `decodeURI`.
 *
 * @param {string} url - Percent-encoded URL to check.
 * @returns {string} `url` prefixed with "VALID: " when decoding succeeds,
 *   or with "INVALID: " when `decodeURI` throws.
 */
function checkUrl(url) {
  try {
    // Only the success or failure of decoding matters; the value is unused.
    decodeURI(url);
    return 'VALID: ' + url;
  } catch {
    return 'INVALID: ' + url;
  }
}

// The demo needs the percent-encoded form of the URL (136 characters), so that
// the cut-off lengths 130-136 used below fall inside the last encoded
// character. `var` is kept because this page declares `source` more than once.
var source = encodeURI('https://test.com/uslugi/remont-kvartir/?region_name=Москва и МоскоМ');

// Print the check result for every cut-off length from 136 down to 130.
for (let maxlength = 136; maxlength >= 130; maxlength -= 1) {
  console.log(checkUrl(truncateUrl(source, maxlength)));
}

How can I get rid of the invalid multibyte sequences so that the URL is valid in all cases?

CodePudding user response：

You do not need regex here. You should decode the URI encoded string, truncate it and encode it back. So all URLs will be valid.

/**
 * Truncates a percent-encoded URL by decoding its escapes, cutting on whole
 * characters, and encoding them back, so no escape or multi-byte character is
 * ever split.
 *
 * Only the escaped parts are decoded and re-encoded; literal URL delimiters
 * such as ":", "/", "?" and "=" are left alone. `maxlength` limits the length
 * of the encoded result. Escapes come back in normalized form (upper-case hex;
 * characters that `encodeURIComponent` does not escape are returned literally).
 *
 * @param {string} url - Percent-encoded URL.
 * @param {number} maxlength - Maximum length of the encoded result.
 * @returns {string} Encoded URL no longer than `maxlength`.
 * @throws {URIError} If `url` contains a malformed escape sequence.
 */
function truncateUrl(url, maxlength) {
  // A token is either a run of "%XX" escapes or one literal character
  // (the `u` flag keeps surrogate pairs together).
  const tokens = url.match(/(?:%[0-9A-F]{2})+|[\s\S]/giu) || [];
  let result = '';
  for (const token of tokens) {
    // Escapes: decode to characters, then re-encode one character at a time.
    // Literals: encodeURI escapes raw non-ASCII text but keeps URL delimiters.
    const pieces = token[0] === '%'
      ? Array.from(decodeURIComponent(token), (ch) => encodeURIComponent(ch))
      : [encodeURI(token)];
    for (const piece of pieces) {
      // Stop before the first character whose encoding would not fit.
      if (result.length + piece.length > maxlength) {
        return result;
      }
      result += piece;
    }
  }
  return result;
}

// Use the percent-encoded form of the URL (136 characters) so the cut-off
// lengths 130-136 in the demo actually shorten it. `var` is kept because this
// page declares `source` more than once.
var source = encodeURI('https://test.com/uslugi/remont-kvartir/?region_name=Москва и МоскоМ');

/**
 * Labels a URL as valid or invalid depending on whether `decodeURI` accepts it.
 *
 * @param {string} url - Percent-encoded URL to check.
 * @returns {string} "VALID: " followed by `url` if it decodes,
 *   otherwise "INVALID: " followed by `url`.
 */
function checkUrl(url) {
  let label = 'VALID: ';
  try {
    // The decoded value is not needed; a throw means the URL is malformed.
    decodeURI(url);
  } catch {
    label = 'INVALID: ';
  }
  return label + url;
}

// Run the check once per cut-off length, longest first.
[136, 135, 134, 133, 132, 131, 130].forEach((maxlength) => {
  console.log(checkUrl(truncateUrl(source, maxlength)));
});

CodePudding user response：

I used Wikipedia to find out about the encoding of UTF-8 and converted it to a regex:

((?:(?:%[0-7][0-9A-F])|(?:%[CD][0-9A-F]%[89AB][0-9A-F])|(?:%E[0-9A-F](?:%[89AB][0-9A-F]){2})|(?:%F[0-7](?:%[89AB][0-9A-F]){3}))+)([%0-9A-F]*)$

While I did not manage to match just the invalid part, I matched both: the valid and the invalid part at the end of the URL. The valid part is in group 1, the invalid in group 2. So if you additionally cut off the number of characters in group 2 of the match, you should be left with a valid encoded URI.

UPDATE: with this version you get the complete valid part in group 1:

^(https?:\/\/(?:(?:[a-z0-9.\/?_=-]+)|(?:%[0-7][0-9A-F])|(?:%[CD][0-9A-F]%[89AB][0-9A-F])|(?:%E[0-9A-F](?:%[89AB][0-9A-F]){2})|(?:%F[0-7](?:%[89AB][0-9A-F]){3}))+)([%0-9A-F]*)$

But take care: it does not really validate the URL as such. Also note that neither version handles a fragment part (#foobar) at the end of the URL.