Home > Software design >  How to split string into array of words, and also get their ranges?
How to split string into array of words, and also get their ranges?

Time:03-11

I'm using components(separatedBy:) to separate a string into an array of words (using a space " " as the separator).

// Baseline approach: split on the single-space separator. As the result comment
// shows, punctuation stays attached to its word ("string.").
let string = "This is a string."
let words = string.components(separatedBy: " ")
print(words) // Result: ["This", "is", "a", "string."]

This works fine, but I also want to get each word's range/index in the parent string. Here's my current brute-force attempt. It kind of works, but there's an extra space after some of the words and I don't know how it performs when applied to other strings.

// Same input, but split with the custom `words(separatedBy:)` extension defined
// further down, which returns `Word` values (text plus integer range).
let string = "This is a string."
let words = string.words(separatedBy: " ")
print(words)

/* Result:
 [
     Word(range: Range(0..<4), component: "This "),
     Word(range: Range(5..<7), component: "is "),
     Word(range: Range(8..<9), component: "a "),
     Word(range: Range(10..<16), component: "string."),
 ]
 Note: every component except the last still carries its trailing separator,
 and the last range (10..<16) covers six offsets although "string." has seven
 characters.
 */


/// ...

/// A piece of a parent string together with its position in that string.
struct Word {
    /// The component's range in the parent string, expressed as integer
    /// character offsets from the start of that string.
    var range: Range<Int>
    /// The characters collected for this word. When the word was ended by a
    /// separator, that separator character is included at the end.
    var component: String
}

extension String {
    /// Splits the string into `Word`s by walking it character by character.
    ///
    /// A word is closed when the current character equals `separator` or when
    /// the last index of the string is reached.
    ///
    /// Known limitations of this brute-force approach (kept as-is on purpose):
    /// - The separator character is appended to the component before the word
    ///   is closed, so components end with the separator (e.g. `"This "`).
    /// - The closing range ends at the index of the closing character
    ///   (exclusive), so a word closed by the end of the string has a range
    ///   that is one short of its component.
    /// - A word that starts on the very last character is created but never
    ///   appended, because the `else` branch does not close words.
    ///
    /// - Parameter separator: The single-character string that ends a word.
    ///   It is compared against one character at a time, so a multi-character
    ///   separator never matches.
    /// - Returns: The collected words in order of appearance.
    func words(separatedBy separator: String) -> [Word] {
        var words = [Word]()
        var currentWord: Word? /// the current word that the loop pieces together
        for stringIndex in indices {
            /// get the index as an `Int` (offset from the start of the string)
            let index = distance(from: startIndex, to: stringIndex)
            let letter = String(self[stringIndex])

            if var word = currentWord {
                word.component.append(letter)
                currentWord = word

                /// check if the letter is the separator, or if this index is the last one
                if letter == separator || stringIndex == indices.last {
                    /// keep the start recorded when the word was opened; end at the current offset
                    word.range = word.range.startIndex..<index
                    words.append(word)
                    currentWord = nil
                }
            } else {
                /// no word in progress: open one that initially spans just this character
                currentWord = Word(range: index..<index + 1, component: letter)
            }
        }
        return words
    }
}

Is there a built-in function that not only splits a string into an array, but also returns the ranges? If not, is there something I can use that is less brute-force?

CodePudding user response:

You can use `enumerateSubstrings(in:options:)` and pass the `.byWords` option:

extension StringProtocol {
    /// The words of the receiver, each paired with the range it occupies.
    ///
    /// Substrings are enumerated over the whole receiver with the `.byWords`
    /// option; every non-`nil` substring is recorded together with the range
    /// reported for it, in enumeration order.
    var wordsAndRanges: [(word: String, range: Range<Index>)] {
        var pairs: [(word: String, range: Range<Index>)] = []
        enumerateSubstrings(in: startIndex..<endIndex, options: .byWords) { substring, substringRange, _, _ in
            // The callback's substring is optional; entries without text are skipped.
            if let text = substring {
                pairs.append((word: text, range: substringRange))
            }
        }
        return pairs
    }
}

let string = "This is a string."
// Each element is a (word, range) tuple, destructured in the loop header.
for (word, range) in string.wordsAndRanges {
    print("word:", word)
    // Slice the original string with the returned range.
    print("substring:", string[range])
    print("range:", range)
}

Or using a Word struct as you tried:

/// A word found in a string, paired with its location in that string.
struct Word {
    /// Where the word sits in the string it was taken from.
    let range: Range<String.Index>
    /// The text of the word.
    let component: String
}

extension StringProtocol {
    /// The words of the receiver as `Word` values.
    ///
    /// Substrings are enumerated over the whole receiver with the `.byWords`
    /// option; every non-`nil` substring becomes a `Word` holding the text and
    /// the range reported for it, in enumeration order.
    var words: [Word] {
        var collected: [Word] = []
        enumerateSubstrings(in: startIndex..<endIndex, options: .byWords) { substring, substringRange, _, _ in
            // The callback's substring is optional; entries without text are skipped.
            if let text = substring {
                collected.append(Word(range: substringRange, component: text))
            }
        }
        return collected
    }
}

let string = "This is a string."
// Iterate the `Word` values produced by the `words` property.
for word in string.words {
    print("word:", word.component)
    // Slice the original string with the range stored on the word.
    print("subsequence", string[word.range])
    print("range", word.range)
}
  • Related