Home > Back-end >  How can I split a long continuous string into an array of the words it contains?
How can I split a long continuous string into an array of the words it contains?

Time:12-02

I have a long continuous string that looks something like this:

let myString = "onetwothreefourfivesixseveneightnineteneleventwelvethirteenfourteen";

It does not have any separators to easily target.
So how can I iterate over it and split the words so it ends up like:

splitString = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "eleven", "twelve", "thirteen", "fourteen"];

Preferably with JavaScript.

CodePudding user response:

The problem here is the lack of separators as you have mentioned - this makes it impossible for the software to know where the words begin and end.

Given that you know the words that will show up, my approach would be as follows:

NOTE: This does not take into account the possibility of overlapping words and assumes none of the words are possible subsets of other words...

  1. Iterate the known words
  2. Search (indexOf) the string for each known word and note down its positions in the string
  3. Sort the values by their index
  4. Generate an array with the values contained in the order found

/**
 * This assumes that:
 *  - Input words are not subsets of other input words
 */

/**
 * Find every non-overlapping occurrence of a word inside a string.
 *
 * @param {string} inputString - The string to search in.
 * @param {string} inputWord - The word to search for.
 * @returns {{index: number, word: string}[]} One record per occurrence, in
 *   ascending index order; empty when the word does not occur.
 */
function findAll(inputString, inputWord) {
    const indices = [];
    // An empty word "matches" at every position without moving the cursor,
    // which would loop forever; treat it as having no occurrences.
    if (inputWord.length === 0) return indices;

    let index = 0;
    while (index < inputString.length) {
        index = inputString.indexOf(inputWord, index);
        if (index === -1) break; // -1 means not found, so stop searching
        indices.push({ index, word: inputWord });
        // Continue searching right after the match that was just recorded.
        index += inputWord.length;
    }
    return indices;
}

/**
 * Gather the position records of every known word into a single flat array.
 *
 * @param {string} inputString - The string to search in.
 * @param {Iterable<string>} inputWords - The words to look for.
 * @returns {Array} The concatenated results of findAll for each word,
 *   grouped in the order the words were given (not sorted by position).
 */
function splitWords(inputString, inputWords) {
    return [...inputWords].reduce(
        (collected, word) => collected.concat(findAll(inputString, word)),
        [],
    );
}

/**
 * Order found-word records by their position and return only the words.
 *
 * The input array is copied before sorting, so the caller's array keeps its
 * original order.
 *
 * @param {{index: number, word: string}[]} inputArr - Records to order.
 * @returns {string[]} The words, sorted by ascending index.
 */
const orderWords = (inputArr) =>
    [...inputArr]
        .sort((a, b) => a.index - b.index)
        .map(({ word }) => word);

/**
 * Example: split a known sentence using the list of words it is built from.
 */
const myString = "onetwothreefourfivesixseveneightnineteneleventwelvethirteenfourteen";
const inputWords = [
    "one",
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine",
    "ten",
    "eleven",
    "twelve",
    "thirteen",
    "fourteen",
];

// First collect every occurrence, then put the occurrences in reading order.
const result = splitWords(myString, inputWords);
const ordered = orderWords(result);

console.dir(ordered);

/**
 * Result:
    [
    'one',      'two',
    'three',    'four',
    'five',     'six',
    'seven',    'eight',
    'nine',     'ten',
    'eleven',   'twelve',
    'thirteen', 'four',
    'fourteen'
    ]
 */

CodePudding user response:

If, as you said in the comments, you know the expected words, then create an array of these words and loop through your string to find them.

Note that the code below takes the length of the matched words into account, so that it can find longer words such as "one hundred eighty five"; otherwise the search would stop as soon as it finds "one".

you can read the comments in the code to better understand it

// The merged input string that should be separated into words.
const myString =
    "onetwothreefourfivesixseveneightnineteneleventwelvethirteenfourteentwentyfiveonehundredeightyfiveeightyfive";

// The list of expected words. Entries may contain spaces; they are matched
// against the merged string with those spaces removed.
const possibleWords = [
    "one",
    "two",
    "three",
    "four",
    "five",
    "six",
    "seven",
    "eight",
    "nine",
    "ten",
    "eleven",
    "twelve",
    "thirteen",
    "fourteen",
    "twenty five",
    "one hundred eighty five",
    "eighty five",
];


/**
 * Split a string that has no separators into the known words it is made of.
 *
 * At each position the longest expected word that matches is taken, so that
 * "one hundred eighty five" wins over "one". When two expected words have the
 * same length once their spaces are removed, the one listed first wins.
 * Characters at which no expected word starts are skipped.
 *
 * @param {string} mergedString - The string without separators.
 * @param {string[]} possibleWords - The expected words. Entries may contain
 *   spaces; they are matched with the spaces removed but returned as given.
 * @returns {string[]} The matched words in the order they appear.
 */
function separateString(mergedString, possibleWords) {
    // Strip the spaces once up front instead of on every comparison. Entries
    // that are empty after stripping can never match and are dropped.
    const candidates = possibleWords
        .map((word) => ({ word, compact: word.replace(/ /g, '') }))
        .filter(({ compact }) => compact.length > 0);

    // The resulting array that holds all the separated words
    const result = [];

    let position = 0;
    while (position < mergedString.length) {
        // Longest candidate that starts at the current position, if any
        let best = null;
        for (const candidate of candidates) {
            // Compare lengths without spaces, because that is what was
            // actually consumed from the merged string.
            const isLonger = best === null || candidate.compact.length > best.compact.length;
            if (isLonger && mergedString.startsWith(candidate.compact, position)) {
                best = candidate;
            }
        }

        if (best === null) {
            // No expected word starts here; move on to the next letter
            position += 1;
        } else {
            result.push(best.word);
            // Resume right after the word that was just matched
            position += best.compact.length;
        }
    }

    return result;
}

// Print the array of words that separateString returns for myString and possibleWords
console.log(separateString(myString, possibleWords));
<iframe name="sif1" sandbox="allow-forms allow-modals allow-scripts" frameborder="0"></iframe>

  • Related