Home > Blockchain >  Index in shortlex order
Index in shortlex order

Time:10-21

I have been given an n length string and I need to find its rank in the alphabetical ordering of all the n length strings. Like for example, let's say I have been given, "ABC" then as n=3 then ordering will be as {ABC, ABD, ABE ....} thus the rank of "ABC" is 0 (considering we start from 0). Similarly, for "ABD" it will be 1. Now, how do I find the rank of "ZXY"?

On first sight, it looks a recursive problem where we can create a dictionary for the given length of the string (like n=3 for "ZXY" and dict={ABC, ABD, ....}) and then search for the string in the dictionary and return the index as it dict is ordered. My problem is how do I code this up? and Is there a better way to go about this?

Edit: the listing in the question is the alphabetical listing of all n-length strings of letters from {A,...,Z} composed of distinct letters.

CodePudding user response:

Craig Estey already came up with a wonderful answer, but I thought I'd add to his permutation example. If you want to find the alphabetical place in which an n-length word goes, you don't have to increment the base 26 number.

Here is some code that finds the correct index in which the n-length word would go:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

/* Convert base 26 to base 10
 * Base 26 is represented here as A (0), B (1) ... Z (25)
 */
unsigned ts_to_ten(char *buf, size_t len)
{
    unsigned ret = 0;
    for (size_t i = 0; i < len; i  ) {
        ret *= 26;
        ret  = (buf[i] - 'a');
    }

    return ret;
}

int main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "fatal: requires 3 arguments.\n");
        exit(EXIT_FAILURE);
    }
    
    char *start_text = argv[1];
    char *end_text = argv[2];
    size_t n;

    assert((n = strlen(start_text)) == strlen(end_text));
    
    unsigned start = ts_to_ten(start_text, n);
    unsigned end = ts_to_ten(end_text, n);

    printf("%u\n", end - start);
}

This program takes in the starting word and the ending word. The starting word is the word that is considered the first iteration (or 0), or 'abc' in the question's case. The ending word would be the word you want to get the iteration of, or 'zxy' according to the question.

So, if you convert both words into a number and subtract the start from the end, you will obtain how far the end word needs to be from the start word in alphabetical order.

CodePudding user response:

Okay, as I mentioned above this is a "print all base 26 numbers that are n digits wide" problem.

Although we can work with strings (e.g. ABC, DEF, etc.) it is easier [IMO] to convert to base 26 numbers.

You want the digits in the number to be unique. That is, combinations rather than permutations of the digits.

Permutations is actually easier, so let's start with that ...


Here is a program that does brute force generation of all possible permutations. It is [heavily] annotated:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define sysfault(_fmt...) \
    do { \
        printf(_fmt); \
        exit(1); \
    } while (0)

int ndig;                           // number of digits in string

// the alphabet
#define ALFMAX      26
const char *alf = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

int opt_v;                          // verbose mode

// diginc -- increment the base 26 number (ndig digits wide)
// RETURNS: carry out
int
diginc(char *digits)
// digits -- base 26 number (ndig digits wide)
{
    int cout = 0;

    for (int idx = ndig - 1;  idx >= 0;  --idx) {
        int curdig = digits[idx];

        curdig  = 1;

        // did we overflow/wrap the digit?
        cout = (curdig >= ALFMAX);

        // no -- store new number and exit
        if (! cout) {
            digits[idx] = curdig;
            break;
        }

        // wrap digit back to zero
        digits[idx] = 0;
    }

    return cout;
}

// digdecode -- printable string (A-Z) to base 26 number (0-25)
void
digdecode(char *digits,const char *str)
// digits -- base 26 number (ndig digits wide)
// str -- string (A-Z)
{

    for (int idx = 0;  idx < ndig;    idx)
        digits[idx] = str[idx] - 'A';
}

// digprint -- convert base 26 number to printable form (e.g. A-Z)
// RETURNS: pointer to string
char *
digprint(const char *digits)
// digits -- base 26 number (ndig digits wide)
{
    int idx;
    static char str[1000];

    for (idx = 0;  idx < ndig;    idx)
        str[idx] = digits[idx]   'A';

    str[idx] = 0;

    return str;
}

// digmatch -- compare two base 26 numbers for equality
// RETURNS: 1=match, 0=mismatch
int
digmatch(const char *diglhs,const char *digrhs)
// diglhs -- base 26 number (ndig digits wide)
// digrhs -- base 26 number (ndig digits wide)
{
    int match = 0;

    for (int idx = 0;  idx < ndig;    idx) {
        match = (diglhs[idx] == digrhs[idx]);
        if (! match)
            break;
    }

    return match;
}

int
main(int argc,char **argv)
{

    --argc;
      argv;

    setlinebuf(stdout);

    // get options
    for (;  argc > 0;  --argc,   argv) {
        char *cp = *argv;
        if (*cp != '-')
            break;

        cp  = 2;
        switch (cp[-1]) {
        case 'v':
            opt_v = ! opt_v;
            break;
        }
    }

    // candidate string (e.g. ABD or ZYX)
    const char *str;

    ndig = -1;
    switch (argc) {
    case 2:
        ndig = atoi(argv[1]);
        // fall through

    case 1:
        str = argv[0];
        break;

    default:
        sysfault("usage: <str> [n]\n");
        break;
    }

    // number of digits in string
    if (ndig < 0)
        ndig = strlen(str);

    printf("ndig=%d str='%s'\n",ndig,str);

    // convert string to base 26 digit vector
    char expdig[ndig];
    digdecode(expdig,str);

    // sequence index
    int seqidx = 0;

    // current base 26 number
    char curdig[ndig];

    // initialize to all zeros
    for (int idx = 0;  idx < ndig;    idx)
        curdig[idx] = 0;

    while (1) {
        // does the current digit vect match the expected/desired vector?
        int match = digmatch(curdig,expdig);

        // yes [or verbose mode] -- print the entry
        if (match || opt_v) {
            printf("d: %s",seqidx,digprint(curdig));

            if (match)
                printf(" MATCH");

            printf("\n");
        }

        // increment the base 26 number
        if (diginc(curdig))
            break;

          seqidx;
    }

    return 0;
}

Here is the output for ABD:

ndig=3 str='ABD'
        29: ABD MATCH

Here is the output for ZYX:

ndig=3 str='ZYX'
     17547: ZYX MATCH

Note although a combinatorics guru might have a better/faster algorithm, we can get combinations by calculating a frequency table and rejecting any numbers that have duplicate digits. So, this is a slight extension of the previous example:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define sysfault(_fmt...) \
    do { \
        printf(_fmt); \
        exit(1); \
    } while (0)

int ndig;                           // number of digits in string

// the alphabet
#define ALFMAX      26
const char *alf = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

int opt_u;                          // unique (combination) mode
int opt_v;                          // verbose mode

// freqtab -- create frequency table
// RETURNS: 1=duplicate detected
int
freqtab(const char *digits,int *freq)
{
    int dupflg = 0;

    for (int idx = 0;  idx < ndig;    idx) {
        int curdig = digits[idx];

        dupflg = freq[curdig];
        if (dupflg)
            break;

        freq[curdig]  = 1;
    }

    return dupflg;
}

// reject -- reject number if digits are duplicated
// RETURNS: 1=reject/dup, 0=unique
int
reject(const char *digits,int *freq)
{

    if (freq == NULL) {
        freq = alloca(sizeof(int) * ALFMAX);

        for (int idx = 0;  idx < ALFMAX;    idx)
            freq[idx] = 0;
    }

    // create frequency table
    int dupflg = freqtab(digits,freq);

    return dupflg;
}

// diginc -- increment the base 26 number (ndig digits wide)
// RETURNS: carry out
int
diginc(char *digits)
// digits -- base 26 number (ndig digits wide)
{
    int cout = 0;

    for (int idx = ndig - 1;  idx >= 0;  --idx) {
        int curdig = digits[idx];

        curdig  = 1;

        // did we overflow/wrap the digit?
        cout = (curdig >= ALFMAX);

        // no -- store new number and exit
        if (! cout) {
            digits[idx] = curdig;
            break;
        }

        // wrap digit back to zero
        digits[idx] = 0;
    }

    return cout;
}

// digdecode -- printable string (A-Z) to base 26 number (0-25)
void
digdecode(char *digits,const char *str)
// digits -- base 26 number (ndig digits wide)
// str -- string (A-Z)
{

    for (int idx = 0;  idx < ndig;    idx)
        digits[idx] = str[idx] - 'A';
}

// digprint -- convert base 26 number to printable form (e.g. A-Z)
// RETURNS: pointer to string
char *
digprint(const char *digits)
// digits -- base 26 number (ndig digits wide)
{
    int idx;
    static char str[1000];

    for (idx = 0;  idx < ndig;    idx)
        str[idx] = digits[idx]   'A';

    str[idx] = 0;

    return str;
}

// digmatch -- compare two base 26 numbers for equality
// RETURNS: 1=match, 0=mismatch
int
digmatch(const char *diglhs,const char *digrhs)
// diglhs -- base 26 number (ndig digits wide)
// digrhs -- base 26 number (ndig digits wide)
{
    int match = 0;

    for (int idx = 0;  idx < ndig;    idx) {
        match = (diglhs[idx] == digrhs[idx]);
        if (! match)
            break;
    }

    return match;
}

int
main(int argc,char **argv)
{

    --argc;
      argv;

    setlinebuf(stdout);

    // get options
    for (;  argc > 0;  --argc,   argv) {
        char *cp = *argv;
        if (*cp != '-')
            break;

        cp  = 2;
        switch (cp[-1]) {
        case 'u':
            opt_u = (*cp != 0) ? atoi(cp) : 1;
            break;

        case 'v':
            opt_v = (*cp != 0) ? atoi(cp) : 1;
            break;
        }
    }

    // candidate string (e.g. ABD or ZYX)
    const char *str;

    ndig = -1;
    switch (argc) {
    case 2:
        ndig = atoi(argv[1]);
        // fall through

    case 1:
        str = argv[0];
        break;

    default:
        sysfault("usage: <str> [n]\n");
        break;
    }

    // number of digits in string
    if (ndig < 0)
        ndig = strlen(str);

    printf("ndig=%d str='%s'\n",ndig,str);

    // convert string to base 26 digit vector
    char expdig[ndig];
    digdecode(expdig,str);

    // sequence index
    int seqidx = 0;

    // current base 26 number
    char curdig[ndig];

    // initialize to all zeros
    for (int idx = 0;  idx < ndig;    idx)
        curdig[idx] = 0;

    int rejflg = 0;
    int match = 0;

    while (1) {
        if (opt_u)
            rejflg = reject(curdig,NULL);

        // does the current digit vect match the expected/desired vector?
        if (rejflg)
            match = 0;
        else
            match = digmatch(curdig,expdig);

        // yes [or verbose mode] -- print the entry
        do {
            // don't print rejections
            if (opt_v < 2) {
                if (rejflg)
                    break;
            }

            // don't print unless we have a match or using verbose mode
            if (! (match || opt_v))
                break;

            printf("d: %s",seqidx,digprint(curdig));

            if (match)
                printf(" MATCH");

            if (rejflg)
                printf(" REJECT");

            printf("\n");
        } while (0);

        // increment the base 26 number
        if (diginc(curdig))
            break;

        // increment sequence index if (unique)
        seqidx  = (! rejflg) || (opt_v >= 2);
    }

    return 0;
}

Here is the output for -u ABD:

ndig=3 str='ABD'
         1: ABD MATCH

Here is the output for -u ZYX:

ndig=3 str='ZYX'
     15599: ZYX MATCH

Note that the above is a bit inefficient. That's because the frequency table is recalculated from scratch on each iteration.

Note that freqtab and reject take an extra freq argument. That's planning for the future.

If we pass down an actual array, with some modifications to the functions and two additional ones, we can quickly adjust the frequency table without having to create it from scratch each time.


UPDATE:

This looks pretty neat! Just a few questions, so as you said it is a bit inefficient but, the program seems to work when ndig if more than 5.

Yes, it will work but [as I said] it will be slow. I've coded up the faster version below.

Now, I know some C but can't really make complete sense how you've achieved it and my bad if this is intended behavior – only_ranch

There's a lot to unpack here. I've heavily annotated [commented] the source. But, still, it may take some time to digest.

This is particularly true if one is new to C coding. Some of the basic constructs (e.g. if, while, for, etc.) for both syntax and meaning may not [yet] be clear [at a glance].

This may take some extra study. (e.g.) Instead of looking at an if statement and "instantly" understanding it, you may have to take longer (e.g. a minute or two) to get it. You may have to go over things a few times.

Then, there is the algorithm itself. Again, it may take a few goings over to understand things.

Sometimes pencil and paper can be your best friend here ...

This is normal when just starting out.

It may seem difficult/intractible until you have your "Aha!" moment. And, then, understanding will "flood" in.

That's what it was like for me when I was just learning to code.

That's why I started with the simpler permutations version (that has duplicate chars). Then, a second version that does the combinations (where the digits are unique).

There's an old programming maxim [from https://en.wikipedia.org/wiki/The_Elements_of_Programming_Style]: Make it right before you make it faster.

Below is a version that has the "smarter" frequency table calculation. I had to:

  1. Create a struct for the frequency table
  2. Change the table creation function (renamed from freqtab to ftabnew).
  3. Add two functions: ftabadd and ftabsub to make incremental changes.
  4. Modify diginc to call ftabadd/ftabsub as needed
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#define sysfault(_fmt...) \
    do { \
        printf(_fmt); \
        exit(1); \
    } while (0)

int ndig;                           // number of digits in string

// the alphabet
#define ALFMAX      26

int opt_u;                          // unique (combination) mode
int opt_v;                          // verbose mode

// frequency table
typedef struct {
    char ftab_duptot;               // total number of dup _cells_ in ftab_freq
    char ftab_freq[ALFMAX];         // frequency counts
} ftab_t;

// ftabadd -- add to frequency table
// RETURNS: 1=duplicate detected
int
ftabadd(int curdig,ftab_t *ftab)
// curdig -- digit to add/remove from table
// ftab -- pointer to frequency table
// inc --  1=increment, -1=decrement
{
    int cntold = ftab->ftab_freq[curdig];

    // bump up the bucket count
    ftab->ftab_freq[curdig] = cntold   1;

    // this bucket just became a dup
    if (cntold == 1)
        ftab->ftab_duptot  = 1;

    return ftab->ftab_duptot;
}

// ftabsub -- subtract from frequency table
// RETURNS: 1=duplicate detected
int
ftabsub(int curdig,ftab_t *ftab)
// curdig -- digit to add/remove from table
// ftab -- pointer to frequency table
// inc --  1=increment, -1=decrement
{
    int cntold = ftab->ftab_freq[curdig];

    // bump up the bucket count
    ftab->ftab_freq[curdig] = cntold - 1;

    // this bucket just became unique [again]
    if (cntold == 2)
        ftab->ftab_duptot -= 1;

    return ftab->ftab_duptot;
}

// ftabnew -- create frequency table
// RETURNS: 1=duplicate detected
int
ftabnew(const char *digits,ftab_t *ftab)
{

    if (ftab == NULL)
        ftab = alloca(sizeof(*ftab));

    // zero out the counts
    ftab->ftab_duptot = 0;
    for (int idx = 0;  idx < ALFMAX;    idx)
        ftab->ftab_freq[idx] = 0;

    for (int idx = 0;  idx < ndig;    idx) {
        int curdig = digits[idx];
        ftabadd(curdig,ftab);
    }

    return ftab->ftab_duptot;
}

// diginc -- increment the base 26 number (ndig digits wide)
// RETURNS: carry out
int
diginc(char *digits,ftab_t *ftab)
// digits -- base 26 number (ndig digits wide)
{
    int cout = 0;

    for (int idx = ndig - 1;  idx >= 0;  --idx) {
        int digold = digits[idx];

        // remove value from frequency table
        ftabsub(digold,ftab);

        // get new digit value
        int dignew = digold   1;

        // did we overflow/wrap the digit?
        cout = (dignew >= ALFMAX);

        // no -- store new number and exit
        if (! cout) {
            digits[idx] = dignew;
            ftabadd(dignew,ftab);
            break;
        }

        // wrap digit back to zero
        dignew = 0;
        digits[idx] = dignew;
        ftabadd(dignew,ftab);
    }

    return cout;
}

// digdecode -- printable string (A-Z) to base 26 number (0-25)
void
digdecode(char *digits,const char *str)
// digits -- base 26 number (ndig digits wide)
// str -- string (A-Z)
{

    for (int idx = 0;  idx < ndig;    idx)
        digits[idx] = toupper(str[idx]) - 'A';
}

// digprint -- convert base 26 number to printable form (e.g. A-Z)
// RETURNS: pointer to string
char *
digprint(const char *digits)
// digits -- base 26 number (ndig digits wide)
{
    int idx;
    static char str[1000];

    for (idx = 0;  idx < ndig;    idx)
        str[idx] = digits[idx]   'A';

    str[idx] = 0;

    return str;
}

// digmatch -- compare two base 26 numbers for equality
// RETURNS: 1=match, 0=mismatch
int
digmatch(const char *diglhs,const char *digrhs)
// diglhs -- base 26 number (ndig digits wide)
// digrhs -- base 26 number (ndig digits wide)
{
    int match = 0;

    for (int idx = 0;  idx < ndig;    idx) {
        match = (diglhs[idx] == digrhs[idx]);
        if (! match)
            break;
    }

    return match;
}

int
main(int argc,char **argv)
{

    --argc;
      argv;

    setlinebuf(stdout);

    // get options
    for (;  argc > 0;  --argc,   argv) {
        char *cp = *argv;
        if (*cp != '-')
            break;

        cp  = 2;
        switch (cp[-1]) {
        case 'u':
            opt_u = (*cp != 0) ? atoi(cp) : 1;
            break;

        case 'v':
            opt_v = (*cp != 0) ? atoi(cp) : 1;
            break;
        }
    }

    // candidate string (e.g. ABD or ZYX)
    const char *str;

    ndig = -1;
    switch (argc) {
    case 2:
        ndig = atoi(argv[1]);
        // fall through

    case 1:
        str = argv[0];
        break;

    default:
        sysfault("usage: <str> [n]\n");
        break;
    }

    // number of digits in string
    if (ndig < 0)
        ndig = strlen(str);

    printf("ndig=%d str='%s'\n",ndig,str);

    // convert string to base 26 digit vector
    char expdig[ndig];
    digdecode(expdig,str);

    // check expected string for dups
    if (ftabnew(expdig,NULL))
        sysfault("main: str has duplicate chars\n");

    // sequence index
    int seqidx = 0;

    // current base 26 number
    char curdig[ndig];

    // initialize to all zeros
    for (int idx = 0;  idx < ndig;    idx)
        curdig[idx] = 0;

    // create frequency table
    // NOTE: we _will_ start with dups
    ftab_t *ftab = alloca(sizeof(*ftab));
    ftabnew(curdig,ftab);

    int rejflg = 0;
    int match = 0;

    while (1) {
        if (opt_u)
            rejflg = ftab->ftab_duptot;

        // does the current digit vect match the expected/desired vector?
        if (rejflg)
            match = 0;
        else
            match = digmatch(curdig,expdig);

        // yes [or verbose mode] -- print the entry
        do {
            // don't print rejections
            if (opt_v < 2) {
                if (rejflg)
                    break;
            }

            // don't print unless we have a match or using verbose mode
            if (! (match || opt_v))
                break;

            printf("d: %s",seqidx,digprint(curdig));

            if (match)
                printf(" MATCH");

            if (rejflg)
                printf(" REJECT");

            printf("\n");
        } while (0);

        // increment the base 26 number
        if (diginc(curdig,ftab))
            break;

        // increment sequence index if (unique)
        seqidx  = (! rejflg) || (opt_v >= 2);
    }

    return 0;
}

CodePudding user response:

Combinatorics problems usually end up dealing with enormous numbers, and the solution to a question which involves counting combinatoric objects rarely if ever involves enumerating every possible object and counting as you go. Much less will it involve enumerating a superset of possible objects, and testing to see if each one qualifies. Brute force solutions like that are probably easier to come up with, and they will work for very small problems, but when you increase the problem size even by a little bit, you rapidly find yourself running out of resources.

For example, I compiled a solution from another answer to this question with optimisation level 3, and slowly increased the size of the string to index:

n String Count User time (version 1) User time (version 2)
3 ZYX 15599 0m0.003s 0m0.003s
4 ZYXW 358799 0m0.021s 0m0.015s
5 ZYXWV 7893599 0m0.188s 0m0.068s
6 ZYXWVU 165765599 0m4.973s 0m1.630s
7 ZYXWVUT 3315311999 2m13.501s 0m41.631s

(Note: I changed one variable from int to unsigned long long in order to avoid arithmetic overflow in the last invocation. int is only guaranteed to represent integers up to 32767, so it's rarely useful in combinatorics.)

So if you want to solve even moderately-sized problems, you need a solution which computes the index rather than counting up to it. In this case, there is a fairly simple computation, which is quadratic in the length of the string (i.e., O(n²)). That's not usually considered a great time complexity, but it's certainly sufficient to expand the range of solutions to this problem at least to the point where you would need a bignum to represent the count. n=15 is the largest value possible without bignum support, and the code below computes an index of that length in much less than a millisecond.

It starts with the fairly straight-forward observation that if you have some set Σ containing t distinct characters (perhaps a subset of the set of capital letters), the number of distinct strings you can form with n of those characters is the product of the first n terms in the sequence t, t-1, t-2, …. That's because there are t possibilities for the first character in the sequence, and then t-1 possibilities for the second character, because the first character chosen cannot be reused, and then t-2 possibilities for the third character, because the first two characters chosen cannot be reused, and so on until you have n characters. (If n=t, then this works out to t!, which is the number of permutations of t characters. But multiplying the first n terms works for any value of n<t. (For n=0, we use the convention that the product of an empty sequence is 1.) So if we have only the set { A, J, Q, W } and we want to know how many sequences of two distinct characters are possible, we compute 4*3=12. And it's easy to see that that is correct, because we can just list the possible pairs: there are three pairs starting with each of the four characters.

But we want to know what the index of some particular sequence of letters is. Call that sequence s, and again, we assume that we have a set Σ of t usable characters, and that the sequence s is of length n. (We require that all the characters in s be present in Σ.) The question then is, how many possible sequences are (lexicographically) less than s. We can work that out in pieces, by adding:

  • All the sequences which start with a letter in Σ smaller than the first letter of s, and

  • If n is greater than 1, all the sequences which start with the first letter of s and continue with n-1 distinct characters chosen from Σ but excluding the first letter of s. (If n is 1, then there are no more sequences to add.)

Note that the first part of that sum is just the number of smaller letters (which we have to count) times the number of unrestricted sequences of length n-1 from an alphabet of size t-1, which we already know how to do. Furthermore, the second part of that sum can be computed using the same procedure, with a smaller set of characters and a shorter sequence length. So we could easily write a recursive function to do this computation, but it will turn out to be more efficient to write it in dynamic programming style, working from the end. That is, we count the number of suffixes of each length which are (lexicographically) less than the corresponding suffix of s. For each suffix length, we use the set of characters excluding the initial characters of s up to that suffix.

As an example, let's compute the index of BED in sequences composed using the letters ABCDE. We start at the end of BED (the suffix D), and compute:

  • Using the alphabet ACD and the suffix D, the number of sequences of length 1 starting with a letter less than D. That's 2, the number of letters in the alphabet which are less than D, since the single letter is a sequence of length 1.

  • Using the alphabet ACDE and the suffix ED, the number of sequences of length two starting with a letter less than E. For each of the three such letters (B isn't in the alphabet), we need the number of continuation sequences of length 1 out of an alphabet of size 3 (ACDE minus the symbol we chose for the start). So that's 9.

  • Using the full alphabet ABCDE and the full suffix BED, the number of sequence of length three starting with a letter less than B. There's one such letter, and there are 4×3 continuation sequences, for a total of 12.

So we compute the index of BED as 2 9 12 = 23. (That can be verified by writing out the 60 possible sequences of length three. But if you have Python handy, you can also do a quick check:

>>> sum(c < tuple('BED') for c in permutations('ABCDE', 3))
23

The annoying part of that computation is figuring out the multiplier at each step: how many characters less than the first suffix character are there in the current alphabet (which is the entire alphabet other than the characters in the prefix). The following code does that by scanning the entire prefix and counting the number of characters less than the first suffix character. That's what leads to quadratic time complexity, because we're repeatedly scanning over the initial part of the sequence.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Define the alphabet, which must be a set of contiguous letters. */
#define ALPH_MIN 'A'
#define ALPH_MAX 'Z'
#define ALPH_COUNT (ALPH_MAX - ALPH_MIN   1)

/* This function is called exactly once, to verify that the
 * provided string only contains letters in the alphabet, and
 * that it contains no repeated letters. The repeat check depends
 * on ALPH_COUNT being no greater than the number of bits in a
 * long (which is at least 32); it would need to be fixed for
 * larger alphabets.
 */
bool validate(const char* s) {
  unsigned long seen = 0;
  for (; *s;   s) {
    if (*s < ALPH_MIN) return false;
    if (*s > ALPH_MAX) return false;
    unsigned long flag = 1UL << (*s - ALPH_MIN);
    if (seen & flag) return false;
    seen |= flag;
  }
  return true;
}

/* Assumes that validate(s) returns true */
unsigned long long perm_index(const char* s) {
  /* The "rising factorial" (see Wikipedia). */
  unsigned long long rising_fact = 1;
  unsigned long long cumulative_index = 0;
  size_t len = strlen(s);
  while (len > 0) {
    unsigned c = s[--len];
    unsigned avail = c - ALPH_MIN;
    for (unsigned i = 0; i < len;   i)
      if (s[i] < c) --avail;
    cumulative_index  = avail * rising_fact;
    rising_fact *= ALPH_COUNT - len;
  }
  return cumulative_index;
}

int main(int argc, char* argv[]) {
  for (int i = 1; i < argc;   i) {
    const char* s = argv[i];
    if (!validate(s))
      printf("Invalid string: %s\n", s);
    else
      printf("s='%s', n=%zu, i=%llu\n", s, strlen(s), perm_index(s));
  }
  return 0;
}
  • Related