Home > OS >  Longest word in file
Longest word in file

Time:02-10

My program needs to print the longest word in a file that contains only letters.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
/*
 * Checks whether a string is made up exclusively of letters and blanks.
 *
 * s: NUL-terminated string to inspect.
 *
 * Returns 1 if every character of s satisfies isalpha() or isblank()
 * (an empty string therefore yields 1), and 0 otherwise.
 */
int checkString(const char s[]) {
  unsigned char c;
  /* Going through unsigned char keeps the <ctype.h> calls well defined for
     bytes whose value would be negative as a plain char. */
  while ((c = (unsigned char)*s) != '\0' && (isalpha(c) || isblank(c)))
    s++;  /* advance to the next character; without this the loop never ends */
  /* The scan stopped either at the terminator (all characters accepted) or
     at the first rejected character. */
  return *s == '\0';
}
/*
 * Reads document.txt line by line, splits each line into tokens with
 * strtok(), keeps the longest token seen so far, and finally prints that
 * token only if checkString() accepts it.
 */
int main() {
  char file_name[]="document.txt";
  /* NOTE(review): the result of fopen() is not checked before fp is used. */
  FILE *fp = fopen(file_name, "r");
  /* NOTE(review): `str` is not declared anywhere in this listing; presumably
     a separate line buffer (e.g. char str[1000]) was intended -- confirm. */
  char *largest = str;
  int largest_len = 0;
  /* NOTE(review): file_name holds only 13 bytes ("document.txt" plus the
     terminator), yet fgets() is allowed to write up to 1000 bytes into it. */
  while (fgets(file_name, 1000, fp) != NULL) {
    /* The first token of each line is delimited by spaces only. */
    char *temp = strtok(file_name, " ");
    while (temp != NULL) {
      if (strlen(temp) > largest_len) {
        strcpy(largest, temp);
        largest_len = strlen(largest);
      }
      /* Subsequent tokens are also split on '"', '.', ',' and the digits
         1, 2 and 4-9 (0 and 3 are not part of the delimiter set). */
      temp = strtok(NULL, "\",.,1,2,4,5,6,7,8,9 ");
    }
  }
  /* Only the single longest token is validated here, so nothing is printed
     when checkString() rejects it; shorter tokens are never reconsidered. */
  if(checkString(largest))
  printf("%s", largest);
  fclose(fp);
  return 0;
}
  • In my code, the largest word is printed only if it contains nothing but letters. How can I modify this code so that it checks the next words when the largest one doesn't contain only letters?

CodePudding user response:

First of all, you cannot store the pointer to longest word like that. You re-use str for the next line and so the pointer is not likely to point to something useful.

Second, while strtok() appears simple, initially, I tend to apply a straightforward approach to a straightforward problem. The problem is O(n) (where n is the length of the document). You just need to go through it character by character. Of course, since every line is ended by a \n, you can use the line based approach in this case.

So, instead of strtok, simply check whether each character is a legal word character (an alphabetic character, that is). You can easily do so with the standard library function isalpha() from the header ctype.h.

Below is the program, copying the longest string into a dedicated buffer, using isalpha() and doing the line based reading of the file, just like the code in the original question did.

Of course, this code assumes that no line is ever longer than 999 characters.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <ctype.h>

/*
 * Measures the run of alphabetic characters that starts at a given index.
 *
 * line:   buffer to scan.
 * istart: index of the first character to examine.
 * len:    number of valid characters in line; indices >= len are never read.
 *
 * Returns the number of consecutive characters, beginning at istart, for
 * which isalpha() is true (0 if line[istart] is not a letter or if
 * istart >= len).
 */
static size_t gulp(const char* line, size_t istart, size_t len) {
  size_t n = 0;
  /* i walks the buffer while n counts the accepted characters; both must
     advance together on every iteration. */
  for (size_t i = istart; i < len; i++, n++) {
    /* <ctype.h> functions require a value representable as unsigned char. */
    if (!isalpha((unsigned char)line[i])) {
      break;
    }
  }
  return n;
}

/*
 * Prints the longest run of alphabetic characters found in document.txt.
 *
 * The file is read in chunks of at most 999 characters via fgets(); a word
 * that straddles two chunks is therefore seen as two shorter words.
 * When several words share the maximum length, the first one wins.
 *
 * Returns 0 on success (even if the file contains no word at all) and
 * EXIT_FAILURE if the file cannot be opened.
 */
int main(int argc, const char * argv[]) {
  /* The command line is not used; the file name is fixed. */
  (void)argc;
  (void)argv;

  FILE* f = fopen("document.txt","r");
  if (f == NULL) {
    perror("document.txt");
    return EXIT_FAILURE;
  }

  char line[1000];
  char longest_word[1000] = "";
  size_t longest_word_length = 0;
  while (fgets(line, sizeof(line), f) != NULL) {
    size_t i0 = 0;
    size_t line_length = strlen(line);
    while (i0 < line_length) {
      if (isalpha((unsigned char)line[i0])) {
        size_t n = gulp(line, i0, line_length);
        if (n > longest_word_length) {
          /* n <= line_length <= 999, so the copy and the terminator both
             fit into longest_word. */
          memcpy(longest_word, &line[i0], n);
          longest_word[n] = '\0';
          longest_word_length = n;
        }
        /* Skip the whole word; always move forward by at least one
           character so the scan cannot stall. */
        i0 += (n > 0) ? n : 1;
      } else {
        i0++;
      }
    }
  }
  fclose(f);
  f = NULL;
  if (longest_word_length > 0) {
    /* %zu is the conversion that matches size_t. */
    printf("longest word: %s (%zu characters)\n",
       longest_word, longest_word_length);
  }
  return 0;
}

CodePudding user response:

There are a number of problems here:

  • you use the same buffer (str) for two different purposes: as a read buffer and to store the longest word. If you find the largest word in the first line, the word will be erased when reading the second line. Furthermore, if you find a rather long word at the beginning of a line, the strings pointed to by largest and temp could overlap, which leads to undefined behaviour => use a different array or strdup (and free) for largest
  • you only use the space as possible separator. You should wonder whether you should add tab and/or punctuations
  • once you have got a word you should ensure that it only contains valid letters before testing its length and ignore it if for example it contains digits.
  • if a single line can be longer than 1000 characters, you should prepend the end of the current part to the beginning of the next one, to handle the case where a long word would be split there.

For additional corner case processing, you should specify what to do if a word contains illegal characters but only at one side. For example, if . is not used as a word delimiter, a word with an embedded . like "a.b" should be ignored, but a terminating . should only be stripped (like "example." should become "example").

CodePudding user response:

I think the order you do things should be a bit different, here is an example

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

int isCandidate(char* word);

/*
 * Prints the longest whitespace-separated token of a text file that
 * consists solely of letters.
 *
 * argv[1] names the file to scan. The whole file is loaded into memory and
 * tokenised with strtok(); each token is vetted with isCandidate() before
 * its length is compared. When several tokens share the maximum length,
 * the first one wins. If no token qualifies, an empty word with length 0
 * is reported.
 *
 * Returns 0 on success and EXIT_FAILURE on a usage, I/O or allocation error.
 */
int main(int argc, char* argv[])
{
  if (argc < 2)
  {
    // Not an errno-style failure, so perror() would print a bogus reason.
    fprintf(stderr, "not enough command line arguments, expecting a filename\n");
    return EXIT_FAILURE;
  }
  const char* path = argv[1];
  FILE* fp = fopen(path, "r");
  if (fp == NULL)
  {
    perror(path);
    return EXIT_FAILURE;
  }

  // Everything released at the cleanup label is declared before the first
  // goto so that it always holds a defined value there.
  static const char filter[] = " \t\n\r";
  int status = EXIT_FAILURE;
  char* buffer = NULL;
  char* longestWord = NULL;
  size_t bytesRead = 0;
  long unsigned int maxLength = 0;

  // get size of file
  long fileLength = -1;
  if (fseek(fp, 0L, SEEK_END) == 0)
  {
    fileLength = ftell(fp);
  }
  if (fileLength < 0)
  {
    perror(path);
    goto cleanup;
  }
  if (fileLength == 0)
  {
    fprintf(stderr, "file is empty\n");
    goto cleanup;
  }
  if (fseek(fp, 0L, SEEK_SET) != 0) // position file pointer at the beginning again
  {
    perror(path);
    goto cleanup;
  }

  // allocate space for the whole file and then read it in
  // for a text file it should be OK to do so since they
  // normally are not that large.
  buffer = malloc((size_t)fileLength + 1);
  longestWord = malloc((size_t)fileLength + 1); // max length in theory
  if (buffer == NULL || longestWord == NULL)
  {
    fprintf(stderr, "out of memory\n");
    goto cleanup;
  }

  bytesRead = fread(buffer, 1, (size_t)fileLength, fp);
  if (bytesRead == 0 || ferror(fp))
  {
    fprintf(stderr, "Failed reading into buffer\n");
    goto cleanup;
  }
  // Terminate after the bytes actually read: a short read would otherwise
  // leave uninitialised bytes in front of the terminator.
  buffer[bytesRead] = '\0';
  longestWord[0] = '\0'; // well-defined output when no token qualifies

  for (char* token = strtok(buffer, filter); token != NULL; token = strtok(NULL, filter))
  {
    if (isCandidate(token))
    {
      size_t length = strlen(token);
      if (length > maxLength)
      {
        strcpy(longestWord, token);
        maxLength = (long unsigned int)length;
      }
    }
  }
  printf("Longest word:'%s', len=%lu\n", longestWord, maxLength);
  status = 0;

cleanup:
  free(longestWord);
  free(buffer);
  fclose(fp);
  return status;
}

/*
 * Tells whether a word consists solely of alphabetic characters.
 *
 * word: NUL-terminated string to inspect; it is not modified.
 *
 * Returns 1 if every character of word satisfies isalpha() (an empty
 * string therefore yields 1), and 0 if any character does not or if word
 * is NULL (a diagnostic is written to stderr in the NULL case).
 */
int isCandidate(char* word)
{
  if (word == NULL)
  {
    // No errno is involved, so perror() would append an unrelated reason.
    fprintf(stderr, "invalid argument to isCandidate\n");
    return 0;
  }
  for (const char* ch = word; *ch != '\0'; ++ch)
  {
    // <ctype.h> functions require a value representable as unsigned char.
    if (!isalpha((unsigned char)*ch)) return 0;
  }
  return 1;
}
  • Related