I'm trying to make a program that splits the string based on a specific character.
Data Structure used:
typedef struct pieces {
char **members;
size_t len;
} pieces;
Function declarations:
pieces split (const char *s, const char c);
size_t charCount (const char *s, const char c);
char *slice (const char *s, int a, int b);
size_t indexOf (const char *s, const char c, size_t start);
charCount
-> No. of times the char appeared in string.
indexOf
-> Returns the index of a first occurrence of the given character inside the string, starting from the index start
; i.e. indexOf("Stack Overflow", 'O', 0) == indexOf("Stack Overflow", 'O', 3)
I've implemented slice
like this:
char *slice (const char *s, int a, int b)
{
if (a > b || a == b)
return NULL;
if (b > strlen(s)) // Only slice upto end if tried to slice out of index
b = strlen(s);
size_t len = b - a 1;
char *slice = malloc(sizeof(char) * len);
for (size_t i = a; i < b; i )
slice[i - a] = s[i];
slice[len - 1] = '\0';
return slice;
}
I'm confused on split
function:
pieces split (const char *s, const char c)
{
// Is this the right way to make room for incoming slices ?
pieces arr;
arr.len = charCount(s, c) 1;
arr.members = malloc(sizeof(char *) * arr.len);
// Should I do something like this to insert slices ?
for (size_t i = 0; i < strlen(s);)
{
int seperator_idx = indexOf(s, c, i);
char *piece = slice(s, i, seperator_idx);
arr.members[i] = piece; // Should I use strdup ??
i = seperator_idx 1;
}
// What about the last slice ?
return arr;
}
CodePudding user response:
To split a string on 1 character, I would do something like:
#include <string.h>
#include <stdlib.h>
int count_words(char const *str, char const delim)
{
int count = 0;
int i = 0;
for (; str[i]; i ) {
// the next character is the beggining of a new string
if (str[i] == delim && str[i 1] != delim)
count ;
}
// for safety
if (str[i - 1] != delim)
count ;
return count;
}
int word_length(char const *str, char const delim)
{
int length = 0;
// while we're on a valid character, increase the word length
while (str[length] && str[length] != delim)
length ;
return length;
}
pieces split(char const *str, char const delim)
{
// move the pointer until we're not on the delimiter
while (*str == delim)
str ;
// prepare the string array
pieces p;
p.len = count_words(str, delim);
p.members = malloc(p.len * sizeof(char *));
// for each string
for (int i = 0; i < p.len; i ) {
// copy the string
int length = word_length(str, delim);
p.members[i] = strndup(str, length);
// move the pointer until we're not on the delimiter
str = length;
while (*str == delim)
str ;
}
return p;
}
CodePudding user response:
There are some issues with the proposed prototypes:
pieces split(const char *s, const char c);
it is unclear if consecutive occurrences ofc
represent empty substrings or a single separator (as instrtok
). Let's assume empty substrings should be accepted.const
qualifyingc
is overkill and not meaningless in a prototypesize_t charCount(const char *s, const char c);
same remark aboutconst char c
. Let's assume the null terminator is not part of the string socharCount("abc", '\0')
is zero.char *slice(const char *s, int a, int b);
why area
andb
typedint
instead ofsize_t
?size_t indexOf(const char *s, const char c, size_t start);
what should this function return in casec
is not found the string starting from indexstart
? Let's assume the offset of the end of string should be returned, as it is more convenient to implementslice
.
With these conventions, indexOf
and charCount
can be written as:
#include <stddef.h>
size_t indexOf(const char *s, const char c, size_t start) {
while (s[start] && s[start] != c)
start ;
return start;
}
size_t charCount(const char *s, const char c) {
size_t count = 0;
while (*s) {
count = (*s == c);
}
return count;
}
Your slice
function has multiple problems:
- It should return an empty string if
a == b
, - it is confusing to name
len
something that is not the length of the substring. Either definelen
assize_t len = b - a;
or usesize_t size = b - a 1;
- it has undefined behavior if
a
is larger thanstrlen(s)
andb > a
. - you should gracefully return
NULL
in case ofmalloc()
failure
Here is a modified version:
#include <stdlib.h>
/* return an empty string if a >= b */
char *slice(const char *s, size_t a, size_t b) {
size_t len = strlen(s);
if (a > len)
a = len;
if (b < a)
b = a;
char *slice = malloc(b - a 1);
if (slice != NULL) {
for (size_t i = a; i < b; i )
slice[i - a] = s[i];
slice[b - a] = '\0';
}
return slice;
}
The split
function also has problems:
- naming the
pieces
structurearr
is confusing: it is not an array. - your allocation for
arr.members
is correct, but you should test if was allocated successful. - there is no need to
strdup()
the return value ofslice
, which was allocated withmalloc()
. - you should use 2 separate index variables for the index
i
into the arrayarr.members
and the index of the start of the substring. - the loop should be written with a test so
split("", c)
return a single empty string. - if
indexOf
returns the end of the string ifc
cannot be found, no special case is needed for the last slice.
Here is a modified version:
pieces split(const char *s, const char c) {
pieces arr;
arr.len = charCount(s, c) 1;
arr.members = malloc(sizeof(*arr.members) * arr.len);
if (arr.members != NULL) {
for (size_t i = 0, start = 0; i < arr.len; i ) {
size_t end = indexOf(s, c, start);
arr.members[i] = slice(s, start, end);
start = end 1;
if (arr.members[i] == NULL) {
/* free previous substrings and the members array */
while (i-- > 0) {
free(arr.members[i]);
}
free(arr.members);
arr.members = NULL;
break;
}
}
}
return arr;
}
Note these final remarks:
split
as coded above works too ifindexOf()
returns(size_t)(-1)
when the character is not found in the string.recomputing the length of the string in
slice()
is wasteful.slice()
should assume that the argument values are correct:0
<=a
<b
<=strlen(s)
.there is no direct way for
split
to return an error. Setting themembers
toNULL
seems a workable solution.instead of
slice()
, and assumingindexOf
returns a valid offset into the string, you could use the POSIX standard functionstrndup()
:arr.members[i] = strndup(s start, end - start);