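I'm surprised this question hasn't been asked already, given how common the task must be: splitting a string on whitespace, except where the whitespace is escaped with a backslash. Is it possible to do this without writing your own parser? Two or more whitespace characters in a row must be skipped. If it isn't possible, here's my solution: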
// Attempts to split `s` into tokens at spaces, tabs and newlines, with a
// separate case for backslashes. Returns the collected tokens; an empty `s`
// yields an empty vector because the loop body is never entered.
// NOTE(review): as written this is the non-working version the question asks
// about; see the notes below before reusing it.
vector<string> split_with_backslash(const string &s)
{
vector<string> ret;
const char *ps = s.c_str();
// NOTE(review): nothing in the loop body moves `ps` forward, so for any
// non-empty input the same character is examined again and again. Presumably
// increment operators were lost when the snippet was copied - confirm.
while (*ps ) {
// A new, empty stream is constructed on every iteration, so characters
// written during earlier iterations are no longer available here.
std::stringstream ss;
switch (*ps) {
default:
// Ordinary character: append it to the current buffer.
ss << *ps;
break;
case ' ': case '\t': case '\n':
// Whitespace: emit the buffer as a token, then reset the stream contents
// and its state flags.
ret.push_back(ss.str());
ss.str(string());
ss.clear();
break;
case '\\':
// NOTE(review): in this case `*ps` is the backslash itself, so the condition
// below is always true and `ps` is moved back by one. Presumably the first
// operand was meant to look at the following character - confirm.
if (!(* ps == ' ' || *ps == '\t' || *ps == '\n'))
--ps;
break;
}
}
return ret;
}
I'm not good with C++ iterators, so I've used c_str().
EDIT: Adding the wordexp example for @MarekR. It is relevant to the question because wordexp nicely handles quotation marks and backslashes, but not the special symbols below.
Here I'm preparing the pipeline for execution. This way I get rid of the '|' sign, which wordexp doesn't consume, but that still leaves '<', '>' and '&' (I'm not considering '&', for simplicity).
vector<string> subcommands;
boost::split(subcommands, cpp_buf, boost::is_any_of("|"));
wordexp_t res;
switch (wordexp(subcommands.back().c_str(), &res, 0)) {
case 0:
break;
case WRDE_NOSPACE:
wordfree(&res);
default:
exit(EXIT_FAILURE);
}
CodePudding user response:
The code you have presented is very close to doing what you want; just one subtle change is required: move the `std::stringstream ss;` declaration to before (i.e. outside of) the `while` loop. As it stands, a new (empty) `stringstream` object is created on each and every iteration of that loop.
However, there is a (possibly) easier way to achieve the same, by using an 'escape flag' to signal that any space is preceded by a backslash, and only pushing your substring to the result vector if that flag is false:
/// Splits `s` into tokens separated by unescaped whitespace.
///
/// A space, tab or newline ends the current token unless it is immediately
/// preceded by a backslash, in which case it becomes part of the token. Runs
/// of unescaped whitespace (including leading and trailing whitespace) never
/// produce empty tokens. The backslash itself is not copied to the output.
///
/// @param s  the text to split
/// @return   the tokens in order of appearance; empty if `s` holds no token
std::vector<std::string> split_with_backslash(const std::string& s)
{
    std::vector<std::string> ret;
    std::string token;    // characters of the token currently being built
    bool escape = false;  // true when the previous character was a backslash

    for (const char ch : s) {
        switch (ch) {
        case ' ': case '\t': case '\n':
            if (escape) {
                // Escaped whitespace belongs to the token.
                token += ch;
                escape = false;
            }
            else if (!token.empty()) {
                // Only a non-empty buffer is a token; this is what makes
                // consecutive separators collapse instead of yielding "".
                ret.push_back(token);
                token.clear();
            }
            break;
        case '\\':
            escape = true;
            break;
        // The following alternative "case '\\'" block will allow input of
        // escaped backslash characters (if that's required) ...
        /* case '\\':
            if (escape) {
                token += ch;
                escape = false;
            }
            else {
                escape = true;
            }
            break; */
        default:
            token += ch;
            escape = false;
            break;
        }
    }
    if (!token.empty()) ret.push_back(token); // Don't forget final sub-string!
    return ret;
}
CodePudding user response:
Here's a version that lets you avoid allocation (unless you use `split_all`), handles escaped backslashes as well, and includes test cases:
#include <cctype>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
// Needed for C++17 compatibility.
// In C++20, just use the string_view constructor.
/// Builds a std::string_view covering the iterator range [b, e).
///
/// The view starts at the address of `*b` and spans `e - b` characters, so
/// the iterators are expected to refer to contiguous `char` storage
/// (presumably std::string / std::string_view iterators - confirm at call sites).
///
/// @param b  iterator to the first character of the range
/// @param e  iterator one past the last character of the range
/// @return   a non-owning view of the range; a default-constructed (empty)
///           view when the range is empty
template<class It>
std::string_view string_view_from_it(It b, It e)
{
    // An empty range may begin at the past-the-end iterator, which must not
    // be dereferenced, so it is handled without touching `*b`.
    if (b == e)
        return std::string_view();
    return std::string_view(&*b, static_cast<std::string_view::size_type>(e - b));
}
/// Returns `s` with all leading whitespace removed.
///
/// Whitespace is whatever std::isspace reports for the current C locale.
///
/// @param s  the view to trim
/// @return   the suffix of `s` starting at its first non-whitespace
///           character; empty if `s` is empty or consists only of whitespace
std::string_view skip_space(std::string_view s)
{
    std::string_view::size_type count = 0;
    // The cast keeps negative `char` values out of std::isspace, whose
    // argument must be representable as unsigned char.
    while (count != s.size() && std::isspace(static_cast<unsigned char>(s[count])))
        ++count;
    s.remove_prefix(count);
    return s;
}
/// Splits off the first token of `s`.
///
/// The token runs up to (not including) the first whitespace character that
/// is not preceded by a backslash. A backslash makes the character after it
/// part of the token, whatever that character is. Backslashes are left in
/// the returned token; nothing is copied or unescaped.
///
/// @param s  the view to split; leading whitespace is not skipped here
/// @return   {token, remainder}, where remainder starts at the terminating
///           whitespace character, or is empty if the token reaches the end
std::pair<std::string_view, std::string_view> split_one(std::string_view s)
{
    bool escaped = false;
    std::string_view::size_type pos = 0;
    for (; pos != s.size(); ++pos)
    {
        if (escaped)
        {
            // The character after a backslash is consumed unconditionally.
            escaped = false;
            continue;
        }
        const char ch = s[pos];
        if (ch == '\\')
            escaped = true;
        // The cast keeps negative `char` values out of std::isspace.
        else if (std::isspace(static_cast<unsigned char>(ch)))
            break;
    }
    return {s.substr(0, pos), s.substr(pos)};
}
/// Splits `s` into all of its whitespace-separated tokens.
///
/// Leading whitespace is dropped with skip_space and each token is cut off
/// with split_one until nothing but whitespace is left. The returned views
/// are non-owning: they refer to the characters `s` refers to.
///
/// @param s  the text to split
/// @return   the tokens in order of appearance
std::vector<std::string_view> split_all(std::string_view s)
{
    std::vector<std::string_view> tokens;
    for (std::string_view rest = skip_space(s); !rest.empty(); rest = skip_space(rest))
    {
        const auto parts = split_one(rest);
        tokens.push_back(parts.first);
        rest = parts.second;
    }
    return tokens;
}
#include <iostream>
/// Demo driver: runs split_all over a few sample inputs and prints each
/// input together with the tokens found in it.
int main()
{
    // Samples: empty input, whitespace only, padded words, an escaped space,
    // and a mix of escaped spaces and backslashes ending in a lone backslash.
    const std::string_view samples[] = {
        "",
        " ",
        " a b ",
        "hello\\ world",
        " \\ \\\\ \\ \\",
    };
    for (const std::string_view input : samples)
    {
        std::cout << "split('" << input << "') = [";
        // Empty before the first token, ", " before every later one.
        const char* separator = "";
        for (const std::string_view token : split_all(input))
        {
            std::cout << separator << "'" << token << "'";
            separator = ", ";
        }
        std::cout << "]" << std::endl;
    }
}
That said, I would prefer a real lexer at this point, and actually handle the backslashes (including "backslash at end of input").
I also prefer to forbid all control characters other than newline - notably CR and TAB - or at least normalize them in an earlier phase.