I need to update a parser to support the following new features, but I have not managed to get all of them working at the same time:
- Commands must accept an indeterminate number of parameters (> 0).
- Parameters may be numbers, unquoted strings or quoted strings.
- Parameters are separated by commas.
- Within quoted strings, opening and closing parentheses must be permitted.
(These requirements are easier to understand by looking at the source code example below.)
My current code, including checks, is as follows:
Godbolt link: https://godbolt.org/z/5d6o53n9h
#include <boost/fusion/adapted/struct/adapt_struct.hpp>
#include <boost/spirit/include/qi.hpp>
namespace script {

/// One parsed script line: what kind of line it is plus its textual arguments.
struct Command {
    /// Kind of script line.
    enum Type {
        NONE,
        WRITE_LOG,
        INSERT_LABEL,
        START_PROCESS,
        END_PROCESS,
        COMMENT,
        FAIL
    };

    Type type = NONE;               ///< Defaults to NONE.
    std::vector<std::string> args;  ///< Arguments in source order, stored as text.
};

/// A whole script: one Command per line.
using Commands = std::vector<Command>;

} // namespace script
// Adapt script::Command as a Boost.Fusion sequence with members (type, args), in that order.
// NOTE(review): presumably this is what lets the Command() rules fill the struct member by member - confirm.
BOOST_FUSION_ADAPT_STRUCT(script::Command, type, args)
namespace script
{
namespace qi = boost::spirit::qi;
/// Qi grammar that turns a script (one statement per line) into Commands.
///
/// Every line is classified as exactly one of the following, tried in this order:
///   none    - blank line                          -> Command::NONE, no args
///   command - Name(arg, arg, ...)                 -> type looked up in `type`
///   comment - "//" followed by text               -> Command::COMMENT, text as the only arg
///   fail    - anything else up to the end of line -> Command::FAIL, no args
///
/// Limitation: arguments are plain character runs. Quotes are kept verbatim and
/// '(' ')' ',' are not allowed inside an argument, quoted or not.
template <typename It>
class Parser : public qi::grammar<It, Commands()>
{
private:
    qi::symbols<char, Command::Type> type; // command name -> Command::Type
    qi::rule<It, Command(), qi::blank_type> none, command, comment, fail; // by its very nature "fail" must be the last one to be checked
    qi::rule<It, Commands()> start;

public:
    Parser() : Parser::base_type(start)
    {
        using namespace qi; // NOTE: "as_string" is necessary in all args due to std::vector<std::string>

        auto empty_args = copy(attr(std::vector<std::string>{}));

        type.add
            ("WriteLog", Command::WRITE_LOG)
            ("InsertLabel", Command::INSERT_LABEL)
            ("StartProcess", Command::START_PROCESS)
            ("EndProcess", Command::END_PROCESS);

        // Only blanks before the end of the line (or of the input).
        none = omit[*blank] >> &(eol | eoi)
            >> attr(Command::NONE)
            >> empty_args; // ignore args

        // Each argument is ONE OR MORE characters (note the '+'): without it an
        // argument could only ever be a single character and every realistic
        // call such as StartProcess(17, ...) would be rejected.
        command = type >> '('
            >> as_string[lexeme[+~char_("(),\r\n")]] % ',' >> ')';

        // The rest of the line after "//" becomes the single argument.
        comment = lit("//")
            >> attr(Command::COMMENT)
            >> as_string[lexeme[*~char_("\r\n")]];

        // Catch-all: swallow the rest of the line so parsing can go on with the next one.
        fail = omit[*~char_("\r\n")]
            >> attr(Command::FAIL)
            >> empty_args; // ignore args

        start = skip(blank)[(none | command | comment | fail) % eol] >> eoi;
    }
};
/// Parses the complete stream as a script and returns the resulting commands.
/// Throws std::runtime_error("command parse error") when the grammar does not match.
Commands parse(std::istream& in)
{
    using Iterator = boost::spirit::istream_iterator;
    static const Parser<Iterator> grammar;

    in >> std::noskipws; // no white space skipping by the stream itself
    Iterator cursor(in);
    Iterator const end_of_input{};

    Commands result;
    bool const matched = qi::parse(cursor, end_of_input, grammar, result);
    if (!matched)
    {
        throw std::runtime_error("command parse error");
    }
    return result;
}
}//namespace script
// Sample script checked by main(). It must contain 9 lines (the last one being
// the empty line after the final newline) because the checks expect 9 commands
// with NONE at indexes 1, 5 and 8: the two blank lines are part of the test data.
std::stringstream ss{
R"(// just a comment

WriteLog("this is a log")
WriteLog("this is also (in another way) a log")
WriteLog("but this is just a fail)

StartProcess(17, "program.exe", True)
StartProcess(17, "this_is_a_fail.exe, True)
)"};
// Parses the sample script and prints three self-checks: the number of commands,
// their types, and their argument counts.
int main()
{
    using namespace script;
    try
    {
        auto const parsed = script::parse(ss);

        // Expected argument count per line; -1 is a convenience flag meaning
        // "don't care" (fails may have any number of arguments).
        std::array const expected_arg_counts{ 0, 0, 1, 1, -1, 0, 3, -1, 0 };
        std::array const expected_types{
            Command::COMMENT, Command::NONE, Command::WRITE_LOG,
            Command::WRITE_LOG, Command::FAIL, Command::NONE,
            Command::START_PROCESS, Command::FAIL, Command::NONE };

        bool const size_ok = (parsed.size() == 9);

        bool const types_ok = std::equal(
            parsed.begin(), parsed.end(),
            expected_types.begin(), expected_types.end(),
            [](auto& cmd, auto& wanted) { return cmd.type == wanted; });

        bool const args_ok = std::equal(
            parsed.begin(), parsed.end(),
            expected_arg_counts.begin(), expected_arg_counts.end(),
            [](auto& cmd, auto wanted) { return wanted == -1 || cmd.args.size() == wanted; });

        std::cout << std::boolalpha;
        std::cout << "size correct? " << size_ok << std::endl;
        std::cout << "types correct? " << types_ok << std::endl;
        std::cout << "arguments correct? " << args_ok << std::endl;
    }
    catch (std::exception const& e)
    {
        std::cout << e.what() << "\n";
    }
}
Any help with this will be appreciated.
CodePudding user response:
You say you want to allow parentheses within quoted strings, but you don't even support quoted strings yet!
So the problem is your argument rule, which doesn't even exist as a separate rule. It would be roughly this part:
// An argument is one or more characters (note the '+') up to the next
// parenthesis, comma or line break.
argument = +~char_("(),\r\n");
command = type >> '(' >> argument % ',' >> ')';
Here, `argument` might be declared as:
qi::rule<It, Argument()> argument;
In fact, rewriting the tests in an organized fashion, here's what we get right now:
// Expected parse result for the sample script, one entry per input line.
static const Commands expected{
    {Command::COMMENT, {"just a comment"}},
    {Command::NONE, {}},
    {Command::WRITE_LOG, {"this is a log"}},
    {Command::WRITE_LOG, {"this is also (in another way) a log"}},
    {Command::FAIL, {}},
    {Command::NONE, {}},
    {Command::START_PROCESS, {"17", "program.exe", "True"}},
    {Command::FAIL, {}},
    {Command::NONE, {}},
};
try {
    auto parsed = script::parse(ss);
    fmt::print("Parsed all correct? {} -- {} parsed (vs. {} expected)\n",
               (parsed == expected), parsed.size(), expected.size());
    // Compare element-wise over the common prefix; the index must be
    // incremented (++i), otherwise the loop never terminates.
    for (auto i = 0u; i < std::min(expected.size(), parsed.size()); ++i) {
        if (expected[i] != parsed[i]) {
            fmt::print("index #{} expected {}\n"
                       " actual: {}\n",
                       i, expected[i], parsed[i]);
        } else {
            fmt::print("index #{} CORRECT ({})\n", i, parsed[i]);
        }
    }
} catch (std::exception const& e) {
    fmt::print("Exception: {}\n", e.what());
}
Prints
Parsed all correct? false -- 9 parsed (vs. 9 expected)
index #0 CORRECT (Command(COMMENT, ["just a comment"]))
index #1 CORRECT (Command(NONE, []))
index #2 expected Command(WRITE_LOG, ["this is a log"])
actual: Command(WRITE_LOG, ["\"this is a log\""])
index #3 expected Command(WRITE_LOG, ["this is also (in another way) a log"])
actual: Command(FAIL, [])
index #4 expected Command(FAIL, [])
actual: Command(WRITE_LOG, ["\"but this is just a fail"])
index #5 CORRECT (Command(NONE, []))
index #6 expected Command(START_PROCESS, ["17", "program.exe", "True"])
actual: Command(START_PROCESS, ["17", "\"program.exe\"", "True"])
index #7 expected Command(FAIL, [])
actual: Command(START_PROCESS, ["17", "\"this_is_a_fail.exe", "True"])
index #8 CORRECT (Command(NONE, []))
As you can see, by my expectations it fails on quoted strings too. That's because quoting is a language construct: in the AST (the parsed result) you do not care how exactly a value was written in the source. E.g. `"hello\ world\041"` might be equivalent to `"hello world!"`, so both should result in the argument value `hello world!`.
So, let's do as we say:
argument = quoted_string | number | boolean | raw_string;
We can add a few rules:
// notice these are lexemes (no internal skipping):
qi::rule<It, Argument()> argument, quoted_string, number, boolean, raw_string;
And define them:
// Text between double quotes; the quotes themselves are not part of the value.
quoted_string = '"' >> *~char_('"') >> '"';
// raw[] stores the matched source text instead of the converted value.
number = raw[double_];
boolean = raw[bool_];
// Fallback: one or more characters (note the '+') that cannot end an argument.
raw_string = +~char_("(),\r\n");
argument = quoted_string | number | boolean | raw_string;
If you want to allow escaped quotes, use something like this:
quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';
Now, I'd say you probably want `Argument` to be something like `variant<double, std::string, bool>` instead of just `std::string`.
With only this change, all the problems have practically vanished: Live On Compiler Explorer:
Parsed all correct? false -- 9 parsed (vs. 9 expected)
index #0 CORRECT (Command(COMMENT, ["just a comment"]))
index #1 CORRECT (Command(NONE, []))
index #2 CORRECT (Command(WRITE_LOG, ["this is a log"]))
index #3 CORRECT (Command(WRITE_LOG, ["this is also (in another way) a log"]))
index #4 CORRECT (Command(FAIL, []))
index #5 CORRECT (Command(NONE, []))
index #6 CORRECT (Command(START_PROCESS, ["17", "program.exe", "True"]))
index #7 expected Command(FAIL, [])
actual: Command(START_PROCESS, ["17", "this_is_a_fail.exe, True)\n\"this_is_a_fail.exe", "True"])
index #8 CORRECT (Command(NONE, []))
Now, index #7 looks very funky, but it's actually a well-known phenomenon in Spirit¹. Enabling BOOST_SPIRIT_DEBUG demonstrates it:
<argument>
<try>"this_is_a_fail.exe,</try>
<quoted_string>
<try>"this_is_a_fail.exe,</try>
<fail/>
</quoted_string>
<number>
<try>"this_is_a_fail.exe,</try>
<fail/>
</number>
<boolean>
<try>"this_is_a_fail.exe,</try>
<fail/>
</boolean>
<raw_string>
<try>"this_is_a_fail.exe,</try>
<success>, True)</success>
<attributes>[[t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e, ,, , T, r, u, e, ), ", t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e]]</attributes>
</raw_string>
<success>, True)</success>
<attributes>[[t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e, ,, , T, r, u, e, ), ", t, h, i, s, _, i, s, _, a, _, f, a, i, l, ., e, x, e]]</attributes>
</argument>
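So the string gets accepted as a raw string, even though it started with `"`. The duplicated text at the front of the attribute is what `quoted_string` had already appended before it failed on the missing closing quote: Qi does not roll back a container attribute when an alternative backtracks. Accepting the raw string is easily fixed, but to get rid of the duplication we don't even need to do that. We could just apply `qi::hold`: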
argument = qi::hold[quoted_string] | number | boolean | raw_string;
Result:
actual: Command(START_PROCESS, ["17", "\"this_is_a_fail.exe", "True"])
However, if you expect it to fail, fix that other problem:
raw_string = +~char_("\"(),\r\n"); // one or more characters; note the \" (no quotes allowed)
Note: On the off-chance that you really only require it not to start with a quote:
raw_string = !lit('"') >> +~char_("(),\r\n"); // must not START with a quote; one or more characters
I guess by now you see the problem with a "loose rule" like that, so I don't recommend it.
You could express the requirement another way though, saying "if an argument starts with `'"'` then it MUST be a `quoted_string`". Use an expectation point there:
quoted_string = '"' > *('\\' >> char_ | ~char_('"')) > '"';
This has the effect that failure to parse a complete `quoted_string` will throw an `expectation_failed` exception.
Summary / Listing
This is what we end up with:
//#define BOOST_SPIRIT_DEBUG
#include <boost/fusion/adapted/struct/adapt_struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <fmt/ranges.h>
namespace script {
using Argument = std::string;
using Arguments = std::vector<Argument>;
/// One parsed script line: the kind of line plus its arguments.
struct Command {
    /// Kind of script line.
    enum Type { NONE, WRITE_LOG, INSERT_LABEL, START_PROCESS, END_PROCESS, COMMENT, FAIL };

    Type      type = NONE; ///< Defaults to NONE.
    Arguments args;        ///< Arguments in source order.

    /// Member-wise comparison (type first, then args).
    auto operator<=>(const Command&) const = default;
};
using Commands = std::vector<Command>;
} // namespace script
// Adapt script::Command as a Boost.Fusion sequence with members (type, args), in that order.
// NOTE(review): presumably this is what lets the Command() rules fill the struct member by member - confirm.
BOOST_FUSION_ADAPT_STRUCT(script::Command, type, args)
namespace script {
namespace qi = boost::spirit::qi;
/// Qi grammar that turns a script (one statement per line) into Commands.
///
/// Every line is classified as exactly one of the following, tried in this order:
///   none    - blank line                          -> Command{NONE, {}}
///   command - Name(arg, arg, ...)                 -> type looked up in `type`
///   comment - "//" followed by text               -> Command::COMMENT, text as the only arg
///   fail    - anything else up to the end of line -> Command{FAIL, {}}
///
/// An argument is a quoted string (quotes stripped, backslash escapes the next
/// character), a number, a boolean, or a raw run of characters.
template <typename It> class Parser : public qi::grammar<It, Commands()> {
  public:
    Parser() : Parser::base_type(start) {
        using namespace qi;

        type.add //
            ("WriteLog", Command::WRITE_LOG) //
            ("InsertLabel", Command::INSERT_LABEL) //
            ("StartProcess", Command::START_PROCESS) //
            ("EndProcess", Command::END_PROCESS);

        // Only blanks before the end of the line (or of the input).
        none = omit[*blank] >> &(eol | eoi) //
            >> attr(Command{Command::NONE, {}});

        quoted_string = '"' >> *('\\' >> char_ | ~char_('"')) >> '"';
        // raw[] stores the matched source text instead of the converted value.
        number  = raw[double_];
        boolean = raw[bool_];
        // One or more characters (note the '+'); a quote is not allowed here, so
        // an unterminated quoted string cannot sneak in as a raw string.
        raw_string = +~char_("\"(),\r\n");

        // hold[] discards what quoted_string already appended to the attribute
        // when it fails half-way, so the next alternative starts clean.
        argument = qi::hold[quoted_string] | number | boolean | raw_string;
        command  = type >> '(' >> argument % ',' >> ')';

        // as_string is needed to store the comment text as a single argument.
        comment = "//" //
            >> attr(Command::COMMENT) //
            >> as_string[lexeme[*~char_("\r\n")]];

        // Catch-all: swallow the rest of the line so parsing can go on with the next one.
        fail = omit[*~char_("\r\n")] >> attr(Command{Command::FAIL, {}});

        line  = none | command | comment | fail; // keep fail last
        start = skip(blank)[line % eol] >> eoi;

        BOOST_SPIRIT_DEBUG_NODES((start)(line)(fail)(comment)(command)(
            argument)(none)(quoted_string)(raw_string)(boolean)(number))
    }

  private:
    qi::symbols<char, Command::Type> type; // command name -> Command::Type
    qi::rule<It, Command(), qi::blank_type> line, none, command, comment, fail;
    // notice these are lexemes (no internal skipping):
    qi::rule<It, Argument()> argument, quoted_string, number, boolean, raw_string;
    qi::rule<It, Commands()> start;
};
/// Parses the complete stream as a script and returns the resulting commands.
/// Throws std::runtime_error("command parse error") when the grammar does not match.
Commands parse(std::istream& in)
{
    using Iterator = boost::spirit::istream_iterator;
    static const Parser<Iterator> grammar;

    in >> std::noskipws; // the stream itself must not drop white space
    Iterator cursor{in};
    Iterator const end_of_input{};

    Commands result;
    if (!qi::parse(cursor, end_of_input, grammar, result)) {
        throw std::runtime_error("command parse error");
    }
    return result;
}
struct Formatter {
static constexpr auto name(script::Command::Type type) {
return std::array{"NONE", "WRITE_LOG", "INSERT_LABEL",
"START_PROCESS", "END_PROCESS", "COMMENT",
"FAIL"}
.at(static_cast<int>(type));
}
auto parse(auto& ctx) const { return ctx.begin(); }
auto format(script::Command const& cmd, auto& ctx) const {
return format_to(ctx.out(), "Command({}, {})", name(cmd.type), cmd.args);
}
};
} // namespace script
// Specialize fmt::formatter for script::Command by inheriting script::Formatter's parse/format.
template <> struct fmt::formatter<script::Command> : script::Formatter {};
// Sample script checked by main(). It must contain 9 lines (the last one being
// the empty line after the final newline) because `expected` lists 9 commands
// with NONE at indexes 1, 5 and 8: the two blank lines are part of the test data.
std::stringstream ss{
R"(// just a comment

WriteLog("this is a log")
WriteLog("this is also (in another way) a log")
WriteLog("but this is just a fail)

StartProcess(17, "program.exe", True)
StartProcess(17, "this_is_a_fail.exe, True)
)"};
// Parses the sample script and reports, entry by entry, whether the result
// matches the expected commands.
int main() {
    using namespace script;

    // Expected parse result, one entry per input line.
    static const Commands expected{
        {Command::COMMENT, {"just a comment"}},
        {Command::NONE, {}},
        {Command::WRITE_LOG, {"this is a log"}},
        {Command::WRITE_LOG, {"this is also (in another way) a log"}},
        {Command::FAIL, {}},
        {Command::NONE, {}},
        {Command::START_PROCESS, {"17", "program.exe", "True"}},
        {Command::FAIL, {}},
        {Command::NONE, {}},
    };

    try {
        auto parsed = script::parse(ss);
        fmt::print("Parsed all correct? {} -- {} parsed (vs. {} expected)\n",
                   (parsed == expected), parsed.size(), expected.size());

        // Compare element-wise over the common prefix; the index must be
        // incremented (++i), otherwise the loop never terminates.
        auto const common = std::min(expected.size(), parsed.size());
        for (auto i = 0u; i < common; ++i) {
            if (expected[i] != parsed[i]) {
                fmt::print("index #{} expected {}\n"
                           " actual: {}\n",
                           i, expected[i], parsed[i]);
            } else {
                fmt::print("index #{} CORRECT ({})\n", i, parsed[i]);
            }
        }
    } catch (std::exception const& e) {
        fmt::print("Exception: {}\n", e.what());
    }
}
Prints
Parsed all correct? true -- 9 parsed (vs. 9 expected)
index #0 CORRECT (Command(COMMENT, ["just a comment"]))
index #1 CORRECT (Command(NONE, []))
index #2 CORRECT (Command(WRITE_LOG, ["this is a log"]))
index #3 CORRECT (Command(WRITE_LOG, ["this is also (in another way) a log"]))
index #4 CORRECT (Command(FAIL, []))
index #5 CORRECT (Command(NONE, []))
index #6 CORRECT (Command(START_PROCESS, ["17", "program.exe", "True"]))
index #7 CORRECT (Command(FAIL, []))
index #8 CORRECT (Command(NONE, []))
¹ see e.g. boost::spirit alternative parsers return duplicates (which links to three more of the same kind)