Home > Back-end >  How to use a pointer as token attribute in Boost::Spirit::Lex?
How to use a pointer as token attribute in Boost::Spirit::Lex?

Time:06-05

I write a minimum example to demonstrate this problem. It parses nested list of numbers like (1 2 3 (4 5) (6 (7 (8)))). I use spirit::lex to parse number and spirit::qi to parse list, so I code like this:

using TokenTypes = boost::mpl::vector<Object*>;
using Iterator = std::string::iterator;

class Lexer : public lex::lexer<actor_lexer<token<Iterator, TokenTypes>>>
{
public:
  lex::token_def<> spaces;        // used to skip spaces
  lex::token_def<Object*> number; // create Number Object on heap and use the pointer as attribute

public:
  Lexer();
};

template<typename... Ts>
using Rule = qi::rule<Lexer::iterator_type, Ts...>;

class Parser : public qi::grammar<Lexer::iterator_type, Object*>
{
public:
  Lexer lexer;

  Rule<Object*> list;
  Rule<Object*> elem;

public:
  Parser();
};

But in Parser::Parser(), I can't use Lexer::number in gramma expression:

Parser::Parser()
  : base_type(elem)
{
  // list = ...

  elem %= list | lexer.number; // fail to compile!
}

Clang error message (brief):

/usr/include/boost/spirit/home/qi/detail/assign_to.hpp:42:36: error: type 'Object *' cannot be used prior to '::' because it has no members
          : is_iter_range<typename C::value_type> {};
                                   ^
...
...
...

I can't understand why this is wrong considering it used to work fine when I use other scalar types like int and double as token attribute.

So, how to use pointer type as token attribute?

complete example

#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/qi.hpp>
#include <iostream>
#include <string>
#include <vector>

class Object
{
public:
  virtual ~Object() = default;

public:
  virtual void print(std::ostream& out) = 0;
};

class Number : public Object
{
public:
  int64_t _val;

public:
  virtual void print(std::ostream& out) override { out << _val; }
};

class List : public Object
{
public:
  std::vector<Object*> _objs;

public:
  virtual void print(std::ostream& out) override
  {
    out << '(';
    for (auto&& i : _objs) {
      i->print(out);
      out << ' ';
    }
    out << ')';
  }
};

namespace qi = boost::spirit::qi;
namespace fu = boost::fusion;
namespace lex = boost::spirit::lex;

using lex::lexertl::actor_lexer;
using lex::lexertl::token;

using TokenTypes = boost::mpl::vector<Object*>;
using Iterator = std::string::iterator;

class Lexer : public lex::lexer<actor_lexer<token<Iterator, TokenTypes>>>
{
public:
  lex::token_def<> spaces;
  lex::token_def<Object*> number;

public:
  Lexer();
};

template<typename... Ts>
using Rule = qi::rule<Lexer::iterator_type, Ts...>;

class Parser : public qi::grammar<Lexer::iterator_type, Object*>
{
public:
  Lexer lexer;

  Rule<Object*, qi::locals<List*>> list;
  Rule<Object*> elem;

public:
  Parser();
};

Lexer::Lexer()
{
  self  = '(';
  self  = ')';

  spaces = R"(\s )";
  self  =
    spaces[([](auto& start, auto& end, auto& matched, auto& id, auto& ctx) {
      matched = lex::pass_flags::pass_ignore;
    })];

  number = R"(\d )";
  self  =
    number[([](auto& start, auto& end, auto& matched, auto& id, auto& ctx) {
      auto val = new Number();
      auto iter = start;
      qi::parse(iter, end, qi::long_long, val->_val);
      ctx.set_value(val);
    })];
}

Parser::Parser()
  : base_type(elem)
{
  list = (         //
    qi::lit('(')[( //
      [](auto& attr, auto& ctx, bool& pass) {
        fu::at_c<0>(ctx.locals) = new List();
      })]       //
    >> *(elem[( //
         [](auto& attr, auto& ctx, bool& pass) {
           List* list = fu::at_c<0>(ctx.locals);
           list->_objs.push_back(attr);
         })]) //
    >> ')'    //
    )[(       //
    [](auto& attr, auto& ctx, bool& pass) {
      List* list = fu::at_c<0>(ctx.locals);
      fu::at_c<0>(ctx.attributes) = list;
    })];

  elem %= list | lexer.number;
}

int
main(int argc, char* argv[])
{
  Parser parser;

  std::string line;
  while (std::getline(std::cin, line)) {
    auto begin = line.begin();
    Object* obj;
    lex::tokenize_and_parse(begin, line.end(), parser.lexer, parser, obj);
    obj->print(std::cout);
    std::cout << std::endl;
  }
}

CodePudding user response:

Okay. Don't take this badly. Reading your sample (kudos for including a self-contained example! This saves a ton of time) I can't help but feeling that you've somehow stumbled on the worst possible cross-section of anti-patterns in Spirit Qi.

  1. You're using a polymorphic AST:

  2. You're using semantic actions. As a rule this already misses the sweet spot for embedded grammars, which is why I linked 126 answers to Boost Spirit: "Semantic actions are evil"?.

    However, that's even just talking about semantic actions for Qi. You also use them for Lex:

    self  =
        spaces[([](auto& start, auto& end, auto& matched, auto& id,
                   auto& ctx) { matched = lex::pass_flags::pass_ignore; })];
    

    Which is then further complicated by not using Phoenix, e.g.:

    self  = spaces[lex::_pass = lex::pass_flags::pass_ignore];
    

    Which does exactly the same but with about 870% less noise and equal amounts of evil magic.

  3. The other semantic action tops it all:

    self  = number[(
        [](auto& start, auto& end, auto& matched, auto& id, auto& ctx) {
            auto val = new Number();
            auto iter = start;
            qi::parse(iter, end, qi::long_long, val->_val);
            ctx.set_value(val);
        })];
    

    Besides having all the problems already listed, it literally makes a fractal out of things by calling Qi from a Lex semantic action. Of course, this wants to be:

    self  = number[lex::_val = phx::new_<Number>(/*magic*/)];
    

    But that magic doesn't exist. My gut feeling is that your issue that the Lexer shouldn't be concerned with AST types at all. At this point I feel that the lexer could/should be something like

    using TokenTypes = boost::mpl::vector<uint64_t>;
    using Iterator = std::string::const_iterator; // NOTE const_
    
    struct Lexer : lex::lexer<actor_lexer<token<Iterator, TokenTypes>>> {
        lex::token_def<>         spaces;
        lex::token_def<uint64_t> number;
    
        Lexer() : spaces{R"(\s )"}, number{R"(\d )"} {
            self  = '(';
            self  = ')';
            self  = spaces[lex::_pass = lex::pass_flags::pass_ignore];
            self  = number;
        }
    };
    

    That is, if it should exist at all.

That's the structural assessment. Let me apply simplifications to the Qi grammar along the same lines, just so we can reason about the code:

struct Parser : qi::grammar<Lexer::iterator_type, Object*()> {
    Parser() : base_type(elem) {
        using namespace qi::labels;

        static constexpr qi::_a_type _list{};
        const auto _objs = phx::bind(&List::_objs, _list);

        list = (                               //
            '(' >>                             //
            *(elem[phx::push_back(_objs, _1)]) //
            >> ')'                             //
            )[_val = phx::new_<List>(_list)];

        elem                  //
            = list[_val = _1] //
            | lexer.number[_val = phx::new_<Number>(_1)];
    }

    Lexer lexer; // TODO FIXME excess scope

  private:
    using It = Lexer::iterator_type;
    qi::rule<It, Object*(), qi::locals<List>> list;
    qi::rule<It, Object*()>                    elem;
};

Note how I made the local List instead of List*, to just slightly reduce the chance of memory leaks. I guess for efficiency you could try to make Phoenix do move-semantics for you:

 [_val = phx::new_<List>(phx::static_cast_<List&&>(_list))];

But at that point I wouldn't trust all the expression templates to do what you want and go to the more elaborate (even assuming c 17):

phx::function move_new = [](List& l) { return new List(std::move(l)); };

list = (                               //
    '(' >>                             //
    *(elem[phx::push_back(_objs, _1)]) //
    >> ')'                             //
    )[_val = move_new(_list)];

Now we arrive at a workable demo:

Live On Coliru

int main() {
    Parser parser;

    for (std::string const line : {
             "",
             "42",
             "()",
             "(1 2 3)",
             "(1 (44 55 () 66) 3)",
         }) {
        auto    begin = line.begin();
        Object* obj = nullptr;
        if (lex::tokenize_and_parse(begin, line.end(), parser.lexer, parser,
                                    obj)) {
            obj->print(std::cout << std::quoted(line) << " -> ");
            delete obj;
        } else {
            std::cout << std::quoted(line) << " -> FAILED";
        }
        std::cout << std::endl;
    }
}

Printing

"" -> FAILED
"42" -> 42
"()" -> ()
"(1 2 3)" -> (1 2 3 )
"(1 (44 55 () 66) 3)" -> (1 (44 55 () 66 ) 3 )

Note that this simple test program ALREADY leaks 11 objects, for a total of 224 bytes. That's not even complicating things with error-handling or backtracking rules.

That's craziness. You could of course fix it with smart pointers, but that just further complicates everything while making sure performance will be very poor.

Further Simplifications

I would stop using Lex and dynamic polymorphism:

No More Lex:

The only "value" Lex is adding here is skipping spaces. Qi is very capable (see e.g. Boost spirit skipper issues for variations on that theme), so we'll use skip(space)[] instead:

Live On Coliru

#include <boost/phoenix.hpp>
#include <boost/spirit/include/qi.hpp>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

struct Object {
    virtual ~Object() = default;
    virtual void print(std::ostream& out) const = 0;

    friend std::ostream& operator<<(std::ostream& os, Object const& o) { return o.print(os), os; }
};

struct Number : Object {
    Number(uint64_t v = 0) : _val(v) {}
    int64_t      _val;
    virtual void print(std::ostream& out) const override { out << _val; }
};

struct List : Object {
    std::vector<Object*> _objs;

    virtual void print(std::ostream& out) const override {
        out << '(';
        for (auto&& el : _objs)
            out << ' ' << *el;
        out << ')';
    }
};

namespace qi  = boost::spirit::qi;
namespace phx = boost::phoenix;

template <typename It>
struct Parser : qi::grammar<It, Object*()> {
    Parser() : Parser::base_type(start) {
        using namespace qi::labels;

        static constexpr qi::_a_type _list{};
        const auto _objs = phx::bind(&List::_objs, _list);

        phx::function move_new = [](List& l) { return new List(std::move(l)); };

        list = (                               //
            '(' >>                             //
            *(elem[phx::push_back(_objs, _1)]) //
            >> ')'                             //
            )[_val = move_new(_list)];

        elem                                          //
            = list[_val = _1]                         //
            | qi::uint_[_val = phx::new_<Number>(_1)] //
            ;

        start = qi::skip(qi::space)[elem];
    }

  private:
    qi::rule<It, Object*(), qi::space_type, qi::locals<List>> list;
    qi::rule<It, Object*(), qi::space_type> elem;

    // lexemes
    qi::rule<It, Object*()> start;
};

int main() {
    Parser<std::string::const_iterator> const parser;

    for (std::string const line : {
             "",
             "42",
             "()",
             "(1 2 3)",
             "(1 (44 55 () 66) 3)",
         }) {
        Object* obj = nullptr;
        if (parse(line.begin(), line.end(), parser >> qi::eoi, obj)) {
            std::cout << std::quoted(line) << " -> " << *obj;
        } else {
            std::cout << std::quoted(line) << " -> FAILED";
        }
        delete obj;
        std::cout << std::endl;
    }
}

Still leaking like C went out of fashion, but at least doing so in 20 fewer LoC and half the compile time.

Static Polymorphism

Hiding all the raw pointer stuff (or avoiding it completely, depending on the exact AST requirements):

using Number = uint64_t;
using Object = boost::make_recursive_variant< //
    Number,                                   //
    std::vector<boost::recursive_variant_>>::type;

using List   = std::vector<Object>;

For ease of supplying operator<< I moved them into an AST namespace below.

The parser goes down to:

template <typename It> struct Parser : qi::grammar<It, AST::Object()> {
    Parser() : Parser::base_type(start) {
        list = '(' >> *elem >> ')';
        elem = list | qi::uint_;

        start = qi::skip(qi::space)[elem];
    }
  private:
    qi::rule<It, AST::List(), qi::space_type> list;
    qi::rule<It, AST::Object(), qi::space_type> elem;
    qi::rule<It, AST::Object()> start;
};

No more lex, no more phoenix, no more leaks, no more manual semantic actions. Just, expressive code.

Live Demo

Live On Coliru

#include <boost/spirit/include/qi.hpp>
#include <iomanip>
#include <iostream>

namespace AST {
    struct Number {
        uint64_t v;
        Number(uint64_t v = 0) : v(v){};
    };

    using Object = boost::make_recursive_variant< //
        Number,                                   //
        std::vector<boost::recursive_variant_>>::type;

    using List = std::vector<Object>;

    std::ostream& operator<<(std::ostream& os, Number const& n) {
        return os << n.v;
    }
    std::ostream& operator<<(std::ostream& os, List const& l) {
        os << '(';
        for (auto& el : l)
            os << ' ' << el;
        return os << ')';
    }
} // namespace AST

namespace qi = boost::spirit::qi;

template <typename It> struct Parser : qi::grammar<It, AST::Object()> {
    Parser() : Parser::base_type(start) {
        list = '(' >> *elem >> ')';
        elem = list | qi::uint_;

        start = qi::skip(qi::space)[elem];
    }
  private:
    qi::rule<It, AST::List(), qi::space_type> list;
    qi::rule<It, AST::Object(), qi::space_type> elem;
    qi::rule<It, AST::Object()> start;
};

int main() {
    Parser<std::string::const_iterator> const parser;

    for (std::string const line : {
             "",
             "42",
             "()",
             "(1 2 3)",
             "(1 (44 55 () 66) 3)",
         }) {
        AST::Object obj;
        if (parse(line.begin(), line.end(), parser >> qi::eoi, obj))
            std::cout << std::quoted(line) << " -> " << obj << "\n";
        else
            std::cout << std::quoted(line) << " -> FAILED\n";
    }
}

Prints

"" -> FAILED
"42" -> 42
"()" -> ()
"(1 2 3)" -> ( 1 2 3)
"(1 (44 55 () 66) 3)" -> ( 1 ( 44 55 () 66) 3)

But this time, without leaking memory. And also, it now compiles fast enough that Compiler Explorer can also handle it.

CodePudding user response:

Found a walkaround: use std::size_t and reinterpret_cast to replace pointer types:

#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/qi.hpp>
#include <iostream>
#include <string>
#include <vector>

class Object
{
public:
  virtual ~Object() = default;

public:
  virtual void print(std::ostream& out) = 0;
};

class Number : public Object
{
public:
  int64_t _val;

public:
  virtual void print(std::ostream& out) override { out << _val; }
};

class List : public Object
{
public:
  std::vector<Object*> _objs;

public:
  virtual void print(std::ostream& out) override
  {
    out << '(';
    for (auto&& i : _objs) {
      i->print(out);
      out << ' ';
    }
    out << ')';
  }
};

namespace qi = boost::spirit::qi;
namespace fu = boost::fusion;
namespace lex = boost::spirit::lex;

using lex::lexertl::actor_lexer;
using lex::lexertl::token;

using TokenTypes = boost::mpl::vector<std::size_t>;
using Iterator = std::string::iterator;

class Lexer : public lex::lexer<actor_lexer<token<Iterator, TokenTypes>>>
{
public:
  lex::token_def<> spaces;
  lex::token_def<std::size_t> number; // use std::size_t instead

public:
  Lexer();
};

template<typename... Ts>
using Rule = qi::rule<Lexer::iterator_type, Ts...>;

class Parser : public qi::grammar<Lexer::iterator_type, Object*>
{
public:
  Lexer lexer;

  Rule<Object*, qi::locals<List*>> list;
  Rule<Object*> elem;

public:
  Parser();
};

Lexer::Lexer()
{
  self  = '(';
  self  = ')';

  spaces = R"(\s )";
  self  =
    spaces[([](auto& start, auto& end, auto& matched, auto& id, auto& ctx) {
      matched = lex::pass_flags::pass_ignore;
    })];

  number = R"(\d )";
  self  =
    number[([](auto& start, auto& end, auto& matched, auto& id, auto& ctx) {
      auto val = new Number();
      auto iter = start;
      qi::parse(iter, end, qi::long_long, val->_val);
      ctx.set_value(reinterpret_cast<std::size_t>(val)); // cast here
    })];
}

Parser::Parser()
  : base_type(elem)
{
  list = (         //
    qi::lit('(')[( //
      [](auto& attr, auto& ctx, bool& pass) {
        fu::at_c<0>(ctx.locals) = new List();
      })]       //
    >> *(elem[( //
         [](auto& attr, auto& ctx, bool& pass) {
           List* list = fu::at_c<0>(ctx.locals);
           list->_objs.push_back(attr);
         })]) //
    >> ')'    //
    )[(       //
    [](auto& attr, auto& ctx, bool& pass) {
      List* list = fu::at_c<0>(ctx.locals);
      fu::at_c<0>(ctx.attributes) = list;
    })];

  elem %= list | qi::omit[lexer.number[([](auto& attr, auto& ctx, bool& pass) {
            fu::at_c<0>(ctx.attributes) = reinterpret_cast<Object*>(attr); // cast here
          })]];
}

int
main(int argc, char* argv[])
{
  Parser parser;

  std::string line;
  while (std::getline(std::cin, line)) {
    auto begin = line.begin();
    Object* obj;
    lex::tokenize_and_parse(begin, line.end(), parser.lexer, parser, obj);
    obj->print(std::cout);
    std::cout << std::endl;
  }
}

I think this is really ugly. Anyone has a better solution???

  • Related