Tokenizer function using Boost::qi and Boost::lex throws "stack overflow" exception for large inputs

Question

This function identifies words and stores them into a vector of wstrings. It worked fine for test cases like: this.is://a-test HI,hi

But for an input string of about 60 tokens or more, it throws a stack overflow exception and the program crashes. I suspect this is caused by the recursive reference to 'tokens' inside the grammar; if so, how can I replace it with a non-recursive alternative? Can someone please help? Thanks in advance. Here is my code:

Header file:

class Tokenizer
{
public:
    static vector<wstring> tvec;
    template<typename Lexer>
    struct Tokens : boost::spirit::lex::lexer<Lexer>
    {
        Tokens()
        {
            identifier = L"[a-z0-9A-Z]+";
            separator = L"[-|:|.|\t| |\n|(|)|@|=|,|_|/|;|\]+";
            this->self.add
            (identifier)
                (separator)
                ;
        }
        boost::spirit::lex::token_def<std::wstring, wchar_t> identifier, separator;
    };
    struct Echo
    {
        void echo(boost::fusion::vector<std::wstring> const& t) const
        {
            std::cout << "\ntoken";
            tvec.push_back(boost::fusion::at_c<0>(t));
        }
    };

    template <typename Iterator>
    struct Grammar : boost::spirit::qi::grammar<Iterator>
    {
        template <typename TokenDef>
        Grammar(TokenDef const& tok, Echo const& e)
            : Grammar::base_type(tokens)
        {
            using boost::spirit::_val;
            //using client::print;
            tokens
                =
                ((token >> *(tok.separator) >> (tokens))[boost::bind(&Echo::echo, e, ::_1)]
                    |
                    (token)[boost::bind(&Echo::echo, &e, ::_1)]
                    |
                    ((tok.separator) >> token)[boost::bind(&Echo::echo, e, ::_1)]
                    | (token >> (tok.separator))[boost::bind(&Echo::echo, e, ::_1)]
                    ) >> boost::spirit::eoi
                ; // Look for end of input.
            token           //separator
                            = (tok.identifier)[_val = boost::spirit::qi::_1]
                //= (tok.identifier)[boost::bind(&Echo::echo, e, ::_1)]
                ;
            //(tok.identifier)[boost::bind(&sep::echo, &s, ::_1)];
            // Look for end of input.
        }
        boost::spirit::qi::rule<Iterator> tokens;
        boost::spirit::qi::rule<Iterator, std::wstring()> token;
    };
    vector<wstring> Tokenize(wstring str);
};

.cpp file:

vector<wstring> Tokenizer::tvec;
//int Tokenizer::tvec;
vector<wstring> Tokenizer::Tokenize(wstring str)
{
    tvec.clear();
    typedef std::wstring::iterator BaseIteratorType;
    typedef boost::spirit::lex::lexertl::token<BaseIteratorType, boost::mpl::vector<std::wstring> > TokenType;
    typedef boost::spirit::lex::lexertl::lexer<TokenType> LexerType;
    typedef Tokens<LexerType>::iterator_type TokensIterator;
    typedef LexerType::iterator_type LexerIterator;
    Echo e;
    Tokens<LexerType> tokens;
    Grammar<TokensIterator> grammar(tokens, e);

    BaseIteratorType first = str.begin();
    BaseIteratorType last = str.end();

    // Have the lexer consume our string.
    LexerIterator lexFirst = tokens.begin(first, last);
    LexerIterator lexLast = tokens.end();

    // Have the parser consume the output of the lexer.
    bool r = boost::spirit::qi::parse(lexFirst, lexLast, grammar);
    std::reverse(tvec.begin(), tvec.end());
    //return Tokenizer::tvec.size();
    return tvec;

}

Tokenizer function using Boost::qi and Boost::lex throws "stack overflow" exception for large inputs

0 Answers0