1

Consider the following sample text line:

"Hello : World 2020 :tag1:tag2:tag3"

I want to design a spirit X3 parser that can extract:

  1. Content := "Hello : World 2020 "
  2. Tags := { tag1,tag2,tag3 }

The problem: Content is defined as the character sequence (excluding eol) that is left over after matching the tags, and I am not sure how to write a rule that synthesizes two attributes: one representing the extracted tags and another representing the leftover characters (the content).

So far I've written the rule for extracting the tags:

       ...
    namespace ast
    {
        // What one parsed sample line is split into.
        struct sample
        {
            std::u32string              content;  // the text left over once the tags are matched
            std::vector<std::u32string> tags;     // the tags extracted from the line
        };

        // Boost.Fusion adaptation of `sample` goes here (elided).
    }

   namespace grammar {

       using x3 = boost::spirit::x3;
       using x3::unicode::lit;
       using x3::unicode::char_;
       using x3::unicode::alnum;

       auto const tag
       = x3::rule<class tag_class, std::u32string> {"tag"}
            %=
            lit(U":")
            >>
            +(alnum | lit(U"_") | lit(U"@") | lit(U"#") | lit(U"%") )
            ;

       auto const tags
       = x3::rule<class tags_class, std::vector<std::u32string>{"tags"}
            %= +tag >> lit(U":");
    }

But I am stuck here:


  // Still undefined: this rule has to yield both the content and the tags.
  auto const sample_rule
     = x3::rule<class sample_rule_class, ast::sample> {"sample"}
     = ?? // something like +(char_ - (eol | tags));
Nicol Bolas
  • 449,505
  • 63
  • 781
  • 982
Aaron
  • 11
  • 2

1 Answer

0

I'm sure there is a much more elegant solution out there. In the meantime, here is a messy one:

  1. Parse each sample line as a single string unit.
  2. Use a semantic action to filter out the tags from each matched string unit.
  3. Discard the filtered tags from the string unit to be left with only content.

sample_ast.hpp

#pragma once
#include <string>
#include <vector>

namespace ast {

// Result of parsing one sample line.
struct sample {
    std::u32string              content;  // the line with its tag list removed
    std::vector<std::u32string> tags;     // the tags taken from the line
};

}

sample.h

#pragma once
#include <string>

#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/spirit/char_encoding/unicode.hpp>
#include <boost/spirit/home/x3.hpp>

#include "sample_ast.hpp"

// Only `content` is adapted. `tags` is deliberately left out of the Fusion
// sequence; it is filled in by hand from a semantic action instead.
BOOST_FUSION_ADAPT_STRUCT(ast::sample, content)

namespace grammar { 
namespace detail  {

using x3 = boost::spirit::x3;
using     x3::unicode::char_;
using     x3::eol;
using     x3::eoi;
using     x3::lexeme;

auto const sample_line 
 = x3::rule<class sample_line_class, std::u32string>{"sample_line"}
 = lexeme[ +(char_ - (eol|eoi)) ];

auto filter_tags = /*.... definition moved to next page for clarity */

auto const sample
=  x3::rule<class sample, ast::sample >{"sample"}
=% filter_tags[ sample_line ];
}}

namespace grammar {
  using grammar::detail::sample;
}

The filter_tags definition iterates over the matched data from right to left, collecting colon-separated tags until an invalid tag character is encountered or all characters have been exhausted. pos_saved tracks the beginning of the tag list; it is used to discard the tags from the content after they have been collected into the AST.

    auto filter_tags = []( auto& context )
    {
                         
       auto &attr = _attr(context);  // content string
       auto &val  = _val(context);   // ast::sample
    
       std::stack<char32_t> mem;
       auto        pos       = attr.rbegin();
       auto& const pos_end   = attr.rend();
       auto        pos_saved = atrr.end();
       do{

         //tag start or end          
         if( *pos == U':' ){
     
            if( mem.empty() ) {  //tag start

                mem.push(U':'); 
            }
            else { //tag end

                 //tag closed state:
                 //all chars for the current tag
                 //are ready for transfer into
                 //the ast.
                  
                 std::u32string tag;
                 while( mem.top() != ':' ){
                      
                      //since we're reverse iterating the data
                      //the tags wont be backwards 
                      tag.push_back( mem.top());
                      mem.pop();
                 } 

                 val.tags.push_back(tag);
                 //update the start offset of
                 //that tags
                 pos_saved = pos.base();   
            } 

         } else { // tag char or not

                using u = spirit::char_encoding::unicode; 
                if( !mem.empty() ) {   
                   if(u::isalnum(*pos)) mem.push( *pos ); //tag char found 
                   else                 break;            //invalid tag char found  
                }
                else {
                   //space after tag list but before content end 
                   if(u::isspace(*pos) pos_saved = pos.base(); 
                }
        }              
       }while(++pos != pos_end);

       if( pos_saved != attr.end()) attr.erase(pos_saved, attr.end() );
       if( attr.empty() )           _pass(context) = false;     
    };
    
Aaron
  • 11
  • 2