0

I'm trying to write a parser using Rust and nom that can parse separated lists of tokens, where the separator is usually surrounded by spaces, but does not have to be if the thing between the tokens is in parentheses.

For example, a and b is a valid expression, as are (a)and(b) and (a) and b; however, aandb is not valid.

For cases other than (a)and(b), the following code works fine. But removing the spaces from tag(" and ") makes (a)andb valid. How can I support both cases?

use nom::branch::alt;
use nom::bytes::complete::tag;
use nom::character::complete::{alpha1, char};
use nom::combinator::{all_consuming, complete, verify};
use nom::error::Error;
use nom::multi::separated_list1;
use nom::sequence::delimited;
use nom::{Finish, IResult};

/// Parses a bare token: a non-empty run of alphabetic characters, as
/// recognised by nom's `alpha1`.
///
/// On success returns the remaining input together with the matched token.
fn parse_token(i: &str) -> IResult<&str, &str> {
    let (rest, token) = alpha1(i)?;
    Ok((rest, token))
}

fn parse_parens(i: &str) -> IResult<&str, &str> {
    delimited(char('('), parse_token, char(')'))(i)
}

/// Parses one or more elements separated by the literal `" and "`.
///
/// Each element is either a parenthesised token or a bare token. Because the
/// separator includes the surrounding spaces, inputs such as `(a)and(b)` are
/// not matched by this version.
fn parse_and(i: &str) -> IResult<&str, Vec<&str>> {
    let separator = tag(" and ");
    let element = alt((parse_parens, parse_token));
    separated_list1(separator, element)(i)
}

/// Parses a complete input string into its list of tokens.
///
/// The whole input must be consumed by `parse_and`; any trailing, unparsed
/// text results in an error. The nom-level error wrapper is unwrapped via
/// `finish`, so callers receive a plain `Error<&str>`.
fn parse(i: &str) -> Result<Vec<&str>, Error<&str>> {
    let (_, tokens) = all_consuming(complete(parse_and))(i).finish()?;
    Ok(tokens)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Two bare tokens joined by a space-delimited `and`.
    #[test]
    fn no_parens() {
        let outcome = parse("a and b");
        assert!(outcome.is_ok());
    }

    /// Two parenthesised tokens joined by a space-delimited `and`.
    #[test]
    fn parens() {
        let outcome = parse("(a) and (b)");
        assert!(outcome.is_ok());
    }

    /// A parenthesised token followed by a bare token.
    #[test]
    fn mixed() {
        let outcome = parse("(a) and b");
        assert!(outcome.is_ok());
    }

    /// No space is required between a closing parenthesis and `and`.
    #[test]
    fn parens_no_space() {
        let outcome = parse("(a)and b");
        assert!(outcome.is_ok());
    }

    /// `and` must not run directly into a bare token.
    #[test]
    fn no_parens_no_space() {
        let outcome = parse("(a)andb");
        assert!(outcome.is_err());
    }
}
ojii
  • 4,729
  • 2
  • 23
  • 34
  • 1
    Tokenizing and parsing often gets messy and is thus often split into two parts; you should probably consider that here. – cafce25 Mar 17 '23 at 10:18
  • With your revised `parse_token` `"aandb"` is a valid token, thus it's also a valid list of 1 expression separated by 0 `"and"`. – cafce25 Mar 17 '23 at 10:57
  • you're right, `aandb` should be valid, but `(a)andb` should not. updated again – ojii Mar 17 '23 at 11:06

1 Answer

0

The solution was to check for a space, a closing paren, or EOF after parse_token; this solved the problem for me:

use nom::branch::alt;
use nom::bytes::complete::tag;
use nom::character::complete::{alpha1, char, multispace0, multispace1};
use nom::combinator::{all_consuming, complete, eof, peek, value};
use nom::error::Error;
use nom::multi::separated_list1;
use nom::sequence::{delimited, terminated};
use nom::{Finish, IResult};

/// Parses a bare token, i.e. one or more alphabetic characters (nom's
/// `alpha1`).
///
/// Yields the unconsumed remainder and the matched slice.
fn parse_token(i: &str) -> IResult<&str, &str> {
    let (remaining, word) = alpha1(i)?;
    Ok((remaining, word))
}

fn parse_parens(i: &str) -> IResult<&str, &str> {
    delimited(char('('), parse_token, char(')'))(i)
}

/// Parses a single expression and any whitespace that follows it.
///
/// An expression is either a parenthesised token, or a bare token that is
/// immediately followed by a valid end-of-expression marker (see
/// `end_of_expression`). Trailing whitespace is consumed and discarded.
fn parse_expr(i: &str) -> IResult<&str, &str> {
    let bare_token = terminated(parse_token, end_of_expression);
    let operand = alt((parse_parens, bare_token));
    terminated(operand, multispace0)(i)
}

/// Recognises what may legally follow a bare token.
///
/// Alternatives are tried in order:
/// 1. end of input,
/// 2. a closing parenthesis, which is only peeked at and left in the input,
/// 3. a single space character, which is consumed.
fn end_of_expression(i: &str) -> IResult<&str, ()> {
    let at_end_of_input = value((), eof);
    let before_closing_paren = value((), peek(char(')')));
    let single_space = value((), char(' '));
    alt((at_end_of_input, before_closing_paren, single_space))(i)
}

fn parse_and(i: &str) -> IResult<&str, Vec<&str>> {
    separated_list1(tag("and "), parse_expr)(i)
}

/// Entry point: parses `i` into the list of tokens it contains.
///
/// Fails unless `parse_and` consumes the entire input. The result of the
/// combinator is converted with `finish`, so the error returned to the caller
/// is a plain `Error<&str>` rather than nom's `Err` wrapper.
fn parse(i: &str) -> Result<Vec<&str>, Error<&str>> {
    match all_consuming(complete(parse_and))(i).finish() {
        Ok((_, tokens)) => Ok(tokens),
        Err(err) => Err(err),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `input` is accepted, showing the parse result if it is not.
    fn assert_accepted(input: &str) {
        let result = parse(input);
        assert!(result.is_ok(), "expected {:?} to parse, got {:?}", input, result);
    }

    /// Asserts that `input` is rejected, showing the parse result if it is not.
    fn assert_rejected(input: &str) {
        let result = parse(input);
        assert!(result.is_err(), "expected {:?} to be rejected, got {:?}", input, result);
    }

    /// Two bare tokens joined by a space-delimited `and`.
    #[test]
    fn no_parens() {
        assert_accepted("a and b");
    }

    /// Two parenthesised tokens joined by a space-delimited `and`.
    #[test]
    fn parens() {
        assert_accepted("(a) and (b)");
    }

    /// A parenthesised token followed by a bare token.
    #[test]
    fn mixed() {
        assert_accepted("(a) and b");
    }

    /// No space is required between a closing parenthesis and `and`.
    #[test]
    fn parens_no_space() {
        assert_accepted("(a)and b");
    }

    /// `and` must not run directly into a bare token.
    #[test]
    fn no_parens_no_space() {
        assert_rejected("(a)andb");
    }
}
ojii
  • 4,729
  • 2
  • 23
  • 34