3

I'm trying to parse frames formatted in the following scheme :

$[number],[number],[number],<string>;[string]~<string>

Parameters surrounded with '[]' are optional and those surrounded with '<>' are always present.

Thus, the following frames are all correct:

$0,0,0,thisIsFirstString;secondString~thirdOne
$0,,0,firstString;~thirdOne
$,,,firstString;~thirdString

Currently I'm able to parse the frame when all elements are present with the following code

/* Demonstration program: splits one frame of the form
 * $[number],[number],[number],<string>;[string]~<string>
 * into six text fields with a single sscanf() call and prints them. */
int main() {
    char frame[100] = "$1,2,3,string1;string2~string3";
    /* Destination buffers are left uninitialized: any field sscanf() does not
     * reach keeps indeterminate contents, which is what shows up as garbage
     * in the output when a field is missing. */
    char num1[10], num2[10], num3[10], str1[100], str2[100], str3[100];

    printf("frame : %s\n", frame);

    /* Each %[^x] directive must match at least one character; on an empty
     * field the directive fails and sscanf() stops there, so the remaining
     * arguments are never written. The return value (number of fields
     * assigned) is not checked here. */
    sscanf(frame,"$%[^,],%[^,],%[^,],%[^;];%[^~]~%s", num1, num2, num3, str1, str2, str3);

    printf("Number 1 : %s\n", num1);
    printf("Number 2 : %s\n", num2);
    printf("Number 3 : %s\n", num3);
    printf("String 1 : %s\n", str1);
    printf("String 2 : %s\n", str2);
    printf("String 3 : %s\n", str3);

    return 0;
}

With the following result

frame : $1,2,3,string1;string2~string3
Number 1 : 1
Number 2 : 2
Number 3 : 3
String 1 : string1
String 2 : string2
String 3 : string3

However, if a parameter is missing, the parameters before are well parsed, but not those which are after the missing parameter.

frame : $1,,3,string1;string2~string3
Number 1 : 1
Number 2 : 
Number 3 : 
String 1 :��/�
String 2 : �\<��
String 3 : $[<��

frame : $1,2,3,string1;~string3
Number 1 : 1
Number 2 : 2
Number 3 : 3
String 1 : string1
String 2 : h�v��
String 3 : ��v��

How can I specify to sscanf that some parameters are potentially missing in the frame so in that case they will be discarded?

chqrlie
  • 131,814
  • 10
  • 121
  • 189
Arkaik
  • 852
  • 2
  • 19
  • 39
  • 6
    My hint: don't use `sscanf` if the input is potentially different from what you expect, but parse the string yourself. – Jabberwocky Jun 14 '18 at 11:40
  • 5
    Parsing like that is really not one of the strong points of `scanf` and family. You might want to consider using other methods, like tokenizing with [`strtok`](http://en.cppreference.com/w/c/string/byte/strtok) or perhaps even [regular expressions](https://en.wikipedia.org/wiki/Regular_expression). – Some programmer dude Jun 14 '18 at 11:41
  • Do the optional numbers have a default value if they are not provided or something? – J...S Jun 14 '18 at 11:53
  • Is `string` only made up with letters? – chux - Reinstate Monica Jun 14 '18 at 12:01
  • @J...S No, optional numbers (same for strings) are either specified or not present (empty field). – Arkaik Jun 14 '18 at 12:01
  • What should happen if the parsed `string` length exceeds 99? – chux - Reinstate Monica Jun 14 '18 at 12:02
  • @chux No, string fields may contain any ascii character, including numbers. And I could have added length conditions but I know that strings can't exceed 20 bytes so it's not an issue in that case even if it could have been. – Arkaik Jun 14 '18 at 12:03
  • Then how to know when a `string` ends? "any ASCII character" includes `'\0'`, so that is certainly too broad. With `;` is it any character aside from `;`. Are white-spaces part of a `str1` and `str3`? – chux - Reinstate Monica Jun 14 '18 at 12:08
  • @chux I fact I also know that there won't have any '\0' character in the string except at the end of the frame, there also won't have any delimiter (',' or ';' or '~') character inside the string. – Arkaik Jun 14 '18 at 12:12

3 Answers3

2

I guess the best approach is still to write your own parser function:

#define _GNU_SOURCE 1
#define _POSIX_C_SOURCE 1
#include <stdio.h>
#include <stddef.h>
#include <assert.h>
#include <string.h>
#include <stdlib.h>

#define __arraycount(x) (sizeof(x)/sizeof(x[0]))

// from https://stackoverflow.com/a/3418673/9072753
/*
 * Single-delimiter tokenizer that, unlike strtok(), reports empty fields.
 *
 * m: caller-owned state; receives the position where the next call resumes.
 * s: string to start tokenizing (modified in place), or NULL to continue
 *    from *m.
 * c: delimiter ending the current token; '\0' means "rest of the string".
 *
 * Returns the start of the token, or NULL once the end of the string has
 * been reached. The delimiter, when found, is overwritten with '\0'.
 */
static char *mystrtok(char **m,char *s,char c)
{
  char *p = s ? s : *m;
  if (!*p)
    return NULL;
  char *sep = strchr(p, c);
  if (sep && *sep != '\0') {
    *sep = '\0';
    *m = sep + 1;
  } else {
    /* Delimiter absent, or c == '\0' (strchr then returns the terminator):
     * park the state ON the terminator, never past it, so that the next
     * call reads a valid byte and returns NULL. */
    *m = p + strlen(p);
  }
  return p;
}

/*
 * Returns the delimiter that terminates field number i of a frame:
 * ',' after fields 0..2, ';' after field 3, '~' after field 4, and 0
 * (end of string) for field 5 and any larger index.
 */
static char getseparator(size_t i)
{
    static const char separators[] = { ',', ',', ',', ';', '~' };

    if (i < sizeof separators)
        return separators[i];
    return 0;
}

/*
 * Reads frames from stdin, one per line, in the form
 *   $[number],[number],[number],<string>;[string]~<string>
 * splits each into six strings (empty fields become empty strings),
 * then prints every stored field and releases all memory.
 *
 * Lines not starting with '$' and lines that do not yield six fields are
 * reported and skipped; they are never stored.
 * Returns 0 on success, -1 if a memory allocation fails.
 */
int main()
{
    char ***output = NULL;
    size_t outputlen = 0;
    const size_t outputstrings = 6;

    char *line = NULL;
    size_t linelen = 0;
    size_t linecnt;
    for (linecnt = 0; getline(&line, &linelen, stdin) > 0; ++linecnt) {
        if (line[0] != '$') {
            printf("Lines not starting with $ are ignored\n");
            continue;
        }

        // remove the closing newline, but only if there is one: the last
        // line of the input may end without it
        size_t len = strlen(line);
        if (len > 0 && line[len - 1] == '\n')
            line[len - 1] = '\0';

        // parse into a temporary, zero-filled row so that a malformed line
        // can be dropped without leaving a half-initialized entry in output
        char **row = calloc(outputstrings, sizeof(*row));
        if (row == NULL) {
            fprintf(stderr, "%d Error allocating memory", __LINE__);
            return -1;
        }

        char *token;
        char *state;
        size_t i = 0;
        token = mystrtok(&state, &line[1], getseparator(i));
        while (i < outputstrings && token != NULL) {
            row[i] = strdup(token);
            if (row[i] == NULL) {
                fprintf(stderr, "%d Error allocating memory", __LINE__);
                return -1;
            }
            ++i;
            // do not ask for a token beyond the last expected field
            token = i < outputstrings
                  ? mystrtok(&state, NULL, getseparator(i))
                  : NULL;
        }
        if (i != outputstrings) {
            printf("Malformed line: %s %zu %p \n", line, i, (void *)token);
            for (size_t j = 0; j < i; ++j) {
                free(row[j]);
            }
            free(row);
            continue;
        }

        // grow the table by one row; keep the old pointer until realloc
        // is known to have succeeded
        char ***grown = realloc(output, sizeof(*output) * (outputlen + 1));
        if (grown == NULL) {
            fprintf(stderr, "%d Error allocating memory", __LINE__);
            return -1;
        }
        output = grown;
        output[outputlen++] = row;
    }
    free(line);

    for (size_t i = 0; i < outputlen; ++i) {
        for (size_t j = 0; j < outputstrings; ++j) {
            printf("From line %zu the string num %zu: `%s`\n", i, j, output[i][j]);
        }
    }

    for (size_t i = 0; i < outputlen; ++i) {
        for (size_t j = 0; j < outputstrings; ++j) {
            free(output[i][j]);
        }
        free(output[i]);
    }
    free(output);

    return 0;
}

Which, for the input:

$0,0,0,thisIsFirstString;secondString~thirdOne
$0,,0,firstString;~thirdOne
$,,,firstString;~thirdString

produces the result:

From line 0 the string num 0: `0`
From line 0 the string num 1: `0`
From line 0 the string num 2: `0`
From line 0 the string num 3: `thisIsFirstString`
From line 0 the string num 4: `secondString`
From line 0 the string num 5: `thirdOne`
From line 1 the string num 0: `0`
From line 1 the string num 1: ``
From line 1 the string num 2: `0`
From line 1 the string num 3: `firstString`
From line 1 the string num 4: ``
From line 1 the string num 5: `thirdOne`
From line 2 the string num 0: ``
From line 2 the string num 1: ``
From line 2 the string num 2: ``
From line 2 the string num 3: `firstString`
From line 2 the string num 4: ``
From line 2 the string num 5: `thirdStrin`
KamilCuk
  • 120,984
  • 8
  • 59
  • 111
1

As others have said scanf() family functions may not be suited for this as adequate error handling is not possible in case the input string is not in the expected format.

But if you are sure that the input string will always be of that format, you could use a pointer pointing to the relevant part of the input string which is then processed with sscanf().

First initialise all the character array to an empty string so that they won't display garbage when printed. Like

char num1[10]="";

for all the char arrays to which the extracted parameters are to be written.

Declare a character pointer and make it point to the beginning of frame which is the input string.

char *ptr=frame;

Now check for the first parameter which is optional like

if(sscanf(ptr, "$%[^,],", num1)==1)
{
    //parameter 1 is present.
    ptr+=strlen(num1);  
}
ptr+=2;

If the parameter is present, we increment ptr by the length of parameter string and a further increment of 2 is done for the '$' and comma.

Likewise for the next two parameters which are also optional.

if(sscanf(ptr, "%[^,]", num2)==1)
{
    //Parameter 2 is present
    ptr+=strlen(num2);  
}
ptr+=1;

if(sscanf(ptr, "%[^,]", num3)==1)
{
    //Parameter 3 is present
    ptr+=strlen(num3);  
}
ptr+=1;

The next parameter, parameter 4, is not optional.

sscanf(ptr, "%[^;]", str1);
ptr+=strlen(str1)+1;

Now for the optional parameter 5,

if(sscanf(ptr, "%[^~]", str2)==1)
{
    //Parameter 5 is present
    ptr+=strlen(str2);  
}
ptr+=1; //for ~

And finally for the non-optional parameter 6,

sscanf(ptr, "%s", str3);

Possible error checking is omitted for brevity. And to prevent overflow, use width specifiers in the scanf() format string like

sscanf(ptr, "%9[^,],", num2);

where 9 is one less than the length of the num2 character array.

And in your program, sscanf() stops assigning to the variables as soon as a %[^,] directive meets an empty field, because %[ must match at least one character; every argument after that point is left untouched.

J...S
  • 5,079
  • 1
  • 20
  • 35
  • The 2nd `,` in the format of `sscanf(ptr, "%[^,],", num2)==1` serves no functional purpose. Further, should `ptr` end without a `,` this code will index passed the end of the string with `ptr+=1;`. Perhaps this is part of "Possible error checking is omitted for brevity." – chux - Reinstate Monica Jun 14 '18 at 13:03
  • @chux Do you mean, if the part of the string pointed to by `ptr` matching `%[^,]` is an empty string, `sscanf()` will stop assigning and the comma afterwards won't be considered anyway? – J...S Jun 14 '18 at 13:05
  • `sscanf(ptr, "%[^,],", num2)==1` is a test if the `num2` had something written to it. Yet it does not insure that a `,` followed. Consider the effect of `if(sscanf("abc", "%[^,],", num2)==1) { ptr+=strlen(num2); } ptr+=1;`. Where does `ptr` now point? – chux - Reinstate Monica Jun 14 '18 at 13:08
  • @chux I see that if a comma is absent, it will get out of hand but if the input is well formatted and there is a comma, is it that the `sscanf()` stops assigning if the `%[^,]` matches an empty string? – J...S Jun 14 '18 at 13:16
  • 1
    `sscanf(",","%[^,],", num2)` should not return 1. So this test handles well formed input. IMHO, lack of error handling, apparently OK for OP, remains a weakness for general applicability of this approach. – chux - Reinstate Monica Jun 14 '18 at 14:36
  • @chux Lack of error handling methods is indeed a problem with `scanf()` family. But the `sscanf()` format string in your last comment looks strange. Maybe it has a mistake in it? Or is it just something that I don't know? – J...S Jun 14 '18 at 14:41
  • Let us [continue this discussion in chat](https://chat.stackoverflow.com/rooms/173159/discussion-between-chux-and-j-s). – chux - Reinstate Monica Jun 14 '18 at 14:45
  • 1
    sscanf() in itself is not the error problem, it is the problems like "if(sscanf("abc", "%[^,],", num2)==1) { ptr+=strlen(num2); } ptr+=1;" and "if(sscanf("abc(200 chracters)xyz,", "%[^,],", num2). With well formed input, your answer works fine, yet I find it difficult to use as a basis with good error handling. – chux - Reinstate Monica Jun 14 '18 at 15:09
1

scanf() cannot handle empty matches for character classes (a %[...] directive must match at least one character), and strtok() considers every sequence of separators as a single separator, which is really only suitable for white space.

Here is a simple scanf-like non greedy parser for your purpose:

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>

/*
 * Minimal sscanf-like parser in which every conversion may match an empty
 * field.
 *
 * Supported format items:
 *   %s  takes TWO arguments, a size_t buffer size followed by a char *
 *       destination. Copies characters up to (not including) the character
 *       that follows %s in the format, or to the end of input. Excess
 *       characters are consumed but dropped; the result is NUL-terminated
 *       whenever size is non-zero.
 *   %d  takes an int *; decimal number parsed with strtol(). An empty field
 *       stores 0 and consumes nothing.
 *   %i  like %d, but with base auto-detection (strtol base 0).
 *   %%  matches a literal '%'.
 *   ' ' skips any amount of white space in the input.
 *   any other character must match the input exactly.
 *
 * Returns the number of conversions processed before the format ended or a
 * literal failed to match, or -1 if the format contains an unsupported
 * conversion.
 */
int my_sscanf(const char *s, const char *fmt, ...) {
    int res = 0;
    va_list ap;

    va_start(ap, fmt);
    for (; *fmt; fmt++) {
        if (*fmt == '%') {
            fmt++;
            if (*fmt == 's') {
                size_t i = 0, size = va_arg(ap, size_t);
                char *dest = va_arg(ap, char *);

                /* fmt[1] is the delimiter; '\0' there means "to the end" */
                while (*s && *s != fmt[1]) {
                    if (i + 1 < size)
                        dest[i++] = *s;
                    s++;
                }
                if (size)
                    dest[i] = '\0';
                res++;
                continue;
            }
            if (*fmt == 'd' || *fmt == 'i') {
                /* use a real char * for the end pointer instead of aliasing
                 * the const char * cursor through a cast */
                char *end;
                long value = strtol(s, &end, *fmt == 'd' ? 10 : 0);

                *va_arg(ap, int *) = (int)value;
                s = end;
                res++;
                continue;
            }
            /* add support for other conversions as you wish */
            if (*fmt != '%') {
                /* leave through the common exit so va_end() always runs */
                res = -1;
                break;
            }
        }
        if (*fmt == ' ') {
            while (isspace((unsigned char)*s))
                s++;
            continue;
        }
        if (*s == *fmt) {
            s++;
        } else {
            break;
        }
    }
    va_end(ap);
    return res;
}

/* Demo driver: parses one frame with an empty second number and an empty
 * second string using my_sscanf(), then prints the six fields, or the
 * conversion count if fewer than six conversions were processed. */
int main() {
    char frame[100] = "$1,,3,string1;~string3";
    char first[100], second[100], third[100];
    int a, b, c;
    int parsed;

    printf("frame : %s\n", frame);

    /* every %s consumes a (size, buffer) argument pair */
    parsed = my_sscanf(frame, "$%d,%d,%d,%s;%s~%s",
                       &a, &b, &c,
                       sizeof first, first,
                       sizeof second, second,
                       sizeof third, third);

    if (parsed != 6) {
        printf("my_scanf returned %d\n", parsed);
        return 0;
    }

    printf("Number 1 : %d\n", a);
    printf("Number 2 : %d\n", b);
    printf("Number 3 : %d\n", c);
    printf("String 1 : %s\n", first);
    printf("String 2 : %s\n", second);
    printf("String 3 : %s\n", third);
    return 0;
}
chqrlie
  • 131,814
  • 10
  • 121
  • 189