2

I wrote a C function that checks if two given strings (C-style) are anagrams or not. I try to verify it with Frama-C but it cannot validate the final behaviors of the function (other specifications are valid). The first one goes to timeout (even with very high timeout values in WP) and the second is unknown.

Here is the code:

    #include <string.h>
//@ ghost char alphabet[26] = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'};

/*@
    // Takes a character and return it to lowercase if it's uppercase
    axiomatic ToLower
    {
        logic char to_lower(char c);

        axiom lowercase:
            \forall char c; 97 <= c <= 122 ==> to_lower(c) == c;

        axiom uppercase:
            \forall char c; 65 <= c <= 90 ==> to_lower(c) == to_lower((char) (c+32));
    }
*/
/*@
    // Count the occurences of character 'c' into 'string' that is long 'n' characters
    axiomatic CountChar
    {
        logic integer count_char(char* string, integer n, char c);

        axiom count_zero:
            \forall char* string, integer n, char c; n <= 0 ==>
            count_char(string, n, c) == 0;

        axiom count_hit:
            \forall char* string, integer n, char c; n >= 0 && to_lower(string[n]) == c ==>
            count_char(string, n+1, c) == count_char(string, n, c) + 1;

        axiom count_miss:
            \forall char* string, integer n, char c; n >= 0 && to_lower(string[n]) != c ==>
            count_char(string, n+1, c) == count_char(string, n, c);
    }
*/

/*@
    predicate are_anagrams{L}(char* s1, char* s2) = ( \forall integer i; 0 <= i < 26 ==> 
    count_char(s1, strlen(s1), alphabet[i]) == count_char(s2, strlen(s2), alphabet[i]) );
*/

/*@
    requires valid_string(a);
    requires valid_string(b);

    // Requires that strings 'a' and 'b' are composed only by alphabet's letters and that are long equally.
    requires \forall integer k; 0 <= k < strlen(a) ==> 65 <= a[k] <= 90 || 97 <= a[k] <= 122;
    requires \forall integer k; 0 <= k < strlen(b) ==> 65 <= b[k] <= 90 || 97 <= b[k] <= 122;
    requires strlen(a) == strlen(b);

    ensures 0 <= \result <= 1;
    assigns \nothing;

    behavior anagrams:
    assumes are_anagrams(a, b);
    ensures \result == 1;
    behavior not_anagrams:
    assumes !are_anagrams(a, b);
    ensures \result == 0;
    complete behaviors anagrams, not_anagrams;
    disjoint behaviors anagrams, not_anagrams;
*/
int check_anagram(const char a[], const char b[])
{
   // Create two arrays and initialize them to zero
   int first[26];
   int second[26];
   int c;
   /*@
    loop assigns first[0..(c-1)];
    loop assigns second[0..(c-1)];
    loop assigns c; 
    loop invariant 0 <= c <= 26;
    loop invariant \forall integer k; 0 <= k < c ==> second[k] == first[k];
    loop invariant \forall integer k; 0 <= k < c ==> first[k] == 0 && second[k] == 0;
    loop invariant \valid(first+(0..25)) && \valid(second+(0..25));
    loop variant 26-c;
   */
   for(c = 0; c < 26; c++)
   {
      first[c] = 0;
      second[c] = 0;
   }

   char tmp = 'a';
   c = 0;

   // Now increment the array position related to position of character occured in the alphabet, subtracting ASCII decimal value of character from the character.
   /*@
    loop assigns first[0..25];
    loop assigns tmp;
    loop assigns c;
    loop invariant 97 <= tmp <= 122;
    loop invariant \valid(first+(0..25));
    loop invariant strlen(\at(a, Pre)) == strlen(\at(a, Here));
    loop invariant 0 <= c <= strlen(a);
    loop variant strlen(a)-c;
   */
   while (a[c] != '\0')
   {
      // This is a little trick to lowercase if the char is uppercase.
      tmp = (a[c] > 64 && a[c] < 91) ? a[c]+32 : a[c];
      first[tmp-97]++;
      c++;
   }


   c = 0;
   // Doing the same thing on second string.
   /*@
    loop assigns second[0..25];
    loop assigns tmp;
    loop assigns c;
    loop invariant 97 <= tmp <= 122;
    loop invariant \valid(second+(0..25));
    loop invariant strlen(\at(b, Pre)) == strlen(\at(b, Here));
    loop invariant 0 <= c <= strlen(b);
    loop variant strlen(b)-c;
   */
   while (b[c] != '\0')
   {
      tmp = (b[c] > 64 && b[c] < 91) ? b[c]+32 : b[c];
      second[tmp-'a']++;
      c++;
   }

   // And now compare the arrays containing the number of occurences to determine if strings are anagrams or not.
   /*@
    loop invariant strlen(\at(a, Pre)) == strlen(\at(a, Here));
    loop invariant strlen(\at(b, Pre)) == strlen(\at(b, Here));
    loop invariant 0 <= c <= 26;
    loop assigns c;
    loop variant 26-c;
   */
   for (c = 0; c < 26; c++)
   {
      if (first[c] != second[c])
         return 0;
   }

   return 1;
}
Bortoliño
  • 33
  • 6
  • I don't know frama, but `loop invariant 0 <= c <= 26;` and `for(c = 0; c < 26; c++)` don't feel right: one says 0...26, the other 0...25. Also, should the "declaration"(?) of `first` and `second` a couple of lines above be `0..(c-1)` or `0..25`? – TripeHound Aug 24 '15 at 08:38
  • 1
    @TripeHound An invariant has to be true for every iteration, including the one at which the loop exits because the condition has become false. For a for-loop `for(c = 0; c < 26; c++) …`, the invariant `0 ≤ c ≤ 26` is correct. The property that is guaranteed when the loop exits is “Invariant and not Condition”, in this case “0 ≤ c ≤ 26 and not c < 26”, from which one can mechanically deduce that c holds 26 when the loop exits (but that's the least of what should be inferred about what the loop does in order to conclude that the function works). – Pascal Cuoq Aug 24 '15 at 08:44
  • @TripeHound `loop assigns first[0..(c-1)];` is also correct: at the beginning of any iteration, only the elements 0 through c-1 of the array `first` have been assigned by the loop: http://blog.frama-c.com/index.php?post/2010/10/06/Specification-of-loop-assigns – Pascal Cuoq Aug 24 '15 at 08:48
  • @PascalCuoq Thanks for that... that [just about] makes sense. – TripeHound Aug 24 '15 at 08:53

2 Answers2

2

Your specification appears to be correct at first sight (but then again it is a very sophisticated specification. I have never written any ACSL that sophisticated and I could be missing something).

The annotations inside your function check_anagram, however, are clearly not enough to explain why this function should respect the contract. In particular, consider the while loops. In order to provide a real insight into how the function works, the invariant of each of these loops should express that at any iteration, the arrays respectively first and second contain the counts of characters of the first and second string visited so far.

This is why at the end of each of these loops, these arrays contain the character counts of the entire strings.

Expressing these invariants would really show how the function works. Without them, there is no hope to reach the conclusion that the contract is implemented.

Pascal Cuoq
  • 79,187
  • 7
  • 161
  • 281
  • I tried to add a new specification to the while loop, following your suggestion. The new spec is `loop invariant \forall integer i; 0 <= i <= c ==> first[i] == count_char( a, i+1, (char)(i+97) );` (using the axiomatic defined before) but, also using two different provers, i obtained similar results: Alt-Ergo: Unknown Z3: Timeout (with 300 seconds) – Bortoliño Aug 24 '15 at 09:49
  • As Pascal said, you need at least to do that also for `second` in the second loop. Additionally, the third loop also needs at least another invariant, saying that `first` and `second` have similar elements up to `c-1`. – Virgile Aug 26 '15 at 13:22
0

I'm no expert in static analysis, but I suspect that some static analysis engines might choke on things like (a[c] > 64 && a[c] < 91), a[c]+32, first[tmp-97] and the other ASCII-specific code you've used here.

Remember, C does not require an ASCII character set; For all we know you could be trying to run this where EBCDIC is the character set, and in that case, I would expect that there might be buffer overflows, depending on the input.

You should use a lookup table (or dictionary of some kind) to convert each character to an integer index, and functions like toupper and tolower to convert an unsigned char value (note the importance of unsigned char here) portably.

autistic
  • 1
  • 3
  • 35
  • 80
  • I previously tried to remove caps handling code/specific. No effort. – Bortoliño Aug 21 '15 at 15:43
  • @Bortolino I see. Please note that showing us code riddled with magic numbers is roughly the equivalent of inviting us into a dark house with filthy light switches that aren't suitably located. – autistic Aug 21 '15 at 15:51
  • i added some comments to source code to improve readability. Thank you. – Bortoliño Aug 21 '15 at 16:05
  • Comments shouldn't need to explain *what* is happening. They're much more useful for explaining *why* something is happening. – autistic Aug 22 '15 at 01:49
  • In practice, it's impossible to say anything about the behavior of a nontrivial C program without assumptions about implementation-defined behavior (`int x = 40000;` invokes implementation-defined behavior). Any analyzer the task of which involves finely tracking what a C program does will have these assumptions. A static analyzer that warns that your C program may not work in EBCDIC is useless, because statistically you could not care less about EBCDIC. – Pascal Cuoq Aug 24 '15 at 06:29
  • @PascalCuoq Indeed, `int x = 40000;` invokes implementation-defined behaviour, and the type of `x` is poorly chosen as a result. Is that really the example you're going to use? A poor choice? You seem to have written a lengthy comment, presumably to support writing non-portable code with garbage such as magic numbers and selecting inappropriate types? – autistic Aug 24 '15 at 06:56
  • @Freenode-newbostonSebivor I have no idea what you are talking about. You are not answering the question and I was trying to explain why. Sorry if I did not make it clear. – Pascal Cuoq Aug 24 '15 at 08:06
  • @Freenode-newbostonSebivor Oh maybe I see what you are talking about now. You claim that I somehow recommend writing `int x = 40000;`. Not at all. What I am saying is, suppose that you are writing a static analyzer. How do you choose that your static analyzer behaves when it encounters `int x = 40000;`. You cannot assume that it won't happen, especially since you claim it is some sort of anti-pattern; static analyzers are for detecting and warning about exactly these anti-patterns, right. Does your analyzer (a) warn about the assignment and stop the analysis until this problem has been fixed… – Pascal Cuoq Aug 24 '15 at 10:04
  • (b) warn about the assignment and continue the analysis assuming that `x` contains 40000, that is, if the next line is a division by `39999 - x`, there is no need to warn about a possible division by zero (c) warn and assume that `x` can have any value, that is, warn about `39999 - x` since that could be a division by zero. In practice, none of these solutions are well-received by actual users. – Pascal Cuoq Aug 24 '15 at 10:10
  • 1
    What actually seems to work in practice is (d) pick up a consistent choice of implementation-defined parameters and analyze for that target. This is what Frama-C does, it has picked ASCII and (unless you use a command-line option to change this) 32-bit, 2's complement `int`. – Pascal Cuoq Aug 24 '15 at 10:11
  • @PascalCuoq Actual users usually aren't impressed to hear that their code is imperfect/flawed. Boo hoo. – autistic Aug 26 '15 at 11:33