From a regular expression to NFA and to DFA

Question

I have the following regular expression:

[A-Z]*01*[^A-Z]{3}

The alphabet is [A-Z0-9].

Can someone explain the correct way to convert it to an NFA and then to a DFA? And how can I implement the epsilon closure in C?

ikegami · Accepted Answer · 2019-12-01T01:27:03.853

Implementing an NFA with ε-moves (NFA-ε)

First of all, the 36-character alphabet can be reduced to 4 exclusive classes in this particular case.

Let \1 mean [A-Z]
Let \2 mean [^A-Z01] aka [2-9]

This reduces our alphabet to \1, 0, 1 and \2.

[A-Z]*01*[^A-Z]{3} becomes \1*01*[01\2]{3}

Thompson's construction can be used to convert a regular expression into an NFA-ε.

\1*01*[01\2]{3} is equivalent to \1*01*[01\2][01\2][01\2], so that gives us:

We can use the state diagram depicted above to generate the following transition table for the NFA-ε:

      ε       \1      0       1       \2
      ------- ------- ------- ------- -------
S00   S01,S03                                   Starting State
S01           S02
S02   S01,S03
S03                   S04
S04   S05,S07
S05                           S06
S06   S05,S07
S07                   S08     S08     S08
S08                   S09     S09     S09
S09                   S10     S10     S10
S10                                             Accepting State

Of course, it's also quite evident that we could use the following NFA:

That would be far simpler to implement because it's far simpler to implement an NFA engine than an NFA-ε engine. However, it's trivial to convert an NFA-ε into an NFA by going through the following steps:

Flatten recursive ε-moves.

For example,

S00--ε-->{S01}
S01--ε-->{S02}
S02--ε-->{S03}
S03--1-->{S04}

becomes

S00--ε-->{S01,S02,S03}
S01--ε-->{S02,S03}
S02--ε-->{S03}
S03--1-->{S04}

Merge state transitions reached through ε-moves.

For example,

S00--ε-->{S01,S02,S03}
S01--ε-->{S02,S03}
S02--ε-->{S03}
S03--1-->{S04}

becomes

S00--ε-->{S01,S02,S03}
S00--1-->{S04}
S01--ε-->{S02,S03}
S01--1-->{S04}
S02--ε-->{S03}
S02--1-->{S04}
S03--1-->{S04}

Add states that reach an accepting state through ε-moves to the set of accepting states.
Drop ε-moves.

For our NFA-ε, we get the following:

We could take it a step further and clean up the NFA.

There are unreachable states that can be removed. (S01, S03, S05 and S07 are unreachable.)
There are identical states that can be combined. (S00 and S02 have identical transitions, and so do S04 and S06.)

After performing those cleanups, we get

Doesn't that look familiar?

Time to implement! The following implements the NFA-ε above, but automatically converts it to an NFA using the steps above before processing the input. (It doesn't perform any of the cleanup steps.)

#include <stdio.h>
#include <stdint.h>


/*
   Input classes. Each class is a column index into the transition table.

   ε                 => 0
   'A'..'Z' = 41..5A => 1
   '0'      = 30     => 2
   '1'      = 31     => 3
   '2'..'9' = 32..39 => 4
*/

#define NUM_INPUTS 5

// Sentinel for characters that are not part of the alphabet. The replacement
// list is parenthesized so it always expands to a single operand.
#define _ (-1)

// Maps a character code in 0x00..0x7F to its input class (1..4), or to -1 if
// the character is outside the alphabet. The table is never written to, so
// it is declared const.
static const int char_to_input[0x80] = {
    _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,  // 00..0F
    _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,  // 10..1F
    _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,  // 20..2F
    2, 3, 4, 4, 4, 4, 4, 4, 4, 4, _, _, _, _, _, _,  // 30..3F
    _, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, _, _, _, _, _,  // 50..5F
    _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,  // 60..6F
    _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _,  // 70..7F
};

#undef _


/*
   A set of NFA states is stored as a bitmask: state number x is bit x.
   S(x) builds the mask for a state number; the parameter is parenthesized so
   that any expression can be passed safely.
*/
#define S(x) (1 << (x))
#define S00 S( 0)
#define S01 S( 1)
#define S02 S( 2)
#define S03 S( 3)
#define S04 S( 4)
#define S05 S( 5)
#define S06 S( 6)
#define S07 S( 7)
#define S08 S( 8)
#define S09 S( 9)
#define S10 S(10)

// Empty state set, used to keep the table below readable.
#define _ 0

// transitions[state][input class] is the set of states reached from `state`
// on that input class. Column 0 holds the ε-moves. The table is not const
// because main() rewrites it in place when converting the NFA-ε to an NFA.
static uint_least16_t transitions[][NUM_INPUTS] = {
   //       ε  ,   \1  ,    0  ,    1  ,   \2
   // -------  ,  ---  ,  ---  ,  ---  ,  ---
   {  S01|S03  ,  _    ,  _    ,  _    ,  _    },   // S00   Starting State
   {  _        ,  S02  ,  _    ,  _    ,  _    },   // S01
   {  S01|S03  ,  _    ,  _    ,  _    ,  _    },   // S02
   {  _        ,  _    ,  S04  ,  _    ,  _    },   // S03
   {  S05|S07  ,  _    ,  _    ,  _    ,  _    },   // S04
   {  _        ,  _    ,  _    ,  S06  ,  _    },   // S05
   {  S05|S07  ,  _    ,  _    ,  _    ,  _    },   // S06
   {  _        ,  _    ,  S08  ,  S08  ,  S08  },   // S07
   {  _        ,  _    ,  S09  ,  S09  ,  S09  },   // S08
   {  _        ,  _    ,  S10  ,  S10  ,  S10  },   // S09
   {  _        ,  _    ,  _    ,  _    ,  _    },   // S10   Accepting State
};

#undef _

// Initial state set, the empty ("dead") state set, and the set of accepting
// states. STATES_ACCEPT is extended at run time by main().
uint_least16_t STATES_START  = S00;
uint_least16_t STATES_REJECT = 0;
uint_least16_t STATES_ACCEPT = S10;

// Number of rows (states) in the transition table.
#define NUM_STATES (sizeof(transitions)/sizeof(transitions[0]))


/*
   Flatten recursive ε-moves.

   Replaces the ε column of every state with the full set of states that can
   be reached from it through one or more consecutive ε-moves.
*/
static void flatten_epsilon_moves(void) {
   for (size_t s1=0; s1<NUM_STATES; ++s1) {
      uint_least16_t e_states = transitions[s1][0];
      uint_least16_t prev_e_states;

      // Keep following ε-moves until a full pass adds no new state.
      // The comparison has to be against the set as it was at the start of
      // the pass: the table entry is only written after the loop, so
      // comparing against it would loop forever once the set has grown.
      do {
         prev_e_states = e_states;
         for (size_t s2=0; s2<NUM_STATES; ++s2) {
            if (e_states & S(s2)) {
               e_states |= transitions[s2][0];
            }
         }
      } while (e_states != prev_e_states);

      transitions[s1][0] = e_states;
   }
}


/*
   Convert the NFA-ε into an NFA.

   Expects the ε column to have been flattened already. The ε column itself
   is left in the table; the engine never looks at it because no character
   maps to input class 0.
*/
static void convert_to_nfa(void) {
   // For each state s1,
   // for each state s2,
   // if s1 has an ε-move to s2,
   // merge s2's transitions into s1's.
   for (size_t s1=0; s1<NUM_STATES; ++s1) {
      uint_least16_t e_states = transitions[s1][0];
      for (size_t s2=0; s2<NUM_STATES; ++s2) {
         if (e_states & S(s2)) {
            for (size_t i=1; i<NUM_INPUTS; ++i) {
               transitions[s1][i] |= transitions[s2][i];
            }
         }
      }
   }

   // For each state s1,
   // if s1 has an ε-move to any accepting state,
   // add s1 to the set of accepting states.
   for (size_t s1=0; s1<NUM_STATES; ++s1) {
      if (transitions[s1][0] & STATES_ACCEPT) {
         STATES_ACCEPT |= (uint_least16_t)S(s1);
      }
   }
}


/*
   NFA engine.

   Feeds str to the NFA one character at a time, tracking the set of states
   the NFA could currently be in. The whole string must match.

   Prints "Match." to stdout and returns 0 if the NFA is in an accepting state
   at the end of the string. Otherwise prints the offset at which the match
   failed to stderr and returns 1.
*/
static int run_nfa(const char *str) {
   uint_least16_t states = STATES_START;

   for (size_t i=0; ; ++i) {
      unsigned char ch = (unsigned char)str[i];

      if (!ch) {
         // End of input.
         if (states & STATES_ACCEPT) {
            fprintf(stdout, "Match.\n");
            return 0;
         }

         states = STATES_REJECT;
      } else if (ch >= 0x80) {
         // Outside of the lookup table, and therefore outside the alphabet.
         states = STATES_REJECT;
      } else {
         int input = char_to_input[ch];
         if (input < 0) {
            states = STATES_REJECT;
         } else {
            // Union of the transitions of every state we could be in.
            uint_least16_t new_states = STATES_REJECT;
            for (size_t s=0; s<NUM_STATES; ++s) {
               if (states & S(s)) {
                  new_states |= transitions[s][input];
               }
            }

            states = new_states;
         }
      }

      if (states == STATES_REJECT) {
         fprintf(stderr, "Not a match. (Failed at offset %zu.)\n", i);
         return 1;
      }
   }
}


/*
   Checks whether the string given as the sole command-line argument matches
   the regular expression implemented by the transition table.

   Exit status: 0 on a match, 1 if there is no match, 2 on a usage error.
*/
int main(int argc, char** argv) {
   if (argc != 2) {
      fprintf(stderr, "usage\n");
      return 2;
   }

   flatten_epsilon_moves();
   convert_to_nfa();
   return run_nfa(argv[1]);
}

(I used _ to make non-trivial values stand out.)

An NFA can be converted into a DFA using powerset construction, but there's not much point in doing so.

From a regular expression to NFA and to DFA

1 Answers1