I have the following regular expression:
[A-Z]*01*[^A-Z]{3}
The alphabet is [A-Z] and [0-9].
Can someone explain the correct way to convert it to an NFA and then to a DFA? And how can I implement the epsilon closure in C?
Implementing an NFA with ε-moves (NFA-ε)
First of all, the 36-character alphabet can be reduced to 4 exclusive classes in this particular case.
Let \1 mean [A-Z].
Let \2 mean [^A-Z01], aka [2-9].
This reduces our alphabet to \1, 0, 1 and \2.
[A-Z]*01*[^A-Z]{3} becomes \1*01*[01\2]{3}.
Thompson's construction can be used to convert a regular expression into an NFA-ε.
\1*01*[01\2]{3} is equivalent to \1*01*[01\2][01\2][01\2], so that gives us:
We can use the state diagram depicted above to generate the following transition table for the NFA-ε:
       ε        \1       0        1        \2
       -------  -------  -------  -------  -------
S00    S01,S03                                       Starting State
S01             S02
S02    S01,S03
S03                      S04
S04    S05,S07
S05                               S06
S06    S05,S07
S07                      S08      S08      S08
S08                      S09      S09      S09
S09                      S10      S10      S10
S10                                                  Accepting State
Of course, it's also quite evident that we could also use the following NFA:
That would be far simpler to implement because it's far simpler to implement an NFA engine than an NFA-ε engine. However, it's trivial to convert an NFA-ε into an NFA by going through the following steps:
Flatten recursive ε-moves.
For example,
S00--ε-->{S01}
S01--ε-->{S02}
S02--ε-->{S03}
S02--1-->{S04}
becomes
S00--ε-->{S01,S02,S03}
S01--ε-->{S02,S03}
S02--ε-->{S03}
S02--1-->{S04}
Merge state transitions reached through ε-moves.
For example,
S00--ε-->{S01,S02,S03}
S01--ε-->{S02,S03}
S02--ε-->{S03}
S03--1-->{S04}
becomes
S00--ε-->{S01,S02,S03}
S00--1-->{S04}
S01--ε-->{S02,S03}
S01--1-->{S04}
S02--ε-->{S03}
S02--1-->{S04}
S03--1-->{S04}
Add states that reach an accepting state through ε-moves to the set of accepting states.
Drop ε-moves.
For our NFA-ε, we get the following:
We could take it a step further and clean up the NFA.
There are unreachable states that can be removed. (S01, S03, S05 and S07 are unreachable.)
There are identical states that can be combined. (S00 and S02 have identical transitions, and so do S04 and S06.)
After performing those cleanups, we get
Doesn't that look familiar?
Time to implement! The following implements the NFA-ε above, but automatically converts it to an NFA using the steps above before processing the input. (It doesn't perform any of the cleanup steps.)
#include <stdio.h>
#include <stdint.h>
/*
 * Input classes. Each character of the alphabet is mapped to one of four
 * classes; class 0 is reserved for ε-moves and is never produced by the
 * lookup table below.
 *
 *   ε                    => 0
 *   'A'..'Z' = 41..5A    => 1
 *   '0'      = 30        => 2
 *   '1'      = 31        => 3
 *   '2'..'9' = 32..39    => 4
 */
#define NUM_INPUTS 5

/*
 * Maps a 7-bit character code to its input class (1..4), or to -1 when the
 * character is not part of the alphabet. The table is only ever read, so it
 * is declared const.
 */
#define _ -1
static const int char_to_input[0x80] = {
    _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, // 00..0F
    _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, // 10..1F
    _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, // 20..2F
    2, 3, 4, 4, 4, 4, 4, 4, 4, 4, _, _, _, _, _, _, // 30..3F
    _, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, _, _, _, _, _, // 50..5F
    _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, // 60..6F
    _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, // 70..7F
};
#undef _
/*
 * State sets are bit masks: bit n is set when state Sn belongs to the set.
 *
 * S(x) builds the mask for state index x. Both the argument and the whole
 * expansion are parenthesized so that expressions such as S(a | b) expand as
 * intended, and the shift is performed on an unsigned operand so that it
 * stays well defined even where int is only 16 bits wide.
 */
#define S(x) (1u << (x))
#define S00 S(0)
#define S01 S(1)
#define S02 S(2)
#define S03 S(3)
#define S04 S(4)
#define S05 S(5)
#define S06 S(6)
#define S07 S(7)
#define S08 S(8)
#define S09 S(9)
#define S10 S(10)
/*
 * Transition table of the NFA-ε: one row per state, one column per input
 * class, with column 0 holding the ε-moves. Each cell is the set (bit mask)
 * of destination states; _ marks an empty set. main() rewrites this table in
 * place before matching.
 */
#define _ 0
static uint_least16_t transitions[][NUM_INPUTS] = {
    //         ε       , \1  , 0   , 1   , \2
    //         ------- , --- , --- , --- , ---
    [ 0] = { S01|S03 , _   , _   , _   , _   },  // S00 Starting State
    [ 1] = { _       , S02 , _   , _   , _   },  // S01
    [ 2] = { S01|S03 , _   , _   , _   , _   },  // S02
    [ 3] = { _       , _   , S04 , _   , _   },  // S03
    [ 4] = { S05|S07 , _   , _   , _   , _   },  // S04
    [ 5] = { _       , _   , _   , S06 , _   },  // S05
    [ 6] = { S05|S07 , _   , _   , _   , _   },  // S06
    [ 7] = { _       , _   , S08 , S08 , S08 },  // S07
    [ 8] = { _       , _   , S09 , S09 , S09 },  // S08
    [ 9] = { _       , _   , S10 , S10 , S10 },  // S09
    [10] = { _       , _   , _   , _   , _   },  // S10 Accepting State
};
#undef _

// Number of states, i.e. the number of rows in the transition table.
#define NUM_STATES (sizeof(transitions) / sizeof(transitions[0]))

// The empty set: no live state is left, so the input is rejected.
uint_least16_t STATES_REJECT = 0;
// The set of states the automaton starts in.
uint_least16_t STATES_START = S00;
// The set of accepting states; main() may add further states to it.
uint_least16_t STATES_ACCEPT = S10;
/*
 * Flattens recursive ε-moves: afterwards, column 0 of every row holds every
 * state reachable from that row's state through one or more ε-moves.
 *
 * Each round is compared against the result of the previous round. Comparing
 * against the (not yet updated) table entry instead would never terminate
 * once a closure actually grows.
 */
static void flatten_epsilon_moves(void) {
    for (size_t s1 = 0; s1 < NUM_STATES; ++s1) {
        uint_least16_t closure = transitions[s1][0];
        uint_least16_t previous;
        do {
            previous = closure;
            for (size_t s2 = 0; s2 < NUM_STATES; ++s2) {
                if (closure & S(s2)) {
                    closure |= transitions[s2][0];
                }
            }
        } while (closure != previous);
        transitions[s1][0] = closure;
    }
}

/*
 * Converts the NFA-ε into an NFA. Expects the ε-moves to be flattened
 * already. The ε column itself is left in place; the matcher never reads it.
 */
static void fold_epsilon_moves(void) {
    // Merge into each state the transitions of every state it reaches
    // through ε-moves.
    for (size_t s1 = 0; s1 < NUM_STATES; ++s1) {
        uint_least16_t e_states = transitions[s1][0];
        for (size_t s2 = 0; s2 < NUM_STATES; ++s2) {
            if (!(e_states & S(s2))) {
                continue;
            }
            for (size_t i = 1; i < NUM_INPUTS; ++i) {
                transitions[s1][i] |= transitions[s2][i];
            }
        }
    }

    // A state that reaches an accepting state through ε-moves is itself
    // accepting.
    for (size_t s1 = 0; s1 < NUM_STATES; ++s1) {
        if (transitions[s1][0] & STATES_ACCEPT) {
            STATES_ACCEPT |= S(s1);
        }
    }
}

/* Returns the set of states reached from `states` on input class `input`. */
static uint_least16_t nfa_step(uint_least16_t states, int input) {
    uint_least16_t next = STATES_REJECT;
    for (size_t s = 0; s < NUM_STATES; ++s) {
        if (states & S(s)) {
            next |= transitions[s][input];
        }
    }
    return next;
}

/*
 * Runs the NFA over the NUL-terminated string `str`.
 *
 * Returns 1 when the whole string is accepted. Otherwise returns 0 and
 * stores in *fail_offset the offset at which matching failed; that offset
 * equals the string length when the input ended before an accepting state
 * was reached.
 */
static int nfa_match(const char *str, size_t *fail_offset) {
    uint_least16_t states = STATES_START;
    for (size_t i = 0; ; ++i) {
        unsigned char ch = (unsigned char)str[i];
        if (!ch) {
            if (states & STATES_ACCEPT) {
                return 1;
            }
            *fail_offset = i;
            return 0;
        }

        // Bytes outside the 7-bit table are not part of the alphabet.
        int input = (ch < 0x80) ? char_to_input[ch] : -1;
        states = (input < 0) ? STATES_REJECT : nfa_step(states, input);
        if (states == STATES_REJECT) {
            *fail_offset = i;
            return 0;
        }
    }
}

/*
 * Matches the single command-line argument against the regular expression
 * encoded in the transition table.
 *
 * Exit status: 0 on a match, 1 when the argument does not match, 2 when the
 * program is not given exactly one argument.
 */
int main(int argc, char** argv) {
    if (argc != 2) {
        fprintf(stderr, "usage\n");
        return 2;
    }

    flatten_epsilon_moves();
    fold_epsilon_moves();

    size_t offset = 0;
    if (nfa_match(argv[1], &offset)) {
        fprintf(stdout, "Match.\n");
        return 0;
    }

    fprintf(stderr, "Not a match. (Failed at offset %zu.)\n", offset);
    return 1;
}
(I used _ for the empty entries to make the non-trivial values stand out.)
An NFA can be converted into a DFA using powerset construction, but there's not much point in doing so.