Grammar inference: How can the algorithm Alignment Based Learning help us infering a grammar

Question

I'm a beginner in the grammar inference domain. During my research about it, I found an implementation called Alignment Based Learning (ABL). I have understood what is being done in this implementation (Alignment -> Clustering -> selection) but i don't know what to do next in order to reach my objective which is the grammar induced.

My question is: after having done this 3 steps of the algorithm, what should we do next to get the induced grammar, and how?

This is a link to the ABL implementation (works on linux):

http://ilk.uvt.nl/menno/research/software/abl

score 1 · Answer 1 · answered Apr 24 '15 at 13:06

There are a manual pdf in the package, in addition I wrote to Dr. Zaanen (who presents ABL) and this is his answer:

I have written a small Perl script some years ago that does something like this. I'm not sure the conversion of left recursive rules to right recursive rules works well, though.

Zaanen

this is his code (I've not checked it)

#!/usr/bin/perl -w
use strict;
use vars qw($opt_h $opt_i $opt_o $opt_l $opt_d);
use Getopt::Std;

$opt_i ="-"; # default value
$opt_o ="-"; # default value
getopts('hi:o:ld');

$opt_h ||=0; # default value
$opt_d ||=0; # default value
$opt_l ||=0; # default value

my $usage =
  "Usage: $0 [OPTION]\.\.\.
This program converts an ABL file into a Nuance GSL file\.

  -i FILE   Name of input file (default: $opt_i)
  -o FILE   Name of output file (default: $opt_o)
  -l        Convert left recurive rules to right recursive (default: off)
  -d        Debug (default: off)
  -h        Show this help and exit
";

die $usage if $opt_h;

die $usage if $opt_i eq "";
open(INPUT, "<$opt_i")
  || die "Couldn't open input file: $!\n";

die $usage if $opt_o eq "";
open(OUTPUT, ">$opt_o")
  || die "Couldn't open outputfile: $!\n";


my %grammar;
my $start_nonterm;
my %first;

sub read_line() {
  my $line=<INPUT>;
  while (defined($line) and ($line=~/^#/)) {
    $line=<INPUT>;
  }
  return $line;
}

sub extract_structure {
  my $structure=$_[0];
  my %structure;
  my $constituent;
  while ($structure=~/\(\s*(\d+)\s*,\s*(\d+)\s*,\s*\[([^\]]+)\]\s*\)\s*/g) {
    my $b=$1;
    my $e=$2;
    my $n=$3;
    if ($n!~/^\d+$/) {
      die "Incorrect nonterminal format: $n";
    }
    if ($b!=$e) {
      $structure{$b}{$e}=$n;
    }
  }
  return %structure;
}

sub find_rhs {
  my @sentence=@{$_[0]};
  my %structure=%{$_[1]};
  my $begin=$_[2];
  my $end=$_[3];
  my @result;
  my $counter=$begin;
  while ($counter!=$end) {
    if (defined($structure{$counter})) {
      # handle non-term
      my @keys=sort { $a < $b } keys %{$structure{$counter}};
      if ($keys[0]>$end) {
        die "Found overlapping constituent: ($counter,$keys[0],[${$structure{$counter}}{$keys[0]}])";
      }
      my $nonterm=translate_nonterm(${$structure{$counter}}{$keys[0]});
      push (@result, $nonterm);
      find_grammar(\@sentence, \%structure, $counter, $keys[0]);
      $counter=$keys[0];
    }
    else {
      # handle term
      push (@result, $sentence[$counter]);
      ++$counter;
    }
  }
  return join(" ", @result);
}

sub find_grammar {
  my @sentence=@{$_[0]};
  my %structure=%{$_[1]};
  my $begin=$_[2];
  my $end=$_[3];
  my $top_level=$_[4];
  my $lhs="";
  if (defined($structure{$begin}) and
      defined(${$structure{$begin}}{$end})) {
    $lhs=${$structure{$begin}}{$end};
    delete ${$structure{$begin}}{$end};
    if (scalar(keys %{$structure{$begin}})==0) {
      delete $structure{$begin};
    }
  }
  if ($lhs eq "") {
    die "missing start symbol";
  }
  my $rhs=find_rhs(\@sentence, \%structure, $begin, $end);
  $lhs=translate_nonterm($lhs);
  if ($top_level) {
    $start_nonterm=$lhs;
  }
  ${$grammar{$lhs}}{$rhs}++;
}

sub translate_nonterm {
  my $in=$_[0];
  if ($in==0) {
    return "A";
  }
  my $result;
  while ($in!=0) {
    my $digit=$in%10;
    $in=($in-($in%10))/10;
    $result.=chr(ord("A")+$digit);
  }
  return $result;
}

sub remove_direct_left_recursion {
  my $Ai=$_[0];
  my @rhss=keys %{$grammar{$Ai}};
  my %recursive;
  my %nonrecursive;
  foreach my $rhs (@rhss) {
    if ($rhs=~/^$Ai\s*(.*)$/) {
      my $alpha=$1;
      $recursive{$alpha}++;
    }
    else {
      $nonrecursive{$rhs}++;
    }
  }
  if (scalar(keys %recursive)>0) {
    delete $grammar{$Ai}; # remove
    delete $first{$Ai};
    foreach my $rhs (keys %nonrecursive) {
      if ($rhs=~/^([A-Z]+)\s*(.*)$/) {
        $first{$Ai}{$1}=1;
      }
      $grammar{$Ai}{$rhs}++;
      $grammar{$Ai}{$rhs." $Ai'"}++;
    }
    foreach my $rest (keys %recursive) {
      if ($rest=~/^([A-Z]+)\s*(.*)$/) {
        $first{$Ai}{$1}=1;
      }
      $grammar{"$Ai'"}{$rest}++;
      $grammar{"$Ai'"}{$rest." $Ai'"}++;
    }
  }
}

sub remove_left_recursion {
  my $nonterm=$_[0];
print "handling $nonterm\n";
  if (defined($first{$nonterm})) { # already done
    print "already done\n";
    return;
  }
  # find left nonterms in first
  my @rhs=keys %{$grammar{$nonterm}};
  foreach my $rhs (@rhs) {
    if ($rhs=~/^([A-Z]+)\s*.*$/) {
      my $left_nonterm=$1;
      $first{$nonterm}{$left_nonterm}=1;
    }
  }
  print "first level:".join(" ", (keys %{$first{$nonterm}}))."\n";

  # find first of nonterms in first
  foreach my $left_nonterm (keys %{$first{$nonterm}}) {
    remove_left_recursion($left_nonterm);
    foreach my $n (keys %{$first{$left_nonterm}}) {
      $first{$nonterm}{$n}=1;
    }
  }
  if (defined($first{$nonterm}{$nonterm})) {
    my @chain=keys %{$first{$nonterm}};
    foreach my $a (keys %first) {
      print "$a:".join(" ", (keys %{$first{$a}}))."\n";
    }
    print "Identified chain $nonterm=".join(" ", @chain)."\n";
    remove_left_recursion_chain(\@chain);
    @chain=keys %{$first{$nonterm}};
    foreach my $a (keys %first) {
      print "$a:".join(" ", (keys %{$first{$a}}))."\n";
    }
    print "rest of chain: $nonterm=".join(" ", @chain)."\n";
  }
}

sub remove_left_recursion_chain {
  my @lhs=@{$_[0]};
  my $nr_lhs=scalar(@lhs)-1;
  for my $i (0 .. $nr_lhs) {
    my $Ai=$lhs[$i];
    my @rhs_Ai=keys %{$grammar{$Ai}};
    if ($opt_d) {
      print STDERR "Removing recursion of $Ai: ";
      print STDERR "$i of $nr_lhs (".scalar(@rhs_Ai).")\n";
    }
    for my $j (0 .. $i-1) {
      my $Aj=$lhs[$j];
      foreach my $rhs (@rhs_Ai) {
        if ($rhs=~/^$Aj\s*(.*)$/) {
          my $alpha=$1;
          delete ${$grammar{$Ai}}{$rhs};
          foreach my $beta (keys %{$grammar{$Aj}}) {
            ${$grammar{$Ai}}{"$beta $alpha"}++;
            if ($rhs=~/^([A-Z]+)\s*(.*)$/) {
              $first{$Ai}{$1}=1;
            }
          }
          delete $first{$Ai}{$Aj};
        }
      }
    }
    # transform direct left recursion
    remove_direct_left_recursion($Ai);
  }
}

sub remove_unreachable_rules {
  my %used_tokens;
  $used_tokens{$start_nonterm}++; # start is always reachable
  foreach my $lhs (keys %grammar) {
    foreach my $rhs (keys %{$grammar{$lhs}}) {
      foreach my $token (split(/\s+/, $rhs)) {
        $used_tokens{$token}++;
      }
    }
  }
  my @lhss=keys %grammar;
  foreach my $lhs (@lhss) {
    if (!defined($used_tokens{$lhs})) {
      delete $grammar{$lhs};
    }
  }
}

sub print_grammar {
  foreach my $lhs (sort keys %grammar) {
    my %rhss=%{$grammar{$lhs}};
    my @rhss;
    foreach my $rhs (keys %rhss) {
      if ($rhs=~/\s/) {
        push (@rhss, "($rhs)");
      }
      else {
        push (@rhss, "$rhs");
      }
    }
    if ($lhs eq $start_nonterm) {
      $lhs=".$lhs";
    }
    if (scalar(@rhss)>1) {
      print OUTPUT "$lhs [".join(" ", @rhss)."]\n";
    }
    else {
      print OUTPUT "$lhs ".join(" ", @rhss)."\n";
    }
  }
}

my $nr_lines;
my $line=read_line();
while (defined($line)) {
  $nr_lines++;
  chomp ($line);
  if ($line!~/^(.*)@@@(.*)$/) {
    die "Incorrect input: $line";
  }
  my @sentence=split(/\s+/, $1);
  my $structure=$2;
  my %structure=extract_structure($structure);
  # do top level
  find_grammar(\@sentence, \%structure, 0, scalar(@sentence), 1);
  $line=read_line();
}
if ($opt_d) {
  print STDERR "$nr_lines lines\n";
  print STDERR scalar(keys %grammar)." nonterminals\n";
  my $nr_rules;
  foreach my $nonterm (keys %grammar) {
    $nr_rules+=scalar(keys %{$grammar{$nonterm}});
  }
  print STDERR "$nr_rules rules\n";
}

if ($opt_l) {
  remove_left_recursion($start_nonterm);
#  remove_left_recursion();
#  remove_unreachable_rules();
  if ($opt_d) {
    print STDERR scalar(keys %grammar)." non left recursive nonterminals\n";
    my $nr_rules;
    foreach my $nonterm (keys %grammar) {
      $nr_rules+=scalar(keys %{$grammar{$nonterm}});
    }
    print STDERR "$nr_rules non left recursive rules\n";
  }
}
print_grammar();

close(INPUT);
close(OUTPUT);

Grammar inference: How can the algorithm Alignment Based Learning help us infering a grammar

1 Answers1