#!/usr/bin/perl -w

# Using XML::Parser::Lite
# Author: Michel Rodriguez
# based on a stub by Josh Narins
# and help from Jeff Gleixner
#
# Reads an XML document, collects the text of the <lhs> and <rhs> children
# of every <prod> element and prints one line per production in the form
#
#     [n] lhs ::= rhs
#
# Usage: perl this_script.pl document.xml
#
# NOTE(review): this file had been collapsed onto a single line and every
# span that looked like markup ("<" up to the next ">") had been removed.
# The three damaged places (the %ent table, the file slurp and the ENTITY
# regexp) are reconstructions and are marked individually below; please
# check them against a pristine copy of the script.

use strict;
use XML::Parser::Lite;

# we need to replace the entities as XML::Parser::Lite does not do it
my %ent = (
    amp    => '&',
    quot   => '"',
    apos   => "'",
    lt     => '<',
    gt     => '>',
    # NOTE(review): reconstructed. The damaged text read
    #   xmlpio => "' "&#x",
    # which is what remains of the two entries below once the text from
    # "<" to the ">" of the following fat comma is removed. The key name
    # 'hcro' is an assumption -- confirm.
    xmlpio => "'<?xml'",
    hcro   => "&#x",        # uses &
    nbsp   => ' ',
    '#160' => ' ',          # def is commented out in the REC
);

# NOTE(review): the original hard-coded input file name was lost together
# with the rest of the open() call, so the path is taken from the command
# line instead.
my $file = shift @ARGV;
die "usage: $0 xml_file\n" unless defined $file;

# Slurp the whole document; it is scanned for entity declarations first
# and then handed to the parser in one piece.
open( my $rec, '<', $file ) or die "cannot open $file: $!\n";
my $doc = do { local $/; <$rec> };
close $rec;
$doc = '' unless defined $doc;    # empty file: <$rec> returns undef

# load entities, breaks for entities using other entities
# NOTE(review): the pattern is reconstructed; only the /sg flags and the
# use of $1 (name) and $3 (value) survived. Group 2 is the opening quote.
while ( $doc =~ /<!ENTITY\s+([\w.:-]+)\s+(["'])(.*?)\2\s*>/sg ) {
    $ent{$1} ||= $3;    # use ||= to avoid redefining entities
}

# Parser state shared by the three handlers:
#   in_lhs / in_rhs - true while inside the corresponding element
#   lhs / rhs       - text accumulated for the current production
#   i               - running production number
#   production      - array ref of the formatted productions
my $flags = {};

my $parser = XML::Parser::Lite->new(
    Handlers => {
        # Opening tag: remember which side of the production we are in.
        Start => sub {
            my ( $p, $el ) = @_;
            if    ( $el eq 'rhs' ) { $flags->{in_rhs} = 1 }
            elsif ( $el eq 'lhs' ) { $flags->{in_lhs} = 1 }
        },

        # Character data: append to whichever side is currently open.
        Char => sub {
            my ( $p, $txt ) = @_;
            if    ( $flags->{in_lhs} ) { $flags->{lhs} .= $txt }
            elsif ( $flags->{in_rhs} ) { $flags->{rhs} .= $txt }
        },

        # Closing tag: leave lhs/rhs, or store the finished production
        # and reset the accumulators for the next one.
        End => sub {
            my ( $p, $el ) = @_;
            if ( $el eq 'rhs' ) {
                $flags->{in_rhs} = 0;
            }
            elsif ( $el eq 'lhs' ) {
                $flags->{in_lhs} = 0;
            }
            elsif ( $el eq 'prod' ) {
                push @{ $flags->{production} },
                    production( ++$flags->{i}, $flags->{lhs}, $flags->{rhs} );
                $flags->{lhs} = '';
                $flags->{rhs} = '';
            }
        },
    },
);

$parser->parse($doc);

foreach my $prod ( @{ $flags->{production} } ) {
    print clean($prod), "\n";
}

# Format one production.
#   $i   - production number
#   $lhs - text of the left-hand side
#   $rhs - text of the right-hand side
# Returns the string "[$i] $lhs ::= $rhs".
sub production {
    my ( $i, $lhs, $rhs ) = @_;
    return "[$i] $lhs ::= $rhs";
}

# Tidy a production string for printing.
#   $string - the raw production text
# Returns the text with entity references replaced from %ent, the byte
# pair \xc2\xa0 turned into a space, runs of whitespace collapsed to a
# single space and a trailing space removed.
# Dies with "unknown entity NAME" if a referenced entity is not in %ent.
sub clean {
    my ($string) = @_;

    # replace entities; test with exists so that an entity whose value is
    # false ('' or '0') is substituted instead of reported as unknown
    $string =~ s{&(.*?);}{exists $ent{$1} ? $ent{$1} : die "unknown entity $1\n"}eg;

    $string =~ s{\xc2\xa0}{ }g;    # weird characters in the original document
    $string =~ s{\s+}{ }g;
    $string =~ s{\s\z}{};          # at most one trailing space is left by now
    return $string;
}