#!/usr/bin/perl -w
# using Perl Regular Expressions
# Author: Ingo Macherius
# modified by Michel Rodriguez
#
# Prints every grammar production found in an XML document as
#   [n] lhs ::= rhs
# using regular expressions only (no XML parser).
#
# Usage: perl this_script.pl [document.xml]
# The document is read from the file named by the first argument, or from
# STDIN when no argument is given.
#
# NOTE(review): this file had been collapsed onto a single line (so the whole
# script sat behind the '#!' comment) and every literal containing markup had
# been stripped.  The pieces tagged "reconstructed" below were rebuilt from
# the surviving comments, flags and capture usage; confirm them against a
# pristine copy.  The original hard-coded input path was lost, hence the
# argument/STDIN input.

use strict;

# a semi generic way to get the entities
# fails miserably for entities using other entities
my %ent = (
    amp    => '&',
    quot   => '"',
    apos   => "'",
    lt     => '<',
    gt     => '>',
    xmlpio => "'<?xml'",    # reconstructed -- TODO confirm exact value
    hcro   => "&#x",        # uses &  (key name reconstructed -- TODO confirm)
    nbsp   => ' ',
    '#160' => ' ',          # def is commented out in the REC
);

# Run as a script only; when the file is loaded with require/do the subs
# below can be called without reading any input.
main(@ARGV) unless caller;

# main( [$path])
# Reads the document, strips comments, harvests entity declarations into
# %ent and prints one numbered line per production.
# Dies when a production has no lhs, or (via clean) on an unknown entity.
sub main {
    my ($path) = @_;
    my $doc = slurp_doc($path);

    # remove comments NOW, so that commented-out declarations are ignored
    # (reconstructed pattern; the original note about the inputs this fails
    # for was lost)
    $doc =~ s{<!--.*?-->}{}sg;

    # Harvest <!ENTITY name "value"> declarations (reconstructed pattern:
    # $1 = name, $2 = quote character, $3 = value).
    # Entities already in the table are never redefined.
    while ( $doc =~ /<!ENTITY\s+([\w.-]+)\s+(["'])(.*?)\2\s*>/gs ) {
        $ent{$1} = $3 unless exists $ent{$1};
    }

    my $i = 0;
    # reconstructed pattern: one match per <prod ...>...</prod> element
    foreach my $prod ( $doc =~ m{<prod\b[^>]*>.*?</prod>}gs ) {
        my ($lhs) = $prod =~ m{<lhs>(.*?)</lhs>}
            or die "no lhs in prod $prod";

        # a production may carry several rhs elements: concatenate them
        my $rhs = join '', $prod =~ m{<rhs>(.*?)</rhs>}sg;
        $rhs =~ s{</?nt\b[^>]*>}{}sg;    # remove nt tags (reconstructed)

        $i++;
        print clean("[$i] $lhs ::= $rhs"), "\n";
    }
    return;
}

# slurp_doc( [$path])
# Returns the whole content of $path, or of STDIN when $path is undefined.
# Dies if the file cannot be opened.
sub slurp_doc {
    my ($path) = @_;
    local $/;    # slurp mode, restored on return

    my $doc;
    if ( defined $path ) {
        open( my $fh, '<', $path ) or die "cannot open $path: $!";
        $doc = <$fh>;
        close $fh;
    }
    else {
        $doc = <STDIN>;
    }
    return defined $doc ? $doc : '';
}

# clean( $string)
# Returns $string with entity references expanded from %ent and whitespace
# normalised: runs of whitespace become one space and a trailing space is
# dropped (a leading space is kept).
# Dies with "unknown entity NAME" for a named entity missing from %ent.
sub clean {
    my ($string) = @_;

    # yes, you have to replace the entities yourself
    # '#?' lets numeric references such as &#160; reach the table too
    $string =~ s{&(#?\w+);}{ expand_entity($1) }eg;
    $string =~ s{\xc2\xa0}{ }g;    # weird character in source
    $string =~ s{\s+}{ }g;
    $string =~ s{\s$}{};
    return $string;
}

# expand_entity( $name)
# Returns the replacement text for the reference &$name;.
# Numeric references absent from %ent are returned unchanged; unknown named
# entities are fatal.
sub expand_entity {
    my ($name) = @_;
    return $ent{$name} if exists $ent{$name};    # '' and '0' are valid values
    return "&$name;"   if $name =~ /^#/;
    die "unknown entity $name";
}

1;