#!/usr/bin/perl
# Using XML::TokeParser
# Author: D.H. http://search.cpan.org/author/PODMASTER
# additional comments by Michel Rodriguez
#
# Walks an XML document token by token and prints one numbered line per
# 'prod' element, in the form "[n] <lhs text> ::= <rhs text>".
#
# Usage: script.pl [file.xml]
# With no argument the original hard-coded sample document is used.

use strict;
use warnings;

use XML::TokeParser;

my $default_file = '/article/ways_to_rome/ex_ps_tokeparser/REC_xml_19980210.xml';
my $file         = @ARGV ? shift @ARGV : $default_file;

# Fail with a useful message instead of calling a method on an undefined
# parser object further down.
my $parser = XML::TokeParser->new($file)
    or die "Cannot create XML::TokeParser for '$file': $!\n";

my $rule_count = 0;     # number of 'lhs' elements seen so far
my $rule       = '';    # text of the production currently being assembled

# Go through the document, reading tokens. Each token is an array ref whose
# first element is the token type ('S' = start tag, 'E' = end tag) and whose
# second element is the element name.
while ( defined( my $token = $parser->get_token() ) ) {
    my ( $type, $name ) = @{$token}[ 0, 1 ];

    if ( $type eq 'S' and $name eq 'lhs' ) {
        # Found the start tag for an 'lhs' element: get its text and start
        # a fresh rule with it.
        $rule_count++;
        $rule = join '', "[$rule_count] ", $parser->get_text('/lhs'), " ::= ";
    }
    elsif ( $type eq 'S' and $name eq 'rhs' ) {
        # Start tag for an 'rhs' element: append its text to the rule.
        $rule .= $parser->get_text('/rhs');
    }
    elsif ( $type eq 'E' and $name eq 'prod' ) {
        # End tag for a 'prod' element: output the rule and reset.
        print clean($rule), "\n";
        $rule = '';
    }
}

## mirod already did this, so I'm borrowing

# clean($string)
#
# Normalizes whitespace in a rule before printing.
#   $string - the text to tidy up
# Returns a copy in which each "\xc2\xa0" byte pair (a UTF-8 encoded
# non-breaking space) is replaced by a plain space, every run of whitespace
# is collapsed to a single space, and trailing whitespace is removed.
sub clean {
    my ($string) = @_;

    $string =~ s/\xc2\xa0/ /g;    # non-breaking space -> regular space
    $string =~ s/\s+/ /g;         # collapse whitespace runs (incl. newlines)
    $string =~ s/\s+\z//;         # drop trailing whitespace

    return $string;
}