#!/bin/perl -w
# author Michel Rodriguez
#
# Prints the grammar productions found in an XML document.
#
# The document is parsed into an XML::Grove tree, which is then walked by a
# TitleVisitor object. For every <prod> element one line is printed, of the
# form "[n] <lhs text> ::= <rhs text>", where n counts the prod elements in
# document order.
#
# Usage: script [xml_file]
#   xml_file  optional; when omitted, the built-in default path is parsed.

use strict;

# Basic parsing and grove building
use XML::Grove::Builder;
use Data::Grove::Visitor;
use XML::Grove::AsString;
use XML::Parser::PerlSAX;

# Document parsed when no file name is given on the command line.
my $default_source = '/article/ways_to_rome/ex_ps_grove/REC_xml_19980210.xml';

# State shared with the TitleVisitor callbacks below. These are file-scoped
# lexicals, so they remain visible after the "package" statement.
my $store = 0;    # true while character data must be appended to $prod
my $i     = 0;    # number of prod elements visited so far
my $prod;         # text of the production currently being assembled

my $source = @ARGV ? $ARGV[0] : $default_source;

my $grove_builder = XML::Grove::Builder->new;
my $parser        = XML::Parser::PerlSAX->new( Handler => $grove_builder );

my $visitor = TitleVisitor->new;    # create the visitor

# create the grove object
my $grove = $parser->parse( Source => { SystemId => $source } );

$grove->accept($visitor);           # visit the grove

# This package is used by the $grove->accept call
package TitleVisitor;

# Constructor: returns an empty blessed hash; the visitor keeps its state in
# the file-scoped lexicals declared above, not in the object.
sub new {
    my $class = shift;
    return bless {}, $class;
}

# First method called: starts the traversal at the document node.
sub visit_document {
    my ( $self, $grove ) = @_;
    $grove->children_accept_name($self);    # visit all children of the doc
}

# Called for any element that has no dedicated visit_name_<element> method.
sub visit_element {
    my ( $self, $element ) = @_;
    $element->children_accept_name($self);    # visit children
}

# Called for character data found in elements.
sub visit_characters {
    my ( $self, $characters ) = @_;

    # store the text only when an lhs/rhs handler has asked for it
    $prod .= $characters->{Data} if ($store);
}

# Called for prod elements: numbers the production, collects its text through
# the lhs/rhs handlers, then prints the cleaned-up result on its own line.
sub visit_name_prod {
    my ( $self, $element ) = @_;
    $i++;
    $prod = "[$i] ";
    $element->children_accept_name($self);    # visit children, updates $prod
    print clean($prod), "\n";
}

# Called for lhs elements: stores their text, then appends the " ::= "
# separator that precedes the right-hand side.
sub visit_name_lhs {
    my ( $self, $element ) = @_;
    $store = 1;                               # store text in lhs
    $element->children_accept_name($self);
    $store = 0;
    $prod .= " ::= ";
}

# Called for rhs elements: stores their text.
sub visit_name_rhs {
    my ( $self, $element ) = @_;
    $store = 1;                               # store text in rhs
    $element->children_accept_name($self);
    $store = 0;
}

# Normalizes white space in a string and returns the result.
#   - the byte pair \xc2\xa0 (UTF-8 encoding of a non-breaking space)
#     becomes a plain space
#   - every run of white space is collapsed to a single space
#   - trailing white space is removed
sub clean {
    my ($string) = @_;
    $string =~ s/\xc2\xa0/ /sg;
    $string =~ s/\s+/ /g;
    $string =~ s{\s+\z}{};
    return $string;
}