#!/usr/bin/perl -w
# Using XML::DOM
# Authors: Ingo Macherius
#          and Michel Rodriguez
#          with input from the XML::DOM author Enno Derksen
#
# Prints one line per <prod> element of an XML document, in the form
#     [N] <lhs text> ::= <rhs text>
# where N is the 1-based position of the <prod> element in the document.
#
# Usage: script [file.xml]
# With no argument the original hard-coded document path is used.

use strict;
use warnings;
use XML::DOM;

# Default input document; an optional first command-line argument overrides it.
my $default_file = "/article/ways_to_rome/ex_ps_dom/REC_xml_19980210.xml";
my $file         = @ARGV ? $ARGV[0] : $default_file;

my $parser = XML::DOM::Parser->new;
my $doc    = $parser->parsefile($file);

# Assign to a scalar on purpose: the helper below expects a NodeList object,
# so the call must be made in scalar context.
my $prod_list = $doc->getElementsByTagName("prod");

my $i = 0;
foreach my $node (node_list_items($prod_list)) {
    # Count every <prod>, even a malformed one, so the printed numbers keep
    # matching the position of the production in the document.
    $i++;

    my $lhs      = $node->getElementsByTagName("lhs")->item(0);
    my $rhs_list = $node->getElementsByTagName("rhs");
    my @rhs      = node_list_items($rhs_list);

    warn "production $i has no <lhs> element\n" unless defined $lhs;

    my $prod = "[$i] " . first_child_value($lhs) . " ::= " . rhs(@rhs);
    print clean($prod), "\n";
}

# Release the DOM tree explicitly (see the XML::DOM documentation on
# dispose and circular references).
$doc->dispose;

# node_list_items($node_list)
# Returns the items of a NodeList object as a plain Perl list, in order.
sub node_list_items {
    my ($node_list) = @_;
    my @items;
    my $length = $node_list->getLength();
    for (my $j = 0; $j < $length; $j++) {
        push @items, $node_list->item($j);
    }
    return @items;
}

# first_child_value($node)
# Returns the node value of $node's first child, or the empty string when
# $node is undef, has no children, or the first child carries no value.
sub first_child_value {
    my ($node) = @_;
    return '' unless defined $node;

    my $child = $node->getFirstChild;
    return '' unless defined $child;

    my $value = $child->getNodeValue;
    return defined $value ? $value : '';
}

# rhs(@rhs_elements)
# Concatenates the text of the direct children of each given element:
#   - for an element child, the value of that child's first child;
#   - for any other child, the child's own value.
# Comment nodes are skipped. Always returns a defined string (possibly '').
sub rhs {
    my $text = '';
    foreach my $rhs (@_) {
        my $children = $rhs->getChildNodes();    # scalar context: a NodeList
        foreach my $node (node_list_items($children)) {
            # For element children the text lives one level down.
            my $source
                = $node->getNodeType() == XML::DOM::Node::ELEMENT_NODE()
                ? $node->getFirstChild()
                : $node;

            next unless defined $source;         # empty element: nothing to add
            next if $source->getNodeName eq '#comment';

            my $value = $source->getNodeValue();
            $text .= $value if defined $value;   # some node types have no value
        }
    }
    return $text;
}

# clean($string)
# Normalises whitespace: replaces the byte pair \xc2\xa0 with a space,
# collapses whitespace runs to one space, and trims both ends.
sub clean {
    my ($string) = @_;
    $string =~ s/\xc2\xa0/ /sg;    # weird characters in the original document
    $string =~ s/\s+/ /g;
    $string =~ s{^\s}{}g;
    $string =~ s{\s$}{}g;
    return $string;
}