dajstatystyki 2.89 KB
#! /usr/bin/perl

use strict;
use utf8;
use open ':utf8', ':std';
use XML::LibXML;
use File::Find;
use vars qw(@nodes @nkjp);

# Exactly one command-line argument is expected: the directory to scan.
@ARGV == 1
    or die "Wymagane argumenty: ścieżka źródłowa\n";

my $srcdir = shift @ARGV;

# Parser and XPath context shared by process_one_file().  The 'n' prefix
# is registered on the context for the dendrarium namespace.
my $parser = XML::LibXML->new();
my $xpc    = XML::LibXML::XPathContext->new;
$xpc->registerNs('n', 'http://nlp.ipipan.waw.pl/dendrarium');

my $nr = 0;

# Walk the tree below $srcdir.  Directory entries are sorted before they
# are visited, and no_chdir keeps $_ equal to the full path of each entry.
my %find_options = (
    wanted     => \&process_one_file,
    preprocess => \&sort_the_files,
    no_chdir   => 1,
);
find(\%find_options, $srcdir);

# sort_the_files(@names)
#
# File::Find "preprocess" hook: returns the names it was given, ordered by
# plain string comparison.
sub sort_the_files {
    return sort { $a cmp $b } @_;
}

# process_one_file()
#
# File::Find "wanted" callback.  For every path ending in ".xml" (but not
# ".packet.xml") it parses the file, reads the statistics stored in the
# forest and prints one tab-separated line with the columns:
#
#   grammar_no, sample_id, sent_id, number of words, success flag,
#   number of trees, number of trees consistent with NKJP, nodes,
#   inferences, cputime, sentence text
#
# The success flag is the result of "trees > 0", so it is printed as "1"
# on success and as an empty string otherwise.
#
# Takes no arguments; the path comes from $_ as set by File::Find.
# Uses the file-level $parser and $xpc, and fills the package arrays
# @nodes / @nkjp that policz_nkjp() works on.
# Dies when the forest does not contain exactly one "text" element or has
# no "stats" element.
sub process_one_file {
    # Keep the path in a named variable: $_ is a global and this sub
    # calls into several other routines.
    my $path = $_;

    return unless $path =~ m|\.xml$|;
    return if $path =~ m|\.packet\.xml$|;
    print STDERR "$File::Find::name\n";

    my $doc  = $parser->parse_file($path);
    my $root = $doc->documentElement();

    my $sent_id = $root->getAttribute('sent_id');
    # The sample id is the sentence id with its last "/"-separated
    # component removed.
    my $sample_id = $sent_id;
    $sample_id =~ s|/[^/]+$||;
    my $grammar_no = $root->getAttribute('grammar_no');

    # The name() form is needed so that the elements do not have to be
    # qualified with namespaces.
    my @text_nodes =
        $xpc->findnodes('/*[name() = "forest"]/*[name() = "text"]', $root);
    die "Nie znaleziono tekstu" unless @text_nodes == 1;
    my $tekst = $text_nodes[0]->textContent;

    # split ' ' skips leading whitespace, so a sentence that starts with a
    # blank does not get an extra empty "word"; splitting into an array
    # also avoids the old implicit split to @_ in scalar context.
    my @words    = split ' ', $tekst;
    my $numwords = scalar @words;

    my ($stats) =
        $root->findnodes('/*[name() = "forest"]/*[name() = "stats"]');
    die "Nie znaleziono elementu stats" unless defined $stats;

    my $numtrees   = $stats->getAttribute('trees');
    my $nodes      = $stats->getAttribute('nodes');
    my $inferences = $stats->getAttribute('inferences');
    my $cputime    = $stats->getAttribute('cputime');

    my $sukces = ($numtrees > 0);

    my $nkjp = 0;
    if ($sukces) {
        # Count the trees consistent with NKJP: index the forest nodes by
        # their nid, reset the memo table and start from node 0.
        @nodes = ();
        for my $n ($root->findnodes('*[name() = "node"]')) {
            my $nid = $n->getAttribute('nid');
            $nodes[$nid] = $n;
        }
        @nkjp = ();
        $nkjp = policz_nkjp(0);
    }

    print "$grammar_no\t$sample_id\t$sent_id\t$numwords\t$sukces\t$numtrees\t$nkjp\t$nodes\t$inferences\t$cputime\t$tekst\n";

}

    

# policz_nkjp($nid)
#
# Returns the number of trees below forest node $nid that are consistent
# with NKJP, memoising every result in the package array @nkjp.
#
# The node is looked up in the package array @nodes.  Its first element
# child decides how it is counted:
#   * "terminal"    - 1 when its disamb attribute is "true", otherwise 0;
#   * "nonterminal" - every following element sibling is one alternative;
#                     an alternative contributes the product of the counts
#                     of the nodes named by its "child" elements, and the
#                     alternatives are summed.
#
# Dies when $nid is undefined, when no node with that nid is known, when
# the node has no element children, or when the first element child is
# neither "terminal" nor "nonterminal".
sub policz_nkjp {
    my $nid = shift;

    # An undefined nid would silently index element 0 and could recurse
    # forever, so reject it explicitly.
    die "Brak atrybutu nid\n" unless defined $nid;

    return $nkjp[$nid] if defined($nkjp[$nid]);

    my $node = $nodes[$nid];
    die "Brak węzła o nid=$nid\n" unless defined $node;

    # Keep element children only.  Names starting with "#" (#text,
    # #comment, #cdata-section) are not elements; a stray comment must
    # neither be taken for the node kind nor counted as an alternative.
    my @children   = grep { $_->nodeName !~ m{\A[#]} } $node->childNodes;
    my $firstchild = shift @children;
    die "Węzeł nid=$nid nie ma elementów potomnych\n"
        unless defined $firstchild;

    my $kind = $firstchild->nodeName;
    if ($kind eq 'terminal') {
        $nkjp[$nid] = ($firstchild->getAttribute('disamb') eq "true")
            ? 1 : 0;
    }
    elsif ($kind eq 'nonterminal') {
        my $zgodnych = 0;
        for my $alternative (@children) {
            # Product over the children of this alternative; an
            # alternative without "child" elements counts as 1.
            my $zgodc = 1;
            for my $c ($alternative->getChildrenByLocalName('child')) {
                $zgodc *= policz_nkjp($c->getAttribute('nid'));
            }
            $zgodnych += $zgodc;
        }
        $nkjp[$nid] = $zgodnych;
    }
    else {
        die "Nieoczekiwany element '$kind' w węźle nid=$nid\n";
    }
    return $nkjp[$nid];
}