dajstatystyki
2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#! /usr/bin/perl
use strict;
use utf8;
use open ':utf8', ':std';
use XML::LibXML;
use File::Find;
use vars qw(@nodes @nkjp);
# Command line: exactly one argument is accepted, the directory to scan.
@ARGV == 1
    or die "Wymagane argumenty: ścieżka źródłowa\n";
my $srcdir = shift @ARGV;

# One parser and one XPath context (with the "n" prefix bound to the
# dendrarium namespace) are created once and used by process_one_file.
my $parser = XML::LibXML->new();
my $xpc    = XML::LibXML::XPathContext->new;
$xpc->registerNs('n', 'http://nlp.ipipan.waw.pl/dendrarium');

# Not referenced by the code visible in this file section.
my $nr = 0;

# Walk the source directory.  Every entry is handed to process_one_file;
# directory listings go through sort_the_files first; no_chdir keeps the
# working directory unchanged, so $_ holds the full path in the callback.
my %find_options = (
    wanted     => \&process_one_file,
    preprocess => \&sort_the_files,
    no_chdir   => 1,
);
find(\%find_options, $srcdir);
# File::Find "preprocess" hook: receives the entry names of one directory
# and hands them back in Perl's default (string comparison) sort order.
sub sort_the_files {
    my @entries = @_;
    return sort @entries;
}
# File::Find "wanted" callback: print one tab-separated statistics row for
# a single forest XML file.
#
# Input:  the file path in $_ (find() is run with no_chdir => 1, so this is
#         the full path) and $File::Find::name, which is echoed to STDERR.
#         Paths not ending in ".xml", and paths ending in ".packet.xml",
#         are skipped silently.
# Output: one line on STDOUT with the columns
#             grammar_no, sample_id, sent_id, word count, success flag,
#             tree count, NKJP-consistent tree count, nodes, inferences,
#             cputime, sentence text.
#         The success flag is the boolean result of "trees > 0", so it
#         prints as 1 on success and as an empty field otherwise.
# Dies:   when the document does not contain exactly one text element, or
#         when it has no stats element.
# Side effects: on success the package arrays @nodes (node elements indexed
#         by their nid attribute) and @nkjp (memo table used by
#         policz_nkjp) are reset and refilled.
sub process_one_file {
    # Copy $_ right away so later calls cannot clobber the path.
    my $path = $_;
    return unless $path =~ m|\.xml$|;
    return if $path =~ m|\.packet\.xml$|;
    print STDERR "$File::Find::name\n";

    my $doc  = $parser->parse_file($path);
    my $root = $doc->documentElement();

    my $sent_id = $root->getAttribute('sent_id');
    # The sample id is the sentence id with its last "/"-separated
    # component removed.
    my $sample_id = $sent_id;
    $sample_id =~ s|/[^/]+$||;
    my $grammar_no = $root->getAttribute('grammar_no');

    # The name() form of the XPath steps is needed so that the elements do
    # not have to be qualified with namespaces.
    {
        my @tekst = $xpc->findnodes(
            '/*[name() = "forest"]/*[name() = "text"]', $root);
        die "Nie znaleziono tekstu" unless @tekst == 1;
    }
    my $tekst = $root->findvalue('/*[name() = "forest"]/*[name() = "text"]');

    # split ' ' skips leading whitespace, so text that starts with a blank
    # or newline no longer yields an empty first field that would inflate
    # the count.  Splitting into an array also avoids split in scalar
    # context.
    my @words    = split ' ', $tekst;
    my $numwords = scalar @words;

    my ($stats) =
        $root->findnodes('/*[name() = "forest"]/*[name() = "stats"]');
    die "Nie znaleziono elementu stats: $path\n" unless defined $stats;

    my $numtrees   = $stats->getAttribute('trees');
    my $numnodes   = $stats->getAttribute('nodes');
    my $inferences = $stats->getAttribute('inferences');
    my $cputime    = $stats->getAttribute('cputime');

    my $sukces = ($numtrees > 0);
    my $nkjp   = 0;
    if ($sukces) {
        # Count the trees that are consistent with NKJP: index every node
        # element by its nid, clear the memo table, then start at nid 0.
        @nodes = ();
        for my $n ($root->findnodes('*[name() = "node"]')) {
            $nodes[ $n->getAttribute('nid') ] = $n;
        }
        @nkjp = ();
        $nkjp = policz_nkjp(0);
    }

    print join("\t",
        $grammar_no, $sample_id, $sent_id, $numwords, $sukces,
        $numtrees, $nkjp, $numnodes, $inferences, $cputime, $tekst), "\n";
}
# Return the NKJP-consistency count for the node with the given nid,
# memoised in the package array @nkjp.
#
# The node element is looked up in the package array @nodes.  Its first
# child that is not a "#text" node decides how the count is computed:
#   * terminal    - 1 when its disamb attribute equals "true", else 0;
#   * nonterminal - every remaining non-text child is one alternative; an
#                   alternative contributes the product of the counts of
#                   the nodes named by its "child" elements (via their nid
#                   attribute), and the alternatives are summed.
# Any other first child makes the sub die with that child's node name.
sub policz_nkjp {
    my ($nid) = @_;

    my $cached = $nkjp[$nid];
    return $cached if defined $cached;

    my ($head, @alternatives) =
        grep { $_->nodeName ne '#text' } $nodes[$nid]->childNodes;
    my $kind = $head->nodeName;

    if ($kind eq 'terminal') {
        my $is_disamb = $head->getAttribute('disamb') eq "true";
        $nkjp[$nid] = $is_disamb ? 1 : 0;
    }
    elsif ($kind eq 'nonterminal') {
        my $total = 0;
        for my $alternative (@alternatives) {
            my $product = 1;
            for my $ref ($alternative->getChildrenByLocalName('child')) {
                my $child_nid = $ref->getAttribute('nid');
                $product *= policz_nkjp($child_nid);
            }
            $total += $product;
        }
        $nkjp[$nid] = $total;
    }
    else {
        die $kind;
    }

    return $nkjp[$nid];
}