Commit 681727718acccf4a63d6dca1cb4c682a05217abb
1 parent
a6478f31
Added Świgra
Showing
122 changed files
with
111222 additions
and
0 deletions
Too many changes to show.
To preserve performance only 2 of 122 files are displayed.
swigra/disambiguator-pcfg/README
0 → 100644
1 | +Katalog zawiera pliki źródłowe dezambiguatora dla lasów składniowych w formacie Świgra/Składnica oraz: | ||
2 | + | ||
3 | +pcfg-grammar.pkl - gramatyka pcfg służąca do dezambiguacji | ||
4 | +skrypt make-model.py służy do stworzenia nowej gramatyki | ||
5 | + | ||
6 | +skrypt disamb-tree.py służy do automatycznej anotacji ujednoznacznionego drzewa | ||
7 | + | ||
8 | +sposób użycia: | ||
9 | + | ||
10 | + python disamb-tree.py sciezka/do/nazwa_pliku_wejsciowego.xml | ||
11 | + skrypt utworzy plik sciezka/do/nazwa_pliku_wejsciowego-disamb.xml | ||
0 | \ No newline at end of file | 12 | \ No newline at end of file |
swigra/disambiguator-pcfg/disamb-tree.py
0 → 100644
1 | +# -*- encoding: utf-8 -*- | ||
2 | +__author__ = 'nika' | ||
3 | + | ||
4 | +from xml.sax import make_parser | ||
5 | +from treeparser import TreeParser | ||
6 | +import pickle | ||
7 | +import sys | ||
8 | +import xml.dom.minidom | ||
9 | +import codecs | ||
10 | + | ||
# Pickled PCFG grammar used for disambiguation.
# An earlier model, grammars/pcfg-tfw-130718.pkl, was previously loaded here.
# NOTE(review): the path is relative to the current working directory, not to
# this script's location -- confirm that is intended.
GRAMMAR_PATH = "grammars/pcfg-tfw-150326.pkl"


def load_grammar(path=GRAMMAR_PATH):
    """Unpickle and return the grammar stored at *path*.

    The file is opened in binary mode (required by pickle on Python 3) and is
    closed as soon as loading finishes.

    SECURITY: unpickling can execute arbitrary code; only load grammar files
    that come from a trusted source.
    """
    with open(path, "rb") as grammar_file:
        return pickle.load(grammar_file)


def disambiguate(path):
    """Parse the XML file at *path* with TreeParser and return its chosen nodes.

    Returns whatever ``tree.getDisambNodes()`` yields after
    ``tree.act_pcfg(grammar)`` has run; presumably a collection of ``nid``
    strings (TODO confirm against treeparser), since the caller tests ``nid``
    attribute values for membership in it.
    """
    grammar = load_grammar()
    parser = make_parser()
    handler = TreeParser(path)
    parser.setContentHandler(handler)
    parser.parse(path)
    tree = handler.getTree()
    tree.act_pcfg(grammar)
    return tree.getDisambNodes()


def mark_chosen_nodes(nodes, disamb_nodes):
    """Set the ``chosen`` attribute on every element of *nodes*, in place.

    A node whose ``nid`` is in *disamb_nodes* gets ``chosen="true"``; every
    other node gets ``chosen="false"``.  For a chosen node, each descendant
    ``children`` element is additionally marked ``chosen="true"`` when all of
    its ``child`` elements have a ``nid`` in *disamb_nodes* (a ``children``
    element without any ``child`` therefore counts as chosen).

    Raises KeyError if a ``node`` or an inspected ``child`` has no ``nid``.
    """
    for node in nodes:
        if node.attributes["nid"].value not in disamb_nodes:
            node.setAttribute("chosen", "false")
            continue
        node.setAttribute("chosen", "true")
        for children in node.getElementsByTagName("children"):
            if all(child.attributes["nid"].value in disamb_nodes
                   for child in children.getElementsByTagName("child")):
                children.setAttribute("chosen", "true")


def output_path_for(input_path):
    """Return *input_path* with its extension replaced by ``-disamb.xml``.

    Only the extension of the final path component is removed, so a dot in a
    directory name is never mistaken for the start of the extension.
    """
    import os.path  # local import: not among this file's top-level imports
    return os.path.splitext(input_path)[0] + "-disamb.xml"


def main(argv):
    """Disambiguate the forest file named in ``argv[1]`` and save the result.

    Returns the process exit status: 0 on success, 1 when no file name was
    given on the command line.
    """
    if len(argv) < 2:
        print("ERROR: no filename given")
        return 1
    input_path = argv[1]

    # parse xml file for future processing
    dom = xml.dom.minidom.parse(input_path)
    nodes = dom.getElementsByTagName("node")

    if nodes:  # if there is anything to disambiguate
        mark_chosen_nodes(nodes, disambiguate(input_path))

    new_f = output_path_for(input_path)
    # Single-argument call prints the same text under Python 2 and Python 3.
    print("saving in : " + new_f)
    # Binary mode: the document is encoded to UTF-8 bytes before writing.
    with open(new_f, "wb") as out:
        out.write(codecs.encode(dom.toxml(), "utf-8"))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))