productionparseractpcfg.py
4.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# -*- encoding: utf-8 -*-
__author__ = 'nika'
from collections import Counter
from multiprocessing import Pool
from xml.sax import handler, make_parser
class ProductionParserActPCFG(handler.ContentHandler):
    """SAX handler that counts the productions found among chosen nodes.

    Only ``<node>`` elements with ``chosen="true"`` are recorded.  For each
    such node the handler stores a label in ``nodes`` (keyed by the node's
    ``nid``): the text of its ``<category>`` element, or ``u'terminal'`` when
    the node contains a ``<terminal>`` element.  When the category is ``fw``
    and an ``<f type="tfw">`` element follows, its text is appended to the
    label after an ``@``.

    Every ``<children chosen="true">`` element yields one production: the
    enclosing node's nid plus the ``(nid, head)`` pairs of its ``<child>``
    elements.  At the end of the document the nids are replaced by labels
    and identical productions are counted into ``productions_dict``.
    """

    # ``type`` attribute of an <f> element -> name of the flag it raises.
    _FEATURE_FLAGS = {
        "liczba": "is_in_l",
        "osoba": "is_in_o",
        "rodzaj": "is_in_r",
        "przypadek": "is_in_p",
    }
    # Order in which the flags above are examined (and cleared) on text.
    _FEATURE_FLAG_ORDER = ("is_in_l", "is_in_o", "is_in_r", "is_in_p")

    def __init__(self):
        # Call the base initializer explicitly (rather than via super()) so
        # the class also works where ContentHandler is an old-style class.
        handler.ContentHandler.__init__(self)
        self.current_tag = ""
        self.nodes = {}             # nid -> label of a chosen node
        self.productions = []       # (parent nid, [(child nid, head), ...])
        self.productions_dict = {}  # (lhs label, str(rhs list)) -> count
        self.in_chosen_node = False
        self.nid = None
        self.in_chosen_children = False
        self.current_prod = []
        self.full_tree = False
        self.prod_with_centre = []
        self.is_in_fw = False
        self.is_in_l = False
        self.is_in_o = False
        self.is_in_r = False
        self.is_in_p = False
        self.is_in_tfw = False

    def getProductions(self):
        """Return the dict mapping ``(lhs, str(rhs))`` to its count."""
        return self.productions_dict

    def startElement(self, name, attrs):
        """Track chosen nodes/children and collect child references."""
        self.current_tag = name
        if name == "node":
            # attrs.get() treats a missing "chosen" attribute as "not
            # chosen" instead of raising KeyError, matching <children>.
            if attrs.get("chosen") == "true":
                self.in_chosen_node = True
                self.nid = attrs.getValue("nid")
        elif name == "children":
            if attrs.get("chosen") == "true":
                self.in_chosen_children = True
        elif name == "child":
            if self.in_chosen_children:
                self.current_prod.append(
                    (attrs.getValue("nid"), attrs.getValue("head"))
                )
        elif name == "terminal":
            if self.in_chosen_node:
                self.nodes[self.nid] = u'terminal'
        elif name == "f":
            if self.in_chosen_node:
                feature_type = attrs.getValue("type")
                if feature_type == "tfw":
                    # Only honoured after a category with the text "fw".
                    if self.is_in_fw:
                        self.is_in_tfw = True
                else:
                    flag = self._FEATURE_FLAGS.get(feature_type)
                    if flag is not None:
                        setattr(self, flag, True)

    def endElement(self, name):
        """Close the current node, or store a finished chosen production."""
        if name == "node":
            self.in_chosen_node = False
        elif name == "children" and self.in_chosen_children:
            self.in_chosen_children = False
            self.productions.append((self.nid, self.current_prod))
            self.current_prod = []

    def characters(self, content):
        """Record category text and the ``tfw`` feature of chosen nodes."""
        # NOTE(review): SAX parsers may deliver one text node in several
        # chunks; this handler treats each call as the complete text.
        text = content.strip()
        if not text or not self.in_chosen_node:
            return
        if self.current_tag == "category":
            self.nodes[self.nid] = text
            if text == "fw":
                self.is_in_fw = True
        elif self.current_tag == "f":
            if self.is_in_tfw:
                self.nodes[self.nid] += '@' + text
                self.is_in_fw = False
                self.is_in_tfw = False
            else:
                # The remaining features are not added to the label; the
                # first raised flag is simply cleared.
                for flag in self._FEATURE_FLAG_ORDER:
                    if getattr(self, flag):
                        setattr(self, flag, False)
                        break

    def endDocument(self):
        """Resolve nids to labels and count identical productions.

        Raises ``KeyError`` if a production refers to a nid for which no
        label was recorded.
        """
        keys = []
        for parent_nid, children in self.productions:
            rhs = [(self.nodes[child_nid], head) for child_nid, head in children]
            # The right-hand side is stringified so the key is hashable.
            keys.append((self.nodes[parent_nid], str(rhs)))
        # One counting pass; plain-dict update() overwrites each key's count.
        self.productions_dict.update(Counter(keys))
def runParse(fname):
    """Parse the XML source ``fname`` and return its production counts.

    ``fname`` is handed unchanged to the SAX parser's ``parse`` method.  The
    result is the dict produced by ``ProductionParserActPCFG.getProductions``.
    """
    collector = ProductionParserActPCFG()
    reader = make_parser()
    reader.setContentHandler(collector)
    reader.parse(fname)
    return collector.getProductions()
def makeActPcfg(flist, tasks=4):
    """Estimate production probabilities from a list of XML files.

    Each entry of ``flist`` is parsed with ``runParse`` in a pool of
    ``tasks`` worker processes, and the per-file production counts are
    summed.

    Returns a dict mapping every production key ``(lhs, rhs)`` to its count
    divided by the total count of all productions sharing the same ``lhs``.
    """
    pool = Pool(processes=tasks)
    try:
        per_file_counts = pool.map(runParse, flist)
    finally:
        # Always release the worker processes, even if parsing failed.
        pool.close()
        pool.join()

    # Sum the counts of identical productions across all files.
    all_prods = Counter()
    for productions in per_file_counts:
        all_prods.update(productions)

    # Total number of observed productions per left-hand side.
    non_term_count = Counter()
    for production, count in all_prods.items():
        non_term_count[production[0]] += count

    # ``* 1.0`` forces float division regardless of the Python version.
    return {
        production: count * 1.0 / non_term_count[production[0]]
        for production, count in all_prods.items()
    }