main-main-mistake.py
3.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# -*- encoding: utf-8 -*-
__author__ = 'nika'
from xml.sax import make_parser
from treeparser import TreeParser
from productionparseractpcfg import makeActPcfg
from node import Node
from tree import Tree
import glob
from multiprocessing import Pool
from sklearn.cross_validation import KFold
############################# CZYTANIE WSZYSTKICH PLIKÓW
all_files = []
for f in glob.iglob("/home/nika/zrobione140523/*/*/*-s.xml"):
all_files.append(f)
print len(all_files)
all_files = all_files
############################# CROSS-VALIDATION
def ev(args):
    """Parse one XML file and evaluate its tree against a grammar.

    The single-tuple signature lets this function be handed directly to
    ``Pool.map``, which passes exactly one argument per task.

    Args:
        args: a ``(path, grammar)`` pair; ``path`` is the XML file to parse
            and ``grammar`` is forwarded unchanged to ``tree.act_pcfg``.

    Returns:
        A 2-tuple ``(tree.act_pcfg(grammar), tree.countChildrenZadnie())``.
        NOTE(review): the first element is presumably the six-field count
        record consumed by ``repr_stats`` -- confirm against ``Tree``.
    """
    path, grammar = args

    # SAX parsing: the TreeParser handler builds the tree as a side effect
    # of the parse and hands it over afterwards.
    sax_parser = make_parser()
    tree_handler = TreeParser(path)
    sax_parser.setContentHandler(tree_handler)
    sax_parser.parse(path)

    parsed_tree = tree_handler.getTree()
    pcfg_result = parsed_tree.act_pcfg(grammar)
    children_result = parsed_tree.countChildrenZadnie()
    return pcfg_result, children_result
def count_ev(list_of_tuples):
    """Return the column-wise arithmetic mean of a list of numeric rows.

    Args:
        list_of_tuples: sequence of indexable rows. The number of columns is
            taken from the first row; every other row must have at least
            that many elements.

    Returns:
        A list with one mean per column, or ``[]`` when ``list_of_tuples``
        is empty (instead of failing with ``IndexError``).
    """
    if not list_of_tuples:
        return []

    # Divide by a float so integer inputs are not truncated under Python 2.
    row_count = float(len(list_of_tuples))
    width = len(list_of_tuples[0])
    return [
        sum(row[col] for row in list_of_tuples) / row_count
        for col in range(width)
    ]
def count_ev_repr(list_of_tuples):
    """Average paired result lists element-wise across all entries.

    Each entry is a pair of numeric sequences. In this script the entries
    are the per-fold results of ``repr_stats``: the ratios of the summed
    counts, followed by the mean of the per-file ratios.

    Args:
        list_of_tuples: sequence of ``(first_seq, second_seq)`` pairs. The
            lengths are taken from the first entry; every other entry must
            be at least as long.

    Returns:
        A flat list: the means of every position of ``first_seq`` followed
        by the means of every position of ``second_seq``. Returns ``[]``
        when ``list_of_tuples`` is empty (instead of raising ``IndexError``).
    """
    if not list_of_tuples:
        return []

    # Divide by a float so integer inputs are not truncated under Python 2.
    entry_count = float(len(list_of_tuples))
    first_entry = list_of_tuples[0]

    means = []
    for part in (0, 1):
        for col in range(len(first_entry[part])):
            column_total = sum(entry[part][col] for entry in list_of_tuples)
            means.append(column_total / entry_count)
    return means
# self.agreed_count, self.agreed_strict_count, self.chosen_nodes_count, self.disamb_nodes_count, self.dep_agreed_count, self.dep_count
def _stat_ratios(stat):
    """Return the five ratios derived from one six-field count record."""
    return [
        1.0 * stat[0] / stat[2],
        1.0 * stat[0] / stat[3],
        1.0 * stat[1] / stat[2],
        1.0 * stat[1] / stat[3],
        1.0 * stat[4] / stat[5],
    ]


def repr_stats(list_of_stats):
    """Summarise per-file count records as two lists of ratios.

    Each record is indexed as (per the field-order comment in this file):
    0 agreed_count, 1 agreed_strict_count, 2 chosen_nodes_count,
    3 disamb_nodes_count, 4 dep_agreed_count, 5 dep_count.

    The five ratios are, in order: 0/2, 0/3, 1/2, 1/3 and 4/5.

    Args:
        list_of_stats: non-empty sequence of six-field numeric records.

    Returns:
        ``[overall, averaged]`` where ``overall`` holds the ratios computed
        from the column sums over all records, and ``averaged`` holds the
        mean of the ratios computed for each record separately.

    Raises:
        IndexError: if ``list_of_stats`` is empty.
        ZeroDivisionError: if a denominator field (2, 3 or 5) is zero in the
            column sums or in any single record.
    """
    width = len(list_of_stats[0])
    totals = [
        sum(stat[col] for stat in list_of_stats) for col in range(width)
    ]
    overall = _stat_ratios(totals)

    # Built as a real list (not a lazy map) because it is indexed and
    # iterated several times below.
    per_file = [_stat_ratios(stat) for stat in list_of_stats]
    file_count = len(per_file)
    averaged = [
        sum(ratios[col] for ratios in per_file) / file_count
        for col in range(len(per_file[0]))
    ]
    return [overall, averaged]
all_lex_pcfg = []
mistakes =[]
for train, test in KFold(len(all_files), 10):
print len(train), len(test)
train_files = map(lambda x: all_files[x], train)
print "making grammar ..."
grammar = makeActPcfg(train_files, 20)
test_files = map(lambda x: (all_files[x], grammar), test)
print "evaluating ..."#, test_files
pool = Pool(processes=20) # start 4 worker processes
#for ii in test_files: ev(ii)
results = pool.map(ev, test_files)
all_lex_pcfg.append(repr_stats(map(lambda x: x[0],results)))
mistakes += map(lambda x:x[1], results)
print "***************************************************************************************"
print "RESULT", len(all_lex_pcfg), ": ", all_lex_pcfg[-1]
print "***************************************************************************************"
print "RESULT: ", count_ev_repr(all_lex_pcfg)
"""
mistakes_dict = {}
for ii in mistakes:
for jj in ii:
if jj in mistakes_dict:
mistakes_dict[jj] +=1
else:
mistakes_dict[jj] =1
for jj in mistakes:
for mistake in jj.keys():
if mistake in mistakes_dict:
mistakes_dict[mistake] += jj[mistake]
else:
mistakes_dict[mistake] = jj[mistake]
import pickle
for ii in sorted(mistakes_dict, key= lambda x:-mistakes_dict[x]):
print mistakes_dict[ii], ii
pickle.dump(mistakes_dict, open("count-zdanie-dump.pkl", "w"))
"""