mst_experiment.py
#!/usr/bin/python
import os
import sys
import optparse
## Check that MSTPARSER_DIR environment variable is set and get it
global mstparser_dir
mstparser_dir = ''
if 'MSTPARSER_DIR' in os.environ:
    mstparser_dir = os.environ['MSTPARSER_DIR']
else:
    print "Please set the MSTPARSER_DIR environment variable to where you have the MSTParser installed."
    sys.exit(1)

###########################################################################
#
# Build a POS-tagger training file from a dependency-formatted source
# file: the word line and tag line of each sentence are merged into
# space-separated "word_TAG" tokens, one sentence per line.
#
###########################################################################
def create_tag_train_file (source_file, formatted_file):
    output = file(formatted_file, "w")
    input = file(source_file)
    line = input.readline()
    while not(line == ""):
        words = line.strip().split("\t")
        line = input.readline()
        tags = line.strip().split("\t")
        # the splitting takes care of word+stem representations like biliyor+bil
        merged = [words[i].split("+")[0]+"_"+tags[i].replace("_", "+us+") \
                  for i in range(len(words))]
        output.write(" ".join(merged)+"\n")
        input.readline() # eat up labels
        input.readline() # eat up dependencies
        input.readline() # eat blank line
        line = input.readline() # read words of next sentence
    output.close()
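
# Data format note: create_tag_train_file above and the fold-splitting code
# below assume MSTParser-style input, i.e. four tab-separated lines per
# sentence -- word forms (possibly word+stem, e.g. biliyor+bil), POS tags,
# dependency labels, and head indices -- followed by a blank line. A purely
# illustrative sketch (the tokens below are invented):
#
#   John    saw     Mary
#   NNP     VBD     NNP
#   SBJ     ROOT    OBJ
#   2       0       2
#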

###########################################################################
#
# Run a single fold. This could actually be not a "fold" per se, but
# explicitly provided training and test files.
#
###########################################################################
def run_single_train_and_test(options, train_filename,
                              test_filename, output_filename, args):
    realtest_filename = test_filename
    # Tag the test sentences if requested
    if options.tag_source == "OTK_Tagger":
        print " Tagging test sentences..."
        tag_train_filename = train_filename+".tagged"
        create_tag_train_file(train_filename, tag_train_filename)
        tagged_filename = test_filename+".tagged.tmp"
        tag_command = "python %s/bin/pos_tag.py -o %s %s %s %s" \
            % (mstparser_dir,
               options.output_dir,
               tag_train_filename,
               test_filename,
               tagged_filename)
        #print >> argfile, tag_command
        if options.verbose:
            print tag_command
            os.system(tag_command)
            #os.system(tag_command+' |tee --append '+options.output_dir+'/tag.out 2>&1')
        else:
            os.system(tag_command+' >/dev/null 2>&1')
            #os.system(tag_command+' >> '+options.output_dir+'/tag.out 2>&1')
        # Keep every second line of the tagger's output (the tag lines).
        tag_lines = []
        counter = 0
        for line in file(tagged_filename):
            if counter % 2 == 1:
                tag_lines.append(line)
            counter += 1
        # Splice the predicted tags into a copy of the test file, replacing
        # the gold tag line (line 2 of each 5-line sentence block).
        realtest_filename = test_filename+".tagged"
        output = file(realtest_filename, "w")
        counter = 0
        for line in file(test_filename):
            if counter % 5 == 1:
                output.write(tag_lines[(counter-1)/5])
            else:
                output.write(line)
            counter += 1
        output.close()

    # Train the parser
    print " Training and evaluating..."
    train_command = 'mst_parse.sh train train-file:%s model-name:%s/dep.model decode-type:%s test test-file:%s output-file:%s %s' \
        % (train_filename, options.output_dir, options.decoder_type,
           realtest_filename, output_filename, " ".join(args[1:]))
    if options.verbose:
        print train_command
        os.system(train_command)
    else:
        os.system(train_command+' >/dev/null 2>&1')

###################### END FUNCTION DEFINITIONS ########################
## Get options
opt_parser = optparse.OptionParser()
opt_parser.add_option("-l", "--language", action="store", default='Unspecified',
help="use configurations specific to LANGUAGE",
metavar="LANGUAGE")
opt_parser.add_option("-e", "--eval_file", action="store", default='Generated',
help="Read evaluation sentences from FILE. Using this option means that cross-validation will not be used.",
metavar="FILE")
opt_parser.add_option("-d", "--decoder_type", action="store",
choices=['proj', 'non-proj'],
default="proj",
help="Use a projective or non-projective algorithm.E",
metavar="FILE")
opt_parser.add_option("-o", "--output_dir", action="store", default='output',
help="save parser output to DIR",
metavar="DIR")
opt_parser.add_option("-f", "--num_folds", action="store", default=10,
help="The number of folds to use in cross-validation (Default=10).",
metavar="NUM")
opt_parser.add_option("-v", "--verbose", action="store_true", default=False,
help="be verbose")
opt_parser.add_option("-t", "--tag_source", choices=['Gold','OTK_Tagger'],
default='Gold',
help="use tags from Gold standard or from a tagger (Gold (default), OTK_Tagger)",
metavar="SOURCE")
(options, args) = opt_parser.parse_args()
# optparse hands back command-line values as strings, so coerce to int.
options.num_folds = int(options.num_folds)
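
# An illustrative invocation (the file and directory names here are
# hypothetical, not part of the script): 5-fold cross-validation with
# non-projective decoding, writing everything under results/:
#
#   MSTPARSER_DIR=/path/to/mstparser \
#   python mst_experiment.py -f 5 -d non-proj -o results sentences.txt
#
# Any extra positional arguments after the data file are passed through
# unchanged to the mst_parse.sh command built below.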
# If the requested output directory already exists, wipe it; if a regular
# file with that name exists, bail out. Then create a fresh directory.
output_dir = options.output_dir
if os.path.isdir(output_dir):
os.system("rm -rf %s" % output_dir)
elif os.path.isfile(output_dir):
raise OSError("A file with the same name as the desired dir, " \
"'%s', already exists." % output_dir)
os.makedirs(output_dir)
# This file accumulates the results across all folds.
model_output_filename = output_dir+"/model_out"
os.system('touch %s' % model_output_filename)
## Process files
train_filename = args[0]
# This file accumulates the gold dependencies across all folds.
gold_deps_filename = output_dir+"/gold.deps"
if options.eval_file == "Generated":
    num_folds = int(options.num_folds)
    print "Running a %d-fold evaluation on file %s" \
        % (num_folds, train_filename)
    print
    # Align parses with their corresponding sentences and assign a
    # partition id to them.
    train_file = file(train_filename)
    examples = []
    next_example = train_file.readline()
    counter = 0
    while next_example:
        partition = counter % num_folds
        elements = []
        while next_example and next_example != "\n":
            elements.append(next_example)
            next_example = train_file.readline()
        examples.append((partition, elements))
        next_example = train_file.readline()
        counter += 1
    # Close the sentences file; everything we need is now in examples.
    train_file.close()
    # Train/test on each partition.
    gold_deps = open(gold_deps_filename,"w")
    # Run each fold. The output from each fold is appended to gold.deps
    # and model_out.
    #for test_partition in range(1):
    for test_partition in range(num_folds):
        print "Fold",test_partition
        train_filename = output_dir+"/train"
        train_set = open(train_filename, "w")
        test_filename = output_dir+"/test"
        test_set = open(test_filename, "w")
        counter = 0
        for ex in examples:
            if ex[0] == test_partition:
                test_set.write("".join(ex[1])+"\n")
                gold_deps.write("".join(ex[1])+"\n")
            else:
                train_set.write("".join(ex[1])+"\n")
            counter += 1
        train_set.close()
        test_set.close()
        # Run the fold.
        output_filename = output_dir+"/output"
        run_single_train_and_test(options, train_filename, test_filename,
                                  output_filename, args)
        # Pile this fold's output onto the accumulating result file.
        os.system('cat %s >> %s' % (output_filename, model_output_filename))
    gold_deps.flush()
    gold_deps.close()
else:
    os.system('cp %s %s' % (options.eval_file, gold_deps_filename))
    run_single_train_and_test(options, train_filename, gold_deps_filename,
                              model_output_filename, args)
################## EVALUATION ###################
print "Evaluating. If anything here dies, you can still look at the output files in the directory '%s'." % (output_dir)
# Get dependency results.
os.system("mst_score.sh %s %s" % (gold_deps_filename, model_output_filename))