robust.C
8.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
/*******************************************************************/
/* */
/* File: robust.C */
/* Author: Helmut Schmid */
/* Purpose: */
/* Created: Wed Aug 3 08:49:16 2005 */
/* Modified: Thu Nov 27 17:15:25 2008 (schmid) */
/* */
/*******************************************************************/
#include "compact.h"
using std::vector;
// data structure for a search path
class Path {
public:
unsigned int arc_number; // number of the current transducer arc
unsigned int position; // number of processed input symbols
float errors; // errors accumulated so far
int previous; // back-pointer (for printing)
// contructor
Path( unsigned int n, unsigned int p, float e, unsigned int pp )
{ arc_number = n; position = p; errors = e; previous = (int)pp; };
// constructor for the intial path
Path() { arc_number = 0; position = 0; errors = 0; previous = -1; };
// check whether a path is the intial path
bool is_start() { return previous == -1; };
};
// search data structure containing all the search paths
class Agenda {
private:
float errors_allowed; // maximum number of errors allowed
vector<Path> path; // set of active and inactive paths (for printing)
vector<size_t> active_path; // set of currently active search paths
vector<size_t> complete_path; // set of complete search paths
public:
Agenda( float e ) {
// initialization
errors_allowed = e;
path.push_back(Path()); // initial search path
active_path.push_back(0); // one active search path
};
// get the highest ranked active search path
Path &best_active_path() { return path[active_path.back()]; };
// retrieve an active path by its index number
Path &get_active_path( int i ) { return path[active_path[i]]; };
// get the highest ranked complete search path
Path &first_complete_path() { return path[complete_path[0]]; };
// check whether the analysis is finished
bool finished() {
return (active_path.size() == 0 || // no more active paths
(complete_path.size() > 0 && // best analysis was found
best_active_path().errors > first_complete_path().errors));
};
// add a new search path
void add_path( int s, unsigned int pos, float e, int pp, bool final );
void add_analysis( int sn, CAnalysis &ana );
void extract_analyses( vector<CAnalysis> &analyses );
friend class CompactTransducer;
};
// trivial error functions for the beginning
float mismatch_error( Character c, Character c2) {
return 1.0;
};
float deletion_error( Character c) { return 1.0; };
float insertion_error( Character c) { return 1.0; };
float transpose_error( Character c, Character c2) { return 1.0; };
/*******************************************************************/
/* */
/* Agenda::add_path */
/* */
/*******************************************************************/
void Agenda::add_path(int arc, unsigned int pos, float e, int pp, bool final)
{
// check whether the number of allowed errors is exceeded
if (e > errors_allowed)
return;
// check whether a complete analysis with fewer errors exists
if (complete_path.size() > 0 && first_complete_path().errors < e)
return;
// store the new search path
size_t sn=path.size(); // index of the new search path
path.push_back(Path(arc, pos, e, pp)); // add the new path
// sorted insertion of the new active path (reversed order)
active_path.push_back(sn); // increase the array size by 1
// copy the paths up to the insertion position
int i=(int)active_path.size()-1;
while (i > 0) {
Path &s=get_active_path(i-1);
if (e < s.errors || (e == s.errors && pos >= s.position))
break;
active_path[i] = active_path[i-1];
i--;
}
active_path[i] = sn;
if (final) {
// Is the new analysis better than the previous ones?
if (complete_path.size() > 0 && first_complete_path().errors > e)
complete_path.clear(); // delete all the previous analyses
complete_path.push_back(sn);
}
}
/*******************************************************************/
/* */
/* Agenda::add_analysis */
/* */
/*******************************************************************/
void Agenda::add_analysis( int sn, CAnalysis &ana )
{
Path &cs=path[sn];
if (!cs.is_start()) {
add_analysis( cs.previous, ana );
ana.push_back(cs.arc_number);
}
}
/*******************************************************************/
/* */
/* Agenda::extract_analyses */
/* */
/*******************************************************************/
void Agenda::extract_analyses( vector<CAnalysis> &analyses )
{
analyses.resize(complete_path.size());
for( size_t i=0; i<complete_path.size(); i++ )
add_analysis((int)complete_path[i], analyses[i]);
}
/*******************************************************************/
/* */
/* CompactTransducer::robust_analyze_string */
/* */
/*******************************************************************/
float CompactTransducer::robust_analyze_string( char *string,
vector<CAnalysis> &analyses,
float ErrorsAllowed )
{
analyses.clear();
// convert the input string to a sequence of symbols
vector<Character> input;
alphabet.string2symseq( string, input );
// initialize the agenda
Agenda agenda( ErrorsAllowed );
// start the analysis
while (!agenda.finished()) {
// get the highest ranked search path
unsigned int sn=(unsigned)agenda.active_path.back();
Path cs=agenda.path[sn];
agenda.active_path.pop_back();
unsigned int state=cs.is_start()? 0: target_node[cs.arc_number];
// for all transitions from the current state
for( unsigned int i=first_arc[state]; i<first_arc[state+1]; i++ ) {
Label l = label[i]; // label of the transition
Character tc = l.upper_char(); // surface symbol
if (cs.position == input.size()) {
if (tc == Label::epsilon) // epsilon transition
agenda.add_path(i, cs.position, cs.errors, sn,
finalp[target_node[i]]);
else // insertion of symbol
agenda.add_path(i, cs.position, cs.errors + insertion_error(tc),
sn, finalp[target_node[i]]);
}
else {
Character ic = input[cs.position];
if (tc == Label::epsilon) // epsilon transition
agenda.add_path(i, cs.position, cs.errors, sn, false);
else if (tc == ic) { // matching symbols
bool f=(cs.position+1==input.size() && finalp[target_node[i]]);
agenda.add_path(i, cs.position+1, cs.errors, sn, f);
}
else {
// symbol mismatch
bool f=(cs.position+1==input.size() && finalp[target_node[i]]);
agenda.add_path(i, cs.position+1, cs.errors+mismatch_error(tc, ic),
sn, f);
// deletion of symbol
f = (cs.position+1==input.size() &&
finalp[target_node[cs.arc_number]]);
agenda.add_path(cs.arc_number, cs.position+1,
cs.errors+deletion_error(ic), cs.previous, f);
// insertion of symbol
f = (cs.position==input.size() && finalp[target_node[i]]);
agenda.add_path(i, cs.position, cs.errors + insertion_error(tc),
sn, f);
}
}
}
}
if (agenda.complete_path.size() == 0)
return 0.0;
agenda.extract_analyses( analyses );
return agenda.first_complete_path().errors;
}