// compact.h (3.54 KB)
/*******************************************************************/
/* */
/* FILE compact.h */
/* MODULE compact */
/* PROGRAM SFST */
/* AUTHOR Helmut Schmid, IMS, University of Stuttgart */
/* */
/* PURPOSE finite state tools */
/* */
/*******************************************************************/
#ifndef _COMPACT_H_
#define _COMPACT_H_
#include "alphabet.h"
#include <vector>
// CAnalysis holds one analysis as a sequence of unsigned integer codes.
// NOTE(review): the codes are presumably arc indices of the transducer
// (they share the unsigned int type of the arc tables below) -- confirm
// against the implementation file.
typedef std::vector<unsigned int> CAnalysis;
// CompactTransducer stores a finite-state transducer in flat arrays
// (one set of arrays indexed by node, one set indexed by arc) and offers
// analysis, longest-match, EM-training and robust-analysis entry points.
//
// NOTE(review): the class holds raw array pointers and declares a
// destructor but neither a copy constructor nor a copy assignment
// operator. If the destructor releases these arrays, copying an instance
// would free them twice -- confirm in the implementation file and
// consider disabling copying.
class CompactTransducer {

 protected:
  // ---- node storage ------------------------------------------------

  // number of nodes in the transducer
  unsigned int number_of_nodes;

  // finalp[i] is 1 if node i is final and 0 otherwise
  char *finalp;

  // first_arc[i] is the number of the first arc outgoing from node i
  unsigned int *first_arc;

  // ---- transition arc storage --------------------------------------

  // total number of arcs in the transducer
  unsigned int number_of_arcs;

  // label[i] is the label (character pair) of arc i
  Label *label;

  // target_node[i] is the target node of arc i
  unsigned int *target_node;

  // ---- stochastic parameters ---------------------------------------
  // NOTE(review): presumably final_logprob is indexed by node and
  // arc_logprob by arc, and both stay unset for non-stochastic
  // transducers -- confirm in the implementation file.
  float *final_logprob;
  float *arc_logprob;

  // ---- helpers used while reading the transducer from a file ---------
  void read_finalp( FILE *file );
  void read_first_arcs( FILE *file );
  void read_target_nodes( FILE *file );
  void read_labels( FILE *file );
  void read_probs( FILE *file );

  // ---- helper used to analyze data with the transducer ---------------
  // NOTE(review): presumably n is the current node, ch the input symbol
  // sequence and ipos the current position in ch; the last two arguments
  // look like the analysis under construction and the list of completed
  // analyses -- confirm in the implementation file.
  void analyze( unsigned int n, std::vector<Character> &ch, size_t ipos,
                CAnalysis&, std::vector<CAnalysis>& );

  // ---- selection of the simplest morphological analysis ---------------
  int compute_score( CAnalysis &ana );
  void disambiguate( std::vector<CAnalysis> &analyses );

  // ---- helpers for longest-match analysis of input data ---------------
  void longest_match2( unsigned int, char*, int, CAnalysis&, int&, CAnalysis& );

  // converts between the compact analysis type and Analysis
  // NOTE(review): direction is presumably cana -> ana; confirm.
  void convert( CAnalysis &cana, Analysis &ana );

 public:
  // Returns the number of nodes in the transducer. Declared const so it
  // can be called through a const reference or pointer.
  size_t node_count() const { return number_of_nodes; }

  // Returns the total number of arcs in the transducer. Declared const
  // so it can be called through a const reference or pointer.
  size_t arc_count() const { return number_of_arcs; }

  bool both_layers;    // print surface and analysis symbols
  bool simplest_only;  // print only the simplest analyses
  Alphabet alphabet;   // data structure which maps symbols to numeric codes

  // dummy constructor
  CompactTransducer();

  // reads a (stochastic) transducer; pfile is optional and defaults to NULL
  // NOTE(review): pfile presumably supplies the probabilities -- confirm.
  CompactTransducer( FILE*, FILE *pfile=NULL );

  // destroys a transducer
  ~CompactTransducer();

  // the analysis function returns the set of analyses for the string "s"
  // in the argument "analyses"
  void analyze_string( char *s, std::vector<CAnalysis > &analyses );

  // NOTE(review): presumably fills prob with one value per entry of
  // analyses -- confirm in the implementation file.
  void compute_probs( std::vector<CAnalysis> &analyses, std::vector<double> &prob );

  // NOTE(review): ownership and lifetime of the returned buffer are not
  // visible from this header -- confirm before freeing or storing it.
  char *print_analysis( CAnalysis &ana );

  // longest-match analysis
  // NOTE(review): the pointer is taken by reference, so it is presumably
  // advanced past the matched input -- confirm.
  const char *longest_match( char*& );

  // EM training
  // NOTE(review): arcfreq and finalfreq are presumably frequency
  // accumulators for arcs and final nodes; the meaning of the bool
  // results is not visible from this header -- confirm.
  bool train2( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
  bool train( char *s, std::vector<double> &arcfreq, std::vector<double> &finalfreq );
  void estimate_probs( std::vector<double> &arcfreq, std::vector<double> &finalfreq );

  // robust analysis
  // NOTE(review): the meaning of the float result and the unit of
  // ErrorsAllowed are not visible from this header -- confirm.
  float robust_analyze_string( char *string, std::vector<CAnalysis> &analyses,
                               float ErrorsAllowed );
};
#endif