morfeusz2_c.h
4.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
/* morfeusz.h
Copyright (c) by Marcin Woliński
$Date: 2009/01/06 18:48:06 $
C language interface for Morfeusz morphological analyser
*/
#ifndef __MORFEUSZ_H__
#define __MORFEUSZ_H__
/* DLLIMPORT decorates every public declaration in this header.
   - Non-Windows targets: expands to nothing.
   - Windows targets: expands to __declspec(dllexport) when
     BUILDING_MORFEUSZ evaluates to non-zero in the including translation
     unit, and to __declspec(dllimport) otherwise.
   _WIN32 is the macro compilers targeting Windows predefine (MSVC provides
   only this spelling). __WIN32 is still honoured so that any toolchain
   that took the Windows branch before keeps taking it. */
#if !defined(_WIN32) && !defined(__WIN32)
#define DLLIMPORT
#else
/* A Windows system. Need to define DLLIMPORT. */
#if BUILDING_MORFEUSZ
# define DLLIMPORT __declspec (dllexport)
#else
# define DLLIMPORT __declspec (dllimport)
#endif
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* Returns a string containing information on the authors and the version
   of the library.
   Takes no arguments. The parameter list is spelled (void) so that C
   compilers see a full prototype and reject calls that pass arguments;
   an empty () in C leaves the parameters unspecified.
   NOTE(review): ownership of the returned string is not stated in this
   header -- presumably it is owned by the library and must not be freed
   or modified; confirm against the implementation.
*/
DLLIMPORT char *morfeusz_about(void);
/*
The result of analysis is a directed acyclic graph with numbered
nodes representing positions in text (points _between_ segments)
and edges representing interpretations of segments that span from
one node to another. E.g.,
      {0,1,"ja","ja","ppron12:sg:nom:m1.m2.m3.f.n1.n2:pri"}
      |
      |      {1,2,"został","zostać","praet:sg:m1.m2.m3:perf"}
      |      |
    __|  ____|   __{2,3,"em","być","aglt:sg:pri:imperf:wok"}
   /  \ /     \ / \
  * Ja * został*em *
  0    1       2   3
Note that the word 'zostałem' got broken into 2 separate segments.
The structure below describes one edge of this DAG:
*/
/* One edge of the analysis graph: a single interpretation of one segment,
   spanning from node 'p' to node 'k'. */
struct _InterpMorf {
    int p;          /* number of the node where the edge starts */
    int k;          /* number of the node where the edge ends */
    char *forma;    /* segment (token) */
    char *haslo;    /* lemma */
    char *interp;   /* morphosyntactic tag */
};

/* Convenience alias so callers can write InterpMorf without 'struct'. */
typedef struct _InterpMorf InterpMorf;
/* Analyse a piece of text.

   'tekst' - the string to be analysed, in the encoding selected with
   MORFOPT_ENCODING. It should neither start nor end within a word.
   Morfeusz has limited space for results, so do not pass more than a
   typical paragraph at a time. The best strategy is probably to pass
   either separate words or single lines of text. If the text is too
   long to analyse, the function returns an empty result, that is, one
   with p == -1 in the first structure returned.

   RETURNS a table of InterpMorf structures representing the edges of
   the resulting graph. The end of the table is marked with a sentinel
   element whose 'p' field is -1. If a segment is unknown to Morfeusz,
   the 'haslo' and 'interp' fields of its structure are NULL.

   The result remains valid only until the next invocation of
   morfeusz_analyse(): the function does not allocate any memory, the
   space is reused on subsequent invocations. Copy anything that has to
   outlive the next call.

   The starting node of the resulting graph has the value 0 on each
   invocation.
   NOTE(review): MORFOPT_TOKEN_NUMBERING describes a continuous mode in
   which numbering carries on from the previous call; presumably the
   sentence above describes the separate-numbering mode -- confirm.
   NOTE(review): 'tekst' is declared non-const; whether the library
   writes to it is not visible from this header -- confirm before
   passing string literals or shared buffers.
*/
DLLIMPORT InterpMorf *morfeusz_analyse(char *tekst);
/*
   Set an option: 'option' is set to 'value'.

   'option' - one of the MORFOPT_* #defines listed below.
   'value'  - one of the MORFEUSZ_* #defines listed under that option.

   RETURNS 1 (true) on success, 0 (false) on failure (no such option
   or no such value).
*/
DLLIMPORT int morfeusz_set_option(int option, int value);
/* MORFOPT_ENCODING: option selecting the character encoding used for the
   'tekst' argument of morfeusz_analyse and for the 'forma', 'haslo' and
   'interp' fields of its results. */
#define MORFOPT_ENCODING    1

/* Values accepted for MORFOPT_ENCODING. */
#define MORFEUSZ_UTF_8      8       /* UTF-8 */
#define MORFEUSZ_ISO8859_2  88592   /* ISO-8859-2 (default) */
#define MORFEUSZ_CP1250     1250    /* CP1250 */
#define MORFEUSZ_CP852      852     /* CP852 */
/* MORFOPT_WHITESPACE: option controlling how whitespace characters in the
   analysed text are treated. */
#define MORFOPT_WHITESPACE          2

/* Values accepted for MORFOPT_WHITESPACE. */
/* Whitespace characters are silently ignored. */
#define MORFEUSZ_SKIP_WHITESPACE    0
/* Whitespace characters are reported as tokens. */
#define MORFEUSZ_KEEP_WHITESPACE    2
/* NOTE(review): this value had no description in the header; presumably
   whitespace is attached to a neighbouring token -- confirm against the
   library documentation. */
#define MORFEUSZ_APPEND_WHITESPACE  4
/* MORFOPT_CASE: option controlling how interpretations whose letter case
   does not match the analysed text are treated. */
#define MORFOPT_CASE            3

/* Values accepted for MORFOPT_CASE. */
/* Interps not matching case are ignored unless there is no alternative. */
#define MORFEUSZ_WEAK_CASE      301
/* Interps not matching case are marked as unrecognized (ign). */
#define MORFEUSZ_STRICT_CASE    302
/* Interps not matching case are treated the same way as those that match. */
#define MORFEUSZ_IGNORE_CASE    303
/* MORFOPT_TOKEN_NUMBERING: option controlling the number given to the
   first token of each invocation. */
#define MORFOPT_TOKEN_NUMBERING             4

/* Values accepted for MORFOPT_TOKEN_NUMBERING. */
/* After each invocation the first token number is set to 0. */
#define MORFEUSZ_SEPARATE_TOKEN_NUMBERING   401
/* After each invocation the first token number is set to the end of the
   one from the previous invocation. */
#define MORFEUSZ_CONTINUOUS_TOKEN_NUMBERING 402
#ifdef __cplusplus
} /* extern C */
#endif /* __cplusplus */
#endif /* __MORFEUSZ_H__ */