morfeusz2_c.h
3.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/* morfeusz.h
Copyright (c) by Marcin Woliński
$Date: 2009/01/06 18:48:06 $
C language interface for Morfeusz morphological analyser
*/
#ifndef __MORFEUSZ_H__
#define __MORFEUSZ_H__
/*
 * DLLIMPORT: linkage decoration placed on every public declaration of
 * this header.
 *   - Windows, BUILDING_MORFEUSZ defined to a non-zero value (i.e. the
 *     library itself is being built): __declspec(dllexport)
 *   - Windows otherwise (client code): __declspec(dllimport)
 *   - any other platform: expands to nothing
 * Windows is detected with _WIN32, the macro predefined by MSVC and by
 * MinGW for both 32- and 64-bit targets.  The __WIN32 spelling that this
 * header tested historically is provided only by some GCC-based
 * toolchains; it is still accepted for compatibility.
 */
#if defined(_WIN32) || defined(__WIN32)
# if defined(BUILDING_MORFEUSZ) && BUILDING_MORFEUSZ
#  define DLLIMPORT __declspec(dllexport)
# else
#  define DLLIMPORT __declspec(dllimport)
# endif
#else
# define DLLIMPORT
#endif
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
/* Returns a string containing information on the authors and the
   version of the library.
   The parameter list is spelled (void) on purpose: in C before C23 an
   empty "()" declares a function with unspecified parameters, so the
   compiler would not diagnose calls that pass arguments by mistake. */
DLLIMPORT char *morfeusz_about(void);
/*
  The result of analysis is a directed acyclic graph with numbered
  nodes representing positions in text (points _between_ segments)
  and edges representing interpretations of segments that span from
  one node to another.  E.g.,

      {0,1,"ja","ja","ppron12:sg:nom:m1.m2.m3.f.n1.n2:pri"}
      |
      |      {1,2,"został","zostać","praet:sg:m1.m2.m3:perf"}
      |      |
    __|  ____|   __{2,3,"em","być","aglt:sg:pri:imperf:wok"}
   /  \ /     \ / \
  * Ja * został*em *
  0    1       2   3

  Note that the word 'zostałem' got broken into 2 separate segments.

  The structure below describes one edge of this DAG:
*/
/* One edge of the analysis graph: a single interpretation of a single
   segment, spanning from node 'p' to node 'k'. */
typedef struct _InterpMorf {
  int p;        /* number of the start node */
  int k;        /* number of the end node */
  char *forma;  /* segment (token) */
  char *haslo;  /* lemma */
  char *interp; /* morphosyntactic tag */
} InterpMorf;
/* Analyse a piece of text.

   'tekst' - the string to be analysed.  It should neither start nor
     end within a word.  Morfeusz has limited space for results, so do
     not pass more than a typical paragraph at a time; the best
     strategy is probably to pass either separate words or single
     lines of text.  If the text is too long to analyse, the function
     returns an empty result, i.e. one with p == -1 in the first
     structure returned.
     NOTE(review): the parameter is not const-qualified; this header
     does not say whether the library writes to the buffer - confirm
     before passing string literals or shared data.

   RETURNS a table of InterpMorf structures representing the edges of
     the resulting graph.  The result remains valid only until the next
     invocation of morfeusz_analyse(): the function does not allocate
     any memory, and the same space is reused on subsequent
     invocations.  Copy anything that has to outlive the next call.

     The starting node of the resulting graph has the value 0 on each
     invocation.  The end of the results is marked with a sentinel
     element having the value -1 in the 'p' field.  If a segment is
     unknown to Morfeusz, the 'haslo' and 'interp' fields of its
     structure are NULL.
*/
DLLIMPORT InterpMorf *morfeusz_analyse(char *tekst);
/* Set a library option.

   'option' - the option to change; one of the MORFOPT_* #defines
     listed in this header.
   'value'  - the new value; the values accepted for each option are
     the MORFEUSZ_* #defines listed next to that option.

   RETURNS 1 (true) on success, 0 (false) on failure (no such option
     or no such value).
*/
DLLIMPORT int morfeusz_set_option(int option, int value);
/* Option MORFOPT_ENCODING: the character encoding used for the 'tekst'
   argument of morfeusz_analyse() and for the 'forma', 'haslo' and
   'interp' fields of its results.  Possible values:
     MORFEUSZ_UTF_8      - UTF-8
     MORFEUSZ_ISO8859_2  - ISO-8859-2 (the default)
     MORFEUSZ_CP1250     - CP1250
     MORFEUSZ_CP852      - CP852
*/
#define MORFOPT_ENCODING     1

#define MORFEUSZ_UTF_8       8
#define MORFEUSZ_ISO8859_2   88592
#define MORFEUSZ_CP1250      1250
#define MORFEUSZ_CP852       852
/* Option MORFOPT_WHITESPACE: how whitespace in the analysed text is
   treated.  Possible values:
     MORFEUSZ_SKIP_WHITESPACE - whitespace characters are silently
                                ignored
     MORFEUSZ_KEEP_WHITESPACE - whitespace characters are reported as
                                tokens
*/
#define MORFOPT_WHITESPACE         2

#define MORFEUSZ_SKIP_WHITESPACE   0
#define MORFEUSZ_KEEP_WHITESPACE   2
#ifdef __cplusplus
} /* extern C */
#endif /* __cplusplus */
#endif /* __MORFEUSZ_H__ */