# config.ini - Spejd configuration file
#
# FILES LOCATION
# all paths in this file are relative to the location of this config file,
# except for absolute paths
# (on Unix systems starting with '/', on Windows starting with '<letter>:\' or '\')
#
# a file containing Spejd's grammar
# in this example file you can find the Spejd rules syntax explained
rules = rules.sr
# tagset used in grammar and input/output
# see that file for details on used format
tagset = sample.cfg
#
# PROCESSING CHAIN
#
# list of tools to be executed between reader and writer modules
# for spejd with preprocessing with dictionary
# (dictionary entries may be multiple - with different names after colon, see below)
# processingChain = dictionary:example_dict spejd
# spejd preceded with the pantera tagger (Spejd must have pantera support built in)
# processingChain = pantera spejd
# spejd alone (the default)
processingChain = spejd
# no tools (only reader and writer) - can be used as format converter
# processingChain =
# number of threads to use, 0 means autodetect ( = number of detected cpus)
maxThreads = 0
#
# INPUT
#
# inputType: auto|xcesAna|tei|txt
# auto chooses the reader based on the file name / extension:
# - *.txt/*.txt.gz = txt
# - morph.xml/morph.xml.gz = xcesAna
# - ann_segmentation.xml/ann_segmentation.xml.gz = tei, without using morphosyntax
# - ann_morphosyntax.xml/ann_morphosyntax.xml.gz = tei, using morphosyntax
# with txt and tei without morphosyntax the Morfeusz morphological analyzer is used (unless disabled)
inputType = auto
# encoding of input files (overrides any XML coding tags!)
#
# note: the acronymsAfter, acronymsBefore options, and contents of various files
# such as ogonkifyFile, morfeusz disambiguation rules or dictionaries must be
# in this encoding too.
inputEncoding = UTF-8
# regexp describing names of input files
# to look for when traversing directories given in command line
# does not affect file names given explicitly in command line
inputFiles = morph\.xml(\.gz)?|.*\.txt(\.gz)?|ann_morphosyntax\.xml(\.gz)?
# ignore any disambiguation found in the input?
ignoreDisamb = no
# Spejd can use XML id attributes available in the input. Sometimes this may cause
# problems (e.g. duplicate id values in the scope of file), so can be turned off
ignoreIDs = no
#
# OUTPUT
#
# format of the output file(s): tei|xcesAna|null
# null = for testing only, does not write anything
outputType = tei
# can interpretations deleted by Spejd be discarded at will (yes)
# or should be preserved for the final output (no)?
discardDeleted = yes
# the suffix to be added to the target file name
outputSuffix = .xml
# The core name of the output file. Depending on the output type
# some infixes can be added between it and output suffix.
# Caution: this option replaces the name of the input file.
# With output suffix containing only extension equal to extension of
# the input file (e.g. .xml for the XML input file) spejd will
# overwrite input files with output.
#
# Leave empty or comment out to use the input file name instead.
outputFilenameCore = ann
# If set to 'yes' spejd will backup existing output files to <name>.bak
backupExistingFiles = no
# apply gzip compression to output?
compressOutput = yes
# put <f>'s in single line and omit empty sentences/paragraphs when writing tei?
compactTeiOutput = no
# NKJP (National Corpus of Polish) compatibility mode:
# don't write <f name="interps"> and <fs type="lex"> in *_words.xml
# assuming that there can be only one interpretation marked as "correct" for each token.
#
# It is a user task to make sure that there will be no tokens with multiple
# correct interpretations
teiSingleSyntokInterp = yes
# again, NKJP compatibility:
# place group's heads information inside the <fs type="group">
# as features instead of marking it as 'type' attribute of group's elements
teiFsGroupHeads = yes
# for backward compatibility with Spejd 1.2, it is probably easier to parse structures
# written in bottom-up order, starting from the leaves and ending at the root
# (all entities defined before referencing)
teiBottomUpSyntacticStructures = no
# DIAGNOSTICS
# report progress every reportInterval seconds
# use 0 to completely disable progress reports
reportInterval = 5
# more verbose reports?
debug = no
# mark which rule has deleted an interpretation?
ruleMarking = no
# should tag/tagset errors be treated as non-fatal?
# If turned on, Spejd will try its best to output only tags conforming to the tagset,
# but they may be useless.
# This option exists only to preserve compatibility with older versions of Spejd, which
# accepted incorrect rules. Please do not use when developing new grammars.
#
# !!! use at your own risk and don't report crashes when using this option !!!
nonfatalTagErrors = no
# silence the warnings about (nonfatal) tag errors?
muffleTagWarnings = no
# Disable correctness checks of tags in between rules execution?
# If set, tags can be temporarily incomplete or incorrect, but the usual validation
# before writing is still performed to make sure Spejd will output only
# correct tags.
# Not recommended for developing new grammars.
tagErrorsOnlyOnTheEnd = no
######################################################################
# MODULE-SPECIFIC OPTIONS
######################################################################
# DICTIONARIES
# list of files containing morphological dictionaries
# to be applied as "dictionary:example_dict" tool to the input
# the format of lines of files is:
# orthographic form,base (lexical) form:tag
#
# or:
# ,base (lexical) form:some_parts_of_tag;condition
#
# In the first variant the orthographical form is used for matching words.
# Tag definition is expanded (it may contain wildcards).
#
# In the second variant the orthographic form is omitted. In that case the base form is used for matching.
# The tags of existing interpretations which match the base form are corrected/modified
# according to the specified tag.
# This variant allows the tag to be not full/complete, but only specifying some
# of the attributes (some parts).
# This variant also allows specifying conditions on the tag that must
# be met for the modification to be performed. The condition has the form of a partial tag, just like
# in the "tag" section of the modifying variant. A condition restricts modified interpretations to
# those which have all values of the specified attributes among the specified values.
# If an attribute is omitted in the specification, it means that there are no restrictions
# on this attribute's value and it can be anything (including absence of value).
# When a condition is empty (that means: there are no restrictions on any attribute),
# the semicolon preceding it can be omitted and the format is:
# ,base (lexical) form:some_parts_of_tag
#
# The above two variants of entries can be mixed.
# All the entries with orthographic form are applied before applying any
# of the entries without orth in the scope of a single 'dictionary:<name>' tool,
# no matter in which file in this list they appear.
#
# The encoding of dictionary files must be the same as inputEncoding.
#dictionary:example_dict = sample_dict lexdictnum
# PANTERA CONFIGURATION
# Pantera can use its own built-in tweaked version of Morfeusz.
# If this option is set, all interpretations set by reader
# or any tools preceding pantera in the toolchain are dropped
panteraDoOwnMorphAnalysis = yes
# tagset for pantera, leave empty for a default (check pantera documentation for details)
panteraTagsetName =
# pantera's engine, leave empty for a default (check pantera documentation for details)
panteraEnginePath =
# SPEJD SEMANTICS
# default strategy for matching syntactic entities
# use * for greedy, + for possessive, ? for reluctant
matchStrategy = *
# should agree(case,1,2) return true, if both 1 and 2 have no case?
nullAgreement = no
# SPEJD FSM INTERNALS
# number of single-rule automata to be composed together, usually not needed to change
# Rule of thumb: if Spejd consumes much too much memory, it's better to decrease this
# number than to set very low memoryLimit - it gives smaller impact on performance
composeLimit = 150
# memory limit in megabytes
# when memory usage exceeds this limit the rarely-used states removal procedure
# (or GC, a garbage collector) is launched
# use as an emergency brake, for standard limit see above.
# The memoryLimit is approximate, actual memory usage may be slightly higher
# (it depends on memory allocator library buffers size)
memoryLimit = 1900
# approx. percent of DFA states to leave after the states removal
leavePercent = 80
# The definitive limit of normal GC usage. GC removes only complex states, so if there are lots
# of plain states it can't prevent memoryLimit from being exceeded. If the percent of complex states
# is less than minComplexPercent, all the DFAs are dropped and rebuilt from the beginning,
# just as if Spejd had been restarted. However, it does not recompile the rules, so it's faster.
minComplexPercent = 10
# The maximum number of Unicode characters which can appear in rules compiled to internal regex.
# It must be higher than the highest number of values of a single attribute (including
# numeric attributes) and must be higher than the number of unique characters appearing in all rules.
# Setting too high can increase the memory usage.
maxNumberOfValues = 4000
# BUILT-IN MORPHOLOGICAL ANALYZER 'MORFEUSZ'
# disable Morfeusz completely, useful when some other tool replaces interpretations, e.g. pantera
disableMorfeusz = yes
# Morfeusz produces ambiguous segmentation, which can be resolved by simple rule-based
# disambiguator. This option specifies a file to load rules from.
# The rule format is described in the example file
# (leave empty for the builtin default, which is actually the example file)
# The encoding of this file must be the same as inputEncoding.
morfeuszSegmentationDisambiguationRules = segm_disamb.conf
# PLAIN TEXT READER - GENERAL
# mock xml:id for the whole text input referred from the output in string-range notation
# (in TEI output it appears in *segmentation.xml)
stringRangeMockID = p-1
# PLAIN TEXT READER - SENTENCER
# list of acronyms -
# if a dot is found after one of them, it is not a sentence break
acronymsAfter = prof|dr|mgr|doc|ul|np|godz|gen|płk|mjr|por|tzw|tzn|proc|nt|art|ust|ww|www|ws|dz
# list of acronyms (actually top level domain names) -
# if a dot is found before one of them, it is not a sentence break
acronymsBefore = ac|ad|ae|aero|af|ag|ai|al|am|an|ao|aq|ar|arpa|as|asia|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|biz|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cat|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|com|coop|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|info|int|io|iq|ir|is|it|je|jm|jo|jobs|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mil|mk|ml|mm|mn|mo|mobi|mp|mq|mr|ms|mt|mu|museum|mv|mw|mx|my|mz|na|name|nc|ne|net|nf|ng|ni|nl|no|np|nr|nu|nz|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pro|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|travel|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw
# PLAIN TEXT READER - OGONKIFIER
# name of the file with ogonkify (diacritic completion) substitutions
# the format is:
# <letter without diacritics>=<list of possible letters with diacritics separated by '|'>
# see the example ogonkifier.ini
# The encoding of this file must be the same as inputEncoding.
#ogonkifyFile = ogonkifier.ini
# when to use ogonkifier:
# A - Always,
# N - Never,
# M - only when the Morphological analyzer fails to analyse a word
ogonkifyStrategy = N
# min and max length of words to ogonkify
ogonkifyMinLength = 3
ogonkifyMaxLength = 13