Commit 46a46821b37bc5e11065304c0e9494a667a66c25
1 parent
8f86545e
Added preferable spejd config to resources directory.
Showing
3 changed files
with
8533 additions
and
0 deletions
Too many changes to show.
To preserve performance only 1 of 3 files are displayed.
src/main/resources/spejd_config/config.ini
0 → 100644
1 | +# | ||
2 | +# FILES LOCATION | ||
3 | +# all paths in this file are relative to location of this config file, | ||
4 | +# except for absolute paths | ||
5 | +# (in UNIXes starting from '/', in Windows starting from '<letter>:\' or '\') | ||
6 | +# | ||
7 | + | ||
8 | +# a file containing Spejd's grammar | ||
9 | +# in this example file you can find the Spejd rules syntax explained | ||
10 | +rules = rules.sr | ||
11 | + | ||
12 | +# tagset used in grammar and input/output | ||
13 | +# see that file for details on used format | ||
14 | +tagset = sample.cfg | ||
15 | + | ||
16 | +# | ||
17 | +# PROCESSING CHAIN | ||
18 | +# | ||
19 | + | ||
20 | +# list of tools to be executed between reader and writer modules | ||
21 | + | ||
22 | +# for spejd with preprocessing with dictionary | ||
23 | +# (dictionary entries may be multiple - with different names after colon, see below) | ||
24 | +# processingChain = dictionary:example_dict spejd | ||
25 | + | ||
26 | +# spejd preceded with the pantera tagger (Spejd must have pantera support built in) | ||
27 | +# processingChain = pantera spejd | ||
28 | + | ||
29 | +# spejd alone (the default) | ||
30 | +processingChain = spejd | ||
31 | + | ||
32 | +# no tools (only reader and writer) - can be used as format converter | ||
33 | +# processingChain = | ||
34 | + | ||
35 | +# number of threads to use, 0 means autodetect ( = number of detected cpus) | ||
36 | +maxThreads = 0 | ||
37 | + | ||
38 | +# | ||
39 | +# INPUT | ||
40 | +# | ||
41 | + | ||
42 | +# inputType: auto|xcesAna|tei|txt | ||
43 | +# auto chooses the reader based on the file name / extension: | ||
44 | +# - *.txt/*.txt.gz = txt | ||
45 | +# - morph.xml/morph.xml.gz = xcesAna | ||
46 | +# - ann_segmentation.xml/ann_segmentation.xml.gz = tei, without using morphosyntax | ||
47 | +# - ann_morphosyntax.xml/ann_morphosyntax.xml.gz = tei, using morphosyntax | ||
48 | +# with txt and tei without morphosyntax the Morfeusz morphological analyzer is used (unless disabled) | ||
49 | + | ||
50 | +inputType = auto | ||
51 | + | ||
52 | +# encoding of input files (overrides any XML coding tags!) | ||
53 | +# | ||
54 | +# note: the acronymsAfter, acronymsBefore options, and contents of various files | ||
55 | +# such as ogonkifyFile, morfeusz disambiguation rules or dictionaries must be | ||
56 | +# in this encoding too. | ||
57 | +inputEncoding = UTF-8 | ||
58 | + | ||
59 | +# regexp describing names of input files | ||
60 | +# to look for when traversing directories given in command line | ||
61 | +# does not affect file names given explicitly in command line | ||
62 | + | ||
63 | +inputFiles = morph\.xml(\.gz)?|.*\.txt(\.gz)?|ann_morphosyntax\.xml(\.gz)? | ||
64 | + | ||
65 | +# to ignore any disambiguation found in input? | ||
66 | + | ||
67 | +ignoreDisamb = no | ||
68 | + | ||
69 | +# Spejd can use XML id attributes available in the input. Sometimes this may cause | ||
70 | +# problems (e.g. duplicate id values in the scope of file), so can be turned off | ||
71 | +ignoreIDs = no | ||
72 | + | ||
73 | +# | ||
74 | +# OUTPUT | ||
75 | +# | ||
76 | + | ||
77 | +# format of the output file(s): tei|xcesAna|null | ||
78 | +# null = for testing only, does not write anything | ||
79 | + | ||
80 | +outputType = tei | ||
81 | + | ||
82 | +# can interpretations deleted by Spejd be discarded at will (yes) | ||
83 | +# or should be preserved for the final output (no)? | ||
84 | + | ||
85 | +discardDeleted = yes | ||
86 | + | ||
87 | +# the suffix to be added to the target file name | ||
88 | + | ||
89 | +outputSuffix = .xml | ||
90 | + | ||
91 | +# The core name of the output file. Depending on the output type | ||
92 | +# some infixes can be added between it and output suffix. | ||
93 | +# Caution: this option replaces the name of the input file. | ||
94 | +# With output suffix containing only extension equal to extension of | ||
95 | +# the input file (e.g. .xml for the XML input file) spejd will | ||
96 | +# overwrite input files with output. | ||
97 | +# | ||
98 | +# Leave empty or comment out to use the input file name instead. | ||
99 | + | ||
100 | +outputFilenameCore = ann | ||
101 | + | ||
102 | +# If set to 'yes' spejd will backup existing output files to <name>.bak | ||
103 | + | ||
104 | +backupExistingFiles = no | ||
105 | + | ||
106 | +# apply gzip compression to output? | ||
107 | + | ||
108 | +compressOutput = yes | ||
109 | + | ||
110 | +# put <f>'s in single line and omit empty sentences/paragraphs when writing tei? | ||
111 | + | ||
112 | +compactTeiOutput = no | ||
113 | + | ||
114 | +# NKJP (National Corpus of Polish) compatibility mode: | ||
115 | +# don't write <f name="interps"> and <fs type="lex"> in *_words.xml | ||
116 | +# assuming that there can be only one interpretation marked as "correct" for each token. | ||
117 | +# | ||
118 | +# It is a user task to make sure that there will be no tokens with multiple | ||
119 | +# correct interpretations | ||
120 | +teiSingleSyntokInterp = yes | ||
121 | + | ||
122 | +# again, NKJP compatibility: | ||
123 | +# place group's heads information inside the <fs type="group"> | ||
124 | +# as features instead of marking it as 'type' attribute of group's elements | ||
125 | +teiFsGroupHeads = yes | ||
126 | + | ||
127 | +# for backward compatibility with Spejd 1.2, it is probably easier to parse structures | ||
128 | +# written in bottom up order, starting from leafs to the root | ||
129 | +# (all entities defined before referencing) | ||
130 | +teiBottomUpSyntacticStructures = no | ||
131 | + | ||
132 | +# DIAGNOSTICS | ||
133 | + | ||
134 | +# report progress every reportInterval seconds | ||
135 | +# use 0 to completely disable progress reports | ||
136 | + | ||
137 | +reportInterval = 5 | ||
138 | + | ||
139 | +# more verbose reports? | ||
140 | +debug = no | ||
141 | + | ||
142 | +# mark which rule has deleted an interpretation? | ||
143 | + | ||
144 | +ruleMarking = no | ||
145 | + | ||
146 | +# are tag/tagset errors fatal? | ||
147 | +# If turned on, Spejd will try its best to output only tags conforming to the tagset, | ||
148 | +# but they may be useless. | ||
149 | +# This option exists only to preserve compatibility with older versions of Spejd, which | ||
150 | +# accepted incorrect rules. Please do not use when developing new grammars. | ||
151 | +# | ||
152 | +# !!! use at your own risk and don't report crashes when using this option !!! | ||
153 | +nonfatalTagErrors = no | ||
154 | + | ||
155 | +# if to silence the (nonfatal) tag errors? | ||
156 | +muffleTagWarnings = no | ||
157 | + | ||
158 | +# Disable correctness checks of tags in between rules execution? | ||
159 | +# If set, tags can be temporarily incomplete or incorrect, but the usual validation | ||
160 | +# before writing is still performed to make sure Spejd will output only | ||
161 | +# correct tags. | ||
162 | +# Not recommended for developing new grammars. | ||
163 | +tagErrorsOnlyOnTheEnd = no | ||
164 | + | ||
165 | + | ||
166 | +###################################################################### | ||
167 | +# MODULE-SPECIFIC OPTIONS | ||
168 | +###################################################################### | ||
169 | + | ||
170 | + | ||
171 | +# DICTIONARIES | ||
172 | + | ||
173 | +# list of files containing morphological dictionaries | ||
174 | +# to be applied as "dictionary:example_dict" tool to the input | ||
175 | +# the format of lines of files is: | ||
176 | +# orthographic form,base (lexical) form:tag | ||
177 | +# | ||
178 | +# or: | ||
179 | +# ,base (lexical) form:some_parts_of_tag;condition | ||
180 | +# | ||
181 | +# In the first variant the orthographical form is used for matching words. | ||
182 | +# Tag definition is expanded (it may contain wildcards). | ||
183 | +# | ||
184 | +# In the second variant orthographic form is omitted. In that case a base form is used to match. | ||
185 | +# The tags of existing interpretations which match the base form are corrected/modified | ||
186 | +# according to the specified tag. | ||
187 | +# This variant allows the tag to be not full/complete, but only specifying some | ||
188 | +# of the attributes (some parts). | ||
189 | +# This variant also allows to specify conditions on tag that must | ||
190 | +# be met to perform the modification. The condition has the form of a partial tag, just like | ||
191 | +# in the "tag" section of modifying variant. A condition restricts modified interpretations to | ||
192 | +# those which have all values of the specified attributes among the specified values. | ||
193 | +# If an attribute is omitted in the specification it means that there are no restrictions | ||
194 | +# on this attribute value and it can be anything (including absence of value). | ||
195 | +# When a condition is empty (that means: there are no restrictions on any attribute), | ||
196 | +# a semicolon preceding it can be omitted and the format is: | ||
197 | +# ,base (lexical) form:some_parts_of_tag | ||
198 | +# | ||
199 | +# The above two variants of entries can be mixed. | ||
200 | +# All the entries with orthographic form are applied before applying any | ||
201 | +# of the entries without orth in the scope of a single 'dictionary:<name>' tool, | ||
202 | +# no matter in which file in this list they appear. | ||
203 | +# | ||
204 | +# The encoding of dictionary files must be the same as inputEncoding. | ||
205 | +#dictionary:example_dict = sample_dict lexdictnum | ||
206 | + | ||
207 | + | ||
208 | +# PANTERA CONFIGURATION | ||
209 | + | ||
210 | +# Pantera can use its own built-in tweaked version of Morfeusz. | ||
211 | +# If this option is set, all interpretations set by reader | ||
212 | +# or any tools preceding pantera in the toolchain are dropped | ||
213 | +panteraDoOwnMorphAnalysis = yes | ||
214 | + | ||
215 | +# tagset for pantera, leave empty for a default (check pantera documentation for details) | ||
216 | +panteraTagsetName = | ||
217 | + | ||
218 | +# pantera's engine, leave empty for a default (check pantera documentation for details) | ||
219 | +panteraEnginePath = | ||
220 | + | ||
221 | + | ||
222 | +# SPEJD SEMANTICS | ||
223 | + | ||
224 | +# default strategy for matching syntactic entities | ||
225 | +# use * for greedy, + for possessive, ? for reluctant | ||
226 | + | ||
227 | +matchStrategy = * | ||
228 | + | ||
229 | +# should agree(case,1,2) return true, if both 1 and 2 have no case? | ||
230 | + | ||
231 | +nullAgreement = no | ||
232 | + | ||
233 | +# SPEJD FSM INTERNALS | ||
234 | + | ||
235 | +# number of single-rule automata to be composed together, usually not needed to change | ||
236 | +# Rule of thumb: if Spejd consumes much too much memory, it's better to decrease this | ||
237 | +# number than to set very low memoryLimit - it gives smaller impact on performance | ||
238 | +composeLimit = 150 | ||
239 | + | ||
240 | +# memory limit in megabytes | ||
241 | +# when memory usage exceeds this limit the rarely-used states removal procedure | ||
242 | +# (or GC, a garbage collector) is launched | ||
243 | +# use as an emergency brake, for standard limit see above. | ||
244 | +# The memoryLimit is approximate, actual memory usage may be slightly higher | ||
245 | +# (it depends on memory allocator library buffers size) | ||
246 | +memoryLimit = 1900 | ||
247 | + | ||
248 | +# approx. percent of DFA states to leave after the states removal | ||
249 | +leavePercent = 80 | ||
250 | + | ||
251 | +# The definitive limit of normal GC usage. GC removes only complex states, so if there are lots | ||
252 | +# of plain states it can't prevent from exceeding memoryLimit. If the percent of complex states | ||
253 | +# is less than minComplexPercent, all the DFAs are dropped and they are built from the beginning | ||
254 | +# just as if spejd had been restarted. However it does not recompile rules, so it's faster. | ||
255 | +minComplexPercent = 10 | ||
256 | + | ||
257 | + | ||
258 | +# A maximal number of unicode characters which can appear in rules compiled to internal regex | ||
259 | +# It must be higher than the highest number of values of a single attribute (including | ||
260 | +# numeric attributes) and must be higher than a number of unique characters appearing in all rules. | ||
261 | +# Setting too high can increase the memory usage. | ||
262 | +maxNumberOfValues = 4000 | ||
263 | + | ||
264 | + | ||
265 | +# BUILT-IN MORPHOLOGICAL ANALYZER 'MORFEUSZ' | ||
266 | + | ||
267 | +# disable Morfeusz completely, useful when some other tool replaces interpretations, e.g. pantera | ||
268 | +disableMorfeusz = yes | ||
269 | + | ||
270 | +# Morfeusz produces ambiguous segmentation, which can be resolved by simple rule-based | ||
271 | +# disambiguator. This option specifies a file to load rules from. | ||
272 | +# The rule format is described in the example file | ||
273 | +# (leave empty for the builtin default, which is actually the example file) | ||
274 | + | ||
275 | +# The encoding of this file must be the same as inputEncoding. | ||
276 | +morfeuszSegmentationDisambiguationRules = segm_disamb.conf | ||
277 | + | ||
278 | +# PLAIN TEXT READER - GENERAL | ||
279 | + | ||
280 | +# mock xml:id for the whole text input referred from the output in string-range notation | ||
281 | +# (in TEI output it appears in *segmentation.xml) | ||
282 | +stringRangeMockID = p-1 | ||
283 | + | ||
284 | +# PLAIN TEXT READER - SENTENCER | ||
285 | + | ||
286 | +# list of acronyms - | ||
287 | +# if a dot is found after one of them, it is not a sentence break | ||
288 | + | ||
289 | +acronymsAfter = prof|dr|mgr|doc|ul|np|godz|gen|płk|mjr|por|tzw|tzn|proc|nt|art|ust|ww|www|ws|dz | ||
290 | + | ||
291 | +# list of acronyms (actually top level domain names) - | ||
292 | +# if a dot is found before one of them, it is not a sentence break | ||
293 | + | ||
294 | +acronymsBefore = ac|ad|ae|aero|af|ag|ai|al|am|an|ao|aq|ar|arpa|as|asia|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|biz|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cat|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|com|coop|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|info|int|io|iq|ir|is|it|je|jm|jo|jobs|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mil|mk|ml|mm|mn|mo|mobi|mp|mq|mr|ms|mt|mu|museum|mv|mw|mx|my|mz|na|name|nc|ne|net|nf|ng|ni|nl|no|np|nr|nu|nz|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pro|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|travel|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw | ||
295 | + | ||
296 | + | ||
297 | +# PLAIN TEXT READER - OGONKIFIER | ||
298 | + | ||
299 | +# name of file with ogonkify (diacritic completion) substitutions | ||
300 | +# the format is: | ||
301 | +# <letter without diacritics>=<list of possible letters with diacritics separated by '|'> | ||
302 | +# see the example ogonkifier.ini | ||
303 | + | ||
304 | +# The encoding of this file must be the same as inputEncoding. | ||
305 | +#ogonkifyFile = ogonkifier.ini | ||
306 | + | ||
307 | +# when to use ogonkifier: | ||
308 | +# A - Always, | ||
309 | +# N - Never, | ||
310 | +# M - only when the Morphological analyzer fails to analyse a word | ||
311 | + | ||
312 | +ogonkifyStrategy = N | ||
313 | + | ||
314 | +# min and max length of words to ogonkify | ||
315 | + | ||
316 | +ogonkifyMinLength = 3 | ||
317 | +ogonkifyMaxLength = 13 | ||
318 | + | ||
319 | + | ||
320 | + |