Commit 46a46821b37bc5e11065304c0e9494a667a66c25
1 parent
8f86545e
Added preferable spejd config to resources directory.
Showing
3 changed files
with
8533 additions
and
0 deletions
Too many changes to show.
To preserve performance only 1 of 3 files are displayed.
src/main/resources/spejd_config/config.ini
0 → 100644
1 | +# | |
2 | +# FILES LOCATION | |
3 | +# all paths in this file are relative to location of this config file, | |
4 | +# except for absolute paths | |
5 | +# (in UNIXes starting from '/', in Windows starting from '<letter>:\' or '\') | |
6 | +# | |
7 | + | |
8 | +# a file containing Spejd's grammar | |
9 | +# in this example file you can find the Spejd rules syntax explained | |
10 | +rules = rules.sr | |
11 | + | |
12 | +# tagset used in grammar and input/output | |
13 | +# see that file for details on used format | |
14 | +tagset = sample.cfg | |
15 | + | |
16 | +# | |
17 | +# PROCESSING CHAIN | |
18 | +# | |
19 | + | |
20 | +# list of tools to be executed between reader and writer modules | |
21 | + | |
22 | +# for spejd with preprocessing with dictionary | |
23 | +# (dictionary entries may be multiple - with different names after colon, see below) | |
24 | +# processingChain = dictionary:example_dict spejd | |
25 | + | |
26 | +# spejd preceded with the pantera tagger (Spejd must have pantera support built in) | |
27 | +# processingChain = pantera spejd | |
28 | + | |
29 | +# spejd alone (the default) | |
30 | +processingChain = spejd | |
31 | + | |
32 | +# no tools (only reader and writer) - can be used as format converter | |
33 | +# processingChain = | |
34 | + | |
35 | +# number of threads to use, 0 means autodetect ( = number of detected cpus) | |
36 | +maxThreads = 0 | |
37 | + | |
38 | +# | |
39 | +# INPUT | |
40 | +# | |
41 | + | |
42 | +# inputType: auto|xcesAna|tei|txt | |
43 | +# auto chooses the reader based on the file name / extension: | |
44 | +# - *.txt/*.txt.gz = txt | |
45 | +# - morph.xml/morph.xml.gz = xcesAna | |
46 | +# - ann_segmentation.xml/ann_segmentation.xml.gz = tei, without using morphosyntax | |
47 | +# - ann_morphosyntax.xml/ann_morphosyntax.xml.gz = tei, using morphosyntax | |
48 | +# with txt and tei without morphosyntax the Morfeusz morphological analyzer is used (unless disabled) | |
49 | + | |
50 | +inputType = auto | |
51 | + | |
52 | +# encoding of input files (overrides any XML coding tags!) | |
53 | +# | |
54 | +# note: the acronymsAfter, acronymsBefore options, and contents of various files | |
55 | +# such as ogonkifyFile, morfeusz disambiguation rules or dictionaries must be | |
56 | +# in this encoding too. | |
57 | +inputEncoding = UTF-8 | |
58 | + | |
59 | +# regexp describing names of input files | |
60 | +# to look for when traversing directories given in command line | |
61 | +# does not affect file names given explicitly in command line | |
62 | + | |
63 | +inputFiles = morph\.xml(\.gz)?|.*\.txt(\.gz)?|ann_morphosyntax\.xml(\.gz)? | |
64 | + | |
65 | +# to ignore any disambiguation found in input? | |
66 | + | |
67 | +ignoreDisamb = no | |
68 | + | |
69 | +# Spejd can use XML id attributes available in the input. Sometimes this may cause | |
70 | +# problems (e.g. duplicate id values in the scope of file), so can be turned off | |
71 | +ignoreIDs = no | |
72 | + | |
73 | +# | |
74 | +# OUTPUT | |
75 | +# | |
76 | + | |
77 | +# format of the output file(s): tei|xcesAna|null | |
78 | +# null = for testing only, does not write anything | |
79 | + | |
80 | +outputType = tei | |
81 | + | |
82 | +# can interpretations deleted by Spejd be discarded at will (yes) | |
83 | +# or should be preserved for the final output (no)? | |
84 | + | |
85 | +discardDeleted = yes | |
86 | + | |
87 | +# the suffix to be added to the target file name | |
88 | + | |
89 | +outputSuffix = .xml | |
90 | + | |
91 | +# The core name of the output file. Depending on the output type | |
92 | +# some infixes can be added between it and output suffix. | |
93 | +# Caution: this option replaces the name of the input file. | |
94 | +# With output suffix containing only extension equal to extension of | |
95 | +# the input file (e.g. .xml for the XML input file) spejd will | |
96 | +# overwrite input files with output. | |
97 | +# | |
98 | +# Leave empty or comment out to use the input file name instead. | |
99 | + | |
100 | +outputFilenameCore = ann | |
101 | + | |
102 | +# If set to 'yes' spejd will backup existing output files to <name>.bak | |
103 | + | |
104 | +backupExistingFiles = no | |
105 | + | |
106 | +# apply gzip compression to output? | |
107 | + | |
108 | +compressOutput = yes | |
109 | + | |
110 | +# put <f>'s in single line and omit empty sentences/paragraphs when writing tei? | |
111 | + | |
112 | +compactTeiOutput = no | |
113 | + | |
114 | +# NKJP (National Corpus of Polish) compatibility mode: | |
115 | +# don't write <f name="interps"> and <fs type="lex"> in *_words.xml | |
116 | +# assuming that there can be only one interpretation marked as "correct" for each token. | |
117 | +# | |
118 | +# It is a user task to make sure that there will be no tokens with multiple | |
119 | +# correct interpretations | |
120 | +teiSingleSyntokInterp = yes | |
121 | + | |
122 | +# again, NKJP compatibility: | |
123 | +# place group's heads information inside the <fs type="group"> | |
124 | +# as features instead of marking it as 'type' attribute of group's elements | |
125 | +teiFsGroupHeads = yes | |
126 | + | |
127 | +# for backward compatibility with Spejd 1.2, it is probably easier to parse structures | |
128 | +# written in bottom up order, starting from leaves to the root | |
129 | +# (all entities defined before referencing) | |
130 | +teiBottomUpSyntacticStructures = no | |
131 | + | |
132 | +# DIAGNOSTICS | |
133 | + | |
134 | +# report progress every reportInterval seconds | |
135 | +# use 0 to completely disable progress reports | |
136 | + | |
137 | +reportInterval = 5 | |
138 | + | |
139 | +# more verbose reports? | |
140 | +debug = no | |
141 | + | |
142 | +# mark which rule has deleted an interpretation? | |
143 | + | |
144 | +ruleMarking = no | |
145 | + | |
146 | +# are tag/tagset errors fatal? | |
147 | +# If turned on, Spejd will try its best to output only tags conforming to the tagset, | |
148 | +# but they may be useless. | |
149 | +# This option exists only to preserve compatibility with older versions of Spejd, which | |
150 | +# accepted incorrect rules. Please do not use when developing new grammars. | |
151 | +# | |
152 | +# !!! use at your own risk and don't report crashes when using this option !!! | |
153 | +nonfatalTagErrors = no | |
154 | + | |
155 | +# silence the (nonfatal) tag errors? | |
156 | +muffleTagWarnings = no | |
157 | + | |
158 | +# Disable correctness checks of tags in between rules execution? | |
159 | +# If set, tags can be temporarily incomplete or incorrect, but the usual validation | |
160 | +# before writing is still performed to make sure Spejd will output only | |
161 | +# correct tags. | |
162 | +# Not recommended for developing new grammars. | |
163 | +tagErrorsOnlyOnTheEnd = no | |
164 | + | |
165 | + | |
166 | +###################################################################### | |
167 | +# MODULE-SPECIFIC OPTIONS | |
168 | +###################################################################### | |
169 | + | |
170 | + | |
171 | +# DICTIONARIES | |
172 | + | |
173 | +# list of files containing morphological dictionaries | |
174 | +# to be applied as "dictionary:example_dict" tool to the input | |
175 | +# the format of lines of files is: | |
176 | +# orthographic form,base (lexical) form:tag | |
177 | +# | |
178 | +# or: | |
179 | +# ,base (lexical) form:some_parts_of_tag;condition | |
180 | +# | |
181 | +# In the first variant the orthographical form is used for matching words. | |
182 | +# Tag definition is expanded (it may contain wildcards). | |
183 | +# | |
184 | +# In the second variant orthographic form is omitted. In that case a base form is used to match. | |
185 | +# The tags of existing interpretations which match the base form are corrected/modified | |
186 | +# according to the specified tag. | |
187 | +# This variant allows the tag to be not full/complete, but only specifying some | |
188 | +# of the attributes (some parts). | |
189 | +# This variant also allows specifying conditions on the tag that must | |
190 | +# be met to perform the modification. The condition has the form of a partial tag, just like | |
191 | +# in the "tag" section of the modifying variant. A condition restricts modified interpretations to | |
192 | +# those which have all values of the specified attributes among the specified values. | |
193 | +# If an attribute is omitted in the specification it means that there are no restrictions | |
194 | +# on this attribute value and it can be anything (including absence of value). | |
195 | +# When a condition is empty (that means: there are no restrictions on any attribute), | |
196 | +# a semicolon preceding it can be omitted and the format is: | |
197 | +# ,base (lexical) form:some_parts_of_tag | |
198 | +# | |
199 | +# The above two variants of entries can be mixed. | |
200 | +# All the entries with orthographic form are applied before applying any | |
201 | +# of the entries without orth in the scope of a single 'dictionary:<name>' tool, | |
202 | +# no matter in which file in this list they appear. | |
203 | +# | |
204 | +# The encoding of dictionary files must be the same as inputEncoding. | |
205 | +#dictionary:example_dict = sample_dict lexdictnum | |
206 | + | |
207 | + | |
208 | +# PANTERA CONFIGURATION | |
209 | + | |
210 | +# Pantera can use its own built-in tweaked version of Morfeusz. | |
211 | +# If this option is set, all interpretations set by reader | |
212 | +# or any tools preceding pantera in the toolchain are dropped | |
213 | +panteraDoOwnMorphAnalysis = yes | |
214 | + | |
215 | +# tagset for pantera, leave empty for a default (check pantera documentation for details) | |
216 | +panteraTagsetName = | |
217 | + | |
218 | +# pantera's engine, leave empty for a default (check pantera documentation for details) | |
219 | +panteraEnginePath = | |
220 | + | |
221 | + | |
222 | +# SPEJD SEMANTICS | |
223 | + | |
224 | +# default strategy for matching syntactic entities | |
225 | +# use * for greedy, + for possessive, ? for reluctant | |
226 | + | |
227 | +matchStrategy = * | |
228 | + | |
229 | +# should agree(case,1,2) return true, if both 1 and 2 have no case? | |
230 | + | |
231 | +nullAgreement = no | |
232 | + | |
233 | +# SPEJD FSM INTERNALS | |
234 | + | |
235 | +# number of single-rule automata to be composed together, usually not needed to change | |
236 | +# Rule of thumb: if Spejd consumes much too much memory, it's better to decrease this | |
237 | +# number than to set very low memoryLimit - it gives smaller impact on performance | |
238 | +composeLimit = 150 | |
239 | + | |
240 | +# memory limit in megabytes | |
241 | +# when memory usage exceeds this limit the rarely-used states removal procedure | |
242 | +# (or GC, a garbage collector) is launched | |
243 | +# use as an emergency brake, for standard limit see above. | |
244 | +# The memoryLimit is approximate, actual memory usage may be slightly higher | |
245 | +# (it depends on memory allocator library buffers size) | |
246 | +memoryLimit = 1900 | |
247 | + | |
248 | +# approx. percent of DFA states to leave after the states removal | |
249 | +leavePercent = 80 | |
250 | + | |
251 | +# The definitive limit of normal GC usage. GC removes only complex states, so if there are lots | |
252 | +# of plain states it can't prevent from exceeding memoryLimit. If the percent of complex states | |
253 | +# is less than minComplexPercent, all the DFAs are dropped and they are built from the beginning | |
254 | +# just as if Spejd were restarted. However it does not recompile rules, so it's faster. | |
255 | +minComplexPercent = 10 | |
256 | + | |
257 | + | |
258 | +# A maximal number of unicode characters which can appear in rules compiled to internal regex | |
259 | +# It must be higher than the highest number of values of a single attribute (including | |
260 | +# numeric attributes) and must be higher than a number of unique characters appearing in all rules. | |
261 | +# Setting too high can increase the memory usage. | |
262 | +maxNumberOfValues = 4000 | |
263 | + | |
264 | + | |
265 | +# BUILT-IN MORPHOLOGICAL ANALYZER 'MORFEUSZ' | |
266 | + | |
267 | +# disable Morfeusz completely, useful when some other tool replaces interpretations, e.g. pantera | |
268 | +disableMorfeusz = yes | |
269 | + | |
270 | +# Morfeusz produces ambiguous segmentation, which can be resolved by simple rule-based | |
271 | +# disambiguator. This option specifies a file to load rules from. | |
272 | +# The rule format is described in the example file | |
273 | +# (leave empty for the builtin default, which is actually the example file) | |
274 | + | |
275 | +# The encoding of this file must be the same as inputEncoding. | |
276 | +morfeuszSegmentationDisambiguationRules = segm_disamb.conf | |
277 | + | |
278 | +# PLAIN TEXT READER - GENERAL | |
279 | + | |
280 | +# mock xml:id for the whole text input referred from the output in string-range notation | |
281 | +# (in TEI output it appears in *segmentation.xml) | |
282 | +stringRangeMockID = p-1 | |
283 | + | |
284 | +# PLAIN TEXT READER - SENTENCER | |
285 | + | |
286 | +# list of acronyms - | |
287 | +# if a dot is found after one of them, it is not a sentence break | |
288 | + | |
289 | +acronymsAfter = prof|dr|mgr|doc|ul|np|godz|gen|płk|mjr|por|tzw|tzn|proc|nt|art|ust|ww|www|ws|dz | |
290 | + | |
291 | +# list of acronyms (actually top level domain names) - | |
292 | +# if a dot is found before one of them, it is not a sentence break | |
293 | + | |
294 | +acronymsBefore = ac|ad|ae|aero|af|ag|ai|al|am|an|ao|aq|ar|arpa|as|asia|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|biz|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cat|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|com|coop|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|info|int|io|iq|ir|is|it|je|jm|jo|jobs|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mil|mk|ml|mm|mn|mo|mobi|mp|mq|mr|ms|mt|mu|museum|mv|mw|mx|my|mz|na|name|nc|ne|net|nf|ng|ni|nl|no|np|nr|nu|nz|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pro|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|travel|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw | |
295 | + | |
296 | + | |
297 | +# PLAIN TEXT READER - OGONKIFIER | |
298 | + | |
299 | +# name of file with ogonkify (diacritic completion) substitutions | |
300 | +# the format is: | |
301 | +# <letter without diacritics>=<list of possible letters with diacritics separated by '|'> | |
302 | +# see the example ogonkifier.ini | |
303 | + | |
304 | +# The encoding of this file must be the same as inputEncoding. | |
305 | +#ogonkifyFile = ogonkifier.ini | |
306 | + | |
307 | +# when to use ogonkifier: | |
308 | +# A - Always, | |
309 | +# N - Never, | |
310 | +# M - only when the Morphological analyzer fails to analyse a word | |
311 | + | |
312 | +ogonkifyStrategy = N | |
313 | + | |
314 | +# min and max length of words to ogonkify | |
315 | + | |
316 | +ogonkifyMinLength = 3 | |
317 | +ogonkifyMaxLength = 13 | |
318 | + | |
319 | + | |
320 | + | |
... | ... |