Commit 46a46821b37bc5e11065304c0e9494a667a66c25

Authored by Bartłomiej Nitoń
1 parent 8f86545e

Added preferable spejd config to resources directory.

Too many changes to show.

To preserve performance only 1 of 3 files are displayed.

src/main/resources/spejd_config/config.ini 0 → 100644
  1 +#
  2 +# FILES LOCATION
  3 +# all paths in this file are relative to location of this config file,
  4 +# except for absolute paths
  5 +# (in UNIXes starting from '/', in Windows starting from '<letter>:\' or '\')
  6 +#
  7 +
  8 +# a file containing Spejd's grammar
  9 +# in this example file you can find the Spejd rules syntax explained
  10 +rules = rules.sr
  11 +
  12 +# tagset used in grammar and input/output
  13 +# see that file for details on used format
  14 +tagset = sample.cfg
  15 +
  16 +#
  17 +# PROCESSING CHAIN
  18 +#
  19 +
  20 +# list of tools to be executed between reader and writer modules
  21 +
  22 +# for spejd with preprocessing with dictionary
  23 +# (dictionary entries may be multiple - with different names after colon, see below)
  24 +# processingChain = dictionary:example_dict spejd
  25 +
  26 +# spejd preceded with the pantera tagger (Spejd must have pantera support built in)
  27 +# processingChain = pantera spejd
  28 +
  29 +# spejd alone (the default)
  30 +processingChain = spejd
  31 +
  32 +# no tools (only reader and writer) - can be used as format converter
  33 +# processingChain =
  34 +
  35 +# number of threads to use, 0 means autodetect ( = number of detected cpus)
  36 +maxThreads = 0
  37 +
  38 +#
  39 +# INPUT
  40 +#
  41 +
  42 +# inputType: auto|xcesAna|tei|txt
  43 +# auto chooses the reader based on the file name / extension:
  44 +# - *.txt/*.txt.gz = txt
  45 +# - morph.xml/morph.xml.gz = xcesAna
  46 +# - ann_segmentation.xml/ann_segmentation.xml.gz = tei, without using morphosyntax
  47 +# - ann_morphosyntax.xml/ann_morphosyntax.xml.gz = tei, using morphosyntax
  48 +# with txt and tei without morphosyntax the Morfeusz morphological analyzer is used (unless disabled)
  49 +
  50 +inputType = auto
  51 +
  52 +# encoding of input files (overrides any XML coding tags!)
  53 +#
  54 +# note: the acronymsAfter, acronymsBefore options, and contents of various files
  55 +# such as ogonkifyFile, morfeusz disambiguation rules or dictionaries must be
  56 +# in this encoding too.
  57 +inputEncoding = UTF-8
  58 +
  59 +# regexp describing names of input files
  60 +# to look for when traversing directories given in command line
  61 +# does not affect file names given explicitly in command line
  62 +
  63 +inputFiles = morph\.xml(\.gz)?|.*\.txt(\.gz)?|ann_morphosyntax\.xml(\.gz)?
  64 +
  65 +# to ignore any disambiguation found in input?
  66 +
  67 +ignoreDisamb = no
  68 +
  69 +# Spejd can use XML id attributes available in the input. Sometimes this may cause
  70 +# problems (e.g. duplicate id values in the scope of file), so can be turned off
  71 +ignoreIDs = no
  72 +
  73 +#
  74 +# OUTPUT
  75 +#
  76 +
  77 +# format of the output file(s): tei|xcesAna|null
  78 +# null = for testing only, does not write anything
  79 +
  80 +outputType = tei
  81 +
  82 +# can interpretations deleted by Spejd be discarded at will (yes)
  83 +# or should be preserved for the final output (no)?
  84 +
  85 +discardDeleted = yes
  86 +
  87 +# the suffix to be added to the target file name
  88 +
  89 +outputSuffix = .xml
  90 +
  91 +# The core name of the output file. Depending on the output type
  92 +# some infixes can be added between it and output suffix.
  93 +# Caution: this option replaces the name of the input file.
  94 +# With output suffix containing only extension equal to extension of
  95 +# the input file (e.g. .xml for the XML input file) spejd will
  96 +# overwrite input files with output.
  97 +#
  98 +# Leave empty or comment out to use the input file name instead.
  99 +
  100 +outputFilenameCore = ann
  101 +
  102 +# If set to 'yes' spejd will backup existing output files to <name>.bak
  103 +
  104 +backupExistingFiles = no
  105 +
  106 +# apply gzip compression to output?
  107 +
  108 +compressOutput = yes
  109 +
  110 +# put <f>'s in single line and omit empty sentences/paragraphs when writing tei?
  111 +
  112 +compactTeiOutput = no
  113 +
  114 +# NKJP (National Corpus of Polish) compatibility mode:
  115 +# don't write <f name="interps"> and <fs type="lex"> in *_words.xml
  116 +# assuming that there can be only one interpretation marked as "correct" for each token.
  117 +#
  118 +# It is a user task to make sure that there will be no tokens with multiple
  119 +# correct interpretations
  120 +teiSingleSyntokInterp = yes
  121 +
  122 +# again, NKJP compatibility:
  123 +# place group's heads information inside the <fs type="group">
  124 +# as features instead of marking it as 'type' attribute of group's elements
  125 +teiFsGroupHeads = yes
  126 +
  127 +# for backward compatibility with Spejd 1.2, it is probably easier to parse structures
  128 +# written in bottom up order, starting from leafs to the root
  129 +# (all entities defined before referencing)
  130 +teiBottomUpSyntacticStructures = no
  131 +
  132 +# DIAGNOSTICS
  133 +
  134 +# report progress every reportInterval seconds
  135 +# use 0 to completely disable progress reports
  136 +
  137 +reportInterval = 5
  138 +
  139 +# more verbose reports?
  140 +debug = no
  141 +
  142 +# mark which rule has deleted an interpretation?
  143 +
  144 +ruleMarking = no
  145 +
  146 +# are tag/tagset errors fatal?
  147 +# If turned on, Spejd will try its best to output only tags conforming to the tagset,
  148 +# but they may be useless.
  149 +# This option exists only to preserve compatibility with older versions of Spejd, which
  150 +# accepted incorrect rules. Please do not use when developing new grammars.
  151 +#
  152 +# !!! use at your own risk and don't report crashes when using this option !!!
  153 +nonfatalTagErrors = no
  154 +
  155 +# whether to silence the (nonfatal) tag errors?
  156 +muffleTagWarnings = no
  157 +
  158 +# Disable correctness checks of tags in between rules execution?
  159 +# If set, tags can be temporarily incomplete or incorrect, but the usual validation
  160 +# before writing is still performed to make sure Spejd will output only
  161 +# correct tags.
  162 +# Not recommended for developing new grammars.
  163 +tagErrorsOnlyOnTheEnd = no
  164 +
  165 +
  166 +######################################################################
  167 +# MODULE-SPECIFIC OPTIONS
  168 +######################################################################
  169 +
  170 +
  171 +# DICTIONARIES
  172 +
  173 +# list of files containing morphological dictionaries
  174 +# to be applied as "dictionary:example_dict" tool to the input
  175 +# the format of lines of files is:
  176 +# orthographic form,base (lexical) form:tag
  177 +#
  178 +# or:
  179 +# ,base (lexical) form:some_parts_of_tag;condition
  180 +#
  181 +# In the first variant the orthographical form is used for matching words.
  182 +# Tag definition is expanded (it may contain wildcards).
  183 +#
  184 +# In the second variant orthographic form is omitted. In that case a base form is used to match.
  185 +# The tags of existing interpretations which match the base form are corrected/modified
  186 +# according to the specified tag.
  187 +# This variant allows the tag to be not full/complete, but only specifying some
  188 +# of the attributes (some parts).
  189 +# This variant also allows specifying conditions on the tag that must
  190 +# be met to perform the modification. The condition has the form of a partial tag, just like
  191 +# in the "tag" section of the modifying variant. A condition restricts modified interpretations to
  192 +# those which have all values of the specified attributes among the specified values.
  193 +# If an attribute is omitted in the specification it means that there are no restrictions
  194 +# on this attribute value and it can be anything (including absence of value).
  195 +# When a condition is empty (that means: there are no restrictions on any attribute),
  196 +# a semicolon preceding it can be omitted and the format is:
  197 +# ,base (lexical) form:some_parts_of_tag
  198 +#
  199 +# The above two variants of entries can be mixed.
  200 +# All the entries with orthographic form are applied before applying any
  201 +# of the entries without orth in the scope of a single 'dictionary:<name>' tool,
  202 +# no matter in which file in this list they appear.
  203 +#
  204 +# The encoding of dictionary files must be the same as inputEncoding.
  205 +#dictionary:example_dict = sample_dict lexdictnum
  206 +
  207 +
  208 +# PANTERA CONFIGURATION
  209 +
  210 +# Pantera can use its own built-in tweaked version of Morfeusz.
  211 +# If this option is set, all interpretations set by reader
  212 +# or any tools preceding pantera in the toolchain are dropped
  213 +panteraDoOwnMorphAnalysis = yes
  214 +
  215 +# tagset for pantera, leave empty for a default (check pantera documentation for details)
  216 +panteraTagsetName =
  217 +
  218 +# pantera's engine, leave empty for a default (check pantera documentation for details)
  219 +panteraEnginePath =
  220 +
  221 +
  222 +# SPEJD SEMANTICS
  223 +
  224 +# default strategy for matching syntactic entities
  225 +# use * for greedy, + for possessive, ? for reluctant
  226 +
  227 +matchStrategy = *
  228 +
  229 +# should agree(case,1,2) return true, if both 1 and 2 have no case?
  230 +
  231 +nullAgreement = no
  232 +
  233 +# SPEJD FSM INTERNALS
  234 +
  235 +# number of single-rule automata to be composed together, usually not needed to change
  236 +# Rule of thumb: if Spejd consumes much too much memory, it's better to decrease this
  237 +# number than to set very low memoryLimit - it gives smaller impact on performance
  238 +composeLimit = 150
  239 +
  240 +# memory limit in megabytes
  241 +# when memory usage exceeds this limit the rarely-used states removal procedure
  242 +# (or GC, a garbage collector) is launched
  243 +# use as an emergency brake, for standard limit see above.
  244 +# The memoryLimit is approximate, actual memory usage may be slightly higher
  245 +# (it depends on memory allocator library buffers size)
  246 +memoryLimit = 1900
  247 +
  248 +# approx. percent of DFA states to leave after the states removal
  249 +leavePercent = 80
  250 +
  251 +# The definitive limit of normal GC usage. GC removes only complex states, so if there are lots
  252 +# of plain states it can't prevent memoryLimit from being exceeded. If the percent of complex states
  253 +# is less than minComplexPercent, all the DFAs are dropped and they are built from the beginning
  254 +# just as if Spejd had been restarted. However it does not recompile rules, so it's faster.
  255 +minComplexPercent = 10
  256 +
  257 +
  258 +# A maximal number of unicode characters which can appear in rules compiled to internal regex
  259 +# It must be higher than the highest number of values of a single attribute (including
  260 +# numeric attributes) and must be higher than a number of unique characters appearing in all rules.
  261 +# Setting too high can increase the memory usage.
  262 +maxNumberOfValues = 4000
  263 +
  264 +
  265 +# BUILT-IN MORPHOLOGICAL ANALYZER 'MORFEUSZ'
  266 +
  267 +# disable Morfeusz completely, useful when some other tool replaces interpretations, e.g. pantera
  268 +disableMorfeusz = yes
  269 +
  270 +# Morfeusz produces ambiguous segmentation, which can be resolved by simple rule-based
  271 +# disambiguator. This option specifies a file to load rules from.
  272 +# The rule format is described in the example file
  273 +# (leave empty for the builtin default, which is actually the example file)
  274 +
  275 +# The encoding of this file must be the same as inputEncoding.
  276 +morfeuszSegmentationDisambiguationRules = segm_disamb.conf
  277 +
  278 +# PLAIN TEXT READER - GENERAL
  279 +
  280 +# mock xml:id for the whole text input referred from the output in string-range notation
  281 +# (in TEI output it appears in *segmentation.xml)
  282 +stringRangeMockID = p-1
  283 +
  284 +# PLAIN TEXT READER - SENTENCER
  285 +
  286 +# list of acronyms -
  287 +# if a dot is found after one of them, it is not a sentence break
  288 +
  289 +acronymsAfter = prof|dr|mgr|doc|ul|np|godz|gen|płk|mjr|por|tzw|tzn|proc|nt|art|ust|ww|www|ws|dz
  290 +
  291 +# list of acronyms (actually top level domain names) -
  292 +# if a dot is found before one of them, it is not a sentence break
  293 +
  294 +acronymsBefore = ac|ad|ae|aero|af|ag|ai|al|am|an|ao|aq|ar|arpa|as|asia|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|biz|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cat|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|com|coop|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|info|int|io|iq|ir|is|it|je|jm|jo|jobs|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mil|mk|ml|mm|mn|mo|mobi|mp|mq|mr|ms|mt|mu|museum|mv|mw|mx|my|mz|na|name|nc|ne|net|nf|ng|ni|nl|no|np|nr|nu|nz|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pro|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|travel|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw
  295 +
  296 +
  297 +# PLAIN TEXT READER - OGONKIFIER
  298 +
  299 +# name of file with ogonkify (diacritic completion) substitutions
  300 +# the format is:
  301 +# <letter without diacritics>=<list of possible letters with diacritics separated by '|'>
  302 +# see the example ogonkifier.ini
  303 +
  304 +# The encoding of this file must be the same as inputEncoding.
  305 +#ogonkifyFile = ogonkifier.ini
  306 +
  307 +# when to use ogonkifier:
  308 +# A - Always,
  309 +# N - Never,
  310 +# M - only when the Morphological analyzer fails to analyse a word
  311 +
  312 +ogonkifyStrategy = N
  313 +
  314 +# min and max length of words to ogonkify
  315 +
  316 +ogonkifyMinLength = 3
  317 +ogonkifyMaxLength = 13
  318 +
  319 +
  320 +
... ...