#
# FILES LOCATION
# all paths in this file are relative to the location of this config file,
# except for absolute paths
# (on UNIX systems starting with '/', on Windows starting with '<letter>:\' or '\')
#

# a file containing Spejd's grammar
# that example file also explains the Spejd rule syntax
rules = rules.sr

# tagset used in grammar and input/output
# see that file for details on the format used
tagset = sample.cfg

#
# PROCESSING CHAIN
#

# list of tools to be executed between reader and writer modules

# for Spejd with dictionary-based preprocessing
# (multiple dictionary tools may be defined, each with a different name after the colon; see below)
# processingChain = dictionary:example_dict spejd

# Spejd preceded by the Pantera tagger (Spejd must be built with Pantera support)
# processingChain = pantera spejd

# spejd alone (the default)
processingChain = spejd

# no tools (only reader and writer) - can be used as a format converter
# processingChain = 

# number of threads to use; 0 means autodetect (= number of detected CPUs)
maxThreads = 0

#
# INPUT
#

# inputType: auto|xcesAna|tei|txt
# auto chooses the reader based on the file name / extension:
#   - *.txt/*.txt.gz = txt
#   - morph.xml/morph.xml.gz = xcesAna
#   - ann_segmentation.xml/ann_segmentation.xml.gz = tei, without using morphosyntax
#   - ann_morphosyntax.xml/ann_morphosyntax.xml.gz = tei, using morphosyntax
# for txt input and for tei input without morphosyntax, the Morfeusz morphological analyzer is used (unless disabled)

inputType = auto

# encoding of input files (overrides any XML encoding declarations!)
#
# note: the acronymsAfter and acronymsBefore options, and the contents of various files
# such as ogonkifyFile, the Morfeusz disambiguation rules, or dictionaries, must be
# in this encoding too.
inputEncoding = UTF-8

# regexp describing the names of input files
# to look for when traversing directories given on the command line;
# it does not affect file names given explicitly on the command line

inputFiles = morph\.xml(\.gz)?|.*\.txt(\.gz)?|ann_morphosyntax\.xml(\.gz)?

# should any disambiguation found in the input be ignored?

ignoreDisamb = no

# Spejd can use XML id attributes available in the input. Sometimes this may cause
# problems (e.g. duplicate id values within a single file), so it can be turned off
ignoreIDs = no

#
# OUTPUT
#

# format of the output file(s): tei|xcesAna|null
#       null = for testing only, does not write anything 

outputType = tei

# can interpretations deleted by Spejd be discarded at will (yes),
# or should they be preserved in the final output (no)?

discardDeleted = yes

# the suffix to be added to the target file name

outputSuffix = .xml

# The core name of the output file. Depending on the output type,
# some infixes can be added between it and the output suffix.
# Caution: this option replaces the name of the input file.
# If the output suffix consists only of an extension equal to the extension of
# the input file (e.g. .xml for an XML input file), Spejd will
# overwrite the input files with its output.
# 
# Leave empty or comment out to use the input file name instead.

outputFilenameCore = ann
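
# For illustration only (the exact infixes depend on the output type and may differ):
# with outputFilenameCore = ann, outputSuffix = .xml and TEI output, files such as
# ann_segmentation.xml and ann_morphosyntax.xml may be produced
# (with a .gz suffix added when compressOutput is on).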

# If set to 'yes', Spejd will back up existing output files to <name>.bak

backupExistingFiles = no

# apply gzip compression to output?

compressOutput = yes

# put <f> elements on a single line and omit empty sentences/paragraphs when writing TEI?

compactTeiOutput = no

# NKJP (National Corpus of Polish) compatibility mode:
# don't write <f name="interps"> and <fs type="lex"> in *_words.xml,
# assuming that there can be only one interpretation marked as "correct" for each token.
# 
# It is the user's task to make sure that there are no tokens with multiple
# correct interpretations
teiSingleSyntokInterp = yes

# again, NKJP compatibility:
# place the group head information inside <fs type="group">
# as features, instead of marking it with the 'type' attribute on the group's elements
teiFsGroupHeads = yes

# for backward compatibility with Spejd 1.2; it is probably easier to parse structures
# written in bottom-up order, from leaves to the root
# (all entities are defined before being referenced)
teiBottomUpSyntacticStructures = no

# DIAGNOSTICS

# report progress every reportInterval seconds
# use 0 to completely disable progress reports

reportInterval = 5

# more verbose reports?
debug = no

# mark which rule has deleted an interpretation?

ruleMarking = no

# should tag/tagset errors be non-fatal?
# If turned on, Spejd will try its best to output only tags conforming to the tagset,
# but they may be useless.
# This option exists only to preserve compatibility with older versions of Spejd, which
# accepted incorrect rules. Please do not use it when developing new grammars.
#
# !!! use at your own risk and don't report crashes when using this option !!!
nonfatalTagErrors = no

# should the (non-fatal) tag errors be silenced?
muffleTagWarnings = no

# Disable correctness checks of tags between rule executions?
# If set, tags can be temporarily incomplete or incorrect, but the usual validation
# before writing is still performed to make sure Spejd outputs only
# correct tags.
# Not recommended for developing new grammars.
tagErrorsOnlyOnTheEnd = no


######################################################################
# MODULE-SPECIFIC OPTIONS
######################################################################


# DICTIONARIES

# list of files containing morphological dictionaries
# to be applied to the input as the "dictionary:example_dict" tool
# the format of a dictionary file line is:
# orthographic form,base (lexical) form:tag
#
# or:
# ,base (lexical) form:some_parts_of_tag;condition
#
# In the first variant the orthographic form is used for matching words.
# The tag definition is expanded (it may contain wildcards).
#
# In the second variant the orthographic form is omitted. In that case the base form is used for matching.
# The tags of existing interpretations which match the base form are corrected/modified
# according to the specified tag.
# This variant allows the tag to be incomplete, specifying only some
# of the attributes (some parts).
# This variant also allows specifying conditions on the tag that must
# be met to perform the modification. The condition has the form of a partial tag, just like
# the "tag" part of the modifying variant. A condition restricts the modified interpretations to
# those which have all values of the specified attributes among the specified values.
# If an attribute is omitted in the specification, there are no restrictions
# on the value of this attribute and it can be anything (including absence of a value).
# When a condition is empty (that is, there are no restrictions on any attribute),
# the semicolon preceding it can be omitted and the format is:
# ,base (lexical) form:some_parts_of_tag
#
# The above two variants of entries can be mixed.
# Within the scope of a single 'dictionary:<name>' tool, all entries with an orthographic
# form are applied before any of the entries without one,
# no matter in which file of this list they appear.
#
# The encoding of dictionary files must be the same as inputEncoding.
#dictionary:example_dict = sample_dict lexdictnum
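
# A purely illustrative sketch of dictionary file contents (hypothetical word forms;
# real tags must conform to the tagset configured above):
#   psa,pies:subst:sg:gen:m2   - orth-based entry: adds this interpretation to tokens "psa"
#   ,pies:m2;subst             - base-form entry: for interpretations with base "pies" that
#                                match the partial tag "subst", correct the tag towards "m2"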


# PANTERA CONFIGURATION

# Pantera can use its own built-in, tweaked version of Morfeusz.
# If this option is set, all interpretations set by the reader
# or by any tools preceding Pantera in the processing chain are dropped
panteraDoOwnMorphAnalysis = yes

# tagset name for Pantera; leave empty for the default (see the Pantera documentation for details)
panteraTagsetName =

# path to Pantera's engine; leave empty for the default (see the Pantera documentation for details)
panteraEnginePath =


# SPEJD SEMANTICS

# default strategy for matching syntactic entities
# use * for greedy, + for possessive, ? for reluctant

matchStrategy = *
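# commented-out alternatives, using the values listed above:
# matchStrategy = +
# matchStrategy = ?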

# should agree(case,1,2) return true if both 1 and 2 have no case?

nullAgreement = no

# SPEJD FSM INTERNALS

# number of single-rule automata to be composed together; usually there is no need to change it
# Rule of thumb: if Spejd consumes far too much memory, it is better to decrease this
# number than to set a very low memoryLimit - this has a smaller impact on performance
composeLimit = 150

# memory limit in megabytes
# when memory usage exceeds this limit, the rarely-used-states removal procedure
# (or GC, a garbage collector) is launched
# use it as an emergency brake; for the standard limit see composeLimit above.
# The memoryLimit is approximate; actual memory usage may be slightly higher
# (it depends on the memory allocator's buffer sizes)
memoryLimit = 1900

# approximate percentage of DFA states to keep after state removal
leavePercent = 80

# The hard limit of normal GC usage. The GC removes only complex states, so if there are lots
# of plain states it cannot prevent memoryLimit from being exceeded. If the percentage of complex
# states is less than minComplexPercent, all the DFAs are dropped and rebuilt from the beginning,
# just as if Spejd had been restarted. However, the rules are not recompiled, so this is faster.
minComplexPercent = 10


# The maximal number of Unicode characters which can appear in rules compiled to the internal regex.
# It must be higher than the largest number of values of any single attribute (including
# numeric attributes) and higher than the number of unique characters appearing in all rules.
# Setting it too high can increase memory usage.
maxNumberOfValues = 4000


# BUILT-IN MORPHOLOGICAL ANALYZER 'MORFEUSZ'

# disable Morfeusz completely; useful when some other tool replaces the interpretations, e.g. Pantera
disableMorfeusz = yes

# Morfeusz produces ambiguous segmentation, which can be resolved by a simple rule-based
# disambiguator. This option specifies the file to load the rules from.
# The rule format is described in the example file
# (leave empty for the built-in default, which is actually the example file)

# The encoding of this file must be the same as inputEncoding.
morfeuszSegmentationDisambiguationRules = segm_disamb.conf

# PLAIN TEXT READER - GENERAL

# mock xml:id for the whole text input, referred to from the output in string-range notation
# (in TEI output it appears in *segmentation.xml)
stringRangeMockID = p-1
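# Illustrative only (the exact reference syntax is determined by the writer): with the
# default value, a segment covering the first five characters of a plain text input
# could be pointed at as string-range(p-1,0,5) in the segmentation layer.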

# PLAIN TEXT READER - SENTENCER

# list of acronyms - 
# if a dot is found after one of them, it is not a sentence break

acronymsAfter  = prof|dr|mgr|doc|ul|np|godz|gen|płk|mjr|por|tzw|tzn|proc|nt|art|ust|ww|www|ws|dz

# list of acronyms (actually top level domain names) - 
# if a dot is found before one of them, it is not a sentence break

acronymsBefore = ac|ad|ae|aero|af|ag|ai|al|am|an|ao|aq|ar|arpa|as|asia|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|biz|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cat|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|com|coop|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|info|int|io|iq|ir|is|it|je|jm|jo|jobs|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mil|mk|ml|mm|mn|mo|mobi|mp|mq|mr|ms|mt|mu|museum|mv|mw|mx|my|mz|na|name|nc|ne|net|nf|ng|ni|nl|no|np|nr|nu|nz|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pro|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|travel|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw


# PLAIN TEXT READER - OGONKIFIER

# name of the file with ogonkify (diacritic completion) substitutions
# the format is:
# <letter without diacritics>=<list of possible letters with diacritics, separated by '|'>
# see the example ogonkifier.ini

# The encoding of this file must be the same as inputEncoding.
#ogonkifyFile     = ogonkifier.ini
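
# A hypothetical fragment of such a file, following the format described above
# (Polish diacritic completions; the real ogonkifier.ini may differ):
#   a=ą
#   o=ó
#   z=ż|ź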

# when to use the ogonkifier:
# A - Always,
# N - Never,
# M - only when the morphological analyzer fails to analyse a word

ogonkifyStrategy = N

# min and max length of words to ogonkify

ogonkifyMinLength = 3
ogonkifyMaxLength = 13