Added preferable spejd config to resources directory. (46a46821) | Commits | core / md

Browse Code »

Commit 46a46821b37bc5e11065304c0e9494a667a66c25

Authored by Bartłomiej Nitoń 8 years ago

1 parent 8f86545e

Added preferable spejd config to resources directory.

Inline Side-by-side

Showing 3 changed files with 8533 additions and 0 deletions

Too many changes to show.
Reload with full diff Plain diff Email patch

To preserve performance only 1 of 3 files are displayed.

src/main/resources/spejd_config/config.ini 0 → 100644

View file @46a4682

	1	+#
	2	+# FILES LOCATION
	3	+# all paths in this file are relative to location of this config file,
	4	+# except for absolute paths
	5	+# (in UNIXes starting from '/', in Windows starting from '<letter>:\' or '\')
	6	+#
	7	+
	8	+# a file containing Spejd's grammar
	9	+# in this example file you can find the Spejd rules syntax explained
	10	+rules = rules.sr
	11	+
	12	+# tagset used in grammar and input/output
	13	+# see that file for details on used format
	14	+tagset = sample.cfg
	15	+
	16	+#
	17	+# PROCESSING CHAIN
	18	+#
	19	+
	20	+# list of tools to be executed between reader and writer modules
	21	+
	22	+# for spejd with preprocessing with dictionary
	23	+# (dictionary entries may be multiple - with different names after colon, see below)
	24	+# processingChain = dictionary:example_dict spejd
	25	+
	26	+# spejd preceded with the pantera tagger (Spejd must have pantera support built in)
	27	+# processingChain = pantera spejd
	28	+
	29	+# spejd alone (the default)
	30	+processingChain = spejd
	31	+
	32	+# no tools (only reader and writer) - can be used as format converter
	33	+# processingChain =
	34	+
	35	+# number of threads to use, 0 means autodetect ( = number of detected cpus)
	36	+maxThreads = 0
	37	+
	38	+#
	39	+# INPUT
	40	+#
	41	+
	42	+# inputType: auto\|xcesAna\|tei\|txt
	43	+# auto chooses the reader based on the file name / extension:
	44	+# - .txt/.txt.gz = txt
	45	+# - morph.xml/morph.xml.gz = xcesAna
	46	+# - ann_segmentation.xml/ann_segmentation.xml.gz = tei, without using morphosyntax
	47	+# - ann_morphosyntax.xml/ann_morphosyntax.xml.gz = tei, using morphosyntax
	48	+# with txt and tei without morphosyntax the Morfeusz morphological analyzer is used (unless disabled)
	49	+
	50	+inputType = auto
	51	+
	52	+# encoding of input files (overrides any XML coding tags!)
	53	+#
	54	+# note: the acronymsAfter, acronymsBefore options, and contents of various files
	55	+# such as ogonkifyFile, morfeusz disambiguation rules or dictionaries must be
	56	+# in this encoding too.
	57	+inputEncoding = UTF-8
	58	+
	59	+# regexp describing names of input files
	60	+# to look for when traversing directories given in command line
	61	+# does not affect file names given explicitly in command line
	62	+
	63	+inputFiles = morph\.xml(\.gz)?\|.*\.txt(\.gz)?\|ann_morphosyntax\.xml(\.gz)?
	64	+
	65	+# to ignore any disambiguation found in input?
	66	+
	67	+ignoreDisamb = no
	68	+
	69	+# Spejd can use XML id attributes available in the input. Sometimes this may cause
	70	+# problems (e.g. duplicate id values in the scope of file), so can be turned off
	71	+ignoreIDs = no
	72	+
	73	+#
	74	+# OUTPUT
	75	+#
	76	+
	77	+# format of the output file(s): tei\|xcesAna\|null
	78	+# null = for testing only, does not write anything
	79	+
	80	+outputType = tei
	81	+
	82	+# can interpretations deleted by Spejd be discarded at will (yes)
	83	+# or should be preserved for the final output (no)?
	84	+
	85	+discardDeleted = yes
	86	+
	87	+# the suffix to be added to the target file name
	88	+
	89	+outputSuffix = .xml
	90	+
	91	+# The core name of the output file. Depending on the output type
	92	+# some infixes can be added between it and output suffix.
	93	+# Caution: this option replaces the name of the input file.
	94	+# With output suffix containing only extension equal to extension of
	95	+# the input file (e.g. .xml for the XML input file) spejd will
	96	+# overwrite input files with output.
	97	+#
	98	+# Leave empty or comment out to use the input file name instead.
	99	+
	100	+outputFilenameCore = ann
	101	+
	102	+# If set to 'yes' spejd will backup existing output files to <name>.bak
	103	+
	104	+backupExistingFiles = no
	105	+
	106	+# apply gzip compression to output?
	107	+
	108	+compressOutput = yes
	109	+
	110	+# put <f>'s in single line and omit empty sentences/paragraphs when writing tei?
	111	+
	112	+compactTeiOutput = no
	113	+
	114	+# NKJP (National Corpus of Polish) compatibility mode:
	115	+# don't write <f name="interps"> and <fs type="lex"> in *_words.xml
	116	+# assuming that there can be only one interpretation marked as "correct" for each token.
	117	+#
	118	+# It is a user task to make sure that there will be no tokens with multiple
	119	+# correct interpretations
	120	+teiSingleSyntokInterp = yes
	121	+
	122	+# again, NKJP compatibility:
	123	+# place group's heads information inside the <fs type="group">
	124	+# as features instead of marking it as 'type' attribute of group's elements
	125	+teiFsGroupHeads = yes
	126	+
	127	+# for backward compatibility with Spejd 1.2, it is probably easier to parse structures
	128	+# written in bottom up order, starting from leafs to the root
	129	+# (all entities defined before referencing)
	130	+teiBottomUpSyntacticStructures = no
	131	+
	132	+# DIAGNOSTICS
	133	+
	134	+# report progress every reportInterval seconds
	135	+# use 0 to completely disable progress reports
	136	+
	137	+reportInterval = 5
	138	+
	139	+# more verbose reports?
	140	+debug = no
	141	+
	142	+# mark which rule has deleted an interpretation?
	143	+
	144	+ruleMarking = no
	145	+
	146	+# are tag/tagset errors fatal?
	147	+# If turned on, Spejd will try its best to output only tags conforming to the tagset,
	148	+# but they may be useless.
	149	+# This option exists only to preserve compatibility with older versions of Spejd, which
	150	+# accepted incorrect rules. Please do not use when developing new grammars.
	151	+#
	152	+# !!! use at your own risk and don't report crashes when using this option !!!
	153	+nonfatalTagErrors = no
	154	+
	155	+# whether to silence the (nonfatal) tag errors
	156	+muffleTagWarnings = no
	157	+
	158	+# Disable correctness checks of tags in between rules execution?
	159	+# If set, tags can be temporarily incomplete or incorrect, but the usual validation
	160	+# before writing is still performed to make sure Spejd will output only
	161	+# correct tags.
	162	+# Not recommended for developing new grammars.
	163	+tagErrorsOnlyOnTheEnd = no
	164	+
	165	+
	166	+######################################################################
	167	+# MODULE-SPECIFIC OPTIONS
	168	+######################################################################
	169	+
	170	+
	171	+# DICTIONARIES
	172	+
	173	+# list of files containing morphological dictionaries
	174	+# to be applied as "dictionary:example_dict" tool to the input
	175	+# the format of lines of files is:
	176	+# orthographic form,base (lexical) form:tag
	177	+#
	178	+# or:
	179	+# ,base (lexical) form:some_parts_of_tag;condition
	180	+#
	181	+# In the first variant the orthographical form is used for matching words.
	182	+# Tag definition is expanded (it may contain wildcards).
	183	+#
	184	+# In the second variant the orthographic form is omitted. In that case the base form is used to match.
	185	+# The tags of existing interpretations which match the base form are corrected/modified
	186	+# according to the specified tag.
	187	+# This variant allows the tag to be not full/complete, but only specifying some
	188	+# of the attributes (some parts).
	189	+# This variant also allows specifying conditions on the tag that must
	190	+# be met to perform the modification. The condition has the form of a partial tag, just like
	191	+# in the "tag" section of the modifying variant. A condition restricts modified interpretations to
	192	+# those which have all values of the specified attributes among the specified values.
	193	+# If an attribute is omitted in the specification it means that there are no restrictions
	194	+# on this attribute value and it can be anything (including absence of value).
	195	+# When a condition is empty (that means: there are no restrictions on any attribute),
	196	+# a semicolon preceding it can be omitted and the format is:
	197	+# ,base (lexical) form:some_parts_of_tag
	198	+#
	199	+# The above two variants of entries can be mixed.
	200	+# All the entries with orthographic form are applied before applying any
	201	+# of the entries without orth in the scope of a single 'dictionary:<name>' tool,
	202	+# no matter in which file in this list they appear.
	203	+#
	204	+# The encoding of dictionary files must be the same as inputEncoding.
	205	+#dictionary:example_dict = sample_dict lexdictnum
	206	+
	207	+
	208	+# PANTERA CONFIGURATION
	209	+
	210	+# Pantera can use its own built-in tweaked version of Morfeusz.
	211	+# If this option is set, all interpretations set by reader
	212	+# or any tools preceding pantera in the toolchain are dropped
	213	+panteraDoOwnMorphAnalysis = yes
	214	+
	215	+# tagset for pantera, leave empty for a default (check pantera documentation for details)
	216	+panteraTagsetName =
	217	+
	218	+# pantera's engine, leave empty for a default (check pantera documentation for details)
	219	+panteraEnginePath =
	220	+
	221	+
	222	+# SPEJD SEMANTICS
	223	+
	224	+# default strategy for matching syntactic entities
	225	+# use * for greedy, + for possessive, ? for reluctant
	226	+
	227	+matchStrategy = *
	228	+
	229	+# should agree(case,1,2) return true, if both 1 and 2 have no case?
	230	+
	231	+nullAgreement = no
	232	+
	233	+# SPEJD FSM INTERNALS
	234	+
	235	+# number of single-rule automata to be composed together, usually not needed to change
	236	+# Rule of thumb: if Spejd consumes much too much memory, it's better to decrease this
	237	+# number than to set very low memoryLimit - it gives smaller impact on performance
	238	+composeLimit = 150
	239	+
	240	+# memory limit in megabytes
	241	+# when memory usage exceeds this limit the rarely-used states removal procedure
	242	+# (or GC, a garbage collector) is launched
	243	+# use as an emergency brake, for standard limit see above.
	244	+# The memoryLimit is approximate, actual memory usage may be slightly higher
	245	+# (it depends on memory allocator library buffers size)
	246	+memoryLimit = 1900
	247	+
	248	+# approx. percent of DFA states to leave after the states removal
	249	+leavePercent = 80
	250	+
	251	+# The definitive limit of normal GC usage. GC removes only complex states, so if there are lots
	252	+# of plain states it can't prevent from exceeding memoryLimit. If the percent of complex states
	253	+# is less than minComplexPercent, all the DFAs are dropped and they are built from the beginning
	254	+# just as if Spejd had been restarted. However it does not recompile rules, so it's faster.
	255	+minComplexPercent = 10
	256	+
	257	+
	258	+# A maximal number of unicode characters which can appear in rules compiled to internal regex
	259	+# It must be higher than the highest number of values of a single attribute (including
	260	+# numeric attributes) and must be higher than a number of unique characters appearing in all rules.
	261	+# Setting too high can increase the memory usage.
	262	+maxNumberOfValues = 4000
	263	+
	264	+
	265	+# BUILT-IN MORPHOLOGICAL ANALYZER 'MORFEUSZ'
	266	+
	267	+# disable Morfeusz completely, useful when some other tool replaces interpretations, e.g. pantera
	268	+disableMorfeusz = yes
	269	+
	270	+# Morfeusz produces ambiguous segmentation, which can be resolved by simple rule-based
	271	+# disambiguator. This option specifies a file to load rules from.
	272	+# The rule format is described in the example file
	273	+# (leave empty for the builtin default, which is actually the example file)
	274	+
	275	+# The encoding of this file must be the same as inputEncoding.
	276	+morfeuszSegmentationDisambiguationRules = segm_disamb.conf
	277	+
	278	+# PLAIN TEXT READER - GENERAL
	279	+
	280	+# mock xml:id for the whole text input referred from the output in string-range notation
	281	+# (in TEI output it appears in *segmentation.xml)
	282	+stringRangeMockID = p-1
	283	+
	284	+# PLAIN TEXT READER - SENTENCER
	285	+
	286	+# list of acronyms -
	287	+# if a dot is found after one of them, it is not a sentence break
	288	+
	289	+acronymsAfter = prof\|dr\|mgr\|doc\|ul\|np\|godz\|gen\|płk\|mjr\|por\|tzw\|tzn\|proc\|nt\|art\|ust\|ww\|www\|ws\|dz
	290	+
	291	+# list of acronyms (actually top level domain names) -
	292	+# if a dot is found before one of them, it is not a sentence break
	293	+
	294	+acronymsBefore = ac\|ad\|ae\|aero\|af\|ag\|ai\|al\|am\|an\|ao\|aq\|ar\|arpa\|as\|asia\|at\|au\|aw\|ax\|az\|ba\|bb\|bd\|be\|bf\|bg\|bh\|bi\|biz\|bj\|bm\|bn\|bo\|br\|bs\|bt\|bv\|bw\|by\|bz\|ca\|cat\|cc\|cd\|cf\|cg\|ch\|ci\|ck\|cl\|cm\|cn\|co\|com\|coop\|cr\|cu\|cv\|cx\|cy\|cz\|de\|dj\|dk\|dm\|do\|dz\|ec\|edu\|ee\|eg\|er\|es\|et\|eu\|fi\|fj\|fk\|fm\|fo\|fr\|ga\|gb\|gd\|ge\|gf\|gg\|gh\|gi\|gl\|gm\|gn\|gov\|gp\|gq\|gr\|gs\|gt\|gu\|gw\|gy\|hk\|hm\|hn\|hr\|ht\|hu\|id\|ie\|il\|im\|in\|info\|int\|io\|iq\|ir\|is\|it\|je\|jm\|jo\|jobs\|jp\|ke\|kg\|kh\|ki\|km\|kn\|kp\|kr\|kw\|ky\|kz\|la\|lb\|lc\|li\|lk\|lr\|ls\|lt\|lu\|lv\|ly\|ma\|mc\|md\|me\|mg\|mh\|mil\|mk\|ml\|mm\|mn\|mo\|mobi\|mp\|mq\|mr\|ms\|mt\|mu\|museum\|mv\|mw\|mx\|my\|mz\|na\|name\|nc\|ne\|net\|nf\|ng\|ni\|nl\|no\|np\|nr\|nu\|nz\|om\|org\|pa\|pe\|pf\|pg\|ph\|pk\|pl\|pm\|pn\|pr\|pro\|ps\|pt\|pw\|py\|qa\|re\|ro\|rs\|ru\|rw\|sa\|sb\|sc\|sd\|se\|sg\|sh\|si\|sj\|sk\|sl\|sm\|sn\|so\|sr\|st\|su\|sv\|sy\|sz\|tc\|td\|tel\|tf\|tg\|th\|tj\|tk\|tl\|tm\|tn\|to\|tp\|tr\|travel\|tt\|tv\|tw\|tz\|ua\|ug\|uk\|us\|uy\|uz\|va\|vc\|ve\|vg\|vi\|vn\|vu\|wf\|ws\|ye\|yt\|yu\|za\|zm\|zw
	295	+
	296	+
	297	+# PLAIN TEXT READER - OGONKIFIER
	298	+
	299	+# name of file with ogonkify (diacritic completion) substitutions
	300	+# the format is:
	301	+# <letter without diacritics>=<list of possible letters with diacritics separated by '\|'>
	302	+# see the example ogonkifier.ini
	303	+
	304	+# The encoding of this file must be the same as inputEncoding.
	305	+#ogonkifyFile = ogonkifier.ini
	306	+
	307	+# when to use ogonkifier:
	308	+# A - Always,
	309	+# N - Never,
	310	+# M - only when the Morphological analyzer fails to analyse a word
	311	+
	312	+ogonkifyStrategy = N
	313	+
	314	+# min and max length of words to ogonkify
	315	+
	316	+ogonkifyMinLength = 3
	317	+ogonkifyMaxLength = 13
	318	+
	319	+
	320	+
...	...

Added preferable spejd config to resources directory.

Too many changes to show. Reload with full diff Plain diff Email patch

Too many changes to show.
Reload with full diff Plain diff Email patch