Commit e05d60fbf11f94bc4d2eeedb0affff25d4b2cbe6

Authored by Michał Lenart
1 parent 1eff484c

praca nad przechowywaniem słownika z uwzględnieniem tagsetu.

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@13 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/fsa/buildfsa.py
... ... @@ -10,6 +10,7 @@ import logging
10 10 import codecs
11 11 import encode
12 12 import convertinput
  13 +import common
13 14 from fsa import FSA
14 15 from serializer import VLengthSerializer1, VLengthSerializer2, SimpleSerializer
15 16 from visualizer import Visualizer
... ... @@ -36,7 +37,7 @@ class SerializationMethod():
36 37 V1 = 'V1'
37 38 V2 = 'V2'
38 39  
39   -def parseOptions():
  40 +def _parseOptions():
40 41 """
41 42 Parses commandline args
42 43 """
... ... @@ -45,13 +46,17 @@ def parseOptions():
45 46 dest='inputFile',
46 47 metavar='FILE',
47 48 help='path to input file')
  49 + parser.add_option('--tagset-file',
  50 + dest='tagsetFile',
  51 + metavar='FILE',
  52 + help='path to the file with tagset')
48 53 parser.add_option('-o', '--output-file',
49 54 dest='outputFile',
50 55 metavar='FILE',
51 56 help='path to output file')
52   - parser.add_option('-t', '--fsa-type',
53   - dest='fsaType',
54   - help='result FSA type - MORPH (for morphological analysis) or SPELL (for simple spell checker)')
  57 +# parser.add_option('-t', '--fsa-type',
  58 +# dest='fsaType',
  59 +# help='result FSA type - MORPH (for morphological analysis) or SPELL (for simple spell checker)')
55 60 # parser.add_option('--input-format',
56 61 # dest='inputFormat',
57 62 # help='input format - ENCODED, POLIMORF or PLAIN')
... ... @@ -90,7 +95,7 @@ def parseOptions():
90 95  
91 96 opts, args = parser.parse_args()
92 97  
93   - if None in [opts.inputFile, opts.outputFile, opts.outputFormat, opts.fsaType, opts.serializationMethod]:
  98 + if None in [opts.inputFile, opts.outputFile, opts.outputFormat, opts.tagsetFile, opts.serializationMethod]:
94 99 parser.print_help()
95 100 exit(1)
96 101 if not opts.outputFormat.upper() in [OutputFormat.BINARY, OutputFormat.CPP]:
... ... @@ -101,10 +106,14 @@ def parseOptions():
101 106 # logging.error('input format must be one of ('+str([InputFormat.ENCODED, InputFormat.POLIMORF, InputFormat.PLAIN])+')')
102 107 # parser.print_help()
103 108 # exit(1)
104   - if not opts.fsaType.upper() in [FSAType.MORPH, FSAType.SPELL]:
105   - logging.error('--fsa-type must be one of ('+str([FSAType.MORPH, FSAType.SPELL])+')')
106   - parser.print_help()
107   - exit(1)
  109 +# if not opts.fsaType.upper() in [FSAType.MORPH, FSAType.SPELL]:
  110 +# logging.error('--fsa-type must be one of ('+str([FSAType.MORPH, FSAType.SPELL])+')')
  111 +# parser.print_help()
  112 +# exit(1)
  113 +# if opts.fsaType == FSAType.MORPH and opts.tagsetFile is None:
  114 +# logging.error('must provide tagset file')
  115 +# parser.print_help()
  116 +# exit(1)
108 117  
109 118 if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2]:
110 119 logging.error('--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2])+')')
... ... @@ -122,58 +131,93 @@ def parseOptions():
122 131 # exit(1)
123 132 return opts
124 133  
125   -def readEncodedInput(inputFile):
126   - with codecs.open(inputFile, 'r', 'utf8') as f:
127   - for line in f:
128   - word, interps = line.strip().split()
129   - yield word, interps.split(u'|')
130   -
131   -def readPolimorfInput(inputFile, encoder):
  134 +def _readPolimorfInput(inputFile, tagsetFile, encoder):
  135 + tagset = common.Tagset(tagsetFile)
132 136 with codecs.open(inputFile, 'r', 'utf8') as f:
133   - for entry in convertinput.convertPolimorf(f, lambda (word, interp): encoder.word2SortKey(word)):
  137 + for entry in convertinput.convertPolimorf(f, tagset, encoder):
134 138 yield entry
135 139  
136   -def readPlainInput(inputFile, encoder):
  140 +def _readPlainInput(inputFile, encoder):
137 141 with codecs.open(inputFile, 'r', 'utf8') as f:
138 142 for line in sorted(f, key=encoder.word2SortKey):
139 143 word = line.strip()
140 144 yield word, ''
141 145  
142   -def readTrainData(trainFile):
  146 +def _readTrainData(trainFile):
143 147 with codecs.open(trainFile, 'r', 'utf8') as f:
144 148 for line in f:
145 149 yield line.strip()
146 150  
def _printStats(fsa):
    """
    Logs summary statistics of the given automaton at INFO level.

    Walks fsa.dfs() once and counts the states that are accepting, the
    states whose transitionsNum equals 0 (reported as "sink" states) and
    the states flagged with serializeAsArray. The counters are logged
    together with fsa.getStatesNum() and fsa.getTransitionsNum().
    """
    acceptingCount = 0
    sinkCount = 0
    arrayCount = 0
    for state in fsa.dfs():
        acceptingCount += 1 if state.isAccepting() else 0
        sinkCount += 1 if state.transitionsNum == 0 else 0
        arrayCount += 1 if state.serializeAsArray else 0
    logging.info('states num: ' + str(fsa.getStatesNum()))
    logging.info('transitions num: ' + str(fsa.getTransitionsNum()))
    logging.info('accepting states num: ' + str(acceptingCount))
    logging.info('sink states num: ' + str(sinkCount))
    logging.info('array states num: ' + str(arrayCount))
  167 +
def buildFromPoliMorf(inputFile, tagsetFile):
    """
    Builds and returns an FSA from a PoliMorf input file.

    Creates an encode.MorphEncoder, feeds a fresh FSA with the entries
    produced by _readPolimorfInput(inputFile, tagsetFile, encoder), logs
    the automaton statistics through _printStats and returns the FSA.
    """
    morphEncoder = encode.MorphEncoder()
    automaton = FSA(morphEncoder)
    # _readPolimorfInput is a generator, so the input is consumed lazily by feed()
    automaton.feed(_readPolimorfInput(inputFile, tagsetFile, morphEncoder))
    _printStats(automaton)
    return automaton
  175 +
def buildFromPlain(inputFile, tagsetFile=None):
    """
    Placeholder for building an FSA from plain input.

    The function is not implemented yet. It raises instead of silently
    returning None, so that a caller expecting an FSA fails at the call
    site rather than later on an attribute access.

    inputFile -- path to the input file (currently unused)
    tagsetFile -- path to the tagset file (currently unused, optional)

    Raises NotImplementedError on every call.
    """
    raise NotImplementedError('building FSA from plain input is not implemented yet')
  178 +
147 179 def main(opts):
148 180 if opts.debug:
149 181 logging.basicConfig(level=logging.DEBUG)
150 182 else:
151 183 logging.basicConfig(level=logging.INFO)
152   - encoder = encode.Encoder()
153   - fsa = FSA(encoder)
154 184  
155   - inputData = {
156   - FSAType.MORPH: readPolimorfInput(opts.inputFile, encoder),
157   - FSAType.SPELL: readPlainInput(opts.inputFile, encoder)
158   - }[opts.fsaType]
  185 + fsa = buildFromPoliMorf(opts.inputFile, opts.tagsetFile)
  186 +# {
  187 +# FSAType.SPELL: buildFromPlain(opts.inputFile),
  188 +# FSAType.MORPH: buildFromPoliMorf(opts.inputFile, opts.tagsetFile)
  189 +# }[opts.fsaType]
159 190  
160   - logging.info('feeding FSA with data ...')
161   - fsa.feed(inputData)
162 191 if opts.trainFile:
163 192 logging.info('training with '+opts.trainFile+' ...')
164   - fsa.train(readTrainData(opts.trainFile))
  193 + fsa.train(_readTrainData(opts.trainFile))
165 194 logging.info('done training')
  195 +
  196 +# encoder = {
  197 +# FSAType.SPELL: encode.SimpleEncoder(),
  198 +# FSAType.MORPH: encode.MorphEncoder()
  199 +# }[opts.fsaType]
  200 +#
  201 +# fsa = FSA(encoder)
  202 +#
  203 +# inputData = {
  204 +# FSAType.MORPH: _readPolimorfInput(opts.inputFile, opts.tagsetFile, encoder),
  205 +# FSAType.SPELL: _readPlainInput(opts.inputFile, encoder)
  206 +# }[opts.fsaType]
  207 +
  208 +# logging.info('feeding FSA with data ...')
  209 +# fsa.feed(inputData)
  210 +# if opts.trainFile:
  211 +# logging.info('training with '+opts.trainFile+' ...')
  212 +# fsa.train(readTrainData(opts.trainFile))
  213 +# logging.info('done training')
166 214  
167 215 serializer = {
168 216 SerializationMethod.SIMPLE: SimpleSerializer,
169 217 SerializationMethod.V1: VLengthSerializer1,
170 218 SerializationMethod.V2: VLengthSerializer2,
171 219 }[opts.serializationMethod](fsa)
172   - logging.info('states num: '+str(fsa.getStatesNum()))
173   - logging.info('transitions num: '+str(fsa.getTransitionsNum()))
174   - logging.info('accepting states num: '+str(len([s for s in fsa.dfs() if s.isAccepting()])))
175   - logging.info('sink states num: '+str(len([s for s in fsa.dfs() if len(s.transitionsMap.items()) == 0])))
176   - logging.info('array states num: '+str(len([s for s in fsa.dfs() if s.serializeAsArray])))
  220 +
177 221 {
178 222 OutputFormat.CPP: serializer.serialize2CppFile,
179 223 OutputFormat.BINARY: serializer.serialize2BinaryFile
... ... @@ -184,7 +228,7 @@ def main(opts):
184 228 Visualizer().visualize(fsa)
185 229  
186 230 if __name__ == '__main__':
187   - opts = parseOptions()
  231 + opts = _parseOptions()
188 232 if opts.profile:
189 233 with PyCallGraph(output=GraphvizOutput()):
190 234 main(opts)
... ...
fsabuilder/fsa/common.py 0 → 100644
  1 +'''
  2 +Created on Nov 7, 2013
  3 +
  4 +@author: mlenart
  5 +'''
  6 +
  7 +import codecs
  8 +
class Lemma(object):
    '''
    Describes how to obtain a base form from a word form.

    cutLength -- number of trailing characters to remove from the word form
    suffixToAdd -- text to append after the cut
    '''

    def __init__(self, cutLength, suffixToAdd):
        self.cutLength = cutLength
        self.suffixToAdd = suffixToAdd

    def __repr__(self):
        # readable representation for logs and debugging sessions
        return 'Lemma(cutLength=%r, suffixToAdd=%r)' % (self.cutLength, self.suffixToAdd)
  14 +
class Interpretation(object):
    '''
    A single interpretation of a word form: its lemma recipe, tag number
    and name number.

    The lemma is stored as a Lemma, i.e. how many trailing characters of
    the word form (orth) to cut and which suffix to append to get the
    base form. Two interpretations are equal when their sort keys are
    equal; instances are hashable accordingly.
    '''

    def __init__(self, orth, base, tagnum, namenum, encoder):
        '''
        orth -- the word form (unicode text)
        base -- the base form (unicode text)
        tagnum -- number of the tag
        namenum -- number of the name
        encoder -- not used by the constructor; kept for interface compatibility
        '''
        # type(u'') is the text type: unicode on Python 2, str on Python 3
        textType = type(u'')
        assert isinstance(orth, textType)
        assert isinstance(base, textType)
        # length of the longest common prefix of orth and base
        rootLength = 0
        for o, b in zip(orth, base):
            if o != b:
                break
            rootLength += 1
        self.lemma = Lemma(
            cutLength=len(orth) - rootLength,
            suffixToAdd=base[rootLength:])
        self.tagnum = tagnum
        self.namenum = namenum

    def getSortKey(self):
        '''
        Returns the tuple used for ordering, equality and hashing.
        '''
        return (self.lemma.cutLength, self.lemma.suffixToAdd, self.tagnum, self.namenum)

    def __eq__(self, other):
        if isinstance(other, Interpretation):
            return self.getSortKey() == other.getSortKey()
        else:
            return False

    def __ne__(self, other):
        # Python 2 does not derive != from ==; without this, two equal
        # interpretations would still compare as "not equal"
        return not self.__eq__(other)

    def __hash__(self):
        return hash(self.getSortKey())
  44 +
class Tagset(object):
    '''
    Mapping of tags and names to their numbers, read from a tagset file.

    The file consists of a [TAGS] section and a [NAMES] section. Inside a
    section every non-empty line that does not start with '#' has the form
    <number><TAB><label>. After construction:

    tag2tagnum -- dict mapping tag label to its number ([TAGS] section)
    name2namenum -- dict mapping name label to its number ([NAMES] section)
    '''

    TAGS = 1
    NAMES = 2
    SEP = '\t'

    def __init__(self, filename, encoding='utf8'):
        '''
        filename -- path to the tagset file
        encoding -- encoding of the tagset file

        Raises ValueError when the file is malformed (see _doInit).
        '''
        self.tag2tagnum = {}
        self.name2namenum = {}
        self._doInit(filename, encoding)

    def _doInit(self, filename, encoding):
        '''
        Parses the tagset file and fills tag2tagnum and name2namenum.

        Raises ValueError when an entry appears before any section header,
        when an entry has no separator, when the number is not an integer
        or when a label is repeated within its section.
        '''
        sections = {
            u'[TAGS]': self.tag2tagnum,
            u'[NAMES]': self.name2namenum,
        }
        # dictionary of the section currently being filled
        target = None
        with codecs.open(filename, 'r', encoding) as f:
            for lineNum, line in enumerate(f, start=1):
                # also drop '\r' so that files with CRLF line ends parse correctly
                line = line.rstrip(u'\r\n')
                if line in sections:
                    target = sections[line]
                elif line and not line.startswith(u'#'):
                    # explicit "is None": an empty section dict is falsy too
                    if target is None:
                        raise ValueError(
                            '%s:%d: entry found before [TAGS] or [NAMES] section'
                            % (filename, lineNum))
                    fields = line.split(Tagset.SEP)
                    if len(fields) < 2:
                        raise ValueError(
                            '%s:%d: expected <number><TAB><label>'
                            % (filename, lineNum))
                    label = fields[1]
                    if label in target:
                        raise ValueError(
                            '%s:%d: duplicated entry' % (filename, lineNum))
                    target[label] = int(fields[0])
... ...
fsabuilder/fsa/convertinput.py
... ... @@ -3,59 +3,42 @@ Created on Oct 23, 2013
3 3  
4 4 @author: mlenart
5 5 '''
6   -import sys
7   -import fileinput
8 6 import logging
9   -from encode import Encoder
  7 +from common import Interpretation
10 8  
11   -def _encodeInterp(orth, base, tag, name):
12   - removePrefix = 0
13   - root = u''
14   - for o, b in zip(orth, base):
15   - if o == b:
16   - root += o
17   - else:
18   - break
19   - removeSuffixNum = len(orth) - len(root)
20   - addSuffix = base[len(root):]
21   - return u'+'.join([
22   - chr(ord('A')+removePrefix) + chr(ord('A')+removeSuffixNum) + addSuffix,
23   - tag,
24   - name])
  9 +def _sortLines(inputLines, encoder):
  10 + logging.info('sorting input...')
  11 + lines = list(inputLines)
  12 + logging.info('done read data into list')
  13 + lines.sort(key=lambda line: encoder.word2SortKey(line.split('\t')[0]))
  14 + logging.info('done sorting')
  15 + return lines
25 16  
26   -def _parsePolimorf(inputLines):
27   - for line0 in inputLines:
28   - line = line0.strip(u'\n')
  17 +def _parseLines(inputLines, tagset, encoder):
  18 + for line in inputLines:
  19 + line = line.strip(u'\n')
29 20 if line:
30 21 # print line
31 22 orth, base, tag, name = line.split(u'\t')
32   - yield (orth, _encodeInterp(orth, base, tag, name))
  23 + tagnum = tagset.tag2tagnum[tag]
  24 + namenum = tagset.name2namenum[name]
  25 + yield (orth, Interpretation(orth, base, tagnum, namenum, encoder))
33 26  
34   -def _sortAndMergeParsedInput(inputData, key=lambda k: k):
35   - logging.info('sorting input...')
36   - entries = list(inputData)
37   - entries.sort(key=key)
38   - logging.info('done sorting')
  27 +def _mergeEntries(inputLines):
39 28 prevOrth = None
40 29 prevInterps = None
41   - for orth, interp in entries:
  30 + for orth, interp in inputLines:
  31 + orth = orth.lower()
  32 + assert orth
42 33 if prevOrth and prevOrth == orth:
43 34 prevInterps.append(interp)
44 35 else:
45 36 if prevOrth:
46   - yield (prevOrth, sorted(set(prevInterps)))
  37 + yield (prevOrth, frozenset(prevInterps))
47 38 prevOrth = orth
48 39 prevInterps = [interp]
  40 + yield (prevOrth, frozenset(prevInterps))
49 41  
50   -def convertPolimorf(inputLines, sortKey=lambda k: k):
51   - for orth, interps in _sortAndMergeParsedInput(_parsePolimorf(inputLines), key=sortKey):
  42 +def convertPolimorf(inputLines, tagset, encoder):
  43 + for orth, interps in _mergeEntries(_parseLines(_sortLines(inputLines, encoder), tagset, encoder)):
52 44 yield orth, interps
53   -
54   -def _decodeInputLines(rawInputLines, encoding):
55   - for line in rawInputLines:
56   - yield line.decode(encoding)
57   -
58   -if __name__ == '__main__':
59   - encoder = Encoder()
60   - for orth, interps in convertPolimorf(_decodeInputLines(fileinput.input(), 'utf8'), lambda (orth, interp): encoder.word2SortKey(orth)):
61   - print u'\t'.join([orth, u'|'.join(interps)]).encode('utf8')
... ...
fsabuilder/fsa/encode.py
1 1 '''
2 2 Created on Oct 23, 2013
3 3  
4   -@author: lennyn
  4 +@author: mlenart
5 5 '''
6 6  
  7 +import logging
  8 +
7 9 class Encoder(object):
8 10 '''
9 11 classdocs
10 12 '''
11 13  
12 14  
13   - def __init__(self, encoding='utf8', appendZero=False):
  15 + def __init__(self, encoding='utf8'):
14 16 '''
15 17 Constructor
16 18 '''
17 19 self.encoding = encoding
18   - self.appendZero = appendZero
19 20  
20   - def encodeWord(self, word):
  21 + def encodeWord(self, word, lowercase=True):
21 22 assert type(word) == unicode
22   - res = bytearray(word, self.encoding)
23   - if self.appendZero:
24   - res.append(0)
  23 + res = bytearray(word.lower() if lowercase else word, self.encoding)
25 24 return res
26 25  
27 26 def encodeData(self, data):
28   - return bytearray(u'|'.join(data).encode(self.encoding)) + bytearray([0])
  27 + raise NotImplementedError()
  28 +# return bytearray(u'|'.join(data).encode(self.encoding)) + bytearray([0])
29 29  
30 30 def decodeData(self, rawData):
  31 + return NotImplementedError()
31 32 # print unicode(str(rawData), self.encoding)[:-1]
32 33 # print unicode(str(rawData), self.encoding)[:-1].split(u'|')
33   - return unicode(str(rawData), self.encoding)[:-1].split(u'|')
  34 +# return unicode(str(rawData), self.encoding)[:-1].split(u'|')
34 35  
35 36 def word2SortKey(self, word):
36   - return word.encode(self.encoding)
  37 + return word.lower().encode(self.encoding)
  38 +
class SimpleEncoder(Encoder):
    '''
    Encoder whose data is a single text value stored as its encoded bytes
    followed by a terminating 0 byte.
    '''

    def __init__(self, encoding='utf8', appendZero=False):
        '''
        encoding -- encoding used for words and data
        appendZero -- kept for interface compatibility and stored on the
                      instance; the base class no longer accepts it
        '''
        # Encoder.__init__ takes only the encoding; forwarding appendZero
        # to it raised TypeError on every instantiation
        super(SimpleEncoder, self).__init__(encoding)
        self.appendZero = appendZero

    def encodeData(self, data):
        '''
        Returns data encoded with self.encoding plus a trailing 0 byte.
        '''
        return bytearray(data, encoding=self.encoding) + bytearray([0])

    def decodeData(self, rawData):
        '''
        Drops the last byte of rawData and decodes the rest with self.encoding.
        '''
        return unicode(str(rawData)[:-1], self.encoding)
  49 +
class MorphEncoder(Encoder):
    '''
    Encoder that serializes a frozenset of interpretations into bytes.

    Layout produced by encodeData: one byte with the number of
    interpretations, then for every interpretation, in getSortKey() order:
    the lemma (cut length byte, encoded suffix, 0 byte), the tag number
    (two bytes, high byte first) and the name number (one byte).
    '''

    def __init__(self, encoding='utf8'):
        super(MorphEncoder, self).__init__(encoding)

    def encodeData(self, interpsList):
        '''
        Returns the bytearray encoding all interpretations of one word.

        interpsList -- non-empty frozenset of at most 255 interpretations
        '''
        interpsCount = len(interpsList)
        assert interpsCount < 256
        assert interpsCount > 0
        encoded = bytearray()
        encoded.append(interpsCount)
        assert type(interpsList) == frozenset
        # sorting gives a deterministic byte layout for a given set
        ordered = sorted(interpsList, key=lambda interp: interp.getSortKey())
        for interp in ordered:
            encoded.extend(self._encodeLemma(interp.lemma))
            encoded.extend(self._encodeTagNum(interp.tagnum))
            encoded.extend(self._encodeNameNum(interp.namenum))
        return encoded

    def _encodeLemma(self, lemma):
        '''
        Encodes a lemma as: cut length byte, suffix bytes (case preserved), 0 byte.
        '''
        assert lemma.cutLength < 256 and lemma.cutLength >= 0
        encoded = bytearray()
        encoded.append(lemma.cutLength)
        encoded.extend(self.encodeWord(lemma.suffixToAdd, lowercase=False))
        encoded.append(0)
        return encoded

    def _encodeTagNum(self, tagnum):
        '''
        Encodes a tag number below 65536 as two bytes, high byte first.
        '''
        assert tagnum < 65536 and tagnum >= 0
        highByte = (tagnum >> 8) & 0xFF
        lowByte = tagnum & 0xFF
        encoded = bytearray()
        encoded.append(highByte)
        encoded.append(lowByte)
        return encoded

    def _encodeNameNum(self, namenum):
        '''
        Encodes a name number below 256 as a single byte.
        '''
        assert namenum < 256 and namenum >= 0
        return bytearray([namenum])
  89 +
37 90 \ No newline at end of file
... ...
fsabuilder/fsa/fsa.py
... ... @@ -31,16 +31,15 @@ class FSA(object):
31 31 # allWords = []
32 32 for n, (word, data) in enumerate(input, start=1):
33 33 assert data is not None
34   - if type(data) in [str, unicode]:
35   - data = [data]
36 34 encodedWord = self.encodeWord(word)
37   - assert encodedWord >= self.encodedPrevWord
  35 + assert encodedWord > self.encodedPrevWord
38 36 if encodedWord > self.encodedPrevWord:
39 37 self._addSorted(encodedWord, self.encodeData(data))
40 38 self.encodedPrevWord = encodedWord
41 39 # assert self.tryToRecognize(word) == data
42 40 if n % 10000 == 0:
43 41 logging.info(word)
  42 + logging.info(str(self.register.getStatesNum()))
44 43 # allWords.append(word)
45 44 for label in encodedWord:
46 45 self.label2Freq[label] = self.label2Freq.get(label, 0) + 1
... ...
fsabuilder/fsa/test/PoliMorfSmall.tab 0 → 100644
  1 +abdominalności abdominalność subst:pl:acc:f pospolita
  2 +abdominalności abdominalność subst:pl:gen:f pospolita
  3 +abdominalności abdominalność subst:pl:nom:f pospolita
  4 +abdominalności abdominalność subst:pl:voc:f pospolita
  5 +abdominalności abdominalność subst:sg:dat:f pospolita
  6 +abdominalności abdominalność subst:sg:gen:f pospolita
  7 +abdominalności abdominalność subst:sg:loc:f pospolita
  8 +abdominalności abdominalność subst:sg:voc:f pospolita
  9 +abdominalnościach abdominalność subst:pl:loc:f pospolita
  10 +abdominalnościami abdominalność subst:pl:inst:f pospolita
  11 +abdominalnością abdominalność subst:sg:inst:f pospolita
  12 +abdominalnościom abdominalność subst:pl:dat:f pospolita
  13 +abdominalność abdominalność subst:sg:acc:f pospolita
  14 +abdominalność abdominalność subst:sg:nom:f pospolita
  15 +abdominalna abdominalny adj:sg:nom.voc:f:pos pospolita
  16 +abdominalną abdominalny adj:sg:acc:f:pos pospolita
  17 +abdominalną abdominalny adj:sg:inst:f:pos pospolita
  18 +abdominalne abdominalny adj:pl:acc:m2.m3.f.n1.n2.p2.p3:pos pospolita
  19 +abdominalne abdominalny adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:pos pospolita
  20 +abdominalne abdominalny adj:sg:acc:n1.n2:pos pospolita
  21 +abdominalne abdominalny adj:sg:nom.voc:n1.n2:pos pospolita
  22 +abdominalnego abdominalny adj:sg:acc:m1.m2:pos pospolita
  23 +abdominalnego abdominalny adj:sg:gen:m1.m2.m3.n1.n2:pos pospolita
  24 +abdominalnej abdominalny adj:sg:dat:f:pos pospolita
  25 +abdominalnej abdominalny adj:sg:gen:f:pos pospolita
  26 +abdominalnej abdominalny adj:sg:loc:f:pos pospolita
  27 +abdominalnemu abdominalny adj:sg:dat:m1.m2.m3.n1.n2:pos pospolita
  28 +abdominalni abdominalny adj:pl:nom.voc:m1.p1:pos pospolita
  29 +abdominalno abdominalny adja pospolita
  30 +abdominalny abdominalny adj:sg:acc:m3:pos pospolita
  31 +abdominalny abdominalny adj:sg:nom.voc:m1.m2.m3:pos pospolita
  32 +abdominalnych abdominalny adj:pl:acc:m1.p1:pos pospolita
  33 +abdominalnych abdominalny adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:pos pospolita
  34 +abdominalnych abdominalny adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos pospolita
  35 +abdominalnym abdominalny adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:pos pospolita
  36 +abdominalnym abdominalny adj:sg:inst:m1.m2.m3.n1.n2:pos pospolita
  37 +abdominalnym abdominalny adj:sg:loc:m1.m2.m3.n1.n2:pos pospolita
  38 +abdominalnymi abdominalny adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:pos pospolita
  39 +abdominoplastyce abdominoplastyka subst:sg:dat:f pospolita
  40 +abdominoplastyce abdominoplastyka subst:sg:loc:f pospolita
  41 +abdominoplastyk abdominoplastyka subst:pl:gen:f pospolita
  42 +abdominoplastyka abdominoplastyka subst:sg:nom:f pospolita
  43 +abdominoplastykach abdominoplastyka subst:pl:loc:f pospolita
  44 +abdominoplastykami abdominoplastyka subst:pl:inst:f pospolita
  45 +abdominoplastyką abdominoplastyka subst:sg:inst:f pospolita
  46 +abdominoplastykę abdominoplastyka subst:sg:acc:f pospolita
  47 +abdominoplastyki abdominoplastyka subst:pl:acc:f pospolita
  48 +abdominoplastyki abdominoplastyka subst:pl:nom:f pospolita
  49 +abdominoplastyki abdominoplastyka subst:pl:voc:f pospolita
  50 +abdominoplastyki abdominoplastyka subst:sg:gen:f pospolita
  51 +abdominoplastyko abdominoplastyka subst:sg:voc:f pospolita
  52 +abdominoplastykom abdominoplastyka subst:pl:dat:f pospolita
0 53 \ No newline at end of file
... ...
fsabuilder/fsa/test/polimorf.tagset 0 → 100644
  1 +#!MORFEUSZ-TAGSET 0.1
  2 +
  3 +[TAGS]
  4 +
  5 +0 adj:pl:acc:m1.p1:com
  6 +1 adj:pl:acc:m1.p1:pos
  7 +2 adj:pl:acc:m1.p1:sup
  8 +3 adj:pl:acc:m2.m3.f.n1.n2.p2.p3:com
  9 +4 adj:pl:acc:m2.m3.f.n1.n2.p2.p3:pos
  10 +5 adj:pl:acc:m2.m3.f.n1.n2.p2.p3:sup
  11 +6 adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:com
  12 +7 adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:pos
  13 +8 adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:sup
  14 +9 adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:com
  15 +10 adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:pos
  16 +11 adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:sup
  17 +12 adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:com
  18 +13 adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:pos
  19 +14 adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:sup
  20 +15 adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:com
  21 +16 adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos
  22 +17 adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup
  23 +18 adj:pl:nom.voc:m1.p1:com
  24 +19 adj:pl:nom.voc:m1.p1:pos
  25 +20 adj:pl:nom.voc:m1.p1:sup
  26 +21 adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:com
  27 +22 adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:pos
  28 +23 adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:sup
  29 +24 adj:pl:nom:m1.p1:pos
  30 +25 adj:pl:nom:m2.m3.f.n1.n2.p2.p3:pos
  31 +26 adj:sg:acc:f:com
  32 +27 adj:sg:acc:f:pos
  33 +28 adj:sg:acc:f:sup
  34 +29 adj:sg:acc:m1.m2:com
  35 +30 adj:sg:acc:m1.m2:pos
  36 +31 adj:sg:acc:m1.m2:sup
  37 +32 adj:sg:acc:m3:com
  38 +33 adj:sg:acc:m3:pos
  39 +34 adj:sg:acc:m3:sup
  40 +35 adj:sg:acc:n1.n2:com
  41 +36 adj:sg:acc:n1.n2:pos
  42 +37 adj:sg:acc:n1.n2:sup
  43 +38 adj:sg:dat:f:com
  44 +39 adj:sg:dat:f:pos
  45 +40 adj:sg:dat:f:sup
  46 +41 adj:sg:dat:m1.m2.m3.n1.n2:com
  47 +42 adj:sg:dat:m1.m2.m3.n1.n2:pos
  48 +43 adj:sg:dat:m1.m2.m3.n1.n2:sup
  49 +44 adj:sg:gen:f:com
  50 +45 adj:sg:gen:f:pos
  51 +46 adj:sg:gen:f:sup
  52 +47 adj:sg:gen:m1.m2.m3.n1.n2:com
  53 +48 adj:sg:gen:m1.m2.m3.n1.n2:pos
  54 +49 adj:sg:gen:m1.m2.m3.n1.n2:sup
  55 +50 adj:sg:inst:f:com
  56 +51 adj:sg:inst:f:pos
  57 +52 adj:sg:inst:f:sup
  58 +53 adj:sg:inst:m1.m2.m3.n1.n2:com
  59 +54 adj:sg:inst:m1.m2.m3.n1.n2:pos
  60 +55 adj:sg:inst:m1.m2.m3.n1.n2:sup
  61 +56 adj:sg:loc:f:com
  62 +57 adj:sg:loc:f:pos
  63 +58 adj:sg:loc:f:sup
  64 +59 adj:sg:loc:m1.m2.m3.n1.n2:com
  65 +60 adj:sg:loc:m1.m2.m3.n1.n2:pos
  66 +61 adj:sg:loc:m1.m2.m3.n1.n2:sup
  67 +62 adj:sg:nom.voc:f:com
  68 +63 adj:sg:nom.voc:f:pos
  69 +64 adj:sg:nom.voc:f:sup
  70 +65 adj:sg:nom.voc:m1.m2.m3:com
  71 +66 adj:sg:nom.voc:m1.m2.m3:pos
  72 +67 adj:sg:nom.voc:m1.m2.m3:sup
  73 +68 adj:sg:nom.voc:n1.n2:com
  74 +69 adj:sg:nom.voc:n1.n2:pos
  75 +70 adj:sg:nom.voc:n1.n2:sup
  76 +71 adj:sg:nom:f:pos
  77 +72 adj:sg:nom:m1.m2.m3:pos
  78 +73 adj:sg:nom:n1.n2:pos
  79 +74 adja
  80 +75 adjc
  81 +76 adjp
  82 +77 adv
  83 +78 adv:com
  84 +79 adv:pos
  85 +80 adv:sup
  86 +81 aglt:pl:pri:imperf:nwok
  87 +82 aglt:pl:pri:imperf:wok
  88 +83 aglt:pl:sec:imperf:nwok
  89 +84 aglt:pl:sec:imperf:wok
  90 +85 aglt:sg:pri:imperf:nwok
  91 +86 aglt:sg:pri:imperf:wok
  92 +87 aglt:sg:sec:imperf:nwok
  93 +88 aglt:sg:sec:imperf:wok
  94 +89 bedzie:pl:pri:imperf
  95 +90 bedzie:pl:sec:imperf
  96 +91 bedzie:pl:ter:imperf
  97 +92 bedzie:sg:pri:imperf
  98 +93 bedzie:sg:sec:imperf
  99 +94 bedzie:sg:ter:imperf
  100 +95 burk
  101 +96 comp
  102 +97 conj
  103 +98 depr:pl:nom:m2
  104 +99 depr:pl:voc:m2
  105 +100 fin:pl:pri:imperf
  106 +101 fin:pl:pri:imperf.perf
  107 +102 fin:pl:pri:perf
  108 +103 fin:pl:sec:imperf
  109 +104 fin:pl:sec:imperf.perf
  110 +105 fin:pl:sec:perf
  111 +106 fin:pl:ter:imperf
  112 +107 fin:pl:ter:imperf.perf
  113 +108 fin:pl:ter:perf
  114 +109 fin:sg:pri:imperf
  115 +110 fin:sg:pri:imperf.perf
  116 +111 fin:sg:pri:perf
  117 +112 fin:sg:sec:imperf
  118 +113 fin:sg:sec:imperf.perf
  119 +114 fin:sg:sec:perf
  120 +115 fin:sg:ter:imperf
  121 +116 fin:sg:ter:imperf.perf
  122 +117 fin:sg:ter:perf
  123 +118 ger:sg:dat.loc:n2:imperf.perf:aff
  124 +119 ger:sg:dat.loc:n2:imperf.perf:neg
  125 +120 ger:sg:dat.loc:n2:imperf:aff
  126 +121 ger:sg:dat.loc:n2:imperf:neg
  127 +122 ger:sg:dat.loc:n2:perf:aff
  128 +123 ger:sg:dat.loc:n2:perf:neg
  129 +124 ger:sg:gen:n2:imperf.perf:aff
  130 +125 ger:sg:gen:n2:imperf.perf:neg
  131 +126 ger:sg:gen:n2:imperf:aff
  132 +127 ger:sg:gen:n2:imperf:neg
  133 +128 ger:sg:gen:n2:perf:aff
  134 +129 ger:sg:gen:n2:perf:neg
  135 +130 ger:sg:inst:n2:imperf.perf:aff
  136 +131 ger:sg:inst:n2:imperf.perf:neg
  137 +132 ger:sg:inst:n2:imperf:aff
  138 +133 ger:sg:inst:n2:imperf:neg
  139 +134 ger:sg:inst:n2:perf:aff
  140 +135 ger:sg:inst:n2:perf:neg
  141 +136 ger:sg:nom.acc:n2:imperf.perf:aff
  142 +137 ger:sg:nom.acc:n2:imperf.perf:neg
  143 +138 ger:sg:nom.acc:n2:imperf:aff
  144 +139 ger:sg:nom.acc:n2:imperf:neg
  145 +140 ger:sg:nom.acc:n2:perf:aff
  146 +141 ger:sg:nom.acc:n2:perf:neg
  147 +142 imps:imperf
  148 +143 imps:imperf.perf
  149 +144 imps:perf
  150 +145 impt:pl:pri:imperf
  151 +146 impt:pl:pri:imperf.perf
  152 +147 impt:pl:pri:perf
  153 +148 impt:pl:sec:imperf
  154 +149 impt:pl:sec:imperf.perf
  155 +150 impt:pl:sec:perf
  156 +151 impt:sg:sec:imperf
  157 +152 impt:sg:sec:imperf.perf
  158 +153 impt:sg:sec:perf
  159 +154 inf:imperf
  160 +155 inf:imperf.perf
  161 +156 inf:perf
  162 +157 interj
  163 +158 num:comp
  164 +159 num:pl:acc:m1:rec
  165 +160 num:pl:dat.loc:n1.p1.p2:congr.rec
  166 +161 num:pl:dat:m1.m2.m3.n2.f:congr
  167 +162 num:pl:gen.dat.inst.loc:m1.m2.m3.f.n1.n2.p1.p2:congr
  168 +163 num:pl:gen.dat.inst.loc:m1.m2.m3.f.n2:congr
  169 +164 num:pl:gen.dat.loc:m1.m2.m3.n2.f:congr
  170 +165 num:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2:congr
  171 +166 num:pl:gen.loc:m1.m2.m3.n2.f:congr
  172 +167 num:pl:gen:n1.p1.p2:rec
  173 +168 num:pl:inst:f:congr
  174 +169 num:pl:inst:m1.m2.m3.f.n1.n2.p1.p2:congr
  175 +170 num:pl:inst:m1.m2.m3.f.n2:congr
  176 +171 num:pl:inst:m1.m2.m3.n2.f:congr
  177 +172 num:pl:inst:m1.m2.m3.n2:congr
  178 +173 num:pl:inst:n1.p1.p2:rec
  179 +174 num:pl:nom.acc.voc:f:congr
  180 +175 num:pl:nom.acc.voc:m1:rec
  181 +176 num:pl:nom.acc.voc:m2.m3.f.n1.n2.p1.p2:rec
  182 +177 num:pl:nom.acc.voc:m2.m3.f.n2:rec
  183 +178 num:pl:nom.acc.voc:m2.m3.n2.f:congr
  184 +179 num:pl:nom.acc.voc:m2.m3.n2:congr
  185 +180 num:pl:nom.acc.voc:n1.p1.p2:rec
  186 +181 num:pl:nom.acc:m1.m2.m3.f.n1.n2.p1.p2:rec
  187 +182 num:pl:nom.gen.dat.inst.acc.loc.voc:m1.m2.m3.f.n1.n2.p1.p2:rec
  188 +183 num:pl:nom.voc:m1:congr
  189 +184 num:pl:nom.voc:m1:rec
  190 +185 num:sg:nom.gen.dat.inst.acc.loc.voc:f:rec
  191 +186 num:sg:nom.gen.dat.inst.acc.loc.voc:m1.m2.m3.n1.n2:rec
  192 +187 pact:pl:acc:m1.p1:imperf.perf:aff
  193 +188 pact:pl:acc:m1.p1:imperf.perf:neg
  194 +189 pact:pl:acc:m1.p1:imperf:aff
  195 +190 pact:pl:acc:m1.p1:imperf:neg
  196 +191 pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff
  197 +192 pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg
  198 +193 pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff
  199 +194 pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg
  200 +195 pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff
  201 +196 pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg
  202 +197 pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff
  203 +198 pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg
  204 +199 pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff
  205 +200 pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg
  206 +201 pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff
  207 +202 pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg
  208 +203 pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:aff
  209 +204 pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg
  210 +205 pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff
  211 +206 pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:neg
  212 +207 pact:pl:nom.voc:m1.p1:imperf.perf:aff
  213 +208 pact:pl:nom.voc:m1.p1:imperf.perf:neg
  214 +209 pact:pl:nom.voc:m1.p1:imperf:aff
  215 +210 pact:pl:nom.voc:m1.p1:imperf:neg
  216 +211 pact:sg:acc.inst:f:imperf.perf:aff
  217 +212 pact:sg:acc.inst:f:imperf.perf:neg
  218 +213 pact:sg:acc.inst:f:imperf:aff
  219 +214 pact:sg:acc.inst:f:imperf:neg
  220 +215 pact:sg:acc:m1.m2:imperf.perf:aff
  221 +216 pact:sg:acc:m1.m2:imperf.perf:neg
  222 +217 pact:sg:acc:m1.m2:imperf:aff
  223 +218 pact:sg:acc:m1.m2:imperf:neg
  224 +219 pact:sg:acc:m3:imperf.perf:aff
  225 +220 pact:sg:acc:m3:imperf.perf:neg
  226 +221 pact:sg:acc:m3:imperf:aff
  227 +222 pact:sg:acc:m3:imperf:neg
  228 +223 pact:sg:dat:m1.m2.m3.n1.n2:imperf.perf:aff
  229 +224 pact:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg
  230 +225 pact:sg:dat:m1.m2.m3.n1.n2:imperf:aff
  231 +226 pact:sg:dat:m1.m2.m3.n1.n2:imperf:neg
  232 +227 pact:sg:gen.dat.loc:f:imperf.perf:aff
  233 +228 pact:sg:gen.dat.loc:f:imperf.perf:neg
  234 +229 pact:sg:gen.dat.loc:f:imperf:aff
  235 +230 pact:sg:gen.dat.loc:f:imperf:neg
  236 +231 pact:sg:gen:m1.m2.m3.n1.n2:imperf.perf:aff
  237 +232 pact:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg
  238 +233 pact:sg:gen:m1.m2.m3.n1.n2:imperf:aff
  239 +234 pact:sg:gen:m1.m2.m3.n1.n2:imperf:neg
  240 +235 pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:aff
  241 +236 pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg
  242 +237 pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff
  243 +238 pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg
  244 +239 pact:sg:nom.acc.voc:n1.n2:imperf.perf:aff
  245 +240 pact:sg:nom.acc.voc:n1.n2:imperf.perf:neg
  246 +241 pact:sg:nom.acc.voc:n1.n2:imperf:aff
  247 +242 pact:sg:nom.acc.voc:n1.n2:imperf:neg
  248 +243 pact:sg:nom.voc:f:imperf.perf:aff
  249 +244 pact:sg:nom.voc:f:imperf.perf:neg
  250 +245 pact:sg:nom.voc:f:imperf:aff
  251 +246 pact:sg:nom.voc:f:imperf:neg
  252 +247 pact:sg:nom.voc:m1.m2.m3:imperf.perf:aff
  253 +248 pact:sg:nom.voc:m1.m2.m3:imperf.perf:neg
  254 +249 pact:sg:nom.voc:m1.m2.m3:imperf:aff
  255 +250 pact:sg:nom.voc:m1.m2.m3:imperf:neg
  256 +251 pant:perf
  257 +252 pcon:imperf
  258 +253 ppas:pl:acc:m1.p1:imperf.perf:aff
  259 +254 ppas:pl:acc:m1.p1:imperf.perf:neg
  260 +255 ppas:pl:acc:m1.p1:imperf:aff
  261 +256 ppas:pl:acc:m1.p1:imperf:neg
  262 +257 ppas:pl:acc:m1.p1:perf:aff
  263 +258 ppas:pl:acc:m1.p1:perf:neg
  264 +259 ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff
  265 +260 ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg
  266 +261 ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff
  267 +262 ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg
  268 +263 ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:aff
  269 +264 ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg
  270 +265 ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff
  271 +266 ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg
  272 +267 ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff
  273 +268 ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg
  274 +269 ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:aff
  275 +270 ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg
  276 +271 ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff
  277 +272 ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg
  278 +273 ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff
  279 +274 ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg
  280 +275 ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:aff
  281 +276 ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg
  282 +277 ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:aff
  283 +278 ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg
  284 +279 ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff
  285 +280 ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:neg
  286 +281 ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:perf:aff
  287 +282 ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:perf:neg
  288 +283 ppas:pl:nom.voc:m1.p1:imperf.perf:aff
  289 +284 ppas:pl:nom.voc:m1.p1:imperf.perf:neg
  290 +285 ppas:pl:nom.voc:m1.p1:imperf:aff
  291 +286 ppas:pl:nom.voc:m1.p1:imperf:neg
  292 +287 ppas:pl:nom.voc:m1.p1:perf:aff
  293 +288 ppas:pl:nom.voc:m1.p1:perf:neg
  294 +289 ppas:sg:acc.inst:f:imperf.perf:aff
  295 +290 ppas:sg:acc.inst:f:imperf.perf:neg
  296 +291 ppas:sg:acc.inst:f:imperf:aff
  297 +292 ppas:sg:acc.inst:f:imperf:neg
  298 +293 ppas:sg:acc.inst:f:perf:aff
  299 +294 ppas:sg:acc.inst:f:perf:neg
  300 +295 ppas:sg:acc:m1.m2:imperf.perf:aff
  301 +296 ppas:sg:acc:m1.m2:imperf.perf:neg
  302 +297 ppas:sg:acc:m1.m2:imperf:aff
  303 +298 ppas:sg:acc:m1.m2:imperf:neg
  304 +299 ppas:sg:acc:m1.m2:perf:aff
  305 +300 ppas:sg:acc:m1.m2:perf:neg
  306 +301 ppas:sg:acc:m3:imperf.perf:aff
  307 +302 ppas:sg:acc:m3:imperf.perf:neg
  308 +303 ppas:sg:acc:m3:imperf:aff
  309 +304 ppas:sg:acc:m3:imperf:neg
  310 +305 ppas:sg:acc:m3:perf:aff
  311 +306 ppas:sg:acc:m3:perf:neg
  312 +307 ppas:sg:dat:m1.m2.m3.n1.n2:imperf.perf:aff
  313 +308 ppas:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg
  314 +309 ppas:sg:dat:m1.m2.m3.n1.n2:imperf:aff
  315 +310 ppas:sg:dat:m1.m2.m3.n1.n2:imperf:neg
  316 +311 ppas:sg:dat:m1.m2.m3.n1.n2:perf:aff
  317 +312 ppas:sg:dat:m1.m2.m3.n1.n2:perf:neg
  318 +313 ppas:sg:gen.dat.loc:f:imperf.perf:aff
  319 +314 ppas:sg:gen.dat.loc:f:imperf.perf:neg
  320 +315 ppas:sg:gen.dat.loc:f:imperf:aff
  321 +316 ppas:sg:gen.dat.loc:f:imperf:neg
  322 +317 ppas:sg:gen.dat.loc:f:perf:aff
  323 +318 ppas:sg:gen.dat.loc:f:perf:neg
  324 +319 ppas:sg:gen:m1.m2.m3.n1.n2:imperf.perf:aff
  325 +320 ppas:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg
  326 +321 ppas:sg:gen:m1.m2.m3.n1.n2:imperf:aff
  327 +322 ppas:sg:gen:m1.m2.m3.n1.n2:imperf:neg
  328 +323 ppas:sg:gen:m1.m2.m3.n1.n2:perf:aff
  329 +324 ppas:sg:gen:m1.m2.m3.n1.n2:perf:neg
  330 +325 ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:aff
  331 +326 ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg
  332 +327 ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff
  333 +328 ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg
  334 +329 ppas:sg:inst.loc:m1.m2.m3.n1.n2:perf:aff
  335 +330 ppas:sg:inst.loc:m1.m2.m3.n1.n2:perf:neg
  336 +331 ppas:sg:nom.acc.voc:n1.n2:imperf.perf:aff
  337 +332 ppas:sg:nom.acc.voc:n1.n2:imperf.perf:neg
  338 +333 ppas:sg:nom.acc.voc:n1.n2:imperf:aff
  339 +334 ppas:sg:nom.acc.voc:n1.n2:imperf:neg
  340 +335 ppas:sg:nom.acc.voc:n1.n2:perf:aff
  341 +336 ppas:sg:nom.acc.voc:n1.n2:perf:neg
  342 +337 ppas:sg:nom.voc:f:imperf.perf:aff
  343 +338 ppas:sg:nom.voc:f:imperf.perf:neg
  344 +339 ppas:sg:nom.voc:f:imperf:aff
  345 +340 ppas:sg:nom.voc:f:imperf:neg
  346 +341 ppas:sg:nom.voc:f:perf:aff
  347 +342 ppas:sg:nom.voc:f:perf:neg
  348 +343 ppas:sg:nom.voc:m1.m2.m3:imperf.perf:aff
  349 +344 ppas:sg:nom.voc:m1.m2.m3:imperf.perf:neg
  350 +345 ppas:sg:nom.voc:m1.m2.m3:imperf:aff
  351 +346 ppas:sg:nom.voc:m1.m2.m3:imperf:neg
  352 +347 ppas:sg:nom.voc:m1.m2.m3:perf:aff
  353 +348 ppas:sg:nom.voc:m1.m2.m3:perf:neg
  354 +349 ppron12:pl:acc:_:pri
  355 +350 ppron12:pl:acc:_:sec
  356 +351 ppron12:pl:dat:_:pri
  357 +352 ppron12:pl:dat:_:sec
  358 +353 ppron12:pl:gen:_:pri
  359 +354 ppron12:pl:gen:_:sec
  360 +355 ppron12:pl:inst:_:pri
  361 +356 ppron12:pl:inst:_:sec
  362 +357 ppron12:pl:loc:_:pri
  363 +358 ppron12:pl:loc:_:sec
  364 +359 ppron12:pl:nom:_:pri
  365 +360 ppron12:pl:nom:_:sec
  366 +361 ppron12:pl:voc:_:pri
  367 +362 ppron12:pl:voc:_:sec
  368 +363 ppron12:sg:acc:m1.m2.m3.f.n1.n2:pri:akc
  369 +364 ppron12:sg:acc:m1.m2.m3.f.n1.n2:pri:nakc
  370 +365 ppron12:sg:acc:m1.m2.m3.f.n1.n2:sec:akc
  371 +366 ppron12:sg:acc:m1.m2.m3.f.n1.n2:sec:nakc
  372 +367 ppron12:sg:dat:m1.m2.m3.f.n1.n2:pri:akc
  373 +368 ppron12:sg:dat:m1.m2.m3.f.n1.n2:pri:nakc
  374 +369 ppron12:sg:dat:m1.m2.m3.f.n1.n2:sec:akc
  375 +370 ppron12:sg:dat:m1.m2.m3.f.n1.n2:sec:nakc
  376 +371 ppron12:sg:gen:m1.m2.m3.f.n1.n2:pri:akc
  377 +372 ppron12:sg:gen:m1.m2.m3.f.n1.n2:pri:nakc
  378 +373 ppron12:sg:gen:m1.m2.m3.f.n1.n2:sec:akc
  379 +374 ppron12:sg:gen:m1.m2.m3.f.n1.n2:sec:nakc
  380 +375 ppron12:sg:inst:m1.m2.m3.f.n1.n2:pri
  381 +376 ppron12:sg:inst:m1.m2.m3.f.n1.n2:sec
  382 +377 ppron12:sg:loc:m1.m2.m3.f.n1.n2:pri
  383 +378 ppron12:sg:loc:m1.m2.m3.f.n1.n2:sec
  384 +379 ppron12:sg:nom:m1.m2.m3.f.n1.n2:pri
  385 +380 ppron12:sg:nom:m1.m2.m3.f.n1.n2:sec
  386 +381 ppron12:sg:voc:m1.m2.m3.f.n1.n2:pri
  387 +382 ppron12:sg:voc:m1.m2.m3.f.n1.n2:sec
  388 +383 ppron3:pl:acc:m1.p1:ter:_:npraep
  389 +384 ppron3:pl:acc:m1.p1:ter:_:praep
  390 +385 ppron3:pl:acc:m2.m3.f.n1.n2.p2.p3:ter:_:npraep
  391 +386 ppron3:pl:acc:m2.m3.f.n1.n2.p2.p3:ter:_:praep
  392 +387 ppron3:pl:dat:_:ter:_:npraep
  393 +388 ppron3:pl:dat:_:ter:_:praep
  394 +389 ppron3:pl:gen:_:ter:_:npraep
  395 +390 ppron3:pl:gen:_:ter:_:praep
  396 +391 ppron3:pl:inst:_:ter:_:_
  397 +392 ppron3:pl:loc:_:ter:_:_
  398 +393 ppron3:pl:nom:m1.p1:ter:_:_
  399 +394 ppron3:pl:nom:m2.m3.f.n1.n2.p2.p3:ter:_:_
  400 +395 ppron3:sg:acc:f:ter:_:npraep
  401 +396 ppron3:sg:acc:f:ter:_:praep
  402 +397 ppron3:sg:acc:m1.m2.m3:ter:akc:npraep
  403 +398 ppron3:sg:acc:m1.m2.m3:ter:akc:praep
  404 +399 ppron3:sg:acc:m1.m2.m3:ter:nakc:npraep
  405 +400 ppron3:sg:acc:m1.m2.m3:ter:nakc:praep
  406 +401 ppron3:sg:acc:n1.n2:ter:_:npraep
  407 +402 ppron3:sg:acc:n1.n2:ter:_:praep
  408 +403 ppron3:sg:dat:f:ter:_:npraep
  409 +404 ppron3:sg:dat:f:ter:_:praep
  410 +405 ppron3:sg:dat:m1.m2.m3:ter:_:praep
  411 +406 ppron3:sg:dat:m1.m2.m3:ter:akc:npraep
  412 +407 ppron3:sg:dat:m1.m2.m3:ter:nakc:npraep
  413 +408 ppron3:sg:dat:n1.n2:ter:_:praep
  414 +409 ppron3:sg:dat:n1.n2:ter:akc:npraep
  415 +410 ppron3:sg:dat:n1.n2:ter:nakc:npraep
  416 +411 ppron3:sg:gen:f:ter:_:npraep
  417 +412 ppron3:sg:gen:f:ter:_:praep
  418 +413 ppron3:sg:gen:m1.m2.m3:ter:akc:npraep
  419 +414 ppron3:sg:gen:m1.m2.m3:ter:akc:praep
  420 +415 ppron3:sg:gen:m1.m2.m3:ter:nakc:npraep
  421 +416 ppron3:sg:gen:m1.m2.m3:ter:nakc:praep
  422 +417 ppron3:sg:gen:n1.n2:ter:_:praep
  423 +418 ppron3:sg:gen:n1.n2:ter:akc:npraep
  424 +419 ppron3:sg:gen:n1.n2:ter:nakc:npraep
  425 +420 ppron3:sg:inst:f:ter:_:praep
  426 +421 ppron3:sg:inst:m1.m2.m3:ter:_:_
  427 +422 ppron3:sg:inst:n1.n2:ter:_:_
  428 +423 ppron3:sg:loc:f:ter:_:_
  429 +424 ppron3:sg:loc:m1.m2.m3:ter:_:_
  430 +425 ppron3:sg:loc:n1.n2:ter:_:_
  431 +426 ppron3:sg:nom:f:ter:_:_
  432 +427 ppron3:sg:nom:m1.m2.m3:ter:_:_
  433 +428 ppron3:sg:nom:n1.n2:ter:_:_
  434 +429 praet:pl:m1.p1:imperf
  435 +430 praet:pl:m1.p1:imperf.perf
  436 +431 praet:pl:m1.p1:perf
  437 +432 praet:pl:m2.m3.f.n1.n2.p2.p3:imperf
  438 +433 praet:pl:m2.m3.f.n1.n2.p2.p3:imperf.perf
  439 +434 praet:pl:m2.m3.f.n1.n2.p2.p3:perf
  440 +435 praet:sg:f:imperf
  441 +436 praet:sg:f:imperf.perf
  442 +437 praet:sg:f:perf
  443 +438 praet:sg:m1.m2.m3:imperf
  444 +439 praet:sg:m1.m2.m3:imperf.perf
  445 +440 praet:sg:m1.m2.m3:imperf:agl
  446 +441 praet:sg:m1.m2.m3:imperf:nagl
  447 +442 praet:sg:m1.m2.m3:perf
  448 +443 praet:sg:m1.m2.m3:perf:agl
  449 +444 praet:sg:m1.m2.m3:perf:nagl
  450 +445 praet:sg:n1.n2:imperf
  451 +446 praet:sg:n1.n2:imperf.perf
  452 +447 praet:sg:n1.n2:perf
  453 +448 pred
  454 +449 prep:acc
  455 +450 prep:acc:nwok
  456 +451 prep:acc:wok
  457 +452 prep:dat
  458 +453 prep:gen
  459 +454 prep:gen:nwok
  460 +455 prep:gen:wok
  461 +456 prep:inst
  462 +457 prep:inst:nwok
  463 +458 prep:inst:wok
  464 +459 prep:loc
  465 +460 prep:loc:nwok
  466 +461 prep:loc:wok
  467 +462 prep:nom
  468 +463 qub
  469 +464 subst:pl:acc:f
  470 +465 subst:pl:acc:m1
  471 +466 subst:pl:acc:m2
  472 +467 subst:pl:acc:m3
  473 +468 subst:pl:acc:n1
  474 +469 subst:pl:acc:n2
  475 +470 subst:pl:acc:p1
  476 +471 subst:pl:acc:p2
  477 +472 subst:pl:acc:p3
  478 +473 subst:pl:dat:f
  479 +474 subst:pl:dat:m1
  480 +475 subst:pl:dat:m2
  481 +476 subst:pl:dat:m3
  482 +477 subst:pl:dat:n1
  483 +478 subst:pl:dat:n2
  484 +479 subst:pl:dat:p1
  485 +480 subst:pl:dat:p2
  486 +481 subst:pl:dat:p3
  487 +482 subst:pl:gen:f
  488 +483 subst:pl:gen:m1
  489 +484 subst:pl:gen:m2
  490 +485 subst:pl:gen:m3
  491 +486 subst:pl:gen:n1
  492 +487 subst:pl:gen:n2
  493 +488 subst:pl:gen:p1
  494 +489 subst:pl:gen:p2
  495 +490 subst:pl:gen:p3
  496 +491 subst:pl:inst:f
  497 +492 subst:pl:inst:m1
  498 +493 subst:pl:inst:m2
  499 +494 subst:pl:inst:m3
  500 +495 subst:pl:inst:n1
  501 +496 subst:pl:inst:n2
  502 +497 subst:pl:inst:p1
  503 +498 subst:pl:inst:p2
  504 +499 subst:pl:inst:p3
  505 +500 subst:pl:loc:f
  506 +501 subst:pl:loc:m1
  507 +502 subst:pl:loc:m2
  508 +503 subst:pl:loc:m3
  509 +504 subst:pl:loc:n1
  510 +505 subst:pl:loc:n2
  511 +506 subst:pl:loc:p1
  512 +507 subst:pl:loc:p2
  513 +508 subst:pl:loc:p3
  514 +509 subst:pl:nom:f
  515 +510 subst:pl:nom:m1
  516 +511 subst:pl:nom:m2
  517 +512 subst:pl:nom:m3
  518 +513 subst:pl:nom:n1
  519 +514 subst:pl:nom:n2
  520 +515 subst:pl:nom:p1
  521 +516 subst:pl:nom:p2
  522 +517 subst:pl:nom:p3
  523 +518 subst:pl:voc:f
  524 +519 subst:pl:voc:m1
  525 +520 subst:pl:voc:m2
  526 +521 subst:pl:voc:m3
  527 +522 subst:pl:voc:n1
  528 +523 subst:pl:voc:n2
  529 +524 subst:pl:voc:p1
  530 +525 subst:pl:voc:p2
  531 +526 subst:pl:voc:p3
  532 +527 subst:sg:acc:f
  533 +528 subst:sg:acc:m1
  534 +529 subst:sg:acc:m2
  535 +530 subst:sg:acc:m3
  536 +531 subst:sg:acc:n1
  537 +532 subst:sg:acc:n2
  538 +533 subst:sg:dat:f
  539 +534 subst:sg:dat:m1
  540 +535 subst:sg:dat:m2
  541 +536 subst:sg:dat:m3
  542 +537 subst:sg:dat:n1
  543 +538 subst:sg:dat:n2
  544 +539 subst:sg:gen:f
  545 +540 subst:sg:gen:m1
  546 +541 subst:sg:gen:m2
  547 +542 subst:sg:gen:m3
  548 +543 subst:sg:gen:n1
  549 +544 subst:sg:gen:n2
  550 +545 subst:sg:inst:f
  551 +546 subst:sg:inst:m1
  552 +547 subst:sg:inst:m2
  553 +548 subst:sg:inst:m3
  554 +549 subst:sg:inst:n1
  555 +550 subst:sg:inst:n2
  556 +551 subst:sg:loc:f
  557 +552 subst:sg:loc:m1
  558 +553 subst:sg:loc:m2
  559 +554 subst:sg:loc:m3
  560 +555 subst:sg:loc:n1
  561 +556 subst:sg:loc:n2
  562 +557 subst:sg:nom:f
  563 +558 subst:sg:nom:m1
  564 +559 subst:sg:nom:m2
  565 +560 subst:sg:nom:m3
  566 +561 subst:sg:nom:n1
  567 +562 subst:sg:nom:n2
  568 +563 subst:sg:voc:f
  569 +564 subst:sg:voc:m1
  570 +565 subst:sg:voc:m2
  571 +566 subst:sg:voc:m3
  572 +567 subst:sg:voc:n1
  573 +568 subst:sg:voc:n2
  574 +569 winien:pl:m1.p1:imperf
  575 +570 winien:pl:m2.m3.f.n1.n2.p2.p3:imperf
  576 +571 winien:sg:f:imperf
  577 +572 winien:sg:m1.m2.m3:imperf
  578 +573 winien:sg:n1.n2:imperf
  579 +
  580 +[NAMES]
  581 +
  582 +0
  583 +1 etnonim
  584 +2 geograficzna
  585 +3 imię
  586 +4 nazwisko
  587 +5 określenie dodatkowe
  588 +6 organizacja
  589 +7 osoba
  590 +8 pospolita
  591 +9 własna
  592 +10 wydarzenie
  593 +11 wytwór
  594 +
... ...
fsabuilder/fsa/test/testConstruction.py
... ... @@ -2,17 +2,16 @@
2 2 '''
3 3 Created on Oct 8, 2013
4 4  
5   -@author: lennyn
  5 +@author: mlenart
6 6 '''
7 7 import unittest
8   -from fsa import fsa, visualizer, encode
  8 +import os
  9 +from fsa import fsa, visualizer, encode, buildfsa
9 10  
10 11 class Test(unittest.TestCase):
11 12  
12   -
13 13 def testSimpleConstruction(self):
14   - print 'dupa'
15   - a = fsa.FSA(encode.Encoder())
  14 + a = fsa.FSA(encode.SimpleEncoder())
16 15 input = sorted([
17 16 (u'bić', ''),
18 17 (u'bij', ''),
... ... @@ -50,19 +49,17 @@ class Test(unittest.TestCase):
50 49 (u'biłyśmy', ''),
51 50 ], key=lambda w: bytearray(w[0], 'utf8'))
52 51 a.feed(input)
53   - print a.getStatesNum()
54   -# print a.tryToRecognize(u'bi')
55   -# print a.tryToRecognize(u'bić')
56   -# print a.tryToRecognize(u'bili')
57 52 for w, res in input:
58   - print w, res, a.tryToRecognize(w)
59 53 recognized = a.tryToRecognize(w)
60   - if type(res) in [str, unicode]:
61   - recognized = recognized[0]
62 54 assert recognized == res
63 55 a.calculateOffsets(lambda state: 1 + 4 * len(state.transitionsMap.keys()) + (len(state.encodedData) if state.isAccepting() else 0))
64 56 visualizer.Visualizer().visualize(a)
65   - print 'done'
  57 +
  58 + def testPolimorfConstruction(self):
  59 + inputFile = os.path.join(os.path.dirname(__file__), 'PoliMorfSmall.tab')
  60 + tagsetFile = os.path.join(os.path.dirname(__file__), 'polimorf.tagset')
  61 + fsa = buildfsa.buildFromPoliMorf(inputFile, tagsetFile)
  62 +# visualizer.Visualizer().visualize(fsa)
66 63  
67 64 if __name__ == "__main__":
68 65 #import sys;sys.argv = ['', 'Test.testSimpleConstruction']
... ...
fsabuilder/utils/__init__.py 0 → 100644
fsabuilder/utils/extractTagset.py 0 → 100644
'''
Created on Nov 7, 2013

Extracts the tagset from a tab-separated dictionary read from standard input.

Usage: extractTagset.py VERSION < dictionary.tab

The output is a "#morfeusz-tagset VERSION" header line followed by one
"<index><TAB><tag>" line per distinct tag, in sorted order.

@author: mlenart
'''
import sys


def extractTagset(lines):
    '''
    Return the sorted list of distinct tags found in the given lines.

    The tag is taken from the third tab-separated column of every non-blank
    line. The line terminator is removed before splitting, so a tag that
    happens to be the last column does not carry a trailing newline (which
    would otherwise yield duplicate tags and broken output lines).

    lines -- iterable of text lines (e.g. sys.stdin)

    Raises ValueError, naming the offending line number, when a non-blank
    line has fewer than three columns.
    '''
    tags = set()
    for lineNum, line in enumerate(lines, 1):
        if not line.strip():
            # blank lines carry no entry
            continue
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) < 3:
            raise ValueError(
                'line %d: expected at least 3 tab-separated columns, got %d'
                % (lineNum, len(fields)))
        tags.add(fields[2])
    return sorted(tags)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s VERSION < dictionary.tab\n' % sys.argv[0])
        sys.exit(1)
    version = sys.argv[1]
    try:
        # Read everything first so that no partial output is produced
        # when the input turns out to be malformed.
        tags = extractTagset(sys.stdin)
    except ValueError as ex:
        sys.stderr.write('%s: %s\n' % (sys.argv[0], ex))
        sys.exit(1)
    # sys.stdout.write keeps the script valid under both Python 2 and 3.
    sys.stdout.write('#morfeusz-tagset %s\n' % version)
    for idx, tag in enumerate(tags):
        sys.stdout.write('%d\t%s\n' % (idx, tag))
... ...