Commit de0e960dbe882750998828aedd42322904c1255a
1 parent 00e66248
upodabnianie syntezy do analizy [making synthesis resemble analysis]
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@113 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 22 changed files with 457 additions and 297 deletions
CMakeLists.txt
... | ... | @@ -47,7 +47,7 @@ endif () |
47 | 47 | |
48 | 48 | # SEGMENT_RULES_FILE |
49 | 49 | if ("${SEGMENT_RULES_FILE}" STREQUAL "") |
50 | - set (SEGMENT_RULES_FILE "${PROJECT_SOURCE_DIR}/input/segmenty.dat") | |
50 | + set (SEGMENT_RULES_FILE "${PROJECT_SOURCE_DIR}/input/segmenty1.dat") | |
51 | 51 | endif () |
52 | 52 | |
53 | 53 | message ("Will use ${INPUT_DICTIONARIES} as default dictionary input, ${INPUT_TAGSET} as tagset and ${SEGMENT_RULES_FILE} as segmentation rules") |
... | ... |
fsabuilder/buildfsa.py
... | ... | @@ -137,8 +137,8 @@ def _parseOptions(): |
137 | 137 | for filename in opts.inputFiles: |
138 | 138 | _checkOpen(filename, 'r') |
139 | 139 | _checkOpen(opts.outputFile, 'w') |
140 | + _checkOption(opts.segmentsFile, parser, "Segment rules file is missing") | |
140 | 141 | if opts.analyzer: |
141 | - _checkOption(opts.segmentsFile, parser, "Segment rules file is missing") | |
142 | 142 | _checkOpen(opts.segmentsFile, 'r') |
143 | 143 | |
144 | 144 | if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2]: |
... | ... | @@ -161,9 +161,9 @@ def _readPolimorfInput4Analyzer(inputFiles, tagset, encoder, segmentRulesManager |
161 | 161 | for entry in convertinput.PolimorfConverter4Analyzer(tagset, encoder, 'utf8', segmentRulesManager, trimSupneg).convert(_concatFiles(inputFiles)): |
162 | 162 | yield entry |
163 | 163 | |
164 | -def _readPolimorfInput4Generator(inputFiles, tagset, encoder): | |
164 | +def _readPolimorfInput4Generator(inputFiles, tagset, encoder, segmentRulesManager): | |
165 | 165 | logging.info('reading generator data from %s', str(inputFiles)) |
166 | - for entry in convertinput.PolimorfConverter4Generator(tagset, encoder, 'utf8').convert(_concatFiles(inputFiles)): | |
166 | + for entry in convertinput.PolimorfConverter4Generator(tagset, encoder, 'utf8', segmentRulesManager).convert(_concatFiles(inputFiles)): | |
167 | 167 | yield entry |
168 | 168 | |
169 | 169 | def _readTrainData(trainFile): |
... | ... | @@ -201,10 +201,10 @@ def buildAnalyzerFromPoliMorf(inputFiles, tagset, segmentRulesManager, trimSupne |
201 | 201 | _printStats(fsa) |
202 | 202 | return fsa |
203 | 203 | |
204 | -def buildGeneratorFromPoliMorf(inputFiles, tagset): | |
204 | +def buildGeneratorFromPoliMorf(inputFiles, tagset, segmentRulesManager): | |
205 | 205 | encoder = encode.Encoder4Generator() |
206 | 206 | fsa = FSA(encoder, tagset) |
207 | - inputData = _readPolimorfInput4Generator(inputFiles, tagset, encoder) | |
207 | + inputData = _readPolimorfInput4Generator(inputFiles, tagset, encoder, segmentRulesManager) | |
208 | 208 | for word, data in inputData: |
209 | 209 | fsa.addEntry(word, data) |
210 | 210 | fsa.close() |
... | ... | @@ -227,14 +227,14 @@ def main(opts): |
227 | 227 | |
228 | 228 | logging.info('reading tagset from %s', opts.tagsetFile) |
229 | 229 | tagset = Tagset(opts.tagsetFile) |
230 | + rulesType = rulesParser.RulesParser.PARSE4ANALYZER if opts.analyzer else rulesParser.RulesParser.PARSE4GENERATOR | |
231 | + segmentRulesManager = rulesParser.RulesParser(tagset, rulesType).parse(opts.segmentsFile) | |
232 | + segmentationRulesData = segmentRulesManager.serialize() | |
230 | 233 | |
231 | 234 | if opts.analyzer: |
232 | - segmentRulesManager = rulesParser.RulesParser(tagset).parse(opts.segmentsFile) | |
233 | - additionalData = segmentRulesManager.serialize() | |
234 | 235 | fsa = buildAnalyzerFromPoliMorf(opts.inputFiles, tagset, segmentRulesManager, opts.trimSupneg) |
235 | 236 | else: |
236 | - fsa = buildGeneratorFromPoliMorf(opts.inputFiles, tagset) | |
237 | - additionalData = bytearray() | |
237 | + fsa = buildGeneratorFromPoliMorf(opts.inputFiles, tagset, segmentRulesManager) | |
238 | 238 | |
239 | 239 | if opts.trainFile: |
240 | 240 | logging.info('training with '+opts.trainFile+' ...') |
... | ... | @@ -248,9 +248,9 @@ def main(opts): |
248 | 248 | }[opts.serializationMethod](fsa) |
249 | 249 | |
250 | 250 | if opts.cpp: |
251 | - serializer.serialize2CppFile(opts.outputFile, generator=opts.generator, additionalData=additionalData) | |
251 | + serializer.serialize2CppFile(opts.outputFile, generator=opts.generator, segmentationRulesData=segmentationRulesData) | |
252 | 252 | else: |
253 | - serializer.serialize2BinaryFile(opts.outputFile, additionalData=additionalData) | |
253 | + serializer.serialize2BinaryFile(opts.outputFile, segmentationRulesData=segmentationRulesData) | |
254 | 254 | |
255 | 255 | logging.info('total FSA size (in bytes): '+str(fsa.initialState.reverseOffset)) |
256 | 256 | # { |
... | ... |
fsabuilder/morfeuszbuilder/fsa/common.py
... | ... | @@ -67,11 +67,12 @@ class Interpretation(object): |
67 | 67 | |
68 | 68 | class Interpretation4Generator(object): |
69 | 69 | |
70 | - def __init__(self, orth, base, tagnum, namenum): | |
70 | + def __init__(self, orth, base, tagnum, namenum, typenum): | |
71 | 71 | self.lemma = base |
72 | 72 | self.orth = EncodedFormWithPrefix(base, orth) |
73 | 73 | self.tagnum = tagnum |
74 | 74 | self.namenum = namenum |
75 | + self.typenum = typenum | |
75 | 76 | |
76 | 77 | def getSortKey(self): |
77 | 78 | return ( |
... | ... |
fsabuilder/morfeuszbuilder/fsa/convertinput.py
... | ... | @@ -81,10 +81,11 @@ class PolimorfConverter4Analyzer(object): |
81 | 81 | |
82 | 82 | class PolimorfConverter4Generator(object): |
83 | 83 | |
84 | - def __init__(self, tagset, encoder, inputEncoding='utf8'): | |
84 | + def __init__(self, tagset, encoder, inputEncoding, segmentRulesManager): | |
85 | 85 | self.tagset = tagset |
86 | 86 | self.encoder = encoder |
87 | 87 | self.inputEncoding = inputEncoding |
88 | + self.segmentRulesManager = segmentRulesManager | |
88 | 89 | |
89 | 90 | # we do it the ugly way (parse to plain text) because it is way more memory-efficient |
90 | 91 | def _partiallyParseLines(self, inputLines): |
... | ... | @@ -94,10 +95,11 @@ class PolimorfConverter4Generator(object): |
94 | 95 | if base: |
95 | 96 | tagnum = self.tagset.getTagnum4Tag(tag) |
96 | 97 | namenum = self.tagset.getNamenum4Name(name) |
97 | - yield '%s %s %d %d' % ( | |
98 | + typenum = self.segmentRulesManager.lexeme2SegmentTypeNum(base, tagnum) | |
99 | + yield '%s %s %d %d %d' % ( | |
98 | 100 | orth.encode(self.inputEncoding), |
99 | 101 | base.encode(self.inputEncoding), |
100 | - tagnum, namenum) | |
102 | + tagnum, namenum, typenum) | |
101 | 103 | else: |
102 | 104 | logging.warn('Ignoring line: %s', line.strip()) |
103 | 105 | |
... | ... | @@ -109,10 +111,11 @@ class PolimorfConverter4Generator(object): |
109 | 111 | for line in inputLines: |
110 | 112 | line = line.decode(self.inputEncoding).strip(u'\n') |
111 | 113 | if line: |
112 | - orth, base, tagnum, namenum = line.split(u' ') | |
114 | + orth, base, tagnum, namenum, typenum = line.split(u' ') | |
113 | 115 | tagnum = int(tagnum) |
114 | 116 | namenum = int(namenum) |
115 | - yield (base, Interpretation4Generator(orth, base, tagnum, namenum)) | |
117 | + typenum = int(typenum) | |
118 | + yield (base, Interpretation4Generator(orth, base, tagnum, namenum, typenum)) | |
116 | 119 | |
117 | 120 | def convert(self, inputLines): |
118 | 121 | return _mergeEntries(self._reallyParseLines(self._sortLines(self._partiallyParseLines(inputLines)))) |
... | ... |
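
The comment above ("we do it the ugly way...") names the design: entries are first flattened to plain-text lines (now carrying the segment-type number from lexeme2SegmentTypeNum), sorted as strings, and only then parsed into Interpretation4Generator objects, so the sort never has to hold Python objects in memory. A minimal sketch of that parse-sort-parse pattern, with simplified stand-in fields and the tagset/encoder machinery omitted:

    # Sketch of the parse-sort-parse pattern used by the converter above.
    def partially_parse(lines):
        for line in lines:
            orth, base, tag = line.strip().split('\t')[:3]
            yield '%s %s %s' % (orth, base, tag)    # cheap-to-sort plain text

    def convert(lines):
        for line in sorted(partially_parse(lines)):
            orth, base, tag = line.split(' ')
            yield base, (orth, tag)                 # objects built only after sorting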
fsabuilder/morfeuszbuilder/fsa/encode.py
... | ... | @@ -114,7 +114,7 @@ class MorphEncoder(Encoder): |
114 | 114 | assert type(interpsList) == frozenset |
115 | 115 | for interp in sorted(interpsList, key=lambda i: i.getSortKey()): |
116 | 116 | res.extend(self._encodeTypeNum(interp.typenum)) |
117 | - res.extend(self._encodeEncodedForm(interp.lemma, withCasePattern=True)) | |
117 | + res.extend(self._encodeEncodedForm(interp.lemma, withCasePattern=True, withPrefix=False)) | |
118 | 118 | res.extend(self._encodeTagNum(interp.tagnum)) |
119 | 119 | res.extend(self._encodeNameNum(interp.namenum)) |
120 | 120 | del interpsList |
... | ... | @@ -133,6 +133,7 @@ class Encoder4Generator(Encoder): |
133 | 133 | res.append(firstByte) |
134 | 134 | assert type(interpsList) == frozenset |
135 | 135 | for interp in sorted(interpsList, key=lambda i: i.getSortKey()): |
136 | + res.extend(self._encodeTypeNum(interp.typenum)) | |
136 | 137 | res.extend(self._encodeEncodedForm(interp.orth, withCasePattern=False, withPrefix=True)) |
137 | 138 | res.extend(self._encodeTagNum(interp.tagnum)) |
138 | 139 | res.extend(self._encodeNameNum(interp.namenum)) |
... | ... |
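
With the type byte now written for generator entries as well, both automata share one record shape per interpretation: a segment-type byte, the encoded form, a 16-bit tag number, and a name-classifier byte (the widths match what deserializeInterp reads back in MorphDeserializer.cpp below). An illustrative assembly of such a record; the helper name and arguments are assumptions, not the real Encoder4Generator internals:

    # Illustrative record assembly (not the actual encoder code):
    def encode_interp(typenum, encoded_form_bytes, tagnum, namenum):
        rec = bytearray()
        rec.append(typenum)                # segment type, new for generator entries
        rec.extend(encoded_form_bytes)     # prefix/suffix-encoded form
        rec.extend(divmod(tagnum, 256))    # 16-bit tag number in network byte order
        rec.append(namenum)                # name classifier
        return rec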
fsabuilder/morfeuszbuilder/fsa/serializer.py
... | ... | @@ -24,7 +24,7 @@ class Serializer(object): |
24 | 24 | def getVersion(self): |
25 | 25 | return 10 |
26 | 26 | |
27 | - def serialize2CppFile(self, fname, generator, additionalData): | |
27 | + def serialize2CppFile(self, fname, generator, segmentationRulesData): | |
28 | 28 | res = [] |
29 | 29 | # self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) |
30 | 30 | res.append('\n') |
... | ... | @@ -37,8 +37,8 @@ class Serializer(object): |
37 | 37 | res.append('extern const unsigned char DEFAULT_FSA[] = {') |
38 | 38 | res.append('\n') |
39 | 39 | for byte in self.fsa2bytearray( |
40 | - additionalData=self.serializeTagset(self.fsa.tagset), | |
41 | - moreAdditionalData=additionalData): | |
40 | + tagsetData=self.serializeTagset(self.fsa.tagset), | |
41 | + segmentationRulesData=segmentationRulesData): | |
42 | 42 | res.append(hex(byte)); |
43 | 43 | res.append(','); |
44 | 44 | res.append('\n') |
... | ... | @@ -47,16 +47,16 @@ class Serializer(object): |
47 | 47 | with open(fname, 'w') as f: |
48 | 48 | f.write(''.join(res)) |
49 | 49 | |
50 | - def serialize2BinaryFile(self, fname, additionalData): | |
50 | + def serialize2BinaryFile(self, fname, segmentationRulesData): | |
51 | 51 | with open(fname, 'wb') as f: |
52 | 52 | f.write(self.fsa2bytearray( |
53 | - additionalData=self.serializeTagset(self.fsa.tagset), | |
54 | - moreAdditionalData=additionalData)) | |
53 | + tagsetData=self.serializeTagset(self.fsa.tagset), | |
54 | + segmentationRulesData=segmentationRulesData)) | |
55 | 55 | |
56 | 56 | def getStateSize(self, state): |
57 | 57 | raise NotImplementedError('Not implemented') |
58 | 58 | |
59 | - def fsa2bytearray(self, additionalData=bytearray(), moreAdditionalData=bytearray()): | |
59 | + def fsa2bytearray(self, tagsetData, segmentationRulesData): | |
60 | 60 | res = bytearray() |
61 | 61 | res.extend(self.serializePrologue()) |
62 | 62 | fsaData = bytearray() |
... | ... | @@ -66,7 +66,7 @@ class Serializer(object): |
66 | 66 | fsaData.extend(self.state2bytearray(state)) |
67 | 67 | res.extend(htonl(len(fsaData))) |
68 | 68 | res.extend(fsaData) |
69 | - res.extend(self.serializeEpilogue(additionalData, moreAdditionalData)) | |
69 | + res.extend(self.serializeEpilogue(tagsetData, segmentationRulesData)) | |
70 | 70 | return res |
71 | 71 | |
72 | 72 | def _serializeTags(self, tagsMap): |
... | ... | @@ -104,20 +104,20 @@ class Serializer(object): |
104 | 104 | |
105 | 105 | return res |
106 | 106 | |
107 | - def serializeEpilogue(self, additionalData, moreAdditionalData): | |
107 | + def serializeEpilogue(self, tagsetData, segmentationRulesData): | |
108 | 108 | res = bytearray() |
109 | - additionalDataSize = len(additionalData) if additionalData else 0 | |
110 | - moreAdditionalDataSize = len(moreAdditionalData) if moreAdditionalData else 0 | |
111 | - res.extend(htonl(additionalDataSize)) | |
109 | + tagsetDataSize = len(tagsetData) if tagsetData else 0 | |
110 | + segmentationDataSize = len(segmentationRulesData) if segmentationRulesData else 0 | |
111 | + res.extend(htonl(tagsetDataSize)) | |
112 | 112 | |
113 | 113 | # add additional data itself |
114 | - if additionalDataSize: | |
115 | - assert type(additionalData) == bytearray | |
116 | - res.extend(additionalData) | |
114 | + if tagsetDataSize: | |
115 | + assert type(tagsetData) == bytearray | |
116 | + res.extend(tagsetData) | |
117 | 117 | |
118 | - if moreAdditionalDataSize: | |
119 | - assert type(moreAdditionalData) == bytearray | |
120 | - res.extend(moreAdditionalData) | |
118 | + if segmentationDataSize: | |
119 | + assert type(segmentationRulesData) == bytearray | |
120 | + res.extend(segmentationRulesData) | |
121 | 121 | return res |
122 | 122 | |
123 | 123 | def state2bytearray(self, state): |
... | ... |
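
The renames pin down the epilogue layout: a 4-byte network-order length of the tagset block, the tagset block itself, then the segmentation-rules block, which gets no length field of its own in serializeEpilogue and so runs to the end of the image. A minimal reader for that layout, assuming the caller already knows where the epilogue starts:

    import struct

    def read_epilogue(buf, offset):
        (tagset_size,) = struct.unpack_from('!I', buf, offset)   # htonl-encoded length
        offset += 4
        tagset_data = buf[offset:offset + tagset_size]
        segrules_data = buf[offset + tagset_size:]               # remainder of the image
        return tagset_data, segrules_data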
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
... | ... | @@ -13,8 +13,13 @@ from morfeuszbuilder.segrules import rulesNFA |
13 | 13 | |
14 | 14 | class RulesParser(object): |
15 | 15 | |
16 | - def __init__(self, tagset): | |
16 | + PARSE4GENERATOR = 1 | |
17 | + PARSE4ANALYZER = 2 | |
18 | + | |
19 | + def __init__(self, tagset, rulesType): | |
17 | 20 | self.tagset = tagset |
21 | + assert rulesType in (RulesParser.PARSE4GENERATOR, RulesParser.PARSE4ANALYZER) | |
22 | + self.rulesType = rulesType | |
18 | 23 | |
19 | 24 | def _getKey2Defs(self, segtypesConfigFile): |
20 | 25 | res = {} |
... | ... | @@ -29,7 +34,7 @@ class RulesParser(object): |
29 | 34 | |
30 | 35 | def parse(self, filename): |
31 | 36 | |
32 | - segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types']) | |
37 | + segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'generator combinations', 'tags', 'lexemes', 'segment types']) | |
33 | 38 | key2Defs = self._getKey2Defs(segtypesConfigFile) |
34 | 39 | segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile) |
35 | 40 | |
... | ... | @@ -47,7 +52,8 @@ class RulesParser(object): |
47 | 52 | nfa = rulesNFA.RulesNFA() |
48 | 53 | if not firstNFA: |
49 | 54 | firstNFA = nfa |
50 | - combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations') | |
55 | + section = 'combinations' if self.rulesType == RulesParser.PARSE4ANALYZER else 'generator combinations' | |
56 | + combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection(section) | |
51 | 57 | combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs, filename)) |
52 | 58 | for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename): |
53 | 59 | # print rule |
... | ... | @@ -83,7 +89,10 @@ class RulesParser(object): |
83 | 89 | unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule |
84 | 90 | oneOfRule = delimitedList(unaryRule, delim='|') |
85 | 91 | complexRule = unaryRule ^ oneOfRule |
86 | - concatRule = OneOrMore(complexRule) | |
92 | + if self.rulesType == RulesParser.PARSE4ANALYZER: | |
93 | + concatRule = OneOrMore(complexRule) | |
94 | + else: | |
95 | + concatRule = ZeroOrMore(shiftOrthRule) + tagRule | |
87 | 96 | rule << concatRule |
88 | 97 | |
89 | 98 | tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper)) |
... | ... |
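
The grammar fork above is the heart of the change: analyzer rules keep the full combination language (concatenation, alternatives, * and +), while generator rules are restricted to zero or more orth-shifting segments followed by exactly one tag rule, matching entries like "prefs> nomina" in the new [generator combinations] section. A toy pyparsing fragment with placeholder element names, showing only the two accepted shapes:

    from pyparsing import Literal, OneOrMore, Word, ZeroOrMore, alphas

    seg = Word(alphas + '_')
    shift_orth = seg + Literal('>').suppress()     # "prefs>" shifts orth onto the next segment

    analyzer_rule = OneOrMore(shift_orth | seg)    # simplified combination language
    generator_rule = ZeroOrMore(shift_orth) + seg  # e.g. "prefs> nomina"

    print(generator_rule.parseString('prefs> nomina'))   # -> ['prefs', 'nomina']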
fsabuilder/morfeuszbuilder/tagset/segtypes.py
... | ... | @@ -15,6 +15,7 @@ class Segtypes(object): |
15 | 15 | |
16 | 16 | self.filename = segrulesConfigFile.filename |
17 | 17 | |
18 | + self.segtypes = set() | |
18 | 19 | self.segtype2Segnum = {} |
19 | 20 | self.segnum2Segtype = {} |
20 | 21 | self.patternsList = [] |
... | ... | @@ -22,6 +23,7 @@ class Segtypes(object): |
22 | 23 | self._tagnum2Segnum = {} |
23 | 24 | self._lemmaTagnum2Segnum = {} |
24 | 25 | |
26 | + self._readSegtypes(segrulesConfigFile) | |
25 | 27 | self._readLexemes(segrulesConfigFile) |
26 | 28 | self._readTags(segrulesConfigFile) |
27 | 29 | self._indexSegnums() |
... | ... | @@ -32,6 +34,20 @@ class Segtypes(object): |
32 | 34 | if not cond: |
33 | 35 | raise exceptions.ConfigFileException(self.filename, lineNum, msg) |
34 | 36 | |
37 | + def _readSegtypes(self, segrulesConfigFile): | |
38 | + for lineNum, line in segrulesConfigFile.enumerateLinesInSection('segment types'): | |
39 | + assert type(line) == unicode | |
40 | + self._validate( | |
41 | + u'Segment type must be a single word', | |
42 | + lineNum, | |
43 | + re.match(r'^\w+$', line)) | |
44 | + self._validate( | |
45 | + u'Segment type already defined: "%s"' % line, | |
46 | + lineNum, | |
47 | + line not in self.segtypes) | |
48 | + self.segtypes.add(line) | |
49 | + | |
50 | + | |
35 | 51 | def _readTags(self, segrulesConfigFile): |
36 | 52 | gotWildcardPattern = False |
37 | 53 | for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'): |
... | ... | @@ -42,6 +58,10 @@ class Segtypes(object): |
42 | 58 | len(splitLine) == 2) |
43 | 59 | segtype, pattern = splitLine |
44 | 60 | self._validate( |
61 | + u'Undeclared segment type: "%s"' % segtype, | |
62 | + lineNum, | |
63 | + segtype in self.segtypes) | |
64 | + self._validate( | |
45 | 65 | u'Segment type must be a lowercase alphanumeric with optional underscores', |
46 | 66 | lineNum, |
47 | 67 | re.match(r'[a-z_]+', segtype)) |
... | ... | @@ -78,6 +98,10 @@ class Segtypes(object): |
78 | 98 | for lineNum, line in segrulesConfigFile.enumerateLinesInSection('lexemes'): |
79 | 99 | segtype, pattern = line.strip().split('\t') |
80 | 100 | self._validate( |
101 | + u'Undeclared segment type: "%s"' % segtype, | |
102 | + lineNum, | |
103 | + segtype in self.segtypes) | |
104 | + self._validate( | |
81 | 105 | u'Segment type must be a lowercase alphanumeric with optional underscores', |
82 | 106 | lineNum, |
83 | 107 | re.match(r'[a-z_]+', segtype)) |
... | ... |
input/segmenty.dat
... | ... | @@ -111,7 +111,7 @@ moze_interp(z_on_agl) |
111 | 111 | moze_interp(z_on_agl on_agl) |
112 | 112 | |
113 | 113 | # A number written as a sequence of digits: |
114 | -moze_interp( dig ) | |
114 | +moze_interp( dig>* dig ) | |
115 | 115 | |
116 | 116 | # Prefixal formations |
117 | 117 | #### need to split out the appropriate samodz segments! |
... | ... | @@ -154,6 +154,10 @@ moze_interp( fin dywiz li) |
154 | 154 | #moze_interp( praet_sg_na li) |
155 | 155 | #moze_interp( fin li) |
156 | 156 | |
157 | +[generator combinations] | |
158 | +prefs> nomina | |
159 | +nomina | |
160 | + | |
157 | 161 | [segment types] |
158 | 162 | naj |
159 | 163 | nie |
... | ... | @@ -164,6 +168,8 @@ dig |
164 | 168 | adja |
165 | 169 | adj |
166 | 170 | adj_sup |
171 | +adj_com | |
172 | +fin | |
167 | 173 | negat |
168 | 174 | on_agl |
169 | 175 | z_on_agl |
... | ... | @@ -176,8 +182,42 @@ praet_sg_agl |
176 | 182 | praet_sg_na |
177 | 183 | praet_sg |
178 | 184 | praet_pl |
185 | +z_aglt | |
186 | +by | |
187 | +li | |
188 | +nomina | |
189 | +adjectiva | |
190 | +verba_imperf | |
191 | +dywiz | |
192 | +kropka | |
179 | 193 | samodz |
180 | 194 | |
195 | +[lexemes] | |
196 | +z_aglt aby:comp | |
197 | +z_aglt bowiem:comp | |
198 | +by by:qub | |
199 | +li li:qub | |
200 | +z_aglt by:comp | |
201 | +z_aglt cóż:subst | |
202 | +z_aglt czemu:adv | |
203 | +z_aglt czyżby:qub | |
204 | +z_aglt choćby:comp | |
205 | +z_aglt chociażby:comp | |
206 | +z_aglt dlaczego:adv | |
207 | +z_aglt dopóki:comp | |
208 | +z_aglt dopóty:conj | |
209 | +z_aglt gdyby:comp | |
210 | +z_aglt gdzie:qub | |
211 | +z_aglt gdzie:adv | |
212 | +z_aglt jakby:comp | |
213 | +z_aglt jakoby:comp | |
214 | +z_aglt kiedy:adv | |
215 | +z_aglt kiedy:comp | |
216 | +z_aglt tylko:qub | |
217 | +z_aglt żeby:comp | |
218 | +dywiz -:interp | |
219 | +kropka .:interp | |
220 | + | |
181 | 221 | [tags] |
182 | 222 | naj naj |
183 | 223 | nie nie |
... | ... | @@ -221,29 +261,3 @@ verba_imperf inf:imperf |
221 | 261 | verba_imperf imps:imperf |
222 | 262 | verba_imperf impt:%:imperf |
223 | 263 | samodz % |
224 | - | |
225 | -[lexemes] | |
226 | -z_aglt aby:comp | |
227 | -z_aglt bowiem:comp | |
228 | -by by:qub | |
229 | -li li:qub | |
230 | -z_aglt by:comp | |
231 | -z_aglt cóż:subst | |
232 | -z_aglt czemu:adv | |
233 | -z_aglt czyżby:qub | |
234 | -z_aglt choćby:comp | |
235 | -z_aglt chociażby:comp | |
236 | -z_aglt dlaczego:adv | |
237 | -z_aglt dopóki:comp | |
238 | -z_aglt dopóty:conj | |
239 | -z_aglt gdyby:comp | |
240 | -z_aglt gdzie:qub | |
241 | -z_aglt gdzie:adv | |
242 | -z_aglt jakby:comp | |
243 | -z_aglt jakoby:comp | |
244 | -z_aglt kiedy:adv | |
245 | -z_aglt kiedy:comp | |
246 | -z_aglt tylko:qub | |
247 | -z_aglt żeby:comp | |
248 | -dywiz -:interp | |
249 | -kropka .:interp | |
... | ... |
input/segmenty1.dat
... | ... | @@ -9,19 +9,22 @@ praet=split composite |
9 | 9 | |
10 | 10 | dig>* dig |
11 | 11 | (adja dywiz)+ adj |
12 | -#dig!>+ | |
13 | -#dig!> dig!> dig!> | |
14 | 12 | naj> adj_sup |
15 | 13 | |
14 | +[generator combinations] | |
15 | + | |
16 | 16 | [segment types] |
17 | 17 | naj |
18 | 18 | nie |
19 | 19 | prefs |
20 | 20 | prefv |
21 | +prefa | |
21 | 22 | dig |
22 | 23 | adja |
23 | 24 | adj |
24 | 25 | adj_sup |
26 | +adj_com | |
27 | +fin | |
25 | 28 | negat |
26 | 29 | on_agl |
27 | 30 | z_on_agl |
... | ... | @@ -34,6 +37,14 @@ praet_sg_agl |
34 | 37 | praet_sg_na |
35 | 38 | praet_sg |
36 | 39 | praet_pl |
40 | +z_aglt | |
41 | +by | |
42 | +li | |
43 | +nomina | |
44 | +adjectiva | |
45 | +verba_imperf | |
46 | +dywiz | |
47 | +kropka | |
37 | 48 | samodz |
38 | 49 | |
39 | 50 | [tags] |
... | ... |
morfeusz/CMakeLists.txt
... | ... | @@ -2,13 +2,13 @@ |
2 | 2 | ########## generate default dictionary data ################# |
3 | 3 | add_custom_command ( |
4 | 4 | OUTPUT "${INPUT_DICTIONARY_CPP}" |
5 | - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" "--tagset-file=${INPUT_TAGSET}" "--segments-file=${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1 --trim-supneg | |
5 | + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1 --trim-supneg | |
6 | 6 | DEPENDS "${INPUT_DICTIONARY}" |
7 | 7 | COMMENT "Building default dictionary C++ file" |
8 | 8 | ) |
9 | 9 | add_custom_command ( |
10 | 10 | OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}" |
11 | - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" "--tagset-file=${INPUT_TAGSET}" --cpp --serialization-method=V1 | |
11 | + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1 | |
12 | 12 | DEPENDS "${INPUT_DICTIONARY}" |
13 | 13 | COMMENT "Building default dictionary C++ file" |
14 | 14 | ) |
... | ... |
morfeusz/EncodedInterpretation.hpp
... | ... | @@ -16,17 +16,18 @@ |
16 | 16 | /* |
17 | 17 | * Lemma in a compressed format (as in an automaton) |
18 | 18 | */ |
19 | -struct EncodedLemma { | |
19 | +struct EncodedForm { | |
20 | 20 | int suffixToCut; |
21 | 21 | std::string suffixToAdd; |
22 | 22 | std::vector<bool> casePattern; |
23 | + std::string prefixToAdd; | |
23 | 24 | }; |
24 | 25 | |
25 | 26 | /* |
26 | 27 | * Internal representation of an interpretation - with lemma encoded |
27 | 28 | */ |
28 | 29 | struct EncodedInterpretation { |
29 | - EncodedLemma lemma; | |
30 | + EncodedForm value; | |
30 | 31 | unsigned char type; |
31 | 32 | int tag; |
32 | 33 | int nameClassifier; |
... | ... |
morfeusz/Environment.cpp
... | ... | @@ -6,21 +6,57 @@ |
6 | 6 | */ |
7 | 7 | |
8 | 8 | #include "Environment.hpp" |
9 | +#include "InterpretedChunksDecoder.hpp" | |
10 | +#include "MorphDeserializer.hpp" | |
9 | 11 | #include "exceptions.hpp" |
10 | 12 | |
13 | +//class InterpretedChunksDecoder4Analyzer; | |
14 | +//class InterpretedChunksDecoder4Generator; | |
15 | + | |
16 | +static Deserializer<vector<InterpsGroup> >* initializeDeserializer() { | |
17 | + static Deserializer < vector < InterpsGroup > > *deserializer | |
18 | + = new MorphDeserializer(); | |
19 | + return deserializer; | |
20 | +} | |
21 | + | |
22 | +static SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map) { | |
23 | + SegrulesOptions opts; | |
24 | + opts["aggl"] = "isolated"; | |
25 | + opts["praet"] = "split"; | |
26 | + return (*(map.find(opts))).second; | |
27 | +} | |
28 | + | |
29 | +static void deleteSegrulesFSAs(std::map<SegrulesOptions, SegrulesFSA*>& fsasMap) { | |
30 | + for ( | |
31 | + std::map<SegrulesOptions, SegrulesFSA*>::iterator it = fsasMap.begin(); | |
32 | + it != fsasMap.end(); | |
33 | + ++it) { | |
34 | + delete it->second; | |
35 | + } | |
36 | + fsasMap.clear(); | |
37 | +} | |
38 | + | |
11 | 39 | Environment::Environment( |
12 | - const Tagset& analyzerTagset, | |
13 | - const Tagset& generatorTagset, | |
14 | - MorfeuszCharset charset) | |
40 | + MorfeuszCharset charset, | |
41 | + MorfeuszProcessorType processorType, | |
42 | + const unsigned char* fsaFileStartPtr) | |
15 | 43 | : currentCharsetConverter(getCharsetConverter(charset)), |
16 | 44 | utf8CharsetConverter(), |
17 | 45 | isoCharsetConverter(), |
18 | 46 | cp1250CharsetConverter(), |
19 | 47 | cp852CharsetConverter(), |
20 | - analyzerTagset(analyzerTagset), | |
21 | - generatorTagset(generatorTagset), | |
22 | - caseConverter() { | |
23 | - | |
48 | + caseConverter(), | |
49 | + tagset(fsaFileStartPtr), | |
50 | + fsaFileStartPtr(fsaFileStartPtr), | |
51 | + fsa(FSAType::getFSA(fsaFileStartPtr, *initializeDeserializer())), | |
52 | + segrulesFSAsMap(createSegrulesFSAsMap(fsaFileStartPtr)), | |
53 | + currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap)), | |
54 | + isFromFile(false), | |
55 | + chunksDecoder( | |
56 | + processorType == ANALYZER | |
57 | + ? (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Analyzer(*this) | |
58 | + : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)) | |
59 | + { | |
24 | 60 | } |
25 | 61 | |
26 | 62 | const CharsetConverter* Environment::getCharsetConverter(MorfeuszCharset charset) const { |
... | ... | @@ -39,6 +75,12 @@ const CharsetConverter* Environment::getCharsetConverter(MorfeuszCharset charset |
39 | 75 | } |
40 | 76 | |
41 | 77 | Environment::~Environment() { |
78 | + delete this->fsa; | |
79 | + if (this->isFromFile) { | |
80 | + deleteSegrulesFSAs(this->segrulesFSAsMap); | |
81 | + delete this->fsaFileStartPtr; | |
82 | + } | |
83 | + delete this->chunksDecoder; | |
42 | 84 | } |
43 | 85 | |
44 | 86 | void Environment::setCharset(MorfeuszCharset charset) { |
... | ... | @@ -49,22 +91,38 @@ const CharsetConverter& Environment::getCharsetConverter() const { |
49 | 91 | return *this->currentCharsetConverter; |
50 | 92 | } |
51 | 93 | |
52 | -void Environment::setAnalyzerTagset(const Tagset& tagset) { | |
53 | - this->analyzerTagset = tagset; | |
94 | +const CaseConverter& Environment::getCaseConverter() const { | |
95 | + return this->caseConverter; | |
54 | 96 | } |
55 | 97 | |
56 | -const Tagset& Environment::getAnalyzerTagset() const { | |
57 | - return this->analyzerTagset; | |
98 | +void Environment::setTagset(const Tagset& tagset) { | |
99 | + this->tagset = tagset; | |
58 | 100 | } |
59 | 101 | |
60 | -void Environment::setGeneratorTagset(const Tagset& tagset) { | |
61 | - this->generatorTagset = tagset; | |
102 | +const Tagset& Environment::getTagset() const { | |
103 | + return this->tagset; | |
62 | 104 | } |
63 | 105 | |
64 | -const Tagset& Environment::getGeneratorTagset() const { | |
65 | - return this->generatorTagset; | |
106 | +void Environment::setFSAFile(const std::string& filename) { | |
107 | + if (this->isFromFile) { | |
108 | + delete this->fsa; | |
109 | + deleteSegrulesFSAs(this->segrulesFSAsMap); | |
110 | + delete this->fsaFileStartPtr; | |
111 | + } | |
112 | + this->fsaFileStartPtr = readFile<unsigned char>(filename.c_str()); | |
113 | + this->fsa = FSA< vector<InterpsGroup> > ::getFSA(fsaFileStartPtr, *initializeDeserializer()); | |
114 | + this->segrulesFSAsMap = createSegrulesFSAsMap(this->fsaFileStartPtr); | |
115 | + this->isFromFile = true; | |
66 | 116 | } |
67 | 117 | |
68 | -const CaseConverter& Environment::getCaseConverter() const { | |
69 | - return this->caseConverter; | |
118 | +const SegrulesFSA& Environment::getCurrentSegrulesFSA() const { | |
119 | + return *(this->currSegrulesFSA); | |
120 | +} | |
121 | + | |
122 | +const FSAType& Environment::getFSA() const { | |
123 | + return *(this->fsa); | |
124 | +} | |
125 | + | |
126 | +const InterpretedChunksDecoder& Environment::getInterpretedChunksDecoder() const { | |
127 | + return *(this->chunksDecoder); | |
70 | 128 | } |
... | ... |
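
Environment now owns everything specific to one processing direction: the FSA image and automaton, the segmentation-rule automata, the tagset, and a decoder picked by MorfeuszProcessorType. A toy, self-contained Python analogue of that ownership (all classes are stubs; the destructor's lifetime handling is omitted):

    ANALYZER, GENERATOR = range(2)

    class AnalyzerDecoder(object): pass
    class GeneratorDecoder(object): pass

    class Environment(object):
        def __init__(self, processor_type, image):
            self.image = image    # stands in for fsaFileStartPtr and the FSAs built from it
            self.decoder = (AnalyzerDecoder() if processor_type == ANALYZER
                            else GeneratorDecoder())

    analyzer_env = Environment(ANALYZER, b'...default fsa...')
    generator_env = Environment(GENERATOR, b'...default synth fsa...')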
morfeusz/Environment.hpp
... | ... | @@ -8,28 +8,44 @@ |
8 | 8 | #ifndef ENVIRONMENT_HPP |
9 | 9 | #define ENVIRONMENT_HPP |
10 | 10 | |
11 | +#include <vector> | |
12 | + | |
13 | +class InterpretedChunksDecoder; | |
14 | + | |
11 | 15 | #include "charset/CaseConverter.hpp" |
12 | 16 | #include "charset/CharsetConverter.hpp" |
17 | +#include "fsa/fsa.hpp" | |
18 | +#include "segrules/segrules.hpp" | |
13 | 19 | #include "const.hpp" |
14 | 20 | #include "Tagset.hpp" |
21 | +//#include "InterpretedChunksDecoder.hpp" | |
22 | +#include "InterpsGroup.hpp" | |
15 | 23 | |
24 | +typedef FSA< std::vector<InterpsGroup > > FSAType; | |
16 | 25 | |
17 | 26 | class Environment { |
18 | 27 | public: |
19 | 28 | Environment( |
20 | - const Tagset& analyzerTagset, | |
21 | - const Tagset& generatorTagset, | |
22 | - MorfeuszCharset charset); | |
29 | + MorfeuszCharset charset, | |
30 | + MorfeuszProcessorType morfeuszProcessor, | |
31 | + const unsigned char* fileStartPtr); | |
32 | + | |
23 | 33 | void setCharset(MorfeuszCharset charset); |
34 | + | |
24 | 35 | const CharsetConverter& getCharsetConverter() const; |
25 | 36 | |
26 | - void setAnalyzerTagset(const Tagset& tagset); | |
27 | - const Tagset& getAnalyzerTagset() const; | |
37 | + const CaseConverter& getCaseConverter() const; | |
28 | 38 | |
29 | - void setGeneratorTagset(const Tagset& tagset); | |
30 | - const Tagset& getGeneratorTagset() const; | |
39 | + void setTagset(const Tagset& tagset); | |
40 | + const Tagset& getTagset() const; | |
31 | 41 | |
32 | - const CaseConverter& getCaseConverter() const; | |
42 | + void setFSAFile(const std::string& filename); | |
43 | + | |
44 | + const SegrulesFSA& getCurrentSegrulesFSA() const; | |
45 | + | |
46 | + const FSAType& getFSA() const; | |
47 | + | |
48 | + const InterpretedChunksDecoder& getInterpretedChunksDecoder() const; | |
33 | 49 | |
34 | 50 | virtual ~Environment(); |
35 | 51 | private: |
... | ... | @@ -38,9 +54,16 @@ private: |
38 | 54 | const ISO8859_2_CharsetConverter isoCharsetConverter; |
39 | 55 | const Windows_1250_CharsetConverter cp1250CharsetConverter; |
40 | 56 | const CP852_CharsetConverter cp852CharsetConverter; |
41 | - Tagset analyzerTagset; | |
42 | - Tagset generatorTagset; | |
43 | 57 | const CaseConverter caseConverter; |
58 | + Tagset tagset; | |
59 | + | |
60 | + const unsigned char* fsaFileStartPtr; | |
61 | + const FSAType* fsa; | |
62 | + std::map<SegrulesOptions, SegrulesFSA*> segrulesFSAsMap; | |
63 | + const SegrulesFSA* currSegrulesFSA; | |
64 | + bool isFromFile; | |
65 | + | |
66 | + const InterpretedChunksDecoder* chunksDecoder; | |
44 | 67 | |
45 | 68 | const CharsetConverter* getCharsetConverter(MorfeuszCharset charset) const; |
46 | 69 | }; |
... | ... |
morfeusz/Generator.cpp
morfeusz/InterpretedChunksDecoder.hpp
... | ... | @@ -8,6 +8,9 @@ |
8 | 8 | #ifndef INTERPSGROUPDECODER_HPP |
9 | 9 | #define INTERPSGROUPDECODER_HPP |
10 | 10 | |
11 | +#include <string> | |
12 | +#include <vector> | |
13 | + | |
11 | 14 | #include "charset/CharsetConverter.hpp" |
12 | 15 | #include "EncodedInterpretation.hpp" |
13 | 16 | #include "InterpretedChunk.hpp" |
... | ... | @@ -20,50 +23,75 @@ public: |
20 | 23 | |
21 | 24 | InterpretedChunksDecoder(const Environment& env) |
22 | 25 | : env(env) { |
23 | - | |
24 | 26 | } |
25 | 27 | |
26 | - template <class OutputIterator> | |
27 | - OutputIterator decode( | |
28 | + virtual void decode( | |
28 | 29 | unsigned int startNode, |
29 | 30 | unsigned int endNode, |
30 | 31 | const InterpretedChunk& interpretedChunk, |
31 | - OutputIterator out) { | |
32 | - string orth; | |
33 | - string lemmaPrefix; | |
32 | + std::vector<MorphInterpretation>& out) const = 0; | |
33 | + | |
34 | + virtual ~InterpretedChunksDecoder() {} | |
35 | + | |
36 | +protected: | |
37 | + | |
38 | + void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& originalForm, std::string& decodedForm) const { | |
34 | 39 | for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { |
35 | 40 | const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; |
36 | - orth += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); | |
37 | - lemmaPrefix += convertLemma( | |
41 | + originalForm += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); | |
42 | + decodeForm( | |
38 | 43 | prefixChunk.lowercaseCodepoints, |
39 | - prefixChunk.interpsGroup.interps[0].lemma); | |
44 | + prefixChunk.interpsGroup.interps[0].value, | |
45 | + decodedForm); | |
40 | 46 | } |
47 | + } | |
48 | + | |
49 | + virtual void decodeForm( | |
50 | + const std::vector<uint32_t>& orth, | |
51 | + const EncodedForm& form, | |
52 | + std::string& res) const = 0; | |
53 | + | |
54 | + const Environment& env; | |
55 | +}; | |
56 | + | |
57 | +class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder { | |
58 | + | |
59 | +public: | |
60 | + InterpretedChunksDecoder4Analyzer(const Environment& env): InterpretedChunksDecoder(env) {} | |
61 | + | |
62 | + void decode( | |
63 | + unsigned int startNode, | |
64 | + unsigned int endNode, | |
65 | + const InterpretedChunk& interpretedChunk, | |
66 | + std::vector<MorphInterpretation>& out) const { | |
67 | + string orth; | |
68 | + string lemma; | |
69 | + convertPrefixes(interpretedChunk, orth, lemma); | |
41 | 70 | orth += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); |
42 | 71 | for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) { |
43 | 72 | const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i]; |
44 | - string lemma = lemmaPrefix + convertLemma( | |
73 | + decodeForm( | |
45 | 74 | interpretedChunk.lowercaseCodepoints, |
46 | - ei.lemma); | |
47 | - *out = MorphInterpretation( | |
75 | + ei.value, | |
76 | + lemma); | |
77 | + out.push_back(MorphInterpretation( | |
48 | 78 | startNode, endNode, |
49 | 79 | orth, lemma, |
50 | 80 | ei.tag, |
51 | 81 | ei.nameClassifier, |
52 | - env.getAnalyzerTagset(), | |
53 | - env.getCharsetConverter()); | |
54 | - ++out; | |
82 | + env.getTagset(), | |
83 | + env.getCharsetConverter())); | |
55 | 84 | } |
56 | - return out; | |
57 | 85 | } |
58 | 86 | |
59 | -private: | |
87 | +protected: | |
60 | 88 | |
61 | - string convertLemma( | |
89 | + void decodeForm( | |
62 | 90 | const vector<uint32_t>& orth, |
63 | - const EncodedLemma& lemma) { | |
64 | - string res; | |
91 | + const EncodedForm& lemma, | |
92 | + string& res) const { | |
65 | 93 | for (unsigned int i = 0; i < orth.size() - lemma.suffixToCut; i++) { |
66 | - uint32_t cp = | |
94 | + uint32_t cp = | |
67 | 95 | (i < lemma.casePattern.size() && lemma.casePattern[i]) |
68 | 96 | ? env.getCaseConverter().toTitle(orth[i]) |
69 | 97 | : orth[i]; |
... | ... | @@ -75,10 +103,56 @@ private: |
75 | 103 | uint32_t cp = UTF8CharsetConverter().next(suffixPtr, suffixEnd); |
76 | 104 | env.getCharsetConverter().append(cp, res); |
77 | 105 | } |
78 | - return res; | |
79 | 106 | } |
107 | +}; | |
80 | 108 | |
81 | - const Environment& env; | |
109 | +class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder { | |
110 | + | |
111 | +public: | |
112 | + InterpretedChunksDecoder4Generator(const Environment& env): InterpretedChunksDecoder(env) {} | |
113 | + | |
114 | + void decode( | |
115 | + unsigned int startNode, | |
116 | + unsigned int endNode, | |
117 | + const InterpretedChunk& interpretedChunk, | |
118 | + std::vector<MorphInterpretation>& out) const { | |
119 | + string orth; | |
120 | + string lemma; | |
121 | + convertPrefixes(interpretedChunk, lemma, orth); | |
122 | + lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); | |
123 | + for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) { | |
124 | + const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i]; | |
125 | + decodeForm( | |
126 | + interpretedChunk.originalCodepoints, | |
127 | + ei.value, | |
128 | + orth); | |
129 | + out.push_back(MorphInterpretation( | |
130 | + startNode, endNode, | |
131 | + orth, lemma, | |
132 | + ei.tag, | |
133 | + ei.nameClassifier, | |
134 | + env.getTagset(), | |
135 | + env.getCharsetConverter())); | |
136 | + } | |
137 | + } | |
138 | + | |
139 | +private: | |
140 | + | |
141 | + void decodeForm( | |
142 | + const vector<uint32_t>& lemma, | |
143 | + const EncodedForm& orth, | |
144 | + string& res) const { | |
145 | + res += orth.prefixToAdd; | |
146 | + for (unsigned int i = 0; i < lemma.size() - orth.suffixToCut; i++) { | |
147 | + env.getCharsetConverter().append(lemma[i], res); | |
148 | + } | |
149 | + const char* suffixPtr = orth.suffixToAdd.c_str(); | |
150 | + const char* suffixEnd = suffixPtr + orth.suffixToAdd.length(); | |
151 | + while (suffixPtr != suffixEnd) { | |
152 | + uint32_t cp = UTF8CharsetConverter().next(suffixPtr, suffixEnd); | |
153 | + env.getCharsetConverter().append(cp, res); | |
154 | + } | |
155 | + } | |
82 | 156 | }; |
83 | 157 | |
84 | 158 | #endif /* INTERPSGROUPDECODER_HPP */ |
... | ... |
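
The two decodeForm overrides are mirror images: the analyzer automaton is keyed by orth and rebuilds the lemma (case pattern plus suffix cut/add), while the new generator path is keyed by lemma and rebuilds the orth (prefix add plus suffix cut/add). A Python sketch of both directions; the field names follow EncodedForm, and the sample word is purely illustrative:

    def decode_analyzer(orth, suffix_to_cut, suffix_to_add, case_pattern):
        res = ''
        for i in range(len(orth) - suffix_to_cut):
            title = i < len(case_pattern) and case_pattern[i]
            res += orth[i].upper() if title else orth[i]   # stands in for toTitle
        return res + suffix_to_add

    def decode_generator(lemma, prefix_to_add, suffix_to_cut, suffix_to_add):
        return prefix_to_add + lemma[:len(lemma) - suffix_to_cut] + suffix_to_add

    print(decode_generator('profesor', '', 0, 'owi'))   # -> 'profesorowi' (illustrative)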
morfeusz/Morfeusz.cpp
... | ... | @@ -24,12 +24,6 @@ |
24 | 24 | |
25 | 25 | using namespace std; |
26 | 26 | |
27 | -static Deserializer<vector<InterpsGroup> >* initializeAnalyzerDeserializer() { | |
28 | - static Deserializer < vector < InterpsGroup > > *deserializer | |
29 | - = new MorphDeserializer(); | |
30 | - return deserializer; | |
31 | -} | |
32 | - | |
33 | 27 | static MorfeuszOptions createDefaultOptions() { |
34 | 28 | MorfeuszOptions res; |
35 | 29 | res.caseSensitive = true; |
... | ... | @@ -37,95 +31,74 @@ static MorfeuszOptions createDefaultOptions() { |
37 | 31 | return res; |
38 | 32 | } |
39 | 33 | |
40 | -static SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map) { | |
41 | - SegrulesOptions opts; | |
42 | - opts["aggl"] = "isolated"; | |
43 | - opts["praet"] = "split"; | |
44 | - return (*(map.find(opts))).second; | |
45 | -} | |
46 | - | |
47 | 34 | Morfeusz::Morfeusz() |
48 | -: env(Tagset(DEFAULT_FSA), Tagset(DEFAULT_SYNTH_FSA), DEFAULT_MORFEUSZ_CHARSET), | |
49 | -analyzerPtr(DEFAULT_FSA), | |
50 | -analyzerFSA(FSAType::getFSA(analyzerPtr, *initializeAnalyzerDeserializer())), | |
51 | -segrulesFSAsMap(createSegrulesFSAsMap(analyzerPtr)), | |
52 | -currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap)), | |
53 | -isAnalyzerFSAFromFile(false), | |
54 | -generatorPtr(DEFAULT_SYNTH_FSA), | |
55 | -isGeneratorFSAFromFile(false), | |
56 | -generator(generatorPtr, env), | |
35 | +: analyzerEnv(DEFAULT_MORFEUSZ_CHARSET, ANALYZER, DEFAULT_FSA), | |
36 | +generatorEnv(DEFAULT_MORFEUSZ_CHARSET, GENERATOR, DEFAULT_SYNTH_FSA), | |
57 | 37 | options(createDefaultOptions()) { |
58 | 38 | |
59 | 39 | } |
60 | 40 | |
61 | -static void deleteSegrulesFSAs(std::map<SegrulesOptions, SegrulesFSA*>& fsasMap) { | |
62 | - for ( | |
63 | - std::map<SegrulesOptions, SegrulesFSA*>::iterator it = fsasMap.begin(); | |
64 | - it != fsasMap.end(); | |
65 | - ++it) { | |
66 | - delete it->second; | |
67 | - } | |
68 | - fsasMap.clear(); | |
69 | -} | |
70 | - | |
71 | 41 | void Morfeusz::setAnalyzerFile(const string& filename) { |
72 | - if (this->isAnalyzerFSAFromFile) { | |
73 | - delete this->analyzerFSA; | |
74 | - deleteSegrulesFSAs(this->segrulesFSAsMap); | |
75 | - delete this->analyzerPtr; | |
76 | - } | |
77 | - this->analyzerPtr = readFile<unsigned char>(filename.c_str()); | |
78 | - this->analyzerFSA = FSA< vector<InterpsGroup> > ::getFSA(analyzerPtr, *initializeAnalyzerDeserializer()); | |
79 | - this->segrulesFSAsMap = createSegrulesFSAsMap(analyzerPtr); | |
80 | - this->isAnalyzerFSAFromFile = true; | |
42 | + this->analyzerEnv.setFSAFile(filename); | |
43 | + // if (this->isAnalyzerFSAFromFile) { | |
44 | + // delete this->analyzerFSA; | |
45 | + // deleteSegrulesFSAs(this->analyzerSegrulesFSAsMap); | |
46 | + // delete this->analyzerPtr; | |
47 | + // } | |
48 | + // this->analyzerPtr = readFile<unsigned char>(filename.c_str()); | |
49 | + // this->analyzerFSA = FSA< vector<InterpsGroup> > ::getFSA(analyzerPtr, *initializeAnalyzerDeserializer()); | |
50 | + // this->analyzerSegrulesFSAsMap = createSegrulesFSAsMap(analyzerPtr); | |
51 | + // this->isAnalyzerFSAFromFile = true; | |
81 | 52 | } |
82 | 53 | |
83 | 54 | void Morfeusz::setGeneratorFile(const string& filename) { |
84 | - if (this->isGeneratorFSAFromFile) { | |
85 | - delete this->generatorPtr; | |
86 | - } | |
87 | - this->generatorPtr = readFile<unsigned char>(filename.c_str()); | |
88 | - this->generator.setGeneratorPtr(generatorPtr); | |
55 | + this->generatorEnv.setFSAFile(filename); | |
56 | + // if (this->isGeneratorFSAFromFile) { | |
57 | + // delete this->generatorPtr; | |
58 | + // } | |
59 | + // this->generatorPtr = readFile<unsigned char>(filename.c_str()); | |
60 | + // this->generator.setGeneratorPtr(generatorPtr); | |
89 | 61 | } |
90 | 62 | |
91 | 63 | Morfeusz::~Morfeusz() { |
92 | - if (this->isAnalyzerFSAFromFile) { | |
93 | - delete this->analyzerFSA; | |
94 | - deleteSegrulesFSAs(this->segrulesFSAsMap); | |
95 | - delete this->analyzerPtr; | |
96 | - } | |
64 | + // if (this->isAnalyzerFSAFromFile) { | |
65 | + // delete this->analyzerFSA; | |
66 | + // deleteSegrulesFSAs(this->analyzerSegrulesFSAsMap); | |
67 | + // delete this->analyzerPtr; | |
68 | + // } | |
97 | 69 | } |
98 | 70 | |
99 | -void Morfeusz::analyzeOneWord( | |
71 | +void Morfeusz::processOneWord( | |
72 | + const Environment& env, | |
100 | 73 | const char*& inputStart, |
101 | 74 | const char* inputEnd, |
102 | 75 | int startNodeNum, |
103 | 76 | std::vector<MorphInterpretation>& results) const { |
104 | 77 | while (inputStart != inputEnd |
105 | - && isEndOfWord(this->env.getCharsetConverter().peek(inputStart, inputEnd))) { | |
106 | - this->env.getCharsetConverter().next(inputStart, inputEnd); | |
78 | + && isEndOfWord(env.getCharsetConverter().peek(inputStart, inputEnd))) { | |
79 | + env.getCharsetConverter().next(inputStart, inputEnd); | |
107 | 80 | } |
108 | 81 | vector<InterpretedChunk> accum; |
109 | 82 | FlexionGraph graph; |
110 | 83 | const char* currInput = inputStart; |
111 | - SegrulesFSA* segrulesFSA = this->currSegrulesFSA; | |
112 | - doAnalyzeOneWord(currInput, inputEnd, accum, graph, segrulesFSA->initialState); | |
84 | + const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA(); | |
85 | + doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph); | |
113 | 86 | if (!graph.empty()) { |
114 | - InterpretedChunksDecoder interpretedChunksDecoder(env); | |
87 | + const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); | |
115 | 88 | int srcNode = startNodeNum; |
116 | 89 | for (unsigned int i = 0; i < graph.getTheGraph().size(); i++) { |
117 | 90 | vector<FlexionGraph::Edge>& edges = graph.getTheGraph()[i]; |
118 | 91 | for (unsigned int j = 0; j < edges.size(); j++) { |
119 | 92 | FlexionGraph::Edge& e = edges[j]; |
120 | 93 | int targetNode = startNodeNum + e.nextNode; |
121 | - interpretedChunksDecoder.decode(srcNode, targetNode, e.chunk, back_inserter(results)); | |
94 | + interpretedChunksDecoder.decode(srcNode, targetNode, e.chunk, results); | |
122 | 95 | } |
123 | 96 | srcNode++; |
124 | 97 | } |
125 | 98 | // graph.getResults(*this->tagset, results); |
126 | 99 | } |
127 | 100 | else if (inputStart != inputEnd) { |
128 | - this->appendIgnotiumToResults(string(inputStart, currInput), startNodeNum, results); | |
101 | + this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results); | |
129 | 102 | } |
130 | 103 | inputStart = currInput; |
131 | 104 | } |
... | ... | @@ -139,109 +112,82 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { |
139 | 112 | from.orthWasShifted = true; |
140 | 113 | } |
141 | 114 | |
142 | -void Morfeusz::doAnalyzeOneWord( | |
115 | +void Morfeusz::doProcessOneWord( | |
116 | + const Environment& env, | |
143 | 117 | const char*& inputData, |
144 | 118 | const char* inputEnd, |
119 | + SegrulesState segrulesState, | |
145 | 120 | vector<InterpretedChunk>& accum, |
146 | - FlexionGraph& graph, | |
147 | - SegrulesState segrulesState) const { | |
148 | - // cerr << "doAnalyzeOneWord " << inputData << endl; | |
149 | - bool endOfWord = inputData == inputEnd; | |
121 | + FlexionGraph& graph) const { | |
122 | + cerr << "doAnalyzeOneWord " << inputData << endl; | |
123 | + bool endOfProcessing = inputData == inputEnd; | |
150 | 124 | const char* currInput = inputData; |
151 | - uint32_t codepoint = endOfWord ? 0 : this->env.getCharsetConverter().next(currInput, inputEnd); | |
125 | + uint32_t codepoint = endOfProcessing ? 0 : env.getCharsetConverter().next(currInput, inputEnd); | |
152 | 126 | // UnicodeChunk uchunk(*(this->charsetConverter), *(this->caseConverter)); |
153 | 127 | vector<uint32_t> originalCodepoints; |
154 | 128 | vector<uint32_t> lowercaseCodepoints; |
155 | 129 | |
156 | - StateType state = this->analyzerFSA->getInitialState(); | |
130 | + StateType state = env.getFSA().getInitialState(); | |
157 | 131 | |
158 | - while (!isEndOfWord(codepoint)) { | |
159 | - uint32_t lowerCP = this->env.getCaseConverter().toLower(codepoint); | |
132 | + while (!endOfProcessing) { | |
133 | + if (isEndOfWord(codepoint)) { | |
134 | + endOfProcessing = true; | |
135 | + } | |
136 | + cerr << "not end of word '" << string(currInput) << "'" << endl; | |
137 | + uint32_t lowerCP = env.getCaseConverter().toLower(codepoint); | |
160 | 138 | originalCodepoints.push_back(codepoint); |
161 | 139 | lowercaseCodepoints.push_back(lowerCP); |
162 | 140 | feedState(state, lowerCP, UTF8CharsetConverter()); |
163 | - codepoint = currInput == inputEnd ? 0 : this->env.getCharsetConverter().peek(currInput, inputEnd); | |
164 | - if (!isEndOfWord(codepoint)) { | |
165 | - if (state.isAccepting()) { | |
166 | - vector<InterpsGroup> val(state.getValue()); | |
167 | - for (unsigned int i = 0; i < val.size(); i++) { | |
168 | - InterpsGroup& ig = val[i]; | |
169 | - // newSegrulesState.proceedToNext(ig.type); | |
170 | - // this->currSegrulesFSA->proceedToNext(ig.type, segrulesStates, newSegrulesStates); | |
171 | - set<SegrulesState> newSegrulesStates; | |
172 | - currSegrulesFSA->proceedToNext(ig.type, segrulesState, newSegrulesStates); | |
173 | - for ( | |
174 | - set<SegrulesState>::iterator it = newSegrulesStates.begin(); | |
175 | - it != newSegrulesStates.end(); | |
176 | - it++) { | |
177 | - SegrulesState newSegrulesState = *it; | |
178 | - // bool shiftOrth = newSegrulesState.getLastTransitionValue() == 1; | |
179 | - // bool shiftOrthSameType = newSegrulesState.getLastTransitionValue() == 2; | |
180 | - InterpretedChunk ic = { | |
181 | - inputData, | |
182 | - originalCodepoints, | |
183 | - lowercaseCodepoints, | |
184 | - ig, | |
185 | - newSegrulesState.shiftOrthFromPrevious, | |
186 | - false, | |
187 | - vector<InterpretedChunk>() | |
188 | - }; | |
189 | - if (!accum.empty() && accum.back().shiftOrth) { | |
190 | - doShiftOrth(accum.back(), ic); | |
191 | - } | |
192 | - accum.push_back(ic); | |
193 | - const char* newCurrInput = currInput; | |
194 | - doAnalyzeOneWord(newCurrInput, inputEnd, accum, graph, newSegrulesState); | |
195 | - accum.pop_back(); | |
196 | - } | |
197 | - } | |
198 | - } | |
199 | - | |
200 | - this->env.getCharsetConverter().next(currInput, inputEnd); | |
201 | - } | |
202 | - } | |
203 | - // cerr << "end of word" << endl; | |
204 | - // we are at the end of word | |
205 | - if (state.isAccepting()) { | |
206 | - vector<InterpsGroup > val(state.getValue()); | |
207 | - for (unsigned int i = 0; i < val.size(); i++) { | |
208 | - InterpsGroup& ig = val[i]; | |
209 | - // cerr << "currInput=" << currInput << endl; | |
210 | - // cerr << "type=" << (int) ig.type << endl; | |
211 | - set<SegrulesState> newSegrulesStates; | |
212 | - currSegrulesFSA->proceedToNext(ig.type, segrulesState, newSegrulesStates); | |
213 | - for ( | |
214 | - set<SegrulesState>::iterator it = newSegrulesStates.begin(); | |
215 | - it != newSegrulesStates.end(); | |
216 | - it++) { | |
217 | - SegrulesState newSegrulesState = *it; | |
218 | - if (newSegrulesState.accepting) { | |
141 | + codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().peek(currInput, inputEnd); | |
142 | + if (state.isAccepting()) { | |
143 | + cerr << "accepting" << endl; | |
144 | + vector<InterpsGroup> val(state.getValue()); | |
145 | + for (unsigned int i = 0; i < val.size(); i++) { | |
146 | + InterpsGroup& ig = val[i]; | |
147 | + set<SegrulesState> newSegrulesStates; | |
148 | + env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates); | |
149 | + for ( | |
150 | + set<SegrulesState>::iterator it = newSegrulesStates.begin(); | |
151 | + it != newSegrulesStates.end(); | |
152 | + ++it) { | |
153 | + SegrulesState newSegrulesState = *it; | |
219 | 154 | InterpretedChunk ic = { |
220 | - inputData, | |
221 | - originalCodepoints, | |
222 | - lowercaseCodepoints, | |
223 | - ig, | |
224 | - newSegrulesState.shiftOrthFromPrevious, | |
155 | + inputData, | |
156 | + originalCodepoints, | |
157 | + lowercaseCodepoints, | |
158 | + ig, | |
159 | + newSegrulesState.shiftOrthFromPrevious, | |
225 | 160 | false, |
226 | - vector<InterpretedChunk>()}; | |
161 | + vector<InterpretedChunk>() | |
162 | + }; | |
227 | 163 | if (!accum.empty() && accum.back().shiftOrth) { |
228 | 164 | doShiftOrth(accum.back(), ic); |
229 | 165 | } |
230 | 166 | accum.push_back(ic); |
231 | - graph.addPath(accum); | |
167 | + if (isEndOfWord(codepoint)) { | |
168 | + cerr << "end of word inside " << currInput <<endl; | |
169 | + if (newSegrulesState.accepting) | |
170 | + graph.addPath(accum); | |
171 | + } | |
172 | + else { | |
173 | + const char* newCurrInput = currInput; | |
174 | + doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph); | |
175 | + } | |
232 | 176 | accum.pop_back(); |
233 | 177 | } |
234 | 178 | } |
235 | 179 | } |
236 | 180 | } |
181 | + cerr << "end of word " << currInput << endl; | |
237 | 182 | inputData = currInput; |
238 | 183 | } |
239 | 184 | |
240 | 185 | void Morfeusz::appendIgnotiumToResults( |
186 | + const Environment& env, | |
241 | 187 | const string& word, |
242 | 188 | int startNodeNum, |
243 | 189 | std::vector<MorphInterpretation>& results) const { |
244 | - MorphInterpretation interp = MorphInterpretation::createIgn(startNodeNum, word, env.getAnalyzerTagset(), env.getCharsetConverter()); | |
190 | + MorphInterpretation interp = MorphInterpretation::createIgn(startNodeNum, word, env.getTagset(), env.getCharsetConverter()); | |
245 | 191 | results.push_back(interp); |
246 | 192 | } |
247 | 193 | |
... | ... | @@ -258,7 +204,7 @@ void Morfeusz::analyze(const string& text, vector<MorphInterpretation>& results) |
258 | 204 | const char* inputEnd = input + text.length(); |
259 | 205 | while (input != inputEnd) { |
260 | 206 | int startNode = results.empty() ? 0 : results.back().getEndNode(); |
261 | - this->analyzeOneWord(input, inputEnd, startNode, results); | |
207 | + this->processOneWord(this->analyzerEnv, input, inputEnd, startNode, results); | |
262 | 208 | } |
263 | 209 | } |
264 | 210 | |
... | ... | @@ -271,12 +217,18 @@ ResultsIterator Morfeusz::generate(const string& text) const { |
271 | 217 | } |
272 | 218 | |
273 | 219 | void Morfeusz::generate(const string& text, vector<MorphInterpretation>& results) const { |
274 | - this->generator.generate(text, results); | |
220 | + const char* input = text.c_str(); | |
221 | + const char* inputEnd = input + text.length(); | |
222 | + while (input != inputEnd) { | |
223 | + int startNode = results.empty() ? 0 : results.back().getEndNode(); | |
224 | + this->processOneWord(this->generatorEnv, input, inputEnd, startNode, results); | |
225 | + } | |
275 | 226 | } |
276 | 227 | |
277 | 228 | void Morfeusz::setCharset(MorfeuszCharset charset) { |
278 | 229 | this->options.encoding = charset; |
279 | - this->env.setCharset(charset); | |
230 | + this->analyzerEnv.setCharset(charset); | |
231 | + this->generatorEnv.setCharset(charset); | |
280 | 232 | } |
281 | 233 | |
282 | 234 | ResultsIterator::ResultsIterator(vector<MorphInterpretation>& res) { |
... | ... |
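
After this commit analyze() and generate() are the same driver run over different Environments: both step through the text word by word and hand each word to processOneWord together with their own automaton, segmentation rules and decoder. A runnable toy outline of that loop (process_one_word here is a whitespace stub standing in for the real recursive matcher):

    def process_one_word(env, text, pos, start_node, results):
        end = text.find(' ', pos)
        end = len(text) if end < 0 else end + 1
        results.append((start_node, start_node + 1, text[pos:end].strip()))
        return end

    def process(env, text):
        results, pos = [], 0
        while pos < len(text):
            start_node = results[-1][1] if results else 0
            pos = process_one_word(env, text, pos, start_node, results)
        return results

    print(process(None, 'Ala ma kota'))   # -> [(0, 1, 'Ala'), (1, 2, 'ma'), (2, 3, 'kota')]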
morfeusz/Morfeusz.hpp
... | ... | @@ -33,7 +33,6 @@ |
33 | 33 | class Morfeusz; |
34 | 34 | class ResultsIterator; |
35 | 35 | |
36 | -typedef FSA< std::vector<InterpsGroup > > FSAType; | |
37 | 36 | typedef State< std::vector<InterpsGroup > > StateType; |
38 | 37 | |
39 | 38 | class Morfeusz { |
... | ... | @@ -102,33 +101,38 @@ public: |
102 | 101 | friend class ResultsIterator; |
103 | 102 | private: |
104 | 103 | |
105 | - void analyzeOneWord( | |
104 | + void processOneWord( | |
105 | + const Environment& env, | |
106 | 106 | const char*& inputData, |
107 | 107 | const char* inputEnd, |
108 | 108 | int startNodeNum, |
109 | 109 | std::vector<MorphInterpretation>& result) const; |
110 | 110 | |
111 | - void doAnalyzeOneWord( | |
111 | + void doProcessOneWord( | |
112 | + const Environment& env, | |
112 | 113 | const char*& inputData, |
113 | 114 | const char* inputEnd, |
115 | + SegrulesState segrulesState, | |
114 | 116 | std::vector<InterpretedChunk>& accum, |
115 | - FlexionGraph& graph, | |
116 | - SegrulesState segrulesState) const; | |
117 | + FlexionGraph& graph) const; | |
117 | 118 | |
118 | 119 | void appendIgnotiumToResults( |
120 | + const Environment& env, | |
119 | 121 | const std::string& word, |
120 | 122 | int startNodeNum, |
121 | 123 | std::vector<MorphInterpretation>& results) const; |
122 | - Environment env; | |
123 | - const unsigned char* analyzerPtr; | |
124 | - FSAType* analyzerFSA; | |
125 | - std::map<SegrulesOptions, SegrulesFSA*> segrulesFSAsMap; | |
126 | - SegrulesFSA* currSegrulesFSA; | |
127 | - bool isAnalyzerFSAFromFile; | |
128 | - | |
129 | - const unsigned char* generatorPtr; | |
130 | - bool isGeneratorFSAFromFile; | |
131 | - Generator generator; | |
124 | + Environment analyzerEnv; | |
125 | + Environment generatorEnv; | |
126 | +// const unsigned char* analyzerPtr; | |
127 | +// FSAType* analyzerFSA; | |
128 | +// std::map<SegrulesOptions, SegrulesFSA*> analyzerSegrulesFSAsMap; | |
129 | +// SegrulesFSA* currAnalyzerSegrulesFSA; | |
130 | +// bool isAnalyzerFSAFromFile; | |
131 | +// | |
132 | +// const unsigned char* generatorPtr; | |
133 | +// FSAType* generatorFSA; | |
134 | +// bool isGeneratorFSAFromFile; | |
135 | +// Generator generator; | |
132 | 136 | |
133 | 137 | MorfeuszOptions options; |
134 | 138 | }; |
... | ... |
morfeusz/MorphDeserializer.cpp
... | ... | @@ -23,7 +23,7 @@ MorphDeserializer::MorphDeserializer() { |
23 | 23 | MorphDeserializer::~MorphDeserializer() { |
24 | 24 | } |
25 | 25 | |
26 | -static void deserializeLemma(const unsigned char*& ptr, EncodedLemma& lemma) { | |
26 | +static void deserializeLemma(const unsigned char*& ptr, EncodedForm& lemma) { | |
27 | 27 | // XXX watch out for data validity |
28 | 28 | lemma.suffixToCut = *ptr; |
29 | 29 | ptr++; |
... | ... | @@ -64,7 +64,7 @@ static void deserializeLemma(const unsigned char*& ptr, EncodedLemma& lemma) { |
64 | 64 | static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) { |
65 | 65 | interp.type = *ptr; |
66 | 66 | ptr++; |
67 | - deserializeLemma(ptr, interp.lemma); | |
67 | + deserializeLemma(ptr, interp.value); | |
68 | 68 | interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); |
69 | 69 | ptr += 2; |
70 | 70 | interp.nameClassifier = *ptr; |
... | ... |
morfeusz/const.hpp
morfeusz/fsa/fsa_impl.hpp
... | ... | @@ -88,8 +88,6 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial |
88 | 88 | return new CompressedFSA1<T>(startPtr, deserializer); |
89 | 89 | case 2: |
90 | 90 | return new CompressedFSA2<T>(startPtr, deserializer); |
91 | - case 128: | |
92 | - return new SimpleFSA<T>(startPtr, deserializer, true); | |
93 | 91 | default: |
94 | 92 | std::ostringstream oss; |
95 | 93 | oss << "Invalid implementation number: " << versionNum << ", should be: " << VERSION_NUM; |
... | ... |
nbproject/configurations.xml
... | ... | @@ -106,20 +106,14 @@ |
106 | 106 | </makeTool> |
107 | 107 | </makefileType> |
108 | 108 | <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
109 | - <ccTool flags="1"> | |
110 | - </ccTool> | |
111 | 109 | </item> |
112 | 110 | <item path="../default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
113 | - <ccTool flags="1"> | |
114 | - </ccTool> | |
115 | 111 | </item> |
116 | 112 | <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
117 | 113 | </item> |
118 | 114 | <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
119 | 115 | </item> |
120 | 116 | <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> |
121 | - <ccTool flags="1"> | |
122 | - </ccTool> | |
123 | 117 | </item> |
124 | 118 | <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" |
125 | 119 | ex="false" |
... | ... | @@ -414,26 +408,18 @@ |
414 | 408 | </ccTool> |
415 | 409 | </item> |
416 | 410 | <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> |
417 | - <ccTool flags="1"> | |
418 | - </ccTool> | |
419 | 411 | </item> |
420 | 412 | <item path="morfeusz/charset/CharsetConverter.cpp" |
421 | 413 | ex="false" |
422 | 414 | tool="1" |
423 | 415 | flavor2="4"> |
424 | - <ccTool flags="1"> | |
425 | - </ccTool> | |
426 | 416 | </item> |
427 | 417 | <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> |
428 | - <ccTool flags="1"> | |
429 | - </ccTool> | |
430 | 418 | </item> |
431 | 419 | <item path="morfeusz/charset/conversion_tables.cpp" |
432 | 420 | ex="false" |
433 | 421 | tool="1" |
434 | 422 | flavor2="4"> |
435 | - <ccTool flags="1"> | |
436 | - </ccTool> | |
437 | 423 | </item> |
438 | 424 | <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> |
439 | 425 | <ccTool flags="1"> |
... | ... | @@ -522,12 +508,8 @@ |
522 | 508 | ex="false" |
523 | 509 | tool="1" |
524 | 510 | flavor2="4"> |
525 | - <ccTool flags="1"> | |
526 | - </ccTool> | |
527 | 511 | </item> |
528 | 512 | <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> |
529 | - <ccTool flags="1"> | |
530 | - </ccTool> | |
531 | 513 | </item> |
532 | 514 | <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> |
533 | 515 | <ccTool flags="0"> |
... | ... |