Commit de0e960dbe882750998828aedd42322904c1255a

Authored by Michał Lenart
1 parent 00e66248

Making synthesis (generation) similar to analysis

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@113 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
CMakeLists.txt
... ... @@ -47,7 +47,7 @@ endif ()
47 47  
48 48 # SEGMENT_RULES_FILE
49 49 if ("${SEGMENT_RULES_FILE}" STREQUAL "")
50   - set (SEGMENT_RULES_FILE "${PROJECT_SOURCE_DIR}/input/segmenty.dat")
  50 + set (SEGMENT_RULES_FILE "${PROJECT_SOURCE_DIR}/input/segmenty1.dat")
51 51 endif ()
52 52  
53 53 message ("Will use ${INPUT_DICTIONARIES} as default dictionary input, ${INPUT_TAGSET} as tagset and ${SEGMENT_RULES_FILE} as segmentation rules")
... ...
fsabuilder/buildfsa.py
... ... @@ -137,8 +137,8 @@ def _parseOptions():
137 137 for filename in opts.inputFiles:
138 138 _checkOpen(filename, 'r')
139 139 _checkOpen(opts.outputFile, 'w')
  140 + _checkOption(opts.segmentsFile, parser, "Segment rules file is missing")
140 141 if opts.analyzer:
141   - _checkOption(opts.segmentsFile, parser, "Segment rules file is missing")
142 142 _checkOpen(opts.segmentsFile, 'r')
143 143  
144 144 if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2]:
... ... @@ -161,9 +161,9 @@ def _readPolimorfInput4Analyzer(inputFiles, tagset, encoder, segmentRulesManager
161 161 for entry in convertinput.PolimorfConverter4Analyzer(tagset, encoder, 'utf8', segmentRulesManager, trimSupneg).convert(_concatFiles(inputFiles)):
162 162 yield entry
163 163  
164   -def _readPolimorfInput4Generator(inputFiles, tagset, encoder):
  164 +def _readPolimorfInput4Generator(inputFiles, tagset, encoder, segmentRulesManager):
165 165 logging.info('reading generator data from %s', str(inputFiles))
166   - for entry in convertinput.PolimorfConverter4Generator(tagset, encoder, 'utf8').convert(_concatFiles(inputFiles)):
  166 + for entry in convertinput.PolimorfConverter4Generator(tagset, encoder, 'utf8', segmentRulesManager).convert(_concatFiles(inputFiles)):
167 167 yield entry
168 168  
169 169 def _readTrainData(trainFile):
... ... @@ -201,10 +201,10 @@ def buildAnalyzerFromPoliMorf(inputFiles, tagset, segmentRulesManager, trimSupne
201 201 _printStats(fsa)
202 202 return fsa
203 203  
204   -def buildGeneratorFromPoliMorf(inputFiles, tagset):
  204 +def buildGeneratorFromPoliMorf(inputFiles, tagset, segmentRulesManager):
205 205 encoder = encode.Encoder4Generator()
206 206 fsa = FSA(encoder, tagset)
207   - inputData = _readPolimorfInput4Generator(inputFiles, tagset, encoder)
  207 + inputData = _readPolimorfInput4Generator(inputFiles, tagset, encoder, segmentRulesManager)
208 208 for word, data in inputData:
209 209 fsa.addEntry(word, data)
210 210 fsa.close()
... ... @@ -227,14 +227,14 @@ def main(opts):
227 227  
228 228 logging.info('reading tagset from %s', opts.tagsetFile)
229 229 tagset = Tagset(opts.tagsetFile)
  230 + rulesType = rulesParser.RulesParser.PARSE4ANALYZER if opts.analyzer else rulesParser.RulesParser.PARSE4GENERATOR
  231 + segmentRulesManager = rulesParser.RulesParser(tagset, rulesType).parse(opts.segmentsFile)
  232 + segmentationRulesData = segmentRulesManager.serialize()
230 233  
231 234 if opts.analyzer:
232   - segmentRulesManager = rulesParser.RulesParser(tagset).parse(opts.segmentsFile)
233   - additionalData = segmentRulesManager.serialize()
234 235 fsa = buildAnalyzerFromPoliMorf(opts.inputFiles, tagset, segmentRulesManager, opts.trimSupneg)
235 236 else:
236   - fsa = buildGeneratorFromPoliMorf(opts.inputFiles, tagset)
237   - additionalData = bytearray()
  237 + fsa = buildGeneratorFromPoliMorf(opts.inputFiles, tagset, segmentRulesManager)
238 238  
239 239 if opts.trainFile:
240 240 logging.info('training with '+opts.trainFile+' ...')
... ... @@ -248,9 +248,9 @@ def main(opts):
248 248 }[opts.serializationMethod](fsa)
249 249  
250 250 if opts.cpp:
251   - serializer.serialize2CppFile(opts.outputFile, generator=opts.generator, additionalData=additionalData)
  251 + serializer.serialize2CppFile(opts.outputFile, generator=opts.generator, segmentationRulesData=segmentationRulesData)
252 252 else:
253   - serializer.serialize2BinaryFile(opts.outputFile, additionalData=additionalData)
  253 + serializer.serialize2BinaryFile(opts.outputFile, segmentationRulesData=segmentationRulesData)
254 254  
255 255 logging.info('total FSA size (in bytes): '+str(fsa.initialState.reverseOffset))
256 256 # {
... ...
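
With _checkOption moved out of the analyzer-only branch, --segments-file is now required in both modes. A hypothetical generator invocation under the new contract (the file names are illustrative; the flags are the same ones used in morfeusz/CMakeLists.txt below):

    python fsabuilder/buildfsa.py --generator \
        --input-files=PoliMorf.tab -o synth_fsa.cpp \
        --tagset-file=polimorf.tagset \
        --segments-file=input/segmenty1.dat \
        --cpp --serialization-method=V1

Omitting --segments-file now fails option validation instead of silently building a generator dictionary without segmentation data.
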
fsabuilder/morfeuszbuilder/fsa/common.py
... ... @@ -67,11 +67,12 @@ class Interpretation(object):
67 67  
68 68 class Interpretation4Generator(object):
69 69  
70   - def __init__(self, orth, base, tagnum, namenum):
  70 + def __init__(self, orth, base, tagnum, namenum, typenum):
71 71 self.lemma = base
72 72 self.orth = EncodedFormWithPrefix(base, orth)
73 73 self.tagnum = tagnum
74 74 self.namenum = namenum
  75 + self.typenum = typenum
75 76  
76 77 def getSortKey(self):
77 78 return (
... ...
fsabuilder/morfeuszbuilder/fsa/convertinput.py
... ... @@ -81,10 +81,11 @@ class PolimorfConverter4Analyzer(object):
81 81  
82 82 class PolimorfConverter4Generator(object):
83 83  
84   - def __init__(self, tagset, encoder, inputEncoding='utf8'):
  84 + def __init__(self, tagset, encoder, inputEncoding, segmentRulesManager):
85 85 self.tagset = tagset
86 86 self.encoder = encoder
87 87 self.inputEncoding = inputEncoding
  88 + self.segmentRulesManager = segmentRulesManager
88 89  
89 90 # we do it the ugly way (parse to plain text) because it is way more memory-efficient
90 91 def _partiallyParseLines(self, inputLines):
... ... @@ -94,10 +95,11 @@ class PolimorfConverter4Generator(object):
94 95 if base:
95 96 tagnum = self.tagset.getTagnum4Tag(tag)
96 97 namenum = self.tagset.getNamenum4Name(name)
97   - yield '%s %s %d %d' % (
  98 + typenum = self.segmentRulesManager.lexeme2SegmentTypeNum(base, tagnum)
  99 + yield '%s %s %d %d %d' % (
98 100 orth.encode(self.inputEncoding),
99 101 base.encode(self.inputEncoding),
100   - tagnum, namenum)
  102 + tagnum, namenum, typenum)
101 103 else:
102 104 logging.warn('Ignoring line: %s', line.strip())
103 105  
... ... @@ -109,10 +111,11 @@ class PolimorfConverter4Generator(object):
109 111 for line in inputLines:
110 112 line = line.decode(self.inputEncoding).strip(u'\n')
111 113 if line:
112   - orth, base, tagnum, namenum = line.split(u' ')
  114 + orth, base, tagnum, namenum, typenum = line.split(u' ')
113 115 tagnum = int(tagnum)
114 116 namenum = int(namenum)
115   - yield (base, Interpretation4Generator(orth, base, tagnum, namenum))
  117 + typenum = int(typenum)
  118 + yield (base, Interpretation4Generator(orth, base, tagnum, namenum, typenum))
116 119  
117 120 def convert(self, inputLines):
118 121 return _mergeEntries(self._reallyParseLines(self._sortLines(self._partiallyParseLines(inputLines))))
... ...
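
The intermediate plain-text format (kept flat so entries can be sorted out of core, per the comment above) now carries five fields. A minimal sketch of one round-trip through _partiallyParseLines/_reallyParseLines, with made-up numbers standing in for real tagset and segment-type indices:

    # hypothetical entry: orth, base, tagnum, namenum, typenum
    line = u'%s %s %d %d %d' % (u'psu', u'pies', 17, 0, 3)
    orth, base, tagnum, namenum, typenum = line.split(u' ')
    assert (orth, base) == (u'psu', u'pies')
    assert (int(tagnum), int(namenum), int(typenum)) == (17, 0, 3)
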
fsabuilder/morfeuszbuilder/fsa/encode.py
... ... @@ -114,7 +114,7 @@ class MorphEncoder(Encoder):
114 114 assert type(interpsList) == frozenset
115 115 for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
116 116 res.extend(self._encodeTypeNum(interp.typenum))
117   - res.extend(self._encodeEncodedForm(interp.lemma, withCasePattern=True))
  117 + res.extend(self._encodeEncodedForm(interp.lemma, withCasePattern=True, withPrefix=False))
118 118 res.extend(self._encodeTagNum(interp.tagnum))
119 119 res.extend(self._encodeNameNum(interp.namenum))
120 120 del interpsList
... ... @@ -133,6 +133,7 @@ class Encoder4Generator(Encoder):
133 133 res.append(firstByte)
134 134 assert type(interpsList) == frozenset
135 135 for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
  136 + res.extend(self._encodeTypeNum(interp.typenum))
136 137 res.extend(self._encodeEncodedForm(interp.orth, withCasePattern=False, withPrefix=True))
137 138 res.extend(self._encodeTagNum(interp.tagnum))
138 139 res.extend(self._encodeNameNum(interp.namenum))
... ...
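
After this change both encoders emit the segment-type number first, so analyzer and generator records share one shape: typenum, then the encoded form (lemma with case pattern for the analyzer, orth with prefix for the generator), then tagnum and namenum. A simplified sketch of that layout, assuming one-byte type/name fields and a two-byte big-endian tag (the real widths are whatever _encodeTypeNum, _encodeTagNum and _encodeNameNum produce):

    def encode_record_sketch(typenum, encoded_form, tagnum, namenum):
        res = bytearray()
        res.append(typenum)                       # segment type now leads both layouts
        res.extend(encoded_form)                  # encoded lemma (analyzer) or orth (generator)
        res.extend([tagnum >> 8, tagnum & 0xff])  # tag number, big-endian
        res.append(namenum)
        return res
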
fsabuilder/morfeuszbuilder/fsa/serializer.py
... ... @@ -24,7 +24,7 @@ class Serializer(object):
24 24 def getVersion(self):
25 25 return 10
26 26  
27   - def serialize2CppFile(self, fname, generator, additionalData):
  27 + def serialize2CppFile(self, fname, generator, segmentationRulesData):
28 28 res = []
29 29 # self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state))
30 30 res.append('\n')
... ... @@ -37,8 +37,8 @@ class Serializer(object):
37 37 res.append('extern const unsigned char DEFAULT_FSA[] = {')
38 38 res.append('\n')
39 39 for byte in self.fsa2bytearray(
40   - additionalData=self.serializeTagset(self.fsa.tagset),
41   - moreAdditionalData=additionalData):
  40 + tagsetData=self.serializeTagset(self.fsa.tagset),
  41 + segmentationRulesData=segmentationRulesData):
42 42 res.append(hex(byte));
43 43 res.append(',');
44 44 res.append('\n')
... ... @@ -47,16 +47,16 @@ class Serializer(object):
47 47 with open(fname, 'w') as f:
48 48 f.write(''.join(res))
49 49  
50   - def serialize2BinaryFile(self, fname, additionalData):
  50 + def serialize2BinaryFile(self, fname, segmentationRulesData):
51 51 with open(fname, 'wb') as f:
52 52 f.write(self.fsa2bytearray(
53   - additionalData=self.serializeTagset(self.fsa.tagset),
54   - moreAdditionalData=additionalData))
  53 + tagsetData=self.serializeTagset(self.fsa.tagset),
  54 + segmentationRulesData=segmentationRulesData))
55 55  
56 56 def getStateSize(self, state):
57 57 raise NotImplementedError('Not implemented')
58 58  
59   - def fsa2bytearray(self, additionalData=bytearray(), moreAdditionalData=bytearray()):
  59 + def fsa2bytearray(self, tagsetData, segmentationRulesData):
60 60 res = bytearray()
61 61 res.extend(self.serializePrologue())
62 62 fsaData = bytearray()
... ... @@ -66,7 +66,7 @@ class Serializer(object):
66 66 fsaData.extend(self.state2bytearray(state))
67 67 res.extend(htonl(len(fsaData)))
68 68 res.extend(fsaData)
69   - res.extend(self.serializeEpilogue(additionalData, moreAdditionalData))
  69 + res.extend(self.serializeEpilogue(tagsetData, segmentationRulesData))
70 70 return res
71 71  
72 72 def _serializeTags(self, tagsMap):
... ... @@ -104,20 +104,20 @@ class Serializer(object):
104 104  
105 105 return res
106 106  
107   - def serializeEpilogue(self, additionalData, moreAdditionalData):
  107 + def serializeEpilogue(self, tagsetData, segmentationRulesData):
108 108 res = bytearray()
109   - additionalDataSize = len(additionalData) if additionalData else 0
110   - moreAdditionalDataSize = len(moreAdditionalData) if moreAdditionalData else 0
111   - res.extend(htonl(additionalDataSize))
  109 + tagsetDataSize = len(tagsetData) if tagsetData else 0
  110 + segmentationDataSize = len(segmentationRulesData) if segmentationRulesData else 0
  111 + res.extend(htonl(tagsetDataSize))
112 112  
113 113 # add additional data itself
114   - if additionalDataSize:
115   - assert type(additionalData) == bytearray
116   - res.extend(additionalData)
  114 + if tagsetDataSize:
  115 + assert type(tagsetData) == bytearray
  116 + res.extend(tagsetData)
117 117  
118   - if moreAdditionalDataSize:
119   - assert type(moreAdditionalData) == bytearray
120   - res.extend(moreAdditionalData)
  118 + if segmentationDataSize:
  119 + assert type(segmentationRulesData) == bytearray
  120 + res.extend(segmentationRulesData)
121 121 return res
122 122  
123 123 def state2bytearray(self, state):
... ...
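
The renamed epilogue keeps the wire format of the old additionalData/moreAdditionalData pair: a four-byte network-order length for the tagset block, the tagset block itself, then the segmentation-rules block with no length prefix of its own (it runs to the end of the data). A minimal reader sketch, assuming a buffer that starts at the epilogue:

    import struct

    def read_epilogue_sketch(buf):
        (tagset_len,) = struct.unpack_from('!I', buf, 0)  # counterpart of htonl
        tagset_data = buf[4:4 + tagset_len]
        segrules_data = buf[4 + tagset_len:]
        return tagset_data, segrules_data
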
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
... ... @@ -13,8 +13,13 @@ from morfeuszbuilder.segrules import rulesNFA
13 13  
14 14 class RulesParser(object):
15 15  
16   - def __init__(self, tagset):
  16 + PARSE4GENERATOR = 1
  17 + PARSE4ANALYZER = 2
  18 +
  19 + def __init__(self, tagset, rulesType):
17 20 self.tagset = tagset
  21 + assert rulesType in (RulesParser.PARSE4GENERATOR, RulesParser.PARSE4ANALYZER)
  22 + self.rulesType = rulesType
18 23  
19 24 def _getKey2Defs(self, segtypesConfigFile):
20 25 res = {}
... ... @@ -29,7 +34,7 @@ class RulesParser(object):
29 34  
30 35 def parse(self, filename):
31 36  
32   - segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types'])
  37 + segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'generator combinations', 'tags', 'lexemes', 'segment types'])
33 38 key2Defs = self._getKey2Defs(segtypesConfigFile)
34 39 segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile)
35 40  
... ... @@ -47,7 +52,8 @@ class RulesParser(object):
47 52 nfa = rulesNFA.RulesNFA()
48 53 if not firstNFA:
49 54 firstNFA = nfa
50   - combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations')
  55 + section = 'combinations' if self.rulesType == RulesParser.PARSE4ANALYZER else 'generator combinations'
  56 + combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection(section)
51 57 combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs, filename))
52 58 for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename):
53 59 # print rule
... ... @@ -83,7 +89,10 @@ class RulesParser(object):
83 89 unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule
84 90 oneOfRule = delimitedList(unaryRule, delim='|')
85 91 complexRule = unaryRule ^ oneOfRule
86   - concatRule = OneOrMore(complexRule)
  92 + if self.rulesType == RulesParser.PARSE4ANALYZER:
  93 + concatRule = OneOrMore(complexRule)
  94 + else:
  95 + concatRule = ZeroOrMore(shiftOrthRule) + tagRule
87 96 rule << concatRule
88 97  
89 98 tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper))
... ...
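
The grammar split means analyzer rules stay free concatenations while generator rules are restricted to a run of orth-shifting segments followed by exactly one plain segment, which is the shape of the new [generator combinations] entries such as "prefs> nomina". A self-contained pyparsing illustration of that restricted shape (toy token definitions, not the parser's real atomicRule/shiftOrthRule):

    from pyparsing import Literal, Word, ZeroOrMore, alphas

    tagRule = Word(alphas + '_')
    shiftOrthRule = Word(alphas + '_') + Literal('>').suppress()
    generatorRule = ZeroOrMore(shiftOrthRule) + tagRule

    print(generatorRule.parseString('prefs> nomina'))  # -> ['prefs', 'nomina']
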
fsabuilder/morfeuszbuilder/tagset/segtypes.py
... ... @@ -15,6 +15,7 @@ class Segtypes(object):
15 15  
16 16 self.filename = segrulesConfigFile.filename
17 17  
  18 + self.segtypes = set()
18 19 self.segtype2Segnum = {}
19 20 self.segnum2Segtype = {}
20 21 self.patternsList = []
... ... @@ -22,6 +23,7 @@ class Segtypes(object):
22 23 self._tagnum2Segnum = {}
23 24 self._lemmaTagnum2Segnum = {}
24 25  
  26 + self._readSegtypes(segrulesConfigFile)
25 27 self._readLexemes(segrulesConfigFile)
26 28 self._readTags(segrulesConfigFile)
27 29 self._indexSegnums()
... ... @@ -32,6 +34,20 @@ class Segtypes(object):
32 34 if not cond:
33 35 raise exceptions.ConfigFileException(self.filename, lineNum, msg)
34 36  
  37 + def _readSegtypes(self, segrulesConfigFile):
  38 + for lineNum, line in segrulesConfigFile.enumerateLinesInSection('segment types'):
  39 + assert type(line) == unicode
  40 + self._validate(
  41 + u'Segment type must be a single word',
  42 + lineNum,
  43 + re.match(r'^\w+$', line))
  44 + self._validate(
  45 + u'Segment type already defined: "%s"' % line,
  46 + lineNum,
  47 + line not in self.segtypes)
  48 + self.segtypes.add(line)
  49 +
  50 +
35 51 def _readTags(self, segrulesConfigFile):
36 52 gotWildcardPattern = False
37 53 for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'):
... ... @@ -42,6 +58,10 @@ class Segtypes(object):
42 58 len(splitLine) == 2)
43 59 segtype, pattern = splitLine
44 60 self._validate(
  61 + u'Undeclared segment type: "%s"' % segtype,
  62 + lineNum,
  63 + segtype in self.segtypes)
  64 + self._validate(
45 65 u'Segment type must be a lowercase alphanumeric with optional underscores',
46 66 lineNum,
47 67 re.match(r'[a-z_]+', segtype))
... ... @@ -78,6 +98,10 @@ class Segtypes(object):
78 98 for lineNum, line in segrulesConfigFile.enumerateLinesInSection('lexemes'):
79 99 segtype, pattern = line.strip().split('\t')
80 100 self._validate(
  101 + u'Undeclared segment type: "%s"' % segtype,
  102 + lineNum,
  103 + segtype in self.segtypes)
  104 + self._validate(
81 105 u'Segment type must be a lowercase alphanumeric with optional underscores',
82 106 lineNum,
83 107 re.match(r'[a-z_]+', segtype))
... ...
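
With _readSegtypes in place, every segment type used in [tags] or [lexemes] must first be declared under [segment types]. A hypothetical rules snippet that the new checks would reject, raising ConfigFileException with 'Undeclared segment type: "adv"':

    [segment types]
    adj

    [tags]
    adj adj:%
    adv adv
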
input/segmenty.dat
... ... @@ -111,7 +111,7 @@ moze_interp(z_on_agl)
111 111 moze_interp(z_on_agl on_agl)
112 112  
113 113 # A number written as a sequence of digits:
114   -moze_interp( dig )
  114 +moze_interp( dig>* dig )
115 115  
116 116 # Prefixal formations
117 117 #### the appropriate samodz entries need to be split out!
... ... @@ -154,6 +154,10 @@ moze_interp( fin dywiz li)
154 154 #moze_interp( praet_sg_na li)
155 155 #moze_interp( fin li)
156 156  
  157 +[generator combinations]
  158 +prefs> nomina
  159 +nomina
  160 +
157 161 [segment types]
158 162 naj
159 163 nie
... ... @@ -164,6 +168,8 @@ dig
164 168 adja
165 169 adj
166 170 adj_sup
  171 +adj_com
  172 +fin
167 173 negat
168 174 on_agl
169 175 z_on_agl
... ... @@ -176,8 +182,42 @@ praet_sg_agl
176 182 praet_sg_na
177 183 praet_sg
178 184 praet_pl
  185 +z_aglt
  186 +by
  187 +li
  188 +nomina
  189 +adjectiva
  190 +verba_imperf
  191 +dywiz
  192 +kropka
179 193 samodz
180 194  
  195 +[lexemes]
  196 +z_aglt aby:comp
  197 +z_aglt bowiem:comp
  198 +by by:qub
  199 +li li:qub
  200 +z_aglt by:comp
  201 +z_aglt cóż:subst
  202 +z_aglt czemu:adv
  203 +z_aglt czyżby:qub
  204 +z_aglt choćby:comp
  205 +z_aglt chociażby:comp
  206 +z_aglt dlaczego:adv
  207 +z_aglt dopóki:comp
  208 +z_aglt dopóty:conj
  209 +z_aglt gdyby:comp
  210 +z_aglt gdzie:qub
  211 +z_aglt gdzie:adv
  212 +z_aglt jakby:comp
  213 +z_aglt jakoby:comp
  214 +z_aglt kiedy:adv
  215 +z_aglt kiedy:comp
  216 +z_aglt tylko:qub
  217 +z_aglt żeby:comp
  218 +dywiz -:interp
  219 +kropka .:interp
  220 +
181 221 [tags]
182 222 naj naj
183 223 nie nie
... ... @@ -221,29 +261,3 @@ verba_imperf inf:imperf
221 261 verba_imperf imps:imperf
222 262 verba_imperf impt:%:imperf
223 263 samodz %
224   -
225   -[lexemes]
226   -z_aglt aby:comp
227   -z_aglt bowiem:comp
228   -by by:qub
229   -li li:qub
230   -z_aglt by:comp
231   -z_aglt cóż:subst
232   -z_aglt czemu:adv
233   -z_aglt czyżby:qub
234   -z_aglt choćby:comp
235   -z_aglt chociażby:comp
236   -z_aglt dlaczego:adv
237   -z_aglt dopóki:comp
238   -z_aglt dopóty:conj
239   -z_aglt gdyby:comp
240   -z_aglt gdzie:qub
241   -z_aglt gdzie:adv
242   -z_aglt jakby:comp
243   -z_aglt jakoby:comp
244   -z_aglt kiedy:adv
245   -z_aglt kiedy:comp
246   -z_aglt tylko:qub
247   -z_aglt żeby:comp
248   -dywiz -:interp
249   -kropka .:interp
... ...
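
A hedged reading of the reworked digit rule: the '>' operator appears to mark a segment whose orth is shifted onto the following segment (cf. shiftOrthFromPrevious and doShiftOrth in Morfeusz.cpp), so "dig>* dig" presumably lets a run of digit segments surface glued together as a single interpreted token instead of one segment per digit. In the file's own notation:

    # before: a number is exactly one dig segment
    moze_interp( dig )
    # after: a glued run of dig segments
    moze_interp( dig>* dig )
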
input/segmenty1.dat
... ... @@ -9,19 +9,22 @@ praet=split composite
9 9  
10 10 dig>* dig
11 11 (adja dywiz)+ adj
12   -#dig!>+
13   -#dig!> dig!> dig!>
14 12 naj> adj_sup
15 13  
  14 +[generator combinations]
  15 +
16 16 [segment types]
17 17 naj
18 18 nie
19 19 prefs
20 20 prefv
  21 +prefa
21 22 dig
22 23 adja
23 24 adj
24 25 adj_sup
  26 +adj_com
  27 +fin
25 28 negat
26 29 on_agl
27 30 z_on_agl
... ... @@ -34,6 +37,14 @@ praet_sg_agl
34 37 praet_sg_na
35 38 praet_sg
36 39 praet_pl
  40 +z_aglt
  41 +by
  42 +li
  43 +nomina
  44 +adjectiva
  45 +verba_imperf
  46 +dywiz
  47 +kropka
37 48 samodz
38 49  
39 50 [tags]
... ...
morfeusz/CMakeLists.txt
... ... @@ -2,13 +2,13 @@
2 2 ########## generate default dictionary data #################
3 3 add_custom_command (
4 4 OUTPUT "${INPUT_DICTIONARY_CPP}"
5   - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" "--tagset-file=${INPUT_TAGSET}" "--segments-file=${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1 --trim-supneg
  5 + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1 --trim-supneg
6 6 DEPENDS "${INPUT_DICTIONARY}"
7 7 COMMENT "Building default dictionary C++ file"
8 8 )
9 9 add_custom_command (
10 10 OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}"
11   - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" "--tagset-file=${INPUT_TAGSET}" --cpp --serialization-method=V1
  11 + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1
12 12 DEPENDS "${INPUT_DICTIONARY}"
13 13 COMMENT "Building default dictionary C++ file"
14 14 )
... ...
morfeusz/EncodedInterpretation.hpp
... ... @@ -16,17 +16,18 @@
16 16 /*
17 17 * Lemma in a compressed format (as in an automaton)
18 18 */
19   -struct EncodedLemma {
  19 +struct EncodedForm {
20 20 int suffixToCut;
21 21 std::string suffixToAdd;
22 22 std::vector<bool> casePattern;
  23 + std::string prefixToAdd;
23 24 };
24 25  
25 26 /*
26 27 * Internal representation of an interpretation - with lemma encoded
27 28 */
28 29 struct EncodedInterpretation {
29   - EncodedLemma lemma;
  30 + EncodedForm value;
30 31 unsigned char type;
31 32 int tag;
32 33 int nameClassifier;
... ...
morfeusz/Environment.cpp
... ... @@ -6,21 +6,57 @@
6 6 */
7 7  
8 8 #include "Environment.hpp"
  9 +#include "InterpretedChunksDecoder.hpp"
  10 +#include "MorphDeserializer.hpp"
9 11 #include "exceptions.hpp"
10 12  
  13 +//class InterpretedChunksDecoder4Analyzer;
  14 +//class InterpretedChunksDecoder4Generator;
  15 +
  16 +static Deserializer<vector<InterpsGroup> >* initializeDeserializer() {
  17 + static Deserializer < vector < InterpsGroup > > *deserializer
  18 + = new MorphDeserializer();
  19 + return deserializer;
  20 +}
  21 +
  22 +static SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map) {
  23 + SegrulesOptions opts;
  24 + opts["aggl"] = "isolated";
  25 + opts["praet"] = "split";
  26 + return (*(map.find(opts))).second;
  27 +}
  28 +
  29 +static void deleteSegrulesFSAs(std::map<SegrulesOptions, SegrulesFSA*>& fsasMap) {
  30 + for (
  31 + std::map<SegrulesOptions, SegrulesFSA*>::iterator it = fsasMap.begin();
  32 + it != fsasMap.end();
  33 + ++it) {
  34 + delete it->second;
  35 + }
  36 + fsasMap.clear();
  37 +}
  38 +
11 39 Environment::Environment(
12   - const Tagset& analyzerTagset,
13   - const Tagset& generatorTagset,
14   - MorfeuszCharset charset)
  40 + MorfeuszCharset charset,
  41 + MorfeuszProcessorType processorType,
  42 + const unsigned char* fsaFileStartPtr)
15 43 : currentCharsetConverter(getCharsetConverter(charset)),
16 44 utf8CharsetConverter(),
17 45 isoCharsetConverter(),
18 46 cp1250CharsetConverter(),
19 47 cp852CharsetConverter(),
20   - analyzerTagset(analyzerTagset),
21   - generatorTagset(generatorTagset),
22   - caseConverter() {
23   -
  48 + caseConverter(),
  49 + tagset(fsaFileStartPtr),
  50 + fsaFileStartPtr(fsaFileStartPtr),
  51 + fsa(FSAType::getFSA(fsaFileStartPtr, *initializeDeserializer())),
  52 + segrulesFSAsMap(createSegrulesFSAsMap(fsaFileStartPtr)),
  53 + currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap)),
  54 + isFromFile(false),
  55 + chunksDecoder(
  56 + processorType == ANALYZER
  57 + ? (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Analyzer(*this)
  58 + : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this))
  59 + {
24 60 }
25 61  
26 62 const CharsetConverter* Environment::getCharsetConverter(MorfeuszCharset charset) const {
... ... @@ -39,6 +75,12 @@ const CharsetConverter* Environment::getCharsetConverter(MorfeuszCharset charset
39 75 }
40 76  
41 77 Environment::~Environment() {
  78 + delete this->fsa;
  79 + if (this->isFromFile) {
  80 + deleteSegrulesFSAs(this->segrulesFSAsMap);
  81 + delete this->fsaFileStartPtr;
  82 + }
  83 + delete this->chunksDecoder;
42 84 }
43 85  
44 86 void Environment::setCharset(MorfeuszCharset charset) {
... ... @@ -49,22 +91,38 @@ const CharsetConverter& Environment::getCharsetConverter() const {
49 91 return *this->currentCharsetConverter;
50 92 }
51 93  
52   -void Environment::setAnalyzerTagset(const Tagset& tagset) {
53   - this->analyzerTagset = tagset;
  94 +const CaseConverter& Environment::getCaseConverter() const {
  95 + return this->caseConverter;
54 96 }
55 97  
56   -const Tagset& Environment::getAnalyzerTagset() const {
57   - return this->analyzerTagset;
  98 +void Environment::setTagset(const Tagset& tagset) {
  99 + this->tagset = tagset;
58 100 }
59 101  
60   -void Environment::setGeneratorTagset(const Tagset& tagset) {
61   - this->generatorTagset = tagset;
  102 +const Tagset& Environment::getTagset() const {
  103 + return this->tagset;
62 104 }
63 105  
64   -const Tagset& Environment::getGeneratorTagset() const {
65   - return this->generatorTagset;
  106 +void Environment::setFSAFile(const std::string& filename) {
  107 + if (this->isFromFile) {
  108 + delete this->fsa;
  109 + deleteSegrulesFSAs(this->segrulesFSAsMap);
  110 + delete this->fsaFileStartPtr;
  111 + }
  112 + this->fsaFileStartPtr = readFile<unsigned char>(filename.c_str());
  113 + this->fsa = FSA< vector<InterpsGroup> > ::getFSA(fsaFileStartPtr, *initializeDeserializer());
  114 + this->segrulesFSAsMap = createSegrulesFSAsMap(this->fsaFileStartPtr);
  115 + this->isFromFile = true;
66 116 }
67 117  
68   -const CaseConverter& Environment::getCaseConverter() const {
69   - return this->caseConverter;
  118 +const SegrulesFSA& Environment::getCurrentSegrulesFSA() const {
  119 + return *(this->currSegrulesFSA);
  120 +}
  121 +
  122 +const FSAType& Environment::getFSA() const {
  123 + return *(this->fsa);
  124 +}
  125 +
  126 +const InterpretedChunksDecoder& Environment::getInterpretedChunksDecoder() const {
  127 + return *(this->chunksDecoder);
70 128 }
... ...
morfeusz/Environment.hpp
... ... @@ -8,28 +8,44 @@
8 8 #ifndef ENVIRONMENT_HPP
9 9 #define ENVIRONMENT_HPP
10 10  
  11 +#include <vector>
  12 +
  13 +class InterpretedChunksDecoder;
  14 +
11 15 #include "charset/CaseConverter.hpp"
12 16 #include "charset/CharsetConverter.hpp"
  17 +#include "fsa/fsa.hpp"
  18 +#include "segrules/segrules.hpp"
13 19 #include "const.hpp"
14 20 #include "Tagset.hpp"
  21 +//#include "InterpretedChunksDecoder.hpp"
  22 +#include "InterpsGroup.hpp"
15 23  
  24 +typedef FSA< std::vector<InterpsGroup > > FSAType;
16 25  
17 26 class Environment {
18 27 public:
19 28 Environment(
20   - const Tagset& analyzerTagset,
21   - const Tagset& generatorTagset,
22   - MorfeuszCharset charset);
  29 + MorfeuszCharset charset,
  30 + MorfeuszProcessorType morfeuszProcessor,
  31 + const unsigned char* fileStartPtr);
  32 +
23 33 void setCharset(MorfeuszCharset charset);
  34 +
24 35 const CharsetConverter& getCharsetConverter() const;
25 36  
26   - void setAnalyzerTagset(const Tagset& tagset);
27   - const Tagset& getAnalyzerTagset() const;
  37 + const CaseConverter& getCaseConverter() const;
28 38  
29   - void setGeneratorTagset(const Tagset& tagset);
30   - const Tagset& getGeneratorTagset() const;
  39 + void setTagset(const Tagset& tagset);
  40 + const Tagset& getTagset() const;
31 41  
32   - const CaseConverter& getCaseConverter() const;
  42 + void setFSAFile(const std::string& filename);
  43 +
  44 + const SegrulesFSA& getCurrentSegrulesFSA() const;
  45 +
  46 + const FSAType& getFSA() const;
  47 +
  48 + const InterpretedChunksDecoder& getInterpretedChunksDecoder() const;
33 49  
34 50 virtual ~Environment();
35 51 private:
... ... @@ -38,9 +54,16 @@ private:
38 54 const ISO8859_2_CharsetConverter isoCharsetConverter;
39 55 const Windows_1250_CharsetConverter cp1250CharsetConverter;
40 56 const CP852_CharsetConverter cp852CharsetConverter;
41   - Tagset analyzerTagset;
42   - Tagset generatorTagset;
43 57 const CaseConverter caseConverter;
  58 + Tagset tagset;
  59 +
  60 + const unsigned char* fsaFileStartPtr;
  61 + const FSAType* fsa;
  62 + std::map<SegrulesOptions, SegrulesFSA*> segrulesFSAsMap;
  63 + const SegrulesFSA* currSegrulesFSA;
  64 + bool isFromFile;
  65 +
  66 + const InterpretedChunksDecoder* chunksDecoder;
44 67  
45 68 const CharsetConverter* getCharsetConverter(MorfeuszCharset charset) const;
46 69 };
... ...
morfeusz/Generator.cpp
... ... @@ -75,7 +75,7 @@ void Generator::decodeRes(
75 75 decodedOrth, lemma,
76 76 egi.tag,
77 77 egi.nameClassifier,
78   - env.getAnalyzerTagset(),
  78 + env.getTagset(),
79 79 env.getCharsetConverter());
80 80 result.push_back(mi);
81 81 }
... ...
morfeusz/InterpretedChunksDecoder.hpp
... ... @@ -8,6 +8,9 @@
8 8 #ifndef INTERPSGROUPDECODER_HPP
9 9 #define INTERPSGROUPDECODER_HPP
10 10  
  11 +#include <string>
  12 +#include <vector>
  13 +
11 14 #include "charset/CharsetConverter.hpp"
12 15 #include "EncodedInterpretation.hpp"
13 16 #include "InterpretedChunk.hpp"
... ... @@ -20,50 +23,75 @@ public:
20 23  
21 24 InterpretedChunksDecoder(const Environment& env)
22 25 : env(env) {
23   -
24 26 }
25 27  
26   - template <class OutputIterator>
27   - OutputIterator decode(
  28 + virtual void decode(
28 29 unsigned int startNode,
29 30 unsigned int endNode,
30 31 const InterpretedChunk& interpretedChunk,
31   - OutputIterator out) {
32   - string orth;
33   - string lemmaPrefix;
  32 + std::vector<MorphInterpretation>& out) const = 0;
  33 +
  34 + virtual ~InterpretedChunksDecoder() {}
  35 +
  36 +protected:
  37 +
  38 + void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& originalForm, std::string& decodedForm) const {
34 39 for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) {
35 40 const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i];
36   - orth += env.getCharsetConverter().toString(prefixChunk.originalCodepoints);
37   - lemmaPrefix += convertLemma(
  41 + originalForm += env.getCharsetConverter().toString(prefixChunk.originalCodepoints);
  42 + decodeForm(
38 43 prefixChunk.lowercaseCodepoints,
39   - prefixChunk.interpsGroup.interps[0].lemma);
  44 + prefixChunk.interpsGroup.interps[0].value,
  45 + decodedForm);
40 46 }
  47 + }
  48 +
  49 + virtual void decodeForm(
  50 + const std::vector<uint32_t>& orth,
  51 + const EncodedForm& form,
  52 + std::string& res) const = 0;
  53 +
  54 + const Environment& env;
  55 +};
  56 +
  57 +class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder {
  58 +
  59 +public:
  60 + InterpretedChunksDecoder4Analyzer(const Environment& env): InterpretedChunksDecoder(env) {}
  61 +
  62 + void decode(
  63 + unsigned int startNode,
  64 + unsigned int endNode,
  65 + const InterpretedChunk& interpretedChunk,
  66 + std::vector<MorphInterpretation>& out) const {
  67 + string orth;
  68 + string lemma;
  69 + convertPrefixes(interpretedChunk, orth, lemma);
41 70 orth += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
42 71 for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) {
43 72 const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i];
44   - string lemma = lemmaPrefix + convertLemma(
  73 + decodeForm(
45 74 interpretedChunk.lowercaseCodepoints,
46   - ei.lemma);
47   - *out = MorphInterpretation(
  75 + ei.value,
  76 + lemma);
  77 + out.push_back(MorphInterpretation(
48 78 startNode, endNode,
49 79 orth, lemma,
50 80 ei.tag,
51 81 ei.nameClassifier,
52   - env.getAnalyzerTagset(),
53   - env.getCharsetConverter());
54   - ++out;
  82 + env.getTagset(),
  83 + env.getCharsetConverter()));
55 84 }
56   - return out;
57 85 }
58 86  
59   -private:
  87 +protected:
60 88  
61   - string convertLemma(
  89 + void decodeForm(
62 90 const vector<uint32_t>& orth,
63   - const EncodedLemma& lemma) {
64   - string res;
  91 + const EncodedForm& lemma,
  92 + string& res) const {
65 93 for (unsigned int i = 0; i < orth.size() - lemma.suffixToCut; i++) {
66   - uint32_t cp =
  94 + uint32_t cp =
67 95 (i < lemma.casePattern.size() && lemma.casePattern[i])
68 96 ? env.getCaseConverter().toTitle(orth[i])
69 97 : orth[i];
... ... @@ -75,10 +103,56 @@ private:
75 103 uint32_t cp = UTF8CharsetConverter().next(suffixPtr, suffixEnd);
76 104 env.getCharsetConverter().append(cp, res);
77 105 }
78   - return res;
79 106 }
  107 +};
80 108  
81   - const Environment& env;
  109 +class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder {
  110 +
  111 +public:
  112 + InterpretedChunksDecoder4Generator(const Environment& env): InterpretedChunksDecoder(env) {}
  113 +
  114 + void decode(
  115 + unsigned int startNode,
  116 + unsigned int endNode,
  117 + const InterpretedChunk& interpretedChunk,
  118 + std::vector<MorphInterpretation>& out) const {
  119 + string orth;
  120 + string lemma;
  121 + convertPrefixes(interpretedChunk, lemma, orth);
  122 + lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
  123 + for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) {
  124 + const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i];
  125 + decodeForm(
  126 + interpretedChunk.originalCodepoints,
  127 + ei.value,
  128 + orth);
  129 + out.push_back(MorphInterpretation(
  130 + startNode, endNode,
  131 + orth, lemma,
  132 + ei.tag,
  133 + ei.nameClassifier,
  134 + env.getTagset(),
  135 + env.getCharsetConverter()));
  136 + }
  137 + }
  138 +
  139 +private:
  140 +
  141 + void decodeForm(
  142 + const vector<uint32_t>& lemma,
  143 + const EncodedForm& orth,
  144 + string& res) const {
  145 + res += orth.prefixToAdd;
  146 + for (unsigned int i = 0; i < lemma.size() - orth.suffixToCut; i++) {
  147 + env.getCharsetConverter().append(lemma[i], res);
  148 + }
  149 + const char* suffixPtr = orth.suffixToAdd.c_str();
  150 + const char* suffixEnd = suffixPtr + orth.suffixToAdd.length();
  151 + while (suffixPtr != suffixEnd) {
  152 + uint32_t cp = UTF8CharsetConverter().next(suffixPtr, suffixEnd);
  153 + env.getCharsetConverter().append(cp, res);
  154 + }
  155 + }
82 156 };
83 157  
84 158 #endif /* INTERPSGROUPDECODER_HPP */
... ...
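
The two subclasses run EncodedForm in opposite directions: the analyzer turns a surface orth into its lemma (apply the case pattern, cut suffixToCut codepoints, append suffixToAdd), while the generator turns a lemma into a surface form (prepend prefixToAdd, cut, append). A compact Python sketch of both directions, with toTitle reduced to upper() for illustration:

    def decode4analyzer(orth, suffix_to_cut, suffix_to_add, case_pattern):
        kept = orth[:len(orth) - suffix_to_cut]
        lemma = ''.join(
            c.upper() if i < len(case_pattern) and case_pattern[i] else c
            for i, c in enumerate(kept))
        return lemma + suffix_to_add     # orth -> lemma

    def decode4generator(lemma, suffix_to_cut, suffix_to_add, prefix_to_add):
        # lemma -> orth
        return prefix_to_add + lemma[:len(lemma) - suffix_to_cut] + suffix_to_add

    assert decode4analyzer(u'psu', 2, u'ies', []) == u'pies'
    assert decode4generator(u'pies', 3, u'su', u'') == u'psu'
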
morfeusz/Morfeusz.cpp
... ... @@ -24,12 +24,6 @@
24 24  
25 25 using namespace std;
26 26  
27   -static Deserializer<vector<InterpsGroup> >* initializeAnalyzerDeserializer() {
28   - static Deserializer < vector < InterpsGroup > > *deserializer
29   - = new MorphDeserializer();
30   - return deserializer;
31   -}
32   -
33 27 static MorfeuszOptions createDefaultOptions() {
34 28 MorfeuszOptions res;
35 29 res.caseSensitive = true;
... ... @@ -37,95 +31,74 @@ static MorfeuszOptions createDefaultOptions() {
37 31 return res;
38 32 }
39 33  
40   -static SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map) {
41   - SegrulesOptions opts;
42   - opts["aggl"] = "isolated";
43   - opts["praet"] = "split";
44   - return (*(map.find(opts))).second;
45   -}
46   -
47 34 Morfeusz::Morfeusz()
48   -: env(Tagset(DEFAULT_FSA), Tagset(DEFAULT_SYNTH_FSA), DEFAULT_MORFEUSZ_CHARSET),
49   -analyzerPtr(DEFAULT_FSA),
50   -analyzerFSA(FSAType::getFSA(analyzerPtr, *initializeAnalyzerDeserializer())),
51   -segrulesFSAsMap(createSegrulesFSAsMap(analyzerPtr)),
52   -currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap)),
53   -isAnalyzerFSAFromFile(false),
54   -generatorPtr(DEFAULT_SYNTH_FSA),
55   -isGeneratorFSAFromFile(false),
56   -generator(generatorPtr, env),
  35 +: analyzerEnv(DEFAULT_MORFEUSZ_CHARSET, ANALYZER, DEFAULT_FSA),
  36 +generatorEnv(DEFAULT_MORFEUSZ_CHARSET, GENERATOR, DEFAULT_SYNTH_FSA),
57 37 options(createDefaultOptions()) {
58 38  
59 39 }
60 40  
61   -static void deleteSegrulesFSAs(std::map<SegrulesOptions, SegrulesFSA*>& fsasMap) {
62   - for (
63   - std::map<SegrulesOptions, SegrulesFSA*>::iterator it = fsasMap.begin();
64   - it != fsasMap.end();
65   - ++it) {
66   - delete it->second;
67   - }
68   - fsasMap.clear();
69   -}
70   -
71 41 void Morfeusz::setAnalyzerFile(const string& filename) {
72   - if (this->isAnalyzerFSAFromFile) {
73   - delete this->analyzerFSA;
74   - deleteSegrulesFSAs(this->segrulesFSAsMap);
75   - delete this->analyzerPtr;
76   - }
77   - this->analyzerPtr = readFile<unsigned char>(filename.c_str());
78   - this->analyzerFSA = FSA< vector<InterpsGroup> > ::getFSA(analyzerPtr, *initializeAnalyzerDeserializer());
79   - this->segrulesFSAsMap = createSegrulesFSAsMap(analyzerPtr);
80   - this->isAnalyzerFSAFromFile = true;
  42 + this->analyzerEnv.setFSAFile(filename);
  43 + // if (this->isAnalyzerFSAFromFile) {
  44 + // delete this->analyzerFSA;
  45 + // deleteSegrulesFSAs(this->analyzerSegrulesFSAsMap);
  46 + // delete this->analyzerPtr;
  47 + // }
  48 + // this->analyzerPtr = readFile<unsigned char>(filename.c_str());
  49 + // this->analyzerFSA = FSA< vector<InterpsGroup> > ::getFSA(analyzerPtr, *initializeAnalyzerDeserializer());
  50 + // this->analyzerSegrulesFSAsMap = createSegrulesFSAsMap(analyzerPtr);
  51 + // this->isAnalyzerFSAFromFile = true;
81 52 }
82 53  
83 54 void Morfeusz::setGeneratorFile(const string& filename) {
84   - if (this->isGeneratorFSAFromFile) {
85   - delete this->generatorPtr;
86   - }
87   - this->generatorPtr = readFile<unsigned char>(filename.c_str());
88   - this->generator.setGeneratorPtr(generatorPtr);
  55 + this->generatorEnv.setFSAFile(filename);
  56 + // if (this->isGeneratorFSAFromFile) {
  57 + // delete this->generatorPtr;
  58 + // }
  59 + // this->generatorPtr = readFile<unsigned char>(filename.c_str());
  60 + // this->generator.setGeneratorPtr(generatorPtr);
89 61 }
90 62  
91 63 Morfeusz::~Morfeusz() {
92   - if (this->isAnalyzerFSAFromFile) {
93   - delete this->analyzerFSA;
94   - deleteSegrulesFSAs(this->segrulesFSAsMap);
95   - delete this->analyzerPtr;
96   - }
  64 + // if (this->isAnalyzerFSAFromFile) {
  65 + // delete this->analyzerFSA;
  66 + // deleteSegrulesFSAs(this->analyzerSegrulesFSAsMap);
  67 + // delete this->analyzerPtr;
  68 + // }
97 69 }
98 70  
99   -void Morfeusz::analyzeOneWord(
  71 +void Morfeusz::processOneWord(
  72 + const Environment& env,
100 73 const char*& inputStart,
101 74 const char* inputEnd,
102 75 int startNodeNum,
103 76 std::vector<MorphInterpretation>& results) const {
104 77 while (inputStart != inputEnd
105   - && isEndOfWord(this->env.getCharsetConverter().peek(inputStart, inputEnd))) {
106   - this->env.getCharsetConverter().next(inputStart, inputEnd);
  78 + && isEndOfWord(env.getCharsetConverter().peek(inputStart, inputEnd))) {
  79 + env.getCharsetConverter().next(inputStart, inputEnd);
107 80 }
108 81 vector<InterpretedChunk> accum;
109 82 FlexionGraph graph;
110 83 const char* currInput = inputStart;
111   - SegrulesFSA* segrulesFSA = this->currSegrulesFSA;
112   - doAnalyzeOneWord(currInput, inputEnd, accum, graph, segrulesFSA->initialState);
  84 + const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA();
  85 + doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph);
113 86 if (!graph.empty()) {
114   - InterpretedChunksDecoder interpretedChunksDecoder(env);
  87 + const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder();
115 88 int srcNode = startNodeNum;
116 89 for (unsigned int i = 0; i < graph.getTheGraph().size(); i++) {
117 90 vector<FlexionGraph::Edge>& edges = graph.getTheGraph()[i];
118 91 for (unsigned int j = 0; j < edges.size(); j++) {
119 92 FlexionGraph::Edge& e = edges[j];
120 93 int targetNode = startNodeNum + e.nextNode;
121   - interpretedChunksDecoder.decode(srcNode, targetNode, e.chunk, back_inserter(results));
  94 + interpretedChunksDecoder.decode(srcNode, targetNode, e.chunk, results);
122 95 }
123 96 srcNode++;
124 97 }
125 98 // graph.getResults(*this->tagset, results);
126 99 }
127 100 else if (inputStart != inputEnd) {
128   - this->appendIgnotiumToResults(string(inputStart, currInput), startNodeNum, results);
  101 + this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results);
129 102 }
130 103 inputStart = currInput;
131 104 }
... ... @@ -139,109 +112,82 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) {
139 112 from.orthWasShifted = true;
140 113 }
141 114  
142   -void Morfeusz::doAnalyzeOneWord(
  115 +void Morfeusz::doProcessOneWord(
  116 + const Environment& env,
143 117 const char*& inputData,
144 118 const char* inputEnd,
  119 + SegrulesState segrulesState,
145 120 vector<InterpretedChunk>& accum,
146   - FlexionGraph& graph,
147   - SegrulesState segrulesState) const {
148   - // cerr << "doAnalyzeOneWord " << inputData << endl;
149   - bool endOfWord = inputData == inputEnd;
  121 + FlexionGraph& graph) const {
  122 + cerr << "doAnalyzeOneWord " << inputData << endl;
  123 + bool endOfProcessing = inputData == inputEnd;
150 124 const char* currInput = inputData;
151   - uint32_t codepoint = endOfWord ? 0 : this->env.getCharsetConverter().next(currInput, inputEnd);
  125 + uint32_t codepoint = endOfProcessing ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
152 126 // UnicodeChunk uchunk(*(this->charsetConverter), *(this->caseConverter));
153 127 vector<uint32_t> originalCodepoints;
154 128 vector<uint32_t> lowercaseCodepoints;
155 129  
156   - StateType state = this->analyzerFSA->getInitialState();
  130 + StateType state = env.getFSA().getInitialState();
157 131  
158   - while (!isEndOfWord(codepoint)) {
159   - uint32_t lowerCP = this->env.getCaseConverter().toLower(codepoint);
  132 + while (!endOfProcessing) {
  133 + if (isEndOfWord(codepoint)) {
  134 + endOfProcessing = true;
  135 + }
  136 + cerr << "not end of word '" << string(currInput) << "'" << endl;
  137 + uint32_t lowerCP = env.getCaseConverter().toLower(codepoint);
160 138 originalCodepoints.push_back(codepoint);
161 139 lowercaseCodepoints.push_back(lowerCP);
162 140 feedState(state, lowerCP, UTF8CharsetConverter());
163   - codepoint = currInput == inputEnd ? 0 : this->env.getCharsetConverter().peek(currInput, inputEnd);
164   - if (!isEndOfWord(codepoint)) {
165   - if (state.isAccepting()) {
166   - vector<InterpsGroup> val(state.getValue());
167   - for (unsigned int i = 0; i < val.size(); i++) {
168   - InterpsGroup& ig = val[i];
169   - // newSegrulesState.proceedToNext(ig.type);
170   - // this->currSegrulesFSA->proceedToNext(ig.type, segrulesStates, newSegrulesStates);
171   - set<SegrulesState> newSegrulesStates;
172   - currSegrulesFSA->proceedToNext(ig.type, segrulesState, newSegrulesStates);
173   - for (
174   - set<SegrulesState>::iterator it = newSegrulesStates.begin();
175   - it != newSegrulesStates.end();
176   - it++) {
177   - SegrulesState newSegrulesState = *it;
178   - // bool shiftOrth = newSegrulesState.getLastTransitionValue() == 1;
179   - // bool shiftOrthSameType = newSegrulesState.getLastTransitionValue() == 2;
180   - InterpretedChunk ic = {
181   - inputData,
182   - originalCodepoints,
183   - lowercaseCodepoints,
184   - ig,
185   - newSegrulesState.shiftOrthFromPrevious,
186   - false,
187   - vector<InterpretedChunk>()
188   - };
189   - if (!accum.empty() && accum.back().shiftOrth) {
190   - doShiftOrth(accum.back(), ic);
191   - }
192   - accum.push_back(ic);
193   - const char* newCurrInput = currInput;
194   - doAnalyzeOneWord(newCurrInput, inputEnd, accum, graph, newSegrulesState);
195   - accum.pop_back();
196   - }
197   - }
198   - }
199   -
200   - this->env.getCharsetConverter().next(currInput, inputEnd);
201   - }
202   - }
203   - // cerr << "end of word" << endl;
204   - // we are at the end of word
205   - if (state.isAccepting()) {
206   - vector<InterpsGroup > val(state.getValue());
207   - for (unsigned int i = 0; i < val.size(); i++) {
208   - InterpsGroup& ig = val[i];
209   - // cerr << "currInput=" << currInput << endl;
210   - // cerr << "type=" << (int) ig.type << endl;
211   - set<SegrulesState> newSegrulesStates;
212   - currSegrulesFSA->proceedToNext(ig.type, segrulesState, newSegrulesStates);
213   - for (
214   - set<SegrulesState>::iterator it = newSegrulesStates.begin();
215   - it != newSegrulesStates.end();
216   - it++) {
217   - SegrulesState newSegrulesState = *it;
218   - if (newSegrulesState.accepting) {
  141 + codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().peek(currInput, inputEnd);
  142 + if (state.isAccepting()) {
  143 + cerr << "accepting" << endl;
  144 + vector<InterpsGroup> val(state.getValue());
  145 + for (unsigned int i = 0; i < val.size(); i++) {
  146 + InterpsGroup& ig = val[i];
  147 + set<SegrulesState> newSegrulesStates;
  148 + env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates);
  149 + for (
  150 + set<SegrulesState>::iterator it = newSegrulesStates.begin();
  151 + it != newSegrulesStates.end();
  152 + ++it) {
  153 + SegrulesState newSegrulesState = *it;
219 154 InterpretedChunk ic = {
220   - inputData,
221   - originalCodepoints,
222   - lowercaseCodepoints,
223   - ig,
224   - newSegrulesState.shiftOrthFromPrevious,
  155 + inputData,
  156 + originalCodepoints,
  157 + lowercaseCodepoints,
  158 + ig,
  159 + newSegrulesState.shiftOrthFromPrevious,
225 160 false,
226   - vector<InterpretedChunk>()};
  161 + vector<InterpretedChunk>()
  162 + };
227 163 if (!accum.empty() && accum.back().shiftOrth) {
228 164 doShiftOrth(accum.back(), ic);
229 165 }
230 166 accum.push_back(ic);
231   - graph.addPath(accum);
  167 + if (isEndOfWord(codepoint)) {
  168 + cerr << "end of word inside " << currInput <<endl;
  169 + if (newSegrulesState.accepting)
  170 + graph.addPath(accum);
  171 + }
  172 + else {
  173 + const char* newCurrInput = currInput;
  174 + doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph);
  175 + }
232 176 accum.pop_back();
233 177 }
234 178 }
235 179 }
236 180 }
  181 + cerr << "end of word " << currInput << endl;
237 182 inputData = currInput;
238 183 }
239 184  
240 185 void Morfeusz::appendIgnotiumToResults(
  186 + const Environment& env,
241 187 const string& word,
242 188 int startNodeNum,
243 189 std::vector<MorphInterpretation>& results) const {
244   - MorphInterpretation interp = MorphInterpretation::createIgn(startNodeNum, word, env.getAnalyzerTagset(), env.getCharsetConverter());
  190 + MorphInterpretation interp = MorphInterpretation::createIgn(startNodeNum, word, env.getTagset(), env.getCharsetConverter());
245 191 results.push_back(interp);
246 192 }
247 193  
... ... @@ -258,7 +204,7 @@ void Morfeusz::analyze(const string& text, vector<MorphInterpretation>& results)
258 204 const char* inputEnd = input + text.length();
259 205 while (input != inputEnd) {
260 206 int startNode = results.empty() ? 0 : results.back().getEndNode();
261   - this->analyzeOneWord(input, inputEnd, startNode, results);
  207 + this->processOneWord(this->analyzerEnv, input, inputEnd, startNode, results);
262 208 }
263 209 }
264 210  
... ... @@ -271,12 +217,18 @@ ResultsIterator Morfeusz::generate(const string& text) const {
271 217 }
272 218  
273 219 void Morfeusz::generate(const string& text, vector<MorphInterpretation>& results) const {
274   - this->generator.generate(text, results);
  220 + const char* input = text.c_str();
  221 + const char* inputEnd = input + text.length();
  222 + while (input != inputEnd) {
  223 + int startNode = results.empty() ? 0 : results.back().getEndNode();
  224 + this->processOneWord(this->generatorEnv, input, inputEnd, startNode, results);
  225 + }
275 226 }
276 227  
277 228 void Morfeusz::setCharset(MorfeuszCharset charset) {
278 229 this->options.encoding = charset;
279   - this->env.setCharset(charset);
  230 + this->analyzerEnv.setCharset(charset);
  231 + this->generatorEnv.setCharset(charset);
280 232 }
281 233  
282 234 ResultsIterator::ResultsIterator(vector<MorphInterpretation>& res) {
... ...
morfeusz/Morfeusz.hpp
... ... @@ -33,7 +33,6 @@
33 33 class Morfeusz;
34 34 class ResultsIterator;
35 35  
36   -typedef FSA< std::vector<InterpsGroup > > FSAType;
37 36 typedef State< std::vector<InterpsGroup > > StateType;
38 37  
39 38 class Morfeusz {
... ... @@ -102,33 +101,38 @@ public:
102 101 friend class ResultsIterator;
103 102 private:
104 103  
105   - void analyzeOneWord(
  104 + void processOneWord(
  105 + const Environment& env,
106 106 const char*& inputData,
107 107 const char* inputEnd,
108 108 int startNodeNum,
109 109 std::vector<MorphInterpretation>& result) const;
110 110  
111   - void doAnalyzeOneWord(
  111 + void doProcessOneWord(
  112 + const Environment& env,
112 113 const char*& inputData,
113 114 const char* inputEnd,
  115 + SegrulesState segrulesState,
114 116 std::vector<InterpretedChunk>& accum,
115   - FlexionGraph& graph,
116   - SegrulesState segrulesState) const;
  117 + FlexionGraph& graph) const;
117 118  
118 119 void appendIgnotiumToResults(
  120 + const Environment& env,
119 121 const std::string& word,
120 122 int startNodeNum,
121 123 std::vector<MorphInterpretation>& results) const;
122   - Environment env;
123   - const unsigned char* analyzerPtr;
124   - FSAType* analyzerFSA;
125   - std::map<SegrulesOptions, SegrulesFSA*> segrulesFSAsMap;
126   - SegrulesFSA* currSegrulesFSA;
127   - bool isAnalyzerFSAFromFile;
128   -
129   - const unsigned char* generatorPtr;
130   - bool isGeneratorFSAFromFile;
131   - Generator generator;
  124 + Environment analyzerEnv;
  125 + Environment generatorEnv;
  126 +// const unsigned char* analyzerPtr;
  127 +// FSAType* analyzerFSA;
  128 +// std::map<SegrulesOptions, SegrulesFSA*> analyzerSegrulesFSAsMap;
  129 +// SegrulesFSA* currAnalyzerSegrulesFSA;
  130 +// bool isAnalyzerFSAFromFile;
  131 +//
  132 +// const unsigned char* generatorPtr;
  133 +// FSAType* generatorFSA;
  134 +// bool isGeneratorFSAFromFile;
  135 +// Generator generator;
132 136  
133 137 MorfeuszOptions options;
134 138 };
... ...
morfeusz/MorphDeserializer.cpp
... ... @@ -23,7 +23,7 @@ MorphDeserializer::MorphDeserializer() {
23 23 MorphDeserializer::~MorphDeserializer() {
24 24 }
25 25  
26   -static void deserializeLemma(const unsigned char*& ptr, EncodedLemma& lemma) {
  26 +static void deserializeLemma(const unsigned char*& ptr, EncodedForm& lemma) {
27 27 // XXX watch out for data validity
28 28 lemma.suffixToCut = *ptr;
29 29 ptr++;
... ... @@ -64,7 +64,7 @@ static void deserializeLemma(const unsigned char*& ptr, EncodedLemma& lemma) {
64 64 static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) {
65 65 interp.type = *ptr;
66 66 ptr++;
67   - deserializeLemma(ptr, interp.lemma);
  67 + deserializeLemma(ptr, interp.value);
68 68 interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr)));
69 69 ptr += 2;
70 70 interp.nameClassifier = *ptr;
... ...
morfeusz/const.hpp
... ... @@ -18,6 +18,11 @@ enum MorfeuszCharset {
18 18 CP852
19 19 };
20 20  
  21 +enum MorfeuszProcessorType {
  22 + GENERATOR,
  23 + ANALYZER
  24 +};
  25 +
21 26 extern const MorfeuszCharset DEFAULT_MORFEUSZ_CHARSET;
22 27  
23 28 extern const unsigned char SHIFT_ORTH_NODE;
... ...
morfeusz/fsa/fsa_impl.hpp
... ... @@ -88,8 +88,6 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial
88 88 return new CompressedFSA1<T>(startPtr, deserializer);
89 89 case 2:
90 90 return new CompressedFSA2<T>(startPtr, deserializer);
91   - case 128:
92   - return new SimpleFSA<T>(startPtr, deserializer, true);
93 91 default:
94 92 std::ostringstream oss;
95 93 oss << "Invalid implementation number: " << versionNum << ", should be: " << VERSION_NUM;
... ...
nbproject/configurations.xml
... ... @@ -106,20 +106,14 @@
106 106 </makeTool>
107 107 </makefileType>
108 108 <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4">
109   - <ccTool flags="1">
110   - </ccTool>
111 109 </item>
112 110 <item path="../default_synth_fsa.cpp" ex="false" tool="1" flavor2="4">
113   - <ccTool flags="1">
114   - </ccTool>
115 111 </item>
116 112 <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4">
117 113 </item>
118 114 <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4">
119 115 </item>
120 116 <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4">
121   - <ccTool flags="1">
122   - </ccTool>
123 117 </item>
124 118 <item path="build/morfeusz/morfeuszJAVA_wrap.cxx"
125 119 ex="false"
... ... @@ -414,26 +408,18 @@
414 408 </ccTool>
415 409 </item>
416 410 <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4">
417   - <ccTool flags="1">
418   - </ccTool>
419 411 </item>
420 412 <item path="morfeusz/charset/CharsetConverter.cpp"
421 413 ex="false"
422 414 tool="1"
423 415 flavor2="4">
424   - <ccTool flags="1">
425   - </ccTool>
426 416 </item>
427 417 <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4">
428   - <ccTool flags="1">
429   - </ccTool>
430 418 </item>
431 419 <item path="morfeusz/charset/conversion_tables.cpp"
432 420 ex="false"
433 421 tool="1"
434 422 flavor2="4">
435   - <ccTool flags="1">
436   - </ccTool>
437 423 </item>
438 424 <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4">
439 425 <ccTool flags="1">
... ... @@ -522,12 +508,8 @@
522 508 ex="false"
523 509 tool="1"
524 510 flavor2="4">
525   - <ccTool flags="1">
526   - </ccTool>
527 511 </item>
528 512 <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4">
529   - <ccTool flags="1">
530   - </ccTool>
531 513 </item>
532 514 <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4">
533 515 <ccTool flags="0">
... ...