Commit d42f73bcc0bb85658229c6d1c1fa0366a4d5be02

Authored by Michał Lenart
1 parent e2ef01be

już praktycznie działa zakodowywanie interpretacji morfologicznych tak, jak ma być "docelowo"

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@15 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/fsa/buildfsa.py
@@ -16,9 +16,6 @@ from serializer import VLengthSerializer1, VLengthSerializer2, SimpleSerializer @@ -16,9 +16,6 @@ from serializer import VLengthSerializer1, VLengthSerializer2, SimpleSerializer
16 from visualizer import Visualizer 16 from visualizer import Visualizer
17 from optparse import OptionParser 17 from optparse import OptionParser
18 18
19 -from pycallgraph import PyCallGraph  
20 -from pycallgraph.output import GraphvizOutput  
21 -  
22 class OutputFormat(): 19 class OutputFormat():
23 BINARY = 'BINARY' 20 BINARY = 'BINARY'
24 CPP = 'CPP' 21 CPP = 'CPP'
@@ -131,8 +128,7 @@ def _parseOptions(): @@ -131,8 +128,7 @@ def _parseOptions():
131 # exit(1) 128 # exit(1)
132 return opts 129 return opts
133 130
134 -def _readPolimorfInput(inputFile, tagsetFile, encoder):  
135 - tagset = common.Tagset(tagsetFile) 131 +def _readPolimorfInput(inputFile, tagset, encoder):
136 with codecs.open(inputFile, 'r', 'utf8') as f: 132 with codecs.open(inputFile, 'r', 'utf8') as f:
137 for entry in convertinput.convertPolimorf(f, tagset, encoder): 133 for entry in convertinput.convertPolimorf(f, tagset, encoder):
138 yield entry 134 yield entry
@@ -167,8 +163,9 @@ def _printStats(fsa): @@ -167,8 +163,9 @@ def _printStats(fsa):
167 163
168 def buildFromPoliMorf(inputFile, tagsetFile): 164 def buildFromPoliMorf(inputFile, tagsetFile):
169 encoder = encode.MorphEncoder() 165 encoder = encode.MorphEncoder()
170 - fsa = FSA(encoder)  
171 - inputData = _readPolimorfInput(inputFile, tagsetFile, encoder) 166 + tagset = common.Tagset(tagsetFile)
  167 + fsa = FSA(encoder, tagset)
  168 + inputData = _readPolimorfInput(inputFile, tagset, encoder)
172 fsa.feed(inputData) 169 fsa.feed(inputData)
173 _printStats(fsa) 170 _printStats(fsa)
174 return fsa 171 return fsa
@@ -230,6 +227,8 @@ def main(opts): @@ -230,6 +227,8 @@ def main(opts):
230 if __name__ == '__main__': 227 if __name__ == '__main__':
231 opts = _parseOptions() 228 opts = _parseOptions()
232 if opts.profile: 229 if opts.profile:
  230 + from pycallgraph import PyCallGraph
  231 + from pycallgraph.output import GraphvizOutput
233 with PyCallGraph(output=GraphvizOutput()): 232 with PyCallGraph(output=GraphvizOutput()):
234 main(opts) 233 main(opts)
235 else: 234 else:
fsabuilder/fsa/common.py
@@ -23,15 +23,15 @@ class Interpretation(object): @@ -23,15 +23,15 @@ class Interpretation(object):
23 root += o 23 root += o
24 else: 24 else:
25 break 25 break
26 - cutLength = len(orth) - len(root) 26 + cutLength = len(encoder.encodeWord(orth)) - len(encoder.encodeWord(root))
27 self.lemma = Lemma( 27 self.lemma = Lemma(
28 cutLength=cutLength, 28 cutLength=cutLength,
29 - suffixToAdd=base[len(root):]) 29 + suffixToAdd=encoder.encodeWord(base[len(root):], lowercase=False))
30 self.tagnum = tagnum 30 self.tagnum = tagnum
31 self.namenum = namenum 31 self.namenum = namenum
32 32
33 def getSortKey(self): 33 def getSortKey(self):
34 - return (self.lemma.cutLength, self.lemma.suffixToAdd, self.tagnum, self.namenum) 34 + return (self.lemma.cutLength, tuple(self.lemma.suffixToAdd), self.tagnum, self.namenum)
35 35
36 def __eq__(self, other): 36 def __eq__(self, other):
37 if isinstance(other, Interpretation): 37 if isinstance(other, Interpretation):
fsabuilder/fsa/encode.py
@@ -38,8 +38,8 @@ class Encoder(object): @@ -38,8 +38,8 @@ class Encoder(object):
38 38
39 class SimpleEncoder(Encoder): 39 class SimpleEncoder(Encoder):
40 40
41 - def __init__(self, encoding='utf8', appendZero=False):  
42 - super(SimpleEncoder, self).__init__(encoding, appendZero) 41 + def __init__(self, encoding='utf8'):
  42 + super(SimpleEncoder, self).__init__(encoding)
43 43
44 def encodeData(self, data): 44 def encodeData(self, data):
45 return bytearray(data, encoding=self.encoding) + bytearray([0]) 45 return bytearray(data, encoding=self.encoding) + bytearray([0])
@@ -70,7 +70,7 @@ class MorphEncoder(Encoder): @@ -70,7 +70,7 @@ class MorphEncoder(Encoder):
70 res = bytearray() 70 res = bytearray()
71 assert lemma.cutLength < 256 and lemma.cutLength >= 0 71 assert lemma.cutLength < 256 and lemma.cutLength >= 0
72 res.append(lemma.cutLength) 72 res.append(lemma.cutLength)
73 - res.extend(self.encodeWord(lemma.suffixToAdd, lowercase=False)) 73 + res.extend(lemma.suffixToAdd)
74 res.append(0) 74 res.append(0)
75 return res 75 return res
76 76
fsabuilder/fsa/fsa.py
@@ -14,11 +14,12 @@ class FSA(object): @@ -14,11 +14,12 @@ class FSA(object):
14 ''' 14 '''
15 15
16 16
17 - def __init__(self, encoder): 17 + def __init__(self, encoder, tagset=None):
18 self.encodeWord = encoder.encodeWord 18 self.encodeWord = encoder.encodeWord
19 self.encodeData = encoder.encodeData 19 self.encodeData = encoder.encodeData
20 self.decodeData = encoder.decodeData 20 self.decodeData = encoder.decodeData
21 self.encodedPrevWord = None 21 self.encodedPrevWord = None
  22 + self.tagset = tagset
22 self.initialState = state.State() 23 self.initialState = state.State()
23 self.register = register.Register() 24 self.register = register.Register()
24 self.label2Freq = {} 25 self.label2Freq = {}
fsabuilder/fsa/serializer.py
@@ -18,6 +18,9 @@ class Serializer(object): @@ -18,6 +18,9 @@ class Serializer(object):
18 def fsa(self): 18 def fsa(self):
19 return self._fsa 19 return self._fsa
20 20
  21 + def getVersion(self):
  22 + return 9
  23 +
21 def serialize2CppFile(self, fname): 24 def serialize2CppFile(self, fname):
22 res = [] 25 res = []
23 # self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) 26 # self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state))
@@ -39,14 +42,48 @@ class Serializer(object): @@ -39,14 +42,48 @@ class Serializer(object):
39 def fsa2bytearray(self): 42 def fsa2bytearray(self):
40 43
41 res = bytearray() 44 res = bytearray()
42 - res.extend(self.serializePrologue()) 45 + res.extend(self.serializePrologue(self.serializeTagset(self.fsa.tagset)))
43 self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) 46 self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state))
44 logging.debug('SERIALIZE') 47 logging.debug('SERIALIZE')
45 for state in sorted(self.fsa.dfs(), key=lambda s: s.offset): 48 for state in sorted(self.fsa.dfs(), key=lambda s: s.offset):
46 res.extend(self.state2bytearray(state)) 49 res.extend(self.state2bytearray(state))
47 return res 50 return res
48 51
49 - def serializePrologue(self): 52 + def serializeTags(self, tagsMap):
  53 + res = bytearray()
  54 + numOfTags = len(tagsMap)
  55 + res.extend(self.htons(numOfTags))
  56 + for tag, tagnum in sorted(tagsMap.iteritems(), key=lambda (tag, tagnum): tagnum):
  57 + res.extend(self.htons(tagnum))
  58 + res.extend(self.fsa.encodeWord(tag))
  59 + res.append(0)
  60 + return res
  61 +
  62 + def serializeTagset(self, tagset):
  63 + res = bytearray()
  64 + if tagset:
  65 + res.extend(self.serializeTags(tagset.tag2tagnum))
  66 + res.extend(self.serializeTags(tagset.name2namenum))
  67 + return res
  68 +
  69 + def htons(self, n):
  70 + assert n < 65536
  71 + assert n >= 0
  72 + res = bytearray()
  73 + res.append((n & 0x00FF00) >> 8)
  74 + res.append(n & 0x0000FF)
  75 + return res
  76 +
  77 + def htonl(self, n):
  78 + assert n >= 0
  79 + res = bytearray()
  80 + res.append((n & 0xFF000000) >> 24)
  81 + res.append((n & 0x00FF0000) >> 16)
  82 + res.append((n & 0x0000FF00) >> 8)
  83 + res.append(n & 0x000000FF)
  84 + return res
  85 +
  86 + def serializePrologue(self, additionalData=None):
50 res = bytearray() 87 res = bytearray()
51 88
52 # serialize magic number in big-endian order 89 # serialize magic number in big-endian order
@@ -61,6 +98,15 @@ class Serializer(object): @@ -61,6 +98,15 @@ class Serializer(object):
61 # serialize implementation code 98 # serialize implementation code
62 res.append(self.getImplementationCode()) 99 res.append(self.getImplementationCode())
63 100
  101 + # serialize additional data size in 4-byte big-endian
  102 + additionalDataSize = len(additionalData) if additionalData else 0
  103 + res.extend(self.htonl(additionalDataSize))
  104 +
  105 + # add additional data itself
  106 + if additionalDataSize:
  107 + assert type(additionalData) == bytearray
  108 + res.extend(additionalData)
  109 +
64 return res 110 return res
65 111
66 def state2bytearray(self, state): 112 def state2bytearray(self, state):
@@ -81,9 +127,6 @@ class Serializer(object): @@ -81,9 +127,6 @@ class Serializer(object):
81 def transitionsData2bytearray(self, state): 127 def transitionsData2bytearray(self, state):
82 raise NotImplementedError('Not implemented') 128 raise NotImplementedError('Not implemented')
83 129
84 - def getVersion(self):  
85 - raise NotImplementedError('Not implemented')  
86 -  
87 def getImplementationCode(self): 130 def getImplementationCode(self):
88 raise NotImplementedError('Not implemented') 131 raise NotImplementedError('Not implemented')
89 132
@@ -93,9 +136,6 @@ class SimpleSerializer(Serializer): @@ -93,9 +136,6 @@ class SimpleSerializer(Serializer):
93 super(SimpleSerializer, self).__init__(fsa) 136 super(SimpleSerializer, self).__init__(fsa)
94 self.ACCEPTING_FLAG = 128 137 self.ACCEPTING_FLAG = 128
95 138
96 - def getVersion(self):  
97 - return 8  
98 -  
99 def getImplementationCode(self): 139 def getImplementationCode(self):
100 return 0 140 return 0
101 141
@@ -141,9 +181,6 @@ class VLengthSerializer1(Serializer): @@ -141,9 +181,6 @@ class VLengthSerializer1(Serializer):
141 self.ACCEPTING_FLAG = 0b10000000 181 self.ACCEPTING_FLAG = 0b10000000
142 self.ARRAY_FLAG = 0b01000000 182 self.ARRAY_FLAG = 0b01000000
143 183
144 - def getVersion(self):  
145 - return 8  
146 -  
147 def getImplementationCode(self): 184 def getImplementationCode(self):
148 return 1 185 return 1
149 186
@@ -302,9 +339,6 @@ class VLengthSerializer2(Serializer): @@ -302,9 +339,6 @@ class VLengthSerializer2(Serializer):
302 self.ACCEPTING_FLAG = 64 339 self.ACCEPTING_FLAG = 64
303 self.LAST_FLAG = 32 340 self.LAST_FLAG = 32
304 341
305 - def getVersion(self):  
306 - return 8  
307 -  
308 def getImplementationCode(self): 342 def getImplementationCode(self):
309 return 2 343 return 2
310 344
fsabuilder/fsa/test/testConstruction.py
@@ -7,6 +7,7 @@ Created on Oct 8, 2013 @@ -7,6 +7,7 @@ Created on Oct 8, 2013
7 import unittest 7 import unittest
8 import os 8 import os
9 from fsa import fsa, visualizer, encode, buildfsa 9 from fsa import fsa, visualizer, encode, buildfsa
  10 +from fsa.serializer import SimpleSerializer
10 11
11 class Test(unittest.TestCase): 12 class Test(unittest.TestCase):
12 13
@@ -59,6 +60,8 @@ class Test(unittest.TestCase): @@ -59,6 +60,8 @@ class Test(unittest.TestCase):
59 inputFile = os.path.join(os.path.dirname(__file__), 'PoliMorfSmall.tab') 60 inputFile = os.path.join(os.path.dirname(__file__), 'PoliMorfSmall.tab')
60 tagsetFile = os.path.join(os.path.dirname(__file__), 'polimorf.tagset') 61 tagsetFile = os.path.join(os.path.dirname(__file__), 'polimorf.tagset')
61 fsa = buildfsa.buildFromPoliMorf(inputFile, tagsetFile) 62 fsa = buildfsa.buildFromPoliMorf(inputFile, tagsetFile)
  63 + serializer = SimpleSerializer(fsa)
  64 + serializer.serialize2BinaryFile('/tmp/test0.fsa')
62 # visualizer.Visualizer().visualize(fsa) 65 # visualizer.Visualizer().visualize(fsa)
63 66
64 if __name__ == "__main__": 67 if __name__ == "__main__":