Commit d42f73bcc0bb85658229c6d1c1fa0366a4d5be02

Authored by Michał Lenart
1 parent e2ef01be

już praktycznie działa zakodowywanie interpretacji morfologicznych tak, jak ma być "docelowo"

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@15 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/fsa/buildfsa.py
... ... @@ -16,9 +16,6 @@ from serializer import VLengthSerializer1, VLengthSerializer2, SimpleSerializer
16 16 from visualizer import Visualizer
17 17 from optparse import OptionParser
18 18  
19   -from pycallgraph import PyCallGraph
20   -from pycallgraph.output import GraphvizOutput
21   -
22 19 class OutputFormat():
23 20 BINARY = 'BINARY'
24 21 CPP = 'CPP'
... ... @@ -131,8 +128,7 @@ def _parseOptions():
131 128 # exit(1)
132 129 return opts
133 130  
134   -def _readPolimorfInput(inputFile, tagsetFile, encoder):
135   - tagset = common.Tagset(tagsetFile)
  131 +def _readPolimorfInput(inputFile, tagset, encoder):
136 132 with codecs.open(inputFile, 'r', 'utf8') as f:
137 133 for entry in convertinput.convertPolimorf(f, tagset, encoder):
138 134 yield entry
... ... @@ -167,8 +163,9 @@ def _printStats(fsa):
167 163  
168 164 def buildFromPoliMorf(inputFile, tagsetFile):
169 165 encoder = encode.MorphEncoder()
170   - fsa = FSA(encoder)
171   - inputData = _readPolimorfInput(inputFile, tagsetFile, encoder)
  166 + tagset = common.Tagset(tagsetFile)
  167 + fsa = FSA(encoder, tagset)
  168 + inputData = _readPolimorfInput(inputFile, tagset, encoder)
172 169 fsa.feed(inputData)
173 170 _printStats(fsa)
174 171 return fsa
... ... @@ -230,6 +227,8 @@ def main(opts):
230 227 if __name__ == '__main__':
231 228 opts = _parseOptions()
232 229 if opts.profile:
  230 + from pycallgraph import PyCallGraph
  231 + from pycallgraph.output import GraphvizOutput
233 232 with PyCallGraph(output=GraphvizOutput()):
234 233 main(opts)
235 234 else:
... ...
fsabuilder/fsa/common.py
... ... @@ -23,15 +23,15 @@ class Interpretation(object):
23 23 root += o
24 24 else:
25 25 break
26   - cutLength = len(orth) - len(root)
  26 + cutLength = len(encoder.encodeWord(orth)) - len(encoder.encodeWord(root))
27 27 self.lemma = Lemma(
28 28 cutLength=cutLength,
29   - suffixToAdd=base[len(root):])
  29 + suffixToAdd=encoder.encodeWord(base[len(root):], lowercase=False))
30 30 self.tagnum = tagnum
31 31 self.namenum = namenum
32 32  
33 33 def getSortKey(self):
34   - return (self.lemma.cutLength, self.lemma.suffixToAdd, self.tagnum, self.namenum)
  34 + return (self.lemma.cutLength, tuple(self.lemma.suffixToAdd), self.tagnum, self.namenum)
35 35  
36 36 def __eq__(self, other):
37 37 if isinstance(other, Interpretation):
... ...
fsabuilder/fsa/encode.py
... ... @@ -38,8 +38,8 @@ class Encoder(object):
38 38  
39 39 class SimpleEncoder(Encoder):
40 40  
41   - def __init__(self, encoding='utf8', appendZero=False):
42   - super(SimpleEncoder, self).__init__(encoding, appendZero)
  41 + def __init__(self, encoding='utf8'):
  42 + super(SimpleEncoder, self).__init__(encoding)
43 43  
44 44 def encodeData(self, data):
45 45 return bytearray(data, encoding=self.encoding) + bytearray([0])
... ... @@ -70,7 +70,7 @@ class MorphEncoder(Encoder):
70 70 res = bytearray()
71 71 assert lemma.cutLength < 256 and lemma.cutLength >= 0
72 72 res.append(lemma.cutLength)
73   - res.extend(self.encodeWord(lemma.suffixToAdd, lowercase=False))
  73 + res.extend(lemma.suffixToAdd)
74 74 res.append(0)
75 75 return res
76 76  
... ...
fsabuilder/fsa/fsa.py
... ... @@ -14,11 +14,12 @@ class FSA(object):
14 14 '''
15 15  
16 16  
17   - def __init__(self, encoder):
  17 + def __init__(self, encoder, tagset=None):
18 18 self.encodeWord = encoder.encodeWord
19 19 self.encodeData = encoder.encodeData
20 20 self.decodeData = encoder.decodeData
21 21 self.encodedPrevWord = None
  22 + self.tagset = tagset
22 23 self.initialState = state.State()
23 24 self.register = register.Register()
24 25 self.label2Freq = {}
... ...
fsabuilder/fsa/serializer.py
... ... @@ -18,6 +18,9 @@ class Serializer(object):
18 18 def fsa(self):
19 19 return self._fsa
20 20  
  21 + def getVersion(self):
  22 + return 9
  23 +
21 24 def serialize2CppFile(self, fname):
22 25 res = []
23 26 # self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state))
... ... @@ -39,14 +42,48 @@ class Serializer(object):
39 42 def fsa2bytearray(self):
40 43  
41 44 res = bytearray()
42   - res.extend(self.serializePrologue())
  45 + res.extend(self.serializePrologue(self.serializeTagset(self.fsa.tagset)))
43 46 self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state))
44 47 logging.debug('SERIALIZE')
45 48 for state in sorted(self.fsa.dfs(), key=lambda s: s.offset):
46 49 res.extend(self.state2bytearray(state))
47 50 return res
48 51  
49   - def serializePrologue(self):
  52 + def serializeTags(self, tagsMap):
  53 + res = bytearray()
  54 + numOfTags = len(tagsMap)
  55 + res.extend(self.htons(numOfTags))
  56 + for tag, tagnum in sorted(tagsMap.iteritems(), key=lambda (tag, tagnum): tagnum):
  57 + res.extend(self.htons(tagnum))
  58 + res.extend(self.fsa.encodeWord(tag))
  59 + res.append(0)
  60 + return res
  61 +
  62 + def serializeTagset(self, tagset):
  63 + res = bytearray()
  64 + if tagset:
  65 + res.extend(self.serializeTags(tagset.tag2tagnum))
  66 + res.extend(self.serializeTags(tagset.name2namenum))
  67 + return res
  68 +
  69 + def htons(self, n):
  70 + assert n < 65536
  71 + assert n >= 0
  72 + res = bytearray()
  73 + res.append((n & 0x00FF00) >> 8)
  74 + res.append(n & 0x0000FF)
  75 + return res
  76 +
  77 + def htonl(self, n):
  78 + assert n >= 0
  79 + res = bytearray()
  80 + res.append((n & 0xFF000000) >> 24)
  81 + res.append((n & 0x00FF0000) >> 16)
  82 + res.append((n & 0x0000FF00) >> 8)
  83 + res.append(n & 0x000000FF)
  84 + return res
  85 +
  86 + def serializePrologue(self, additionalData=None):
50 87 res = bytearray()
51 88  
52 89 # serialize magic number in big-endian order
... ... @@ -61,6 +98,15 @@ class Serializer(object):
61 98 # serialize implementation code
62 99 res.append(self.getImplementationCode())
63 100  
  101 + # serialize additional data size as a 4-byte big-endian integer
  102 + additionalDataSize = len(additionalData) if additionalData else 0
  103 + res.extend(self.htonl(additionalDataSize))
  104 +
  105 + # add additional data itself
  106 + if additionalDataSize:
  107 + assert type(additionalData) == bytearray
  108 + res.extend(additionalData)
  109 +
64 110 return res
65 111  
66 112 def state2bytearray(self, state):
... ... @@ -81,9 +127,6 @@ class Serializer(object):
81 127 def transitionsData2bytearray(self, state):
82 128 raise NotImplementedError('Not implemented')
83 129  
84   - def getVersion(self):
85   - raise NotImplementedError('Not implemented')
86   -
87 130 def getImplementationCode(self):
88 131 raise NotImplementedError('Not implemented')
89 132  
... ... @@ -93,9 +136,6 @@ class SimpleSerializer(Serializer):
93 136 super(SimpleSerializer, self).__init__(fsa)
94 137 self.ACCEPTING_FLAG = 128
95 138  
96   - def getVersion(self):
97   - return 8
98   -
99 139 def getImplementationCode(self):
100 140 return 0
101 141  
... ... @@ -141,9 +181,6 @@ class VLengthSerializer1(Serializer):
141 181 self.ACCEPTING_FLAG = 0b10000000
142 182 self.ARRAY_FLAG = 0b01000000
143 183  
144   - def getVersion(self):
145   - return 8
146   -
147 184 def getImplementationCode(self):
148 185 return 1
149 186  
... ... @@ -302,9 +339,6 @@ class VLengthSerializer2(Serializer):
302 339 self.ACCEPTING_FLAG = 64
303 340 self.LAST_FLAG = 32
304 341  
305   - def getVersion(self):
306   - return 8
307   -
308 342 def getImplementationCode(self):
309 343 return 2
310 344  
... ...
fsabuilder/fsa/test/testConstruction.py
... ... @@ -7,6 +7,7 @@ Created on Oct 8, 2013
7 7 import unittest
8 8 import os
9 9 from fsa import fsa, visualizer, encode, buildfsa
  10 +from fsa.serializer import SimpleSerializer
10 11  
11 12 class Test(unittest.TestCase):
12 13  
... ... @@ -59,6 +60,8 @@ class Test(unittest.TestCase):
59 60 inputFile = os.path.join(os.path.dirname(__file__), 'PoliMorfSmall.tab')
60 61 tagsetFile = os.path.join(os.path.dirname(__file__), 'polimorf.tagset')
61 62 fsa = buildfsa.buildFromPoliMorf(inputFile, tagsetFile)
  63 + serializer = SimpleSerializer(fsa)
  64 + serializer.serialize2BinaryFile('/tmp/test0.fsa')
62 65 # visualizer.Visualizer().visualize(fsa)
63 66  
64 67 if __name__ == "__main__":
... ...