diff --git a/fsabuilder/fsa/buildfsa.py b/fsabuilder/fsa/buildfsa.py index 0b32ccb..c51c3ee 100644 --- a/fsabuilder/fsa/buildfsa.py +++ b/fsabuilder/fsa/buildfsa.py @@ -16,9 +16,6 @@ from serializer import VLengthSerializer1, VLengthSerializer2, SimpleSerializer from visualizer import Visualizer from optparse import OptionParser -from pycallgraph import PyCallGraph -from pycallgraph.output import GraphvizOutput - class OutputFormat(): BINARY = 'BINARY' CPP = 'CPP' @@ -131,8 +128,7 @@ def _parseOptions(): # exit(1) return opts -def _readPolimorfInput(inputFile, tagsetFile, encoder): - tagset = common.Tagset(tagsetFile) +def _readPolimorfInput(inputFile, tagset, encoder): with codecs.open(inputFile, 'r', 'utf8') as f: for entry in convertinput.convertPolimorf(f, tagset, encoder): yield entry @@ -167,8 +163,9 @@ def _printStats(fsa): def buildFromPoliMorf(inputFile, tagsetFile): encoder = encode.MorphEncoder() - fsa = FSA(encoder) - inputData = _readPolimorfInput(inputFile, tagsetFile, encoder) + tagset = common.Tagset(tagsetFile) + fsa = FSA(encoder, tagset) + inputData = _readPolimorfInput(inputFile, tagset, encoder) fsa.feed(inputData) _printStats(fsa) return fsa @@ -230,6 +227,8 @@ def main(opts): if __name__ == '__main__': opts = _parseOptions() if opts.profile: + from pycallgraph import PyCallGraph + from pycallgraph.output import GraphvizOutput with PyCallGraph(output=GraphvizOutput()): main(opts) else: diff --git a/fsabuilder/fsa/common.py b/fsabuilder/fsa/common.py index 46717e9..50fe997 100644 --- a/fsabuilder/fsa/common.py +++ b/fsabuilder/fsa/common.py @@ -23,15 +23,15 @@ class Interpretation(object): root += o else: break - cutLength = len(orth) - len(root) + cutLength = len(encoder.encodeWord(orth)) - len(encoder.encodeWord(root)) self.lemma = Lemma( cutLength=cutLength, - suffixToAdd=base[len(root):]) + suffixToAdd=encoder.encodeWord(base[len(root):], lowercase=False)) self.tagnum = tagnum self.namenum = namenum def getSortKey(self): - return 
(self.lemma.cutLength, self.lemma.suffixToAdd, self.tagnum, self.namenum) + return (self.lemma.cutLength, tuple(self.lemma.suffixToAdd), self.tagnum, self.namenum) def __eq__(self, other): if isinstance(other, Interpretation): diff --git a/fsabuilder/fsa/encode.py b/fsabuilder/fsa/encode.py index 7e15fd5..059ee4d 100644 --- a/fsabuilder/fsa/encode.py +++ b/fsabuilder/fsa/encode.py @@ -38,8 +38,8 @@ class Encoder(object): class SimpleEncoder(Encoder): - def __init__(self, encoding='utf8', appendZero=False): - super(SimpleEncoder, self).__init__(encoding, appendZero) + def __init__(self, encoding='utf8'): + super(SimpleEncoder, self).__init__(encoding) def encodeData(self, data): return bytearray(data, encoding=self.encoding) + bytearray([0]) @@ -70,7 +70,7 @@ class MorphEncoder(Encoder): res = bytearray() assert lemma.cutLength < 256 and lemma.cutLength >= 0 res.append(lemma.cutLength) - res.extend(self.encodeWord(lemma.suffixToAdd, lowercase=False)) + res.extend(lemma.suffixToAdd) res.append(0) return res diff --git a/fsabuilder/fsa/fsa.py b/fsabuilder/fsa/fsa.py index 6bb417f..3305bfa 100644 --- a/fsabuilder/fsa/fsa.py +++ b/fsabuilder/fsa/fsa.py @@ -14,11 +14,12 @@ class FSA(object): ''' - def __init__(self, encoder): + def __init__(self, encoder, tagset=None): self.encodeWord = encoder.encodeWord self.encodeData = encoder.encodeData self.decodeData = encoder.decodeData self.encodedPrevWord = None + self.tagset = tagset self.initialState = state.State() self.register = register.Register() self.label2Freq = {} diff --git a/fsabuilder/fsa/serializer.py b/fsabuilder/fsa/serializer.py index 471d363..c7a3154 100644 --- a/fsabuilder/fsa/serializer.py +++ b/fsabuilder/fsa/serializer.py @@ -18,6 +18,9 @@ class Serializer(object): def fsa(self): return self._fsa + def getVersion(self): + return 9 + def serialize2CppFile(self, fname): res = [] # self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) @@ -39,14 +42,48 @@ class Serializer(object): def 
fsa2bytearray(self): res = bytearray() - res.extend(self.serializePrologue()) + res.extend(self.serializePrologue(self.serializeTagset(self.fsa.tagset))) self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) logging.debug('SERIALIZE') for state in sorted(self.fsa.dfs(), key=lambda s: s.offset): res.extend(self.state2bytearray(state)) return res - def serializePrologue(self): + def serializeTags(self, tagsMap): + res = bytearray() + numOfTags = len(tagsMap) + res.extend(self.htons(numOfTags)) + for tag, tagnum in sorted(tagsMap.iteritems(), key=lambda (tag, tagnum): tagnum): + res.extend(self.htons(tagnum)) + res.extend(self.fsa.encodeWord(tag)) + res.append(0) + return res + + def serializeTagset(self, tagset): + res = bytearray() + if tagset: + res.extend(self.serializeTags(tagset.tag2tagnum)) + res.extend(self.serializeTags(tagset.name2namenum)) + return res + + def htons(self, n): + assert n < 65536 + assert n >= 0 + res = bytearray() + res.append((n & 0x00FF00) >> 8) + res.append(n & 0x0000FF) + return res + + def htonl(self, n): + assert n >= 0 + res = bytearray() + res.append((n & 0xFF000000) >> 24) + res.append((n & 0x00FF0000) >> 16) + res.append((n & 0x0000FF00) >> 8) + res.append(n & 0x000000FF) + return res + + def serializePrologue(self, additionalData=None): res = bytearray() # serialize magic number in big-endian order @@ -61,6 +98,15 @@ class Serializer(object): # serialize implementation code res.append(self.getImplementationCode()) + # serialize additional data size in 4-byte big-endian + additionalDataSize = len(additionalData) if additionalData else 0 + res.extend(self.htonl(additionalDataSize)) + + # add additional data itself + if additionalDataSize: + assert type(additionalData) == bytearray + res.extend(additionalData) + return res def state2bytearray(self, state): @@ -81,9 +127,6 @@ class Serializer(object): def transitionsData2bytearray(self, state): raise NotImplementedError('Not implemented') - def 
getVersion(self): - raise NotImplementedError('Not implemented') - def getImplementationCode(self): raise NotImplementedError('Not implemented') @@ -93,9 +136,6 @@ class SimpleSerializer(Serializer): super(SimpleSerializer, self).__init__(fsa) self.ACCEPTING_FLAG = 128 - def getVersion(self): - return 8 - def getImplementationCode(self): return 0 @@ -141,9 +181,6 @@ class VLengthSerializer1(Serializer): self.ACCEPTING_FLAG = 0b10000000 self.ARRAY_FLAG = 0b01000000 - def getVersion(self): - return 8 - def getImplementationCode(self): return 1 @@ -302,9 +339,6 @@ class VLengthSerializer2(Serializer): self.ACCEPTING_FLAG = 64 self.LAST_FLAG = 32 - def getVersion(self): - return 8 - def getImplementationCode(self): return 2 diff --git a/fsabuilder/fsa/test/testConstruction.py b/fsabuilder/fsa/test/testConstruction.py index 08f312d..67dcb20 100644 --- a/fsabuilder/fsa/test/testConstruction.py +++ b/fsabuilder/fsa/test/testConstruction.py @@ -7,6 +7,7 @@ Created on Oct 8, 2013 import unittest import os from fsa import fsa, visualizer, encode, buildfsa +from fsa.serializer import SimpleSerializer class Test(unittest.TestCase): @@ -59,6 +60,8 @@ class Test(unittest.TestCase): inputFile = os.path.join(os.path.dirname(__file__), 'PoliMorfSmall.tab') tagsetFile = os.path.join(os.path.dirname(__file__), 'polimorf.tagset') fsa = buildfsa.buildFromPoliMorf(inputFile, tagsetFile) + serializer = SimpleSerializer(fsa) + serializer.serialize2BinaryFile('/tmp/test0.fsa') # visualizer.Visualizer().visualize(fsa) if __name__ == "__main__":