# fsabuilder/fsa/encode.py
'''
Created on Oct 23, 2013

@author: mlenart
'''

import logging
Michał Lenart authored
9
10
11
12
13
14
class Encoder(object):
    '''
    Base class for converting between unicode text and raw bytes.

    Word-level conversion (encodeWord, decodeWord, word2SortKey) is
    implemented here. The data payload format is left to subclasses, which
    are expected to override encodeData and decodeData.
    '''

    def __init__(self, encoding='utf8'):
        '''
        @param encoding: name of the codec used for every text <-> bytes
                         conversion performed by this encoder
        '''
        self.encoding = encoding

    def encodeWord(self, word, lowercase=True):
        '''
        Encode a unicode word with self.encoding.

        @param word: unicode string to encode
        @param lowercase: when True (the default) the word is lowercased
                          before encoding; when False its case is preserved
        @return: bytearray holding the encoded word
        '''
        assert isinstance(word, unicode)
        text = word.lower() if lowercase else word
        return bytearray(text, self.encoding)

    def encodeData(self, data):
        '''
        Encode a data payload. Not implemented in the base class.

        @raise NotImplementedError: always
        '''
        raise NotImplementedError()

    def decodeData(self, rawData):
        '''
        Decode a data payload. Not implemented in the base class.

        @raise NotImplementedError: always
        '''
        # The exception must be raised, not returned: a returned exception
        # instance would look like successfully decoded data to the caller.
        raise NotImplementedError()

    def decodeWord(self, rawWord):
        '''
        Decode raw bytes back into a unicode word.

        @param rawWord: encoded word; leading and trailing zero bytes are
                        stripped before decoding
        @return: unicode string decoded with self.encoding
        '''
        return unicode(str(rawWord).strip('\x00'), self.encoding)

    def word2SortKey(self, word):
        '''
        @param word: text to build the key from
        @return: the lowercased word encoded with self.encoding
        '''
        return word.lower().encode(self.encoding)

class SimpleEncoder(Encoder):
    '''
    Encoder whose data payload is one text string followed by a single
    zero byte.
    '''

    def __init__(self, encoding='utf8'):
        '''
        @param encoding: codec name, passed on to the base Encoder
        '''
        super(SimpleEncoder, self).__init__(encoding)

    def encodeData(self, data):
        '''
        @param data: text to encode with self.encoding
        @return: bytearray with the encoded text plus one trailing zero byte
        '''
        encoded = bytearray(data, self.encoding)
        encoded.append(0)
        return encoded

    def decodeData(self, rawData):
        '''
        @param rawData: bytes in the form produced by encodeData
        @return: unicode text decoded from everything except the last byte
        '''
        payload = str(rawData)[:-1]
        return unicode(payload, self.encoding)

class MorphEncoder(Encoder):
    '''
    Encoder that serializes a set of interpretations into one byte record.

    Layout produced by encodeData:
        1 byte    number of interpretations (1..255)
        then, for each interpretation in getSortKey() order:
            1 byte    typenum
            lemma     see _encodeLemma
            2 bytes   tagnum, most significant byte first
            1 byte    namenum
    '''

    def __init__(self, encoding='utf8'):
        '''
        @param encoding: codec name, passed on to the base Encoder
        '''
        super(MorphEncoder, self).__init__(encoding)
        # Marker values written as the first byte of an encoded case pattern.
        self.LEMMA_ONLY_LOWER = 0
        self.LEMMA_UPPER_PREFIX = 1
        self.LEMMA_MIXED_CASE = 2

    def encodeData(self, interpsList):
        '''
        Serialize all interpretations into a single bytearray.

        @param interpsList: non-empty frozenset of fewer than 256 objects
                            exposing getSortKey(), typenum, lemma, tagnum
                            and namenum
        @return: bytearray in the layout described in the class docstring
        '''
        # Check the type before relying on len() of the argument.
        assert isinstance(interpsList, frozenset)
        interpsCount = len(interpsList)
        assert 0 < interpsCount < 256
        res = bytearray([interpsCount])
        # Sorting makes the output independent of set iteration order.
        for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
            res.extend(self._encodeTypeNum(interp.typenum))
            res.extend(self._encodeLemma(interp.lemma))
            res.extend(self._encodeTagNum(interp.tagnum))
            res.extend(self._encodeNameNum(interp.namenum))
        return res

    def _encodeTypeNum(self, typenum):
        '''
        @param typenum: integer in 0..255
        @return: one-byte bytearray holding typenum
        '''
        assert 0 <= typenum < 256
        return bytearray([typenum])

    def _encodeLemma(self, lemma):
        '''
        Encode a lemma as: one byte lemma.cutLength, lemma.suffixToAdd
        encoded with its case preserved, one zero byte, then the encoded
        lemma.casePattern.

        @param lemma: object exposing cutLength (0..255), suffixToAdd
                      (unicode) and casePattern
        @return: bytearray with the encoded lemma
        '''
        assert 0 <= lemma.cutLength < 256
        res = bytearray([lemma.cutLength])
        res.extend(self.encodeWord(lemma.suffixToAdd, lowercase=False))
        res.append(0)  # zero byte written right after the suffix
        res.extend(self._encodeCasePattern(lemma.casePattern))
        return res

    def _encodeCasePattern(self, casePattern):
        '''
        Encode a case pattern in the shortest of three forms:
            LEMMA_ONLY_LOWER                      - pattern contains no True
            LEMMA_UPPER_PREFIX, n                 - first n entries are set,
                                                    the rest are not
            LEMMA_MIXED_CASE, count, idx, idx...  - positions of set entries

        @param casePattern: sequence of booleans (presumably one flag per
                            lemma character, True meaning uppercase -
                            confirm against the caller)
        @return: bytearray with the encoded pattern
        '''
        res = bytearray()
        if True not in casePattern:
            res.append(self.LEMMA_ONLY_LOWER)
        elif self._hasUpperPrefix(casePattern):
            prefixLength = self._getUpperPrefixLength(casePattern)
            # The length is stored in one byte, like every other field here.
            assert prefixLength < 256
            res.append(self.LEMMA_UPPER_PREFIX)
            res.append(prefixLength)
        else:
            # Both the count and every index must fit in a single byte.
            assert len(casePattern) < 256
            upperIdxs = [idx for idx, flag in enumerate(casePattern) if flag]
            res.append(self.LEMMA_MIXED_CASE)
            res.append(len(upperIdxs))
            res.extend(upperIdxs)
        return res

    def _countLeadingSet(self, casePattern):
        '''Return the index of the first falsy entry, or len(casePattern).'''
        for idx, flag in enumerate(casePattern):
            if not flag:
                return idx
        return len(casePattern)

    def _hasUpperPrefix(self, casePattern):
        '''
        @return: True when the pattern is a (possibly empty) run of set
                 entries followed only by unset ones, False otherwise
        '''
        # Only the end of the leading run can be a valid split point, so a
        # single scan is enough.
        return not any(casePattern[self._countLeadingSet(casePattern):])

    def _getUpperPrefixLength(self, casePattern):
        '''
        @param casePattern: pattern for which _hasUpperPrefix is True
        @return: number of leading set entries
        '''
        assert self._hasUpperPrefix(casePattern)
        return self._countLeadingSet(casePattern)

    def _encodeTagNum(self, tagnum):
        '''
        @param tagnum: integer in 0..65535
        @return: two-byte bytearray, most significant byte first
        '''
        assert 0 <= tagnum < 65536
        return bytearray([(tagnum >> 8) & 0xFF, tagnum & 0xFF])

    def _encodeNameNum(self, namenum):
        '''
        @param namenum: integer in 0..255
        @return: one-byte bytearray holding namenum
        '''
        assert 0 <= namenum < 256
        return bytearray([namenum])