'''
Created on Oct 23, 2013

@author: mlenart
'''

import logging
|
|
9
10
11
12
13
14
|
class Encoder(object):
    '''
    Base class for converting between text and its encoded byte form.

    Word-level conversion is implemented here. ``encodeData`` and
    ``decodeData`` are placeholders that raise ``NotImplementedError``;
    subclasses provide the actual data (de)serialization.
    '''

    # Text type of the running interpreter: ``unicode`` on Python 2 and
    # ``str`` on Python 3, so the type check below works on both.
    _TEXT_TYPE = type(u'')

    def __init__(self, encoding='utf8'):
        '''
        Constructor

        :param encoding: name of the codec used by every encode/decode
            method of this object.
        '''
        self.encoding = encoding

    def encodeWord(self, word, lowercase=True):
        '''
        Encode a text word into a ``bytearray`` using ``self.encoding``.

        :param word: text string (``unicode`` on Python 2) to encode.
        :param lowercase: when true, the word is lowercased before encoding.
        :return: ``bytearray`` holding the encoded word.
        :raises AssertionError: if ``word`` is not a text string.
        '''
        assert isinstance(word, self._TEXT_TYPE)
        text = word.lower() if lowercase else word
        return bytearray(text, self.encoding)

    def encodeData(self, data):
        '''
        Placeholder for data serialization.

        :raises NotImplementedError: always, in this base class.
        '''
        raise NotImplementedError()

    def decodeData(self, rawData):
        '''
        Placeholder for data deserialization.

        :raises NotImplementedError: always, in this base class.
        '''
        # The exception has to be raised; returning the instance would hand
        # the caller an exception object as if it were decoded data.
        raise NotImplementedError()

    def decodeWord(self, rawWord):
        '''
        Decode raw bytes into text, dropping NUL bytes from both ends.

        :param rawWord: byte string or ``bytearray`` holding the word.
        :return: the decoded text.
        '''
        # bytes(...).decode(...) is equivalent to unicode(str(...), enc) on
        # Python 2 and additionally works on Python 3.
        return bytes(rawWord).strip(b'\x00').decode(self.encoding)

    def word2SortKey(self, word):
        '''
        Return the sort key of a word: its lowercased form encoded with
        ``self.encoding``.
        '''
        return word.lower().encode(self.encoding)
class SimpleEncoder(Encoder):
    '''
    Encoder whose data is a single text value stored as its encoded bytes
    followed by one terminating zero byte.
    '''

    def __init__(self, encoding='utf8'):
        '''
        Constructor

        :param encoding: codec name, passed on to the base class.
        '''
        super(SimpleEncoder, self).__init__(encoding)

    def encodeData(self, data):
        '''
        Encode ``data`` with ``self.encoding`` and append a zero byte.

        :param data: text to serialize.
        :return: ``bytearray`` with the encoded text plus a trailing 0.
        '''
        return bytearray(data, encoding=self.encoding) + bytearray([0])

    def decodeData(self, rawData):
        '''
        Decode bytes produced by ``encodeData``.

        The last byte (the terminator written by ``encodeData``) is dropped
        before decoding.

        :param rawData: byte string or ``bytearray`` to deserialize.
        :return: the decoded text.
        '''
        # bytes(...).decode(...) matches unicode(str(...), enc) on Python 2
        # and, unlike it, is also valid on Python 3.
        return bytes(rawData)[:-1].decode(self.encoding)
class MorphEncoder(Encoder):
    '''
    Encoder that serializes a set of interpretations into a compact
    ``bytearray``.

    Layout written by ``encodeData``:

    * 1 byte  - number of interpretations (1..255),
    * then, for each interpretation in ``getSortKey()`` order:
        * 1 byte  - ``typenum``,
        * lemma   - see ``_encodeLemma``,
        * 2 bytes - ``tagnum``, high byte first,
        * 1 byte  - ``namenum``.
    '''

    # Marker bytes that start an encoded case pattern.
    LEMMA_ONLY_LOWER = 0
    LEMMA_UPPER_PREFIX = 1
    LEMMA_MIXED_CASE = 2

    def __init__(self, encoding='utf8'):
        '''
        Constructor

        :param encoding: codec name, passed on to the base class.
        '''
        super(MorphEncoder, self).__init__(encoding)

    def encodeData(self, interpsList):
        '''
        Serialize a set of interpretations.

        :param interpsList: non-empty ``frozenset`` of at most 255 objects
            exposing ``getSortKey()``, ``typenum``, ``lemma``, ``tagnum``
            and ``namenum``.
        :return: ``bytearray`` in the layout described on the class.
        :raises AssertionError: if the argument is not a ``frozenset`` or
            its size is outside 1..255.
        '''
        # Validate the container before anything is read from it.
        assert isinstance(interpsList, frozenset)
        interpsNum = len(interpsList)
        assert interpsNum < 256
        assert interpsNum > 0
        res = bytearray([interpsNum])
        # Sorting makes the output independent of set iteration order.
        for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
            res.extend(self._encodeTypeNum(interp.typenum))
            res.extend(self._encodeLemma(interp.lemma))
            res.extend(self._encodeTagNum(interp.tagnum))
            res.extend(self._encodeNameNum(interp.namenum))
        return res

    def _encodeTypeNum(self, typenum):
        '''Return ``typenum`` (0..255) as a one-byte ``bytearray``.'''
        assert 0 <= typenum < 256
        return bytearray([typenum])

    def _encodeLemma(self, lemma):
        '''
        Serialize a lemma description.

        Output: one byte ``cutLength``, the encoded ``suffixToAdd`` (case
        preserved), a zero terminator, then the encoded ``casePattern``.

        :param lemma: object with ``cutLength`` (0..255), ``suffixToAdd``
            and ``casePattern`` attributes.
        :return: ``bytearray`` with the serialized lemma.
        '''
        assert 0 <= lemma.cutLength < 256
        res = bytearray([lemma.cutLength])
        res.extend(self.encodeWord(lemma.suffixToAdd, lowercase=False))
        res.append(0)
        res.extend(self._encodeCasePattern(lemma.casePattern))
        return res

    def _encodeCasePattern(self, casePattern):
        '''
        Serialize a case pattern using the shortest applicable form.

        * no ``True`` entry: ``[LEMMA_ONLY_LOWER]``
        * true entries form a prefix:
          ``[LEMMA_UPPER_PREFIX, prefix length]``
        * otherwise: ``[LEMMA_MIXED_CASE, count, idx0, idx1, ...]`` listing
          the positions of the true entries.

        :param casePattern: sequence of flags - presumably one per lemma
            character, true meaning uppercase (not verifiable from here).
        :return: ``bytearray`` with the serialized pattern.
        '''
        if True not in casePattern:
            return bytearray([self.LEMMA_ONLY_LOWER])
        if self._hasUpperPrefix(casePattern):
            return bytearray([self.LEMMA_UPPER_PREFIX,
                              self._getUpperPrefixLength(casePattern)])
        # Every index has to fit into a single byte.
        assert len(casePattern) < 256
        upperIdxs = [idx for idx, flag in enumerate(casePattern) if flag]
        return bytearray([self.LEMMA_MIXED_CASE, len(upperIdxs)] + upperIdxs)

    @staticmethod
    def _leadingTrueCount(casePattern):
        '''Return how many entries are true before the first false one.'''
        for idx, flag in enumerate(casePattern):
            if not flag:
                return idx
        return len(casePattern)

    def _hasUpperPrefix(self, casePattern):
        '''
        Tell whether all true entries of the pattern form a prefix, i.e.
        nothing after the first false entry is true.
        '''
        # Single pass: only the split right after the leading run of true
        # entries can qualify, so there is no need to try every split point.
        prefixLength = self._leadingTrueCount(casePattern)
        return not any(casePattern[prefixLength:])

    def _getUpperPrefixLength(self, casePattern):
        '''
        Return the length of the leading run of true entries.

        :raises AssertionError: if the true entries do not form a prefix.
        '''
        assert self._hasUpperPrefix(casePattern)
        return self._leadingTrueCount(casePattern)

    def _encodeTagNum(self, tagnum):
        '''Return ``tagnum`` (0..65535) as two bytes, high byte first.'''
        assert 0 <= tagnum < 65536
        return bytearray([(tagnum & 0xFF00) >> 8, tagnum & 0x00FF])

    def _encodeNameNum(self, namenum):
        '''Return ``namenum`` (0..255) as a one-byte ``bytearray``.'''
        assert 0 <= namenum < 256
        return bytearray([namenum])
|