Commit 95cbe5ea03398610d704f1ad1d51cc981f0aafea

Authored by Marcin Woliński
1 parent a5484089

morfeusz_builder → Python 3

CMakeLists.txt
@@ -4,7 +4,7 @@ project (Morfeusz)
 
 set (Morfeusz_VERSION_MAJOR 1)
 set (Morfeusz_VERSION_MINOR 9)
-set (Morfeusz_VERSION_PATCH 15)
+set (Morfeusz_VERSION_PATCH 16)
 set (Morfeusz_VERSION "${Morfeusz_VERSION_MAJOR}.${Morfeusz_VERSION_MINOR}.${Morfeusz_VERSION_PATCH}")
 set (Morfeusz_LIB_VERSION "${Morfeusz_VERSION}")
 if (BUILT_ON)
fsabuilder/buildanalyzer.sh
-#!/bin/bash
+#! /bin/bash
 
-python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V1 --trim-supneg -o $1
+python3 morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V1 --trim-supneg -o $1
fsabuilder/buildgenerator.sh
-#!/bin/bash
+#! /bin/bash
 
-python morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \
+python3 morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \
     --tagset-file=../input/sgjp-morfeusz.tagset \
     --segments-file=../input/segmenty.dat \
     --generator \
fsabuilder/morfeusz_builder 100644 → 100755
-#!/usr/bin/python
+#! /usr/bin/python3
 # -*- coding:utf-8 -*-
 '''
 Created on 21 paź 2013
@@ -20,13 +20,13 @@ from optparse import OptionParser
 
 def _checkOption(opt, parser, msg):
     if opt is None:
-        print >> sys.stderr, msg
+        print(msg, file=sys.stderr)
         parser.print_help()
         exit(1)
 
 def _checkCondition(cond, parser, msg):
     if not cond:
-        print >> sys.stderr, msg
+        print(msg, file=sys.stderr)
         parser.print_help()
         exit(1)
 
@@ -40,7 +40,7 @@ def _checkOpen(filename, mode):
         if 'w' in mode:
             os.remove(filename)
     except IOError as ex:
-        print >> sys.stderr, str(ex)
+        print(str(ex), file=sys.stderr)
         exit(1)
 
 def _getDictFilename(opts, isGenerator):
@@ -162,7 +162,7 @@ def _parseOptions():
         _checkOpen(_getDictFilename(opts, isGenerator=True), 'w')
 
     if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1]:
-        print >> sys.stderr, '--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1])+')'
+        print('--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1])+')', file=sys.stderr)
         parser.print_help()
         exit(1)
 
@@ -183,34 +183,34 @@ def _readDictIdAndCopyright(inputFiles):
         with codecs.open(inputFile, 'r', 'utf8') as f:
             inCopyright = False
             for linenum, line in enumerate(f, start=1):
-                if dictId is None and line.startswith(u'#!DICT-ID'):
-                    dictIdTag, _, dictId = line.strip().partition(u' ')
+                if dictId is None and line.startswith('#!DICT-ID'):
+                    dictIdTag, _, dictId = line.strip().partition(' ')
                     exceptions.validate(
-                        dictIdTag == u'#!DICT-ID',
-                        u'Dictionary ID tag must be followed by a space character and dictionary ID string')
+                        dictIdTag == '#!DICT-ID',
+                        'Dictionary ID tag must be followed by a space character and dictionary ID string')
                     exceptions.validate(
-                        len(line.split(u' ')) > 1,
-                        u'%s:%d: Must provide DICT-ID' % (inputFile, linenum))
+                        len(line.split(' ')) > 1,
+                        '%s:%d: Must provide DICT-ID' % (inputFile, linenum))
                     exceptions.validate(
-                        len(line.split(u' ')) == 2,
-                        u'%s:%d: DICT-ID must not contain spaces' % (inputFile, linenum))
-                elif copyright is None and line.startswith(u'#<COPYRIGHT>'):
+                        len(line.split(' ')) == 2,
+                        '%s:%d: DICT-ID must not contain spaces' % (inputFile, linenum))
+                elif copyright is None and line.startswith('#<COPYRIGHT>'):
                     exceptions.validate(
-                        line.strip() == u'#<COPYRIGHT>',
-                        u'%s:%d: Copyright start tag must be the only one in the line' % (inputFile, linenum))
+                        line.strip() == '#<COPYRIGHT>',
+                        '%s:%d: Copyright start tag must be the only one in the line' % (inputFile, linenum))
 
                     inCopyright = True
-                    copyright = u''
+                    copyright = ''
 
-                elif line.startswith(u'#</COPYRIGHT>'):
+                elif line.startswith('#</COPYRIGHT>'):
 
                     exceptions.validate(
                         inCopyright,
-                        u'%s:%d: Copyright end tag must be preceded by copyright start tag' % (inputFile, linenum))
+                        '%s:%d: Copyright end tag must be preceded by copyright start tag' % (inputFile, linenum))
 
                     exceptions.validate(
-                        line.strip() == u'#</COPYRIGHT>',
-                        u'%s:%d: Copyright end tag must be the only one in the line' % (inputFile, linenum))
+                        line.strip() == '#</COPYRIGHT>',
+                        '%s:%d: Copyright end tag must be the only one in the line' % (inputFile, linenum))
 
                     inCopyright = False
 
@@ -219,21 +219,21 @@ def _readDictIdAndCopyright(inputFiles):
                     copyright += line
 
     if dictId is None:
-        logging.warn(u'No dictionary ID tag found')
-        dictId = u''
+        logging.warn('No dictionary ID tag found')
+        dictId = ''
 
     if copyright is None:
-        logging.warn(u'No copyright info found')
-        copyright = u''
+        logging.warn('No copyright info found')
+        copyright = ''
 
     return (dictId, copyright)
 
 def _readNamesAndQualifiers(inputFiles):
-    names = set([u''])
+    names = set([''])
     qualifiers = set([frozenset()])
     lineParser = convertinput.LineParser()
     for line in _concatFiles(inputFiles):
-        line = line.strip().decode('utf8')
+        line = line.strip()
         if not lineParser.ignoreLine(line):
             _, _, _, name, qualifier = lineParser.parseLine(line)
             names.add(name)
@@ -242,7 +242,7 @@ def _readNamesAndQualifiers(inputFiles):
     qualifiersMap = dict([(quals, idx) for idx, quals in enumerate(sorted(qualifiers, key=lambda q: tuple(sorted(q))))])
     exceptions.validate(
         len(qualifiersMap) <= limits.MAX_QUALIFIERS_COMBINATIONS,
-        u'Too many qualifiers combinations. The limit is %d' % limits.MAX_QUALIFIERS_COMBINATIONS)
+        'Too many qualifiers combinations. The limit is %d' % limits.MAX_QUALIFIERS_COMBINATIONS)
 
     return namesMap, qualifiersMap
 
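Note on the pattern above: Python 2's `print >> sys.stderr, msg` statement becomes the `print()` function with a `file=` keyword in Python 3. A minimal sketch of the new idiom (the message text is a placeholder):

    import sys

    # Python 2 (now a syntax error): print >> sys.stderr, msg
    # Python 3: print is a function; the target stream is a keyword argument.
    print('error: something went wrong', file=sys.stderr)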
fsabuilder/morfeuszbuilder/fsa/common.py
@@ -10,9 +10,9 @@ import logging
 class EncodedFormWithoutPrefix(object):
 
     def __init__(self, fromWord, targetWord, lowercase):
-        assert type(fromWord) == unicode
-        assert type(targetWord) == unicode
-        root = u''
+        assert type(fromWord) == str
+        assert type(targetWord) == str
+        root = ''
         for o, b in zip(fromWord, targetWord):
             if ((o.lower() == b.lower()) if lowercase else o == b):
                 root += b
@@ -26,8 +26,8 @@ class EncodedFormWithoutPrefix(object):
 class EncodedForm4Generator(object):
 
     def __init__(self, fromWord, targetWord):
-        assert type(fromWord) == unicode
-        assert type(targetWord) == unicode
+        assert type(fromWord) == str
+        assert type(targetWord) == str
         bestEncodedForm = None
         bestPrefixLength = -1
         for prefixLength in range(min(len(targetWord), 5)):
@@ -45,8 +45,8 @@ class EncodedForm4Generator(object):
 class EncodedForm4Analyzer(object):
 
     def __init__(self, fromWord, targetWord):
-        assert type(fromWord) == unicode
-        assert type(targetWord) == unicode
+        assert type(fromWord) == str
+        assert type(targetWord) == str
         bestEncodedForm = None
         bestPrefixCutLength = -1
         for prefixCutLength in range(min(len(fromWord), 5)):
@@ -123,7 +123,7 @@ class Interpretation4Generator(object):
         return hash(self.getSortKey())
 
     def __unicode__(self):
-        return u'<%s,(%d %s),%d,%d>' % (self.lemma.decode('utf8'), self.encodedForm.cutLength, self.encodedForm.suffixToAdd.decode('utf8'), self.tagnum, self.namenum)
+        return '<%s,(%d %s),%d,%d>' % (self.lemma, self.encodedForm.cutLength, self.encodedForm.suffixToAdd.decode('utf8'), self.tagnum, self.namenum)
 
     def __repr__(self):
-        return unicode(self)
+        return str(self)
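Python 3 merges `unicode` into `str`, hence the changed type assertions. One thing the commit leaves untouched: `__unicode__` is not a special method in Python 3, so `str(self)` in `__repr__` will not dispatch to it. A hypothetical follow-up (not part of this commit, and simplified by assuming `suffixToAdd` is already a `str`) would rename it:

    # sketch only - rename __unicode__ to __str__ so that str(self) finds it
    def __str__(self):
        return '<%s,(%d %s),%d,%d>' % (self.lemma, self.encodedForm.cutLength,
                                       self.encodedForm.suffixToAdd,
                                       self.tagnum, self.namenum)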
fsabuilder/morfeuszbuilder/fsa/convertinput.py
@@ -4,7 +4,7 @@ Created on Oct 23, 2013
 @author: mlenart
 '''
 import logging
-from common import Interpretation4Analyzer
+from .common import Interpretation4Analyzer
 from morfeuszbuilder.fsa.common import Interpretation4Generator
 #from morfeuszbuilder.fsa import externalsort
 
@@ -36,24 +36,24 @@ class LineParser(object):
     def ignoreLine(self, line):
         if not line:
             return True
-        elif line.strip() == u'#<COPYRIGHT>':
+        elif line.strip() == '#<COPYRIGHT>':
             self.inCopyright = True
             return True
-        elif line.strip() == u'#</COPYRIGHT>':
+        elif line.strip() == '#</COPYRIGHT>':
             self.inCopyright = False
             return True
         elif self.inCopyright:
             return True
         elif line and not ' ' in ''.join(line.split('\t')[:2]):
             return False
-        elif line.startswith(u'#!DICT-ID'):
+        elif line.startswith('#!DICT-ID'):
             return True
         else:
-            logging.warn(u'Ignoring line: "%s" - contains space in text form or lemma' % (line.strip()))
+            logging.warn('Ignoring line: "%s" - contains space in text form or lemma' % (line.strip()))
             return True
 
     def parseLine(self, line):
-        splitLine = line.strip().split(u'\t')
+        splitLine = line.strip().split('\t')
         if len(splitLine) == 5:
             orth, base, tag, name, qualifier = splitLine
         elif len(splitLine) == 4:
@@ -69,7 +69,7 @@ class LineParser(object):
 
 def parseQualifiers(string):
     if string:
-        return frozenset(string.split(u'|'))
+        return frozenset(string.split('|'))
     else:
         return frozenset()
 
@@ -87,7 +87,7 @@ class PolimorfConverter4Analyzer(object):
     def _partiallyParseLines(self, inputLines):
         lineParser = LineParser()
         for line in inputLines:
-            line = line.decode(self.inputEncoding).strip('\n')
+            line = line.strip('\n')
             if not lineParser.ignoreLine(line):
                 orth, base, tag, name, qualifier = lineParser.parseLine(line)
 
@@ -106,8 +106,8 @@ class PolimorfConverter4Analyzer(object):
                     base = orth
 
                 yield '\t'.join((
-                    orth.encode(self.inputEncoding),
-                    base.encode(self.inputEncoding),
+                    orth,
+                    base,
                     str(tagnum),
                     str(namenum),
                     str(typenum),
@@ -118,8 +118,8 @@ class PolimorfConverter4Analyzer(object):
                     base = orth
                     typenum = self.segmentRulesManager.shiftOrthMagic.getNewSegnum4ShiftOrth(typenum)
                     yield '\t'.join((
-                        orth.encode(self.inputEncoding),
-                        base.encode(self.inputEncoding),
+                        orth,
+                        base,
                         str(tagnum),
                         str(namenum),
                         str(typenum),
@@ -127,14 +127,14 @@ class PolimorfConverter4Analyzer(object):
 
     # input lines are encoded and partially parsed
     def _sortLines(self, inputLines):
-        return sorted(inputLines, key=lambda line: self.encoder.word2SortKey(line.split('\t')[0].decode('utf8')))
+        return sorted(inputLines, key=lambda line: self.encoder.word2SortKey(line.split('\t')[0]))
 #        return sorted(inputLines, key=lambda line: self.encoder.word2SortKey(line.split('\t')[0].decode('utf8')))
 
     def _reallyParseLines(self, inputLines):
         for line in inputLines:
-            line = line.decode(self.inputEncoding).strip(u'\n')
+            line = line.strip('\n')
             if line:
-                orth, base, tagnum, namenum, typenum, qualsnum = line.split(u'\t')
+                orth, base, tagnum, namenum, typenum, qualsnum = line.split('\t')
                 tagnum = int(tagnum)
                 namenum = int(namenum)
                 typenum = int(typenum)
@@ -159,14 +159,14 @@ class PolimorfConverter4Generator(object):
     def _partiallyParseLines(self, inputLines):
         lineParser = LineParser()
         for line in inputLines:
-            line = line.decode(self.inputEncoding).strip('\n')
+            line = line.strip('\n')
             if not lineParser.ignoreLine(line):
                 orth, base, tag, name, qualifier = lineParser.parseLine(line)
                 if base:
-                    homonymId = u''
-                    if u':' in base:
-                        assumedBase, assumedHomonymId = base.split(u':', 1)
-                        if assumedBase != u'' and assumedHomonymId != u'' and assumedHomonymId.isalnum():
+                    homonymId = ''
+                    if ':' in base:
+                        assumedBase, assumedHomonymId = base.split(':', 1)
+                        if assumedBase != '' and assumedHomonymId != '' and assumedHomonymId.isalnum():
                             base, homonymId = assumedBase, assumedHomonymId
                     tagnum = self.tagset.getTagnum4Tag(tag)
                     namenum = self.namesMap[name]
@@ -179,39 +179,39 @@ class PolimorfConverter4Generator(object):
                         base = orth
 
                     yield '\t'.join((
-                        orth.encode(self.inputEncoding),
-                        base.encode(self.inputEncoding),
+                        orth,
+                        base,
                         str(tagnum),
                         str(namenum),
                         str(typenum),
-                        homonymId.encode(self.inputEncoding),
+                        homonymId,
                         str(qualsnum)))
 
                     if self.segmentRulesManager.shiftOrthMagic.getNewSegnum4ShiftOrth(typenum) != None:
                         base = orth
                         typenum = self.segmentRulesManager.shiftOrthMagic.getNewSegnum4ShiftOrth(typenum)
                         yield '\t'.join((
-                            orth.encode(self.inputEncoding),
-                            base.encode(self.inputEncoding),
+                            orth,
+                            base,
                             str(tagnum),
                             str(namenum),
                             str(typenum),
-                            homonymId.encode(self.inputEncoding),
+                            homonymId,
                             str(qualsnum)))
                 else:
                     logging.warn('Ignoring line: "%s" - contains empty lemma', line.strip())
 
     # input lines are encoded and partially parsed
     def _sortLines(self, inputLines):
-        return sorted(inputLines, key=lambda line: (self.encoder.word2SortKey(line.split('\t')[1].decode('utf8')), line))
+        return sorted(inputLines, key=lambda line: (self.encoder.word2SortKey(line.split('\t')[1]), line))
 
     def _reallyParseLines(self, inputLines):
         prevLine = None
         for line in inputLines:
-            line = line.decode(self.inputEncoding).strip(u'\n')
+            line = line.strip('\n')
             if line and line != prevLine:
-                orth, base, tagnum, namenum, typenum, homonymId, qualsnum = line.split(u'\t')
-#                print orth.encode('utf8'), base.encode('utf8'), homonymId
+                orth, base, tagnum, namenum, typenum, homonymId, qualsnum = line.split('\t')
+#                print orth, base, homonymId
                 tagnum = int(tagnum)
                 namenum = int(namenum)
                 typenum = int(typenum)
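The removed `.decode(self.inputEncoding)` / `.encode(self.inputEncoding)` calls rely on Python 3's text-mode I/O: if the input files are opened as text with an explicit encoding, every line already arrives as `str`. A minimal sketch of that assumption (file name taken from the build scripts above):

    # Python 3: text mode decodes on read, so parsing code only sees str.
    with open('../input/PoliMorfSmall.tab', encoding='utf8') as f:
        for line in f:
            line = line.strip('\n')    # no .decode() needed
            fields = line.split('\t')  # tab-separated dictionary entry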
fsabuilder/morfeuszbuilder/fsa/encode.py
@@ -24,7 +24,7 @@ class Encoder(object):
         #~ self.qualifiersMap = { frozenset(): 0}
 
     def encodeWord(self, word, lowercase=True):
-        assert type(word) == unicode
+        assert type(word) == str
         res = bytearray(word.lower() if self.lowercase and lowercase else word, self.encoding)
         return res
 
@@ -35,16 +35,16 @@ class Encoder(object):
         return NotImplementedError()
 
     def decodeWord(self, rawWord):
-        return unicode(str(rawWord).strip('\x00'), self.encoding)
+        return str(str(rawWord).strip('\x00'), self.encoding)
 
     def word2SortKey(self, word):
         normalizedWord = word.lower() if self.lowercase else word
-        return normalizedWord.encode(self.encoding)
+        return normalizedWord
 
     def _encodeTypeNum(self, typenum):
         exceptions.validate(
             typenum <= limits.MAX_SEGMENT_TYPES,
-            u'Too many segment types. The limit is %d' % limits.MAX_SEGMENT_TYPES)
+            'Too many segment types. The limit is %d' % limits.MAX_SEGMENT_TYPES)
         return bytearray([typenum])
 
     def _hasUpperPrefix(self, casePattern):
@@ -62,13 +62,13 @@ class Encoder(object):
 
     def _encodeTagNum(self, tagnum):
         res = bytearray()
-        exceptions.validate(tagnum <= limits.MAX_TAGS, u'Too many tags. The limit is %d' % limits.MAX_TAGS)
+        exceptions.validate(tagnum <= limits.MAX_TAGS, 'Too many tags. The limit is %d' % limits.MAX_TAGS)
         res.append((tagnum & 0xFF00) >> 8)
         res.append(tagnum & 0x00FF)
         return res
 
     def _encodeNameNum(self, namenum):
-        exceptions.validate(namenum <= limits.MAX_NAMES, u'Too many named entity types. The limit is %d' % limits.MAX_NAMES)
+        exceptions.validate(namenum <= limits.MAX_NAMES, 'Too many named entity types. The limit is %d' % limits.MAX_NAMES)
         return bytearray([namenum])
 
     def _groupInterpsByType(self, interpsList):
@@ -86,7 +86,7 @@ class Encoder(object):
 
         res = bytearray()
 
-        for typenum, interpsList in segnum2Interps.iteritems():
+        for typenum, interpsList in list(segnum2Interps.items()):
             res.extend(self._encodeInterps4Type(typenum, interpsList))
         del interpsList
 
@@ -135,10 +135,10 @@ class MorphEncoder(Encoder):
         return res
 
     def _casePatternsHaveOnlyLowercase(self, casePatterns):
-        return not any(map(lambda cp: cp and True in cp, casePatterns))
+        return not any([cp and True in cp for cp in casePatterns])
 
     def _casePatternsAreOnlyTitles(self, casePatterns):
-        return all(map(lambda cp: cp and cp[0] == True and not True in cp[1:], casePatterns))
+        return all([cp and cp[0] == True and not True in cp[1:] for cp in casePatterns])
 
     def _casePatternsAreEncodedInCompressByte(self, casePatterns):
         return self._casePatternsHaveOnlyLowercase(casePatterns) or self._casePatternsAreOnlyTitles(casePatterns)
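`dict.iteritems()` no longer exists in Python 3; `items()` returns a view. The `list(...)` wrapper that 2to3 inserts is only strictly needed when the dictionary is mutated during iteration; for read-only loops like the one above, iterating the view directly would also work:

    segnum2Interps = {1: ['interp-a'], 2: ['interp-b']}  # toy data

    # 2to3's conservative rewrite: materialize the items first.
    for typenum, interpsList in list(segnum2Interps.items()):
        pass

    # Equivalent here, since the dict is not modified in the loop body:
    for typenum, interpsList in segnum2Interps.items():
        pass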
fsabuilder/morfeuszbuilder/fsa/fsa.py
@@ -4,8 +4,8 @@ Created on Oct 8, 2013
 @author: mlenart
 '''
 
-import state
-import register
+from . import state
+from . import register
 import logging
 from morfeuszbuilder.utils import exceptions
 
@@ -35,7 +35,7 @@ class FSA(object):
         assert not self.closed
         assert data is not None
         encodedWord = self.encodeWord(word)
-        assert encodedWord > self.encodedPrevWord
+        assert self.encodedPrevWord is None or encodedWord > self.encodedPrevWord
         self._addSorted(encodedWord, self.encodeData(data))
         self.encodedPrevWord = encodedWord
 
@@ -43,7 +43,7 @@ class FSA(object):
 
         # debug
         if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0:
-            logging.info(u'%d %s' % (self.n, word))
+            logging.info('%d %s' % (self.n, word))
         for label in encodedWord:
             self.label2Freq[label] = self.label2Freq.get(label, 0) + 1
 
@@ -78,7 +78,7 @@ class FSA(object):
         return res
 
     def _addSorted(self, encodedWord, data):
-        assert self.encodedPrevWord < encodedWord
+        assert self.encodedPrevWord is None or self.encodedPrevWord < encodedWord
         assert type(data) == bytearray
         q = self.initialState
         i = 0
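The two assertion changes are genuine fixes rather than mechanical translation: `encodedPrevWord` starts as `None`, and while Python 2 allowed ordering comparisons against `None`, Python 3 raises `TypeError`, so the first `add()` would crash without the guard:

    encodedPrevWord = None
    encodedWord = bytearray(b'abc')

    # Python 2: None compared less than everything.
    # Python 3: 'encodedWord > None' raises TypeError.
    assert encodedPrevWord is None or encodedWord > encodedPrevWord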
fsabuilder/morfeuszbuilder/fsa/serializer.py
@@ -5,7 +5,7 @@ Created on Oct 20, 2013
 '''
 
 import logging
-from state import State
+from .state import State
 from morfeuszbuilder.utils import limits, exceptions
 from morfeuszbuilder.utils.serializationUtils import *
 
@@ -106,7 +106,7 @@ class Serializer(object):
         res = bytearray()
         numOfTags = len(tagsMap)
         res.extend(htons(numOfTags))
-        for tag, tagnum in sorted(tagsMap.iteritems(), key=lambda (tag, tagnum): tagnum):
+        for tag, tagnum in sorted(iter(list(tagsMap.items())), key=lambda tag_tagnum: tag_tagnum[1]):
             res.extend(htons(tagnum))
             res.extend(self.fsa.encodeWord(tag))
             res.append(0)
@@ -121,7 +121,7 @@ class Serializer(object):
         #~ return res
 
     def serializeQualifiersMap(self):
-        label2labelId = dict([ (u'|'.join(qualifiers), n) for qualifiers, n in sorted(self.qualifiersMap.iteritems(), key=lambda (qs, n): n) ])
+        label2labelId = dict([ ('|'.join(sorted(qualifiers)), n) for qualifiers, n in sorted(iter(list(self.qualifiersMap.items())), key=lambda qs_n: qs_n[1]) ])
         return self._serializeTags(label2labelId)
         #~ res = bytearray()
         #~ res.extend(htons(len(self.qualifiersMap)))
@@ -186,9 +186,9 @@ class Serializer(object):
         return res
 
     def getSortedTransitions(self, state):
-        defaultKey = lambda (label, nextState): (-state.label2Freq.get(label, 0), -self.fsa.label2Freq.get(label, 0))
+        defaultKey = lambda label_nextState: (-state.label2Freq.get(label_nextState[0], 0), -self.fsa.label2Freq.get(label_nextState[0], 0))
         return list(sorted(
-            state.transitionsMap.iteritems(),
+            iter(list(state.transitionsMap.items())),
             key=defaultKey))
 
     def stateData2bytearray(self, state):
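`lambda (tag, tagnum): tagnum` used tuple parameter unpacking, which PEP 3113 removed from Python 3; hence the indexed `tag_tagnum[1]` form above. `operator.itemgetter(1)` is a common, slightly clearer alternative for such sort keys (shown as an option, not what the commit uses; the tag strings are made-up examples):

    import operator

    tagsMap = {'subst:sg:nom:m1': 7, 'adj:sg:nom:m1:pos': 3}
    for tag, tagnum in sorted(tagsMap.items(), key=operator.itemgetter(1)):
        print(tag, tagnum)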
@@ -215,9 +215,9 @@ class SimpleSerializer(Serializer):
 
     def getStateSize(self, state):
         if self.serializeTransitionsData:
-            return 1 + 5 * len(state.transitionsMap.keys()) + self.getDataSize(state)
+            return 1 + 5 * len(list(state.transitionsMap.keys())) + self.getDataSize(state)
         else:
-            return 1 + 4 * len(state.transitionsMap.keys()) + self.getDataSize(state)
+            return 1 + 4 * len(list(state.transitionsMap.keys())) + self.getDataSize(state)
 
     def getDataSize(self, state):
         return len(state.encodedData) if state.isAccepting() else 0
@@ -270,12 +270,12 @@ class VLengthSerializer1(Serializer):
         res = bytearray()
 
         # labels sorted by popularity
-        sortedLabels = [label for (label, freq) in sorted(self.fsa.label2Freq.iteritems(), key=lambda (label, freq): (-freq, label))]
+        sortedLabels = [label for (label, freq) in sorted(iter(list(self.fsa.label2Freq.items())), key=lambda label_freq: (-label_freq[1], label_freq[0]))]
 
         # popular labels table
         self.label2ShortLabel = dict([(label, sortedLabels.index(label) + 1) for label in sortedLabels[:63]])
 
-        logging.debug(dict([(chr(label), shortLabel) for label, shortLabel in self.label2ShortLabel.items()]))
+        logging.debug(dict([(chr(label), shortLabel) for label, shortLabel in list(self.label2ShortLabel.items())]))
 
         # write remaining short labels (zeros)
         for label in range(256):
@@ -354,7 +354,7 @@ class VLengthSerializer1(Serializer):
             offsetSize += 1
         exceptions.validate(
             offset < 256 * 256 * 256,
-            u'Cannot build the automaton - it would exceed its max size which is %d' % (256 * 256 * 256))
+            'Cannot build the automaton - it would exceed its max size which is %d' % (256 * 256 * 256))
         # assert offset < 256 * 256 * 256 # TODO - przerobic na jakis porzadny wyjatek
         assert offsetSize <= 3
         firstByte |= offsetSize
@@ -380,7 +380,7 @@ class VLengthSerializer1(Serializer):
         newState.encodedData = state.encodedData
         newState.reverseOffset = state.reverseOffset
         newState.offset = state.offset
-        newState.transitionsMap = dict([(label, nextState) for (label, nextState) in state.transitionsMap.iteritems()])
+        newState.transitionsMap = dict([(label, nextState) for (label, nextState) in list(state.transitionsMap.items())])
         # newState.transitionsMap = dict([(label, nextState) for (label, nextState) in state.transitionsMap.iteritems() if not label in self.label2ShortLabel or not self.label2ShortLabel[label] in range(1,64)])
         newState.serializeAsArray = False
         return newState
@@ -388,12 +388,12 @@ class VLengthSerializer1(Serializer):
     def _transitions2ArrayBytes(self, state):
         res = bytearray()
         array = [0] * 64
-        for label, nextState in state.transitionsMap.iteritems():
+        for label, nextState in list(state.transitionsMap.items()):
             if label in self.label2ShortLabel:
                 shortLabel = self.label2ShortLabel[label]
                 array[shortLabel] = nextState.offset
         logging.debug(array)
-        for offset in map(lambda x: x if x else 0, array):
+        for offset in [x if x else 0 for x in array]:
             res.append(0)
             res.append((offset & 0xFF0000) >> 16)
             res.append((offset & 0x00FF00) >> 8)
@@ -409,8 +409,8 @@ class VLengthSerializer1(Serializer):
             return self._transitions2ListBytes(state)
 
     def _chooseArrayStates(self):
-        for state1 in self.fsa.initialState.transitionsMap.values():
-            for state2 in state1.transitionsMap.values():
+        for state1 in list(self.fsa.initialState.transitionsMap.values()):
+            for state2 in list(state1.transitionsMap.values()):
                 # for state3 in state2.transitionsMap.values():
                 #     state3.serializeAsArray = True
                 state2.serializeAsArray = True
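A pattern repeated throughout this file: `sorted(iter(list(d.items())), ...)` is a mechanical 2to3 artifact. `sorted()` accepts any iterable, so `sorted(d.items(), ...)` is equivalent and skips two needless copies:

    label2Freq = {97: 120, 98: 45, 99: 45}  # toy frequency table

    by_freq = lambda lf: (-lf[1], lf[0])
    a = [l for (l, f) in sorted(iter(list(label2Freq.items())), key=by_freq)]
    b = [l for (l, f) in sorted(label2Freq.items(), key=by_freq)]
    assert a == b == [97, 98, 99]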
fsabuilder/morfeuszbuilder/fsa/state.py
@@ -45,7 +45,7 @@ class State(object):
         return self.transitionsMap.get(byte, None)
 
     def getRegisterKey(self):
-        return ( frozenset(self.transitionsMap.iteritems()), tuple(self.encodedData) if self.encodedData else None )
+        return ( frozenset(iter(list(self.transitionsMap.items()))), tuple(self.encodedData) if self.encodedData else None )
 
     def isAccepting(self):
         return self.encodedData is not None
@@ -60,10 +60,10 @@ class State(object):
         else:
             return self.encodedData
 
-    def dfs(self, alreadyVisited, sortKey=lambda (_, state): -state.freq):
+    def dfs(self, alreadyVisited, sortKey=lambda __state: -__state[1].freq):
         if not self in alreadyVisited:
             alreadyVisited.add(self)
-            for _, state in sorted(self.transitionsMap.iteritems(), key=sortKey):
+            for _, state in sorted(iter(list(self.transitionsMap.items())), key=sortKey):
                 for state1 in state.dfs(alreadyVisited):
                     yield state1
             yield self
@@ -77,7 +77,7 @@ class State(object):
             state.offset = currReverseOffset - state.reverseOffset
 
     def debug(self):
-        print '----------------'
-        print 'STATE:', self.idx, 'accepting', self.isAccepting()
-        for label, s in self.transitionsMap.iteritems():
-            print label, '-->', s.idx
+        print('----------------')
+        print(('STATE:', self.idx, 'accepting', self.isAccepting()))
+        for label, s in list(self.transitionsMap.items()):
+            print((label, '-->', s.idx))
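One 2to3 quirk visible in `debug()`: a Python 2 print statement with several comma-separated values is wrapped as `print((a, b, c))`, which prints a tuple, parentheses and quotes included, instead of space-separated fields. Passing the arguments separately restores the old output:

    idx, accepting = 42, True  # example values

    print(('STATE:', idx, 'accepting', accepting))  # ('STATE:', 42, 'accepting', True)
    print('STATE:', idx, 'accepting', accepting)    # STATE: 42 accepting True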
fsabuilder/morfeuszbuilder/fsa/visualizer.py
@@ -19,7 +19,7 @@ class Visualizer(object):
         nodeLabelsMap = {}
         for idx, state in enumerate(allStates):
             G.add_node(idx, offset=state.offset)
-            for c, targetState in state.transitionsMap.iteritems():
+            for c, targetState in list(state.transitionsMap.items()):
                 G.add_edge(idx, allStates.index(targetState))
                 label = (chr(c) if c <= 127 else '%') if charLabels \
                     else c
@@ -37,11 +37,11 @@ class Visualizer(object):
             nodelist=list([allStates.index(s) for s in allStates if s.isAccepting()]),
             node_shape='s')
         # nx.draw_networkx_nodes(G, pos, nodelist=list([allStates.index(s) for s in allStates if s.isFinal()])), )
-        nx.draw_networkx_edges(G, pos, edgelist=edgeLabelsMap.keys())
+        nx.draw_networkx_edges(G, pos, edgelist=list(edgeLabelsMap.keys()))
         nx.draw_networkx_edge_labels(G, pos, edge_labels = edgeLabelsMap)
         nx.draw_networkx_labels(G, pos, labels=nodeLabelsMap)
         plt.axis('off')
         plt.draw()
         plt.show()
         # plt.savefig('filename.png')
-        print 'done'
+        print('done')
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
@@ -7,10 +7,10 @@ Created on 23 sty 2014
 import re
 from pyparsing import *
 from morfeuszbuilder.utils import exceptions
-from pyparseString import pyparseString
+from .pyparseString import pyparseString
 
-identifier = Word(alphas, bodyChars=alphanums+u'_>*+{},')
-define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + Word(alphas, bodyChars=alphanums+u'_') + Suppress(')')) + restOfLine + LineEnd() + StringEnd()
+identifier = Word(alphas, bodyChars=alphanums+'_>*+{},')
+define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + Word(alphas, bodyChars=alphanums+'_') + Suppress(')')) + restOfLine + LineEnd() + StringEnd()
 ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd()
 endif = Keyword('#endif').suppress() + LineEnd() + StringEnd()
 
@@ -107,5 +107,5 @@ def preprocess(inputLines, defs, filename):
             ifdefsStack.pop()
         elif line.startswith('#'):
             yield lineNum, line
-        elif len(ifdefsStack) == 0 or all(map(lambda (name, isActive): (name in defs and isActive) or (name not in defs and not isActive), ifdefsStack)):
+        elif len(ifdefsStack) == 0 or all([(name_isActive[0] in defs and name_isActive[1]) or (name_isActive[0] not in defs and not name_isActive[1]) for name_isActive in ifdefsStack]):
             yield lineNum, _processLine(lineNum, line, defines, filename)
fsabuilder/morfeuszbuilder/segrules/pyparseString.py
@@ -11,7 +11,7 @@ def pyparseString(rule, lineNum, line, filename):
     try:
         return rule.parseString(line, parseAll=True)
     except ParseException as ex:
-        msg = u'%s:%d - Preprocessing of segmentation rules failed.\n' % (filename, lineNum)
+        msg = '%s:%d - Preprocessing of segmentation rules failed.\n' % (filename, lineNum)
         msg += line + '\n'
         msg += (ex.col - 1) * ' ' + '^\n'
         msg += ex.msg
fsabuilder/morfeuszbuilder/segrules/rules.py
@@ -126,7 +126,7 @@ class ComplexRule(SegmentRule):
     def __init__(self, children, linenum):
         super(ComplexRule, self).__init__(linenum)
         self.children = children
-        assert not any(map(lambda c: c.isSinkRule(), children))
+        assert not any([c.isSinkRule() for c in children])
 
     def addToNFA(self, fsa):
         endState = RulesNFAState(self, final=True, weak=self.weak, autogenerated=self.autogenerated)
@@ -159,13 +159,13 @@ class ConcatRule(ComplexRule):
             lastChild._doAddToNFA(currStartState, endState)
 
     def allowsEmptySequence(self):
-        return all(map(lambda rule: rule.allowsEmptySequence(), self.children))
+        return all([rule.allowsEmptySequence() for rule in self.children])
 
     def __str__(self):
-        return u' '.join(map(lambda c: str(c), self.children))
+        return ' '.join([str(c) for c in self.children])
 
     def isShiftOrthRule(self):
-        return all(map(lambda c: c.isShiftOrthRule(), self.children))
+        return all([c.isShiftOrthRule() for c in self.children])
 
     def transformToGeneratorVersion(self):
         newChildren = [child.transformToGeneratorVersion() for child in self.children if not child.allowsEmptySequence() or child.isShiftOrthRule()]
@@ -207,11 +207,11 @@ class ConcatRule(ComplexRule):
         for rule in self.children:
             rule.validate(filename)
         if self.children[-1].isShiftOrthRule() \
-            and not all(map(lambda c: c.isShiftOrthRule(), self.children)):
+            and not all([c.isShiftOrthRule() for c in self.children]):
             raise ConfigFileException(
                 filename,
                 self.linenum,
-                u'If the rightmost subrule of concatenation "%s" is with ">", than all subrules must be with ">"' % str(self))
+                'If the rightmost subrule of concatenation "%s" is with ">", than all subrules must be with ">"' % str(self))
 
 class OrRule(ComplexRule):
 
@@ -227,17 +227,17 @@ class OrRule(ComplexRule):
         intermEndState.addTransition(None, endState)
 
     def allowsEmptySequence(self):
-        return any(map(lambda rule: rule.allowsEmptySequence(), self.children))
+        return any([rule.allowsEmptySequence() for rule in self.children])
 
     def __str__(self):
-        return u' | '.join(map(lambda c: str(c), self.children))
+        return ' | '.join([str(c) for c in self.children])
 
     def isShiftOrthRule(self):
-        return all(map(lambda c: c.isShiftOrthRule(), self.children))
+        return all([c.isShiftOrthRule() for c in self.children])
 
     def transformToGeneratorVersion(self):
         newChildren = [child.transformToGeneratorVersion() for child in self.children if not child.allowsEmptySequence() or child.isShiftOrthRule()]
-        newChildren = filter(lambda c: not c.isSinkRule(), newChildren)
+        newChildren = [c for c in newChildren if not c.isSinkRule()]
         if newChildren == []:
             return SinkRule()
         else:
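The `filter(...)` → comprehension change in `transformToGeneratorVersion` is behavioral, not cosmetic: in Python 3 `filter()` returns a lazy iterator, so the `newChildren == []` test just above would always be false and `SinkRule()` could never be returned. A comprehension yields a real list again:

    children = []

    # Python 3: a filter object never equals [].
    assert filter(lambda c: True, children) != []

    # The list comprehension restores the Python 2 semantics.
    assert [c for c in children] == []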
@@ -255,12 +255,12 @@ class OrRule(ComplexRule):
         for rule in self.children:
             rule.validate(filename)
         if not (
-            all(map(lambda c: c.isShiftOrthRule(), self.children))
-            or not any(map(lambda c: c.isShiftOrthRule(), self.children))):
+            all([c.isShiftOrthRule() for c in self.children])
+            or not any([c.isShiftOrthRule() for c in self.children])):
             raise ConfigFileException(
                 filename,
                 self.linenum,
-                u'All subrules of alternative "%s" must be either with or without ">"' % str(self))
+                'All subrules of alternative "%s" must be either with or without ">"' % str(self))
 
 class ZeroOrMoreRule(UnaryRule):
 
@@ -291,7 +291,7 @@ class ZeroOrMoreRule(UnaryRule):
         return SinkRule()
 
     def __str__(self):
-        return u'(' + str(self.child) + ')*'
+        return '(' + str(self.child) + ')*'
 
 class OptionalRule(UnaryRule):
 
@@ -321,7 +321,7 @@ class OptionalRule(UnaryRule):
         return self.child.transformToGeneratorVersion()
 
     def __str__(self):
-        return u'(' + str(self.child) + ')?'
+        return '(' + str(self.child) + ')?'
 
 class SinkRule(SegmentRule):
 
fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
@@ -49,7 +49,7 @@ class RulesFSA(object):
     def transitionsData2bytearray(self, state):
         res = bytearray()
         # logging.debug('next')
-        for (segnum, shiftOrth), nextState in sorted(state.transitionsMap.iteritems()):
+        for (segnum, shiftOrth), nextState in sorted(state.transitionsMap.items()):
             res.append(segnum)
             if shiftOrth:
                 res.append(1)
@@ -57,8 +57,8 @@ class RulesFSA(object):
                 res.append(0)
             offset = nextState.offset
             exceptions.validate(offset <= MAX_FSA_SIZE,
-                                u'Segmentation rules are too big and complicated' \
-                                + u'- the resulting automaton would exceed its max size which is %d' \
+                                'Segmentation rules are too big and complicated' \
+                                + '- the resulting automaton would exceed its max size which is %d' \
                                 % MAX_FSA_SIZE)
             res.extend(htons(offset))
         return res
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
@@ -7,7 +7,7 @@ import logging
 from morfeuszbuilder.utils.serializationUtils import htons, htonl
 from morfeuszbuilder.utils import serializationUtils
 from morfeuszbuilder.utils import exceptions
-import shiftOrthMagic
+from . import shiftOrthMagic
 
 class RulesManager(object):
 
@@ -19,7 +19,7 @@ class RulesManager(object):
         self.shiftOrthMagic = shiftOrthMagic.ShiftOrthMagic()
 
     def _options2Key(self, optionsMap):
-        return frozenset(optionsMap.items())
+        return frozenset(list(optionsMap.items()))
 
     def _key2Options(self, optionsKey):
         return dict(optionsKey)
@@ -46,9 +46,9 @@ class RulesManager(object):
         dfasNum = len(self.options2DFA)
         exceptions.validate(
             dfasNum > 0 and dfasNum < 256,
-            u'Too many segmentation rules variants')
+            'Too many segmentation rules variants')
         res.append(dfasNum)
-        for key, dfa in self.options2DFA.iteritems():
+        for key, dfa in list(self.options2DFA.items()):
             optionsMap = self._key2Options(key)
             res.extend(self._serializeOptionsMap(optionsMap))
             res.extend(self._serializeDFA(dfa))
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
@@ -41,16 +41,16 @@ class RulesNFAState(object):
         if not self in visitedStates:
             visitedStates.add(self)
             yield self
-            for _, nextStates in self.transitionsMap.iteritems():
+            for _, nextStates in list(self.transitionsMap.items()):
                 for state in nextStates:
                     for state1 in state.dfs(visitedStates):
                         yield state1
 
     def debug(self):
-        print '----------------'
-        print 'STATE:', self.idx
-        for label, nextStates in self.transitionsMap.iteritems():
-            print label, '-->', [s.idx for s in sorted(nextStates, key=lambda s: s.idx)]
+        print('----------------')
+        print(('STATE:', self.idx))
+        for label, nextStates in list(self.transitionsMap.items()):
+            print((label, '-->', [s.idx for s in sorted(nextStates, key=lambda s: s.idx)]))
 
 class RulesNFA(object):
 
@@ -60,7 +60,7 @@ class RulesNFA(object):
     def _groupOutputByLabels(self, nfaStates):
         res = {}
         for nfaState in nfaStates:
-            for label, nextStates in nfaState.transitionsMap.iteritems():
+            for label, nextStates in list(nfaState.transitionsMap.items()):
                 if label is not None:
                     # transitionData = nfaState.transitionsDataMap[label]
                     segnum, shiftOrth = label
@@ -70,27 +70,21 @@ class RulesNFA(object):
         return res
 
     def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState):
-        weakHits = map(
-            lambda state: state.weak,
-            filter(
-                lambda state: state.final and not state.autogenerated,
-                nfaStates))
+        weakHits = [state.weak for state in [state for state in nfaStates if state.final and not state.autogenerated]]
         if not all(weakHits) \
                 and any(weakHits):
-            weakState = list(filter(lambda state: state.final and state.weak, nfaStates))[0]
-            nonWeakState = list(filter(lambda state: state.final and not state.weak, nfaStates))[0]
+            weakState = list([state for state in nfaStates if state.final and state.weak])[0]
+            nonWeakState = list([state for state in nfaStates if state.final and not state.weak])[0]
             raise InconsistentStateWeaknessException(weakState, nonWeakState)
-        weak = any(map(
-            lambda state: state.weak and state.final,
-            filter(lambda state: not state.autogenerated, nfaStates)))
-        final = any(map(lambda state: state.final, nfaStates))
+        weak = any([state.weak and state.final for state in [state for state in nfaStates if not state.autogenerated]])
+        final = any([state.final for state in nfaStates])
         # assert not weak or not final
         if final:
             # dfaState should be final
             # and contain info about weakness
             dfaState.setAsAccepting(weak=weak)
             # dfaState.encodedData = bytearray([1 if weak else 0])
-        for (segnum, shiftOrth), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems():
+        for (segnum, shiftOrth), nextNFAStates in list(self._groupOutputByLabels(nfaStates).items()):
             key = frozenset(nextNFAStates)
             if key in nfaSubset2DFAState:
                 nextDFAState = nfaSubset2DFAState[key]
@@ -104,7 +98,7 @@ class RulesNFA(object):
     def convertToDFA(self):
         dfa = RulesFSA()
         startStates = self.initialState.getClosure(set())
-        assert not any(filter(lambda s: s.final, startStates))
+        assert not any([s for s in startStates if s.final])
         dfa.initialState = RulesState()
         self._doConvertState(dfa.initialState, startStates, {frozenset(startStates): dfa.initialState})
         return dfa
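The `weakHits` rewrite in `_doConvertState` also matters for correctness: as a Python 3 `map` iterator it would be consumed by `all(weakHits)`, leaving `any(weakHits)` to scan only leftovers (or nothing), so mixed weak/non-weak final states could go undetected. A list permits the two passes the condition needs:

    flags = [True, False]          # mixed weak/non-weak final states

    lazy = map(bool, flags)
    assert not (not all(lazy) and any(lazy))   # bug: any() sees an exhausted iterator

    hits = [bool(x) for x in flags]
    assert not all(hits) and any(hits)         # list version detects the mix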
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
@@ -28,11 +28,11 @@ class RulesParser(object):
                 key, defs = lineToParse.parseString(line)
                 res[key] = tuple(defs)
             except Exception as ex:
-                raise exceptions.ConfigFileException(segtypesConfigFile.filename, lineNum, u'Error in [options] section: %s' % str(ex))
+                raise exceptions.ConfigFileException(segtypesConfigFile.filename, lineNum, 'Error in [options] section: %s' % str(ex))
         return res
 
     def _key2DefAsKey(self, key2Def):
-        return frozenset(key2Def.items())
+        return frozenset(list(key2Def.items()))
 
     def parse(self, filename):
 
@@ -53,12 +53,12 @@ class RulesParser(object):
         res = rulesManager.RulesManager(segtypesHelper, separatorsList)
 
         def2Key = {}
-        for key, defs in key2Defs.iteritems():
+        for key, defs in list(key2Defs.items()):
             for define in defs:
                 def2Key[define] = key
 
         resultsMap = {}
-        for idx, defs in enumerate(itertools.product(*key2Defs.values())):
+        for idx, defs in enumerate(itertools.product(*list(key2Defs.values()))):
             key2Def = dict([(def2Key[define], define) for define in defs])
             currRes = []
             resultsMap[self._key2DefAsKey(key2Def)] = currRes
@@ -86,7 +86,7 @@ class RulesParser(object): @@ -86,7 +86,7 @@ class RulesParser(object):
86 86
87 self.doShiftOrthMagic(resultsMap, res) 87 self.doShiftOrthMagic(resultsMap, res)
88 88
89 - for idx, defs in enumerate(itertools.product(*key2Defs.values())): 89 + for idx, defs in enumerate(itertools.product(*list(key2Defs.values()))):
90 key2Def = dict([(def2Key[define], define) for define in defs]) 90 key2Def = dict([(def2Key[define], define) for define in defs])
91 91
92 nfa = rulesNFA.RulesNFA() 92 nfa = rulesNFA.RulesNFA()
@@ -115,20 +115,20 @@ class RulesParser(object): @@ -115,20 +115,20 @@ class RulesParser(object):
115 115
116 def _createNewTagRule(self, segtype, shiftOrth, lineNum, line, segtypesHelper): 116 def _createNewTagRule(self, segtype, shiftOrth, lineNum, line, segtypesHelper):
117 if not segtypesHelper.hasSegtype(segtype): 117 if not segtypesHelper.hasSegtype(segtype):
118 - raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype)) 118 + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid segment type: %s' % (line, segtype))
119 else: 119 else:
120 # return rules.TagRule(segtype) 120 # return rules.TagRule(segtype)
121 return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), shiftOrth, segtype, lineNum) 121 return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), shiftOrth, segtype, lineNum)
122 122
123 def _createQuantRule1(self, child, quantity, lineNum, line, segtypesHelper): 123 def _createQuantRule1(self, child, quantity, lineNum, line, segtypesHelper):
124 if quantity <= 0: 124 if quantity <= 0:
125 - raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid quantity: %d' % (line, quantity)) 125 + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantity: %d' % (line, quantity))
126 else: 126 else:
127 return rules.ConcatRule(quantity * [child], lineNum) 127 return rules.ConcatRule(quantity * [child], lineNum)
128 128
129 def _createQuantRule2(self, child, leftN, rightN, lineNum, line, segtypesHelper): 129 def _createQuantRule2(self, child, leftN, rightN, lineNum, line, segtypesHelper):
130 if leftN > rightN or (leftN, rightN) == (0, 0): 130 if leftN > rightN or (leftN, rightN) == (0, 0):
131 - raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid quantities: %d %d' % (line, leftN, rightN)) 131 + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantities: %d %d' % (line, leftN, rightN))
132 elif leftN == 0: 132 elif leftN == 0:
133 children = [rules.OptionalRule(child, lineNum)] 133 children = [rules.OptionalRule(child, lineNum)]
134 for n in range(2, rightN + 1): 134 for n in range(2, rightN + 1):
@@ -140,7 +140,7 @@ class RulesParser(object): @@ -140,7 +140,7 @@ class RulesParser(object):
140 140
141 def _createQuantRule3(self, child, quantity, lineNum, line, segtypesHelper): 141 def _createQuantRule3(self, child, quantity, lineNum, line, segtypesHelper):
142 if quantity <= 0: 142 if quantity <= 0:
143 - raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid quantity: %d' % (line, quantity)) 143 + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantity: %d' % (line, quantity))
144 else: 144 else:
145 return rules.ConcatRule( 145 return rules.ConcatRule(
146 [ 146 [
@@ -200,7 +200,7 @@ class RulesParser(object): @@ -200,7 +200,7 @@ class RulesParser(object):
200 shiftOrthSegtypes = set() 200 shiftOrthSegtypes = set()
201 nonShiftOrthSegtypes = set() 201 nonShiftOrthSegtypes = set()
202 202
203 - for _, rules in resultsMap.iteritems(): 203 + for _, rules in list(resultsMap.items()):
204 for rule in rules: 204 for rule in rules:
205 for atomicRule in rule.getAtomicRules(): 205 for atomicRule in rule.getAtomicRules():
206 if atomicRule.shiftOrth: 206 if atomicRule.shiftOrth:
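One pattern repeats throughout this file: 2to3 defensively wraps the former iteritems()/values() calls in list(), which makes a copy. The copy only matters when the dictionary is mutated while being iterated, which none of these loops do, so the dict views could be consumed directly. A self-contained sketch with hypothetical option values (the same 'aggl'/'praet' names appear in parserTest.py below):

    import itertools

    key2Defs = {'aggl': ('permissive', 'strict'), 'praet': ('split',)}  # hypothetical
    def2Key = {}
    for key, defs in key2Defs.items():        # dict view, no list() copy needed
        for define in defs:
            def2Key[define] = key
    for idx, defs in enumerate(itertools.product(*key2Defs.values())):
        key2Def = dict((def2Key[define], define) for define in defs)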
fsabuilder/morfeuszbuilder/segrules/shiftOrthMagic.py
@@ -36,7 +36,7 @@ class ShiftOrthMagic(object): @@ -36,7 +36,7 @@ class ShiftOrthMagic(object):
36 for segtype in shiftOrthSegtypes - nonShiftOrthSegtypes: 36 for segtype in shiftOrthSegtypes - nonShiftOrthSegtypes:
37 self._onlyShiftSegnums.add(segtypesHelper.getSegnum4Segtype(segtype)) 37 self._onlyShiftSegnums.add(segtypesHelper.getSegnum4Segtype(segtype))
38 38
39 - for _, rules in resultsMap.iteritems(): 39 + for _, rules in list(resultsMap.items()):
40 for rule in rules: 40 for rule in rules:
41 for atomicRule in rule.getAtomicRules(): 41 for atomicRule in rule.getAtomicRules():
42 if atomicRule.segnum in self._bothShiftAndNonShiftSegnums and atomicRule.shiftOrth: 42 if atomicRule.segnum in self._bothShiftAndNonShiftSegnums and atomicRule.shiftOrth:
fsabuilder/morfeuszbuilder/segrules/test/parserTest.py
@@ -12,18 +12,18 @@ from morfeuszbuilder.fsa import visualizer, serializer @@ -12,18 +12,18 @@ from morfeuszbuilder.fsa import visualizer, serializer
12 class Test(unittest.TestCase): 12 class Test(unittest.TestCase):
13 13
14 def testParser(self): 14 def testParser(self):
15 - print 'do test' 15 + print('do test')
16 t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset')) 16 t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset'))
17 parser = rulesParser.RulesParser(t) 17 parser = rulesParser.RulesParser(t)
18 rulesManager = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) 18 rulesManager = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat'))
19 fsa = rulesManager.getDFA({'aggl': 'permissive', 'praet': 'split'}) 19 fsa = rulesManager.getDFA({'aggl': 'permissive', 'praet': 'split'})
20 for s in fsa.dfs(): 20 for s in fsa.dfs():
21 s.debug() 21 s.debug()
22 - print 'states:', len(list(fsa.dfs()))  
23 - print 'transitions:', fsa.getTransitionsNum() 22 + print('states:', len(list(fsa.dfs())))
  23 + print('transitions:', fsa.getTransitionsNum())
24 visualizer.Visualizer().visualize(fsa, charLabels=False) 24 visualizer.Visualizer().visualize(fsa, charLabels=False)
25 - print 'size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray()))  
26 - print 'done' 25 + print('size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray())))
  26 + print('done')
27 27
28 if __name__ == "__main__": 28 if __name__ == "__main__":
29 unittest.main() 29 unittest.main()
fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py
@@ -19,7 +19,7 @@ class Test(unittest.TestCase): @@ -19,7 +19,7 @@ class Test(unittest.TestCase):
19 parsedFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types']) 19 parsedFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types'])
20 linesEnum = parsedFile.enumerateLinesInSection('combinations') 20 linesEnum = parsedFile.enumerateLinesInSection('combinations')
21 for lineNum, line in preprocessor.preprocess(linesEnum, ['extra', 'superextra']): 21 for lineNum, line in preprocessor.preprocess(linesEnum, ['extra', 'superextra']):
22 - print (lineNum, line) 22 + print((lineNum, line))
23 23
24 24
25 if __name__ == "__main__": 25 if __name__ == "__main__":
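A note on the doubled parentheses in this test: unlike the comma-separated prints in parserTest.py above, they are faithful here rather than a conversion artifact, because the Python 2 statement print (lineNum, line) already printed a tuple, so the Python 3 call must pass it as a single argument to keep the output identical. A sketch with hypothetical values:

    # Python 2: print (lineNum, line)   ->  (7, 'combination rule')
    # Python 3 equivalent, same output:
    lineNum, line = 7, 'combination rule'
    print((lineNum, line))              # -> (7, 'combination rule')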
fsabuilder/morfeuszbuilder/tagset/segtypes.py
@@ -11,11 +11,11 @@ from morfeuszbuilder.utils import exceptions @@ -11,11 +11,11 @@ from morfeuszbuilder.utils import exceptions
11 def _getLemmaHomonymPair(lemma): 11 def _getLemmaHomonymPair(lemma):
12 if lemma is None: 12 if lemma is None:
13 return (None, None) 13 return (None, None)
14 - elif u':' in lemma:  
15 - if lemma.replace(u':', '') == '': 14 + elif ':' in lemma:
  15 + if lemma.replace(':', '') == '':
16 return (lemma, None) 16 return (lemma, None)
17 else: 17 else:
18 - return lemma.split(u':', 1) 18 + return lemma.split(':', 1)
19 else: 19 else:
20 return (lemma, None) 20 return (lemma, None)
21 21
@@ -26,7 +26,7 @@ class Segtypes(object): @@ -26,7 +26,7 @@ class Segtypes(object):
26 self.tagset = tagset 26 self.tagset = tagset
27 self.namesMap = namesMap 27 self.namesMap = namesMap
28 self.labelsMap = labelsMap 28 self.labelsMap = labelsMap
29 - self._reverseLabelsMap = dict([(v, k) for (k, v) in labelsMap.iteritems()]) 29 + self._reverseLabelsMap = dict([(v, k) for (k, v) in list(labelsMap.items())])
30 30
31 self.filename = segrulesConfigFile.filename 31 self.filename = segrulesConfigFile.filename
32 32
@@ -59,13 +59,13 @@ class Segtypes(object): @@ -59,13 +59,13 @@ class Segtypes(object):
59 59
60 def _readSegtypes(self, segrulesConfigFile): 60 def _readSegtypes(self, segrulesConfigFile):
61 for lineNum, line in segrulesConfigFile.enumerateLinesInSection('segment types'): 61 for lineNum, line in segrulesConfigFile.enumerateLinesInSection('segment types'):
62 - assert type(line) == unicode 62 + assert type(line) == str
63 self._validate( 63 self._validate(
64 - u'Segment type must be a single word', 64 + 'Segment type must be a single word',
65 lineNum, 65 lineNum,
66 re.match(r'^\w+$', line)) 66 re.match(r'^\w+$', line))
67 self._validate( 67 self._validate(
68 - u'Segment type already defined: "%s"' % line, 68 + 'Segment type already defined: "%s"' % line,
69 lineNum, 69 lineNum,
70 line not in self.segtypes) 70 line not in self.segtypes)
71 self.segtypes.append(line) 71 self.segtypes.append(line)
@@ -75,13 +75,13 @@ class Segtypes(object): @@ -75,13 +75,13 @@ class Segtypes(object):
75 for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'): 75 for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'):
76 self._parsePattern(lineNum, line, withLemma=False) 76 self._parsePattern(lineNum, line, withLemma=False)
77 self._validate( 77 self._validate(
78 - u'Pattern that matches everything must be the last one', 78 + 'Pattern that matches everything must be the last one',
79 lineNum - 1, 79 lineNum - 1,
80 not gotWildcardPattern) 80 not gotWildcardPattern)
81 gotWildcardPattern = gotWildcardPattern or self.patternsList[-1].isWildcardPattern() 81 gotWildcardPattern = gotWildcardPattern or self.patternsList[-1].isWildcardPattern()
82 82
83 self._validate( 83 self._validate(
84 - u'There must be a pattern that matches everything at the end of [tags] section', 84 + 'There must be a pattern that matches everything at the end of [tags] section',
85 lineNum, 85 lineNum,
86 self.patternsList[-1].isWildcardPattern()) 86 self.patternsList[-1].isWildcardPattern())
87 87
@@ -94,18 +94,18 @@ class Segtypes(object): @@ -94,18 +94,18 @@ class Segtypes(object):
94 for f in fields: 94 for f in fields:
95 match = re.match(r'(name|labels)=([\S]+)', f, re.U) 95 match = re.match(r'(name|labels)=([\S]+)', f, re.U)
96 self._validate( 96 self._validate(
97 - u'invalid name or labels constraint: "%s"' % f, 97 + 'invalid name or labels constraint: "%s"' % f,
98 lineNum, 98 lineNum,
99 match) 99 match)
100 key = match.group(1) 100 key = match.group(1)
101 value = match.group(2) 101 value = match.group(2)
102 self._validate( 102 self._validate(
103 - u'%s already specified' % key, 103 + '%s already specified' % key,
104 lineNum, 104 lineNum,
105 key not in res) 105 key not in res)
106 if key == 'labels': 106 if key == 'labels':
107 if value: 107 if value:
108 - value = frozenset(value.split(u'|')) 108 + value = frozenset(value.split('|'))
109 else: 109 else:
110 value = frozenset() 110 value = frozenset()
111 res[key] = value 111 res[key] = value
@@ -115,7 +115,7 @@ class Segtypes(object): @@ -115,7 +115,7 @@ class Segtypes(object):
115 split = re.split(r'\s+', line.strip()) 115 split = re.split(r'\s+', line.strip())
116 if withLemma: 116 if withLemma:
117 self._validate( 117 self._validate(
118 - u'Line in [lexemes] section must contain 3 to 5 fields - segment type, lemma, tag pattern and optional constraints on name and labels', 118 + 'Line in [lexemes] section must contain 3 to 5 fields - segment type, lemma, tag pattern and optional constraints on name and labels',
119 lineNum, 119 lineNum,
120 len(split) in [3, 4, 5]) 120 len(split) in [3, 4, 5])
121 segtype = split[0] 121 segtype = split[0]
@@ -124,7 +124,7 @@ class Segtypes(object): @@ -124,7 +124,7 @@ class Segtypes(object):
124 additionalConstraints = self._parseAdditionalConstraints(lineNum, split[3:]) 124 additionalConstraints = self._parseAdditionalConstraints(lineNum, split[3:])
125 else: 125 else:
126 self._validate( 126 self._validate(
127 - u'Line in [tags] section must contain 2 to 4 fields - segment type, tag pattern and optional constraints on name and labels', 127 + 'Line in [tags] section must contain 2 to 4 fields - segment type, tag pattern and optional constraints on name and labels',
128 lineNum, 128 lineNum,
129 len(split) in [2, 3, 4]) 129 len(split) in [2, 3, 4])
130 segtype = split[0] 130 segtype = split[0]
@@ -132,32 +132,32 @@ class Segtypes(object): @@ -132,32 +132,32 @@ class Segtypes(object):
132 pattern = split[1] 132 pattern = split[1]
133 additionalConstraints = self._parseAdditionalConstraints(lineNum, split[2:]) 133 additionalConstraints = self._parseAdditionalConstraints(lineNum, split[2:])
134 self._validate( 134 self._validate(
135 - u'Undeclared segment type: "%s"' % segtype, 135 + 'Undeclared segment type: "%s"' % segtype,
136 lineNum, 136 lineNum,
137 segtype in self.segtypes) 137 segtype in self.segtypes)
138 segnum = self.segtypes.index(segtype) 138 segnum = self.segtypes.index(segtype)
139 139
140 self._validate( 140 self._validate(
141 - u'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters', 141 + 'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters',
142 lineNum, 142 lineNum,
143 re.match(r'[a-z_\.\:\%]+', pattern)) 143 re.match(r'[a-z_\.\:\%]+', pattern))
144 144
145 segtypePattern = SegtypePattern( 145 segtypePattern = SegtypePattern(
146 lemma, 146 lemma,
147 pattern, 147 pattern,
148 - additionalConstraints.get('name', u''), 148 + additionalConstraints.get('name', ''),
149 additionalConstraints.get('labels', frozenset()), 149 additionalConstraints.get('labels', frozenset()),
150 segnum) 150 segnum)
151 # print 'segtypePattern', repr(str(segtypePattern)) 151 # print 'segtypePattern', repr(str(segtypePattern))
152 self._validate( 152 self._validate(
153 - u'There is no tag that matches pattern "%s".' % (pattern), 153 + 'There is no tag that matches pattern "%s".' % (pattern),
154 lineNum, 154 lineNum,
155 any([segtypePattern.tryToMatchTag(tag) != -1 for tag in self.tagset.getAllTags()])) 155 any([segtypePattern.tryToMatchTag(tag) != -1 for tag in self.tagset.getAllTags()]))
156 self.patternsList.append(segtypePattern) 156 self.patternsList.append(segtypePattern)
157 157
158 def _getAllExistingLabelsnumCombinations(self, labels): 158 def _getAllExistingLabelsnumCombinations(self, labels):
159 if labels: 159 if labels:
160 - for labelsCombination, labelsnum in self.labelsMap.iteritems(): 160 + for labelsCombination, labelsnum in list(self.labelsMap.items()):
161 if labels <= labelsCombination: 161 if labels <= labelsCombination:
162 yield labelsnum 162 yield labelsnum
163 else: 163 else:
@@ -232,7 +232,7 @@ class SegtypePattern(object): @@ -232,7 +232,7 @@ class SegtypePattern(object):
232 return -1 232 return -1
233 233
234 def isWildcardPattern(self): 234 def isWildcardPattern(self):
235 - return (self.lemma, self.pattern, self.name, self.labels) == (None, '%', u'', frozenset()) 235 + return (self.lemma, self.pattern, self.name, self.labels) == (None, '%', '', frozenset())
236 236
237 def __str__(self): 237 def __str__(self):
238 - return u'%s %s %s %s -> %d' % (self.lemma, self.pattern, self.name, self.labels, self.segnum) 238 + return '%s %s %s %s -> %d' % (self.lemma, self.pattern, self.name, self.labels, self.segnum)
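The changes in this file follow two simple migration rules: every str in Python 3 is Unicode, so the u'...' prefixes become plain literals (PEP 414 re-allowed the prefix in 3.3, but it adds nothing), and the unicode type itself is gone, so the type assertion in _readSegtypes moves to str. A sketch of the latter in its usual isinstance form:

    line = 'adj'                      # hypothetical segment-type line
    # Python 3: str is already Unicode; isinstance is the idiomatic check
    assert isinstance(line, str)
    assert line == u'adj'             # u'' is legal again since 3.3, but redundant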
fsabuilder/morfeuszbuilder/tagset/tagset.py
@@ -20,7 +20,7 @@ class Tagset(object): @@ -20,7 +20,7 @@ class Tagset(object):
20 #~ self._name2namenum = {} 20 #~ self._name2namenum = {}
21 if filename: 21 if filename:
22 self._doInit(filename, encoding) 22 self._doInit(filename, encoding)
23 - self._tagnum2tag = dict(map(lambda (k, v): (v, k), self.tag2tagnum.iteritems())) 23 + self._tagnum2tag = dict([(k_v[1], k_v[0]) for k_v in iter(list(self.tag2tagnum.items()))])
24 24
25 def _doInit(self, filename, encoding): 25 def _doInit(self, filename, encoding):
26 insideTags = False 26 insideTags = False
@@ -33,11 +33,11 @@ class Tagset(object): @@ -33,11 +33,11 @@ class Tagset(object):
33 self.tagsetId = match.group(1) 33 self.tagsetId = match.group(1)
34 else: 34 else:
35 raise FSABuilderException('missing TAGSET-ID in first line of tagset file') 35 raise FSABuilderException('missing TAGSET-ID in first line of tagset file')
36 - elif line == u'[TAGS]': 36 + elif line == '[TAGS]':
37 insideTags = True 37 insideTags = True
38 #~ elif line == u'[NAMES]': 38 #~ elif line == u'[NAMES]':
39 #~ addingTo = Tagset.NAMES 39 #~ addingTo = Tagset.NAMES
40 - elif line and not line.startswith(u'#'): 40 + elif line and not line.startswith('#'):
41 if not insideTags: 41 if not insideTags:
42 raise FSABuilderException('"%s" - text outside [TAGS] section in tagset file line %d' % (line, linenum)) 42 raise FSABuilderException('"%s" - text outside [TAGS] section in tagset file line %d' % (line, linenum))
43 res = self.tag2tagnum 43 res = self.tag2tagnum
@@ -47,12 +47,12 @@ class Tagset(object): @@ -47,12 +47,12 @@ class Tagset(object):
47 tag = line.split(Tagset.SEP)[1] 47 tag = line.split(Tagset.SEP)[1]
48 if tag in res: 48 if tag in res:
49 raise FSABuilderException('duplicate tag: "%s"' % tag) 49 raise FSABuilderException('duplicate tag: "%s"' % tag)
50 - if int(tagNum) in res.values(): 50 + if int(tagNum) in list(res.values()):
51 raise FSABuilderException('line %d: tagId %d assigned for tag "%s" already appeared somewhere else.' % (linenum, int(tagNum), tag)) 51 raise FSABuilderException('line %d: tagId %d assigned for tag "%s" already appeared somewhere else.' % (linenum, int(tagNum), tag))
52 res[tag] = int(tagNum) 52 res[tag] = int(tagNum)
53 53
54 def getAllTags(self): 54 def getAllTags(self):
55 - return self.tag2tagnum.keys() 55 + return list(self.tag2tagnum.keys())
56 56
57 def getTagnum4Tag(self, tag): 57 def getTagnum4Tag(self, tag):
58 if tag in self.tag2tagnum: 58 if tag in self.tag2tagnum:
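The k_v indexing above is 2to3's workaround for PEP 3113: Python 3 removed tuple parameters, so lambda (k, v): (v, k) is now a syntax error. A dict comprehension (a sketch, not the committed form) inverts the mapping more directly:

    tag2tagnum = {'subst:sg:nom:m1': 0, 'adj:sg:nom:f:pos': 1}  # hypothetical data
    # Invert without lambda tuple unpacking or intermediate lists:
    tagnum2tag = {v: k for k, v in tag2tagnum.items()}
    assert tagnum2tag[0] == 'subst:sg:nom:m1'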
fsabuilder/morfeuszbuilder/utils/caseconv/generate.py
@@ -90,7 +90,7 @@ def _serializeTable(table): @@ -90,7 +90,7 @@ def _serializeTable(table):
90 def _serializeExtendedTable(table): 90 def _serializeExtendedTable(table):
91 res = [] 91 res = []
92 res.append('{') 92 res.append('{')
93 - for code, targetCode in table.iteritems(): 93 + for code, targetCode in list(table.items()):
94 res.append('{') 94 res.append('{')
95 res.append(str(code)) 95 res.append(str(code))
96 res.append(',') 96 res.append(',')
fsabuilder/morfeuszbuilder/utils/configFile.py
@@ -6,10 +6,10 @@ Created on 18 lut 2014 @@ -6,10 +6,10 @@ Created on 18 lut 2014
6 6
7 import re 7 import re
8 import codecs 8 import codecs
9 -import exceptions 9 +from . import exceptions
10 10
11 def getHeaderValue(line, lineNum): 11 def getHeaderValue(line, lineNum):
12 - m = re.match(ur'\s*\[(.*?)\]\s*(\#.*)?', line) 12 + m = re.match(r'\s*\[(.*?)\]\s*(\#.*)?', line)
13 if m: 13 if m:
14 return m.group(1) 14 return m.group(1)
15 else: 15 else:
@@ -40,7 +40,7 @@ class ConfigFile(object): @@ -40,7 +40,7 @@ class ConfigFile(object):
40 self.section2Lines[self.currSection].append((lineNum, line)) 40 self.section2Lines[self.currSection].append((lineNum, line))
41 41
42 def _getHeaderValue(self, line, lineNum): 42 def _getHeaderValue(self, line, lineNum):
43 - m = re.match(ur'\s*\[(.*?)\]\s*(\#.*)?', line) 43 + m = re.match(r'\s*\[(.*?)\]\s*(\#.*)?', line)
44 if m: 44 if m:
45 return m.group(1) 45 return m.group(1)
46 else: 46 else:
@@ -48,7 +48,7 @@ class ConfigFile(object): @@ -48,7 +48,7 @@ class ConfigFile(object):
48 48
49 def enumerateLinesInSection(self, sectionName, ignoreComments=True): 49 def enumerateLinesInSection(self, sectionName, ignoreComments=True):
50 if sectionName not in self.section2Lines: 50 if sectionName not in self.section2Lines:
51 - raise exceptions.ConfigFileException(self.filename, None, u'Missing section: "%s"' % sectionName) 51 + raise exceptions.ConfigFileException(self.filename, None, 'Missing section: "%s"' % sectionName)
52 if not ignoreComments: 52 if not ignoreComments:
53 return self.section2Lines[sectionName] 53 return self.section2Lines[sectionName]
54 else: 54 else:
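Two further Python 3 removals surface in configFile.py: implicit relative imports are gone (PEP 328), hence from . import exceptions, and the ur'' literal is invalid syntax; since every string is Unicode, a plain raw literal is the exact replacement. A self-contained sketch of the header pattern (the constant name is hypothetical):

    import re

    HEADER_RE = re.compile(r'\s*\[(.*?)\]\s*(\#.*)?')  # r'' is raw and Unicode in Python 3
    m = HEADER_RE.match('[combinations]  # segment rules')
    assert m and m.group(1) == 'combinations'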
fsabuilder/morfeuszbuilder/utils/exceptions.py
@@ -25,7 +25,7 @@ class SegtypesException(FSABuilderException): @@ -25,7 +25,7 @@ class SegtypesException(FSABuilderException):
25 self.msg = msg 25 self.msg = msg
26 26
27 def __str__(self): 27 def __str__(self):
28 - return u'Error in segment rules: %s' % self.msg 28 + return 'Error in segment rules: %s' % self.msg
29 29
30 class ConfigFileException(FSABuilderException): 30 class ConfigFileException(FSABuilderException):
31 31
@@ -36,7 +36,7 @@ class ConfigFileException(FSABuilderException): @@ -36,7 +36,7 @@ class ConfigFileException(FSABuilderException):
36 36
37 def __str__(self): 37 def __str__(self):
38 if self.lineNum: 38 if self.lineNum:
39 - return u'%s:%d - %s' % (self.filename, self.lineNum, self.msg) 39 + return '%s:%d - %s' % (self.filename, self.lineNum, self.msg)
40 else: 40 else:
41 - return u'%s - %s' % (self.filename, self.msg) 41 + return '%s - %s' % (self.filename, self.msg)
42 42
fsabuilder/morfeuszbuilder/utils/extractTagset.py
@@ -8,10 +8,10 @@ import sys @@ -8,10 +8,10 @@ import sys
8 if __name__ == '__main__': 8 if __name__ == '__main__':
9 version = sys.argv[1] 9 version = sys.argv[1]
10 res = set() 10 res = set()
11 - print '#morfeusz-tagset', version 11 + print('#morfeusz-tagset', version)
12 for line in sys.stdin: 12 for line in sys.stdin:
13 if line.strip(): 13 if line.strip():
14 tag = line.split('\t')[2] 14 tag = line.split('\t')[2]
15 res.add(tag) 15 res.add(tag)
16 for idx, tag in enumerate(sorted(res)): 16 for idx, tag in enumerate(sorted(res)):
17 - print str(idx) + '\t' + tag 17 + print(str(idx) + '\t' + tag)
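For orientation, extractTagset.py is a stdin filter; a minimal sketch of its behaviour on hypothetical in-memory input (real input arrives on stdin, with the tag in the third TAB-separated column):

    import io

    sample = io.StringIO('zielony\tzielony\tadj:sg:nom:m1\n'
                         'zieloni\tzielony\tadj:pl:nom:m1\n')
    res = {line.rstrip('\n').split('\t')[2] for line in sample if line.strip()}
    print('#morfeusz-tagset', '0.1')       # space-separated header, not a tuple
    for idx, tag in enumerate(sorted(res)):
        print(str(idx) + '\t' + tag)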