Commit 95cbe5ea03398610d704f1ad1d51cc981f0aafea

Authored by Marcin Woliński
1 parent a5484089

morfeusz_builder → Python 3

CMakeLists.txt
... ... @@ -4,7 +4,7 @@ project (Morfeusz)
4 4  
5 5 set (Morfeusz_VERSION_MAJOR 1)
6 6 set (Morfeusz_VERSION_MINOR 9)
7   -set (Morfeusz_VERSION_PATCH 15)
  7 +set (Morfeusz_VERSION_PATCH 16)
8 8 set (Morfeusz_VERSION "${Morfeusz_VERSION_MAJOR}.${Morfeusz_VERSION_MINOR}.${Morfeusz_VERSION_PATCH}")
9 9 set (Morfeusz_LIB_VERSION "${Morfeusz_VERSION}")
10 10 if (BUILT_ON)
... ...
fsabuilder/buildanalyzer.sh
1   -#!/bin/bash
  1 +#! /bin/bash
2 2  
3   -python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V1 --trim-supneg -o $1
  3 +python3 morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V1 --trim-supneg -o $1
... ...
fsabuilder/buildgenerator.sh
1   -#!/bin/bash
  1 +#! /bin/bash
2 2  
3   -python morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \
  3 +python3 morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \
4 4 --tagset-file=../input/sgjp-morfeusz.tagset \
5 5 --segments-file=../input/segmenty.dat \
6 6 --generator \
... ...
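
Both build scripts now invoke python3 explicitly, and morfeusz_builder itself (below) gets a /usr/bin/python3 shebang plus the executable bit (100644 → 100755). A minimal sketch of a more portable variant, assuming python3 may live outside /usr/bin (the env shebang and the version guard are illustrations, not part of this commit):

    #! /usr/bin/env python3
    # Locate python3 via PATH instead of hard-coding /usr/bin/python3.
    import sys
    # Fail fast if someone still runs the ported builder under Python 2.
    assert sys.version_info >= (3,), 'morfeusz_builder requires Python 3'
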
fsabuilder/morfeusz_builder 100644 → 100755
1   -#!/usr/bin/python
  1 +#! /usr/bin/python3
2 2 # -*- coding:utf-8 -*-
3 3 '''
4 4 Created on 21 paź 2013
... ... @@ -20,13 +20,13 @@ from optparse import OptionParser
20 20  
21 21 def _checkOption(opt, parser, msg):
22 22 if opt is None:
23   - print >> sys.stderr, msg
  23 + print(msg, file=sys.stderr)
24 24 parser.print_help()
25 25 exit(1)
26 26  
27 27 def _checkCondition(cond, parser, msg):
28 28 if not cond:
29   - print >> sys.stderr, msg
  29 + print(msg, file=sys.stderr)
30 30 parser.print_help()
31 31 exit(1)
32 32  
... ... @@ -40,7 +40,7 @@ def _checkOpen(filename, mode):
40 40 if 'w' in mode:
41 41 os.remove(filename)
42 42 except IOError as ex:
43   - print >> sys.stderr, str(ex)
  43 + print(str(ex), file=sys.stderr)
44 44 exit(1)
45 45  
46 46 def _getDictFilename(opts, isGenerator):
... ... @@ -162,7 +162,7 @@ def _parseOptions():
162 162 _checkOpen(_getDictFilename(opts, isGenerator=True), 'w')
163 163  
164 164 if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1]:
165   - print >> sys.stderr, '--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1])+')'
  165 + print('--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1])+')', file=sys.stderr)
166 166 parser.print_help()
167 167 exit(1)
168 168  
... ... @@ -183,34 +183,34 @@ def _readDictIdAndCopyright(inputFiles):
183 183 with codecs.open(inputFile, 'r', 'utf8') as f:
184 184 inCopyright = False
185 185 for linenum, line in enumerate(f, start=1):
186   - if dictId is None and line.startswith(u'#!DICT-ID'):
187   - dictIdTag, _, dictId = line.strip().partition(u' ')
  186 + if dictId is None and line.startswith('#!DICT-ID'):
  187 + dictIdTag, _, dictId = line.strip().partition(' ')
188 188 exceptions.validate(
189   - dictIdTag == u'#!DICT-ID',
190   - u'Dictionary ID tag must be followed by a space character and dictionary ID string')
  189 + dictIdTag == '#!DICT-ID',
  190 + 'Dictionary ID tag must be followed by a space character and dictionary ID string')
191 191 exceptions.validate(
192   - len(line.split(u' ')) > 1,
193   - u'%s:%d: Must provide DICT-ID' % (inputFile, linenum))
  192 + len(line.split(' ')) > 1,
  193 + '%s:%d: Must provide DICT-ID' % (inputFile, linenum))
194 194 exceptions.validate(
195   - len(line.split(u' ')) == 2,
196   - u'%s:%d: DICT-ID must not contain spaces' % (inputFile, linenum))
197   - elif copyright is None and line.startswith(u'#<COPYRIGHT>'):
  195 + len(line.split(' ')) == 2,
  196 + '%s:%d: DICT-ID must not contain spaces' % (inputFile, linenum))
  197 + elif copyright is None and line.startswith('#<COPYRIGHT>'):
198 198 exceptions.validate(
199   - line.strip() == u'#<COPYRIGHT>',
200   - u'%s:%d: Copyright start tag must be the only one in the line' % (inputFile, linenum))
  199 + line.strip() == '#<COPYRIGHT>',
  200 + '%s:%d: Copyright start tag must be the only one in the line' % (inputFile, linenum))
201 201  
202 202 inCopyright = True
203   - copyright = u''
  203 + copyright = ''
204 204  
205   - elif line.startswith(u'#</COPYRIGHT>'):
  205 + elif line.startswith('#</COPYRIGHT>'):
206 206  
207 207 exceptions.validate(
208 208 inCopyright,
209   - u'%s:%d: Copyright end tag must be preceded by copyright start tag' % (inputFile, linenum))
  209 + '%s:%d: Copyright end tag must be preceded by copyright start tag' % (inputFile, linenum))
210 210  
211 211 exceptions.validate(
212   - line.strip() == u'#</COPYRIGHT>',
213   - u'%s:%d: Copyright end tag must be the only one in the line' % (inputFile, linenum))
  212 + line.strip() == '#</COPYRIGHT>',
  213 + '%s:%d: Copyright end tag must be the only one in the line' % (inputFile, linenum))
214 214  
215 215 inCopyright = False
216 216  
... ... @@ -219,21 +219,21 @@ def _readDictIdAndCopyright(inputFiles):
219 219 copyright += line
220 220  
221 221 if dictId is None:
222   - logging.warn(u'No dictionary ID tag found')
223   - dictId = u''
  222 + logging.warn('No dictionary ID tag found')
  223 + dictId = ''
224 224  
225 225 if copyright is None:
226   - logging.warn(u'No copyright info found')
227   - copyright = u''
  226 + logging.warn('No copyright info found')
  227 + copyright = ''
228 228  
229 229 return (dictId, copyright)
230 230  
231 231 def _readNamesAndQualifiers(inputFiles):
232   - names = set([u''])
  232 + names = set([''])
233 233 qualifiers = set([frozenset()])
234 234 lineParser = convertinput.LineParser()
235 235 for line in _concatFiles(inputFiles):
236   - line = line.strip().decode('utf8')
  236 + line = line.strip()
237 237 if not lineParser.ignoreLine(line):
238 238 _, _, _, name, qualifier = lineParser.parseLine(line)
239 239 names.add(name)
... ... @@ -242,7 +242,7 @@ def _readNamesAndQualifiers(inputFiles):
242 242 qualifiersMap = dict([(quals, idx) for idx, quals in enumerate(sorted(qualifiers, key=lambda q: tuple(sorted(q))))])
243 243 exceptions.validate(
244 244 len(qualifiersMap) <= limits.MAX_QUALIFIERS_COMBINATIONS,
245   - u'Too many qualifiers combinations. The limit is %d' % limits.MAX_QUALIFIERS_COMBINATIONS)
  245 + 'Too many qualifiers combinations. The limit is %d' % limits.MAX_QUALIFIERS_COMBINATIONS)
246 246  
247 247 return namesMap, qualifiersMap
248 248  
... ...
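
The builder script shows the core PEP 3105 change: print is a function in Python 3, so `print >> sys.stderr, msg` becomes `print(msg, file=sys.stderr)`. The u'...' literal prefixes also disappear, since every str is unicode in Python 3, and _readNamesAndQualifiers drops its per-line .decode('utf8') because files opened through codecs already yield str. A minimal sketch of the error-reporting pattern (the helper name _fail is illustrative only):

    import sys

    def _fail(parser, msg):
        # Python 2 wrote: print >> sys.stderr, msg
        print(msg, file=sys.stderr)   # target stream selected by keyword argument
        parser.print_help()
        exit(1)
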
fsabuilder/morfeuszbuilder/fsa/common.py
... ... @@ -10,9 +10,9 @@ import logging
10 10 class EncodedFormWithoutPrefix(object):
11 11  
12 12 def __init__(self, fromWord, targetWord, lowercase):
13   - assert type(fromWord) == unicode
14   - assert type(targetWord) == unicode
15   - root = u''
  13 + assert type(fromWord) == str
  14 + assert type(targetWord) == str
  15 + root = ''
16 16 for o, b in zip(fromWord, targetWord):
17 17 if ((o.lower() == b.lower()) if lowercase else o == b):
18 18 root += b
... ... @@ -26,8 +26,8 @@ class EncodedFormWithoutPrefix(object):
26 26 class EncodedForm4Generator(object):
27 27  
28 28 def __init__(self, fromWord, targetWord):
29   - assert type(fromWord) == unicode
30   - assert type(targetWord) == unicode
  29 + assert type(fromWord) == str
  30 + assert type(targetWord) == str
31 31 bestEncodedForm = None
32 32 bestPrefixLength = -1
33 33 for prefixLength in range(min(len(targetWord), 5)):
... ... @@ -45,8 +45,8 @@ class EncodedForm4Generator(object):
45 45 class EncodedForm4Analyzer(object):
46 46  
47 47 def __init__(self, fromWord, targetWord):
48   - assert type(fromWord) == unicode
49   - assert type(targetWord) == unicode
  48 + assert type(fromWord) == str
  49 + assert type(targetWord) == str
50 50 bestEncodedForm = None
51 51 bestPrefixCutLength = -1
52 52 for prefixCutLength in range(min(len(fromWord), 5)):
... ... @@ -123,7 +123,7 @@ class Interpretation4Generator(object):
123 123 return hash(self.getSortKey())
124 124  
125 125 def __unicode__(self):
126   - return u'<%s,(%d %s),%d,%d>' % (self.lemma.decode('utf8'), self.encodedForm.cutLength, self.encodedForm.suffixToAdd.decode('utf8'), self.tagnum, self.namenum)
  126 + return '<%s,(%d %s),%d,%d>' % (self.lemma, self.encodedForm.cutLength, self.encodedForm.suffixToAdd.decode('utf8'), self.tagnum, self.namenum)
127 127  
128 128 def __repr__(self):
129   - return unicode(self)
  129 + return str(self)
... ...
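
In Python 3 the unicode builtin is gone and str is the (always unicode) text type, hence `type(word) == unicode` becomes `type(word) == str`. One caveat worth noting: Python 3 never calls __unicode__, so as long as Interpretation4Generator defines only __unicode__, the new __repr__ returning str(self) falls back to object.__str__, which calls __repr__ again and recurses. A sketch of the rename that avoids this, with a stand-in class and placeholder values:

    class Interp:
        def __str__(self):          # Python 3 consults __str__, not __unicode__
            return '<%s,%d>' % ('lemma', 42)
        def __repr__(self):
            return str(self)        # safe once __str__ is actually defined

    print(repr(Interp()))           # <lemma,42>
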
fsabuilder/morfeuszbuilder/fsa/convertinput.py
... ... @@ -4,7 +4,7 @@ Created on Oct 23, 2013
4 4 @author: mlenart
5 5 '''
6 6 import logging
7   -from common import Interpretation4Analyzer
  7 +from .common import Interpretation4Analyzer
8 8 from morfeuszbuilder.fsa.common import Interpretation4Generator
9 9 #from morfeuszbuilder.fsa import externalsort
10 10  
... ... @@ -36,24 +36,24 @@ class LineParser(object):
36 36 def ignoreLine(self, line):
37 37 if not line:
38 38 return True
39   - elif line.strip() == u'#<COPYRIGHT>':
  39 + elif line.strip() == '#<COPYRIGHT>':
40 40 self.inCopyright = True
41 41 return True
42   - elif line.strip() == u'#</COPYRIGHT>':
  42 + elif line.strip() == '#</COPYRIGHT>':
43 43 self.inCopyright = False
44 44 return True
45 45 elif self.inCopyright:
46 46 return True
47 47 elif line and not ' ' in ''.join(line.split('\t')[:2]):
48 48 return False
49   - elif line.startswith(u'#!DICT-ID'):
  49 + elif line.startswith('#!DICT-ID'):
50 50 return True
51 51 else:
52   - logging.warn(u'Ignoring line: "%s" - contains space in text form or lemma' % (line.strip()))
  52 + logging.warn('Ignoring line: "%s" - contains space in text form or lemma' % (line.strip()))
53 53 return True
54 54  
55 55 def parseLine(self, line):
56   - splitLine = line.strip().split(u'\t')
  56 + splitLine = line.strip().split('\t')
57 57 if len(splitLine) == 5:
58 58 orth, base, tag, name, qualifier = splitLine
59 59 elif len(splitLine) == 4:
... ... @@ -69,7 +69,7 @@ class LineParser(object):
69 69  
70 70 def parseQualifiers(string):
71 71 if string:
72   - return frozenset(string.split(u'|'))
  72 + return frozenset(string.split('|'))
73 73 else:
74 74 return frozenset()
75 75  
... ... @@ -87,7 +87,7 @@ class PolimorfConverter4Analyzer(object):
87 87 def _partiallyParseLines(self, inputLines):
88 88 lineParser = LineParser()
89 89 for line in inputLines:
90   - line = line.decode(self.inputEncoding).strip('\n')
  90 + line = line.strip('\n')
91 91 if not lineParser.ignoreLine(line):
92 92 orth, base, tag, name, qualifier = lineParser.parseLine(line)
93 93  
... ... @@ -106,8 +106,8 @@ class PolimorfConverter4Analyzer(object):
106 106 base = orth
107 107  
108 108 yield '\t'.join((
109   - orth.encode(self.inputEncoding),
110   - base.encode(self.inputEncoding),
  109 + orth,
  110 + base,
111 111 str(tagnum),
112 112 str(namenum),
113 113 str(typenum),
... ... @@ -118,8 +118,8 @@ class PolimorfConverter4Analyzer(object):
118 118 base = orth
119 119 typenum = self.segmentRulesManager.shiftOrthMagic.getNewSegnum4ShiftOrth(typenum)
120 120 yield '\t'.join((
121   - orth.encode(self.inputEncoding),
122   - base.encode(self.inputEncoding),
  121 + orth,
  122 + base,
123 123 str(tagnum),
124 124 str(namenum),
125 125 str(typenum),
... ... @@ -127,14 +127,14 @@ class PolimorfConverter4Analyzer(object):
127 127  
128 128 # input lines are encoded and partially parsed
129 129 def _sortLines(self, inputLines):
130   - return sorted(inputLines, key=lambda line: self.encoder.word2SortKey(line.split('\t')[0].decode('utf8')))
  130 + return sorted(inputLines, key=lambda line: self.encoder.word2SortKey(line.split('\t')[0]))
131 131 # return sorted(inputLines, key=lambda line: self.encoder.word2SortKey(line.split('\t')[0].decode('utf8')))
132 132  
133 133 def _reallyParseLines(self, inputLines):
134 134 for line in inputLines:
135   - line = line.decode(self.inputEncoding).strip(u'\n')
  135 + line = line.strip('\n')
136 136 if line:
137   - orth, base, tagnum, namenum, typenum, qualsnum = line.split(u'\t')
  137 + orth, base, tagnum, namenum, typenum, qualsnum = line.split('\t')
138 138 tagnum = int(tagnum)
139 139 namenum = int(namenum)
140 140 typenum = int(typenum)
... ... @@ -159,14 +159,14 @@ class PolimorfConverter4Generator(object):
159 159 def _partiallyParseLines(self, inputLines):
160 160 lineParser = LineParser()
161 161 for line in inputLines:
162   - line = line.decode(self.inputEncoding).strip('\n')
  162 + line = line.strip('\n')
163 163 if not lineParser.ignoreLine(line):
164 164 orth, base, tag, name, qualifier = lineParser.parseLine(line)
165 165 if base:
166   - homonymId = u''
167   - if u':' in base:
168   - assumedBase, assumedHomonymId = base.split(u':', 1)
169   - if assumedBase != u'' and assumedHomonymId != u'' and assumedHomonymId.isalnum():
  166 + homonymId = ''
  167 + if ':' in base:
  168 + assumedBase, assumedHomonymId = base.split(':', 1)
  169 + if assumedBase != '' and assumedHomonymId != '' and assumedHomonymId.isalnum():
170 170 base, homonymId = assumedBase, assumedHomonymId
171 171 tagnum = self.tagset.getTagnum4Tag(tag)
172 172 namenum = self.namesMap[name]
... ... @@ -179,39 +179,39 @@ class PolimorfConverter4Generator(object):
179 179 base = orth
180 180  
181 181 yield '\t'.join((
182   - orth.encode(self.inputEncoding),
183   - base.encode(self.inputEncoding),
  182 + orth,
  183 + base,
184 184 str(tagnum),
185 185 str(namenum),
186 186 str(typenum),
187   - homonymId.encode(self.inputEncoding),
  187 + homonymId,
188 188 str(qualsnum)))
189 189  
190 190 if self.segmentRulesManager.shiftOrthMagic.getNewSegnum4ShiftOrth(typenum) != None:
191 191 base = orth
192 192 typenum = self.segmentRulesManager.shiftOrthMagic.getNewSegnum4ShiftOrth(typenum)
193 193 yield '\t'.join((
194   - orth.encode(self.inputEncoding),
195   - base.encode(self.inputEncoding),
  194 + orth,
  195 + base,
196 196 str(tagnum),
197 197 str(namenum),
198 198 str(typenum),
199   - homonymId.encode(self.inputEncoding),
  199 + homonymId,
200 200 str(qualsnum)))
201 201 else:
202 202 logging.warn('Ignoring line: "%s" - contains empty lemma', line.strip())
203 203  
204 204 # input lines are encoded and partially parsed
205 205 def _sortLines(self, inputLines):
206   - return sorted(inputLines, key=lambda line: (self.encoder.word2SortKey(line.split('\t')[1].decode('utf8')), line))
  206 + return sorted(inputLines, key=lambda line: (self.encoder.word2SortKey(line.split('\t')[1]), line))
207 207  
208 208 def _reallyParseLines(self, inputLines):
209 209 prevLine = None
210 210 for line in inputLines:
211   - line = line.decode(self.inputEncoding).strip(u'\n')
  211 + line = line.strip('\n')
212 212 if line and line != prevLine:
213   - orth, base, tagnum, namenum, typenum, homonymId, qualsnum = line.split(u'\t')
214   -# print orth.encode('utf8'), base.encode('utf8'), homonymId
  213 + orth, base, tagnum, namenum, typenum, homonymId, qualsnum = line.split('\t')
  214 +# print orth, base, homonymId
215 215 tagnum = int(tagnum)
216 216 namenum = int(namenum)
217 217 typenum = int(typenum)
... ...
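
The converters stop decoding and re-encoding every line: with the input opened in text mode via codecs.open(..., 'utf8'), Python 3 hands back str, and the tab-joined intermediate records can stay text until final serialization. A minimal sketch of the new boundary, assuming a UTF-8 tab-separated input file (input.tab is a placeholder name):

    import codecs

    with codecs.open('input.tab', 'r', 'utf8') as f:
        for line in f:
            line = line.strip('\n')     # already str - the old .decode('utf8') is gone
            fields = line.split('\t')   # text operations apply directly
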
fsabuilder/morfeuszbuilder/fsa/encode.py
... ... @@ -24,7 +24,7 @@ class Encoder(object):
24 24 #~ self.qualifiersMap = { frozenset(): 0}
25 25  
26 26 def encodeWord(self, word, lowercase=True):
27   - assert type(word) == unicode
  27 + assert type(word) == str
28 28 res = bytearray(word.lower() if self.lowercase and lowercase else word, self.encoding)
29 29 return res
30 30  
... ... @@ -35,16 +35,16 @@ class Encoder(object):
35 35 return NotImplementedError()
36 36  
37 37 def decodeWord(self, rawWord):
38   - return unicode(str(rawWord).strip('\x00'), self.encoding)
  38 +return bytes(rawWord).strip(b'\x00').decode(self.encoding)
39 39  
40 40 def word2SortKey(self, word):
41 41 normalizedWord = word.lower() if self.lowercase else word
42   - return normalizedWord.encode(self.encoding)
  42 + return normalizedWord
43 43  
44 44 def _encodeTypeNum(self, typenum):
45 45 exceptions.validate(
46 46 typenum <= limits.MAX_SEGMENT_TYPES,
47   - u'Too many segment types. The limit is %d' % limits.MAX_SEGMENT_TYPES)
  47 + 'Too many segment types. The limit is %d' % limits.MAX_SEGMENT_TYPES)
48 48 return bytearray([typenum])
49 49  
50 50 def _hasUpperPrefix(self, casePattern):
... ... @@ -62,13 +62,13 @@ class Encoder(object):
62 62  
63 63 def _encodeTagNum(self, tagnum):
64 64 res = bytearray()
65   - exceptions.validate(tagnum <= limits.MAX_TAGS, u'Too many tags. The limit is %d' % limits.MAX_TAGS)
  65 + exceptions.validate(tagnum <= limits.MAX_TAGS, 'Too many tags. The limit is %d' % limits.MAX_TAGS)
66 66 res.append((tagnum & 0xFF00) >> 8)
67 67 res.append(tagnum & 0x00FF)
68 68 return res
69 69  
70 70 def _encodeNameNum(self, namenum):
71   - exceptions.validate(namenum <= limits.MAX_NAMES, u'Too many named entity types. The limit is %d' % limits.MAX_NAMES)
  71 + exceptions.validate(namenum <= limits.MAX_NAMES, 'Too many named entity types. The limit is %d' % limits.MAX_NAMES)
72 72 return bytearray([namenum])
73 73  
74 74 def _groupInterpsByType(self, interpsList):
... ... @@ -86,7 +86,7 @@ class Encoder(object):
86 86  
87 87 res = bytearray()
88 88  
89   - for typenum, interpsList in segnum2Interps.iteritems():
  89 + for typenum, interpsList in list(segnum2Interps.items()):
90 90 res.extend(self._encodeInterps4Type(typenum, interpsList))
91 91 del interpsList
92 92  
... ... @@ -135,10 +135,10 @@ class MorphEncoder(Encoder):
135 135 return res
136 136  
137 137 def _casePatternsHaveOnlyLowercase(self, casePatterns):
138   - return not any(map(lambda cp: cp and True in cp, casePatterns))
  138 + return not any([cp and True in cp for cp in casePatterns])
139 139  
140 140 def _casePatternsAreOnlyTitles(self, casePatterns):
141   - return all(map(lambda cp: cp and cp[0] == True and not True in cp[1:], casePatterns))
  141 + return all([cp and cp[0] == True and not True in cp[1:] for cp in casePatterns])
142 142  
143 143 def _casePatternsAreEncodedInCompressByte(self, casePatterns):
144 144 return self._casePatternsHaveOnlyLowercase(casePatterns) or self._casePatternsAreOnlyTitles(casePatterns)
... ...
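
encode.py picks up the dictionary-protocol change: iteritems() no longer exists, and items() returns a view. 2to3 conservatively wraps views in list(...), though plain iteration over the view is enough when the dict is not mutated inside the loop:

    segnum2Interps = {1: ['interp-a'], 2: ['interp-b']}   # placeholder data
    # Python 2: for typenum, interps in segnum2Interps.iteritems(): ...
    for typenum, interps in segnum2Interps.items():       # view; no copy made
        print(typenum, len(interps))
    # list(d.items()) is only needed if the loop adds or removes keys.
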
fsabuilder/morfeuszbuilder/fsa/fsa.py
... ... @@ -4,8 +4,8 @@ Created on Oct 8, 2013
4 4 @author: mlenart
5 5 '''
6 6  
7   -import state
8   -import register
  7 +from . import state
  8 +from . import register
9 9 import logging
10 10 from morfeuszbuilder.utils import exceptions
11 11  
... ... @@ -35,7 +35,7 @@ class FSA(object):
35 35 assert not self.closed
36 36 assert data is not None
37 37 encodedWord = self.encodeWord(word)
38   - assert encodedWord > self.encodedPrevWord
  38 + assert self.encodedPrevWord is None or encodedWord > self.encodedPrevWord
39 39 self._addSorted(encodedWord, self.encodeData(data))
40 40 self.encodedPrevWord = encodedWord
41 41  
... ... @@ -43,7 +43,7 @@ class FSA(object):
43 43  
44 44 # debug
45 45 if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0:
46   - logging.info(u'%d %s' % (self.n, word))
  46 + logging.info('%d %s' % (self.n, word))
47 47 for label in encodedWord:
48 48 self.label2Freq[label] = self.label2Freq.get(label, 0) + 1
49 49  
... ... @@ -78,7 +78,7 @@ class FSA(object):
78 78 return res
79 79  
80 80 def _addSorted(self, encodedWord, data):
81   - assert self.encodedPrevWord < encodedWord
  81 + assert self.encodedPrevWord is None or self.encodedPrevWord < encodedWord
82 82 assert type(data) == bytearray
83 83 q = self.initialState
84 84 i = 0
... ...
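
The two assertion changes in fsa.py guard a real Python 3 incompatibility: Python 2 ordered None before every value, so `encodedWord > self.encodedPrevWord` held even for the first word, while Python 3 raises TypeError when comparing a bytearray with None. A short illustration:

    encoded_prev = None
    encoded = bytearray(b'abc')
    # Python 3: `encoded > encoded_prev` raises TypeError ('>' not supported
    # between instances of 'bytearray' and 'NoneType'); Python 2 returned True.
    assert encoded_prev is None or encoded > encoded_prev   # the guarded form
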
fsabuilder/morfeuszbuilder/fsa/serializer.py
... ... @@ -5,7 +5,7 @@ Created on Oct 20, 2013
5 5 '''
6 6  
7 7 import logging
8   -from state import State
  8 +from .state import State
9 9 from morfeuszbuilder.utils import limits, exceptions
10 10 from morfeuszbuilder.utils.serializationUtils import *
11 11  
... ... @@ -106,7 +106,7 @@ class Serializer(object):
106 106 res = bytearray()
107 107 numOfTags = len(tagsMap)
108 108 res.extend(htons(numOfTags))
109   - for tag, tagnum in sorted(tagsMap.iteritems(), key=lambda (tag, tagnum): tagnum):
  109 + for tag, tagnum in sorted(iter(list(tagsMap.items())), key=lambda tag_tagnum: tag_tagnum[1]):
110 110 res.extend(htons(tagnum))
111 111 res.extend(self.fsa.encodeWord(tag))
112 112 res.append(0)
... ... @@ -121,7 +121,7 @@ class Serializer(object):
121 121 #~ return res
122 122  
123 123 def serializeQualifiersMap(self):
124   - label2labelId = dict([ (u'|'.join(qualifiers), n) for qualifiers, n in sorted(self.qualifiersMap.iteritems(), key=lambda (qs, n): n) ])
  124 + label2labelId = dict([ ('|'.join(sorted(qualifiers)), n) for qualifiers, n in sorted(iter(list(self.qualifiersMap.items())), key=lambda qs_n: qs_n[1]) ])
125 125 return self._serializeTags(label2labelId)
126 126 #~ res = bytearray()
127 127 #~ res.extend(htons(len(self.qualifiersMap)))
... ... @@ -186,9 +186,9 @@ class Serializer(object):
186 186 return res
187 187  
188 188 def getSortedTransitions(self, state):
189   - defaultKey = lambda (label, nextState): (-state.label2Freq.get(label, 0), -self.fsa.label2Freq.get(label, 0))
  189 + defaultKey = lambda label_nextState: (-state.label2Freq.get(label_nextState[0], 0), -self.fsa.label2Freq.get(label_nextState[0], 0))
190 190 return list(sorted(
191   - state.transitionsMap.iteritems(),
  191 + iter(list(state.transitionsMap.items())),
192 192 key=defaultKey))
193 193  
194 194 def stateData2bytearray(self, state):
... ... @@ -215,9 +215,9 @@ class SimpleSerializer(Serializer):
215 215  
216 216 def getStateSize(self, state):
217 217 if self.serializeTransitionsData:
218   - return 1 + 5 * len(state.transitionsMap.keys()) + self.getDataSize(state)
  218 + return 1 + 5 * len(list(state.transitionsMap.keys())) + self.getDataSize(state)
219 219 else:
220   - return 1 + 4 * len(state.transitionsMap.keys()) + self.getDataSize(state)
  220 + return 1 + 4 * len(list(state.transitionsMap.keys())) + self.getDataSize(state)
221 221  
222 222 def getDataSize(self, state):
223 223 return len(state.encodedData) if state.isAccepting() else 0
... ... @@ -270,12 +270,12 @@ class VLengthSerializer1(Serializer):
270 270 res = bytearray()
271 271  
272 272 # labels sorted by popularity
273   - sortedLabels = [label for (label, freq) in sorted(self.fsa.label2Freq.iteritems(), key=lambda (label, freq): (-freq, label))]
  273 + sortedLabels = [label for (label, freq) in sorted(iter(list(self.fsa.label2Freq.items())), key=lambda label_freq: (-label_freq[1], label_freq[0]))]
274 274  
275 275 # popular labels table
276 276 self.label2ShortLabel = dict([(label, sortedLabels.index(label) + 1) for label in sortedLabels[:63]])
277 277  
278   - logging.debug(dict([(chr(label), shortLabel) for label, shortLabel in self.label2ShortLabel.items()]))
  278 + logging.debug(dict([(chr(label), shortLabel) for label, shortLabel in list(self.label2ShortLabel.items())]))
279 279  
280 280 # write remaining short labels (zeros)
281 281 for label in range(256):
... ... @@ -354,7 +354,7 @@ class VLengthSerializer1(Serializer):
354 354 offsetSize += 1
355 355 exceptions.validate(
356 356 offset < 256 * 256 * 256,
357   - u'Cannot build the automaton - it would exceed its max size which is %d' % (256 * 256 * 256))
  357 + 'Cannot build the automaton - it would exceed its max size which is %d' % (256 * 256 * 256))
358 358 # assert offset < 256 * 256 * 256 # TODO - przerobic na jakis porzadny wyjatek
359 359 assert offsetSize <= 3
360 360 firstByte |= offsetSize
... ... @@ -380,7 +380,7 @@ class VLengthSerializer1(Serializer):
380 380 newState.encodedData = state.encodedData
381 381 newState.reverseOffset = state.reverseOffset
382 382 newState.offset = state.offset
383   - newState.transitionsMap = dict([(label, nextState) for (label, nextState) in state.transitionsMap.iteritems()])
  383 + newState.transitionsMap = dict([(label, nextState) for (label, nextState) in list(state.transitionsMap.items())])
384 384 # newState.transitionsMap = dict([(label, nextState) for (label, nextState) in state.transitionsMap.iteritems() if not label in self.label2ShortLabel or not self.label2ShortLabel[label] in range(1,64)])
385 385 newState.serializeAsArray = False
386 386 return newState
... ... @@ -388,12 +388,12 @@ class VLengthSerializer1(Serializer):
388 388 def _transitions2ArrayBytes(self, state):
389 389 res = bytearray()
390 390 array = [0] * 64
391   - for label, nextState in state.transitionsMap.iteritems():
  391 + for label, nextState in list(state.transitionsMap.items()):
392 392 if label in self.label2ShortLabel:
393 393 shortLabel = self.label2ShortLabel[label]
394 394 array[shortLabel] = nextState.offset
395 395 logging.debug(array)
396   - for offset in map(lambda x: x if x else 0, array):
  396 + for offset in [x if x else 0 for x in array]:
397 397 res.append(0)
398 398 res.append((offset & 0xFF0000) >> 16)
399 399 res.append((offset & 0x00FF00) >> 8)
... ... @@ -409,8 +409,8 @@ class VLengthSerializer1(Serializer):
409 409 return self._transitions2ListBytes(state)
410 410  
411 411 def _chooseArrayStates(self):
412   - for state1 in self.fsa.initialState.transitionsMap.values():
413   - for state2 in state1.transitionsMap.values():
  412 + for state1 in list(self.fsa.initialState.transitionsMap.values()):
  413 + for state2 in list(state1.transitionsMap.values()):
414 414 # for state3 in state2.transitionsMap.values():
415 415 # state3.serializeAsArray = True
416 416 state2.serializeAsArray = True
... ...
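
serializer.py collects the PEP 3113 fallout: Python 3 removed tuple parameters in function and lambda signatures, so `lambda (tag, tagnum): tagnum` becomes a one-argument lambda with positional indexing (the iter(list(...)) wrappers are 2to3's conservative output and could be plain .items()). Note also that serializeQualifiersMap now joins sorted(qualifiers): frozenset iteration order is not stable under Python 3 hash randomization, so sorting keeps the serialized output deterministic. An equivalent, slightly more readable key using itemgetter (a sketch, not the committed code):

    from operator import itemgetter

    tags_map = {'subst': 2, 'adj': 1}                     # placeholder tags
    # Python 2: sorted(tagsMap.iteritems(), key=lambda (tag, tagnum): tagnum)
    by_num = sorted(tags_map.items(), key=lambda tag_tagnum: tag_tagnum[1])
    assert by_num == sorted(tags_map.items(), key=itemgetter(1))
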
fsabuilder/morfeuszbuilder/fsa/state.py
... ... @@ -45,7 +45,7 @@ class State(object):
45 45 return self.transitionsMap.get(byte, None)
46 46  
47 47 def getRegisterKey(self):
48   - return ( frozenset(self.transitionsMap.iteritems()), tuple(self.encodedData) if self.encodedData else None )
  48 + return ( frozenset(iter(list(self.transitionsMap.items()))), tuple(self.encodedData) if self.encodedData else None )
49 49  
50 50 def isAccepting(self):
51 51 return self.encodedData is not None
... ... @@ -60,10 +60,10 @@ class State(object):
60 60 else:
61 61 return self.encodedData
62 62  
63   - def dfs(self, alreadyVisited, sortKey=lambda (_, state): -state.freq):
  63 + def dfs(self, alreadyVisited, sortKey=lambda __state: -__state[1].freq):
64 64 if not self in alreadyVisited:
65 65 alreadyVisited.add(self)
66   - for _, state in sorted(self.transitionsMap.iteritems(), key=sortKey):
  66 + for _, state in sorted(iter(list(self.transitionsMap.items())), key=sortKey):
67 67 for state1 in state.dfs(alreadyVisited):
68 68 yield state1
69 69 yield self
... ... @@ -77,7 +77,7 @@ class State(object):
77 77 state.offset = currReverseOffset - state.reverseOffset
78 78  
79 79 def debug(self):
80   - print '----------------'
81   - print 'STATE:', self.idx, 'accepting', self.isAccepting()
82   - for label, s in self.transitionsMap.iteritems():
83   - print label, '-->', s.idx
  80 + print('----------------')
  81 + print(('STATE:', self.idx, 'accepting', self.isAccepting()))
  82 + for label, s in list(self.transitionsMap.items()):
  83 + print((label, '-->', s.idx))
... ...
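
The debug() conversion shows a 2to3 artifact that also appears in rulesNFA.debug and the tests below: a multi-argument Python 2 print becomes print of a single tuple, so the output gains parentheses, quotes and commas. Whether that matters for debug helpers is a judgment call; the multi-argument form reproduces the old output:

    idx = 7                  # placeholder state index
    print(('STATE:', idx))   # ('STATE:', 7)  - tuple repr, as committed
    print('STATE:', idx)     # STATE: 7       - the Python 2-style output
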
fsabuilder/morfeuszbuilder/fsa/visualizer.py
... ... @@ -19,7 +19,7 @@ class Visualizer(object):
19 19 nodeLabelsMap = {}
20 20 for idx, state in enumerate(allStates):
21 21 G.add_node(idx, offset=state.offset)
22   - for c, targetState in state.transitionsMap.iteritems():
  22 + for c, targetState in list(state.transitionsMap.items()):
23 23 G.add_edge(idx, allStates.index(targetState))
24 24 label = (chr(c) if c <= 127 else '%') if charLabels \
25 25 else c
... ... @@ -37,11 +37,11 @@ class Visualizer(object):
37 37 nodelist=list([allStates.index(s) for s in allStates if s.isAccepting()]),
38 38 node_shape='s')
39 39 # nx.draw_networkx_nodes(G, pos, nodelist=list([allStates.index(s) for s in allStates if s.isFinal()])), )
40   - nx.draw_networkx_edges(G, pos, edgelist=edgeLabelsMap.keys())
  40 + nx.draw_networkx_edges(G, pos, edgelist=list(edgeLabelsMap.keys()))
41 41 nx.draw_networkx_edge_labels(G, pos, edge_labels = edgeLabelsMap)
42 42 nx.draw_networkx_labels(G, pos, labels=nodeLabelsMap)
43 43 plt.axis('off')
44 44 plt.draw()
45 45 plt.show()
46 46 # plt.savefig('filename.png')
47   - print 'done'
  47 + print('done')
... ...
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
... ... @@ -7,10 +7,10 @@ Created on 23 sty 2014
7 7 import re
8 8 from pyparsing import *
9 9 from morfeuszbuilder.utils import exceptions
10   -from pyparseString import pyparseString
  10 +from .pyparseString import pyparseString
11 11  
12   -identifier = Word(alphas, bodyChars=alphanums+u'_>*+{},')
13   -define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + Word(alphas, bodyChars=alphanums+u'_') + Suppress(')')) + restOfLine + LineEnd() + StringEnd()
  12 +identifier = Word(alphas, bodyChars=alphanums+'_>*+{},')
  13 +define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + Word(alphas, bodyChars=alphanums+'_') + Suppress(')')) + restOfLine + LineEnd() + StringEnd()
14 14 ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd()
15 15 endif = Keyword('#endif').suppress() + LineEnd() + StringEnd()
16 16  
... ... @@ -107,5 +107,5 @@ def preprocess(inputLines, defs, filename):
107 107 ifdefsStack.pop()
108 108 elif line.startswith('#'):
109 109 yield lineNum, line
110   - elif len(ifdefsStack) == 0 or all(map(lambda (name, isActive): (name in defs and isActive) or (name not in defs and not isActive), ifdefsStack)):
  110 + elif len(ifdefsStack) == 0 or all([(name_isActive[0] in defs and name_isActive[1]) or (name_isActive[0] not in defs and not name_isActive[1]) for name_isActive in ifdefsStack]):
111 111 yield lineNum, _processLine(lineNum, line, defines, filename)
... ...
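
The import change is PEP 328: Python 3 dropped implicit relative imports, so sibling modules must be named explicitly, here and in fsa.py, serializer.py, rulesManager.py and configFile.py. Both accepted spellings, sketched for this module's layout:

    # In morfeuszbuilder/segrules/preprocessor.py, Python 2 allowed:
    #     from pyparseString import pyparseString
    # Python 3 requires an explicit form:
    from .pyparseString import pyparseString                              # relative
    # from morfeuszbuilder.segrules.pyparseString import pyparseString    # absolute
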
fsabuilder/morfeuszbuilder/segrules/pyparseString.py
... ... @@ -11,7 +11,7 @@ def pyparseString(rule, lineNum, line, filename):
11 11 try:
12 12 return rule.parseString(line, parseAll=True)
13 13 except ParseException as ex:
14   - msg = u'%s:%d - Preprocessing of segmentation rules failed.\n' % (filename, lineNum)
  14 + msg = '%s:%d - Preprocessing of segmentation rules failed.\n' % (filename, lineNum)
15 15 msg += line + '\n'
16 16 msg += (ex.col - 1) * ' ' + '^\n'
17 17 msg += ex.msg
... ...
fsabuilder/morfeuszbuilder/segrules/rules.py
... ... @@ -126,7 +126,7 @@ class ComplexRule(SegmentRule):
126 126 def __init__(self, children, linenum):
127 127 super(ComplexRule, self).__init__(linenum)
128 128 self.children = children
129   - assert not any(map(lambda c: c.isSinkRule(), children))
  129 + assert not any([c.isSinkRule() for c in children])
130 130  
131 131 def addToNFA(self, fsa):
132 132 endState = RulesNFAState(self, final=True, weak=self.weak, autogenerated=self.autogenerated)
... ... @@ -159,13 +159,13 @@ class ConcatRule(ComplexRule):
159 159 lastChild._doAddToNFA(currStartState, endState)
160 160  
161 161 def allowsEmptySequence(self):
162   - return all(map(lambda rule: rule.allowsEmptySequence(), self.children))
  162 + return all([rule.allowsEmptySequence() for rule in self.children])
163 163  
164 164 def __str__(self):
165   - return u' '.join(map(lambda c: str(c), self.children))
  165 + return ' '.join([str(c) for c in self.children])
166 166  
167 167 def isShiftOrthRule(self):
168   - return all(map(lambda c: c.isShiftOrthRule(), self.children))
  168 + return all([c.isShiftOrthRule() for c in self.children])
169 169  
170 170 def transformToGeneratorVersion(self):
171 171 newChildren = [child.transformToGeneratorVersion() for child in self.children if not child.allowsEmptySequence() or child.isShiftOrthRule()]
... ... @@ -207,11 +207,11 @@ class ConcatRule(ComplexRule):
207 207 for rule in self.children:
208 208 rule.validate(filename)
209 209 if self.children[-1].isShiftOrthRule() \
210   - and not all(map(lambda c: c.isShiftOrthRule(), self.children)):
  210 + and not all([c.isShiftOrthRule() for c in self.children]):
211 211 raise ConfigFileException(
212 212 filename,
213 213 self.linenum,
214   - u'If the rightmost subrule of concatenation "%s" is with ">", than all subrules must be with ">"' % str(self))
  214 + 'If the rightmost subrule of concatenation "%s" is with ">", then all subrules must be with ">"' % str(self))
215 215  
216 216 class OrRule(ComplexRule):
217 217  
... ... @@ -227,17 +227,17 @@ class OrRule(ComplexRule):
227 227 intermEndState.addTransition(None, endState)
228 228  
229 229 def allowsEmptySequence(self):
230   - return any(map(lambda rule: rule.allowsEmptySequence(), self.children))
  230 + return any([rule.allowsEmptySequence() for rule in self.children])
231 231  
232 232 def __str__(self):
233   - return u' | '.join(map(lambda c: str(c), self.children))
  233 + return ' | '.join([str(c) for c in self.children])
234 234  
235 235 def isShiftOrthRule(self):
236   - return all(map(lambda c: c.isShiftOrthRule(), self.children))
  236 + return all([c.isShiftOrthRule() for c in self.children])
237 237  
238 238 def transformToGeneratorVersion(self):
239 239 newChildren = [child.transformToGeneratorVersion() for child in self.children if not child.allowsEmptySequence() or child.isShiftOrthRule()]
240   - newChildren = filter(lambda c: not c.isSinkRule(), newChildren)
  240 + newChildren = [c for c in newChildren if not c.isSinkRule()]
241 241 if newChildren == []:
242 242 return SinkRule()
243 243 else:
... ... @@ -255,12 +255,12 @@ class OrRule(ComplexRule):
255 255 for rule in self.children:
256 256 rule.validate(filename)
257 257 if not (
258   - all(map(lambda c: c.isShiftOrthRule(), self.children))
259   - or not any(map(lambda c: c.isShiftOrthRule(), self.children))):
  258 + all([c.isShiftOrthRule() for c in self.children])
  259 + or not any([c.isShiftOrthRule() for c in self.children])):
260 260 raise ConfigFileException(
261 261 filename,
262 262 self.linenum,
263   - u'All subrules of alternative "%s" must be either with or without ">"' % str(self))
  263 + 'All subrules of alternative "%s" must be either with or without ">"' % str(self))
264 264  
265 265 class ZeroOrMoreRule(UnaryRule):
266 266  
... ... @@ -291,7 +291,7 @@ class ZeroOrMoreRule(UnaryRule):
291 291 return SinkRule()
292 292  
293 293 def __str__(self):
294   - return u'(' + str(self.child) + ')*'
  294 + return '(' + str(self.child) + ')*'
295 295  
296 296 class OptionalRule(UnaryRule):
297 297  
... ... @@ -321,7 +321,7 @@ class OptionalRule(UnaryRule):
321 321 return self.child.transformToGeneratorVersion()
322 322  
323 323 def __str__(self):
324   - return u'(' + str(self.child) + ')?'
  324 + return '(' + str(self.child) + ')?'
325 325  
326 326 class SinkRule(SegmentRule):
327 327  
... ...
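
The comprehension rewrites in rules.py are not just stylistic: Python 3's map() and filter() return lazy iterators, so the old `newChildren = filter(...)` followed by `newChildren == []` would never be true again, and reusing a map result would silently read an exhausted iterator. The list comprehension restores eager list semantics:

    children = [0, 0]                      # placeholder: nothing survives the filter
    lazy = filter(None, children)
    print(lazy == [])                      # False - an iterator never equals a list
    eager = [c for c in children if c]     # as in transformToGeneratorVersion
    print(eager == [])                     # True - a real emptiness test again
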
fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
... ... @@ -49,7 +49,7 @@ class RulesFSA(object):
49 49 def transitionsData2bytearray(self, state):
50 50 res = bytearray()
51 51 # logging.debug('next')
52   - for (segnum, shiftOrth), nextState in sorted(state.transitionsMap.iteritems()):
  52 + for (segnum, shiftOrth), nextState in sorted(state.transitionsMap.items()):
53 53 res.append(segnum)
54 54 if shiftOrth:
55 55 res.append(1)
... ... @@ -57,8 +57,8 @@ class RulesFSA(object):
57 57 res.append(0)
58 58 offset = nextState.offset
59 59 exceptions.validate(offset <= MAX_FSA_SIZE,
60   - u'Segmentation rules are too big and complicated' \
61   - + u'- the resulting automaton would exceed its max size which is %d' \
  60 + 'Segmentation rules are too big and complicated' \
  61 + + '- the resulting automaton would exceed its max size which is %d' \
62 62 % MAX_FSA_SIZE)
63 63 res.extend(htons(offset))
64 64 return res
... ...
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
... ... @@ -7,7 +7,7 @@ import logging
7 7 from morfeuszbuilder.utils.serializationUtils import htons, htonl
8 8 from morfeuszbuilder.utils import serializationUtils
9 9 from morfeuszbuilder.utils import exceptions
10   -import shiftOrthMagic
  10 +from . import shiftOrthMagic
11 11  
12 12 class RulesManager(object):
13 13  
... ... @@ -19,7 +19,7 @@ class RulesManager(object):
19 19 self.shiftOrthMagic = shiftOrthMagic.ShiftOrthMagic()
20 20  
21 21 def _options2Key(self, optionsMap):
22   - return frozenset(optionsMap.items())
  22 + return frozenset(list(optionsMap.items()))
23 23  
24 24 def _key2Options(self, optionsKey):
25 25 return dict(optionsKey)
... ... @@ -46,9 +46,9 @@ class RulesManager(object):
46 46 dfasNum = len(self.options2DFA)
47 47 exceptions.validate(
48 48 dfasNum > 0 and dfasNum < 256,
49   - u'Too many segmentation rules variants')
  49 + 'Too many segmentation rules variants')
50 50 res.append(dfasNum)
51   - for key, dfa in self.options2DFA.iteritems():
  51 + for key, dfa in list(self.options2DFA.items()):
52 52 optionsMap = self._key2Options(key)
53 53 res.extend(self._serializeOptionsMap(optionsMap))
54 54 res.extend(self._serializeDFA(dfa))
... ...
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
... ... @@ -41,16 +41,16 @@ class RulesNFAState(object):
41 41 if not self in visitedStates:
42 42 visitedStates.add(self)
43 43 yield self
44   - for _, nextStates in self.transitionsMap.iteritems():
  44 + for _, nextStates in list(self.transitionsMap.items()):
45 45 for state in nextStates:
46 46 for state1 in state.dfs(visitedStates):
47 47 yield state1
48 48  
49 49 def debug(self):
50   - print '----------------'
51   - print 'STATE:', self.idx
52   - for label, nextStates in self.transitionsMap.iteritems():
53   - print label, '-->', [s.idx for s in sorted(nextStates, key=lambda s: s.idx)]
  50 + print('----------------')
  51 + print(('STATE:', self.idx))
  52 + for label, nextStates in list(self.transitionsMap.items()):
  53 + print((label, '-->', [s.idx for s in sorted(nextStates, key=lambda s: s.idx)]))
54 54  
55 55 class RulesNFA(object):
56 56  
... ... @@ -60,7 +60,7 @@ class RulesNFA(object):
60 60 def _groupOutputByLabels(self, nfaStates):
61 61 res = {}
62 62 for nfaState in nfaStates:
63   - for label, nextStates in nfaState.transitionsMap.iteritems():
  63 + for label, nextStates in list(nfaState.transitionsMap.items()):
64 64 if label is not None:
65 65 # transitionData = nfaState.transitionsDataMap[label]
66 66 segnum, shiftOrth = label
... ... @@ -70,27 +70,21 @@ class RulesNFA(object):
70 70 return res
71 71  
72 72 def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState):
73   - weakHits = map(
74   - lambda state: state.weak,
75   - filter(
76   - lambda state: state.final and not state.autogenerated,
77   - nfaStates))
  73 + weakHits = [state.weak for state in [state for state in nfaStates if state.final and not state.autogenerated]]
78 74 if not all(weakHits) \
79 75 and any(weakHits):
80   - weakState = list(filter(lambda state: state.final and state.weak, nfaStates))[0]
81   - nonWeakState = list(filter(lambda state: state.final and not state.weak, nfaStates))[0]
  76 + weakState = list([state for state in nfaStates if state.final and state.weak])[0]
  77 + nonWeakState = list([state for state in nfaStates if state.final and not state.weak])[0]
82 78 raise InconsistentStateWeaknessException(weakState, nonWeakState)
83   - weak = any(map(
84   - lambda state: state.weak and state.final,
85   - filter(lambda state: not state.autogenerated, nfaStates)))
86   - final = any(map(lambda state: state.final, nfaStates))
  79 + weak = any([state.weak and state.final for state in [state for state in nfaStates if not state.autogenerated]])
  80 + final = any([state.final for state in nfaStates])
87 81 # assert not weak or not final
88 82 if final:
89 83 # dfaState should be final
90 84 # and contain info about weakness
91 85 dfaState.setAsAccepting(weak=weak)
92 86 # dfaState.encodedData = bytearray([1 if weak else 0])
93   - for (segnum, shiftOrth), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems():
  87 + for (segnum, shiftOrth), nextNFAStates in list(self._groupOutputByLabels(nfaStates).items()):
94 88 key = frozenset(nextNFAStates)
95 89 if key in nfaSubset2DFAState:
96 90 nextDFAState = nfaSubset2DFAState[key]
... ... @@ -104,7 +98,7 @@ class RulesNFA(object):
104 98 def convertToDFA(self):
105 99 dfa = RulesFSA()
106 100 startStates = self.initialState.getClosure(set())
107   - assert not any(filter(lambda s: s.final, startStates))
  101 + assert not any([s for s in startStates if s.final])
108 102 dfa.initialState = RulesState()
109 103 self._doConvertState(dfa.initialState, startStates, {frozenset(startStates): dfa.initialState})
110 104 return dfa
... ...
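
The weakHits rewrite is a faithful map-over-filter translation, but the nested `for state in [state for state in ...]` form can be flattened without changing behavior. A self-contained sketch with stand-in states carrying the same three flags as RulesNFAState:

    from collections import namedtuple

    S = namedtuple('S', 'weak final autogenerated')
    nfaStates = [S(True, True, False), S(False, False, False), S(False, True, True)]

    weakHits = [state.weak
                for state in nfaStates
                if state.final and not state.autogenerated]
    assert weakHits == [True]
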
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
... ... @@ -28,11 +28,11 @@ class RulesParser(object):
28 28 key, defs = lineToParse.parseString(line)
29 29 res[key] = tuple(defs)
30 30 except Exception as ex:
31   - raise exceptions.ConfigFileException(segtypesConfigFile.filename, lineNum, u'Error in [options] section: %s' % str(ex))
  31 + raise exceptions.ConfigFileException(segtypesConfigFile.filename, lineNum, 'Error in [options] section: %s' % str(ex))
32 32 return res
33 33  
34 34 def _key2DefAsKey(self, key2Def):
35   - return frozenset(key2Def.items())
  35 + return frozenset(list(key2Def.items()))
36 36  
37 37 def parse(self, filename):
38 38  
... ... @@ -53,12 +53,12 @@ class RulesParser(object):
53 53 res = rulesManager.RulesManager(segtypesHelper, separatorsList)
54 54  
55 55 def2Key = {}
56   - for key, defs in key2Defs.iteritems():
  56 + for key, defs in list(key2Defs.items()):
57 57 for define in defs:
58 58 def2Key[define] = key
59 59  
60 60 resultsMap = {}
61   - for idx, defs in enumerate(itertools.product(*key2Defs.values())):
  61 + for idx, defs in enumerate(itertools.product(*list(key2Defs.values()))):
62 62 key2Def = dict([(def2Key[define], define) for define in defs])
63 63 currRes = []
64 64 resultsMap[self._key2DefAsKey(key2Def)] = currRes
... ... @@ -86,7 +86,7 @@ class RulesParser(object):
86 86  
87 87 self.doShiftOrthMagic(resultsMap, res)
88 88  
89   - for idx, defs in enumerate(itertools.product(*key2Defs.values())):
  89 + for idx, defs in enumerate(itertools.product(*list(key2Defs.values()))):
90 90 key2Def = dict([(def2Key[define], define) for define in defs])
91 91  
92 92 nfa = rulesNFA.RulesNFA()
... ... @@ -115,20 +115,20 @@ class RulesParser(object):
115 115  
116 116 def _createNewTagRule(self, segtype, shiftOrth, lineNum, line, segtypesHelper):
117 117 if not segtypesHelper.hasSegtype(segtype):
118   - raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype))
  118 + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid segment type: %s' % (line, segtype))
119 119 else:
120 120 # return rules.TagRule(segtype)
121 121 return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), shiftOrth, segtype, lineNum)
122 122  
123 123 def _createQuantRule1(self, child, quantity, lineNum, line, segtypesHelper):
124 124 if quantity <= 0:
125   - raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid quantity: %d' % (line, quantity))
  125 + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantity: %d' % (line, quantity))
126 126 else:
127 127 return rules.ConcatRule(quantity * [child], lineNum)
128 128  
129 129 def _createQuantRule2(self, child, leftN, rightN, lineNum, line, segtypesHelper):
130 130 if leftN > rightN or (leftN, rightN) == (0, 0):
131   - raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid quantities: %d %d' % (line, leftN, rightN))
  131 + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantities: %d %d' % (line, leftN, rightN))
132 132 elif leftN == 0:
133 133 children = [rules.OptionalRule(child, lineNum)]
134 134 for n in range(2, rightN + 1):
... ... @@ -140,7 +140,7 @@ class RulesParser(object):
140 140  
141 141 def _createQuantRule3(self, child, quantity, lineNum, line, segtypesHelper):
142 142 if quantity <= 0:
143   - raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid quantity: %d' % (line, quantity))
  143 + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantity: %d' % (line, quantity))
144 144 else:
145 145 return rules.ConcatRule(
146 146 [
... ... @@ -200,7 +200,7 @@ class RulesParser(object):
200 200 shiftOrthSegtypes = set()
201 201 nonShiftOrthSegtypes = set()
202 202  
203   - for _, rules in resultsMap.iteritems():
  203 + for _, rules in list(resultsMap.items()):
204 204 for rule in rules:
205 205 for atomicRule in rule.getAtomicRules():
206 206 if atomicRule.shiftOrth:
... ...
fsabuilder/morfeuszbuilder/segrules/shiftOrthMagic.py
... ... @@ -36,7 +36,7 @@ class ShiftOrthMagic(object):
36 36 for segtype in shiftOrthSegtypes - nonShiftOrthSegtypes:
37 37 self._onlyShiftSegnums.add(segtypesHelper.getSegnum4Segtype(segtype))
38 38  
39   - for _, rules in resultsMap.iteritems():
  39 + for _, rules in list(resultsMap.items()):
40 40 for rule in rules:
41 41 for atomicRule in rule.getAtomicRules():
42 42 if atomicRule.segnum in self._bothShiftAndNonShiftSegnums and atomicRule.shiftOrth:
... ...
fsabuilder/morfeuszbuilder/segrules/test/parserTest.py
... ... @@ -12,18 +12,18 @@ from morfeuszbuilder.fsa import visualizer, serializer
12 12 class Test(unittest.TestCase):
13 13  
14 14 def testParser(self):
15   - print 'do test'
  15 + print('do test')
16 16 t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset'))
17 17 parser = rulesParser.RulesParser(t)
18 18 rulesManager = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat'))
19 19 fsa = rulesManager.getDFA({'aggl': 'permissive', 'praet': 'split'})
20 20 for s in fsa.dfs():
21 21 s.debug()
22   - print 'states:', len(list(fsa.dfs()))
23   - print 'transitions:', fsa.getTransitionsNum()
  22 + print(('states:', len(list(fsa.dfs()))))
  23 + print(('transitions:', fsa.getTransitionsNum()))
24 24 visualizer.Visualizer().visualize(fsa, charLabels=False)
25   - print 'size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray()))
26   - print 'done'
  25 + print(('size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray()))))
  26 + print('done')
27 27  
28 28 if __name__ == "__main__":
29 29 unittest.main()
... ...
fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py
... ... @@ -19,7 +19,7 @@ class Test(unittest.TestCase):
19 19 parsedFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types'])
20 20 linesEnum = parsedFile.enumerateLinesInSection('combinations')
21 21 for lineNum, line in preprocessor.preprocess(linesEnum, ['extra', 'superextra']):
22   - print (lineNum, line)
  22 + print((lineNum, line))
23 23  
24 24  
25 25 if __name__ == "__main__":
... ...
fsabuilder/morfeuszbuilder/tagset/segtypes.py
... ... @@ -11,11 +11,11 @@ from morfeuszbuilder.utils import exceptions
11 11 def _getLemmaHomonymPair(lemma):
12 12 if lemma is None:
13 13 return (None, None)
14   - elif u':' in lemma:
15   - if lemma.replace(u':', '') == '':
  14 + elif ':' in lemma:
  15 + if lemma.replace(':', '') == '':
16 16 return (lemma, None)
17 17 else:
18   - return lemma.split(u':', 1)
  18 + return lemma.split(':', 1)
19 19 else:
20 20 return (lemma, None)
21 21  
... ... @@ -26,7 +26,7 @@ class Segtypes(object):
26 26 self.tagset = tagset
27 27 self.namesMap = namesMap
28 28 self.labelsMap = labelsMap
29   - self._reverseLabelsMap = dict([(v, k) for (k, v) in labelsMap.iteritems()])
  29 + self._reverseLabelsMap = dict([(v, k) for (k, v) in list(labelsMap.items())])
30 30  
31 31 self.filename = segrulesConfigFile.filename
32 32  
... ... @@ -59,13 +59,13 @@ class Segtypes(object):
59 59  
60 60 def _readSegtypes(self, segrulesConfigFile):
61 61 for lineNum, line in segrulesConfigFile.enumerateLinesInSection('segment types'):
62   - assert type(line) == unicode
  62 + assert type(line) == str
63 63 self._validate(
64   - u'Segment type must be a single word',
  64 + 'Segment type must be a single word',
65 65 lineNum,
66 66 re.match(r'^\w+$', line))
67 67 self._validate(
68   - u'Segment type already defined: "%s"' % line,
  68 + 'Segment type already defined: "%s"' % line,
69 69 lineNum,
70 70 line not in self.segtypes)
71 71 self.segtypes.append(line)
... ... @@ -75,13 +75,13 @@ class Segtypes(object):
75 75 for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'):
76 76 self._parsePattern(lineNum, line, withLemma=False)
77 77 self._validate(
78   - u'Pattern that matches everything must be the last one',
  78 + 'Pattern that matches everything must be the last one',
79 79 lineNum - 1,
80 80 not gotWildcardPattern)
81 81 gotWildcardPattern = gotWildcardPattern or self.patternsList[-1].isWildcardPattern()
82 82  
83 83 self._validate(
84   - u'There must be a pattern that matches everything at the end of [tags] section',
  84 + 'There must be a pattern that matches everything at the end of [tags] section',
85 85 lineNum,
86 86 self.patternsList[-1].isWildcardPattern())
87 87  
... ... @@ -94,18 +94,18 @@ class Segtypes(object):
94 94 for f in fields:
95 95 match = re.match(r'(name|labels)=([\S]+)', f, re.U)
96 96 self._validate(
97   - u'invalid name or labels constraint: "%s"' % f,
  97 + 'invalid name or labels constraint: "%s"' % f,
98 98 lineNum,
99 99 match)
100 100 key = match.group(1)
101 101 value = match.group(2)
102 102 self._validate(
103   - u'%s already specified' % key,
  103 + '%s already specified' % key,
104 104 lineNum,
105 105 key not in res)
106 106 if key == 'labels':
107 107 if value:
108   - value = frozenset(value.split(u'|'))
  108 + value = frozenset(value.split('|'))
109 109 else:
110 110 value = frozenset()
111 111 res[key] = value
... ... @@ -115,7 +115,7 @@ class Segtypes(object):
115 115 split = re.split(r'\s+', line.strip())
116 116 if withLemma:
117 117 self._validate(
118   - u'Line in [lexemes] section must contain 3 to 5 fields - segment type, lemma, tag pattern and optional constraints on name and labels',
  118 + 'Line in [lexemes] section must contain 3 to 5 fields - segment type, lemma, tag pattern and optional constraints on name and labels',
119 119 lineNum,
120 120 len(split) in [3, 4, 5])
121 121 segtype = split[0]
... ... @@ -124,7 +124,7 @@ class Segtypes(object):
124 124 additionalConstraints = self._parseAdditionalConstraints(lineNum, split[3:])
125 125 else:
126 126 self._validate(
127   - u'Line in [tags] section must contain 2 to 4 fields - segment type, tag pattern and optional constraints on name and labels',
  127 + 'Line in [tags] section must contain 2 to 4 fields - segment type, tag pattern and optional constraints on name and labels',
128 128 lineNum,
129 129 len(split) in [2, 3, 4])
130 130 segtype = split[0]
... ... @@ -132,32 +132,32 @@ class Segtypes(object):
132 132 pattern = split[1]
133 133 additionalConstraints = self._parseAdditionalConstraints(lineNum, split[2:])
134 134 self._validate(
135   - u'Undeclared segment type: "%s"' % segtype,
  135 + 'Undeclared segment type: "%s"' % segtype,
136 136 lineNum,
137 137 segtype in self.segtypes)
138 138 segnum = self.segtypes.index(segtype)
139 139  
140 140 self._validate(
141   - u'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters',
  141 + 'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters',
142 142 lineNum,
143 143 re.match(r'[a-z_\.\:\%]+', pattern))
144 144  
145 145 segtypePattern = SegtypePattern(
146 146 lemma,
147 147 pattern,
148   - additionalConstraints.get('name', u''),
  148 + additionalConstraints.get('name', ''),
149 149 additionalConstraints.get('labels', frozenset()),
150 150 segnum)
151 151 # print 'segtypePattern', repr(str(segtypePattern))
152 152 self._validate(
153   - u'There is no tag that matches pattern "%s".' % (pattern),
  153 + 'There is no tag that matches pattern "%s".' % (pattern),
154 154 lineNum,
155 155 any([segtypePattern.tryToMatchTag(tag) != -1 for tag in self.tagset.getAllTags()]))
156 156 self.patternsList.append(segtypePattern)
157 157  
158 158 def _getAllExistingLabelsnumCombinations(self, labels):
159 159 if labels:
160   - for labelsCombination, labelsnum in self.labelsMap.iteritems():
  160 + for labelsCombination, labelsnum in list(self.labelsMap.items()):
161 161 if labels <= labelsCombination:
162 162 yield labelsnum
163 163 else:
... ... @@ -232,7 +232,7 @@ class SegtypePattern(object):
232 232 return -1
233 233  
234 234 def isWildcardPattern(self):
235   - return (self.lemma, self.pattern, self.name, self.labels) == (None, '%', u'', frozenset())
  235 + return (self.lemma, self.pattern, self.name, self.labels) == (None, '%', '', frozenset())
236 236  
237 237 def __str__(self):
238   - return u'%s %s %s %s -> %d' % (self.lemma, self.pattern, self.name, self.labels, self.segnum)
  238 + return '%s %s %s %s -> %d' % (self.lemma, self.pattern, self.name, self.labels, self.segnum)
... ...
fsabuilder/morfeuszbuilder/tagset/tagset.py
... ... @@ -20,7 +20,7 @@ class Tagset(object):
20 20 #~ self._name2namenum = {}
21 21 if filename:
22 22 self._doInit(filename, encoding)
23   - self._tagnum2tag = dict(map(lambda (k, v): (v, k), self.tag2tagnum.iteritems()))
  23 + self._tagnum2tag = dict([(k_v[1], k_v[0]) for k_v in iter(list(self.tag2tagnum.items()))])
24 24  
25 25 def _doInit(self, filename, encoding):
26 26 insideTags = False
... ... @@ -33,11 +33,11 @@ class Tagset(object):
33 33 self.tagsetId = match.group(1)
34 34 else:
35 35 raise FSABuilderException('missing TAGSET-ID in first line of tagset file')
36   - elif line == u'[TAGS]':
  36 + elif line == '[TAGS]':
37 37 insideTags = True
38 38 #~ elif line == u'[NAMES]':
39 39 #~ addingTo = Tagset.NAMES
40   - elif line and not line.startswith(u'#'):
  40 + elif line and not line.startswith('#'):
41 41 if not insideTags:
42 42 raise FSABuilderException('"%s" - text outside [TAGS] section in tagset file line %d' % (line, linenum))
43 43 res = self.tag2tagnum
... ... @@ -47,12 +47,12 @@ class Tagset(object):
47 47 tag = line.split(Tagset.SEP)[1]
48 48 if tag in res:
49 49 raise FSABuilderException('duplicate tag: "%s"' % tag)
50   - if int(tagNum) in res.values():
  50 + if int(tagNum) in list(res.values()):
51 51 raise FSABuilderException('line %d: tagId %d assigned for tag "%s" already appeared somewhere else.' % (linenum, int(tagNum), tag))
52 52 res[tag] = int(tagNum)
53 53  
54 54 def getAllTags(self):
55   - return self.tag2tagnum.keys()
  55 + return list(self.tag2tagnum.keys())
56 56  
57 57 def getTagnum4Tag(self, tag):
58 58 if tag in self.tag2tagnum:
... ...
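
tagset.py inverts tag2tagnum through 2to3's mechanical output; a dict comprehension says the same thing directly, and the `in list(res.values())` membership test needs no list() copy, since dict views support `in`. A sketch with placeholder tags:

    tag2tagnum = {'adj': 1, 'subst': 2}
    # 2to3: dict([(k_v[1], k_v[0]) for k_v in iter(list(tag2tagnum.items()))])
    tagnum2tag = {num: tag for tag, num in tag2tagnum.items()}   # idiomatic inversion
    assert tagnum2tag[2] == 'subst'
    assert 1 in tag2tagnum.values()    # view membership; no list() required
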
fsabuilder/morfeuszbuilder/utils/caseconv/generate.py
... ... @@ -90,7 +90,7 @@ def _serializeTable(table):
90 90 def _serializeExtendedTable(table):
91 91 res = []
92 92 res.append('{')
93   - for code, targetCode in table.iteritems():
  93 + for code, targetCode in list(table.items()):
94 94 res.append('{')
95 95 res.append(str(code))
96 96 res.append(',')
... ...
fsabuilder/morfeuszbuilder/utils/configFile.py
... ... @@ -6,10 +6,10 @@ Created on 18 lut 2014
6 6  
7 7 import re
8 8 import codecs
9   -import exceptions
  9 +from . import exceptions
10 10  
11 11 def getHeaderValue(line, lineNum):
12   - m = re.match(ur'\s*\[(.*?)\]\s*(\#.*)?', line)
  12 + m = re.match(r'\s*\[(.*?)\]\s*(\#.*)?', line)
13 13 if m:
14 14 return m.group(1)
15 15 else:
... ... @@ -40,7 +40,7 @@ class ConfigFile(object):
40 40 self.section2Lines[self.currSection].append((lineNum, line))
41 41  
42 42 def _getHeaderValue(self, line, lineNum):
43   - m = re.match(ur'\s*\[(.*?)\]\s*(\#.*)?', line)
  43 + m = re.match(r'\s*\[(.*?)\]\s*(\#.*)?', line)
44 44 if m:
45 45 return m.group(1)
46 46 else:
... ... @@ -48,7 +48,7 @@ class ConfigFile(object):
48 48  
49 49 def enumerateLinesInSection(self, sectionName, ignoreComments=True):
50 50 if sectionName not in self.section2Lines:
51   - raise exceptions.ConfigFileException(self.filename, None, u'Missing section: "%s"' % sectionName)
  51 + raise exceptions.ConfigFileException(self.filename, None, 'Missing section: "%s"' % sectionName)
52 52 if not ignoreComments:
53 53 return self.section2Lines[sectionName]
54 54 else:
... ...
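
configFile.py hits a literal-syntax change: the Python 2 ur'' prefix (raw plus unicode) is a SyntaxError in Python 3, where r'' strings are already unicode, so the section-header regex keeps its escapes with a plain raw literal:

    import re
    # Python 2: re.match(ur'\s*\[(.*?)\]\s*(\#.*)?', line)
    m = re.match(r'\s*\[(.*?)\]\s*(\#.*)?', '[options]  # trailing comment')
    assert m and m.group(1) == 'options'
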
fsabuilder/morfeuszbuilder/utils/exceptions.py
... ... @@ -25,7 +25,7 @@ class SegtypesException(FSABuilderException):
25 25 self.msg = msg
26 26  
27 27 def __str__(self):
28   - return u'Error in segment rules: %s' % self.msg
  28 + return 'Error in segment rules: %s' % self.msg
29 29  
30 30 class ConfigFileException(FSABuilderException):
31 31  
... ... @@ -36,7 +36,7 @@ class ConfigFileException(FSABuilderException):
36 36  
37 37 def __str__(self):
38 38 if self.lineNum:
39   - return u'%s:%d - %s' % (self.filename, self.lineNum, self.msg)
  39 + return '%s:%d - %s' % (self.filename, self.lineNum, self.msg)
40 40 else:
41   - return u'%s - %s' % (self.filename, self.msg)
  41 + return '%s - %s' % (self.filename, self.msg)
42 42  
... ...
fsabuilder/morfeuszbuilder/utils/extractTagset.py
... ... @@ -8,10 +8,10 @@ import sys
8 8 if __name__ == '__main__':
9 9 version = sys.argv[1]
10 10 res = set()
11   - print '#morfeusz-tagset', version
  11 + print('#morfeusz-tagset', version)
12 12 for line in sys.stdin:
13 13 if line.strip():
14 14 tag = line.split('\t')[2]
15 15 res.add(tag)
16 16 for idx, tag in enumerate(sorted(res)):
17   - print str(idx) + '\t' + tag
  17 + print(str(idx) + '\t' + tag)
... ...