Commit 95cbe5ea03398610d704f1ad1d51cc981f0aafea (1 parent: a5484089)
morfeusz_builder → Python 3
Showing 27 changed files with 204 additions and 210 deletions
CMakeLists.txt
@@ -4,7 +4,7 @@ project (Morfeusz) | @@ -4,7 +4,7 @@ project (Morfeusz) | ||
4 | 4 | ||
5 | set (Morfeusz_VERSION_MAJOR 1) | 5 | set (Morfeusz_VERSION_MAJOR 1) |
6 | set (Morfeusz_VERSION_MINOR 9) | 6 | set (Morfeusz_VERSION_MINOR 9) |
7 | -set (Morfeusz_VERSION_PATCH 15) | 7 | +set (Morfeusz_VERSION_PATCH 16) |
8 | set (Morfeusz_VERSION "${Morfeusz_VERSION_MAJOR}.${Morfeusz_VERSION_MINOR}.${Morfeusz_VERSION_PATCH}") | 8 | set (Morfeusz_VERSION "${Morfeusz_VERSION_MAJOR}.${Morfeusz_VERSION_MINOR}.${Morfeusz_VERSION_PATCH}") |
9 | set (Morfeusz_LIB_VERSION "${Morfeusz_VERSION}") | 9 | set (Morfeusz_LIB_VERSION "${Morfeusz_VERSION}") |
10 | if (BUILT_ON) | 10 | if (BUILT_ON) |
fsabuilder/buildanalyzer.sh
1 | -#!/bin/bash | 1 | +#! /bin/bash |
2 | 2 | ||
3 | -python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V1 --trim-supneg -o $1 | 3 | +python3 morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V1 --trim-supneg -o $1 |
fsabuilder/buildgenerator.sh
1 | -#!/bin/bash | 1 | +#! /bin/bash |
2 | 2 | ||
3 | -python morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \ | 3 | +python3 morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \ |
4 | --tagset-file=../input/sgjp-morfeusz.tagset \ | 4 | --tagset-file=../input/sgjp-morfeusz.tagset \ |
5 | --segments-file=../input/segmenty.dat \ | 5 | --segments-file=../input/segmenty.dat \ |
6 | --generator \ | 6 | --generator \ |
fsabuilder/morfeusz_builder
100644 → 100755
1 | -#!/usr/bin/python | 1 | +#! /usr/bin/python3 |
2 | # -*- coding:utf-8 -*- | 2 | # -*- coding:utf-8 -*- |
3 | ''' | 3 | ''' |
4 | Created on 21 paź 2013 | 4 | Created on 21 paź 2013 |
@@ -20,13 +20,13 @@ from optparse import OptionParser | @@ -20,13 +20,13 @@ from optparse import OptionParser | ||
20 | 20 | ||
21 | def _checkOption(opt, parser, msg): | 21 | def _checkOption(opt, parser, msg): |
22 | if opt is None: | 22 | if opt is None: |
23 | - print >> sys.stderr, msg | 23 | + print(msg, file=sys.stderr) |
24 | parser.print_help() | 24 | parser.print_help() |
25 | exit(1) | 25 | exit(1) |
26 | 26 | ||
27 | def _checkCondition(cond, parser, msg): | 27 | def _checkCondition(cond, parser, msg): |
28 | if not cond: | 28 | if not cond: |
29 | - print >> sys.stderr, msg | 29 | + print(msg, file=sys.stderr) |
30 | parser.print_help() | 30 | parser.print_help() |
31 | exit(1) | 31 | exit(1) |
32 | 32 | ||
@@ -40,7 +40,7 @@ def _checkOpen(filename, mode): | @@ -40,7 +40,7 @@ def _checkOpen(filename, mode): | ||
40 | if 'w' in mode: | 40 | if 'w' in mode: |
41 | os.remove(filename) | 41 | os.remove(filename) |
42 | except IOError as ex: | 42 | except IOError as ex: |
43 | - print >> sys.stderr, str(ex) | 43 | + print(str(ex), file=sys.stderr) |
44 | exit(1) | 44 | exit(1) |
45 | 45 | ||
46 | def _getDictFilename(opts, isGenerator): | 46 | def _getDictFilename(opts, isGenerator): |
@@ -162,7 +162,7 @@ def _parseOptions(): | @@ -162,7 +162,7 @@ def _parseOptions(): | ||
162 | _checkOpen(_getDictFilename(opts, isGenerator=True), 'w') | 162 | _checkOpen(_getDictFilename(opts, isGenerator=True), 'w') |
163 | 163 | ||
164 | if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1]: | 164 | if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1]: |
165 | - print >> sys.stderr, '--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1])+')' | 165 | + print('--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1])+')', file=sys.stderr) |
166 | parser.print_help() | 166 | parser.print_help() |
167 | exit(1) | 167 | exit(1) |
168 | 168 | ||
@@ -183,34 +183,34 @@ def _readDictIdAndCopyright(inputFiles): | @@ -183,34 +183,34 @@ def _readDictIdAndCopyright(inputFiles): | ||
183 | with codecs.open(inputFile, 'r', 'utf8') as f: | 183 | with codecs.open(inputFile, 'r', 'utf8') as f: |
184 | inCopyright = False | 184 | inCopyright = False |
185 | for linenum, line in enumerate(f, start=1): | 185 | for linenum, line in enumerate(f, start=1): |
186 | - if dictId is None and line.startswith(u'#!DICT-ID'): | 186 | + if dictId is None and line.startswith('#!DICT-ID'): |
187 | - dictIdTag, _, dictId = line.strip().partition(u' ') | 187 | + dictIdTag, _, dictId = line.strip().partition(' ') |
188 | exceptions.validate( | 188 | exceptions.validate( |
189 | - dictIdTag == u'#!DICT-ID', | 189 | + dictIdTag == '#!DICT-ID', |
190 | - u'Dictionary ID tag must be followed by a space character and dictionary ID string') | 190 | + 'Dictionary ID tag must be followed by a space character and dictionary ID string') |
191 | exceptions.validate( | 191 | exceptions.validate( |
192 | - len(line.split(u' ')) > 1, | 192 | + len(line.split(' ')) > 1, |
193 | - u'%s:%d: Must provide DICT-ID' % (inputFile, linenum)) | 193 | + '%s:%d: Must provide DICT-ID' % (inputFile, linenum)) |
194 | exceptions.validate( | 194 | exceptions.validate( |
195 | - len(line.split(u' ')) == 2, | 195 | + len(line.split(' ')) == 2, |
196 | - u'%s:%d: DICT-ID must not contain spaces' % (inputFile, linenum)) | 196 | + '%s:%d: DICT-ID must not contain spaces' % (inputFile, linenum)) |
197 | - elif copyright is None and line.startswith(u'#<COPYRIGHT>'): | 197 | + elif copyright is None and line.startswith('#<COPYRIGHT>'): |
198 | exceptions.validate( | 198 | exceptions.validate( |
199 | - line.strip() == u'#<COPYRIGHT>', | 199 | + line.strip() == '#<COPYRIGHT>', |
200 | - u'%s:%d: Copyright start tag must be the only one in the line' % (inputFile, linenum)) | 200 | + '%s:%d: Copyright start tag must be the only one in the line' % (inputFile, linenum)) |
201 | 201 | ||
202 | inCopyright = True | 202 | inCopyright = True |
203 | - copyright = u'' | 203 | + copyright = '' |
204 | 204 | ||
205 | - elif line.startswith(u'#</COPYRIGHT>'): | 205 | + elif line.startswith('#</COPYRIGHT>'): |
206 | 206 | ||
207 | exceptions.validate( | 207 | exceptions.validate( |
208 | inCopyright, | 208 | inCopyright, |
209 | - u'%s:%d: Copyright end tag must be preceded by copyright start tag' % (inputFile, linenum)) | 209 | + '%s:%d: Copyright end tag must be preceded by copyright start tag' % (inputFile, linenum)) |
210 | 210 | ||
211 | exceptions.validate( | 211 | exceptions.validate( |
212 | - line.strip() == u'#</COPYRIGHT>', | 212 | + line.strip() == '#</COPYRIGHT>', |
213 | - u'%s:%d: Copyright end tag must be the only one in the line' % (inputFile, linenum)) | 213 | + '%s:%d: Copyright end tag must be the only one in the line' % (inputFile, linenum)) |
214 | 214 | ||
215 | inCopyright = False | 215 | inCopyright = False |
216 | 216 | ||
@@ -219,21 +219,21 @@ def _readDictIdAndCopyright(inputFiles): | @@ -219,21 +219,21 @@ def _readDictIdAndCopyright(inputFiles): | ||
219 | copyright += line | 219 | copyright += line |
220 | 220 | ||
221 | if dictId is None: | 221 | if dictId is None: |
222 | - logging.warn(u'No dictionary ID tag found') | 222 | + logging.warn('No dictionary ID tag found') |
223 | - dictId = u'' | 223 | + dictId = '' |
224 | 224 | ||
225 | if copyright is None: | 225 | if copyright is None: |
226 | - logging.warn(u'No copyright info found') | 226 | + logging.warn('No copyright info found') |
227 | - copyright = u'' | 227 | + copyright = '' |
228 | 228 | ||
229 | return (dictId, copyright) | 229 | return (dictId, copyright) |
230 | 230 | ||
231 | def _readNamesAndQualifiers(inputFiles): | 231 | def _readNamesAndQualifiers(inputFiles): |
232 | - names = set([u'']) | 232 | + names = set(['']) |
233 | qualifiers = set([frozenset()]) | 233 | qualifiers = set([frozenset()]) |
234 | lineParser = convertinput.LineParser() | 234 | lineParser = convertinput.LineParser() |
235 | for line in _concatFiles(inputFiles): | 235 | for line in _concatFiles(inputFiles): |
236 | - line = line.strip().decode('utf8') | 236 | + line = line.strip() |
237 | if not lineParser.ignoreLine(line): | 237 | if not lineParser.ignoreLine(line): |
238 | _, _, _, name, qualifier = lineParser.parseLine(line) | 238 | _, _, _, name, qualifier = lineParser.parseLine(line) |
239 | names.add(name) | 239 | names.add(name) |
@@ -242,7 +242,7 @@ def _readNamesAndQualifiers(inputFiles): | @@ -242,7 +242,7 @@ def _readNamesAndQualifiers(inputFiles): | ||
242 | qualifiersMap = dict([(quals, idx) for idx, quals in enumerate(sorted(qualifiers, key=lambda q: tuple(sorted(q))))]) | 242 | qualifiersMap = dict([(quals, idx) for idx, quals in enumerate(sorted(qualifiers, key=lambda q: tuple(sorted(q))))]) |
243 | exceptions.validate( | 243 | exceptions.validate( |
244 | len(qualifiersMap) <= limits.MAX_QUALIFIERS_COMBINATIONS, | 244 | len(qualifiersMap) <= limits.MAX_QUALIFIERS_COMBINATIONS, |
245 | - u'Too many qualifiers combinations. The limit is %d' % limits.MAX_QUALIFIERS_COMBINATIONS) | 245 | + 'Too many qualifiers combinations. The limit is %d' % limits.MAX_QUALIFIERS_COMBINATIONS) |
246 | 246 | ||
247 | return namesMap, qualifiersMap | 247 | return namesMap, qualifiersMap |
248 | 248 |
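(Note, not part of the commit: the recurring change in this file is Python 2's print statement being replaced by the print() function with an explicit target stream. A minimal sketch of the pattern, with the hypothetical helper name _check_option standing in for the script's checks:

import sys

def _check_option(opt, msg):
    # Python 2's statement form `print >> sys.stderr, msg` becomes the
    # print() function with a file= argument in Python 3.
    if opt is None:
        print(msg, file=sys.stderr)
        raise SystemExit(1)

_check_option('some-value', 'missing option')  # passes silently
)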
fsabuilder/morfeuszbuilder/fsa/common.py
@@ -10,9 +10,9 @@ import logging | @@ -10,9 +10,9 @@ import logging | ||
10 | class EncodedFormWithoutPrefix(object): | 10 | class EncodedFormWithoutPrefix(object): |
11 | 11 | ||
12 | def __init__(self, fromWord, targetWord, lowercase): | 12 | def __init__(self, fromWord, targetWord, lowercase): |
13 | - assert type(fromWord) == unicode | 13 | + assert type(fromWord) == str |
14 | - assert type(targetWord) == unicode | 14 | + assert type(targetWord) == str |
15 | - root = u'' | 15 | + root = '' |
16 | for o, b in zip(fromWord, targetWord): | 16 | for o, b in zip(fromWord, targetWord): |
17 | if ((o.lower() == b.lower()) if lowercase else o == b): | 17 | if ((o.lower() == b.lower()) if lowercase else o == b): |
18 | root += b | 18 | root += b |
@@ -26,8 +26,8 @@ class EncodedFormWithoutPrefix(object): | @@ -26,8 +26,8 @@ class EncodedFormWithoutPrefix(object): | ||
26 | class EncodedForm4Generator(object): | 26 | class EncodedForm4Generator(object): |
27 | 27 | ||
28 | def __init__(self, fromWord, targetWord): | 28 | def __init__(self, fromWord, targetWord): |
29 | - assert type(fromWord) == unicode | 29 | + assert type(fromWord) == str |
30 | - assert type(targetWord) == unicode | 30 | + assert type(targetWord) == str |
31 | bestEncodedForm = None | 31 | bestEncodedForm = None |
32 | bestPrefixLength = -1 | 32 | bestPrefixLength = -1 |
33 | for prefixLength in range(min(len(targetWord), 5)): | 33 | for prefixLength in range(min(len(targetWord), 5)): |
@@ -45,8 +45,8 @@ class EncodedForm4Generator(object): | @@ -45,8 +45,8 @@ class EncodedForm4Generator(object): | ||
45 | class EncodedForm4Analyzer(object): | 45 | class EncodedForm4Analyzer(object): |
46 | 46 | ||
47 | def __init__(self, fromWord, targetWord): | 47 | def __init__(self, fromWord, targetWord): |
48 | - assert type(fromWord) == unicode | 48 | + assert type(fromWord) == str |
49 | - assert type(targetWord) == unicode | 49 | + assert type(targetWord) == str |
50 | bestEncodedForm = None | 50 | bestEncodedForm = None |
51 | bestPrefixCutLength = -1 | 51 | bestPrefixCutLength = -1 |
52 | for prefixCutLength in range(min(len(fromWord), 5)): | 52 | for prefixCutLength in range(min(len(fromWord), 5)): |
@@ -123,7 +123,7 @@ class Interpretation4Generator(object): | @@ -123,7 +123,7 @@ class Interpretation4Generator(object): | ||
123 | return hash(self.getSortKey()) | 123 | return hash(self.getSortKey()) |
124 | 124 | ||
125 | def __unicode__(self): | 125 | def __unicode__(self): |
126 | - return u'<%s,(%d %s),%d,%d>' % (self.lemma.decode('utf8'), self.encodedForm.cutLength, self.encodedForm.suffixToAdd.decode('utf8'), self.tagnum, self.namenum) | 126 | + return '<%s,(%d %s),%d,%d>' % (self.lemma, self.encodedForm.cutLength, self.encodedForm.suffixToAdd.decode('utf8'), self.tagnum, self.namenum) |
127 | 127 | ||
128 | def __repr__(self): | 128 | def __repr__(self): |
129 | - return unicode(self) | 129 | + return str(self) |
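(Note, not part of the commit: in Python 3 all text is str and the unicode type and unicode() builtin are gone, which is what drives the changed asserts and the __repr__ above. An illustrative sketch, with the class name Interpretation standing in for the commit's classes:

class Interpretation:
    def __init__(self, lemma):
        # Python 2 distinguished str and unicode; Python 3 has only str.
        assert type(lemma) == str
        self.lemma = lemma

    def __str__(self):
        return '<%s>' % self.lemma

    def __repr__(self):
        # unicode(self) no longer exists; str(self) takes its place.
        return str(self)

print(repr(Interpretation('kot')))  # <kot>
)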
fsabuilder/morfeuszbuilder/fsa/convertinput.py
@@ -4,7 +4,7 @@ Created on Oct 23, 2013 | @@ -4,7 +4,7 @@ Created on Oct 23, 2013 | ||
4 | @author: mlenart | 4 | @author: mlenart |
5 | ''' | 5 | ''' |
6 | import logging | 6 | import logging |
7 | -from common import Interpretation4Analyzer | 7 | +from .common import Interpretation4Analyzer |
8 | from morfeuszbuilder.fsa.common import Interpretation4Generator | 8 | from morfeuszbuilder.fsa.common import Interpretation4Generator |
9 | #from morfeuszbuilder.fsa import externalsort | 9 | #from morfeuszbuilder.fsa import externalsort |
10 | 10 | ||
@@ -36,24 +36,24 @@ class LineParser(object): | @@ -36,24 +36,24 @@ class LineParser(object): | ||
36 | def ignoreLine(self, line): | 36 | def ignoreLine(self, line): |
37 | if not line: | 37 | if not line: |
38 | return True | 38 | return True |
39 | - elif line.strip() == u'#<COPYRIGHT>': | 39 | + elif line.strip() == '#<COPYRIGHT>': |
40 | self.inCopyright = True | 40 | self.inCopyright = True |
41 | return True | 41 | return True |
42 | - elif line.strip() == u'#</COPYRIGHT>': | 42 | + elif line.strip() == '#</COPYRIGHT>': |
43 | self.inCopyright = False | 43 | self.inCopyright = False |
44 | return True | 44 | return True |
45 | elif self.inCopyright: | 45 | elif self.inCopyright: |
46 | return True | 46 | return True |
47 | elif line and not ' ' in ''.join(line.split('\t')[:2]): | 47 | elif line and not ' ' in ''.join(line.split('\t')[:2]): |
48 | return False | 48 | return False |
49 | - elif line.startswith(u'#!DICT-ID'): | 49 | + elif line.startswith('#!DICT-ID'): |
50 | return True | 50 | return True |
51 | else: | 51 | else: |
52 | - logging.warn(u'Ignoring line: "%s" - contains space in text form or lemma' % (line.strip())) | 52 | + logging.warn('Ignoring line: "%s" - contains space in text form or lemma' % (line.strip())) |
53 | return True | 53 | return True |
54 | 54 | ||
55 | def parseLine(self, line): | 55 | def parseLine(self, line): |
56 | - splitLine = line.strip().split(u'\t') | 56 | + splitLine = line.strip().split('\t') |
57 | if len(splitLine) == 5: | 57 | if len(splitLine) == 5: |
58 | orth, base, tag, name, qualifier = splitLine | 58 | orth, base, tag, name, qualifier = splitLine |
59 | elif len(splitLine) == 4: | 59 | elif len(splitLine) == 4: |
@@ -69,7 +69,7 @@ class LineParser(object): | @@ -69,7 +69,7 @@ class LineParser(object): | ||
69 | 69 | ||
70 | def parseQualifiers(string): | 70 | def parseQualifiers(string): |
71 | if string: | 71 | if string: |
72 | - return frozenset(string.split(u'|')) | 72 | + return frozenset(string.split('|')) |
73 | else: | 73 | else: |
74 | return frozenset() | 74 | return frozenset() |
75 | 75 | ||
@@ -87,7 +87,7 @@ class PolimorfConverter4Analyzer(object): | @@ -87,7 +87,7 @@ class PolimorfConverter4Analyzer(object): | ||
87 | def _partiallyParseLines(self, inputLines): | 87 | def _partiallyParseLines(self, inputLines): |
88 | lineParser = LineParser() | 88 | lineParser = LineParser() |
89 | for line in inputLines: | 89 | for line in inputLines: |
90 | - line = line.decode(self.inputEncoding).strip('\n') | 90 | + line = line.strip('\n') |
91 | if not lineParser.ignoreLine(line): | 91 | if not lineParser.ignoreLine(line): |
92 | orth, base, tag, name, qualifier = lineParser.parseLine(line) | 92 | orth, base, tag, name, qualifier = lineParser.parseLine(line) |
93 | 93 | ||
@@ -106,8 +106,8 @@ class PolimorfConverter4Analyzer(object): | @@ -106,8 +106,8 @@ class PolimorfConverter4Analyzer(object): | ||
106 | base = orth | 106 | base = orth |
107 | 107 | ||
108 | yield '\t'.join(( | 108 | yield '\t'.join(( |
109 | - orth.encode(self.inputEncoding), | 109 | + orth, |
110 | - base.encode(self.inputEncoding), | 110 | + base, |
111 | str(tagnum), | 111 | str(tagnum), |
112 | str(namenum), | 112 | str(namenum), |
113 | str(typenum), | 113 | str(typenum), |
@@ -118,8 +118,8 @@ class PolimorfConverter4Analyzer(object): | @@ -118,8 +118,8 @@ class PolimorfConverter4Analyzer(object): | ||
118 | base = orth | 118 | base = orth |
119 | typenum = self.segmentRulesManager.shiftOrthMagic.getNewSegnum4ShiftOrth(typenum) | 119 | typenum = self.segmentRulesManager.shiftOrthMagic.getNewSegnum4ShiftOrth(typenum) |
120 | yield '\t'.join(( | 120 | yield '\t'.join(( |
121 | - orth.encode(self.inputEncoding), | 121 | + orth, |
122 | - base.encode(self.inputEncoding), | 122 | + base, |
123 | str(tagnum), | 123 | str(tagnum), |
124 | str(namenum), | 124 | str(namenum), |
125 | str(typenum), | 125 | str(typenum), |
@@ -127,14 +127,14 @@ class PolimorfConverter4Analyzer(object): | @@ -127,14 +127,14 @@ class PolimorfConverter4Analyzer(object): | ||
127 | 127 | ||
128 | # input lines are encoded and partially parsed | 128 | # input lines are encoded and partially parsed |
129 | def _sortLines(self, inputLines): | 129 | def _sortLines(self, inputLines): |
130 | - return sorted(inputLines, key=lambda line: self.encoder.word2SortKey(line.split('\t')[0].decode('utf8'))) | 130 | + return sorted(inputLines, key=lambda line: self.encoder.word2SortKey(line.split('\t')[0])) |
131 | # return sorted(inputLines, key=lambda line: self.encoder.word2SortKey(line.split('\t')[0].decode('utf8'))) | 131 | # return sorted(inputLines, key=lambda line: self.encoder.word2SortKey(line.split('\t')[0].decode('utf8'))) |
132 | 132 | ||
133 | def _reallyParseLines(self, inputLines): | 133 | def _reallyParseLines(self, inputLines): |
134 | for line in inputLines: | 134 | for line in inputLines: |
135 | - line = line.decode(self.inputEncoding).strip(u'\n') | 135 | + line = line.strip('\n') |
136 | if line: | 136 | if line: |
137 | - orth, base, tagnum, namenum, typenum, qualsnum = line.split(u'\t') | 137 | + orth, base, tagnum, namenum, typenum, qualsnum = line.split('\t') |
138 | tagnum = int(tagnum) | 138 | tagnum = int(tagnum) |
139 | namenum = int(namenum) | 139 | namenum = int(namenum) |
140 | typenum = int(typenum) | 140 | typenum = int(typenum) |
@@ -159,14 +159,14 @@ class PolimorfConverter4Generator(object): | @@ -159,14 +159,14 @@ class PolimorfConverter4Generator(object): | ||
159 | def _partiallyParseLines(self, inputLines): | 159 | def _partiallyParseLines(self, inputLines): |
160 | lineParser = LineParser() | 160 | lineParser = LineParser() |
161 | for line in inputLines: | 161 | for line in inputLines: |
162 | - line = line.decode(self.inputEncoding).strip('\n') | 162 | + line = line.strip('\n') |
163 | if not lineParser.ignoreLine(line): | 163 | if not lineParser.ignoreLine(line): |
164 | orth, base, tag, name, qualifier = lineParser.parseLine(line) | 164 | orth, base, tag, name, qualifier = lineParser.parseLine(line) |
165 | if base: | 165 | if base: |
166 | - homonymId = u'' | 166 | + homonymId = '' |
167 | - if u':' in base: | 167 | + if ':' in base: |
168 | - assumedBase, assumedHomonymId = base.split(u':', 1) | 168 | + assumedBase, assumedHomonymId = base.split(':', 1) |
169 | - if assumedBase != u'' and assumedHomonymId != u'' and assumedHomonymId.isalnum(): | 169 | + if assumedBase != '' and assumedHomonymId != '' and assumedHomonymId.isalnum(): |
170 | base, homonymId = assumedBase, assumedHomonymId | 170 | base, homonymId = assumedBase, assumedHomonymId |
171 | tagnum = self.tagset.getTagnum4Tag(tag) | 171 | tagnum = self.tagset.getTagnum4Tag(tag) |
172 | namenum = self.namesMap[name] | 172 | namenum = self.namesMap[name] |
@@ -179,39 +179,39 @@ class PolimorfConverter4Generator(object): | @@ -179,39 +179,39 @@ class PolimorfConverter4Generator(object): | ||
179 | base = orth | 179 | base = orth |
180 | 180 | ||
181 | yield '\t'.join(( | 181 | yield '\t'.join(( |
182 | - orth.encode(self.inputEncoding), | 182 | + orth, |
183 | - base.encode(self.inputEncoding), | 183 | + base, |
184 | str(tagnum), | 184 | str(tagnum), |
185 | str(namenum), | 185 | str(namenum), |
186 | str(typenum), | 186 | str(typenum), |
187 | - homonymId.encode(self.inputEncoding), | 187 | + homonymId, |
188 | str(qualsnum))) | 188 | str(qualsnum))) |
189 | 189 | ||
190 | if self.segmentRulesManager.shiftOrthMagic.getNewSegnum4ShiftOrth(typenum) != None: | 190 | if self.segmentRulesManager.shiftOrthMagic.getNewSegnum4ShiftOrth(typenum) != None: |
191 | base = orth | 191 | base = orth |
192 | typenum = self.segmentRulesManager.shiftOrthMagic.getNewSegnum4ShiftOrth(typenum) | 192 | typenum = self.segmentRulesManager.shiftOrthMagic.getNewSegnum4ShiftOrth(typenum) |
193 | yield '\t'.join(( | 193 | yield '\t'.join(( |
194 | - orth.encode(self.inputEncoding), | 194 | + orth, |
195 | - base.encode(self.inputEncoding), | 195 | + base, |
196 | str(tagnum), | 196 | str(tagnum), |
197 | str(namenum), | 197 | str(namenum), |
198 | str(typenum), | 198 | str(typenum), |
199 | - homonymId.encode(self.inputEncoding), | 199 | + homonymId, |
200 | str(qualsnum))) | 200 | str(qualsnum))) |
201 | else: | 201 | else: |
202 | logging.warn('Ignoring line: "%s" - contains empty lemma', line.strip()) | 202 | logging.warn('Ignoring line: "%s" - contains empty lemma', line.strip()) |
203 | 203 | ||
204 | # input lines are encoded and partially parsed | 204 | # input lines are encoded and partially parsed |
205 | def _sortLines(self, inputLines): | 205 | def _sortLines(self, inputLines): |
206 | - return sorted(inputLines, key=lambda line: (self.encoder.word2SortKey(line.split('\t')[1].decode('utf8')), line)) | 206 | + return sorted(inputLines, key=lambda line: (self.encoder.word2SortKey(line.split('\t')[1]), line)) |
207 | 207 | ||
208 | def _reallyParseLines(self, inputLines): | 208 | def _reallyParseLines(self, inputLines): |
209 | prevLine = None | 209 | prevLine = None |
210 | for line in inputLines: | 210 | for line in inputLines: |
211 | - line = line.decode(self.inputEncoding).strip(u'\n') | 211 | + line = line.strip('\n') |
212 | if line and line != prevLine: | 212 | if line and line != prevLine: |
213 | - orth, base, tagnum, namenum, typenum, homonymId, qualsnum = line.split(u'\t') | 213 | + orth, base, tagnum, namenum, typenum, homonymId, qualsnum = line.split('\t') |
214 | -# print orth.encode('utf8'), base.encode('utf8'), homonymId | 214 | +# print orth, base, homonymId |
215 | tagnum = int(tagnum) | 215 | tagnum = int(tagnum) |
216 | namenum = int(namenum) | 216 | namenum = int(namenum) |
217 | typenum = int(typenum) | 217 | typenum = int(typenum) |
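(Note, not part of the commit: the dropped .decode(self.inputEncoding)/.encode(...) calls throughout this file follow from Python 3's I/O model, where text-mode streams already yield str. A minimal sketch of that pattern using an in-memory stream as a stand-in for the .tab input files:

import io

sample = io.StringIO('kot\tkot\tsubst:sg:nom:m2\n')
for raw in sample:
    line = raw.strip('\n')            # already str, no .decode(inputEncoding)
    orth, base, tag = line.split('\t')
    print('\t'.join((orth, base, tag)))  # no .encode() needed when re-joining
)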
fsabuilder/morfeuszbuilder/fsa/encode.py
@@ -24,7 +24,7 @@ class Encoder(object): | @@ -24,7 +24,7 @@ class Encoder(object): | ||
24 | #~ self.qualifiersMap = { frozenset(): 0} | 24 | #~ self.qualifiersMap = { frozenset(): 0} |
25 | 25 | ||
26 | def encodeWord(self, word, lowercase=True): | 26 | def encodeWord(self, word, lowercase=True): |
27 | - assert type(word) == unicode | 27 | + assert type(word) == str |
28 | res = bytearray(word.lower() if self.lowercase and lowercase else word, self.encoding) | 28 | res = bytearray(word.lower() if self.lowercase and lowercase else word, self.encoding) |
29 | return res | 29 | return res |
30 | 30 | ||
@@ -35,16 +35,16 @@ class Encoder(object): | @@ -35,16 +35,16 @@ class Encoder(object): | ||
35 | return NotImplementedError() | 35 | return NotImplementedError() |
36 | 36 | ||
37 | def decodeWord(self, rawWord): | 37 | def decodeWord(self, rawWord): |
38 | - return unicode(str(rawWord).strip('\x00'), self.encoding) | 38 | + return str(str(rawWord).strip('\x00'), self.encoding) |
39 | 39 | ||
40 | def word2SortKey(self, word): | 40 | def word2SortKey(self, word): |
41 | normalizedWord = word.lower() if self.lowercase else word | 41 | normalizedWord = word.lower() if self.lowercase else word |
42 | - return normalizedWord.encode(self.encoding) | 42 | + return normalizedWord |
43 | 43 | ||
44 | def _encodeTypeNum(self, typenum): | 44 | def _encodeTypeNum(self, typenum): |
45 | exceptions.validate( | 45 | exceptions.validate( |
46 | typenum <= limits.MAX_SEGMENT_TYPES, | 46 | typenum <= limits.MAX_SEGMENT_TYPES, |
47 | - u'Too many segment types. The limit is %d' % limits.MAX_SEGMENT_TYPES) | 47 | + 'Too many segment types. The limit is %d' % limits.MAX_SEGMENT_TYPES) |
48 | return bytearray([typenum]) | 48 | return bytearray([typenum]) |
49 | 49 | ||
50 | def _hasUpperPrefix(self, casePattern): | 50 | def _hasUpperPrefix(self, casePattern): |
@@ -62,13 +62,13 @@ class Encoder(object): | @@ -62,13 +62,13 @@ class Encoder(object): | ||
62 | 62 | ||
63 | def _encodeTagNum(self, tagnum): | 63 | def _encodeTagNum(self, tagnum): |
64 | res = bytearray() | 64 | res = bytearray() |
65 | - exceptions.validate(tagnum <= limits.MAX_TAGS, u'Too many tags. The limit is %d' % limits.MAX_TAGS) | 65 | + exceptions.validate(tagnum <= limits.MAX_TAGS, 'Too many tags. The limit is %d' % limits.MAX_TAGS) |
66 | res.append((tagnum & 0xFF00) >> 8) | 66 | res.append((tagnum & 0xFF00) >> 8) |
67 | res.append(tagnum & 0x00FF) | 67 | res.append(tagnum & 0x00FF) |
68 | return res | 68 | return res |
69 | 69 | ||
70 | def _encodeNameNum(self, namenum): | 70 | def _encodeNameNum(self, namenum): |
71 | - exceptions.validate(namenum <= limits.MAX_NAMES, u'Too many named entity types. The limit is %d' % limits.MAX_NAMES) | 71 | + exceptions.validate(namenum <= limits.MAX_NAMES, 'Too many named entity types. The limit is %d' % limits.MAX_NAMES) |
72 | return bytearray([namenum]) | 72 | return bytearray([namenum]) |
73 | 73 | ||
74 | def _groupInterpsByType(self, interpsList): | 74 | def _groupInterpsByType(self, interpsList): |
@@ -86,7 +86,7 @@ class Encoder(object): | @@ -86,7 +86,7 @@ class Encoder(object): | ||
86 | 86 | ||
87 | res = bytearray() | 87 | res = bytearray() |
88 | 88 | ||
89 | - for typenum, interpsList in segnum2Interps.iteritems(): | 89 | + for typenum, interpsList in list(segnum2Interps.items()): |
90 | res.extend(self._encodeInterps4Type(typenum, interpsList)) | 90 | res.extend(self._encodeInterps4Type(typenum, interpsList)) |
91 | del interpsList | 91 | del interpsList |
92 | 92 | ||
@@ -135,10 +135,10 @@ class MorphEncoder(Encoder): | @@ -135,10 +135,10 @@ class MorphEncoder(Encoder): | ||
135 | return res | 135 | return res |
136 | 136 | ||
137 | def _casePatternsHaveOnlyLowercase(self, casePatterns): | 137 | def _casePatternsHaveOnlyLowercase(self, casePatterns): |
138 | - return not any(map(lambda cp: cp and True in cp, casePatterns)) | 138 | + return not any([cp and True in cp for cp in casePatterns]) |
139 | 139 | ||
140 | def _casePatternsAreOnlyTitles(self, casePatterns): | 140 | def _casePatternsAreOnlyTitles(self, casePatterns): |
141 | - return all(map(lambda cp: cp and cp[0] == True and not True in cp[1:], casePatterns)) | 141 | + return all([cp and cp[0] == True and not True in cp[1:] for cp in casePatterns]) |
142 | 142 | ||
143 | def _casePatternsAreEncodedInCompressByte(self, casePatterns): | 143 | def _casePatternsAreEncodedInCompressByte(self, casePatterns): |
144 | return self._casePatternsHaveOnlyLowercase(casePatterns) or self._casePatternsAreOnlyTitles(casePatterns) | 144 | return self._casePatternsHaveOnlyLowercase(casePatterns) or self._casePatternsAreOnlyTitles(casePatterns) |
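(Note, not part of the commit: dict.iteritems() is removed in Python 3, which is why the loop above now uses .items(). A small sketch of the pattern with illustrative data:

segnum2interps = {0: ['subst'], 1: ['adj']}

# .items() returns a view that can be iterated directly; wrapping it in
# list(), as the commit does, is the conservative 2to3-style conversion and
# only matters if the dict is mutated while looping.
for typenum, interps in list(segnum2interps.items()):
    print(typenum, interps)
)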
fsabuilder/morfeuszbuilder/fsa/fsa.py
@@ -4,8 +4,8 @@ Created on Oct 8, 2013 | @@ -4,8 +4,8 @@ Created on Oct 8, 2013 | ||
4 | @author: mlenart | 4 | @author: mlenart |
5 | ''' | 5 | ''' |
6 | 6 | ||
7 | -import state | 7 | +from . import state |
8 | -import register | 8 | +from . import register |
9 | import logging | 9 | import logging |
10 | from morfeuszbuilder.utils import exceptions | 10 | from morfeuszbuilder.utils import exceptions |
11 | 11 | ||
@@ -35,7 +35,7 @@ class FSA(object): | @@ -35,7 +35,7 @@ class FSA(object): | ||
35 | assert not self.closed | 35 | assert not self.closed |
36 | assert data is not None | 36 | assert data is not None |
37 | encodedWord = self.encodeWord(word) | 37 | encodedWord = self.encodeWord(word) |
38 | - assert encodedWord > self.encodedPrevWord | 38 | + assert self.encodedPrevWord is None or encodedWord > self.encodedPrevWord |
39 | self._addSorted(encodedWord, self.encodeData(data)) | 39 | self._addSorted(encodedWord, self.encodeData(data)) |
40 | self.encodedPrevWord = encodedWord | 40 | self.encodedPrevWord = encodedWord |
41 | 41 | ||
@@ -43,7 +43,7 @@ class FSA(object): | @@ -43,7 +43,7 @@ class FSA(object): | ||
43 | 43 | ||
44 | # debug | 44 | # debug |
45 | if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0: | 45 | if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0: |
46 | - logging.info(u'%d %s' % (self.n, word)) | 46 | + logging.info('%d %s' % (self.n, word)) |
47 | for label in encodedWord: | 47 | for label in encodedWord: |
48 | self.label2Freq[label] = self.label2Freq.get(label, 0) + 1 | 48 | self.label2Freq[label] = self.label2Freq.get(label, 0) + 1 |
49 | 49 | ||
@@ -78,7 +78,7 @@ class FSA(object): | @@ -78,7 +78,7 @@ class FSA(object): | ||
78 | return res | 78 | return res |
79 | 79 | ||
80 | def _addSorted(self, encodedWord, data): | 80 | def _addSorted(self, encodedWord, data): |
81 | - assert self.encodedPrevWord < encodedWord | 81 | + assert self.encodedPrevWord is None or self.encodedPrevWord < encodedWord |
82 | assert type(data) == bytearray | 82 | assert type(data) == bytearray |
83 | q = self.initialState | 83 | q = self.initialState |
84 | i = 0 | 84 | i = 0 |
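(Note, not part of the commit: the `is None` guards added to the asserts in fsa.py exist because Python 3 no longer allows ordering comparisons against None. A minimal sketch of why the old assert stops working:

# In Python 2, `bytearray(b'abc') > None` was a legal (if arbitrary)
# comparison that evaluated True, so the original assert also passed for the
# very first added word. Python 3 raises TypeError instead, hence the guard.
encoded_prev_word = None
encoded_word = bytearray(b'abc')
assert encoded_prev_word is None or encoded_word > encoded_prev_word
)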
fsabuilder/morfeuszbuilder/fsa/serializer.py
@@ -5,7 +5,7 @@ Created on Oct 20, 2013 | @@ -5,7 +5,7 @@ Created on Oct 20, 2013 | ||
5 | ''' | 5 | ''' |
6 | 6 | ||
7 | import logging | 7 | import logging |
8 | -from state import State | 8 | +from .state import State |
9 | from morfeuszbuilder.utils import limits, exceptions | 9 | from morfeuszbuilder.utils import limits, exceptions |
10 | from morfeuszbuilder.utils.serializationUtils import * | 10 | from morfeuszbuilder.utils.serializationUtils import * |
11 | 11 | ||
@@ -106,7 +106,7 @@ class Serializer(object): | @@ -106,7 +106,7 @@ class Serializer(object): | ||
106 | res = bytearray() | 106 | res = bytearray() |
107 | numOfTags = len(tagsMap) | 107 | numOfTags = len(tagsMap) |
108 | res.extend(htons(numOfTags)) | 108 | res.extend(htons(numOfTags)) |
109 | - for tag, tagnum in sorted(tagsMap.iteritems(), key=lambda (tag, tagnum): tagnum): | 109 | + for tag, tagnum in sorted(iter(list(tagsMap.items())), key=lambda tag_tagnum: tag_tagnum[1]): |
110 | res.extend(htons(tagnum)) | 110 | res.extend(htons(tagnum)) |
111 | res.extend(self.fsa.encodeWord(tag)) | 111 | res.extend(self.fsa.encodeWord(tag)) |
112 | res.append(0) | 112 | res.append(0) |
@@ -121,7 +121,7 @@ class Serializer(object): | @@ -121,7 +121,7 @@ class Serializer(object): | ||
121 | #~ return res | 121 | #~ return res |
122 | 122 | ||
123 | def serializeQualifiersMap(self): | 123 | def serializeQualifiersMap(self): |
124 | - label2labelId = dict([ (u'|'.join(qualifiers), n) for qualifiers, n in sorted(self.qualifiersMap.iteritems(), key=lambda (qs, n): n) ]) | 124 | + label2labelId = dict([ ('|'.join(sorted(qualifiers)), n) for qualifiers, n in sorted(iter(list(self.qualifiersMap.items())), key=lambda qs_n: qs_n[1]) ]) |
125 | return self._serializeTags(label2labelId) | 125 | return self._serializeTags(label2labelId) |
126 | #~ res = bytearray() | 126 | #~ res = bytearray() |
127 | #~ res.extend(htons(len(self.qualifiersMap))) | 127 | #~ res.extend(htons(len(self.qualifiersMap))) |
@@ -186,9 +186,9 @@ class Serializer(object): | @@ -186,9 +186,9 @@ class Serializer(object): | ||
186 | return res | 186 | return res |
187 | 187 | ||
188 | def getSortedTransitions(self, state): | 188 | def getSortedTransitions(self, state): |
189 | - defaultKey = lambda (label, nextState): (-state.label2Freq.get(label, 0), -self.fsa.label2Freq.get(label, 0)) | 189 | + defaultKey = lambda label_nextState: (-state.label2Freq.get(label_nextState[0], 0), -self.fsa.label2Freq.get(label_nextState[0], 0)) |
190 | return list(sorted( | 190 | return list(sorted( |
191 | - state.transitionsMap.iteritems(), | 191 | + iter(list(state.transitionsMap.items())), |
192 | key=defaultKey)) | 192 | key=defaultKey)) |
193 | 193 | ||
194 | def stateData2bytearray(self, state): | 194 | def stateData2bytearray(self, state): |
@@ -215,9 +215,9 @@ class SimpleSerializer(Serializer): | @@ -215,9 +215,9 @@ class SimpleSerializer(Serializer): | ||
215 | 215 | ||
216 | def getStateSize(self, state): | 216 | def getStateSize(self, state): |
217 | if self.serializeTransitionsData: | 217 | if self.serializeTransitionsData: |
218 | - return 1 + 5 * len(state.transitionsMap.keys()) + self.getDataSize(state) | 218 | + return 1 + 5 * len(list(state.transitionsMap.keys())) + self.getDataSize(state) |
219 | else: | 219 | else: |
220 | - return 1 + 4 * len(state.transitionsMap.keys()) + self.getDataSize(state) | 220 | + return 1 + 4 * len(list(state.transitionsMap.keys())) + self.getDataSize(state) |
221 | 221 | ||
222 | def getDataSize(self, state): | 222 | def getDataSize(self, state): |
223 | return len(state.encodedData) if state.isAccepting() else 0 | 223 | return len(state.encodedData) if state.isAccepting() else 0 |
@@ -270,12 +270,12 @@ class VLengthSerializer1(Serializer): | @@ -270,12 +270,12 @@ class VLengthSerializer1(Serializer): | ||
270 | res = bytearray() | 270 | res = bytearray() |
271 | 271 | ||
272 | # labels sorted by popularity | 272 | # labels sorted by popularity |
273 | - sortedLabels = [label for (label, freq) in sorted(self.fsa.label2Freq.iteritems(), key=lambda (label, freq): (-freq, label))] | 273 | + sortedLabels = [label for (label, freq) in sorted(iter(list(self.fsa.label2Freq.items())), key=lambda label_freq: (-label_freq[1], label_freq[0]))] |
274 | 274 | ||
275 | # popular labels table | 275 | # popular labels table |
276 | self.label2ShortLabel = dict([(label, sortedLabels.index(label) + 1) for label in sortedLabels[:63]]) | 276 | self.label2ShortLabel = dict([(label, sortedLabels.index(label) + 1) for label in sortedLabels[:63]]) |
277 | 277 | ||
278 | - logging.debug(dict([(chr(label), shortLabel) for label, shortLabel in self.label2ShortLabel.items()])) | 278 | + logging.debug(dict([(chr(label), shortLabel) for label, shortLabel in list(self.label2ShortLabel.items())])) |
279 | 279 | ||
280 | # write remaining short labels (zeros) | 280 | # write remaining short labels (zeros) |
281 | for label in range(256): | 281 | for label in range(256): |
@@ -354,7 +354,7 @@ class VLengthSerializer1(Serializer): | @@ -354,7 +354,7 @@ class VLengthSerializer1(Serializer): | ||
354 | offsetSize += 1 | 354 | offsetSize += 1 |
355 | exceptions.validate( | 355 | exceptions.validate( |
356 | offset < 256 * 256 * 256, | 356 | offset < 256 * 256 * 256, |
357 | - u'Cannot build the automaton - it would exceed its max size which is %d' % (256 * 256 * 256)) | 357 | + 'Cannot build the automaton - it would exceed its max size which is %d' % (256 * 256 * 256)) |
358 | # assert offset < 256 * 256 * 256 # TODO - przerobic na jakis porzadny wyjatek | 358 | # assert offset < 256 * 256 * 256 # TODO - przerobic na jakis porzadny wyjatek |
359 | assert offsetSize <= 3 | 359 | assert offsetSize <= 3 |
360 | firstByte |= offsetSize | 360 | firstByte |= offsetSize |
@@ -380,7 +380,7 @@ class VLengthSerializer1(Serializer): | @@ -380,7 +380,7 @@ class VLengthSerializer1(Serializer): | ||
380 | newState.encodedData = state.encodedData | 380 | newState.encodedData = state.encodedData |
381 | newState.reverseOffset = state.reverseOffset | 381 | newState.reverseOffset = state.reverseOffset |
382 | newState.offset = state.offset | 382 | newState.offset = state.offset |
383 | - newState.transitionsMap = dict([(label, nextState) for (label, nextState) in state.transitionsMap.iteritems()]) | 383 | + newState.transitionsMap = dict([(label, nextState) for (label, nextState) in list(state.transitionsMap.items())]) |
384 | # newState.transitionsMap = dict([(label, nextState) for (label, nextState) in state.transitionsMap.iteritems() if not label in self.label2ShortLabel or not self.label2ShortLabel[label] in range(1,64)]) | 384 | # newState.transitionsMap = dict([(label, nextState) for (label, nextState) in state.transitionsMap.iteritems() if not label in self.label2ShortLabel or not self.label2ShortLabel[label] in range(1,64)]) |
385 | newState.serializeAsArray = False | 385 | newState.serializeAsArray = False |
386 | return newState | 386 | return newState |
@@ -388,12 +388,12 @@ class VLengthSerializer1(Serializer): | @@ -388,12 +388,12 @@ class VLengthSerializer1(Serializer): | ||
388 | def _transitions2ArrayBytes(self, state): | 388 | def _transitions2ArrayBytes(self, state): |
389 | res = bytearray() | 389 | res = bytearray() |
390 | array = [0] * 64 | 390 | array = [0] * 64 |
391 | - for label, nextState in state.transitionsMap.iteritems(): | 391 | + for label, nextState in list(state.transitionsMap.items()): |
392 | if label in self.label2ShortLabel: | 392 | if label in self.label2ShortLabel: |
393 | shortLabel = self.label2ShortLabel[label] | 393 | shortLabel = self.label2ShortLabel[label] |
394 | array[shortLabel] = nextState.offset | 394 | array[shortLabel] = nextState.offset |
395 | logging.debug(array) | 395 | logging.debug(array) |
396 | - for offset in map(lambda x: x if x else 0, array): | 396 | + for offset in [x if x else 0 for x in array]: |
397 | res.append(0) | 397 | res.append(0) |
398 | res.append((offset & 0xFF0000) >> 16) | 398 | res.append((offset & 0xFF0000) >> 16) |
399 | res.append((offset & 0x00FF00) >> 8) | 399 | res.append((offset & 0x00FF00) >> 8) |
@@ -409,8 +409,8 @@ class VLengthSerializer1(Serializer): | @@ -409,8 +409,8 @@ class VLengthSerializer1(Serializer): | ||
409 | return self._transitions2ListBytes(state) | 409 | return self._transitions2ListBytes(state) |
410 | 410 | ||
411 | def _chooseArrayStates(self): | 411 | def _chooseArrayStates(self): |
412 | - for state1 in self.fsa.initialState.transitionsMap.values(): | 412 | + for state1 in list(self.fsa.initialState.transitionsMap.values()): |
413 | - for state2 in state1.transitionsMap.values(): | 413 | + for state2 in list(state1.transitionsMap.values()): |
414 | # for state3 in state2.transitionsMap.values(): | 414 | # for state3 in state2.transitionsMap.values(): |
415 | # state3.serializeAsArray = True | 415 | # state3.serializeAsArray = True |
416 | state2.serializeAsArray = True | 416 | state2.serializeAsArray = True |
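(Note, not part of the commit: the rewritten sort keys in serializer.py reflect PEP 3113, which removed tuple parameters from function and lambda signatures. A short sketch of the pattern with made-up tag data:

tags_map = {'subst:sg:nom:m1': 2, 'adj:sg:nom:f:pos': 1}

# Python 2's  key=lambda (tag, tagnum): tagnum  is invalid syntax in Python 3;
# the lambda takes the pair as one argument and indexes into it instead.
for tag, tagnum in sorted(tags_map.items(), key=lambda item: item[1]):
    print(tagnum, tag)
)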
fsabuilder/morfeuszbuilder/fsa/state.py
@@ -45,7 +45,7 @@ class State(object): | @@ -45,7 +45,7 @@ class State(object): | ||
45 | return self.transitionsMap.get(byte, None) | 45 | return self.transitionsMap.get(byte, None) |
46 | 46 | ||
47 | def getRegisterKey(self): | 47 | def getRegisterKey(self): |
48 | - return ( frozenset(self.transitionsMap.iteritems()), tuple(self.encodedData) if self.encodedData else None ) | 48 | + return ( frozenset(iter(list(self.transitionsMap.items()))), tuple(self.encodedData) if self.encodedData else None ) |
49 | 49 | ||
50 | def isAccepting(self): | 50 | def isAccepting(self): |
51 | return self.encodedData is not None | 51 | return self.encodedData is not None |
@@ -60,10 +60,10 @@ class State(object): | @@ -60,10 +60,10 @@ class State(object): | ||
60 | else: | 60 | else: |
61 | return self.encodedData | 61 | return self.encodedData |
62 | 62 | ||
63 | - def dfs(self, alreadyVisited, sortKey=lambda (_, state): -state.freq): | 63 | + def dfs(self, alreadyVisited, sortKey=lambda __state: -__state[1].freq): |
64 | if not self in alreadyVisited: | 64 | if not self in alreadyVisited: |
65 | alreadyVisited.add(self) | 65 | alreadyVisited.add(self) |
66 | - for _, state in sorted(self.transitionsMap.iteritems(), key=sortKey): | 66 | + for _, state in sorted(iter(list(self.transitionsMap.items())), key=sortKey): |
67 | for state1 in state.dfs(alreadyVisited): | 67 | for state1 in state.dfs(alreadyVisited): |
68 | yield state1 | 68 | yield state1 |
69 | yield self | 69 | yield self |
@@ -77,7 +77,7 @@ class State(object): | @@ -77,7 +77,7 @@ class State(object): | ||
77 | state.offset = currReverseOffset - state.reverseOffset | 77 | state.offset = currReverseOffset - state.reverseOffset |
78 | 78 | ||
79 | def debug(self): | 79 | def debug(self): |
80 | - print '----------------' | 80 | + print('----------------') |
81 | - print 'STATE:', self.idx, 'accepting', self.isAccepting() | 81 | + print(('STATE:', self.idx, 'accepting', self.isAccepting())) |
82 | - for label, s in self.transitionsMap.iteritems(): | 82 | + for label, s in list(self.transitionsMap.items()): |
83 | - print label, '-->', s.idx | 83 | + print((label, '-->', s.idx)) |
fsabuilder/morfeuszbuilder/fsa/visualizer.py
@@ -19,7 +19,7 @@ class Visualizer(object): | @@ -19,7 +19,7 @@ class Visualizer(object): | ||
19 | nodeLabelsMap = {} | 19 | nodeLabelsMap = {} |
20 | for idx, state in enumerate(allStates): | 20 | for idx, state in enumerate(allStates): |
21 | G.add_node(idx, offset=state.offset) | 21 | G.add_node(idx, offset=state.offset) |
22 | - for c, targetState in state.transitionsMap.iteritems(): | 22 | + for c, targetState in list(state.transitionsMap.items()): |
23 | G.add_edge(idx, allStates.index(targetState)) | 23 | G.add_edge(idx, allStates.index(targetState)) |
24 | label = (chr(c) if c <= 127 else '%') if charLabels \ | 24 | label = (chr(c) if c <= 127 else '%') if charLabels \ |
25 | else c | 25 | else c |
@@ -37,11 +37,11 @@ class Visualizer(object): | @@ -37,11 +37,11 @@ class Visualizer(object): | ||
37 | nodelist=list([allStates.index(s) for s in allStates if s.isAccepting()]), | 37 | nodelist=list([allStates.index(s) for s in allStates if s.isAccepting()]), |
38 | node_shape='s') | 38 | node_shape='s') |
39 | # nx.draw_networkx_nodes(G, pos, nodelist=list([allStates.index(s) for s in allStates if s.isFinal()])), ) | 39 | # nx.draw_networkx_nodes(G, pos, nodelist=list([allStates.index(s) for s in allStates if s.isFinal()])), ) |
40 | - nx.draw_networkx_edges(G, pos, edgelist=edgeLabelsMap.keys()) | 40 | + nx.draw_networkx_edges(G, pos, edgelist=list(edgeLabelsMap.keys())) |
41 | nx.draw_networkx_edge_labels(G, pos, edge_labels = edgeLabelsMap) | 41 | nx.draw_networkx_edge_labels(G, pos, edge_labels = edgeLabelsMap) |
42 | nx.draw_networkx_labels(G, pos, labels=nodeLabelsMap) | 42 | nx.draw_networkx_labels(G, pos, labels=nodeLabelsMap) |
43 | plt.axis('off') | 43 | plt.axis('off') |
44 | plt.draw() | 44 | plt.draw() |
45 | plt.show() | 45 | plt.show() |
46 | # plt.savefig('filename.png') | 46 | # plt.savefig('filename.png') |
47 | - print 'done' | 47 | + print('done') |
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
@@ -7,10 +7,10 @@ Created on 23 sty 2014 | @@ -7,10 +7,10 @@ Created on 23 sty 2014 | ||
7 | import re | 7 | import re |
8 | from pyparsing import * | 8 | from pyparsing import * |
9 | from morfeuszbuilder.utils import exceptions | 9 | from morfeuszbuilder.utils import exceptions |
10 | -from pyparseString import pyparseString | 10 | +from .pyparseString import pyparseString |
11 | 11 | ||
12 | -identifier = Word(alphas, bodyChars=alphanums+u'_>*+{},') | 12 | +identifier = Word(alphas, bodyChars=alphanums+'_>*+{},') |
13 | -define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + Word(alphas, bodyChars=alphanums+u'_') + Suppress(')')) + restOfLine + LineEnd() + StringEnd() | 13 | +define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + Word(alphas, bodyChars=alphanums+'_') + Suppress(')')) + restOfLine + LineEnd() + StringEnd() |
14 | ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd() | 14 | ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd() |
15 | endif = Keyword('#endif').suppress() + LineEnd() + StringEnd() | 15 | endif = Keyword('#endif').suppress() + LineEnd() + StringEnd() |
16 | 16 | ||
@@ -107,5 +107,5 @@ def preprocess(inputLines, defs, filename): | @@ -107,5 +107,5 @@ def preprocess(inputLines, defs, filename): | ||
107 | ifdefsStack.pop() | 107 | ifdefsStack.pop() |
108 | elif line.startswith('#'): | 108 | elif line.startswith('#'): |
109 | yield lineNum, line | 109 | yield lineNum, line |
110 | - elif len(ifdefsStack) == 0 or all(map(lambda (name, isActive): (name in defs and isActive) or (name not in defs and not isActive), ifdefsStack)): | 110 | + elif len(ifdefsStack) == 0 or all([(name_isActive[0] in defs and name_isActive[1]) or (name_isActive[0] not in defs and not name_isActive[1]) for name_isActive in ifdefsStack]): |
111 | yield lineNum, _processLine(lineNum, line, defines, filename) | 111 | yield lineNum, _processLine(lineNum, line, defines, filename) |
fsabuilder/morfeuszbuilder/segrules/pyparseString.py
@@ -11,7 +11,7 @@ def pyparseString(rule, lineNum, line, filename): | @@ -11,7 +11,7 @@ def pyparseString(rule, lineNum, line, filename): | ||
11 | try: | 11 | try: |
12 | return rule.parseString(line, parseAll=True) | 12 | return rule.parseString(line, parseAll=True) |
13 | except ParseException as ex: | 13 | except ParseException as ex: |
14 | - msg = u'%s:%d - Preprocessing of segmentation rules failed.\n' % (filename, lineNum) | 14 | + msg = '%s:%d - Preprocessing of segmentation rules failed.\n' % (filename, lineNum) |
15 | msg += line + '\n' | 15 | msg += line + '\n' |
16 | msg += (ex.col - 1) * ' ' + '^\n' | 16 | msg += (ex.col - 1) * ' ' + '^\n' |
17 | msg += ex.msg | 17 | msg += ex.msg |
fsabuilder/morfeuszbuilder/segrules/rules.py
@@ -126,7 +126,7 @@ class ComplexRule(SegmentRule): | @@ -126,7 +126,7 @@ class ComplexRule(SegmentRule): | ||
126 | def __init__(self, children, linenum): | 126 | def __init__(self, children, linenum): |
127 | super(ComplexRule, self).__init__(linenum) | 127 | super(ComplexRule, self).__init__(linenum) |
128 | self.children = children | 128 | self.children = children |
129 | - assert not any(map(lambda c: c.isSinkRule(), children)) | 129 | + assert not any([c.isSinkRule() for c in children]) |
130 | 130 | ||
131 | def addToNFA(self, fsa): | 131 | def addToNFA(self, fsa): |
132 | endState = RulesNFAState(self, final=True, weak=self.weak, autogenerated=self.autogenerated) | 132 | endState = RulesNFAState(self, final=True, weak=self.weak, autogenerated=self.autogenerated) |
@@ -159,13 +159,13 @@ class ConcatRule(ComplexRule): | @@ -159,13 +159,13 @@ class ConcatRule(ComplexRule): | ||
159 | lastChild._doAddToNFA(currStartState, endState) | 159 | lastChild._doAddToNFA(currStartState, endState) |
160 | 160 | ||
161 | def allowsEmptySequence(self): | 161 | def allowsEmptySequence(self): |
162 | - return all(map(lambda rule: rule.allowsEmptySequence(), self.children)) | 162 | + return all([rule.allowsEmptySequence() for rule in self.children]) |
163 | 163 | ||
164 | def __str__(self): | 164 | def __str__(self): |
165 | - return u' '.join(map(lambda c: str(c), self.children)) | 165 | + return ' '.join([str(c) for c in self.children]) |
166 | 166 | ||
167 | def isShiftOrthRule(self): | 167 | def isShiftOrthRule(self): |
168 | - return all(map(lambda c: c.isShiftOrthRule(), self.children)) | 168 | + return all([c.isShiftOrthRule() for c in self.children]) |
169 | 169 | ||
170 | def transformToGeneratorVersion(self): | 170 | def transformToGeneratorVersion(self): |
171 | newChildren = [child.transformToGeneratorVersion() for child in self.children if not child.allowsEmptySequence() or child.isShiftOrthRule()] | 171 | newChildren = [child.transformToGeneratorVersion() for child in self.children if not child.allowsEmptySequence() or child.isShiftOrthRule()] |
@@ -207,11 +207,11 @@ class ConcatRule(ComplexRule): | @@ -207,11 +207,11 @@ class ConcatRule(ComplexRule): | ||
207 | for rule in self.children: | 207 | for rule in self.children: |
208 | rule.validate(filename) | 208 | rule.validate(filename) |
209 | if self.children[-1].isShiftOrthRule() \ | 209 | if self.children[-1].isShiftOrthRule() \ |
210 | - and not all(map(lambda c: c.isShiftOrthRule(), self.children)): | 210 | + and not all([c.isShiftOrthRule() for c in self.children]): |
211 | raise ConfigFileException( | 211 | raise ConfigFileException( |
212 | filename, | 212 | filename, |
213 | self.linenum, | 213 | self.linenum, |
214 | - u'If the rightmost subrule of concatenation "%s" is with ">", than all subrules must be with ">"' % str(self)) | 214 | + 'If the rightmost subrule of concatenation "%s" is with ">", than all subrules must be with ">"' % str(self)) |
215 | 215 | ||
216 | class OrRule(ComplexRule): | 216 | class OrRule(ComplexRule): |
217 | 217 | ||
@@ -227,17 +227,17 @@ class OrRule(ComplexRule): | @@ -227,17 +227,17 @@ class OrRule(ComplexRule): | ||
227 | intermEndState.addTransition(None, endState) | 227 | intermEndState.addTransition(None, endState) |
228 | 228 | ||
229 | def allowsEmptySequence(self): | 229 | def allowsEmptySequence(self): |
230 | - return any(map(lambda rule: rule.allowsEmptySequence(), self.children)) | 230 | + return any([rule.allowsEmptySequence() for rule in self.children]) |
231 | 231 | ||
232 | def __str__(self): | 232 | def __str__(self): |
233 | - return u' | '.join(map(lambda c: str(c), self.children)) | 233 | + return ' | '.join([str(c) for c in self.children]) |
234 | 234 | ||
235 | def isShiftOrthRule(self): | 235 | def isShiftOrthRule(self): |
236 | - return all(map(lambda c: c.isShiftOrthRule(), self.children)) | 236 | + return all([c.isShiftOrthRule() for c in self.children]) |
237 | 237 | ||
238 | def transformToGeneratorVersion(self): | 238 | def transformToGeneratorVersion(self): |
239 | newChildren = [child.transformToGeneratorVersion() for child in self.children if not child.allowsEmptySequence() or child.isShiftOrthRule()] | 239 | newChildren = [child.transformToGeneratorVersion() for child in self.children if not child.allowsEmptySequence() or child.isShiftOrthRule()] |
240 | - newChildren = filter(lambda c: not c.isSinkRule(), newChildren) | 240 | + newChildren = [c for c in newChildren if not c.isSinkRule()] |
241 | if newChildren == []: | 241 | if newChildren == []: |
242 | return SinkRule() | 242 | return SinkRule() |
243 | else: | 243 | else: |
@@ -255,12 +255,12 @@ class OrRule(ComplexRule): | @@ -255,12 +255,12 @@ class OrRule(ComplexRule): | ||
255 | for rule in self.children: | 255 | for rule in self.children: |
256 | rule.validate(filename) | 256 | rule.validate(filename) |
257 | if not ( | 257 | if not ( |
258 | - all(map(lambda c: c.isShiftOrthRule(), self.children)) | 258 | + all([c.isShiftOrthRule() for c in self.children]) |
259 | - or not any(map(lambda c: c.isShiftOrthRule(), self.children))): | 259 | + or not any([c.isShiftOrthRule() for c in self.children])): |
260 | raise ConfigFileException( | 260 | raise ConfigFileException( |
261 | filename, | 261 | filename, |
262 | self.linenum, | 262 | self.linenum, |
263 | - u'All subrules of alternative "%s" must be either with or without ">"' % str(self)) | 263 | + 'All subrules of alternative "%s" must be either with or without ">"' % str(self)) |
264 | 264 | ||
265 | class ZeroOrMoreRule(UnaryRule): | 265 | class ZeroOrMoreRule(UnaryRule): |
266 | 266 | ||
@@ -291,7 +291,7 @@ class ZeroOrMoreRule(UnaryRule): | @@ -291,7 +291,7 @@ class ZeroOrMoreRule(UnaryRule): | ||
291 | return SinkRule() | 291 | return SinkRule() |
292 | 292 | ||
293 | def __str__(self): | 293 | def __str__(self): |
294 | - return u'(' + str(self.child) + ')*' | 294 | + return '(' + str(self.child) + ')*' |
295 | 295 | ||
296 | class OptionalRule(UnaryRule): | 296 | class OptionalRule(UnaryRule): |
297 | 297 | ||
@@ -321,7 +321,7 @@ class OptionalRule(UnaryRule): | @@ -321,7 +321,7 @@ class OptionalRule(UnaryRule): | ||
321 | return self.child.transformToGeneratorVersion() | 321 | return self.child.transformToGeneratorVersion() |
322 | 322 | ||
323 | def __str__(self): | 323 | def __str__(self): |
324 | - return u'(' + str(self.child) + ')?' | 324 | + return '(' + str(self.child) + ')?' |
325 | 325 | ||
326 | class SinkRule(SegmentRule): | 326 | class SinkRule(SegmentRule): |
327 | 327 |
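(Note, not part of the commit: the map()/filter() calls in rules.py are rewritten as list comprehensions because in Python 3 those builtins return lazy iterators rather than lists. A sketch of the pattern, where is_shift_orth is a hypothetical predicate standing in for SegmentRule.isShiftOrthRule():

def is_shift_orth(segment_type):
    return segment_type.endswith('>')

children = ['moze>', 'by>', 'c>']

# Comprehensions behave identically under both interpreters and avoid
# consuming an iterator twice, which is why the commit prefers them.
assert all([is_shift_orth(c) for c in children])
assert [c for c in children if is_shift_orth(c)] == children
)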
fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
@@ -49,7 +49,7 @@ class RulesFSA(object): | @@ -49,7 +49,7 @@ class RulesFSA(object): | ||
49 | def transitionsData2bytearray(self, state): | 49 | def transitionsData2bytearray(self, state): |
50 | res = bytearray() | 50 | res = bytearray() |
51 | # logging.debug('next') | 51 | # logging.debug('next') |
52 | - for (segnum, shiftOrth), nextState in sorted(state.transitionsMap.iteritems()): | 52 | + for (segnum, shiftOrth), nextState in sorted(state.transitionsMap.items()): |
53 | res.append(segnum) | 53 | res.append(segnum) |
54 | if shiftOrth: | 54 | if shiftOrth: |
55 | res.append(1) | 55 | res.append(1) |
@@ -57,8 +57,8 @@ class RulesFSA(object): | @@ -57,8 +57,8 @@ class RulesFSA(object): | ||
57 | res.append(0) | 57 | res.append(0) |
58 | offset = nextState.offset | 58 | offset = nextState.offset |
59 | exceptions.validate(offset <= MAX_FSA_SIZE, | 59 | exceptions.validate(offset <= MAX_FSA_SIZE, |
60 | - u'Segmentation rules are too big and complicated' \ | 60 | + 'Segmentation rules are too big and complicated' \ |
61 | - + u'- the resulting automaton would exceed its max size which is %d' \ | 61 | + + '- the resulting automaton would exceed its max size which is %d' \ |
62 | % MAX_FSA_SIZE) | 62 | % MAX_FSA_SIZE) |
63 | res.extend(htons(offset)) | 63 | res.extend(htons(offset)) |
64 | return res | 64 | return res |
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
@@ -7,7 +7,7 @@ import logging | @@ -7,7 +7,7 @@ import logging | ||
7 | from morfeuszbuilder.utils.serializationUtils import htons, htonl | 7 | from morfeuszbuilder.utils.serializationUtils import htons, htonl |
8 | from morfeuszbuilder.utils import serializationUtils | 8 | from morfeuszbuilder.utils import serializationUtils |
9 | from morfeuszbuilder.utils import exceptions | 9 | from morfeuszbuilder.utils import exceptions |
10 | -import shiftOrthMagic | 10 | +from . import shiftOrthMagic |
11 | 11 | ||
12 | class RulesManager(object): | 12 | class RulesManager(object): |
13 | 13 | ||
@@ -19,7 +19,7 @@ class RulesManager(object): | @@ -19,7 +19,7 @@ class RulesManager(object): | ||
19 | self.shiftOrthMagic = shiftOrthMagic.ShiftOrthMagic() | 19 | self.shiftOrthMagic = shiftOrthMagic.ShiftOrthMagic() |
20 | 20 | ||
21 | def _options2Key(self, optionsMap): | 21 | def _options2Key(self, optionsMap): |
22 | - return frozenset(optionsMap.items()) | 22 | + return frozenset(list(optionsMap.items())) |
23 | 23 | ||
24 | def _key2Options(self, optionsKey): | 24 | def _key2Options(self, optionsKey): |
25 | return dict(optionsKey) | 25 | return dict(optionsKey) |
@@ -46,9 +46,9 @@ class RulesManager(object): | @@ -46,9 +46,9 @@ class RulesManager(object): | ||
46 | dfasNum = len(self.options2DFA) | 46 | dfasNum = len(self.options2DFA) |
47 | exceptions.validate( | 47 | exceptions.validate( |
48 | dfasNum > 0 and dfasNum < 256, | 48 | dfasNum > 0 and dfasNum < 256, |
49 | - u'Too many segmentation rules variants') | 49 | + 'Too many segmentation rules variants') |
50 | res.append(dfasNum) | 50 | res.append(dfasNum) |
51 | - for key, dfa in self.options2DFA.iteritems(): | 51 | + for key, dfa in list(self.options2DFA.items()): |
52 | optionsMap = self._key2Options(key) | 52 | optionsMap = self._key2Options(key) |
53 | res.extend(self._serializeOptionsMap(optionsMap)) | 53 | res.extend(self._serializeOptionsMap(optionsMap)) |
54 | res.extend(self._serializeDFA(dfa)) | 54 | res.extend(self._serializeDFA(dfa)) |
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
@@ -41,16 +41,16 @@ class RulesNFAState(object): | @@ -41,16 +41,16 @@ class RulesNFAState(object): | ||
41 | if not self in visitedStates: | 41 | if not self in visitedStates: |
42 | visitedStates.add(self) | 42 | visitedStates.add(self) |
43 | yield self | 43 | yield self |
44 | - for _, nextStates in self.transitionsMap.iteritems(): | 44 | + for _, nextStates in list(self.transitionsMap.items()): |
45 | for state in nextStates: | 45 | for state in nextStates: |
46 | for state1 in state.dfs(visitedStates): | 46 | for state1 in state.dfs(visitedStates): |
47 | yield state1 | 47 | yield state1 |
48 | 48 | ||
49 | def debug(self): | 49 | def debug(self): |
50 | - print '----------------' | 50 | + print('----------------') |
51 | - print 'STATE:', self.idx | 51 | + print(('STATE:', self.idx)) |
52 | - for label, nextStates in self.transitionsMap.iteritems(): | 52 | + for label, nextStates in list(self.transitionsMap.items()): |
53 | - print label, '-->', [s.idx for s in sorted(nextStates, key=lambda s: s.idx)] | 53 | + print((label, '-->', [s.idx for s in sorted(nextStates, key=lambda s: s.idx)])) |
54 | 54 | ||
55 | class RulesNFA(object): | 55 | class RulesNFA(object): |
56 | 56 | ||
@@ -60,7 +60,7 @@ class RulesNFA(object): | @@ -60,7 +60,7 @@ class RulesNFA(object): | ||
60 | def _groupOutputByLabels(self, nfaStates): | 60 | def _groupOutputByLabels(self, nfaStates): |
61 | res = {} | 61 | res = {} |
62 | for nfaState in nfaStates: | 62 | for nfaState in nfaStates: |
63 | - for label, nextStates in nfaState.transitionsMap.iteritems(): | 63 | + for label, nextStates in list(nfaState.transitionsMap.items()): |
64 | if label is not None: | 64 | if label is not None: |
65 | # transitionData = nfaState.transitionsDataMap[label] | 65 | # transitionData = nfaState.transitionsDataMap[label] |
66 | segnum, shiftOrth = label | 66 | segnum, shiftOrth = label |
@@ -70,27 +70,21 @@ class RulesNFA(object): | @@ -70,27 +70,21 @@ class RulesNFA(object): | ||
70 | return res | 70 | return res |
71 | 71 | ||
72 | def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState): | 72 | def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState): |
73 | - weakHits = map( | 73 | + weakHits = [state.weak for state in [state for state in nfaStates if state.final and not state.autogenerated]] |
74 | - lambda state: state.weak, | ||
75 | - filter( | ||
76 | - lambda state: state.final and not state.autogenerated, | ||
77 | - nfaStates)) | ||
78 | if not all(weakHits) \ | 74 | if not all(weakHits) \ |
79 | and any(weakHits): | 75 | and any(weakHits): |
80 | - weakState = list(filter(lambda state: state.final and state.weak, nfaStates))[0] | 76 | + weakState = list([state for state in nfaStates if state.final and state.weak])[0] |
81 | - nonWeakState = list(filter(lambda state: state.final and not state.weak, nfaStates))[0] | 77 | + nonWeakState = list([state for state in nfaStates if state.final and not state.weak])[0] |
82 | raise InconsistentStateWeaknessException(weakState, nonWeakState) | 78 | raise InconsistentStateWeaknessException(weakState, nonWeakState) |
83 | - weak = any(map( | 79 | + weak = any([state.weak and state.final for state in [state for state in nfaStates if not state.autogenerated]]) |
84 | - lambda state: state.weak and state.final, | ||
85 | - filter(lambda state: not state.autogenerated, nfaStates))) | ||
86 | - final = any(map(lambda state: state.final, nfaStates)) | 80 | + final = any([state.final for state in nfaStates]) |
87 | # assert not weak or not final | 81 | # assert not weak or not final |
88 | if final: | 82 | if final: |
89 | # dfaState should be final | 83 | # dfaState should be final |
90 | # and contain info about weakness | 84 | # and contain info about weakness |
91 | dfaState.setAsAccepting(weak=weak) | 85 | dfaState.setAsAccepting(weak=weak) |
92 | # dfaState.encodedData = bytearray([1 if weak else 0]) | 86 | # dfaState.encodedData = bytearray([1 if weak else 0]) |
93 | - for (segnum, shiftOrth), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems(): | 87 | + for (segnum, shiftOrth), nextNFAStates in list(self._groupOutputByLabels(nfaStates).items()): |
94 | key = frozenset(nextNFAStates) | 88 | key = frozenset(nextNFAStates) |
95 | if key in nfaSubset2DFAState: | 89 | if key in nfaSubset2DFAState: |
96 | nextDFAState = nfaSubset2DFAState[key] | 90 | nextDFAState = nfaSubset2DFAState[key] |
@@ -104,7 +98,7 @@ class RulesNFA(object): | @@ -104,7 +98,7 @@ class RulesNFA(object): | ||
104 | def convertToDFA(self): | 98 | def convertToDFA(self): |
105 | dfa = RulesFSA() | 99 | dfa = RulesFSA() |
106 | startStates = self.initialState.getClosure(set()) | 100 | startStates = self.initialState.getClosure(set()) |
107 | - assert not any(filter(lambda s: s.final, startStates)) | 101 | + assert not any([s for s in startStates if s.final]) |
108 | dfa.initialState = RulesState() | 102 | dfa.initialState = RulesState() |
109 | self._doConvertState(dfa.initialState, startStates, {frozenset(startStates): dfa.initialState}) | 103 | self._doConvertState(dfa.initialState, startStates, {frozenset(startStates): dfa.initialState}) |
110 | return dfa | 104 | return dfa |
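The rulesNFA.py changes replace `map`/`filter` plus `lambda` with list comprehensions; in Python 3 `map` and `filter` return lazy iterators, and the surrounding code indexes into and re-scans the results, so materialising a list keeps the old behaviour. A flattened, self-contained equivalent of the `weakHits` expression in `_doConvertState`, with a stand-in namedtuple instead of the real NFA state class:

    from collections import namedtuple

    State = namedtuple('State', 'weak final autogenerated')
    nfa_states = [State(weak=True, final=True, autogenerated=False),
                  State(weak=False, final=False, autogenerated=False)]

    # Python 2: map(lambda s: s.weak, filter(lambda s: s.final and not s.autogenerated, nfa_states))
    weak_hits = [s.weak for s in nfa_states if s.final and not s.autogenerated]
    assert weak_hits == [True]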
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
@@ -28,11 +28,11 @@ class RulesParser(object): | @@ -28,11 +28,11 @@ class RulesParser(object): | ||
28 | key, defs = lineToParse.parseString(line) | 28 | key, defs = lineToParse.parseString(line) |
29 | res[key] = tuple(defs) | 29 | res[key] = tuple(defs) |
30 | except Exception as ex: | 30 | except Exception as ex: |
31 | - raise exceptions.ConfigFileException(segtypesConfigFile.filename, lineNum, u'Error in [options] section: %s' % str(ex)) | 31 | + raise exceptions.ConfigFileException(segtypesConfigFile.filename, lineNum, 'Error in [options] section: %s' % str(ex)) |
32 | return res | 32 | return res |
33 | 33 | ||
34 | def _key2DefAsKey(self, key2Def): | 34 | def _key2DefAsKey(self, key2Def): |
35 | - return frozenset(key2Def.items()) | 35 | + return frozenset(list(key2Def.items())) |
36 | 36 | ||
37 | def parse(self, filename): | 37 | def parse(self, filename): |
38 | 38 | ||
@@ -53,12 +53,12 @@ class RulesParser(object): | @@ -53,12 +53,12 @@ class RulesParser(object): | ||
53 | res = rulesManager.RulesManager(segtypesHelper, separatorsList) | 53 | res = rulesManager.RulesManager(segtypesHelper, separatorsList) |
54 | 54 | ||
55 | def2Key = {} | 55 | def2Key = {} |
56 | - for key, defs in key2Defs.iteritems(): | 56 | + for key, defs in list(key2Defs.items()): |
57 | for define in defs: | 57 | for define in defs: |
58 | def2Key[define] = key | 58 | def2Key[define] = key |
59 | 59 | ||
60 | resultsMap = {} | 60 | resultsMap = {} |
61 | - for idx, defs in enumerate(itertools.product(*key2Defs.values())): | 61 | + for idx, defs in enumerate(itertools.product(*list(key2Defs.values()))): |
62 | key2Def = dict([(def2Key[define], define) for define in defs]) | 62 | key2Def = dict([(def2Key[define], define) for define in defs]) |
63 | currRes = [] | 63 | currRes = [] |
64 | resultsMap[self._key2DefAsKey(key2Def)] = currRes | 64 | resultsMap[self._key2DefAsKey(key2Def)] = currRes |
@@ -86,7 +86,7 @@ class RulesParser(object): | @@ -86,7 +86,7 @@ class RulesParser(object): | ||
86 | 86 | ||
87 | self.doShiftOrthMagic(resultsMap, res) | 87 | self.doShiftOrthMagic(resultsMap, res) |
88 | 88 | ||
89 | - for idx, defs in enumerate(itertools.product(*key2Defs.values())): | 89 | + for idx, defs in enumerate(itertools.product(*list(key2Defs.values()))): |
90 | key2Def = dict([(def2Key[define], define) for define in defs]) | 90 | key2Def = dict([(def2Key[define], define) for define in defs]) |
91 | 91 | ||
92 | nfa = rulesNFA.RulesNFA() | 92 | nfa = rulesNFA.RulesNFA() |
@@ -115,20 +115,20 @@ class RulesParser(object): | @@ -115,20 +115,20 @@ class RulesParser(object): | ||
115 | 115 | ||
116 | def _createNewTagRule(self, segtype, shiftOrth, lineNum, line, segtypesHelper): | 116 | def _createNewTagRule(self, segtype, shiftOrth, lineNum, line, segtypesHelper): |
117 | if not segtypesHelper.hasSegtype(segtype): | 117 | if not segtypesHelper.hasSegtype(segtype): |
118 | - raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype)) | 118 | + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid segment type: %s' % (line, segtype)) |
119 | else: | 119 | else: |
120 | # return rules.TagRule(segtype) | 120 | # return rules.TagRule(segtype) |
121 | return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), shiftOrth, segtype, lineNum) | 121 | return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), shiftOrth, segtype, lineNum) |
122 | 122 | ||
123 | def _createQuantRule1(self, child, quantity, lineNum, line, segtypesHelper): | 123 | def _createQuantRule1(self, child, quantity, lineNum, line, segtypesHelper): |
124 | if quantity <= 0: | 124 | if quantity <= 0: |
125 | - raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid quantity: %d' % (line, quantity)) | 125 | + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantity: %d' % (line, quantity)) |
126 | else: | 126 | else: |
127 | return rules.ConcatRule(quantity * [child], lineNum) | 127 | return rules.ConcatRule(quantity * [child], lineNum) |
128 | 128 | ||
129 | def _createQuantRule2(self, child, leftN, rightN, lineNum, line, segtypesHelper): | 129 | def _createQuantRule2(self, child, leftN, rightN, lineNum, line, segtypesHelper): |
130 | if leftN > rightN or (leftN, rightN) == (0, 0): | 130 | if leftN > rightN or (leftN, rightN) == (0, 0): |
131 | - raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid quantities: %d %d' % (line, leftN, rightN)) | 131 | + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantities: %d %d' % (line, leftN, rightN)) |
132 | elif leftN == 0: | 132 | elif leftN == 0: |
133 | children = [rules.OptionalRule(child, lineNum)] | 133 | children = [rules.OptionalRule(child, lineNum)] |
134 | for n in range(2, rightN + 1): | 134 | for n in range(2, rightN + 1): |
@@ -140,7 +140,7 @@ class RulesParser(object): | @@ -140,7 +140,7 @@ class RulesParser(object): | ||
140 | 140 | ||
141 | def _createQuantRule3(self, child, quantity, lineNum, line, segtypesHelper): | 141 | def _createQuantRule3(self, child, quantity, lineNum, line, segtypesHelper): |
142 | if quantity <= 0: | 142 | if quantity <= 0: |
143 | - raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid quantity: %d' % (line, quantity)) | 143 | + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantity: %d' % (line, quantity)) |
144 | else: | 144 | else: |
145 | return rules.ConcatRule( | 145 | return rules.ConcatRule( |
146 | [ | 146 | [ |
@@ -200,7 +200,7 @@ class RulesParser(object): | @@ -200,7 +200,7 @@ class RulesParser(object): | ||
200 | shiftOrthSegtypes = set() | 200 | shiftOrthSegtypes = set() |
201 | nonShiftOrthSegtypes = set() | 201 | nonShiftOrthSegtypes = set() |
202 | 202 | ||
203 | - for _, rules in resultsMap.iteritems(): | 203 | + for _, rules in list(resultsMap.items()): |
204 | for rule in rules: | 204 | for rule in rules: |
205 | for atomicRule in rule.getAtomicRules(): | 205 | for atomicRule in rule.getAtomicRules(): |
206 | if atomicRule.shiftOrth: | 206 | if atomicRule.shiftOrth: |
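rulesParser.py gets the same `items()`/`values()` substitutions; note that `itertools.product` accepts any iterables, so the `list()` around `key2Defs.values()` is defensive rather than required. A small sketch of the combination loop, with invented option values (the real ones come from the [options] section of segmenty.dat):

    import itertools

    key2Defs = {'aggl': ('strict', 'permissive'), 'praet': ('split', 'composite')}
    for idx, defs in enumerate(itertools.product(*key2Defs.values())):
        print(idx, defs)   # enumerates all four option combinations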
fsabuilder/morfeuszbuilder/segrules/shiftOrthMagic.py
@@ -36,7 +36,7 @@ class ShiftOrthMagic(object): | @@ -36,7 +36,7 @@ class ShiftOrthMagic(object): | ||
36 | for segtype in shiftOrthSegtypes - nonShiftOrthSegtypes: | 36 | for segtype in shiftOrthSegtypes - nonShiftOrthSegtypes: |
37 | self._onlyShiftSegnums.add(segtypesHelper.getSegnum4Segtype(segtype)) | 37 | self._onlyShiftSegnums.add(segtypesHelper.getSegnum4Segtype(segtype)) |
38 | 38 | ||
39 | - for _, rules in resultsMap.iteritems(): | 39 | + for _, rules in list(resultsMap.items()): |
40 | for rule in rules: | 40 | for rule in rules: |
41 | for atomicRule in rule.getAtomicRules(): | 41 | for atomicRule in rule.getAtomicRules(): |
42 | if atomicRule.segnum in self._bothShiftAndNonShiftSegnums and atomicRule.shiftOrth: | 42 | if atomicRule.segnum in self._bothShiftAndNonShiftSegnums and atomicRule.shiftOrth: |
fsabuilder/morfeuszbuilder/segrules/test/parserTest.py
@@ -12,18 +12,18 @@ from morfeuszbuilder.fsa import visualizer, serializer | @@ -12,18 +12,18 @@ from morfeuszbuilder.fsa import visualizer, serializer | ||
12 | class Test(unittest.TestCase): | 12 | class Test(unittest.TestCase): |
13 | 13 | ||
14 | def testParser(self): | 14 | def testParser(self): |
15 | - print 'do test' | 15 | + print('do test') |
16 | t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset')) | 16 | t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset')) |
17 | parser = rulesParser.RulesParser(t) | 17 | parser = rulesParser.RulesParser(t) |
18 | rulesManager = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) | 18 | rulesManager = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) |
19 | fsa = rulesManager.getDFA({'aggl': 'permissive', 'praet': 'split'}) | 19 | fsa = rulesManager.getDFA({'aggl': 'permissive', 'praet': 'split'}) |
20 | for s in fsa.dfs(): | 20 | for s in fsa.dfs(): |
21 | s.debug() | 21 | s.debug() |
22 | - print 'states:', len(list(fsa.dfs())) | 22 | + print(('states:', len(list(fsa.dfs())))) |
23 | - print 'transitions:', fsa.getTransitionsNum() | 23 | + print(('transitions:', fsa.getTransitionsNum())) |
24 | visualizer.Visualizer().visualize(fsa, charLabels=False) | 24 | visualizer.Visualizer().visualize(fsa, charLabels=False) |
25 | - print 'size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray())) | 25 | + print(('size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray())))) |
26 | - print 'done' | 26 | + print('done') |
27 | 27 | ||
28 | if __name__ == "__main__": | 28 | if __name__ == "__main__": |
29 | unittest.main() | 29 | unittest.main() |
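One side effect of the mechanical print conversion in this test: the Python 2 statement `print 'states:', n` wrote space-separated values, whereas the converted `print(('states:', n))` prints a tuple repr. The difference, in a self-contained form with a dummy count:

    states_count = 12                     # dummy value, not taken from an actual test run
    print('states:', states_count)        # -> states: 12   (the old output format)
    print(('states:', states_count))      # -> ('states:', 12)   (what the converted test prints)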
fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py
@@ -19,7 +19,7 @@ class Test(unittest.TestCase): | @@ -19,7 +19,7 @@ class Test(unittest.TestCase): | ||
19 | parsedFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types']) | 19 | parsedFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types']) |
20 | linesEnum = parsedFile.enumerateLinesInSection('combinations') | 20 | linesEnum = parsedFile.enumerateLinesInSection('combinations') |
21 | for lineNum, line in preprocessor.preprocess(linesEnum, ['extra', 'superextra']): | 21 | for lineNum, line in preprocessor.preprocess(linesEnum, ['extra', 'superextra']): |
22 | - print (lineNum, line) | 22 | + print((lineNum, line)) |
23 | 23 | ||
24 | 24 | ||
25 | if __name__ == "__main__": | 25 | if __name__ == "__main__": |
fsabuilder/morfeuszbuilder/tagset/segtypes.py
@@ -11,11 +11,11 @@ from morfeuszbuilder.utils import exceptions | @@ -11,11 +11,11 @@ from morfeuszbuilder.utils import exceptions | ||
11 | def _getLemmaHomonymPair(lemma): | 11 | def _getLemmaHomonymPair(lemma): |
12 | if lemma is None: | 12 | if lemma is None: |
13 | return (None, None) | 13 | return (None, None) |
14 | - elif u':' in lemma: | 14 | + elif ':' in lemma: |
15 | - if lemma.replace(u':', '') == '': | 15 | + if lemma.replace(':', '') == '': |
16 | return (lemma, None) | 16 | return (lemma, None) |
17 | else: | 17 | else: |
18 | - return lemma.split(u':', 1) | 18 | + return lemma.split(':', 1) |
19 | else: | 19 | else: |
20 | return (lemma, None) | 20 | return (lemma, None) |
21 | 21 | ||
@@ -26,7 +26,7 @@ class Segtypes(object): | @@ -26,7 +26,7 @@ class Segtypes(object): | ||
26 | self.tagset = tagset | 26 | self.tagset = tagset |
27 | self.namesMap = namesMap | 27 | self.namesMap = namesMap |
28 | self.labelsMap = labelsMap | 28 | self.labelsMap = labelsMap |
29 | - self._reverseLabelsMap = dict([(v, k) for (k, v) in labelsMap.iteritems()]) | 29 | + self._reverseLabelsMap = dict([(v, k) for (k, v) in list(labelsMap.items())]) |
30 | 30 | ||
31 | self.filename = segrulesConfigFile.filename | 31 | self.filename = segrulesConfigFile.filename |
32 | 32 | ||
@@ -59,13 +59,13 @@ class Segtypes(object): | @@ -59,13 +59,13 @@ class Segtypes(object): | ||
59 | 59 | ||
60 | def _readSegtypes(self, segrulesConfigFile): | 60 | def _readSegtypes(self, segrulesConfigFile): |
61 | for lineNum, line in segrulesConfigFile.enumerateLinesInSection('segment types'): | 61 | for lineNum, line in segrulesConfigFile.enumerateLinesInSection('segment types'): |
62 | - assert type(line) == unicode | 62 | + assert type(line) == str |
63 | self._validate( | 63 | self._validate( |
64 | - u'Segment type must be a single word', | 64 | + 'Segment type must be a single word', |
65 | lineNum, | 65 | lineNum, |
66 | re.match(r'^\w+$', line)) | 66 | re.match(r'^\w+$', line)) |
67 | self._validate( | 67 | self._validate( |
68 | - u'Segment type already defined: "%s"' % line, | 68 | + 'Segment type already defined: "%s"' % line, |
69 | lineNum, | 69 | lineNum, |
70 | line not in self.segtypes) | 70 | line not in self.segtypes) |
71 | self.segtypes.append(line) | 71 | self.segtypes.append(line) |
@@ -75,13 +75,13 @@ class Segtypes(object): | @@ -75,13 +75,13 @@ class Segtypes(object): | ||
75 | for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'): | 75 | for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'): |
76 | self._parsePattern(lineNum, line, withLemma=False) | 76 | self._parsePattern(lineNum, line, withLemma=False) |
77 | self._validate( | 77 | self._validate( |
78 | - u'Pattern that matches everything must be the last one', | 78 | + 'Pattern that matches everything must be the last one', |
79 | lineNum - 1, | 79 | lineNum - 1, |
80 | not gotWildcardPattern) | 80 | not gotWildcardPattern) |
81 | gotWildcardPattern = gotWildcardPattern or self.patternsList[-1].isWildcardPattern() | 81 | gotWildcardPattern = gotWildcardPattern or self.patternsList[-1].isWildcardPattern() |
82 | 82 | ||
83 | self._validate( | 83 | self._validate( |
84 | - u'There must be a pattern that matches everything at the end of [tags] section', | 84 | + 'There must be a pattern that matches everything at the end of [tags] section', |
85 | lineNum, | 85 | lineNum, |
86 | self.patternsList[-1].isWildcardPattern()) | 86 | self.patternsList[-1].isWildcardPattern()) |
87 | 87 | ||
@@ -94,18 +94,18 @@ class Segtypes(object): | @@ -94,18 +94,18 @@ class Segtypes(object): | ||
94 | for f in fields: | 94 | for f in fields: |
95 | match = re.match(r'(name|labels)=([\S]+)', f, re.U) | 95 | match = re.match(r'(name|labels)=([\S]+)', f, re.U) |
96 | self._validate( | 96 | self._validate( |
97 | - u'invalid name or labels constraint: "%s"' % f, | 97 | + 'invalid name or labels constraint: "%s"' % f, |
98 | lineNum, | 98 | lineNum, |
99 | match) | 99 | match) |
100 | key = match.group(1) | 100 | key = match.group(1) |
101 | value = match.group(2) | 101 | value = match.group(2) |
102 | self._validate( | 102 | self._validate( |
103 | - u'%s already specified' % key, | 103 | + '%s already specified' % key, |
104 | lineNum, | 104 | lineNum, |
105 | key not in res) | 105 | key not in res) |
106 | if key == 'labels': | 106 | if key == 'labels': |
107 | if value: | 107 | if value: |
108 | - value = frozenset(value.split(u'|')) | 108 | + value = frozenset(value.split('|')) |
109 | else: | 109 | else: |
110 | value = frozenset() | 110 | value = frozenset() |
111 | res[key] = value | 111 | res[key] = value |
@@ -115,7 +115,7 @@ class Segtypes(object): | @@ -115,7 +115,7 @@ class Segtypes(object): | ||
115 | split = re.split(r'\s+', line.strip()) | 115 | split = re.split(r'\s+', line.strip()) |
116 | if withLemma: | 116 | if withLemma: |
117 | self._validate( | 117 | self._validate( |
118 | - u'Line in [lexemes] section must contain 3 to 5 fields - segment type, lemma, tag pattern and optional constraints on name and labels', | 118 | + 'Line in [lexemes] section must contain 3 to 5 fields - segment type, lemma, tag pattern and optional constraints on name and labels', |
119 | lineNum, | 119 | lineNum, |
120 | len(split) in [3, 4, 5]) | 120 | len(split) in [3, 4, 5]) |
121 | segtype = split[0] | 121 | segtype = split[0] |
@@ -124,7 +124,7 @@ class Segtypes(object): | @@ -124,7 +124,7 @@ class Segtypes(object): | ||
124 | additionalConstraints = self._parseAdditionalConstraints(lineNum, split[3:]) | 124 | additionalConstraints = self._parseAdditionalConstraints(lineNum, split[3:]) |
125 | else: | 125 | else: |
126 | self._validate( | 126 | self._validate( |
127 | - u'Line in [tags] section must contain 2 to 4 fields - segment type, tag pattern and optional constraints on name and labels', | 127 | + 'Line in [tags] section must contain 2 to 4 fields - segment type, tag pattern and optional constraints on name and labels', |
128 | lineNum, | 128 | lineNum, |
129 | len(split) in [2, 3, 4]) | 129 | len(split) in [2, 3, 4]) |
130 | segtype = split[0] | 130 | segtype = split[0] |
@@ -132,32 +132,32 @@ class Segtypes(object): | @@ -132,32 +132,32 @@ class Segtypes(object): | ||
132 | pattern = split[1] | 132 | pattern = split[1] |
133 | additionalConstraints = self._parseAdditionalConstraints(lineNum, split[2:]) | 133 | additionalConstraints = self._parseAdditionalConstraints(lineNum, split[2:]) |
134 | self._validate( | 134 | self._validate( |
135 | - u'Undeclared segment type: "%s"' % segtype, | 135 | + 'Undeclared segment type: "%s"' % segtype, |
136 | lineNum, | 136 | lineNum, |
137 | segtype in self.segtypes) | 137 | segtype in self.segtypes) |
138 | segnum = self.segtypes.index(segtype) | 138 | segnum = self.segtypes.index(segtype) |
139 | 139 | ||
140 | self._validate( | 140 | self._validate( |
141 | - u'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters', | 141 | + 'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters', |
142 | lineNum, | 142 | lineNum, |
143 | re.match(r'[a-z_\.\:\%]+', pattern)) | 143 | re.match(r'[a-z_\.\:\%]+', pattern)) |
144 | 144 | ||
145 | segtypePattern = SegtypePattern( | 145 | segtypePattern = SegtypePattern( |
146 | lemma, | 146 | lemma, |
147 | pattern, | 147 | pattern, |
148 | - additionalConstraints.get('name', u''), | 148 | + additionalConstraints.get('name', ''), |
149 | additionalConstraints.get('labels', frozenset()), | 149 | additionalConstraints.get('labels', frozenset()), |
150 | segnum) | 150 | segnum) |
151 | # print 'segtypePattern', repr(str(segtypePattern)) | 151 | # print 'segtypePattern', repr(str(segtypePattern)) |
152 | self._validate( | 152 | self._validate( |
153 | - u'There is no tag that matches pattern "%s".' % (pattern), | 153 | + 'There is no tag that matches pattern "%s".' % (pattern), |
154 | lineNum, | 154 | lineNum, |
155 | any([segtypePattern.tryToMatchTag(tag) != -1 for tag in self.tagset.getAllTags()])) | 155 | any([segtypePattern.tryToMatchTag(tag) != -1 for tag in self.tagset.getAllTags()])) |
156 | self.patternsList.append(segtypePattern) | 156 | self.patternsList.append(segtypePattern) |
157 | 157 | ||
158 | def _getAllExistingLabelsnumCombinations(self, labels): | 158 | def _getAllExistingLabelsnumCombinations(self, labels): |
159 | if labels: | 159 | if labels: |
160 | - for labelsCombination, labelsnum in self.labelsMap.iteritems(): | 160 | + for labelsCombination, labelsnum in list(self.labelsMap.items()): |
161 | if labels <= labelsCombination: | 161 | if labels <= labelsCombination: |
162 | yield labelsnum | 162 | yield labelsnum |
163 | else: | 163 | else: |
@@ -232,7 +232,7 @@ class SegtypePattern(object): | @@ -232,7 +232,7 @@ class SegtypePattern(object): | ||
232 | return -1 | 232 | return -1 |
233 | 233 | ||
234 | def isWildcardPattern(self): | 234 | def isWildcardPattern(self): |
235 | - return (self.lemma, self.pattern, self.name, self.labels) == (None, '%', u'', frozenset()) | 235 | + return (self.lemma, self.pattern, self.name, self.labels) == (None, '%', '', frozenset()) |
236 | 236 | ||
237 | def __str__(self): | 237 | def __str__(self): |
238 | - return u'%s %s %s %s -> %d' % (self.lemma, self.pattern, self.name, self.labels, self.segnum) | 238 | + return '%s %s %s %s -> %d' % (self.lemma, self.pattern, self.name, self.labels, self.segnum) |
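segtypes.py follows the same pattern: `u'...'` prefixes dropped, `unicode` replaced by `str` in the type assertion (the `unicode` type no longer exists in Python 3), and the reversed labels map built from `items()`. The inversion could also be written as a dict comprehension; a sketch with invented label sets rather than the real labelsMap:

    labels_map = {frozenset({'colloquial'}): 1, frozenset({'colloquial', 'rare'}): 2}
    # converted code uses dict([(v, k) for (k, v) in list(labels_map.items())])
    reverse_labels_map = {num: labels for labels, num in labels_map.items()}
    assert reverse_labels_map[2] == frozenset({'colloquial', 'rare'})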
fsabuilder/morfeuszbuilder/tagset/tagset.py
@@ -20,7 +20,7 @@ class Tagset(object): | @@ -20,7 +20,7 @@ class Tagset(object): | ||
20 | #~ self._name2namenum = {} | 20 | #~ self._name2namenum = {} |
21 | if filename: | 21 | if filename: |
22 | self._doInit(filename, encoding) | 22 | self._doInit(filename, encoding) |
23 | - self._tagnum2tag = dict(map(lambda (k, v): (v, k), self.tag2tagnum.iteritems())) | 23 | + self._tagnum2tag = dict([(k_v[1], k_v[0]) for k_v in iter(list(self.tag2tagnum.items()))]) |
24 | 24 | ||
25 | def _doInit(self, filename, encoding): | 25 | def _doInit(self, filename, encoding): |
26 | insideTags = False | 26 | insideTags = False |
@@ -33,11 +33,11 @@ class Tagset(object): | @@ -33,11 +33,11 @@ class Tagset(object): | ||
33 | self.tagsetId = match.group(1) | 33 | self.tagsetId = match.group(1) |
34 | else: | 34 | else: |
35 | raise FSABuilderException('missing TAGSET-ID in first line of tagset file') | 35 | raise FSABuilderException('missing TAGSET-ID in first line of tagset file') |
36 | - elif line == u'[TAGS]': | 36 | + elif line == '[TAGS]': |
37 | insideTags = True | 37 | insideTags = True |
38 | #~ elif line == u'[NAMES]': | 38 | #~ elif line == u'[NAMES]': |
39 | #~ addingTo = Tagset.NAMES | 39 | #~ addingTo = Tagset.NAMES |
40 | - elif line and not line.startswith(u'#'): | 40 | + elif line and not line.startswith('#'): |
41 | if not insideTags: | 41 | if not insideTags: |
42 | raise FSABuilderException('"%s" - text outside [TAGS] section in tagset file line %d' % (line, linenum)) | 42 | raise FSABuilderException('"%s" - text outside [TAGS] section in tagset file line %d' % (line, linenum)) |
43 | res = self.tag2tagnum | 43 | res = self.tag2tagnum |
@@ -47,12 +47,12 @@ class Tagset(object): | @@ -47,12 +47,12 @@ class Tagset(object): | ||
47 | tag = line.split(Tagset.SEP)[1] | 47 | tag = line.split(Tagset.SEP)[1] |
48 | if tag in res: | 48 | if tag in res: |
49 | raise FSABuilderException('duplicate tag: "%s"' % tag) | 49 | raise FSABuilderException('duplicate tag: "%s"' % tag) |
50 | - if int(tagNum) in res.values(): | 50 | + if int(tagNum) in list(res.values()): |
51 | raise FSABuilderException('line %d: tagId %d assigned for tag "%s" already appeared somewhere else.' % (linenum, int(tagNum), tag)) | 51 | raise FSABuilderException('line %d: tagId %d assigned for tag "%s" already appeared somewhere else.' % (linenum, int(tagNum), tag)) |
52 | res[tag] = int(tagNum) | 52 | res[tag] = int(tagNum) |
53 | 53 | ||
54 | def getAllTags(self): | 54 | def getAllTags(self): |
55 | - return self.tag2tagnum.keys() | 55 | + return list(self.tag2tagnum.keys()) |
56 | 56 | ||
57 | def getTagnum4Tag(self, tag): | 57 | def getTagnum4Tag(self, tag): |
58 | if tag in self.tag2tagnum: | 58 | if tag in self.tag2tagnum: |
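The tagset.py change also removes a tuple-unpacking lambda (`lambda (k, v): (v, k)`), a Python 2 feature dropped by PEP 3113; the converted line indexes the pair instead. A dict comprehension expresses the same inversion more directly; the tag strings below are only illustrative:

    tag2tagnum = {'subst:sg:nom:m1': 0, 'adj:sg:nom:m1:pos': 1}
    # converted code: dict([(k_v[1], k_v[0]) for k_v in iter(list(tag2tagnum.items()))])
    tagnum2tag = {num: tag for tag, num in tag2tagnum.items()}
    assert tagnum2tag[0] == 'subst:sg:nom:m1'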
fsabuilder/morfeuszbuilder/utils/caseconv/generate.py
@@ -90,7 +90,7 @@ def _serializeTable(table): | @@ -90,7 +90,7 @@ def _serializeTable(table): | ||
90 | def _serializeExtendedTable(table): | 90 | def _serializeExtendedTable(table): |
91 | res = [] | 91 | res = [] |
92 | res.append('{') | 92 | res.append('{') |
93 | - for code, targetCode in table.iteritems(): | 93 | + for code, targetCode in list(table.items()): |
94 | res.append('{') | 94 | res.append('{') |
95 | res.append(str(code)) | 95 | res.append(str(code)) |
96 | res.append(',') | 96 | res.append(',') |
fsabuilder/morfeuszbuilder/utils/configFile.py
@@ -6,10 +6,10 @@ Created on 18 lut 2014 | @@ -6,10 +6,10 @@ Created on 18 lut 2014 | ||
6 | 6 | ||
7 | import re | 7 | import re |
8 | import codecs | 8 | import codecs |
9 | -import exceptions | 9 | +from . import exceptions |
10 | 10 | ||
11 | def getHeaderValue(line, lineNum): | 11 | def getHeaderValue(line, lineNum): |
12 | - m = re.match(ur'\s*\[(.*?)\]\s*(\#.*)?', line) | 12 | + m = re.match(r'\s*\[(.*?)\]\s*(\#.*)?', line) |
13 | if m: | 13 | if m: |
14 | return m.group(1) | 14 | return m.group(1) |
15 | else: | 15 | else: |
@@ -40,7 +40,7 @@ class ConfigFile(object): | @@ -40,7 +40,7 @@ class ConfigFile(object): | ||
40 | self.section2Lines[self.currSection].append((lineNum, line)) | 40 | self.section2Lines[self.currSection].append((lineNum, line)) |
41 | 41 | ||
42 | def _getHeaderValue(self, line, lineNum): | 42 | def _getHeaderValue(self, line, lineNum): |
43 | - m = re.match(ur'\s*\[(.*?)\]\s*(\#.*)?', line) | 43 | + m = re.match(r'\s*\[(.*?)\]\s*(\#.*)?', line) |
44 | if m: | 44 | if m: |
45 | return m.group(1) | 45 | return m.group(1) |
46 | else: | 46 | else: |
@@ -48,7 +48,7 @@ class ConfigFile(object): | @@ -48,7 +48,7 @@ class ConfigFile(object): | ||
48 | 48 | ||
49 | def enumerateLinesInSection(self, sectionName, ignoreComments=True): | 49 | def enumerateLinesInSection(self, sectionName, ignoreComments=True): |
50 | if sectionName not in self.section2Lines: | 50 | if sectionName not in self.section2Lines: |
51 | - raise exceptions.ConfigFileException(self.filename, None, u'Missing section: "%s"' % sectionName) | 51 | + raise exceptions.ConfigFileException(self.filename, None, 'Missing section: "%s"' % sectionName) |
52 | if not ignoreComments: | 52 | if not ignoreComments: |
53 | return self.section2Lines[sectionName] | 53 | return self.section2Lines[sectionName] |
54 | else: | 54 | else: |
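configFile.py needs more than cosmetics: the `ur'...'` literal is a syntax error in Python 3 (raw and Unicode prefixes cannot be combined), so the section-header regex becomes a plain raw string with identical behaviour. A quick self-contained check of the same pattern:

    import re

    header_re = re.compile(r'\s*\[(.*?)\]\s*(\#.*)?')
    match = header_re.match('[segment types]  # comment after the header')
    assert match is not None and match.group(1) == 'segment types'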
fsabuilder/morfeuszbuilder/utils/exceptions.py
@@ -25,7 +25,7 @@ class SegtypesException(FSABuilderException): | @@ -25,7 +25,7 @@ class SegtypesException(FSABuilderException): | ||
25 | self.msg = msg | 25 | self.msg = msg |
26 | 26 | ||
27 | def __str__(self): | 27 | def __str__(self): |
28 | - return u'Error in segment rules: %s' % self.msg | 28 | + return 'Error in segment rules: %s' % self.msg |
29 | 29 | ||
30 | class ConfigFileException(FSABuilderException): | 30 | class ConfigFileException(FSABuilderException): |
31 | 31 | ||
@@ -36,7 +36,7 @@ class ConfigFileException(FSABuilderException): | @@ -36,7 +36,7 @@ class ConfigFileException(FSABuilderException): | ||
36 | 36 | ||
37 | def __str__(self): | 37 | def __str__(self): |
38 | if self.lineNum: | 38 | if self.lineNum: |
39 | - return u'%s:%d - %s' % (self.filename, self.lineNum, self.msg) | 39 | + return '%s:%d - %s' % (self.filename, self.lineNum, self.msg) |
40 | else: | 40 | else: |
41 | - return u'%s - %s' % (self.filename, self.msg) | 41 | + return '%s - %s' % (self.filename, self.msg) |
42 | 42 |
fsabuilder/morfeuszbuilder/utils/extractTagset.py
@@ -8,10 +8,10 @@ import sys | @@ -8,10 +8,10 @@ import sys | ||
8 | if __name__ == '__main__': | 8 | if __name__ == '__main__': |
9 | version = sys.argv[1] | 9 | version = sys.argv[1] |
10 | res = set() | 10 | res = set() |
11 | - print '#morfeusz-tagset', version | 11 | + print(('#morfeusz-tagset', version)) |
12 | for line in sys.stdin: | 12 | for line in sys.stdin: |
13 | if line.strip(): | 13 | if line.strip(): |
14 | tag = line.split('\t')[2] | 14 | tag = line.split('\t')[2] |
15 | res.add(tag) | 15 | res.add(tag) |
16 | for idx, tag in enumerate(sorted(res)): | 16 | for idx, tag in enumerate(sorted(res)): |
17 | - print str(idx) + '\t' + tag | 17 | + print((str(idx) + '\t' + tag)) |
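extractTagset.py writes its result to standard output, so the doubled parentheses introduced here change the emitted format relative to the Python 2 version: `print(('#morfeusz-tagset', version))` prints a tuple repr rather than the space-separated header line. If the original format is the intent, the single-parenthesis call restores it; a sketch with a hypothetical version string:

    version = '20140701'                       # hypothetical value of sys.argv[1]
    print(('#morfeusz-tagset', version))       # -> ('#morfeusz-tagset', '20140701')
    print('#morfeusz-tagset', version)         # -> #morfeusz-tagset 20140701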