Commit 95cbe5ea03398610d704f1ad1d51cc981f0aafea
1 parent a5484089
morfeusz_builder → Python 3
Showing 27 changed files with 204 additions and 210 deletions
CMakeLists.txt
... | ... | @@ -4,7 +4,7 @@ project (Morfeusz) |
4 | 4 | |
5 | 5 | set (Morfeusz_VERSION_MAJOR 1) |
6 | 6 | set (Morfeusz_VERSION_MINOR 9) |
7 | -set (Morfeusz_VERSION_PATCH 15) | |
7 | +set (Morfeusz_VERSION_PATCH 16) | |
8 | 8 | set (Morfeusz_VERSION "${Morfeusz_VERSION_MAJOR}.${Morfeusz_VERSION_MINOR}.${Morfeusz_VERSION_PATCH}") |
9 | 9 | set (Morfeusz_LIB_VERSION "${Morfeusz_VERSION}") |
10 | 10 | if (BUILT_ON) |
... | ... |
fsabuilder/buildanalyzer.sh
1 | -#!/bin/bash | |
1 | +#! /bin/bash | |
2 | 2 | |
3 | -python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V1 --trim-supneg -o $1 | |
3 | +python3 morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V1 --trim-supneg -o $1 | |
... | ... |
fsabuilder/buildgenerator.sh
1 | -#!/bin/bash | |
1 | +#! /bin/bash | |
2 | 2 | |
3 | -python morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \ | |
3 | +python3 morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \ | |
4 | 4 | --tagset-file=../input/sgjp-morfeusz.tagset \ |
5 | 5 | --segments-file=../input/segmenty.dat \ |
6 | 6 | --generator \ |
... | ... |
fsabuilder/morfeusz_builder
100644 → 100755
1 | -#!/usr/bin/python | |
1 | +#! /usr/bin/python3 | |
2 | 2 | # -*- coding:utf-8 -*- |
3 | 3 | ''' |
4 | 4 | Created on 21 paź 2013 |
... | ... | @@ -20,13 +20,13 @@ from optparse import OptionParser |
20 | 20 | |
21 | 21 | def _checkOption(opt, parser, msg): |
22 | 22 | if opt is None: |
23 | - print >> sys.stderr, msg | |
23 | + print(msg, file=sys.stderr) | |
24 | 24 | parser.print_help() |
25 | 25 | exit(1) |
26 | 26 | |
27 | 27 | def _checkCondition(cond, parser, msg): |
28 | 28 | if not cond: |
29 | - print >> sys.stderr, msg | |
29 | + print(msg, file=sys.stderr) | |
30 | 30 | parser.print_help() |
31 | 31 | exit(1) |
32 | 32 | |
... | ... | @@ -40,7 +40,7 @@ def _checkOpen(filename, mode): |
40 | 40 | if 'w' in mode: |
41 | 41 | os.remove(filename) |
42 | 42 | except IOError as ex: |
43 | - print >> sys.stderr, str(ex) | |
43 | + print(str(ex), file=sys.stderr) | |
44 | 44 | exit(1) |
45 | 45 | |
46 | 46 | def _getDictFilename(opts, isGenerator): |
... | ... | @@ -162,7 +162,7 @@ def _parseOptions(): |
162 | 162 | _checkOpen(_getDictFilename(opts, isGenerator=True), 'w') |
163 | 163 | |
164 | 164 | if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1]: |
165 | - print >> sys.stderr, '--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1])+')' | |
165 | + print('--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1])+')', file=sys.stderr) | |
166 | 166 | parser.print_help() |
167 | 167 | exit(1) |
168 | 168 | |
... | ... | @@ -183,34 +183,34 @@ def _readDictIdAndCopyright(inputFiles): |
183 | 183 | with codecs.open(inputFile, 'r', 'utf8') as f: |
184 | 184 | inCopyright = False |
185 | 185 | for linenum, line in enumerate(f, start=1): |
186 | - if dictId is None and line.startswith(u'#!DICT-ID'): | |
187 | - dictIdTag, _, dictId = line.strip().partition(u' ') | |
186 | + if dictId is None and line.startswith('#!DICT-ID'): | |
187 | + dictIdTag, _, dictId = line.strip().partition(' ') | |
188 | 188 | exceptions.validate( |
189 | - dictIdTag == u'#!DICT-ID', | |
190 | - u'Dictionary ID tag must be followed by a space character and dictionary ID string') | |
189 | + dictIdTag == '#!DICT-ID', | |
190 | + 'Dictionary ID tag must be followed by a space character and dictionary ID string') | |
191 | 191 | exceptions.validate( |
192 | - len(line.split(u' ')) > 1, | |
193 | - u'%s:%d: Must provide DICT-ID' % (inputFile, linenum)) | |
192 | + len(line.split(' ')) > 1, | |
193 | + '%s:%d: Must provide DICT-ID' % (inputFile, linenum)) | |
194 | 194 | exceptions.validate( |
195 | - len(line.split(u' ')) == 2, | |
196 | - u'%s:%d: DICT-ID must not contain spaces' % (inputFile, linenum)) | |
197 | - elif copyright is None and line.startswith(u'#<COPYRIGHT>'): | |
195 | + len(line.split(' ')) == 2, | |
196 | + '%s:%d: DICT-ID must not contain spaces' % (inputFile, linenum)) | |
197 | + elif copyright is None and line.startswith('#<COPYRIGHT>'): | |
198 | 198 | exceptions.validate( |
199 | - line.strip() == u'#<COPYRIGHT>', | |
200 | - u'%s:%d: Copyright start tag must be the only one in the line' % (inputFile, linenum)) | |
199 | + line.strip() == '#<COPYRIGHT>', | |
200 | + '%s:%d: Copyright start tag must be the only one in the line' % (inputFile, linenum)) | |
201 | 201 | |
202 | 202 | inCopyright = True |
203 | - copyright = u'' | |
203 | + copyright = '' | |
204 | 204 | |
205 | - elif line.startswith(u'#</COPYRIGHT>'): | |
205 | + elif line.startswith('#</COPYRIGHT>'): | |
206 | 206 | |
207 | 207 | exceptions.validate( |
208 | 208 | inCopyright, |
209 | - u'%s:%d: Copyright end tag must be preceded by copyright start tag' % (inputFile, linenum)) | |
209 | + '%s:%d: Copyright end tag must be preceded by copyright start tag' % (inputFile, linenum)) | |
210 | 210 | |
211 | 211 | exceptions.validate( |
212 | - line.strip() == u'#</COPYRIGHT>', | |
213 | - u'%s:%d: Copyright end tag must be the only one in the line' % (inputFile, linenum)) | |
212 | + line.strip() == '#</COPYRIGHT>', | |
213 | + '%s:%d: Copyright end tag must be the only one in the line' % (inputFile, linenum)) | |
214 | 214 | |
215 | 215 | inCopyright = False |
216 | 216 | |
... | ... | @@ -219,21 +219,21 @@ def _readDictIdAndCopyright(inputFiles): |
219 | 219 | copyright += line |
220 | 220 | |
221 | 221 | if dictId is None: |
222 | - logging.warn(u'No dictionary ID tag found') | |
223 | - dictId = u'' | |
222 | + logging.warn('No dictionary ID tag found') | |
223 | + dictId = '' | |
224 | 224 | |
225 | 225 | if copyright is None: |
226 | - logging.warn(u'No copyright info found') | |
227 | - copyright = u'' | |
226 | + logging.warn('No copyright info found') | |
227 | + copyright = '' | |
228 | 228 | |
229 | 229 | return (dictId, copyright) |
230 | 230 | |
231 | 231 | def _readNamesAndQualifiers(inputFiles): |
232 | - names = set([u'']) | |
232 | + names = set(['']) | |
233 | 233 | qualifiers = set([frozenset()]) |
234 | 234 | lineParser = convertinput.LineParser() |
235 | 235 | for line in _concatFiles(inputFiles): |
236 | - line = line.strip().decode('utf8') | |
236 | + line = line.strip() | |
237 | 237 | if not lineParser.ignoreLine(line): |
238 | 238 | _, _, _, name, qualifier = lineParser.parseLine(line) |
239 | 239 | names.add(name) |
... | ... | @@ -242,7 +242,7 @@ def _readNamesAndQualifiers(inputFiles): |
242 | 242 | qualifiersMap = dict([(quals, idx) for idx, quals in enumerate(sorted(qualifiers, key=lambda q: tuple(sorted(q))))]) |
243 | 243 | exceptions.validate( |
244 | 244 | len(qualifiersMap) <= limits.MAX_QUALIFIERS_COMBINATIONS, |
245 | - u'Too many qualifiers combinations. The limit is %d' % limits.MAX_QUALIFIERS_COMBINATIONS) | |
245 | + 'Too many qualifiers combinations. The limit is %d' % limits.MAX_QUALIFIERS_COMBINATIONS) | |
246 | 246 | |
247 | 247 | return namesMap, qualifiersMap |
248 | 248 | |
... | ... |
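The command-line script changes above follow the standard Python 3 print migration: the Python 2 statement `print >> sys.stderr, msg` becomes a call to the print() function with a file argument, and the u'...' literal prefixes are dropped because every str literal is already unicode text in Python 3. A minimal sketch of the pattern, with a placeholder message:

    import sys

    # Python 2 (invalid syntax in Python 3):
    #   print >> sys.stderr, 'missing --input-files option'
    # Python 3 form used throughout morfeusz_builder:
    print('missing --input-files option', file=sys.stderr)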
fsabuilder/morfeuszbuilder/fsa/common.py
... | ... | @@ -10,9 +10,9 @@ import logging |
10 | 10 | class EncodedFormWithoutPrefix(object): |
11 | 11 | |
12 | 12 | def __init__(self, fromWord, targetWord, lowercase): |
13 | - assert type(fromWord) == unicode | |
14 | - assert type(targetWord) == unicode | |
15 | - root = u'' | |
13 | + assert type(fromWord) == str | |
14 | + assert type(targetWord) == str | |
15 | + root = '' | |
16 | 16 | for o, b in zip(fromWord, targetWord): |
17 | 17 | if ((o.lower() == b.lower()) if lowercase else o == b): |
18 | 18 | root += b |
... | ... | @@ -26,8 +26,8 @@ class EncodedFormWithoutPrefix(object): |
26 | 26 | class EncodedForm4Generator(object): |
27 | 27 | |
28 | 28 | def __init__(self, fromWord, targetWord): |
29 | - assert type(fromWord) == unicode | |
30 | - assert type(targetWord) == unicode | |
29 | + assert type(fromWord) == str | |
30 | + assert type(targetWord) == str | |
31 | 31 | bestEncodedForm = None |
32 | 32 | bestPrefixLength = -1 |
33 | 33 | for prefixLength in range(min(len(targetWord), 5)): |
... | ... | @@ -45,8 +45,8 @@ class EncodedForm4Generator(object): |
45 | 45 | class EncodedForm4Analyzer(object): |
46 | 46 | |
47 | 47 | def __init__(self, fromWord, targetWord): |
48 | - assert type(fromWord) == unicode | |
49 | - assert type(targetWord) == unicode | |
48 | + assert type(fromWord) == str | |
49 | + assert type(targetWord) == str | |
50 | 50 | bestEncodedForm = None |
51 | 51 | bestPrefixCutLength = -1 |
52 | 52 | for prefixCutLength in range(min(len(fromWord), 5)): |
... | ... | @@ -123,7 +123,7 @@ class Interpretation4Generator(object): |
123 | 123 | return hash(self.getSortKey()) |
124 | 124 | |
125 | 125 | def __unicode__(self): |
126 | - return u'<%s,(%d %s),%d,%d>' % (self.lemma.decode('utf8'), self.encodedForm.cutLength, self.encodedForm.suffixToAdd.decode('utf8'), self.tagnum, self.namenum) | |
126 | + return '<%s,(%d %s),%d,%d>' % (self.lemma, self.encodedForm.cutLength, self.encodedForm.suffixToAdd.decode('utf8'), self.tagnum, self.namenum) | |
127 | 127 | |
128 | 128 | def __repr__(self): |
129 | - return unicode(self) | |
129 | + return str(self) | |
... | ... |
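The common.py changes reflect that Python 3 has no separate unicode type: str is the unicode text type, so the type assertions switch from unicode to str and u'' literals lose their prefix. Note also that __unicode__ is not a special method in Python 3; such methods are typically renamed to __str__ so that str() and repr() pick the formatting up. A small illustrative sketch (values are made up):

    fromWord = 'zażółć'            # a plain literal is already unicode text
    targetWord = 'zażółci'

    assert type(fromWord) == str   # holds in Python 3; `unicode` is undefined
    root = ''                      # empty text, no u'' prefix needed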
fsabuilder/morfeuszbuilder/fsa/convertinput.py
... | ... | @@ -4,7 +4,7 @@ Created on Oct 23, 2013 |
4 | 4 | @author: mlenart |
5 | 5 | ''' |
6 | 6 | import logging |
7 | -from common import Interpretation4Analyzer | |
7 | +from .common import Interpretation4Analyzer | |
8 | 8 | from morfeuszbuilder.fsa.common import Interpretation4Generator |
9 | 9 | #from morfeuszbuilder.fsa import externalsort |
10 | 10 | |
... | ... | @@ -36,24 +36,24 @@ class LineParser(object): |
36 | 36 | def ignoreLine(self, line): |
37 | 37 | if not line: |
38 | 38 | return True |
39 | - elif line.strip() == u'#<COPYRIGHT>': | |
39 | + elif line.strip() == '#<COPYRIGHT>': | |
40 | 40 | self.inCopyright = True |
41 | 41 | return True |
42 | - elif line.strip() == u'#</COPYRIGHT>': | |
42 | + elif line.strip() == '#</COPYRIGHT>': | |
43 | 43 | self.inCopyright = False |
44 | 44 | return True |
45 | 45 | elif self.inCopyright: |
46 | 46 | return True |
47 | 47 | elif line and not ' ' in ''.join(line.split('\t')[:2]): |
48 | 48 | return False |
49 | - elif line.startswith(u'#!DICT-ID'): | |
49 | + elif line.startswith('#!DICT-ID'): | |
50 | 50 | return True |
51 | 51 | else: |
52 | - logging.warn(u'Ignoring line: "%s" - contains space in text form or lemma' % (line.strip())) | |
52 | + logging.warn('Ignoring line: "%s" - contains space in text form or lemma' % (line.strip())) | |
53 | 53 | return True |
54 | 54 | |
55 | 55 | def parseLine(self, line): |
56 | - splitLine = line.strip().split(u'\t') | |
56 | + splitLine = line.strip().split('\t') | |
57 | 57 | if len(splitLine) == 5: |
58 | 58 | orth, base, tag, name, qualifier = splitLine |
59 | 59 | elif len(splitLine) == 4: |
... | ... | @@ -69,7 +69,7 @@ class LineParser(object): |
69 | 69 | |
70 | 70 | def parseQualifiers(string): |
71 | 71 | if string: |
72 | - return frozenset(string.split(u'|')) | |
72 | + return frozenset(string.split('|')) | |
73 | 73 | else: |
74 | 74 | return frozenset() |
75 | 75 | |
... | ... | @@ -87,7 +87,7 @@ class PolimorfConverter4Analyzer(object): |
87 | 87 | def _partiallyParseLines(self, inputLines): |
88 | 88 | lineParser = LineParser() |
89 | 89 | for line in inputLines: |
90 | - line = line.decode(self.inputEncoding).strip('\n') | |
90 | + line = line.strip('\n') | |
91 | 91 | if not lineParser.ignoreLine(line): |
92 | 92 | orth, base, tag, name, qualifier = lineParser.parseLine(line) |
93 | 93 | |
... | ... | @@ -106,8 +106,8 @@ class PolimorfConverter4Analyzer(object): |
106 | 106 | base = orth |
107 | 107 | |
108 | 108 | yield '\t'.join(( |
109 | - orth.encode(self.inputEncoding), | |
110 | - base.encode(self.inputEncoding), | |
109 | + orth, | |
110 | + base, | |
111 | 111 | str(tagnum), |
112 | 112 | str(namenum), |
113 | 113 | str(typenum), |
... | ... | @@ -118,8 +118,8 @@ class PolimorfConverter4Analyzer(object): |
118 | 118 | base = orth |
119 | 119 | typenum = self.segmentRulesManager.shiftOrthMagic.getNewSegnum4ShiftOrth(typenum) |
120 | 120 | yield '\t'.join(( |
121 | - orth.encode(self.inputEncoding), | |
122 | - base.encode(self.inputEncoding), | |
121 | + orth, | |
122 | + base, | |
123 | 123 | str(tagnum), |
124 | 124 | str(namenum), |
125 | 125 | str(typenum), |
... | ... | @@ -127,14 +127,14 @@ class PolimorfConverter4Analyzer(object): |
127 | 127 | |
128 | 128 | # input lines are encoded and partially parsed |
129 | 129 | def _sortLines(self, inputLines): |
130 | - return sorted(inputLines, key=lambda line: self.encoder.word2SortKey(line.split('\t')[0].decode('utf8'))) | |
130 | + return sorted(inputLines, key=lambda line: self.encoder.word2SortKey(line.split('\t')[0])) | |
131 | 131 | # return sorted(inputLines, key=lambda line: self.encoder.word2SortKey(line.split('\t')[0].decode('utf8'))) |
132 | 132 | |
133 | 133 | def _reallyParseLines(self, inputLines): |
134 | 134 | for line in inputLines: |
135 | - line = line.decode(self.inputEncoding).strip(u'\n') | |
135 | + line = line.strip('\n') | |
136 | 136 | if line: |
137 | - orth, base, tagnum, namenum, typenum, qualsnum = line.split(u'\t') | |
137 | + orth, base, tagnum, namenum, typenum, qualsnum = line.split('\t') | |
138 | 138 | tagnum = int(tagnum) |
139 | 139 | namenum = int(namenum) |
140 | 140 | typenum = int(typenum) |
... | ... | @@ -159,14 +159,14 @@ class PolimorfConverter4Generator(object): |
159 | 159 | def _partiallyParseLines(self, inputLines): |
160 | 160 | lineParser = LineParser() |
161 | 161 | for line in inputLines: |
162 | - line = line.decode(self.inputEncoding).strip('\n') | |
162 | + line = line.strip('\n') | |
163 | 163 | if not lineParser.ignoreLine(line): |
164 | 164 | orth, base, tag, name, qualifier = lineParser.parseLine(line) |
165 | 165 | if base: |
166 | - homonymId = u'' | |
167 | - if u':' in base: | |
168 | - assumedBase, assumedHomonymId = base.split(u':', 1) | |
169 | - if assumedBase != u'' and assumedHomonymId != u'' and assumedHomonymId.isalnum(): | |
166 | + homonymId = '' | |
167 | + if ':' in base: | |
168 | + assumedBase, assumedHomonymId = base.split(':', 1) | |
169 | + if assumedBase != '' and assumedHomonymId != '' and assumedHomonymId.isalnum(): | |
170 | 170 | base, homonymId = assumedBase, assumedHomonymId |
171 | 171 | tagnum = self.tagset.getTagnum4Tag(tag) |
172 | 172 | namenum = self.namesMap[name] |
... | ... | @@ -179,39 +179,39 @@ class PolimorfConverter4Generator(object): |
179 | 179 | base = orth |
180 | 180 | |
181 | 181 | yield '\t'.join(( |
182 | - orth.encode(self.inputEncoding), | |
183 | - base.encode(self.inputEncoding), | |
182 | + orth, | |
183 | + base, | |
184 | 184 | str(tagnum), |
185 | 185 | str(namenum), |
186 | 186 | str(typenum), |
187 | - homonymId.encode(self.inputEncoding), | |
187 | + homonymId, | |
188 | 188 | str(qualsnum))) |
189 | 189 | |
190 | 190 | if self.segmentRulesManager.shiftOrthMagic.getNewSegnum4ShiftOrth(typenum) != None: |
191 | 191 | base = orth |
192 | 192 | typenum = self.segmentRulesManager.shiftOrthMagic.getNewSegnum4ShiftOrth(typenum) |
193 | 193 | yield '\t'.join(( |
194 | - orth.encode(self.inputEncoding), | |
195 | - base.encode(self.inputEncoding), | |
194 | + orth, | |
195 | + base, | |
196 | 196 | str(tagnum), |
197 | 197 | str(namenum), |
198 | 198 | str(typenum), |
199 | - homonymId.encode(self.inputEncoding), | |
199 | + homonymId, | |
200 | 200 | str(qualsnum))) |
201 | 201 | else: |
202 | 202 | logging.warn('Ignoring line: "%s" - contains empty lemma', line.strip()) |
203 | 203 | |
204 | 204 | # input lines are encoded and partially parsed |
205 | 205 | def _sortLines(self, inputLines): |
206 | - return sorted(inputLines, key=lambda line: (self.encoder.word2SortKey(line.split('\t')[1].decode('utf8')), line)) | |
206 | + return sorted(inputLines, key=lambda line: (self.encoder.word2SortKey(line.split('\t')[1]), line)) | |
207 | 207 | |
208 | 208 | def _reallyParseLines(self, inputLines): |
209 | 209 | prevLine = None |
210 | 210 | for line in inputLines: |
211 | - line = line.decode(self.inputEncoding).strip(u'\n') | |
211 | + line = line.strip('\n') | |
212 | 212 | if line and line != prevLine: |
213 | - orth, base, tagnum, namenum, typenum, homonymId, qualsnum = line.split(u'\t') | |
214 | -# print orth.encode('utf8'), base.encode('utf8'), homonymId | |
213 | + orth, base, tagnum, namenum, typenum, homonymId, qualsnum = line.split('\t') | |
214 | +# print orth, base, homonymId | |
215 | 215 | tagnum = int(tagnum) |
216 | 216 | namenum = int(namenum) |
217 | 217 | typenum = int(typenum) |
... | ... |
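The converter changes drop the explicit line.decode(self.inputEncoding) and orth.encode(self.inputEncoding) calls: assuming the dictionary files are read in text mode (the file-opening code is not part of this diff), iteration already yields decoded str lines in Python 3, and the intermediate tab-joined records can stay as text until final serialization. A hedged sketch of that assumption, with a hypothetical file name:

    import codecs

    # In Python 3, codecs.open(..., 'r', 'utf8') yields str lines,
    # so no manual line.decode('utf8') is needed before parsing.
    with codecs.open('sgjp-hom.tab', 'r', 'utf8') as f:   # hypothetical path
        for line in f:
            orth = line.strip('\n').split('\t')[0]        # already text (str)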
fsabuilder/morfeuszbuilder/fsa/encode.py
... | ... | @@ -24,7 +24,7 @@ class Encoder(object): |
24 | 24 | #~ self.qualifiersMap = { frozenset(): 0} |
25 | 25 | |
26 | 26 | def encodeWord(self, word, lowercase=True): |
27 | - assert type(word) == unicode | |
27 | + assert type(word) == str | |
28 | 28 | res = bytearray(word.lower() if self.lowercase and lowercase else word, self.encoding) |
29 | 29 | return res |
30 | 30 | |
... | ... | @@ -35,16 +35,16 @@ class Encoder(object): |
35 | 35 | return NotImplementedError() |
36 | 36 | |
37 | 37 | def decodeWord(self, rawWord): |
38 | - return unicode(str(rawWord).strip('\x00'), self.encoding) | |
38 | + return str(str(rawWord).strip('\x00'), self.encoding) | |
39 | 39 | |
40 | 40 | def word2SortKey(self, word): |
41 | 41 | normalizedWord = word.lower() if self.lowercase else word |
42 | - return normalizedWord.encode(self.encoding) | |
42 | + return normalizedWord | |
43 | 43 | |
44 | 44 | def _encodeTypeNum(self, typenum): |
45 | 45 | exceptions.validate( |
46 | 46 | typenum <= limits.MAX_SEGMENT_TYPES, |
47 | - u'Too many segment types. The limit is %d' % limits.MAX_SEGMENT_TYPES) | |
47 | + 'Too many segment types. The limit is %d' % limits.MAX_SEGMENT_TYPES) | |
48 | 48 | return bytearray([typenum]) |
49 | 49 | |
50 | 50 | def _hasUpperPrefix(self, casePattern): |
... | ... | @@ -62,13 +62,13 @@ class Encoder(object): |
62 | 62 | |
63 | 63 | def _encodeTagNum(self, tagnum): |
64 | 64 | res = bytearray() |
65 | - exceptions.validate(tagnum <= limits.MAX_TAGS, u'Too many tags. The limit is %d' % limits.MAX_TAGS) | |
65 | + exceptions.validate(tagnum <= limits.MAX_TAGS, 'Too many tags. The limit is %d' % limits.MAX_TAGS) | |
66 | 66 | res.append((tagnum & 0xFF00) >> 8) |
67 | 67 | res.append(tagnum & 0x00FF) |
68 | 68 | return res |
69 | 69 | |
70 | 70 | def _encodeNameNum(self, namenum): |
71 | - exceptions.validate(namenum <= limits.MAX_NAMES, u'Too many named entity types. The limit is %d' % limits.MAX_NAMES) | |
71 | + exceptions.validate(namenum <= limits.MAX_NAMES, 'Too many named entity types. The limit is %d' % limits.MAX_NAMES) | |
72 | 72 | return bytearray([namenum]) |
73 | 73 | |
74 | 74 | def _groupInterpsByType(self, interpsList): |
... | ... | @@ -86,7 +86,7 @@ class Encoder(object): |
86 | 86 | |
87 | 87 | res = bytearray() |
88 | 88 | |
89 | - for typenum, interpsList in segnum2Interps.iteritems(): | |
89 | + for typenum, interpsList in list(segnum2Interps.items()): | |
90 | 90 | res.extend(self._encodeInterps4Type(typenum, interpsList)) |
91 | 91 | del interpsList |
92 | 92 | |
... | ... | @@ -135,10 +135,10 @@ class MorphEncoder(Encoder): |
135 | 135 | return res |
136 | 136 | |
137 | 137 | def _casePatternsHaveOnlyLowercase(self, casePatterns): |
138 | - return not any(map(lambda cp: cp and True in cp, casePatterns)) | |
138 | + return not any([cp and True in cp for cp in casePatterns]) | |
139 | 139 | |
140 | 140 | def _casePatternsAreOnlyTitles(self, casePatterns): |
141 | - return all(map(lambda cp: cp and cp[0] == True and not True in cp[1:], casePatterns)) | |
141 | + return all([cp and cp[0] == True and not True in cp[1:] for cp in casePatterns]) | |
142 | 142 | |
143 | 143 | def _casePatternsAreEncodedInCompressByte(self, casePatterns): |
144 | 144 | return self._casePatternsHaveOnlyLowercase(casePatterns) or self._casePatternsAreOnlyTitles(casePatterns) |
... | ... |
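encode.py shows the dictionary-iteration part of the port: dict.iteritems() is gone in Python 3, and .items() returns a live view. Wrapping it in list(), as the converted code does, takes a snapshot, which only matters if the mapping could change while iterating; a short sketch with made-up data:

    segnum2Interps = {0: ['interp-a'], 1: ['interp-b']}   # illustrative

    # Python 2: for typenum, interps in segnum2Interps.iteritems(): ...
    # Python 3: .items() is a view; list() makes an independent snapshot.
    for typenum, interpsList in list(segnum2Interps.items()):
        pass  # encode the interpretations for this segment type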
fsabuilder/morfeuszbuilder/fsa/fsa.py
... | ... | @@ -4,8 +4,8 @@ Created on Oct 8, 2013 |
4 | 4 | @author: mlenart |
5 | 5 | ''' |
6 | 6 | |
7 | -import state | |
8 | -import register | |
7 | +from . import state | |
8 | +from . import register | |
9 | 9 | import logging |
10 | 10 | from morfeuszbuilder.utils import exceptions |
11 | 11 | |
... | ... | @@ -35,7 +35,7 @@ class FSA(object): |
35 | 35 | assert not self.closed |
36 | 36 | assert data is not None |
37 | 37 | encodedWord = self.encodeWord(word) |
38 | - assert encodedWord > self.encodedPrevWord | |
38 | + assert self.encodedPrevWord is None or encodedWord > self.encodedPrevWord | |
39 | 39 | self._addSorted(encodedWord, self.encodeData(data)) |
40 | 40 | self.encodedPrevWord = encodedWord |
41 | 41 | |
... | ... | @@ -43,7 +43,7 @@ class FSA(object): |
43 | 43 | |
44 | 44 | # debug |
45 | 45 | if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0: |
46 | - logging.info(u'%d %s' % (self.n, word)) | |
46 | + logging.info('%d %s' % (self.n, word)) | |
47 | 47 | for label in encodedWord: |
48 | 48 | self.label2Freq[label] = self.label2Freq.get(label, 0) + 1 |
49 | 49 | |
... | ... | @@ -78,7 +78,7 @@ class FSA(object): |
78 | 78 | return res |
79 | 79 | |
80 | 80 | def _addSorted(self, encodedWord, data): |
81 | - assert self.encodedPrevWord < encodedWord | |
81 | + assert self.encodedPrevWord is None or self.encodedPrevWord < encodedWord | |
82 | 82 | assert type(data) == bytearray |
83 | 83 | q = self.initialState |
84 | 84 | i = 0 |
... | ... |
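The added `is None` guards in fsa.py are not just cosmetic: Python 2 allowed ordering comparisons between None and any object (None compared as smaller than everything), so `encodedWord > self.encodedPrevWord` silently passed for the first word. Python 3 raises TypeError when comparing a bytearray with None, so the first-word case has to be handled explicitly. A sketch of the guard:

    encodedPrevWord = None              # state before the first word is added
    encodedWord = bytearray(b'abc')

    # Python 3: `encodedWord > None` would raise TypeError,
    # hence the explicit None check before comparing.
    assert encodedPrevWord is None or encodedWord > encodedPrevWord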
fsabuilder/morfeuszbuilder/fsa/serializer.py
... | ... | @@ -5,7 +5,7 @@ Created on Oct 20, 2013 |
5 | 5 | ''' |
6 | 6 | |
7 | 7 | import logging |
8 | -from state import State | |
8 | +from .state import State | |
9 | 9 | from morfeuszbuilder.utils import limits, exceptions |
10 | 10 | from morfeuszbuilder.utils.serializationUtils import * |
11 | 11 | |
... | ... | @@ -106,7 +106,7 @@ class Serializer(object): |
106 | 106 | res = bytearray() |
107 | 107 | numOfTags = len(tagsMap) |
108 | 108 | res.extend(htons(numOfTags)) |
109 | - for tag, tagnum in sorted(tagsMap.iteritems(), key=lambda (tag, tagnum): tagnum): | |
109 | + for tag, tagnum in sorted(iter(list(tagsMap.items())), key=lambda tag_tagnum: tag_tagnum[1]): | |
110 | 110 | res.extend(htons(tagnum)) |
111 | 111 | res.extend(self.fsa.encodeWord(tag)) |
112 | 112 | res.append(0) |
... | ... | @@ -121,7 +121,7 @@ class Serializer(object): |
121 | 121 | #~ return res |
122 | 122 | |
123 | 123 | def serializeQualifiersMap(self): |
124 | - label2labelId = dict([ (u'|'.join(qualifiers), n) for qualifiers, n in sorted(self.qualifiersMap.iteritems(), key=lambda (qs, n): n) ]) | |
124 | + label2labelId = dict([ ('|'.join(sorted(qualifiers)), n) for qualifiers, n in sorted(iter(list(self.qualifiersMap.items())), key=lambda qs_n: qs_n[1]) ]) | |
125 | 125 | return self._serializeTags(label2labelId) |
126 | 126 | #~ res = bytearray() |
127 | 127 | #~ res.extend(htons(len(self.qualifiersMap))) |
... | ... | @@ -186,9 +186,9 @@ class Serializer(object): |
186 | 186 | return res |
187 | 187 | |
188 | 188 | def getSortedTransitions(self, state): |
189 | - defaultKey = lambda (label, nextState): (-state.label2Freq.get(label, 0), -self.fsa.label2Freq.get(label, 0)) | |
189 | + defaultKey = lambda label_nextState: (-state.label2Freq.get(label_nextState[0], 0), -self.fsa.label2Freq.get(label_nextState[0], 0)) | |
190 | 190 | return list(sorted( |
191 | - state.transitionsMap.iteritems(), | |
191 | + iter(list(state.transitionsMap.items())), | |
192 | 192 | key=defaultKey)) |
193 | 193 | |
194 | 194 | def stateData2bytearray(self, state): |
... | ... | @@ -215,9 +215,9 @@ class SimpleSerializer(Serializer): |
215 | 215 | |
216 | 216 | def getStateSize(self, state): |
217 | 217 | if self.serializeTransitionsData: |
218 | - return 1 + 5 * len(state.transitionsMap.keys()) + self.getDataSize(state) | |
218 | + return 1 + 5 * len(list(state.transitionsMap.keys())) + self.getDataSize(state) | |
219 | 219 | else: |
220 | - return 1 + 4 * len(state.transitionsMap.keys()) + self.getDataSize(state) | |
220 | + return 1 + 4 * len(list(state.transitionsMap.keys())) + self.getDataSize(state) | |
221 | 221 | |
222 | 222 | def getDataSize(self, state): |
223 | 223 | return len(state.encodedData) if state.isAccepting() else 0 |
... | ... | @@ -270,12 +270,12 @@ class VLengthSerializer1(Serializer): |
270 | 270 | res = bytearray() |
271 | 271 | |
272 | 272 | # labels sorted by popularity |
273 | - sortedLabels = [label for (label, freq) in sorted(self.fsa.label2Freq.iteritems(), key=lambda (label, freq): (-freq, label))] | |
273 | + sortedLabels = [label for (label, freq) in sorted(iter(list(self.fsa.label2Freq.items())), key=lambda label_freq: (-label_freq[1], label_freq[0]))] | |
274 | 274 | |
275 | 275 | # popular labels table |
276 | 276 | self.label2ShortLabel = dict([(label, sortedLabels.index(label) + 1) for label in sortedLabels[:63]]) |
277 | 277 | |
278 | - logging.debug(dict([(chr(label), shortLabel) for label, shortLabel in self.label2ShortLabel.items()])) | |
278 | + logging.debug(dict([(chr(label), shortLabel) for label, shortLabel in list(self.label2ShortLabel.items())])) | |
279 | 279 | |
280 | 280 | # write remaining short labels (zeros) |
281 | 281 | for label in range(256): |
... | ... | @@ -354,7 +354,7 @@ class VLengthSerializer1(Serializer): |
354 | 354 | offsetSize += 1 |
355 | 355 | exceptions.validate( |
356 | 356 | offset < 256 * 256 * 256, |
357 | - u'Cannot build the automaton - it would exceed its max size which is %d' % (256 * 256 * 256)) | |
357 | + 'Cannot build the automaton - it would exceed its max size which is %d' % (256 * 256 * 256)) | |
358 | 358 | # assert offset < 256 * 256 * 256 # TODO - przerobic na jakis porzadny wyjatek |
359 | 359 | assert offsetSize <= 3 |
360 | 360 | firstByte |= offsetSize |
... | ... | @@ -380,7 +380,7 @@ class VLengthSerializer1(Serializer): |
380 | 380 | newState.encodedData = state.encodedData |
381 | 381 | newState.reverseOffset = state.reverseOffset |
382 | 382 | newState.offset = state.offset |
383 | - newState.transitionsMap = dict([(label, nextState) for (label, nextState) in state.transitionsMap.iteritems()]) | |
383 | + newState.transitionsMap = dict([(label, nextState) for (label, nextState) in list(state.transitionsMap.items())]) | |
384 | 384 | # newState.transitionsMap = dict([(label, nextState) for (label, nextState) in state.transitionsMap.iteritems() if not label in self.label2ShortLabel or not self.label2ShortLabel[label] in range(1,64)]) |
385 | 385 | newState.serializeAsArray = False |
386 | 386 | return newState |
... | ... | @@ -388,12 +388,12 @@ class VLengthSerializer1(Serializer): |
388 | 388 | def _transitions2ArrayBytes(self, state): |
389 | 389 | res = bytearray() |
390 | 390 | array = [0] * 64 |
391 | - for label, nextState in state.transitionsMap.iteritems(): | |
391 | + for label, nextState in list(state.transitionsMap.items()): | |
392 | 392 | if label in self.label2ShortLabel: |
393 | 393 | shortLabel = self.label2ShortLabel[label] |
394 | 394 | array[shortLabel] = nextState.offset |
395 | 395 | logging.debug(array) |
396 | - for offset in map(lambda x: x if x else 0, array): | |
396 | + for offset in [x if x else 0 for x in array]: | |
397 | 397 | res.append(0) |
398 | 398 | res.append((offset & 0xFF0000) >> 16) |
399 | 399 | res.append((offset & 0x00FF00) >> 8) |
... | ... | @@ -409,8 +409,8 @@ class VLengthSerializer1(Serializer): |
409 | 409 | return self._transitions2ListBytes(state) |
410 | 410 | |
411 | 411 | def _chooseArrayStates(self): |
412 | - for state1 in self.fsa.initialState.transitionsMap.values(): | |
413 | - for state2 in state1.transitionsMap.values(): | |
412 | + for state1 in list(self.fsa.initialState.transitionsMap.values()): | |
413 | + for state2 in list(state1.transitionsMap.values()): | |
414 | 414 | # for state3 in state2.transitionsMap.values(): |
415 | 415 | # state3.serializeAsArray = True |
416 | 416 | state2.serializeAsArray = True |
... | ... |
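Several lambdas in serializer.py relied on Python 2 tuple parameter unpacking, e.g. `lambda (tag, tagnum): tagnum`, which PEP 3113 removed from the language; the converted code takes a single tuple argument and indexes it. operator.itemgetter is an equivalent and arguably clearer alternative, shown here purely as an illustration with made-up data:

    import operator

    tagsMap = {'subst:sg:nom:m1': 1, 'adj:sg:nom:m1:pos': 2}   # illustrative

    # 2to3-style rewrite: index the tuple argument explicitly
    byTagnum = sorted(tagsMap.items(), key=lambda tag_tagnum: tag_tagnum[1])
    # equivalent alternative
    byTagnum = sorted(tagsMap.items(), key=operator.itemgetter(1))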
fsabuilder/morfeuszbuilder/fsa/state.py
... | ... | @@ -45,7 +45,7 @@ class State(object): |
45 | 45 | return self.transitionsMap.get(byte, None) |
46 | 46 | |
47 | 47 | def getRegisterKey(self): |
48 | - return ( frozenset(self.transitionsMap.iteritems()), tuple(self.encodedData) if self.encodedData else None ) | |
48 | + return ( frozenset(iter(list(self.transitionsMap.items()))), tuple(self.encodedData) if self.encodedData else None ) | |
49 | 49 | |
50 | 50 | def isAccepting(self): |
51 | 51 | return self.encodedData is not None |
... | ... | @@ -60,10 +60,10 @@ class State(object): |
60 | 60 | else: |
61 | 61 | return self.encodedData |
62 | 62 | |
63 | - def dfs(self, alreadyVisited, sortKey=lambda (_, state): -state.freq): | |
63 | + def dfs(self, alreadyVisited, sortKey=lambda __state: -__state[1].freq): | |
64 | 64 | if not self in alreadyVisited: |
65 | 65 | alreadyVisited.add(self) |
66 | - for _, state in sorted(self.transitionsMap.iteritems(), key=sortKey): | |
66 | + for _, state in sorted(iter(list(self.transitionsMap.items())), key=sortKey): | |
67 | 67 | for state1 in state.dfs(alreadyVisited): |
68 | 68 | yield state1 |
69 | 69 | yield self |
... | ... | @@ -77,7 +77,7 @@ class State(object): |
77 | 77 | state.offset = currReverseOffset - state.reverseOffset |
78 | 78 | |
79 | 79 | def debug(self): |
80 | - print '----------------' | |
81 | - print 'STATE:', self.idx, 'accepting', self.isAccepting() | |
82 | - for label, s in self.transitionsMap.iteritems(): | |
83 | - print label, '-->', s.idx | |
80 | + print('----------------') | |
81 | + print(('STATE:', self.idx, 'accepting', self.isAccepting())) | |
82 | + for label, s in list(self.transitionsMap.items()): | |
83 | + print((label, '-->', s.idx)) | |
... | ... |
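One behavioural subtlety of the mechanical conversion shows up in debug(): `print(('STATE:', self.idx, ...))` prints a single tuple, parentheses and quotes included, whereas the Python 2 print statement printed space-separated values. Passing separate arguments to print() would reproduce the old output; a minimal illustration:

    idx = 7  # illustrative value

    print(('STATE:', idx))   # output: ('STATE:', 7)
    print('STATE:', idx)     # output: STATE: 7   (Python 2-style output)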
fsabuilder/morfeuszbuilder/fsa/visualizer.py
... | ... | @@ -19,7 +19,7 @@ class Visualizer(object): |
19 | 19 | nodeLabelsMap = {} |
20 | 20 | for idx, state in enumerate(allStates): |
21 | 21 | G.add_node(idx, offset=state.offset) |
22 | - for c, targetState in state.transitionsMap.iteritems(): | |
22 | + for c, targetState in list(state.transitionsMap.items()): | |
23 | 23 | G.add_edge(idx, allStates.index(targetState)) |
24 | 24 | label = (chr(c) if c <= 127 else '%') if charLabels \ |
25 | 25 | else c |
... | ... | @@ -37,11 +37,11 @@ class Visualizer(object): |
37 | 37 | nodelist=list([allStates.index(s) for s in allStates if s.isAccepting()]), |
38 | 38 | node_shape='s') |
39 | 39 | # nx.draw_networkx_nodes(G, pos, nodelist=list([allStates.index(s) for s in allStates if s.isFinal()])), ) |
40 | - nx.draw_networkx_edges(G, pos, edgelist=edgeLabelsMap.keys()) | |
40 | + nx.draw_networkx_edges(G, pos, edgelist=list(edgeLabelsMap.keys())) | |
41 | 41 | nx.draw_networkx_edge_labels(G, pos, edge_labels = edgeLabelsMap) |
42 | 42 | nx.draw_networkx_labels(G, pos, labels=nodeLabelsMap) |
43 | 43 | plt.axis('off') |
44 | 44 | plt.draw() |
45 | 45 | plt.show() |
46 | 46 | # plt.savefig('filename.png') |
47 | - print 'done' | |
47 | + print('done') | |
... | ... |
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
... | ... | @@ -7,10 +7,10 @@ Created on 23 sty 2014 |
7 | 7 | import re |
8 | 8 | from pyparsing import * |
9 | 9 | from morfeuszbuilder.utils import exceptions |
10 | -from pyparseString import pyparseString | |
10 | +from .pyparseString import pyparseString | |
11 | 11 | |
12 | -identifier = Word(alphas, bodyChars=alphanums+u'_>*+{},') | |
13 | -define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + Word(alphas, bodyChars=alphanums+u'_') + Suppress(')')) + restOfLine + LineEnd() + StringEnd() | |
12 | +identifier = Word(alphas, bodyChars=alphanums+'_>*+{},') | |
13 | +define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + Word(alphas, bodyChars=alphanums+'_') + Suppress(')')) + restOfLine + LineEnd() + StringEnd() | |
14 | 14 | ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd() |
15 | 15 | endif = Keyword('#endif').suppress() + LineEnd() + StringEnd() |
16 | 16 | |
... | ... | @@ -107,5 +107,5 @@ def preprocess(inputLines, defs, filename): |
107 | 107 | ifdefsStack.pop() |
108 | 108 | elif line.startswith('#'): |
109 | 109 | yield lineNum, line |
110 | - elif len(ifdefsStack) == 0 or all(map(lambda (name, isActive): (name in defs and isActive) or (name not in defs and not isActive), ifdefsStack)): | |
110 | + elif len(ifdefsStack) == 0 or all([(name_isActive[0] in defs and name_isActive[1]) or (name_isActive[0] not in defs and not name_isActive[1]) for name_isActive in ifdefsStack]): | |
111 | 111 | yield lineNum, _processLine(lineNum, line, defines, filename) |
... | ... |
fsabuilder/morfeuszbuilder/segrules/pyparseString.py
... | ... | @@ -11,7 +11,7 @@ def pyparseString(rule, lineNum, line, filename): |
11 | 11 | try: |
12 | 12 | return rule.parseString(line, parseAll=True) |
13 | 13 | except ParseException as ex: |
14 | - msg = u'%s:%d - Preprocessing of segmentation rules failed.\n' % (filename, lineNum) | |
14 | + msg = '%s:%d - Preprocessing of segmentation rules failed.\n' % (filename, lineNum) | |
15 | 15 | msg += line + '\n' |
16 | 16 | msg += (ex.col - 1) * ' ' + '^\n' |
17 | 17 | msg += ex.msg |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rules.py
... | ... | @@ -126,7 +126,7 @@ class ComplexRule(SegmentRule): |
126 | 126 | def __init__(self, children, linenum): |
127 | 127 | super(ComplexRule, self).__init__(linenum) |
128 | 128 | self.children = children |
129 | - assert not any(map(lambda c: c.isSinkRule(), children)) | |
129 | + assert not any([c.isSinkRule() for c in children]) | |
130 | 130 | |
131 | 131 | def addToNFA(self, fsa): |
132 | 132 | endState = RulesNFAState(self, final=True, weak=self.weak, autogenerated=self.autogenerated) |
... | ... | @@ -159,13 +159,13 @@ class ConcatRule(ComplexRule): |
159 | 159 | lastChild._doAddToNFA(currStartState, endState) |
160 | 160 | |
161 | 161 | def allowsEmptySequence(self): |
162 | - return all(map(lambda rule: rule.allowsEmptySequence(), self.children)) | |
162 | + return all([rule.allowsEmptySequence() for rule in self.children]) | |
163 | 163 | |
164 | 164 | def __str__(self): |
165 | - return u' '.join(map(lambda c: str(c), self.children)) | |
165 | + return ' '.join([str(c) for c in self.children]) | |
166 | 166 | |
167 | 167 | def isShiftOrthRule(self): |
168 | - return all(map(lambda c: c.isShiftOrthRule(), self.children)) | |
168 | + return all([c.isShiftOrthRule() for c in self.children]) | |
169 | 169 | |
170 | 170 | def transformToGeneratorVersion(self): |
171 | 171 | newChildren = [child.transformToGeneratorVersion() for child in self.children if not child.allowsEmptySequence() or child.isShiftOrthRule()] |
... | ... | @@ -207,11 +207,11 @@ class ConcatRule(ComplexRule): |
207 | 207 | for rule in self.children: |
208 | 208 | rule.validate(filename) |
209 | 209 | if self.children[-1].isShiftOrthRule() \ |
210 | - and not all(map(lambda c: c.isShiftOrthRule(), self.children)): | |
210 | + and not all([c.isShiftOrthRule() for c in self.children]): | |
211 | 211 | raise ConfigFileException( |
212 | 212 | filename, |
213 | 213 | self.linenum, |
214 | - u'If the rightmost subrule of concatenation "%s" is with ">", than all subrules must be with ">"' % str(self)) | |
214 | + 'If the rightmost subrule of concatenation "%s" is with ">", than all subrules must be with ">"' % str(self)) | |
215 | 215 | |
216 | 216 | class OrRule(ComplexRule): |
217 | 217 | |
... | ... | @@ -227,17 +227,17 @@ class OrRule(ComplexRule): |
227 | 227 | intermEndState.addTransition(None, endState) |
228 | 228 | |
229 | 229 | def allowsEmptySequence(self): |
230 | - return any(map(lambda rule: rule.allowsEmptySequence(), self.children)) | |
230 | + return any([rule.allowsEmptySequence() for rule in self.children]) | |
231 | 231 | |
232 | 232 | def __str__(self): |
233 | - return u' | '.join(map(lambda c: str(c), self.children)) | |
233 | + return ' | '.join([str(c) for c in self.children]) | |
234 | 234 | |
235 | 235 | def isShiftOrthRule(self): |
236 | - return all(map(lambda c: c.isShiftOrthRule(), self.children)) | |
236 | + return all([c.isShiftOrthRule() for c in self.children]) | |
237 | 237 | |
238 | 238 | def transformToGeneratorVersion(self): |
239 | 239 | newChildren = [child.transformToGeneratorVersion() for child in self.children if not child.allowsEmptySequence() or child.isShiftOrthRule()] |
240 | - newChildren = filter(lambda c: not c.isSinkRule(), newChildren) | |
240 | + newChildren = [c for c in newChildren if not c.isSinkRule()] | |
241 | 241 | if newChildren == []: |
242 | 242 | return SinkRule() |
243 | 243 | else: |
... | ... | @@ -255,12 +255,12 @@ class OrRule(ComplexRule): |
255 | 255 | for rule in self.children: |
256 | 256 | rule.validate(filename) |
257 | 257 | if not ( |
258 | - all(map(lambda c: c.isShiftOrthRule(), self.children)) | |
259 | - or not any(map(lambda c: c.isShiftOrthRule(), self.children))): | |
258 | + all([c.isShiftOrthRule() for c in self.children]) | |
259 | + or not any([c.isShiftOrthRule() for c in self.children])): | |
260 | 260 | raise ConfigFileException( |
261 | 261 | filename, |
262 | 262 | self.linenum, |
263 | - u'All subrules of alternative "%s" must be either with or without ">"' % str(self)) | |
263 | + 'All subrules of alternative "%s" must be either with or without ">"' % str(self)) | |
264 | 264 | |
265 | 265 | class ZeroOrMoreRule(UnaryRule): |
266 | 266 | |
... | ... | @@ -291,7 +291,7 @@ class ZeroOrMoreRule(UnaryRule): |
291 | 291 | return SinkRule() |
292 | 292 | |
293 | 293 | def __str__(self): |
294 | - return u'(' + str(self.child) + ')*' | |
294 | + return '(' + str(self.child) + ')*' | |
295 | 295 | |
296 | 296 | class OptionalRule(UnaryRule): |
297 | 297 | |
... | ... | @@ -321,7 +321,7 @@ class OptionalRule(UnaryRule): |
321 | 321 | return self.child.transformToGeneratorVersion() |
322 | 322 | |
323 | 323 | def __str__(self): |
324 | - return u'(' + str(self.child) + ')?' | |
324 | + return '(' + str(self.child) + ')?' | |
325 | 325 | |
326 | 326 | class SinkRule(SegmentRule): |
327 | 327 | |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
... | ... | @@ -49,7 +49,7 @@ class RulesFSA(object): |
49 | 49 | def transitionsData2bytearray(self, state): |
50 | 50 | res = bytearray() |
51 | 51 | # logging.debug('next') |
52 | - for (segnum, shiftOrth), nextState in sorted(state.transitionsMap.iteritems()): | |
52 | + for (segnum, shiftOrth), nextState in sorted(state.transitionsMap.items()): | |
53 | 53 | res.append(segnum) |
54 | 54 | if shiftOrth: |
55 | 55 | res.append(1) |
... | ... | @@ -57,8 +57,8 @@ class RulesFSA(object): |
57 | 57 | res.append(0) |
58 | 58 | offset = nextState.offset |
59 | 59 | exceptions.validate(offset <= MAX_FSA_SIZE, |
60 | - u'Segmentation rules are too big and complicated' \ | |
61 | - + u'- the resulting automaton would exceed its max size which is %d' \ | |
60 | + 'Segmentation rules are too big and complicated' \ | |
61 | + + '- the resulting automaton would exceed its max size which is %d' \ | |
62 | 62 | % MAX_FSA_SIZE) |
63 | 63 | res.extend(htons(offset)) |
64 | 64 | return res |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
... | ... | @@ -7,7 +7,7 @@ import logging |
7 | 7 | from morfeuszbuilder.utils.serializationUtils import htons, htonl |
8 | 8 | from morfeuszbuilder.utils import serializationUtils |
9 | 9 | from morfeuszbuilder.utils import exceptions |
10 | -import shiftOrthMagic | |
10 | +from . import shiftOrthMagic | |
11 | 11 | |
12 | 12 | class RulesManager(object): |
13 | 13 | |
... | ... | @@ -19,7 +19,7 @@ class RulesManager(object): |
19 | 19 | self.shiftOrthMagic = shiftOrthMagic.ShiftOrthMagic() |
20 | 20 | |
21 | 21 | def _options2Key(self, optionsMap): |
22 | - return frozenset(optionsMap.items()) | |
22 | + return frozenset(list(optionsMap.items())) | |
23 | 23 | |
24 | 24 | def _key2Options(self, optionsKey): |
25 | 25 | return dict(optionsKey) |
... | ... | @@ -46,9 +46,9 @@ class RulesManager(object): |
46 | 46 | dfasNum = len(self.options2DFA) |
47 | 47 | exceptions.validate( |
48 | 48 | dfasNum > 0 and dfasNum < 256, |
49 | - u'Too many segmentation rules variants') | |
49 | + 'Too many segmentation rules variants') | |
50 | 50 | res.append(dfasNum) |
51 | - for key, dfa in self.options2DFA.iteritems(): | |
51 | + for key, dfa in list(self.options2DFA.items()): | |
52 | 52 | optionsMap = self._key2Options(key) |
53 | 53 | res.extend(self._serializeOptionsMap(optionsMap)) |
54 | 54 | res.extend(self._serializeDFA(dfa)) |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
... | ... | @@ -41,16 +41,16 @@ class RulesNFAState(object): |
41 | 41 | if not self in visitedStates: |
42 | 42 | visitedStates.add(self) |
43 | 43 | yield self |
44 | - for _, nextStates in self.transitionsMap.iteritems(): | |
44 | + for _, nextStates in list(self.transitionsMap.items()): | |
45 | 45 | for state in nextStates: |
46 | 46 | for state1 in state.dfs(visitedStates): |
47 | 47 | yield state1 |
48 | 48 | |
49 | 49 | def debug(self): |
50 | - print '----------------' | |
51 | - print 'STATE:', self.idx | |
52 | - for label, nextStates in self.transitionsMap.iteritems(): | |
53 | - print label, '-->', [s.idx for s in sorted(nextStates, key=lambda s: s.idx)] | |
50 | + print('----------------') | |
51 | + print(('STATE:', self.idx)) | |
52 | + for label, nextStates in list(self.transitionsMap.items()): | |
53 | + print((label, '-->', [s.idx for s in sorted(nextStates, key=lambda s: s.idx)])) | |
54 | 54 | |
55 | 55 | class RulesNFA(object): |
56 | 56 | |
... | ... | @@ -60,7 +60,7 @@ class RulesNFA(object): |
60 | 60 | def _groupOutputByLabels(self, nfaStates): |
61 | 61 | res = {} |
62 | 62 | for nfaState in nfaStates: |
63 | - for label, nextStates in nfaState.transitionsMap.iteritems(): | |
63 | + for label, nextStates in list(nfaState.transitionsMap.items()): | |
64 | 64 | if label is not None: |
65 | 65 | # transitionData = nfaState.transitionsDataMap[label] |
66 | 66 | segnum, shiftOrth = label |
... | ... | @@ -70,27 +70,21 @@ class RulesNFA(object): |
70 | 70 | return res |
71 | 71 | |
72 | 72 | def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState): |
73 | - weakHits = map( | |
74 | - lambda state: state.weak, | |
75 | - filter( | |
76 | - lambda state: state.final and not state.autogenerated, | |
77 | - nfaStates)) | |
73 | + weakHits = [state.weak for state in [state for state in nfaStates if state.final and not state.autogenerated]] | |
78 | 74 | if not all(weakHits) \ |
79 | 75 | and any(weakHits): |
80 | - weakState = list(filter(lambda state: state.final and state.weak, nfaStates))[0] | |
81 | - nonWeakState = list(filter(lambda state: state.final and not state.weak, nfaStates))[0] | |
76 | + weakState = list([state for state in nfaStates if state.final and state.weak])[0] | |
77 | + nonWeakState = list([state for state in nfaStates if state.final and not state.weak])[0] | |
82 | 78 | raise InconsistentStateWeaknessException(weakState, nonWeakState) |
83 | - weak = any(map( | |
84 | - lambda state: state.weak and state.final, | |
85 | - filter(lambda state: not state.autogenerated, nfaStates))) | |
86 | - final = any(map(lambda state: state.final, nfaStates)) | |
79 | + weak = any([state.weak and state.final for state in [state for state in nfaStates if not state.autogenerated]]) | |
80 | + final = any([state.final for state in nfaStates]) | |
87 | 81 | # assert not weak or not final |
88 | 82 | if final: |
89 | 83 | # dfaState should be final |
90 | 84 | # and contain info about weakness |
91 | 85 | dfaState.setAsAccepting(weak=weak) |
92 | 86 | # dfaState.encodedData = bytearray([1 if weak else 0]) |
93 | - for (segnum, shiftOrth), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems(): | |
87 | + for (segnum, shiftOrth), nextNFAStates in list(self._groupOutputByLabels(nfaStates).items()): | |
94 | 88 | key = frozenset(nextNFAStates) |
95 | 89 | if key in nfaSubset2DFAState: |
96 | 90 | nextDFAState = nfaSubset2DFAState[key] |
... | ... | @@ -104,7 +98,7 @@ class RulesNFA(object): |
104 | 98 | def convertToDFA(self): |
105 | 99 | dfa = RulesFSA() |
106 | 100 | startStates = self.initialState.getClosure(set()) |
107 | - assert not any(filter(lambda s: s.final, startStates)) | |
101 | + assert not any([s for s in startStates if s.final]) | |
108 | 102 | dfa.initialState = RulesState() |
109 | 103 | self._doConvertState(dfa.initialState, startStates, {frozenset(startStates): dfa.initialState}) |
110 | 104 | return dfa |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
... | ... | @@ -28,11 +28,11 @@ class RulesParser(object): |
28 | 28 | key, defs = lineToParse.parseString(line) |
29 | 29 | res[key] = tuple(defs) |
30 | 30 | except Exception as ex: |
31 | - raise exceptions.ConfigFileException(segtypesConfigFile.filename, lineNum, u'Error in [options] section: %s' % str(ex)) | |
31 | + raise exceptions.ConfigFileException(segtypesConfigFile.filename, lineNum, 'Error in [options] section: %s' % str(ex)) | |
32 | 32 | return res |
33 | 33 | |
34 | 34 | def _key2DefAsKey(self, key2Def): |
35 | - return frozenset(key2Def.items()) | |
35 | + return frozenset(list(key2Def.items())) | |
36 | 36 | |
37 | 37 | def parse(self, filename): |
38 | 38 | |
... | ... | @@ -53,12 +53,12 @@ class RulesParser(object): |
53 | 53 | res = rulesManager.RulesManager(segtypesHelper, separatorsList) |
54 | 54 | |
55 | 55 | def2Key = {} |
56 | - for key, defs in key2Defs.iteritems(): | |
56 | + for key, defs in list(key2Defs.items()): | |
57 | 57 | for define in defs: |
58 | 58 | def2Key[define] = key |
59 | 59 | |
60 | 60 | resultsMap = {} |
61 | - for idx, defs in enumerate(itertools.product(*key2Defs.values())): | |
61 | + for idx, defs in enumerate(itertools.product(*list(key2Defs.values()))): | |
62 | 62 | key2Def = dict([(def2Key[define], define) for define in defs]) |
63 | 63 | currRes = [] |
64 | 64 | resultsMap[self._key2DefAsKey(key2Def)] = currRes |
... | ... | @@ -86,7 +86,7 @@ class RulesParser(object): |
86 | 86 | |
87 | 87 | self.doShiftOrthMagic(resultsMap, res) |
88 | 88 | |
89 | - for idx, defs in enumerate(itertools.product(*key2Defs.values())): | |
89 | + for idx, defs in enumerate(itertools.product(*list(key2Defs.values()))): | |
90 | 90 | key2Def = dict([(def2Key[define], define) for define in defs]) |
91 | 91 | |
92 | 92 | nfa = rulesNFA.RulesNFA() |
... | ... | @@ -115,20 +115,20 @@ class RulesParser(object): |
115 | 115 | |
116 | 116 | def _createNewTagRule(self, segtype, shiftOrth, lineNum, line, segtypesHelper): |
117 | 117 | if not segtypesHelper.hasSegtype(segtype): |
118 | - raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype)) | |
118 | + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid segment type: %s' % (line, segtype)) | |
119 | 119 | else: |
120 | 120 | # return rules.TagRule(segtype) |
121 | 121 | return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), shiftOrth, segtype, lineNum) |
122 | 122 | |
123 | 123 | def _createQuantRule1(self, child, quantity, lineNum, line, segtypesHelper): |
124 | 124 | if quantity <= 0: |
125 | - raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid quantity: %d' % (line, quantity)) | |
125 | + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantity: %d' % (line, quantity)) | |
126 | 126 | else: |
127 | 127 | return rules.ConcatRule(quantity * [child], lineNum) |
128 | 128 | |
129 | 129 | def _createQuantRule2(self, child, leftN, rightN, lineNum, line, segtypesHelper): |
130 | 130 | if leftN > rightN or (leftN, rightN) == (0, 0): |
131 | - raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid quantities: %d %d' % (line, leftN, rightN)) | |
131 | + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantities: %d %d' % (line, leftN, rightN)) | |
132 | 132 | elif leftN == 0: |
133 | 133 | children = [rules.OptionalRule(child, lineNum)] |
134 | 134 | for n in range(2, rightN + 1): |
... | ... | @@ -140,7 +140,7 @@ class RulesParser(object): |
140 | 140 | |
141 | 141 | def _createQuantRule3(self, child, quantity, lineNum, line, segtypesHelper): |
142 | 142 | if quantity <= 0: |
143 | - raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid quantity: %d' % (line, quantity)) | |
143 | + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantity: %d' % (line, quantity)) | |
144 | 144 | else: |
145 | 145 | return rules.ConcatRule( |
146 | 146 | [ |
... | ... | @@ -200,7 +200,7 @@ class RulesParser(object): |
200 | 200 | shiftOrthSegtypes = set() |
201 | 201 | nonShiftOrthSegtypes = set() |
202 | 202 | |
203 | - for _, rules in resultsMap.iteritems(): | |
203 | + for _, rules in list(resultsMap.items()): | |
204 | 204 | for rule in rules: |
205 | 205 | for atomicRule in rule.getAtomicRules(): |
206 | 206 | if atomicRule.shiftOrth: |
... | ... |
fsabuilder/morfeuszbuilder/segrules/shiftOrthMagic.py
... | ... | @@ -36,7 +36,7 @@ class ShiftOrthMagic(object): |
36 | 36 | for segtype in shiftOrthSegtypes - nonShiftOrthSegtypes: |
37 | 37 | self._onlyShiftSegnums.add(segtypesHelper.getSegnum4Segtype(segtype)) |
38 | 38 | |
39 | - for _, rules in resultsMap.iteritems(): | |
39 | + for _, rules in list(resultsMap.items()): | |
40 | 40 | for rule in rules: |
41 | 41 | for atomicRule in rule.getAtomicRules(): |
42 | 42 | if atomicRule.segnum in self._bothShiftAndNonShiftSegnums and atomicRule.shiftOrth: |
... | ... |
fsabuilder/morfeuszbuilder/segrules/test/parserTest.py
... | ... | @@ -12,18 +12,18 @@ from morfeuszbuilder.fsa import visualizer, serializer |
12 | 12 | class Test(unittest.TestCase): |
13 | 13 | |
14 | 14 | def testParser(self): |
15 | - print 'do test' | |
15 | + print('do test') | |
16 | 16 | t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset')) |
17 | 17 | parser = rulesParser.RulesParser(t) |
18 | 18 | rulesManager = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) |
19 | 19 | fsa = rulesManager.getDFA({'aggl': 'permissive', 'praet': 'split'}) |
20 | 20 | for s in fsa.dfs(): |
21 | 21 | s.debug() |
22 | - print 'states:', len(list(fsa.dfs())) | |
23 | - print 'transitions:', fsa.getTransitionsNum() | |
22 | + print(('states:', len(list(fsa.dfs())))) | |
23 | + print(('transitions:', fsa.getTransitionsNum())) | |
24 | 24 | visualizer.Visualizer().visualize(fsa, charLabels=False) |
25 | - print 'size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray())) | |
26 | - print 'done' | |
25 | + print(('size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray())))) | |
26 | + print('done') | |
27 | 27 | |
28 | 28 | if __name__ == "__main__": |
29 | 29 | unittest.main() |
... | ... |
fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py
... | ... | @@ -19,7 +19,7 @@ class Test(unittest.TestCase): |
19 | 19 | parsedFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types']) |
20 | 20 | linesEnum = parsedFile.enumerateLinesInSection('combinations') |
21 | 21 | for lineNum, line in preprocessor.preprocess(linesEnum, ['extra', 'superextra']): |
22 | - print (lineNum, line) | |
22 | + print((lineNum, line)) | |
23 | 23 | |
24 | 24 | |
25 | 25 | if __name__ == "__main__": |
... | ... |
fsabuilder/morfeuszbuilder/tagset/segtypes.py
... | ... | @@ -11,11 +11,11 @@ from morfeuszbuilder.utils import exceptions |
11 | 11 | def _getLemmaHomonymPair(lemma): |
12 | 12 | if lemma is None: |
13 | 13 | return (None, None) |
14 | - elif u':' in lemma: | |
15 | - if lemma.replace(u':', '') == '': | |
14 | + elif ':' in lemma: | |
15 | + if lemma.replace(':', '') == '': | |
16 | 16 | return (lemma, None) |
17 | 17 | else: |
18 | - return lemma.split(u':', 1) | |
18 | + return lemma.split(':', 1) | |
19 | 19 | else: |
20 | 20 | return (lemma, None) |
21 | 21 | |
... | ... | @@ -26,7 +26,7 @@ class Segtypes(object): |
26 | 26 | self.tagset = tagset |
27 | 27 | self.namesMap = namesMap |
28 | 28 | self.labelsMap = labelsMap |
29 | - self._reverseLabelsMap = dict([(v, k) for (k, v) in labelsMap.iteritems()]) | |
29 | + self._reverseLabelsMap = dict([(v, k) for (k, v) in list(labelsMap.items())]) | |
30 | 30 | |
31 | 31 | self.filename = segrulesConfigFile.filename |
32 | 32 | |
... | ... | @@ -59,13 +59,13 @@ class Segtypes(object): |
59 | 59 | |
60 | 60 | def _readSegtypes(self, segrulesConfigFile): |
61 | 61 | for lineNum, line in segrulesConfigFile.enumerateLinesInSection('segment types'): |
62 | - assert type(line) == unicode | |
62 | + assert type(line) == str | |
63 | 63 | self._validate( |
64 | - u'Segment type must be a single word', | |
64 | + 'Segment type must be a single word', | |
65 | 65 | lineNum, |
66 | 66 | re.match(r'^\w+$', line)) |
67 | 67 | self._validate( |
68 | - u'Segment type already defined: "%s"' % line, | |
68 | + 'Segment type already defined: "%s"' % line, | |
69 | 69 | lineNum, |
70 | 70 | line not in self.segtypes) |
71 | 71 | self.segtypes.append(line) |
... | ... | @@ -75,13 +75,13 @@ class Segtypes(object): |
75 | 75 | for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'): |
76 | 76 | self._parsePattern(lineNum, line, withLemma=False) |
77 | 77 | self._validate( |
78 | - u'Pattern that matches everything must be the last one', | |
78 | + 'Pattern that matches everything must be the last one', | |
79 | 79 | lineNum - 1, |
80 | 80 | not gotWildcardPattern) |
81 | 81 | gotWildcardPattern = gotWildcardPattern or self.patternsList[-1].isWildcardPattern() |
82 | 82 | |
83 | 83 | self._validate( |
84 | - u'There must be a pattern that matches everything at the end of [tags] section', | |
84 | + 'There must be a pattern that matches everything at the end of [tags] section', | |
85 | 85 | lineNum, |
86 | 86 | self.patternsList[-1].isWildcardPattern()) |
87 | 87 | |
... | ... | @@ -94,18 +94,18 @@ class Segtypes(object): |
94 | 94 | for f in fields: |
95 | 95 | match = re.match(r'(name|labels)=([\S]+)', f, re.U) |
96 | 96 | self._validate( |
97 | - u'invalid name or labels constraint: "%s"' % f, | |
97 | + 'invalid name or labels constraint: "%s"' % f, | |
98 | 98 | lineNum, |
99 | 99 | match) |
100 | 100 | key = match.group(1) |
101 | 101 | value = match.group(2) |
102 | 102 | self._validate( |
103 | - u'%s already specified' % key, | |
103 | + '%s already specified' % key, | |
104 | 104 | lineNum, |
105 | 105 | key not in res) |
106 | 106 | if key == 'labels': |
107 | 107 | if value: |
108 | - value = frozenset(value.split(u'|')) | |
108 | + value = frozenset(value.split('|')) | |
109 | 109 | else: |
110 | 110 | value = frozenset() |
111 | 111 | res[key] = value |
... | ... | @@ -115,7 +115,7 @@ class Segtypes(object): |
115 | 115 | split = re.split(r'\s+', line.strip()) |
116 | 116 | if withLemma: |
117 | 117 | self._validate( |
118 | - u'Line in [lexemes] section must contain 3 to 5 fields - segment type, lemma, tag pattern and optional constraints on name and labels', | |
118 | + 'Line in [lexemes] section must contain 3 to 5 fields - segment type, lemma, tag pattern and optional constraints on name and labels', | |
119 | 119 | lineNum, |
120 | 120 | len(split) in [3, 4, 5]) |
121 | 121 | segtype = split[0] |
... | ... | @@ -124,7 +124,7 @@ class Segtypes(object): |
124 | 124 | additionalConstraints = self._parseAdditionalConstraints(lineNum, split[3:]) |
125 | 125 | else: |
126 | 126 | self._validate( |
127 | - u'Line in [tags] section must contain 2 to 4 fields - segment type, tag pattern and optional constraints on name and labels', | |
127 | + 'Line in [tags] section must contain 2 to 4 fields - segment type, tag pattern and optional constraints on name and labels', | |
128 | 128 | lineNum, |
129 | 129 | len(split) in [2, 3, 4]) |
130 | 130 | segtype = split[0] |
... | ... | @@ -132,32 +132,32 @@ class Segtypes(object): |
132 | 132 | pattern = split[1] |
133 | 133 | additionalConstraints = self._parseAdditionalConstraints(lineNum, split[2:]) |
134 | 134 | self._validate( |
135 | - u'Undeclared segment type: "%s"' % segtype, | |
135 | + 'Undeclared segment type: "%s"' % segtype, | |
136 | 136 | lineNum, |
137 | 137 | segtype in self.segtypes) |
138 | 138 | segnum = self.segtypes.index(segtype) |
139 | 139 | |
140 | 140 | self._validate( |
141 | - u'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters', | |
141 | + 'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters', | |
142 | 142 | lineNum, |
143 | 143 | re.match(r'[a-z_\.\:\%]+', pattern)) |
144 | 144 | |
145 | 145 | segtypePattern = SegtypePattern( |
146 | 146 | lemma, |
147 | 147 | pattern, |
148 | - additionalConstraints.get('name', u''), | |
148 | + additionalConstraints.get('name', ''), | |
149 | 149 | additionalConstraints.get('labels', frozenset()), |
150 | 150 | segnum) |
151 | 151 | # print 'segtypePattern', repr(str(segtypePattern)) |
152 | 152 | self._validate( |
153 | - u'There is no tag that matches pattern "%s".' % (pattern), | |
153 | + 'There is no tag that matches pattern "%s".' % (pattern), | |
154 | 154 | lineNum, |
155 | 155 | any([segtypePattern.tryToMatchTag(tag) != -1 for tag in self.tagset.getAllTags()])) |
156 | 156 | self.patternsList.append(segtypePattern) |
157 | 157 | |
158 | 158 | def _getAllExistingLabelsnumCombinations(self, labels): |
159 | 159 | if labels: |
160 | - for labelsCombination, labelsnum in self.labelsMap.iteritems(): | |
160 | + for labelsCombination, labelsnum in list(self.labelsMap.items()): | |
161 | 161 | if labels <= labelsCombination: |
162 | 162 | yield labelsnum |
163 | 163 | else: |
... | ... | @@ -232,7 +232,7 @@ class SegtypePattern(object): |
232 | 232 | return -1 |
233 | 233 | |
234 | 234 | def isWildcardPattern(self): |
235 | - return (self.lemma, self.pattern, self.name, self.labels) == (None, '%', u'', frozenset()) | |
235 | + return (self.lemma, self.pattern, self.name, self.labels) == (None, '%', '', frozenset()) | |
236 | 236 | |
237 | 237 | def __str__(self): |
238 | - return u'%s %s %s %s -> %d' % (self.lemma, self.pattern, self.name, self.labels, self.segnum) | |
238 | + return '%s %s %s %s -> %d' % (self.lemma, self.pattern, self.name, self.labels, self.segnum) | |
... | ... |
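In Python 3 there is only one string type, str, and it is always Unicode, so the segtypes.py hunks above replace the vanished unicode name with str and drop the now-redundant u'' prefixes while leaving the validation logic untouched. A minimal sketch of the same single-word check, run on a made-up segment-type line rather than the builder's config file:

import re

line = 'samodz'                      # hypothetical segment-type name
assert isinstance(line, str)         # Python 3: str is always Unicode
assert re.match(r'^\w+$', line)      # same single-word validation as in the diff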
fsabuilder/morfeuszbuilder/tagset/tagset.py
... | ... | @@ -20,7 +20,7 @@ class Tagset(object): |
20 | 20 | #~ self._name2namenum = {} |
21 | 21 | if filename: |
22 | 22 | self._doInit(filename, encoding) |
23 | - self._tagnum2tag = dict(map(lambda (k, v): (v, k), self.tag2tagnum.iteritems())) | |
23 | + self._tagnum2tag = dict([(k_v[1], k_v[0]) for k_v in iter(list(self.tag2tagnum.items()))]) | |
24 | 24 | |
25 | 25 | def _doInit(self, filename, encoding): |
26 | 26 | insideTags = False |
... | ... | @@ -33,11 +33,11 @@ class Tagset(object): |
33 | 33 | self.tagsetId = match.group(1) |
34 | 34 | else: |
35 | 35 | raise FSABuilderException('missing TAGSET-ID in first line of tagset file') |
36 | - elif line == u'[TAGS]': | |
36 | + elif line == '[TAGS]': | |
37 | 37 | insideTags = True |
38 | 38 | #~ elif line == u'[NAMES]': |
39 | 39 | #~ addingTo = Tagset.NAMES |
40 | - elif line and not line.startswith(u'#'): | |
40 | + elif line and not line.startswith('#'): | |
41 | 41 | if not insideTags: |
42 | 42 | raise FSABuilderException('"%s" - text outside [TAGS] section in tagset file line %d' % (line, linenum)) |
43 | 43 | res = self.tag2tagnum |
... | ... | @@ -47,12 +47,12 @@ class Tagset(object): |
47 | 47 | tag = line.split(Tagset.SEP)[1] |
48 | 48 | if tag in res: |
49 | 49 | raise FSABuilderException('duplicate tag: "%s"' % tag) |
50 | - if int(tagNum) in res.values(): | |
50 | + if int(tagNum) in list(res.values()): | |
51 | 51 | raise FSABuilderException('line %d: tagId %d assigned for tag "%s" already appeared somewhere else.' % (linenum, int(tagNum), tag)) |
52 | 52 | res[tag] = int(tagNum) |
53 | 53 | |
54 | 54 | def getAllTags(self): |
55 | - return self.tag2tagnum.keys() | |
55 | + return list(self.tag2tagnum.keys()) | |
56 | 56 | |
57 | 57 | def getTagnum4Tag(self, tag): |
58 | 58 | if tag in self.tag2tagnum: |
... | ... |
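Two Python 3 changes drive the tagset.py hunks: lambda parameters can no longer unpack tuples (hence the rewrite of lambda (k, v): (v, k)), and keys()/values()/items() return views instead of lists. The machine-generated comprehension above is equivalent to the original inversion; written by hand it would usually be a plain dict comprehension, sketched here with invented tags:

tag2tagnum = {'adj:sg:nom': 0, 'subst:sg:nom:m1': 1}          # hypothetical tag map
tagnum2tag = {num: tag for tag, num in tag2tagnum.items()}    # no list()/iter() wrappers needed
assert tagnum2tag[1] == 'subst:sg:nom:m1'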
fsabuilder/morfeuszbuilder/utils/caseconv/generate.py
... | ... | @@ -90,7 +90,7 @@ def _serializeTable(table): |
90 | 90 | def _serializeExtendedTable(table): |
91 | 91 | res = [] |
92 | 92 | res.append('{') |
93 | - for code, targetCode in table.iteritems(): | |
93 | + for code, targetCode in list(table.items()): | |
94 | 94 | res.append('{') |
95 | 95 | res.append(str(code)) |
96 | 96 | res.append(',') |
... | ... |
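dict.iteritems() does not exist in Python 3; items() now returns a lightweight view that can be iterated directly, so the list() wrapper added above is only strictly needed when a real list is required (for example when the mapping is mutated inside the loop). A small illustration of the serialization idiom with dummy data, not the actual case-conversion table:

table = {0x41: 0x61, 0x42: 0x62}             # dummy code -> targetCode mapping
entries = []
for code, targetCode in table.items():       # iterating the view is enough
    entries.append('{%d,%d}' % (code, targetCode))
print('{' + ','.join(entries) + '}')         # -> {{65,97},{66,98}}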
fsabuilder/morfeuszbuilder/utils/configFile.py
... | ... | @@ -6,10 +6,10 @@ Created on 18 lut 2014 |
6 | 6 | |
7 | 7 | import re |
8 | 8 | import codecs |
9 | -import exceptions | |
9 | +from . import exceptions | |
10 | 10 | |
11 | 11 | def getHeaderValue(line, lineNum): |
12 | - m = re.match(ur'\s*\[(.*?)\]\s*(\#.*)?', line) | |
12 | + m = re.match(r'\s*\[(.*?)\]\s*(\#.*)?', line) | |
13 | 13 | if m: |
14 | 14 | return m.group(1) |
15 | 15 | else: |
... | ... | @@ -40,7 +40,7 @@ class ConfigFile(object): |
40 | 40 | self.section2Lines[self.currSection].append((lineNum, line)) |
41 | 41 | |
42 | 42 | def _getHeaderValue(self, line, lineNum): |
43 | - m = re.match(ur'\s*\[(.*?)\]\s*(\#.*)?', line) | |
43 | + m = re.match(r'\s*\[(.*?)\]\s*(\#.*)?', line) | |
44 | 44 | if m: |
45 | 45 | return m.group(1) |
46 | 46 | else: |
... | ... | @@ -48,7 +48,7 @@ class ConfigFile(object): |
48 | 48 | |
49 | 49 | def enumerateLinesInSection(self, sectionName, ignoreComments=True): |
50 | 50 | if sectionName not in self.section2Lines: |
51 | - raise exceptions.ConfigFileException(self.filename, None, u'Missing section: "%s"' % sectionName) | |
51 | + raise exceptions.ConfigFileException(self.filename, None, 'Missing section: "%s"' % sectionName) | |
52 | 52 | if not ignoreComments: |
53 | 53 | return self.section2Lines[sectionName] |
54 | 54 | else: |
... | ... |
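The configFile.py changes reflect two more Python 3 rules: implicit relative imports are gone, so a sibling module in the same package must be imported as from . import exceptions (or by its full package path), and the ur'' prefix is a syntax error, since raw literals are already Unicode and r'' suffices. A rough sketch of the header matching under Python 3, as a hypothetical standalone helper rather than the builder's method:

import re

def get_header_value(line):
    # r'' is enough in Python 3; ur'' would not even parse
    m = re.match(r'\s*\[(.*?)\]\s*(\#.*)?', line)
    return m.group(1) if m else None

assert get_header_value('[segment types]  # comment') == 'segment types'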
fsabuilder/morfeuszbuilder/utils/exceptions.py
... | ... | @@ -25,7 +25,7 @@ class SegtypesException(FSABuilderException): |
25 | 25 | self.msg = msg |
26 | 26 | |
27 | 27 | def __str__(self): |
28 | - return u'Error in segment rules: %s' % self.msg | |
28 | + return 'Error in segment rules: %s' % self.msg | |
29 | 29 | |
30 | 30 | class ConfigFileException(FSABuilderException): |
31 | 31 | |
... | ... | @@ -36,7 +36,7 @@ class ConfigFileException(FSABuilderException): |
36 | 36 | |
37 | 37 | def __str__(self): |
38 | 38 | if self.lineNum: |
39 | - return u'%s:%d - %s' % (self.filename, self.lineNum, self.msg) | |
39 | + return '%s:%d - %s' % (self.filename, self.lineNum, self.msg) | |
40 | 40 | else: |
41 | - return u'%s - %s' % (self.filename, self.msg) | |
41 | + return '%s - %s' % (self.filename, self.msg) | |
42 | 42 | |
... | ... |
fsabuilder/morfeuszbuilder/utils/extractTagset.py
... | ... | @@ -8,10 +8,10 @@ import sys |
8 | 8 | if __name__ == '__main__': |
9 | 9 | version = sys.argv[1] |
10 | 10 | res = set() |
11 | - print '#morfeusz-tagset', version | |
11 | + print('#morfeusz-tagset', version) | 
12 | 12 | for line in sys.stdin: |
13 | 13 | if line.strip(): |
14 | 14 | tag = line.split('\t')[2] |
15 | 15 | res.add(tag) |
16 | 16 | for idx, tag in enumerate(sorted(res)): |
17 | - print str(idx) + '\t' + tag | |
17 | + print(str(idx) + '\t' + tag) | 
... | ... |
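print is a statement in Python 2 but an ordinary function in Python 3, which is what the extractTagset.py hunk accounts for. The arguments go directly into the call; wrapping them in an extra pair of parentheses turns them into a single tuple argument, which prints as a tuple rather than as space-separated values. A quick sketch with an invented version string:

version = '20230101'                         # hypothetical tagset version
print('#morfeusz-tagset', version)           # -> #morfeusz-tagset 20230101
print(('#morfeusz-tagset', version))         # -> ('#morfeusz-tagset', '20230101'), a tuple, usually unintended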