From 95cbe5ea03398610d704f1ad1d51cc981f0aafea Mon Sep 17 00:00:00 2001
From: Marcin Woliński <wolinski@ipipan.waw.pl>
Date: Thu, 2 Jul 2020 20:51:35 +0200
Subject: [PATCH] morfeusz_builder → Python 3

---
 CMakeLists.txt                                               |  2 +-
 fsabuilder/buildanalyzer.sh                                  |  4 ++--
 fsabuilder/buildgenerator.sh                                 |  4 ++--
 fsabuilder/morfeusz_builder                                  | 56 ++++++++++++++++++++++++++++----------------------------
 fsabuilder/morfeuszbuilder/fsa/common.py                     | 18 +++++++++---------
 fsabuilder/morfeuszbuilder/fsa/convertinput.py               | 60 ++++++++++++++++++++++++++++++------------------------------
 fsabuilder/morfeuszbuilder/fsa/encode.py                     | 18 +++++++++---------
 fsabuilder/morfeuszbuilder/fsa/fsa.py                        | 10 +++++-----
 fsabuilder/morfeuszbuilder/fsa/serializer.py                 | 30 +++++++++++++++---------------
 fsabuilder/morfeuszbuilder/fsa/state.py                      | 14 +++++++-------
 fsabuilder/morfeuszbuilder/fsa/visualizer.py                 |  6 +++---
 fsabuilder/morfeuszbuilder/segrules/preprocessor.py          |  8 ++++----
 fsabuilder/morfeuszbuilder/segrules/pyparseString.py         |  2 +-
 fsabuilder/morfeuszbuilder/segrules/rules.py                 | 30 +++++++++++++++---------------
 fsabuilder/morfeuszbuilder/segrules/rulesFSA.py              |  6 +++---
 fsabuilder/morfeuszbuilder/segrules/rulesManager.py          |  8 ++++----
 fsabuilder/morfeuszbuilder/segrules/rulesNFA.py              | 32 +++++++++++++-------------------
 fsabuilder/morfeuszbuilder/segrules/rulesParser.py           | 20 ++++++++++----------
 fsabuilder/morfeuszbuilder/segrules/shiftOrthMagic.py        |  2 +-
 fsabuilder/morfeuszbuilder/segrules/test/parserTest.py       | 10 +++++-----
 fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py |  2 +-
 fsabuilder/morfeuszbuilder/tagset/segtypes.py                | 42 +++++++++++++++++++++---------------------
 fsabuilder/morfeuszbuilder/tagset/tagset.py                  | 10 +++++-----
 fsabuilder/morfeuszbuilder/utils/caseconv/generate.py        |  2 +-
 fsabuilder/morfeuszbuilder/utils/configFile.py               |  8 ++++----
 fsabuilder/morfeuszbuilder/utils/exceptions.py               |  6 +++---
 fsabuilder/morfeuszbuilder/utils/extractTagset.py            |  4 ++--
 27 files changed, 204 insertions(+), 210 deletions(-)
 mode change 100644 => 100755 fsabuilder/morfeusz_builder

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 821f79d..5473cd2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,7 +4,7 @@ project (Morfeusz)
 
 set (Morfeusz_VERSION_MAJOR 1)
 set (Morfeusz_VERSION_MINOR 9)
-set (Morfeusz_VERSION_PATCH 15)
+set (Morfeusz_VERSION_PATCH 16)
 set (Morfeusz_VERSION "${Morfeusz_VERSION_MAJOR}.${Morfeusz_VERSION_MINOR}.${Morfeusz_VERSION_PATCH}")
 set (Morfeusz_LIB_VERSION "${Morfeusz_VERSION}")
 if (BUILT_ON)
diff --git a/fsabuilder/buildanalyzer.sh b/fsabuilder/buildanalyzer.sh
index 9d36fdf..fa470b1 100755
--- a/fsabuilder/buildanalyzer.sh
+++ b/fsabuilder/buildanalyzer.sh
@@ -1,3 +1,3 @@
-#!/bin/bash
+#! /bin/bash
 
-python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V1 --trim-supneg -o $1
+python3 morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V1 --trim-supneg -o $1
diff --git a/fsabuilder/buildgenerator.sh b/fsabuilder/buildgenerator.sh
index 2f7f562..0e6f70c 100755
--- a/fsabuilder/buildgenerator.sh
+++ b/fsabuilder/buildgenerator.sh
@@ -1,6 +1,6 @@
-#!/bin/bash
+#! /bin/bash
 
-python morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \
+python3 morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \
     --tagset-file=../input/sgjp-morfeusz.tagset \
     --segments-file=../input/segmenty.dat \
     --generator \
diff --git a/fsabuilder/morfeusz_builder b/fsabuilder/morfeusz_builder
old mode 100644
new mode 100755
index 1b0b352..453ec28
--- a/fsabuilder/morfeusz_builder
+++ b/fsabuilder/morfeusz_builder
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#! /usr/bin/python3
 # -*- coding:utf-8 -*-
 '''
 Created on 21 paź 2013
@@ -20,13 +20,13 @@ from optparse import OptionParser
 
 def _checkOption(opt, parser, msg):
     if opt is None:
-        print >> sys.stderr, msg
+        print(msg, file=sys.stderr)
         parser.print_help()
        exit(1)
 
 def _checkCondition(cond, parser, msg):
     if not cond:
-        print >> sys.stderr, msg
+        print(msg, file=sys.stderr)
         parser.print_help()
         exit(1)
 
@@ -40,7 +40,7 @@ def _checkOpen(filename, mode):
         if 'w' in mode:
             os.remove(filename)
     except IOError as ex:
-        print >> sys.stderr, str(ex)
+        print(str(ex), file=sys.stderr)
         exit(1)
 
 def _getDictFilename(opts, isGenerator):
@@ -162,7 +162,7 @@ def _parseOptions():
         _checkOpen(_getDictFilename(opts, isGenerator=True), 'w')
 
     if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1]:
-        print >> sys.stderr, '--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1])+')'
+        print('--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1])+')', file=sys.stderr)
         parser.print_help()
         exit(1)
 
@@ -183,34 +183,34 @@ def _readDictIdAndCopyright(inputFiles):
         with codecs.open(inputFile, 'r', 'utf8') as f:
             inCopyright = False
             for linenum, line in enumerate(f, start=1):
-                if dictId is None and line.startswith(u'#!DICT-ID'):
-                    dictIdTag, _, dictId = line.strip().partition(u' ')
+                if dictId is None and line.startswith('#!DICT-ID'):
+                    dictIdTag, _, dictId = line.strip().partition(' ')
                     exceptions.validate(
-                        dictIdTag == u'#!DICT-ID',
-                        u'Dictionary ID tag must be followed by a space character and dictionary ID string')
+                        dictIdTag == '#!DICT-ID',
+                        'Dictionary ID tag must be followed by a space character and dictionary ID string')
                     exceptions.validate(
-                        len(line.split(u' ')) > 1,
-                        u'%s:%d: Must provide DICT-ID' % (inputFile, linenum))
+                        len(line.split(' ')) > 1,
+                        '%s:%d: Must provide DICT-ID' % (inputFile, linenum))
                     exceptions.validate(
-                        len(line.split(u' ')) == 2,
-                        u'%s:%d: DICT-ID must not contain spaces' % (inputFile, linenum))
-                elif copyright is None and line.startswith(u'#<COPYRIGHT>'):
+                        len(line.split(' ')) == 2,
+                        '%s:%d: DICT-ID must not contain spaces' % (inputFile, linenum))
+                elif copyright is None and line.startswith('#<COPYRIGHT>'):
                     exceptions.validate(
-                        line.strip() == u'#<COPYRIGHT>',
-                        u'%s:%d: Copyright start tag must be the only one in the line' % (inputFile, linenum))
+                        line.strip() == '#<COPYRIGHT>',
+                        '%s:%d: Copyright start tag must be the only one in the line' % (inputFile, linenum))
                     inCopyright = True
-                    copyright = u''
+                    copyright = ''
 
-                elif line.startswith(u'#</COPYRIGHT>'):
+                elif line.startswith('#</COPYRIGHT>'):
                     exceptions.validate(
                         inCopyright,
-                        u'%s:%d: Copyright end tag must be preceded by copyright start tag' % (inputFile, linenum))
+                        '%s:%d: Copyright end tag must be preceded by copyright start tag' % (inputFile, linenum))
                     exceptions.validate(
-                        line.strip() == u'#</COPYRIGHT>',
-                        u'%s:%d: Copyright end tag must be the only one in the line' % (inputFile, linenum))
+                        line.strip() == '#</COPYRIGHT>',
+                        '%s:%d: Copyright end tag must be the only one in the line' % (inputFile, linenum))
                     inCopyright = False
@@ -219,21 +219,21 @@ def _readDictIdAndCopyright(inputFiles):
                     copyright += line
 
     if dictId is None:
-        logging.warn(u'No dictionary ID tag found')
-        dictId = u''
+        logging.warn('No dictionary ID tag found')
+        dictId = ''
 
     if copyright is None:
-        logging.warn(u'No copyright info found')
-        copyright = u''
+        logging.warn('No copyright info found')
+        copyright = ''
 
     return (dictId, copyright)
 
 def _readNamesAndQualifiers(inputFiles):
-    names = set([u''])
+    names = set([''])
     qualifiers = set([frozenset()])
     lineParser = convertinput.LineParser()
     for line in _concatFiles(inputFiles):
-        line = line.strip().decode('utf8')
+        line = line.strip()
         if not lineParser.ignoreLine(line):
             _, _, _, name, qualifier = lineParser.parseLine(line)
             names.add(name)
@@ -242,7 +242,7 @@ def _readNamesAndQualifiers(inputFiles):
     qualifiersMap = dict([(quals, idx) for idx, quals in enumerate(sorted(qualifiers, key=lambda q: tuple(sorted(q))))])
     exceptions.validate(
         len(qualifiersMap) <= limits.MAX_QUALIFIERS_COMBINATIONS,
-        u'Too many qualifiers combinations. The limit is %d' % limits.MAX_QUALIFIERS_COMBINATIONS)
+        'Too many qualifiers combinations. The limit is %d' % limits.MAX_QUALIFIERS_COMBINATIONS)
     return namesMap, qualifiersMap
 
diff --git a/fsabuilder/morfeuszbuilder/fsa/common.py b/fsabuilder/morfeuszbuilder/fsa/common.py
index ba6a1d5..233fcb2 100644
--- a/fsabuilder/morfeuszbuilder/fsa/common.py
+++ b/fsabuilder/morfeuszbuilder/fsa/common.py
@@ -10,9 +10,9 @@ import logging
 class EncodedFormWithoutPrefix(object):
 
     def __init__(self, fromWord, targetWord, lowercase):
-        assert type(fromWord) == unicode
-        assert type(targetWord) == unicode
-        root = u''
+        assert type(fromWord) == str
+        assert type(targetWord) == str
+        root = ''
         for o, b in zip(fromWord, targetWord):
             if ((o.lower() == b.lower()) if lowercase else o == b):
                 root += b
@@ -26,8 +26,8 @@ class EncodedForm4Generator(object):
 
     def __init__(self, fromWord, targetWord):
-        assert type(fromWord) == unicode
-        assert type(targetWord) == unicode
+        assert type(fromWord) == str
+        assert type(targetWord) == str
         bestEncodedForm = None
         bestPrefixLength = -1
         for prefixLength in range(min(len(targetWord), 5)):
@@ -45,8 +45,8 @@ class EncodedForm4Analyzer(object):
 
     def __init__(self, fromWord, targetWord):
-        assert type(fromWord) == unicode
-        assert type(targetWord) == unicode
+        assert type(fromWord) == str
+        assert type(targetWord) == str
         bestEncodedForm = None
         bestPrefixCutLength = -1
         for prefixCutLength in range(min(len(fromWord), 5)):
@@ -123,7 +123,7 @@ class Interpretation4Generator(object):
         return hash(self.getSortKey())
 
     def __unicode__(self):
-        return u'<%s,(%d %s),%d,%d>' % (self.lemma.decode('utf8'), self.encodedForm.cutLength, self.encodedForm.suffixToAdd.decode('utf8'), self.tagnum, self.namenum)
+        return '<%s,(%d %s),%d,%d>' % (self.lemma, self.encodedForm.cutLength, self.encodedForm.suffixToAdd.decode('utf8'), self.tagnum, self.namenum)
 
     def __repr__(self):
-        return unicode(self)
+        return str(self)
diff --git a/fsabuilder/morfeuszbuilder/fsa/convertinput.py b/fsabuilder/morfeuszbuilder/fsa/convertinput.py
index c3c4269..f2feae3 100644
--- a/fsabuilder/morfeuszbuilder/fsa/convertinput.py
+++ b/fsabuilder/morfeuszbuilder/fsa/convertinput.py
@@ -4,7 +4,7 @@ Created on Oct 23, 2013
 @author: mlenart
 '''
 import logging
-from common import Interpretation4Analyzer
+from .common import Interpretation4Analyzer
 from morfeuszbuilder.fsa.common import Interpretation4Generator
 #from morfeuszbuilder.fsa import externalsort
@@ -36,24 +36,24 @@ class LineParser(object):
     def ignoreLine(self, line):
         if not line:
             return True
-        elif line.strip() == u'#<COPYRIGHT>':
+        elif line.strip() == '#<COPYRIGHT>':
             self.inCopyright = True
             return True
-        elif line.strip() == u'#</COPYRIGHT>':
+        elif line.strip() == '#</COPYRIGHT>':
             self.inCopyright = False
             return True
         elif self.inCopyright:
             return True
         elif line and not ' ' in ''.join(line.split('\t')[:2]):
             return False
-        elif line.startswith(u'#!DICT-ID'):
+        elif line.startswith('#!DICT-ID'):
             return True
         else:
-            logging.warn(u'Ignoring line: "%s" - contains space in text form or lemma' % (line.strip()))
+            logging.warn('Ignoring line: "%s" - contains space in text form or lemma' % (line.strip()))
             return True
 
     def parseLine(self, line):
-        splitLine = line.strip().split(u'\t')
+        splitLine = line.strip().split('\t')
         if len(splitLine) == 5:
             orth, base, tag, name, qualifier = splitLine
         elif len(splitLine) == 4:
@@ -69,7 +69,7 @@ class LineParser(object):
 
 def parseQualifiers(string):
     if string:
-        return frozenset(string.split(u'|'))
+        return frozenset(string.split('|'))
     else:
         return frozenset()
 
@@ -87,7 +87,7 @@ class PolimorfConverter4Analyzer(object):
     def _partiallyParseLines(self, inputLines):
         lineParser = LineParser()
         for line in inputLines:
-            line = line.decode(self.inputEncoding).strip('\n')
+            line = line.strip('\n')
             if not lineParser.ignoreLine(line):
                 orth, base, tag, name, qualifier = lineParser.parseLine(line)
@@ -106,8 +106,8 @@
                     base = orth
 
                 yield '\t'.join((
-                    orth.encode(self.inputEncoding),
-                    base.encode(self.inputEncoding),
+                    orth,
+                    base,
                     str(tagnum),
                     str(namenum),
                     str(typenum),
@@ -118,8 +118,8 @@
                     base = orth
                     typenum = self.segmentRulesManager.shiftOrthMagic.getNewSegnum4ShiftOrth(typenum)
                     yield '\t'.join((
-                        orth.encode(self.inputEncoding),
-                        base.encode(self.inputEncoding),
+                        orth,
+                        base,
                         str(tagnum),
                         str(namenum),
                         str(typenum),
@@ -127,14 +127,14 @@
     # input lines are encoded and partially parsed
     def _sortLines(self, inputLines):
-        return sorted(inputLines, key=lambda line: self.encoder.word2SortKey(line.split('\t')[0].decode('utf8')))
+        return sorted(inputLines, key=lambda line: self.encoder.word2SortKey(line.split('\t')[0]))
 #         return sorted(inputLines, key=lambda line: self.encoder.word2SortKey(line.split('\t')[0].decode('utf8')))
 
     def _reallyParseLines(self, inputLines):
         for line in inputLines:
-            line = line.decode(self.inputEncoding).strip(u'\n')
+            line = line.strip('\n')
             if line:
-                orth, base, tagnum, namenum, typenum, qualsnum = line.split(u'\t')
+                orth, base, tagnum, namenum, typenum, qualsnum = line.split('\t')
                 tagnum = int(tagnum)
                 namenum = int(namenum)
                 typenum = int(typenum)
@@ -159,14 +159,14 @@ class PolimorfConverter4Generator(object):
     def _partiallyParseLines(self, inputLines):
         lineParser = LineParser()
         for line in inputLines:
-            line = line.decode(self.inputEncoding).strip('\n')
+            line = line.strip('\n')
             if not lineParser.ignoreLine(line):
                 orth, base, tag, name, qualifier = lineParser.parseLine(line)
                 if base:
-                    homonymId = u''
-                    if u':' in base:
-                        assumedBase, assumedHomonymId = base.split(u':', 1)
-                        if assumedBase != u'' and assumedHomonymId != u'' and assumedHomonymId.isalnum():
+                    homonymId = ''
+                    if ':' in base:
+                        assumedBase, assumedHomonymId = base.split(':', 1)
+                        if assumedBase != '' and assumedHomonymId != '' and assumedHomonymId.isalnum():
                             base, homonymId = assumedBase, assumedHomonymId
                     tagnum = self.tagset.getTagnum4Tag(tag)
                     namenum = self.namesMap[name]
@@ -179,39 +179,39 @@
                     base = orth
 
                 yield '\t'.join((
-                    orth.encode(self.inputEncoding),
-                    base.encode(self.inputEncoding),
+                    orth,
+                    base,
                     str(tagnum),
                     str(namenum),
                     str(typenum),
-                    homonymId.encode(self.inputEncoding),
+                    homonymId,
                     str(qualsnum)))
 
                 if self.segmentRulesManager.shiftOrthMagic.getNewSegnum4ShiftOrth(typenum) != None:
                     base = orth
                     typenum = self.segmentRulesManager.shiftOrthMagic.getNewSegnum4ShiftOrth(typenum)
                     yield '\t'.join((
-                        orth.encode(self.inputEncoding),
-                        base.encode(self.inputEncoding),
+                        orth,
+                        base,
                         str(tagnum),
                         str(namenum),
                         str(typenum),
-                        homonymId.encode(self.inputEncoding),
+                        homonymId,
                         str(qualsnum)))
             else:
                 logging.warn('Ignoring line: "%s" - contains empty lemma', line.strip())
 
     # input lines are encoded and partially parsed
     def _sortLines(self, inputLines):
-        return sorted(inputLines, key=lambda line: (self.encoder.word2SortKey(line.split('\t')[1].decode('utf8')), line))
+        return sorted(inputLines, key=lambda line: (self.encoder.word2SortKey(line.split('\t')[1]), line))
 
     def _reallyParseLines(self, inputLines):
         prevLine = None
         for line in inputLines:
-            line = line.decode(self.inputEncoding).strip(u'\n')
+            line = line.strip('\n')
             if line and line != prevLine:
-                orth, base, tagnum, namenum, typenum, homonymId, qualsnum = line.split(u'\t')
-#                 print orth.encode('utf8'), base.encode('utf8'), homonymId
+                orth, base, tagnum, namenum, typenum, homonymId, qualsnum = line.split('\t')
+#                 print orth, base, homonymId
                 tagnum = int(tagnum)
                 namenum = int(namenum)
                 typenum = int(typenum)
diff --git a/fsabuilder/morfeuszbuilder/fsa/encode.py b/fsabuilder/morfeuszbuilder/fsa/encode.py
index 0313559..e47ca92 100644
--- a/fsabuilder/morfeuszbuilder/fsa/encode.py
+++ b/fsabuilder/morfeuszbuilder/fsa/encode.py
@@ -24,7 +24,7 @@ class Encoder(object):
 #~         self.qualifiersMap = { frozenset(): 0}
 
     def encodeWord(self, word, lowercase=True):
-        assert type(word) == unicode
+        assert type(word) == str
         res = bytearray(word.lower() if self.lowercase and lowercase else word, self.encoding)
         return res
 
@@ -35,16 +35,16 @@ class Encoder(object):
         return NotImplementedError()
 
     def decodeWord(self, rawWord):
-        return unicode(str(rawWord).strip('\x00'), self.encoding)
+        return str(str(rawWord).strip('\x00'), self.encoding)
 
     def word2SortKey(self, word):
         normalizedWord = word.lower() if self.lowercase else word
-        return normalizedWord.encode(self.encoding)
+        return normalizedWord
 
     def _encodeTypeNum(self, typenum):
         exceptions.validate(
             typenum <= limits.MAX_SEGMENT_TYPES,
-            u'Too many segment types. The limit is %d' % limits.MAX_SEGMENT_TYPES)
+            'Too many segment types. The limit is %d' % limits.MAX_SEGMENT_TYPES)
         return bytearray([typenum])
 
     def _hasUpperPrefix(self, casePattern):
@@ -62,13 +62,13 @@ class Encoder(object):
 
     def _encodeTagNum(self, tagnum):
         res = bytearray()
-        exceptions.validate(tagnum <= limits.MAX_TAGS, u'Too many tags. The limit is %d' % limits.MAX_TAGS)
+        exceptions.validate(tagnum <= limits.MAX_TAGS, 'Too many tags. The limit is %d' % limits.MAX_TAGS)
         res.append((tagnum & 0xFF00) >> 8)
         res.append(tagnum & 0x00FF)
         return res
 
     def _encodeNameNum(self, namenum):
-        exceptions.validate(namenum <= limits.MAX_NAMES, u'Too many named entity types. The limit is %d' % limits.MAX_NAMES)
+        exceptions.validate(namenum <= limits.MAX_NAMES, 'Too many named entity types. The limit is %d' % limits.MAX_NAMES)
         return bytearray([namenum])
 
     def _groupInterpsByType(self, interpsList):
@@ -86,7 +86,7 @@
 
         res = bytearray()
 
-        for typenum, interpsList in segnum2Interps.iteritems():
+        for typenum, interpsList in list(segnum2Interps.items()):
             res.extend(self._encodeInterps4Type(typenum, interpsList))
         del interpsList
 
@@ -135,10 +135,10 @@ class MorphEncoder(Encoder):
         return res
 
     def _casePatternsHaveOnlyLowercase(self, casePatterns):
-        return not any(map(lambda cp: cp and True in cp, casePatterns))
+        return not any([cp and True in cp for cp in casePatterns])
 
     def _casePatternsAreOnlyTitles(self, casePatterns):
-        return all(map(lambda cp: cp and cp[0] == True and not True in cp[1:], casePatterns))
+        return all([cp and cp[0] == True and not True in cp[1:] for cp in casePatterns])
 
     def _casePatternsAreEncodedInCompressByte(self, casePatterns):
         return self._casePatternsHaveOnlyLowercase(casePatterns) or self._casePatternsAreOnlyTitles(casePatterns)
diff --git a/fsabuilder/morfeuszbuilder/fsa/fsa.py b/fsabuilder/morfeuszbuilder/fsa/fsa.py
index 4e973fb..9b66a20 100644
--- a/fsabuilder/morfeuszbuilder/fsa/fsa.py
+++ b/fsabuilder/morfeuszbuilder/fsa/fsa.py
@@ -4,8 +4,8 @@ Created on Oct 8, 2013
 @author: mlenart
 '''
 
-import state
-import register
+from . import state
+from . import register
 import logging
 from morfeuszbuilder.utils import exceptions
@@ -35,7 +35,7 @@ class FSA(object):
         assert not self.closed
         assert data is not None
         encodedWord = self.encodeWord(word)
-        assert encodedWord > self.encodedPrevWord
+        assert self.encodedPrevWord is None or encodedWord > self.encodedPrevWord
         self._addSorted(encodedWord, self.encodeData(data))
         self.encodedPrevWord = encodedWord
@@ -43,7 +43,7 @@ class FSA(object):
 
         # debug
         if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0:
-            logging.info(u'%d %s' % (self.n, word))
+            logging.info('%d %s' % (self.n, word))
         for label in encodedWord:
             self.label2Freq[label] = self.label2Freq.get(label, 0) + 1
@@ -78,7 +78,7 @@ class FSA(object):
         return res
 
     def _addSorted(self, encodedWord, data):
-        assert self.encodedPrevWord < encodedWord
+        assert self.encodedPrevWord is None or self.encodedPrevWord < encodedWord
         assert type(data) == bytearray
         q = self.initialState
         i = 0
diff --git a/fsabuilder/morfeuszbuilder/fsa/serializer.py b/fsabuilder/morfeuszbuilder/fsa/serializer.py
index 8510c64..c34238d 100644
--- a/fsabuilder/morfeuszbuilder/fsa/serializer.py
+++ b/fsabuilder/morfeuszbuilder/fsa/serializer.py
@@ -5,7 +5,7 @@ Created on Oct 20, 2013
 '''
 
 import logging
-from state import State
+from .state import State
 from morfeuszbuilder.utils import limits, exceptions
 from morfeuszbuilder.utils.serializationUtils import *
@@ -106,7 +106,7 @@ class Serializer(object):
         res = bytearray()
         numOfTags = len(tagsMap)
         res.extend(htons(numOfTags))
-        for tag, tagnum in sorted(tagsMap.iteritems(), key=lambda (tag, tagnum): tagnum):
+        for tag, tagnum in sorted(iter(list(tagsMap.items())), key=lambda tag_tagnum: tag_tagnum[1]):
             res.extend(htons(tagnum))
             res.extend(self.fsa.encodeWord(tag))
             res.append(0)
@@ -121,7 +121,7 @@ class Serializer(object):
 #~         return res
 
     def serializeQualifiersMap(self):
-        label2labelId = dict([ (u'|'.join(qualifiers), n) for qualifiers, n in sorted(self.qualifiersMap.iteritems(), key=lambda (qs, n): n) ])
+        label2labelId = dict([ ('|'.join(sorted(qualifiers)), n) for qualifiers, n in sorted(iter(list(self.qualifiersMap.items())), key=lambda qs_n: qs_n[1]) ])
         return self._serializeTags(label2labelId)
 #~         res = bytearray()
 #~         res.extend(htons(len(self.qualifiersMap)))
@@ -186,9 +186,9 @@ class Serializer(object):
         return res
 
     def getSortedTransitions(self, state):
-        defaultKey = lambda (label, nextState): (-state.label2Freq.get(label, 0), -self.fsa.label2Freq.get(label, 0))
+        defaultKey = lambda label_nextState: (-state.label2Freq.get(label_nextState[0], 0), -self.fsa.label2Freq.get(label_nextState[0], 0))
         return list(sorted(
-            state.transitionsMap.iteritems(),
+            iter(list(state.transitionsMap.items())),
             key=defaultKey))
 
     def stateData2bytearray(self, state):
@@ -215,9 +215,9 @@ class SimpleSerializer(Serializer):
 
     def getStateSize(self, state):
         if self.serializeTransitionsData:
-            return 1 + 5 * len(state.transitionsMap.keys()) + self.getDataSize(state)
+            return 1 + 5 * len(list(state.transitionsMap.keys())) + self.getDataSize(state)
         else:
-            return 1 + 4 * len(state.transitionsMap.keys()) + self.getDataSize(state)
+            return 1 + 4 * len(list(state.transitionsMap.keys())) + self.getDataSize(state)
 
     def getDataSize(self, state):
         return len(state.encodedData) if state.isAccepting() else 0
@@ -270,12 +270,12 @@ class VLengthSerializer1(Serializer):
         res = bytearray()
 
         # labels sorted by popularity
-        sortedLabels = [label for (label, freq) in sorted(self.fsa.label2Freq.iteritems(), key=lambda (label, freq): (-freq, label))]
+        sortedLabels = [label for (label, freq) in sorted(iter(list(self.fsa.label2Freq.items())), key=lambda label_freq: (-label_freq[1], label_freq[0]))]
 
         # popular labels table
         self.label2ShortLabel = dict([(label, sortedLabels.index(label) + 1) for label in sortedLabels[:63]])
 
-        logging.debug(dict([(chr(label), shortLabel) for label, shortLabel in self.label2ShortLabel.items()]))
+        logging.debug(dict([(chr(label), shortLabel) for label, shortLabel in list(self.label2ShortLabel.items())]))
 
         # write remaining short labels (zeros)
         for label in range(256):
@@ -354,7 +354,7 @@ class VLengthSerializer1(Serializer):
                 offsetSize += 1
             exceptions.validate(
                 offset < 256 * 256 * 256,
-                u'Cannot build the automaton - it would exceed its max size which is %d' % (256 * 256 * 256))
+                'Cannot build the automaton - it would exceed its max size which is %d' % (256 * 256 * 256))
 #             assert offset < 256 * 256 * 256 # TODO - przerobic na jakis porzadny wyjatek
             assert offsetSize <= 3
             firstByte |= offsetSize
@@ -380,7 +380,7 @@ class VLengthSerializer1(Serializer):
         newState.encodedData = state.encodedData
         newState.reverseOffset = state.reverseOffset
         newState.offset = state.offset
-        newState.transitionsMap = dict([(label, nextState) for (label, nextState) in state.transitionsMap.iteritems()])
+        newState.transitionsMap = dict([(label, nextState) for (label, nextState) in list(state.transitionsMap.items())])
 #         newState.transitionsMap = dict([(label, nextState) for (label, nextState) in state.transitionsMap.iteritems() if not label in self.label2ShortLabel or not self.label2ShortLabel[label] in range(1,64)])
         newState.serializeAsArray = False
         return newState
@@ -388,12 +388,12 @@ class VLengthSerializer1(Serializer):
     def _transitions2ArrayBytes(self, state):
         res = bytearray()
         array = [0] * 64
-        for label, nextState in state.transitionsMap.iteritems():
+        for label, nextState in list(state.transitionsMap.items()):
             if label in self.label2ShortLabel:
                 shortLabel = self.label2ShortLabel[label]
                 array[shortLabel] = nextState.offset
         logging.debug(array)
-        for offset in map(lambda x: x if x else 0, array):
+        for offset in [x if x else 0 for x in array]:
             res.append(0)
             res.append((offset & 0xFF0000) >> 16)
             res.append((offset & 0x00FF00) >> 8)
@@ -409,8 +409,8 @@ class VLengthSerializer1(Serializer):
             return self._transitions2ListBytes(state)
 
     def _chooseArrayStates(self):
-        for state1 in self.fsa.initialState.transitionsMap.values():
-            for state2 in state1.transitionsMap.values():
+        for state1 in list(self.fsa.initialState.transitionsMap.values()):
+            for state2 in list(state1.transitionsMap.values()):
 #                 for state3 in state2.transitionsMap.values():
 #                     state3.serializeAsArray = True
                 state2.serializeAsArray = True
diff --git a/fsabuilder/morfeuszbuilder/fsa/state.py b/fsabuilder/morfeuszbuilder/fsa/state.py
index 7a306dc..37e7583 100644
--- a/fsabuilder/morfeuszbuilder/fsa/state.py
+++ b/fsabuilder/morfeuszbuilder/fsa/state.py
@@ -45,7 +45,7 @@ class State(object):
         return self.transitionsMap.get(byte, None)
 
     def getRegisterKey(self):
-        return ( frozenset(self.transitionsMap.iteritems()), tuple(self.encodedData) if self.encodedData else None )
+        return ( frozenset(iter(list(self.transitionsMap.items()))), tuple(self.encodedData) if self.encodedData else None )
 
     def isAccepting(self):
         return self.encodedData is not None
@@ -60,10 +60,10 @@ class State(object):
         else:
             return self.encodedData
 
-    def dfs(self, alreadyVisited, sortKey=lambda (_, state): -state.freq):
+    def dfs(self, alreadyVisited, sortKey=lambda __state: -__state[1].freq):
         if not self in alreadyVisited:
             alreadyVisited.add(self)
-            for _, state in sorted(self.transitionsMap.iteritems(), key=sortKey):
+            for _, state in sorted(iter(list(self.transitionsMap.items())), key=sortKey):
                 for state1 in state.dfs(alreadyVisited):
                     yield state1
             yield self
@@ -77,7 +77,7 @@ class State(object):
             state.offset = currReverseOffset - state.reverseOffset
 
     def debug(self):
-        print '----------------'
-        print 'STATE:', self.idx, 'accepting', self.isAccepting()
-        for label, s in self.transitionsMap.iteritems():
-            print label, '-->', s.idx
+        print('----------------')
+        print(('STATE:', self.idx, 'accepting', self.isAccepting()))
+        for label, s in list(self.transitionsMap.items()):
+            print((label, '-->', s.idx))
diff --git a/fsabuilder/morfeuszbuilder/fsa/visualizer.py b/fsabuilder/morfeuszbuilder/fsa/visualizer.py
index bb40fc4..7a2acef 100644
--- a/fsabuilder/morfeuszbuilder/fsa/visualizer.py
+++ b/fsabuilder/morfeuszbuilder/fsa/visualizer.py
@@ -19,7 +19,7 @@ class Visualizer(object):
         nodeLabelsMap = {}
         for idx, state in enumerate(allStates):
             G.add_node(idx, offset=state.offset)
-            for c, targetState in state.transitionsMap.iteritems():
+            for c, targetState in list(state.transitionsMap.items()):
                 G.add_edge(idx, allStates.index(targetState))
                 label = (chr(c) if c <= 127 else '%') if charLabels \
                     else c
@@ -37,11 +37,11 @@ class Visualizer(object):
                                nodelist=list([allStates.index(s) for s in allStates if s.isAccepting()]),
                                node_shape='s')
 #        nx.draw_networkx_nodes(G, pos, nodelist=list([allStates.index(s) for s in allStates if s.isFinal()])), )
-        nx.draw_networkx_edges(G, pos, edgelist=edgeLabelsMap.keys())
+        nx.draw_networkx_edges(G, pos, edgelist=list(edgeLabelsMap.keys()))
         nx.draw_networkx_edge_labels(G, pos, edge_labels = edgeLabelsMap)
         nx.draw_networkx_labels(G, pos, labels=nodeLabelsMap)
         plt.axis('off')
         plt.draw()
         plt.show()
 #        plt.savefig('filename.png')
-        print 'done'
+        print('done')
diff --git a/fsabuilder/morfeuszbuilder/segrules/preprocessor.py b/fsabuilder/morfeuszbuilder/segrules/preprocessor.py
index 1fd336e..339de1b 100644
--- a/fsabuilder/morfeuszbuilder/segrules/preprocessor.py
+++ b/fsabuilder/morfeuszbuilder/segrules/preprocessor.py
@@ -7,10 +7,10 @@ Created on 23 sty 2014
 import re
 from pyparsing import *
 from morfeuszbuilder.utils import exceptions
-from pyparseString import pyparseString
+from .pyparseString import pyparseString
 
-identifier = Word(alphas, bodyChars=alphanums+u'_>*+{},')
-define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + Word(alphas, bodyChars=alphanums+u'_') + Suppress(')')) + restOfLine + LineEnd() + StringEnd()
+identifier = Word(alphas, bodyChars=alphanums+'_>*+{},')
+define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + Word(alphas, bodyChars=alphanums+'_') + Suppress(')')) + restOfLine + LineEnd() + StringEnd()
 ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd()
 endif = Keyword('#endif').suppress() + LineEnd() + StringEnd()
 
@@ -107,5 +107,5 @@ def preprocess(inputLines, defs, filename):
             ifdefsStack.pop()
         elif line.startswith('#'):
             yield lineNum, line
-        elif len(ifdefsStack) == 0 or all(map(lambda (name, isActive): (name in defs and isActive) or (name not in defs and not isActive), ifdefsStack)):
+        elif len(ifdefsStack) == 0 or all([(name_isActive[0] in defs and name_isActive[1]) or (name_isActive[0] not in defs and not name_isActive[1]) for name_isActive in ifdefsStack]):
             yield lineNum, _processLine(lineNum, line, defines, filename)
diff --git a/fsabuilder/morfeuszbuilder/segrules/pyparseString.py b/fsabuilder/morfeuszbuilder/segrules/pyparseString.py
index e999f6c..30e9a08 100644
--- a/fsabuilder/morfeuszbuilder/segrules/pyparseString.py
+++ b/fsabuilder/morfeuszbuilder/segrules/pyparseString.py
@@ -11,7 +11,7 @@ def pyparseString(rule, lineNum, line, filename):
     try:
         return rule.parseString(line, parseAll=True)
     except ParseException as ex:
-        msg = u'%s:%d - Preprocessing of segmentation rules failed.\n' % (filename, lineNum)
+        msg = '%s:%d - Preprocessing of segmentation rules failed.\n' % (filename, lineNum)
         msg += line + '\n'
         msg += (ex.col - 1) * ' ' + '^\n'
         msg += ex.msg
diff --git a/fsabuilder/morfeuszbuilder/segrules/rules.py b/fsabuilder/morfeuszbuilder/segrules/rules.py
index 5267ecb..37ce0b8 100644
--- a/fsabuilder/morfeuszbuilder/segrules/rules.py
+++ b/fsabuilder/morfeuszbuilder/segrules/rules.py
@@ -126,7 +126,7 @@ class ComplexRule(SegmentRule):
     def __init__(self, children, linenum):
         super(ComplexRule, self).__init__(linenum)
         self.children = children
-        assert not any(map(lambda c: c.isSinkRule(), children))
+        assert not any([c.isSinkRule() for c in children])
 
     def addToNFA(self, fsa):
         endState = RulesNFAState(self, final=True, weak=self.weak, autogenerated=self.autogenerated)
@@ -159,13 +159,13 @@ class ConcatRule(ComplexRule):
         lastChild._doAddToNFA(currStartState, endState)
 
     def allowsEmptySequence(self):
-        return all(map(lambda rule: rule.allowsEmptySequence(), self.children))
+        return all([rule.allowsEmptySequence() for rule in self.children])
 
     def __str__(self):
-        return u' '.join(map(lambda c: str(c), self.children))
+        return ' '.join([str(c) for c in self.children])
 
     def isShiftOrthRule(self):
-        return all(map(lambda c: c.isShiftOrthRule(), self.children))
+        return all([c.isShiftOrthRule() for c in self.children])
 
     def transformToGeneratorVersion(self):
         newChildren = [child.transformToGeneratorVersion() for child in self.children if not child.allowsEmptySequence() or child.isShiftOrthRule()]
@@ -207,11 +207,11 @@ class ConcatRule(ComplexRule):
         for rule in self.children:
             rule.validate(filename)
         if self.children[-1].isShiftOrthRule() \
-        and not all(map(lambda c: c.isShiftOrthRule(), self.children)):
+        and not all([c.isShiftOrthRule() for c in self.children]):
             raise ConfigFileException(
                 filename,
                 self.linenum,
-                u'If the rightmost subrule of concatenation "%s" is with ">", than all subrules must be with ">"' % str(self))
+                'If the rightmost subrule of concatenation "%s" is with ">", than all subrules must be with ">"' % str(self))
 
 class OrRule(ComplexRule):
 
@@ -227,17 +227,17 @@ class OrRule(ComplexRule):
             intermEndState.addTransition(None, endState)
 
     def allowsEmptySequence(self):
-        return any(map(lambda rule: rule.allowsEmptySequence(), self.children))
+        return any([rule.allowsEmptySequence() for rule in self.children])
 
     def __str__(self):
-        return u' | '.join(map(lambda c: str(c), self.children))
+        return ' | '.join([str(c) for c in self.children])
 
     def isShiftOrthRule(self):
-        return all(map(lambda c: c.isShiftOrthRule(), self.children))
+        return all([c.isShiftOrthRule() for c in self.children])
 
     def transformToGeneratorVersion(self):
         newChildren = [child.transformToGeneratorVersion() for child in self.children if not child.allowsEmptySequence() or child.isShiftOrthRule()]
-        newChildren = filter(lambda c: not c.isSinkRule(), newChildren)
+        newChildren = [c for c in newChildren if not c.isSinkRule()]
         if newChildren == []:
             return SinkRule()
         else:
@@ -255,12 +255,12 @@ class OrRule(ComplexRule):
         for rule in self.children:
             rule.validate(filename)
         if not (
-                all(map(lambda c: c.isShiftOrthRule(), self.children))
-                or not any(map(lambda c: c.isShiftOrthRule(), self.children))):
+                all([c.isShiftOrthRule() for c in self.children])
+                or not any([c.isShiftOrthRule() for c in self.children])):
             raise ConfigFileException(
                 filename,
                 self.linenum,
-                u'All subrules of alternative "%s" must be either with or without ">"' % str(self))
+                'All subrules of alternative "%s" must be either with or without ">"' % str(self))
 
 class ZeroOrMoreRule(UnaryRule):
 
@@ -291,7 +291,7 @@ class ZeroOrMoreRule(UnaryRule):
         return SinkRule()
 
     def __str__(self):
-        return u'(' + str(self.child) + ')*'
+        return '(' + str(self.child) + ')*'
 
 class OptionalRule(UnaryRule):
 
@@ -321,7 +321,7 @@ class OptionalRule(UnaryRule):
         return self.child.transformToGeneratorVersion()
 
     def __str__(self):
-        return u'(' + str(self.child) + ')?'
+        return '(' + str(self.child) + ')?'
 
 class SinkRule(SegmentRule):
diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py b/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
index bb0ba9d..40f50a0 100644
--- a/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
+++ b/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
@@ -49,7 +49,7 @@ class RulesFSA(object):
     def transitionsData2bytearray(self, state):
         res = bytearray()
 #         logging.debug('next')
-        for (segnum, shiftOrth), nextState in sorted(state.transitionsMap.iteritems()):
+        for (segnum, shiftOrth), nextState in sorted(state.transitionsMap.items()):
             res.append(segnum)
             if shiftOrth:
                 res.append(1)
@@ -57,8 +57,8 @@ class RulesFSA(object):
                 res.append(0)
             offset = nextState.offset
             exceptions.validate(offset <= MAX_FSA_SIZE,
-                                u'Segmentation rules are too big and complicated' \
-                                + u'- the resulting automaton would exceed its max size which is %d' \
+                                'Segmentation rules are too big and complicated' \
+                                + '- the resulting automaton would exceed its max size which is %d' \
                                 % MAX_FSA_SIZE)
             res.extend(htons(offset))
         return res
diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesManager.py b/fsabuilder/morfeuszbuilder/segrules/rulesManager.py
index 182e53a..2f74792 100644
--- a/fsabuilder/morfeuszbuilder/segrules/rulesManager.py
+++ b/fsabuilder/morfeuszbuilder/segrules/rulesManager.py
@@ -7,7 +7,7 @@ import logging
 from morfeuszbuilder.utils.serializationUtils import htons, htonl
 from morfeuszbuilder.utils import serializationUtils
 from morfeuszbuilder.utils import exceptions
-import shiftOrthMagic
+from . import shiftOrthMagic
 
 class RulesManager(object):
 
@@ -19,7 +19,7 @@ class RulesManager(object):
         self.shiftOrthMagic = shiftOrthMagic.ShiftOrthMagic()
 
     def _options2Key(self, optionsMap):
-        return frozenset(optionsMap.items())
+        return frozenset(list(optionsMap.items()))
 
     def _key2Options(self, optionsKey):
         return dict(optionsKey)
@@ -46,9 +46,9 @@ class RulesManager(object):
         dfasNum = len(self.options2DFA)
         exceptions.validate(
             dfasNum > 0 and dfasNum < 256,
-            u'Too many segmentation rules variants')
+            'Too many segmentation rules variants')
         res.append(dfasNum)
-        for key, dfa in self.options2DFA.iteritems():
+        for key, dfa in list(self.options2DFA.items()):
             optionsMap = self._key2Options(key)
             res.extend(self._serializeOptionsMap(optionsMap))
             res.extend(self._serializeDFA(dfa))
diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py b/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
index 5862fc7..6385f97 100644
--- a/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
+++ b/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
@@ -41,16 +41,16 @@ class RulesNFAState(object):
         if not self in visitedStates:
             visitedStates.add(self)
             yield self
-            for _, nextStates in self.transitionsMap.iteritems():
+            for _, nextStates in list(self.transitionsMap.items()):
                 for state in nextStates:
                     for state1 in state.dfs(visitedStates):
                         yield state1
 
     def debug(self):
-        print '----------------'
-        print 'STATE:', self.idx
-        for label, nextStates in self.transitionsMap.iteritems():
-            print label, '-->', [s.idx for s in sorted(nextStates, key=lambda s: s.idx)]
+        print('----------------')
+        print(('STATE:', self.idx))
+        for label, nextStates in list(self.transitionsMap.items()):
+            print((label, '-->', [s.idx for s in sorted(nextStates, key=lambda s: s.idx)]))
 
 class RulesNFA(object):
 
@@ -60,7 +60,7 @@ class RulesNFA(object):
     def _groupOutputByLabels(self, nfaStates):
         res = {}
         for nfaState in nfaStates:
-            for label, nextStates in nfaState.transitionsMap.iteritems():
+            for label, nextStates in list(nfaState.transitionsMap.items()):
                 if label is not None:
 #                     transitionData = nfaState.transitionsDataMap[label]
                     segnum, shiftOrth = label
@@ -70,27 +70,21 @@ class RulesNFA(object):
         return res
 
     def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState):
-        weakHits = map(
-            lambda state: state.weak,
-            filter(
-                lambda state: state.final and not state.autogenerated,
-                nfaStates))
+        weakHits = [state.weak for state in [state for state in nfaStates if state.final and not state.autogenerated]]
         if not all(weakHits) \
         and any(weakHits):
-            weakState = list(filter(lambda state: state.final and state.weak, nfaStates))[0]
-            nonWeakState = list(filter(lambda state: state.final and not state.weak, nfaStates))[0]
+            weakState = list([state for state in nfaStates if state.final and state.weak])[0]
+            nonWeakState = list([state for state in nfaStates if state.final and not state.weak])[0]
             raise InconsistentStateWeaknessException(weakState, nonWeakState)
-        weak = any(map(
-            lambda state: state.weak and state.final,
-            filter(lambda state: not state.autogenerated, nfaStates)))
-        final = any(map(lambda state: state.final, nfaStates))
+        weak = any([state.weak and state.final for state in [state for state in nfaStates if not state.autogenerated]])
+        final = any([state.final for state in nfaStates])
 #         assert not weak or not final
         if final:
             # dfaState should be final
             # and contain info about weakness
             dfaState.setAsAccepting(weak=weak)
 #             dfaState.encodedData = bytearray([1 if weak else 0])
-        for (segnum, shiftOrth), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems():
+        for (segnum, shiftOrth), nextNFAStates in list(self._groupOutputByLabels(nfaStates).items()):
             key = frozenset(nextNFAStates)
             if key in nfaSubset2DFAState:
                 nextDFAState = nfaSubset2DFAState[key]
@@ -104,7 +98,7 @@ class RulesNFA(object):
     def convertToDFA(self):
         dfa = RulesFSA()
         startStates = self.initialState.getClosure(set())
-        assert not any(filter(lambda s: s.final, startStates))
+        assert not any([s for s in startStates if s.final])
         dfa.initialState = RulesState()
         self._doConvertState(dfa.initialState, startStates, {frozenset(startStates): dfa.initialState})
         return dfa
diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py
index 6e00b00..360708d 100644
--- a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py
+++ b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py
@@ -28,11 +28,11 @@ class RulesParser(object):
                 key, defs = lineToParse.parseString(line)
                 res[key] = tuple(defs)
             except Exception as ex:
-                raise exceptions.ConfigFileException(segtypesConfigFile.filename, lineNum, u'Error in [options] section: %s' % str(ex))
+                raise exceptions.ConfigFileException(segtypesConfigFile.filename, lineNum, 'Error in [options] section: %s' % str(ex))
         return res
 
     def _key2DefAsKey(self, key2Def):
-        return frozenset(key2Def.items())
+        return frozenset(list(key2Def.items()))
 
     def parse(self, filename):
 
@@ -53,12 +53,12 @@ class RulesParser(object):
         res = rulesManager.RulesManager(segtypesHelper, separatorsList)
 
         def2Key = {}
-        for key, defs in key2Defs.iteritems():
+        for key, defs in list(key2Defs.items()):
             for define in defs:
                 def2Key[define] = key
 
         resultsMap = {}
-        for idx, defs in enumerate(itertools.product(*key2Defs.values())):
+        for idx, defs in enumerate(itertools.product(*list(key2Defs.values()))):
             key2Def = dict([(def2Key[define], define) for define in defs])
             currRes = []
             resultsMap[self._key2DefAsKey(key2Def)] = currRes
@@ -86,7 +86,7 @@ class RulesParser(object):
 
         self.doShiftOrthMagic(resultsMap, res)
 
-        for idx, defs in enumerate(itertools.product(*key2Defs.values())):
+        for idx, defs in enumerate(itertools.product(*list(key2Defs.values()))):
             key2Def = dict([(def2Key[define], define) for define in defs])
 
             nfa = rulesNFA.RulesNFA()
@@ -115,20 +115,20 @@ class RulesParser(object):
 
     def _createNewTagRule(self, segtype, shiftOrth, lineNum, line, segtypesHelper):
         if not segtypesHelper.hasSegtype(segtype):
-            raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype))
+            raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid segment type: %s' % (line, segtype))
         else:
 #             return rules.TagRule(segtype)
             return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype), shiftOrth, segtype, lineNum)
 
     def _createQuantRule1(self, child, quantity, lineNum, line, segtypesHelper):
         if quantity <= 0:
-            raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid quantity: %d' % (line, quantity))
+            raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantity: %d' % (line, quantity))
         else:
             return rules.ConcatRule(quantity * [child], lineNum)
 
     def _createQuantRule2(self, child, leftN, rightN, lineNum, line, segtypesHelper):
         if leftN > rightN or (leftN, rightN) == (0, 0):
-            raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid quantities: %d %d' % (line, leftN, rightN))
+            raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantities: %d %d' % (line, leftN, rightN))
         elif leftN == 0:
             children = [rules.OptionalRule(child, lineNum)]
             for n in range(2, rightN + 1):
@@ -140,7 +140,7 @@ class RulesParser(object):
 
     def _createQuantRule3(self, child, quantity, lineNum, line, segtypesHelper):
         if quantity <= 0:
-            raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid quantity: %d' % (line, quantity))
+            raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, '%s - invalid quantity: %d' % (line, quantity))
         else:
             return rules.ConcatRule(
                 [
@@ -200,7 +200,7 @@ class RulesParser(object):
 
         shiftOrthSegtypes = set()
         nonShiftOrthSegtypes = set()
-        for _, rules in resultsMap.iteritems():
+        for _, rules in list(resultsMap.items()):
             for rule in rules:
                 for atomicRule in rule.getAtomicRules():
                     if atomicRule.shiftOrth:
diff --git a/fsabuilder/morfeuszbuilder/segrules/shiftOrthMagic.py b/fsabuilder/morfeuszbuilder/segrules/shiftOrthMagic.py
index eceb0ae..c9cda3d 100644
--- a/fsabuilder/morfeuszbuilder/segrules/shiftOrthMagic.py
+++ b/fsabuilder/morfeuszbuilder/segrules/shiftOrthMagic.py
@@ -36,7 +36,7 @@ class ShiftOrthMagic(object):
         for segtype in shiftOrthSegtypes - nonShiftOrthSegtypes:
             self._onlyShiftSegnums.add(segtypesHelper.getSegnum4Segtype(segtype))
 
-        for _, rules in resultsMap.iteritems():
+        for _, rules in list(resultsMap.items()):
             for rule in rules:
                 for atomicRule in rule.getAtomicRules():
                     if atomicRule.segnum in self._bothShiftAndNonShiftSegnums and atomicRule.shiftOrth:
diff --git a/fsabuilder/morfeuszbuilder/segrules/test/parserTest.py b/fsabuilder/morfeuszbuilder/segrules/test/parserTest.py
index 39cbde5..1f0cc1e 100644
--- a/fsabuilder/morfeuszbuilder/segrules/test/parserTest.py
+++ b/fsabuilder/morfeuszbuilder/segrules/test/parserTest.py
@@ -12,18 +12,18 @@ from morfeuszbuilder.fsa import visualizer, serializer
 class Test(unittest.TestCase):
 
     def testParser(self):
-        print 'do test'
+        print('do test')
         t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset'))
         parser = rulesParser.RulesParser(t)
         rulesManager = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat'))
         fsa = rulesManager.getDFA({'aggl': 'permissive', 'praet': 'split'})
         for s in fsa.dfs():
             s.debug()
-        print 'states:', len(list(fsa.dfs()))
-        print 'transitions:', fsa.getTransitionsNum()
+        print(('states:', len(list(fsa.dfs()))))
+        print(('transitions:', fsa.getTransitionsNum()))
         visualizer.Visualizer().visualize(fsa, charLabels=False)
-        print 'size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray()))
-        print 'done'
+        print(('size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray()))))
+        print('done')
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py b/fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py
index e1ebc63..13b104e 100644
--- a/fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py
+++ b/fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py
@@ -19,7 +19,7 @@ class Test(unittest.TestCase):
         parsedFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types'])
         linesEnum = parsedFile.enumerateLinesInSection('combinations')
         for lineNum, line in preprocessor.preprocess(linesEnum, ['extra', 'superextra']):
-            print (lineNum, line)
+            print((lineNum, line))
 
 
 if __name__ == "__main__":
diff --git a/fsabuilder/morfeuszbuilder/tagset/segtypes.py b/fsabuilder/morfeuszbuilder/tagset/segtypes.py
index a7e2a89..4da4f62 100644
--- a/fsabuilder/morfeuszbuilder/tagset/segtypes.py
+++ b/fsabuilder/morfeuszbuilder/tagset/segtypes.py
@@ -11,11 +11,11 @@ from morfeuszbuilder.utils import exceptions
 def _getLemmaHomonymPair(lemma):
     if lemma is None:
         return (None, None)
-    elif u':' in lemma:
-        if lemma.replace(u':', '') == '':
+    elif ':' in lemma:
+        if lemma.replace(':', '') == '':
             return (lemma, None)
         else:
-            return lemma.split(u':', 1)
+            return lemma.split(':', 1)
     else:
         return (lemma, None)
 
@@ -26,7 +26,7 @@ class Segtypes(object):
         self.tagset = tagset
         self.namesMap = namesMap
         self.labelsMap = labelsMap
-        self._reverseLabelsMap = dict([(v, k) for (k, v) in labelsMap.iteritems()])
+        self._reverseLabelsMap = dict([(v, k) for (k, v) in list(labelsMap.items())])
 
         self.filename = segrulesConfigFile.filename
 
@@ -59,13 +59,13 @@ class Segtypes(object):
 
     def _readSegtypes(self, segrulesConfigFile):
         for lineNum, line in segrulesConfigFile.enumerateLinesInSection('segment types'):
-            assert type(line) == unicode
+            assert type(line) == str
             self._validate(
-                u'Segment type must be a single word',
+                'Segment type must be a single word',
                 lineNum,
                 re.match(r'^\w+$', line))
             self._validate(
-                u'Segment type already defined: "%s"' % line,
+                'Segment type already defined: "%s"' % line,
                 lineNum,
                 line not in self.segtypes)
             self.segtypes.append(line)
@@ -75,13 +75,13 @@ class Segtypes(object):
         for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'):
             self._parsePattern(lineNum, line, withLemma=False)
             self._validate(
-                u'Pattern that matches everything must be the last one',
+                'Pattern that matches everything must be the last one',
                 lineNum - 1,
                 not gotWildcardPattern)
             gotWildcardPattern = gotWildcardPattern or self.patternsList[-1].isWildcardPattern()
         self._validate(
-            u'There must be a pattern that matches everything at the end of [tags] section',
+            'There must be a pattern that matches everything at the end of [tags] section',
             lineNum,
             self.patternsList[-1].isWildcardPattern())
 
@@ -94,18 +94,18 @@ class Segtypes(object):
         for f in fields:
             match = re.match(r'(name|labels)=([\S]+)', f, re.U)
             self._validate(
-                u'invalid name or labels constraint: "%s"' % f,
+                'invalid name or labels constraint: "%s"' % f,
                 lineNum,
                 match)
             key = match.group(1)
             value = match.group(2)
             self._validate(
-                u'%s already specified' % key,
+                '%s already specified' % key,
                 lineNum,
                 key not in res)
             if key == 'labels':
                 if value:
-                    value = frozenset(value.split(u'|'))
+                    value = frozenset(value.split('|'))
                 else:
                     value = frozenset()
             res[key] = value
@@ -115,7 +115,7 @@ class Segtypes(object):
         split = re.split(r'\s+', line.strip())
         if withLemma:
             self._validate(
-                u'Line in [lexemes] section must contain 3 to 5 fields - segment type, lemma, tag pattern and optional constraints on name and labels',
+                'Line in [lexemes] section must contain 3 to 5 fields - segment type, lemma, tag pattern and optional constraints on name and labels',
                 lineNum,
                 len(split) in [3, 4, 5])
             segtype = split[0]
@@ -124,7 +124,7 @@ class Segtypes(object):
             additionalConstraints = self._parseAdditionalConstraints(lineNum, split[3:])
         else:
             self._validate(
-                u'Line in [tags] section must contain 2 to 4 fields - segment type, tag pattern and optional constraints on name and labels',
+                'Line in [tags] section must contain 2 to 4 fields - segment type, tag pattern and optional constraints on name and labels',
                 lineNum,
                 len(split) in [2, 3, 4])
             segtype = split[0]
@@ -132,32 +132,32 @@ class Segtypes(object):
             pattern = split[1]
             additionalConstraints = self._parseAdditionalConstraints(lineNum, split[2:])
         self._validate(
-            u'Undeclared segment type: "%s"' % segtype,
+            'Undeclared segment type: "%s"' % segtype,
             lineNum,
             segtype in self.segtypes)
         segnum = self.segtypes.index(segtype)
         self._validate(
-            u'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters',
+            'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters',
             lineNum,
             re.match(r'[a-z_\.\:\%]+', pattern))
 
         segtypePattern = SegtypePattern(
             lemma,
             pattern,
-            additionalConstraints.get('name', u''),
+            additionalConstraints.get('name', ''),
            additionalConstraints.get('labels', frozenset()),
             segnum)
 #         print 'segtypePattern', repr(str(segtypePattern))
         self._validate(
-            u'There is no tag that matches pattern "%s".' % (pattern),
+            'There is no tag that matches pattern "%s".' % (pattern),
             lineNum,
             any([segtypePattern.tryToMatchTag(tag) != -1 for tag in self.tagset.getAllTags()]))
         self.patternsList.append(segtypePattern)
 
     def _getAllExistingLabelsnumCombinations(self, labels):
         if labels:
-            for labelsCombination, labelsnum in self.labelsMap.iteritems():
+            for labelsCombination, labelsnum in list(self.labelsMap.items()):
                 if labels <= labelsCombination:
                     yield labelsnum
         else:
@@ -232,7 +232,7 @@ class SegtypePattern(object):
             return -1
 
     def isWildcardPattern(self):
-        return (self.lemma, self.pattern, self.name, self.labels) == (None, '%', u'', frozenset())
+        return (self.lemma, self.pattern, self.name, self.labels) == (None, '%', '', frozenset())
 
     def __str__(self):
-        return u'%s %s %s %s -> %d' % (self.lemma, self.pattern, self.name, self.labels, self.segnum)
+        return '%s %s %s %s -> %d' % (self.lemma, self.pattern, self.name, self.labels, self.segnum)
diff --git a/fsabuilder/morfeuszbuilder/tagset/tagset.py b/fsabuilder/morfeuszbuilder/tagset/tagset.py
index 3eaee31..0a4779f 100644
--- a/fsabuilder/morfeuszbuilder/tagset/tagset.py
+++ b/fsabuilder/morfeuszbuilder/tagset/tagset.py
@@ -20,7 +20,7 @@ class Tagset(object):
 #~         self._name2namenum = {}
         if filename:
             self._doInit(filename, encoding)
-            self._tagnum2tag = dict(map(lambda (k, v): (v, k), self.tag2tagnum.iteritems()))
+            self._tagnum2tag = dict([(k_v[1], k_v[0]) for k_v in iter(list(self.tag2tagnum.items()))])
 
     def _doInit(self, filename, encoding):
         insideTags = False
@@ -33,11 +33,11 @@ class Tagset(object):
                     self.tagsetId = match.group(1)
                 else:
                     raise FSABuilderException('missing TAGSET-ID in first line of tagset file')
-            elif line == u'[TAGS]':
+            elif line == '[TAGS]':
                 insideTags = True
 #~             elif line == u'[NAMES]':
 #~                 addingTo = Tagset.NAMES
-            elif line and not line.startswith(u'#'):
+            elif line and not line.startswith('#'):
                 if not insideTags:
                     raise FSABuilderException('"%s" - text outside [TAGS] section in tagset file line %d' % (line, linenum))
                 res = self.tag2tagnum
@@ -47,12 +47,12 @@ class Tagset(object):
                 tag = line.split(Tagset.SEP)[1]
                 if tag in res:
                     raise FSABuilderException('duplicate tag: "%s"' % tag)
-                if int(tagNum) in res.values():
+                if int(tagNum) in list(res.values()):
                     raise FSABuilderException('line %d: tagId %d assigned for tag "%s" already appeared somewhere else.' % (linenum, int(tagNum), tag))
                 res[tag] = int(tagNum)
 
     def getAllTags(self):
-        return self.tag2tagnum.keys()
+        return list(self.tag2tagnum.keys())
 
     def getTagnum4Tag(self, tag):
         if tag in self.tag2tagnum:
diff --git a/fsabuilder/morfeuszbuilder/utils/caseconv/generate.py b/fsabuilder/morfeuszbuilder/utils/caseconv/generate.py
index c3698f5..7b89611 100644
--- a/fsabuilder/morfeuszbuilder/utils/caseconv/generate.py
+++ b/fsabuilder/morfeuszbuilder/utils/caseconv/generate.py
@@ -90,7 +90,7 @@ def _serializeTable(table):
 def _serializeExtendedTable(table):
     res = []
     res.append('{')
-    for code, targetCode in table.iteritems():
+    for code, targetCode in list(table.items()):
         res.append('{')
         res.append(str(code))
         res.append(',')
diff --git a/fsabuilder/morfeuszbuilder/utils/configFile.py b/fsabuilder/morfeuszbuilder/utils/configFile.py
index 69d000d..23209c2 100644
--- a/fsabuilder/morfeuszbuilder/utils/configFile.py
+++ b/fsabuilder/morfeuszbuilder/utils/configFile.py
@@ -6,10 +6,10 @@ Created on 18 lut 2014
 
 import re
 import codecs
-import exceptions
+from . import exceptions
 
 def getHeaderValue(line, lineNum):
-    m = re.match(ur'\s*\[(.*?)\]\s*(\#.*)?', line)
+    m = re.match(r'\s*\[(.*?)\]\s*(\#.*)?', line)
     if m:
         return m.group(1)
     else:
@@ -40,7 +40,7 @@ class ConfigFile(object):
             self.section2Lines[self.currSection].append((lineNum, line))
 
     def _getHeaderValue(self, line, lineNum):
-        m = re.match(ur'\s*\[(.*?)\]\s*(\#.*)?', line)
+        m = re.match(r'\s*\[(.*?)\]\s*(\#.*)?', line)
         if m:
             return m.group(1)
         else:
@@ -48,7 +48,7 @@ class ConfigFile(object):
 
     def enumerateLinesInSection(self, sectionName, ignoreComments=True):
         if sectionName not in self.section2Lines:
-            raise exceptions.ConfigFileException(self.filename, None, u'Missing section: "%s"' % sectionName)
+            raise exceptions.ConfigFileException(self.filename, None, 'Missing section: "%s"' % sectionName)
         if not ignoreComments:
             return self.section2Lines[sectionName]
         else:
diff --git a/fsabuilder/morfeuszbuilder/utils/exceptions.py b/fsabuilder/morfeuszbuilder/utils/exceptions.py
index 616636f..4a67ee0 100644
--- a/fsabuilder/morfeuszbuilder/utils/exceptions.py
+++ b/fsabuilder/morfeuszbuilder/utils/exceptions.py
@@ -25,7 +25,7 @@ class SegtypesException(FSABuilderException):
         self.msg = msg
 
     def __str__(self):
-        return u'Error in segment rules: %s' % self.msg
+        return 'Error in segment rules: %s' % self.msg
 
 class ConfigFileException(FSABuilderException):
 
@@ -36,7 +36,7 @@ class ConfigFileException(FSABuilderException):
 
     def __str__(self):
         if self.lineNum:
-            return u'%s:%d - %s' % (self.filename, self.lineNum, self.msg)
+            return '%s:%d - %s' % (self.filename, self.lineNum, self.msg)
         else:
-            return u'%s - %s' % (self.filename, self.msg)
+            return '%s - %s' % (self.filename, self.msg)
diff --git a/fsabuilder/morfeuszbuilder/utils/extractTagset.py b/fsabuilder/morfeuszbuilder/utils/extractTagset.py
index 7a4aeca..bb46ac6 100644
--- a/fsabuilder/morfeuszbuilder/utils/extractTagset.py
+++ b/fsabuilder/morfeuszbuilder/utils/extractTagset.py
@@ -8,10 +8,10 @@ import sys
 if __name__ == '__main__':
     version = sys.argv[1]
     res = set()
-    print '#morfeusz-tagset', version
+    print(('#morfeusz-tagset', version))
     for line in sys.stdin:
         if line.strip():
             tag = line.split('\t')[2]
             res.add(tag)
     for idx, tag in enumerate(sorted(res)):
-        print str(idx) + '\t' + tag
+        print((str(idx) + '\t' + tag))
-- 
libgit2 0.22.2