Commit 8d5a878e6650c4130784b81026b903e2ffd965c8

Authored by Michał Lenart
1 parent 1c1bf677

- praca nad budowaniem automatu dla zlepiacza segmentów

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@86 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/buildfsa.spec 0 → 100644
# -*- mode: python -*-
# PyInstaller build specification producing the 'buildfsa' console program.
# Analysis, PYZ, EXE, COLLECT and SPECPATH are names that PyInstaller puts
# in scope when it executes a spec file.
a = Analysis(['fsa/buildfsa.py'],
             # SPECPATH is the directory holding this spec file; using it keeps
             # the build independent of any one developer's checkout location.
             pathex=[SPECPATH],
             hiddenimports=[],
             hookspath=None,
             runtime_hooks=None)
pyz = PYZ(a.pure)
exe = EXE(pyz,
          a.scripts,
          exclude_binaries=True,
          name='buildfsa',
          debug=False,
          strip=None,
          upx=True,
          console=True)
coll = COLLECT(exe,
               a.binaries,
               a.zipfiles,
               a.datas,
               strip=None,
               upx=True,
               name='buildfsa')
fsabuilder/morfeuszbuilder/fsa/fsa.py
@@ -14,11 +14,12 @@ class FSA(object): @@ -14,11 +14,12 @@ class FSA(object):
14 ''' 14 '''
15 15
16 16
17 - def __init__(self, encoder, tagset=None, encodeData=True):  
18 - self.encodeWord = encoder.encodeWord 17 + def __init__(self, encoder, tagset=None, encodeData=True, encodeWords=True):
  18 + self.encodeWord = encoder.encodeWord if encodeWords else lambda x: x
19 self.encodeData = encoder.encodeData if encodeData else lambda x: x 19 self.encodeData = encoder.encodeData if encodeData else lambda x: x
20 self.decodeData = encoder.decodeData if encodeData else lambda x: x 20 self.decodeData = encoder.decodeData if encodeData else lambda x: x
21 self.encodedPrevWord = None 21 self.encodedPrevWord = None
  22 +
22 self.tagset = tagset 23 self.tagset = tagset
23 self.initialState = state.State() 24 self.initialState = state.State()
24 self.register = register.Register() 25 self.register = register.Register()
fsabuilder/morfeuszbuilder/fsa/state.py
@@ -9,7 +9,7 @@ class State(object): @@ -9,7 +9,7 @@ class State(object):
9 A state in an automaton 9 A state in an automaton
10 ''' 10 '''
11 11
12 - def __init__(self): 12 + def __init__(self, additionalData=None):
13 self.transitionsMap = {} 13 self.transitionsMap = {}
14 self.freq = 0 14 self.freq = 0
15 self.encodedData = None 15 self.encodedData = None
@@ -17,6 +17,7 @@ class State(object): @@ -17,6 +17,7 @@ class State(object):
17 self.offset = None 17 self.offset = None
18 self.label2Freq = {} 18 self.label2Freq = {}
19 self.serializeAsArray = False 19 self.serializeAsArray = False
  20 + self.additionalData = additionalData
20 21
21 @property 22 @property
22 def transitionsNum(self): 23 def transitionsNum(self):
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
@@ -34,11 +34,6 @@ class ArgDefine(object): @@ -34,11 +34,6 @@ class ArgDefine(object):
34 def __str__(self): 34 def __str__(self):
35 return '%s(%s) %s' % (self.name, self.arg, self.val) 35 return '%s(%s) %s' % (self.name, self.arg, self.val)
36 36
37 -class PreprocessorException(Exception):  
38 -  
39 - def __init__(self, msg, line):  
40 - pass  
41 -  
42 def _tryToSubstituteArgDefine(s, t, defines): 37 def _tryToSubstituteArgDefine(s, t, defines):
43 defineName = t[0] 38 defineName = t[0]
44 substituteValue = t[1] 39 substituteValue = t[1]
fsabuilder/morfeuszbuilder/segrules/rules.py
@@ -4,6 +4,8 @@ Created on 24 sty 2014 @@ -4,6 +4,8 @@ Created on 24 sty 2014
4 @author: mlenart 4 @author: mlenart
5 ''' 5 '''
6 6
  7 +from morfeuszbuilder.segrules.rulesNFA import RulesNFAState
  8 +
7 class SegmentRule(object): 9 class SegmentRule(object):
8 ''' 10 '''
9 classdocs 11 classdocs
@@ -14,46 +16,91 @@ class SegmentRule(object): @@ -14,46 +16,91 @@ class SegmentRule(object):
14 ''' 16 '''
15 Constructor 17 Constructor
16 ''' 18 '''
  19 +
  20 + def addToNFA(self, fsa):
  21 + raise NotImplementedError()
  22 +
  23 + def _doAddToNFA(self, startStates, endState):
  24 + raise NotImplementedError()
17 25
18 class TagRule(SegmentRule): 26 class TagRule(SegmentRule):
19 27
20 - def __init__(self, tagType, line):  
21 - self.tagType = tagType  
22 - self.line = line 28 + def __init__(self, segnum):
  29 + self.segnum = segnum
  30 +
  31 + def addToNFA(self, fsa):
  32 + endState = RulesNFAState(final=True)
  33 + self._doAddToNFA(fsa.initialState, endState)
  34 +
  35 + def _doAddToNFA(self, startState, endState):
  36 + startState.addTransition(self.segnum, endState)
23 37
24 class UnaryRule(SegmentRule): 38 class UnaryRule(SegmentRule):
25 39
26 - def __init__(self, child, line): 40 + def __init__(self, child):
27 self.child = child 41 self.child = child
28 - self.line = line  
29 42
30 class ComplexRule(SegmentRule): 43 class ComplexRule(SegmentRule):
31 44
32 - def __init__(self, children, line): 45 + def __init__(self, children):
33 self.children = children 46 self.children = children
34 - self.line = line 47 +
  48 + def addToNFA(self, fsa):
  49 + endState = RulesNFAState(final=True)
  50 + self._doAddToNFA(fsa.initialState, endState)
35 51
36 class ConcatRule(ComplexRule): 52 class ConcatRule(ComplexRule):
37 53
38 - def __init__(self, children, line):  
39 - super(ConcatRule, self).__init__(children, line) 54 + def __init__(self, children):
  55 + super(ConcatRule, self).__init__(children)
  56 +
  57 + def _doAddToNFA(self, startState, endState):
  58 + currStartState = startState
  59 + for child in self.children[:-1]:
  60 + currEndState = RulesNFAState()
  61 + child._doAddToNFA(currStartState, currEndState)
  62 + nextStartState = RulesNFAState()
  63 + currEndState.addTransition(None, nextStartState)
  64 + currStartState = nextStartState
  65 + lastChild = self.children[-1]
  66 + lastChild._doAddToNFA(currStartState, endState)
40 67
41 class OrRule(ComplexRule): 68 class OrRule(ComplexRule):
42 69
43 - def __init__(self, children, line):  
44 - super(OrRule, self).__init__(children, line) 70 + def __init__(self, children):
  71 + super(OrRule, self).__init__(children)
  72 +
  73 + def _doAddToNFA(self, startState, endState):
  74 + for child in self.children:
  75 + intermStartState = RulesNFAState()
  76 + intermEndState = RulesNFAState()
  77 + startState.addTransition(None, intermStartState)
  78 + child._doAddToNFA(intermStartState, intermEndState)
  79 + intermEndState.addTransition(None, endState)
45 80
46 class ZeroOrMoreRule(UnaryRule): 81 class ZeroOrMoreRule(UnaryRule):
47 82
48 - def __init__(self, child, line):  
49 - super(ZeroOrMoreRule, self).__init__(child, line)  
50 -  
51 -class OneOrMoreRule(UnaryRule): 83 + def __init__(self, child):
  84 + super(ZeroOrMoreRule, self).__init__(child)
  85 +
  86 + def addToNFA(self, fsa):
  87 + raise ValueError()
52 88
53 - def __init__(self, child, line):  
54 - super(OneOrMoreRule, self).__init__(child, line) 89 + def _doAddToNFA(self, startState, endState):
  90 + intermStartState = RulesNFAState()
  91 + intermEndState = RulesNFAState()
  92 +
  93 + startState.addTransition(None, intermStartState)
  94 + startState.addTransition(None, endState)
  95 + self.child._doAddToNFA(intermStartState, intermEndState)
  96 + intermEndState.addTransition(None, endState)
  97 + endState.addTransition(None, intermStartState)
55 98
56 class IgnoreOrthRule(UnaryRule): 99 class IgnoreOrthRule(UnaryRule):
57 100
58 - def __init__(self, child, line):  
59 - super(IgnoreOrthRule, self).__init__(child, line) 101 + def __init__(self, child):
  102 + super(IgnoreOrthRule, self).__init__(child)
  103 +
  104 + def _doAddToNFA(self, startState, endState):
  105 + startState.addTransition(self.child.segnum, endState, ignoreOrth=True)
  106 +
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py 0 → 100644
  1 +'''
  2 +Created on 24 sty 2014
  3 +
  4 +@author: mlenart
  5 +'''
  6 +
  7 +from morfeuszbuilder.fsa import fsa, state, encode
  8 +
class RulesNFAState(object):
    '''
    A state of the nondeterministic automaton built from segmentation rules.

    transitionsMap maps a (label, ignoreOrth) pair to the set of target
    states. A label of None denotes an epsilon transition.
    '''

    def __init__(self, initial=False, final=False):
        self.transitionsMap = {}
        self.initial = initial
        self.final = final

    def addTransition(self, label, targetState, ignoreOrth=False):
        '''
        Add a transition to targetState under the given label.
        An epsilon transition (label None) must not set ignoreOrth.
        '''
        assert not ignoreOrth or label is not None
        self.transitionsMap.setdefault((label, ignoreOrth), set()).add(targetState)

    def getClosure(self):
        '''
        Return the set of states reachable from this state through epsilon
        transitions only; the state itself is always included.
        '''
        closure = set([self])
        pending = [self]
        # iterative walk, so that epsilon cycles cannot cause endless recursion
        while pending:
            current = pending.pop()
            # epsilon transitions are always stored with ignoreOrth == False
            for target in current.transitionsMap.get((None, False), ()):
                if target not in closure:
                    closure.add(target)
                    pending.append(target)
        return closure


class RulesNFA(object):
    '''
    Nondeterministic automaton of segmentation rules, convertible to a DFA.
    '''

    def __init__(self, key2Def=None):
        # None instead of {} as the default: a dict default would be shared
        # by every instance created without the argument
        self.key2Def = {} if key2Def is None else key2Def
        self.initialState = RulesNFAState(initial=True)

    def _groupOutputByLabels(self, nfaStates):
        '''
        Group the non-epsilon transitions leaving nfaStates by label.

        Returns a dict mapping each label to a pair (ignoreOrth, states),
        where states is the epsilon closure of all targets reachable
        under that label.
        '''
        res = {}
        for nfaState in nfaStates:
            for (label, ignoreOrth), targets in nfaState.transitionsMap.items():
                if label is None:
                    continue
                prevIgnoreOrth, reachable = res.get(label, (False, set()))
                for target in targets:
                    reachable |= target.getClosure()
                # NOTE(review): when one label occurs both with and without
                # ignoreOrth the flags are merged with OR - confirm the intended policy
                res[label] = (prevIgnoreOrth or ignoreOrth, reachable)
        return res

    def _doConvertState(self, dfaState, nfaStates, subset2DFAState=None):
        '''
        Recursively create the DFA states reachable from dfaState, which
        stands for the given set of NFA states.

        subset2DFAState remembers the DFA state already built for each
        (ignoreOrth, NFA subset) pair; without it the cycles produced by
        repetition rules would make the recursion endless.
        '''
        if subset2DFAState is None:
            subset2DFAState = {}
        for label, (nextIgnoreOrth, nextNFAStates) in self._groupOutputByLabels(nfaStates).items():
            # NOTE(review): kept from the original - encodedData is set on every
            # state that has an outgoing transition; confirm whether it should
            # instead depend on the 'final' flag of the NFA states
            dfaState.encodedData = bytearray()
            key = (nextIgnoreOrth, frozenset(nextNFAStates))
            if key in subset2DFAState:
                dfaState.setTransition(label, subset2DFAState[key])
            else:
                nextDFAState = state.State(additionalData=nextIgnoreOrth)
                subset2DFAState[key] = nextDFAState
                dfaState.setTransition(label, nextDFAState)
                self._doConvertState(nextDFAState, nextNFAStates, subset2DFAState)

    def convertToDFA(self):
        '''
        Build and return a deterministic automaton equivalent to this NFA.
        '''
        # encodeData=False as well: with encoder=None the FSA constructor
        # must not look up any encoder attribute
        dfa = fsa.FSA(encoder=None, encodeData=False, encodeWords=False)
        startStates = self.initialState.getClosure()
        assert not any(s.final for s in startStates)
        dfa.initialState = state.State(additionalData=False)
        self._doConvertState(
            dfa.initialState,
            startStates,
            {(False, frozenset(startStates)): dfa.initialState})
        return dfa
  39 +
  40 +
0 \ No newline at end of file 41 \ No newline at end of file
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
1 1
2 from pyparsing import * 2 from pyparsing import *
  3 +ParserElement.enablePackrat()
3 from morfeuszbuilder.tagset import segtypes 4 from morfeuszbuilder.tagset import segtypes
4 -from morfeuszbuilder.utils import configFile  
5 -from morfeuszbuilder.segrules import preprocessor 5 +from morfeuszbuilder.utils import configFile, exceptions
  6 +from morfeuszbuilder.segrules import preprocessor, rules
6 import codecs 7 import codecs
7 import re 8 import re
8 9
9 import itertools 10 import itertools
10 import logging 11 import logging
11 -import segsfsa  
12 -  
13 -# header = Suppress('[') + Word(alphas, bodyChars=alphanums+'_') + Suppress(']')  
14 -# define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd()  
15 -# ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd()  
16 -# endif = Keyword('#endif').suppress() + LineEnd() + StringEnd()  
17 -  
18 -def doprint(toks):  
19 - print toks 12 +from morfeuszbuilder.segrules import rulesNFA
20 13
21 class RulesParser(object): 14 class RulesParser(object):
22 15
@@ -31,7 +24,7 @@ class RulesParser(object): @@ -31,7 +24,7 @@ class RulesParser(object):
31 key, defs = lineToParse.parseString(line) 24 key, defs = lineToParse.parseString(line)
32 res[key] = tuple(defs) 25 res[key] = tuple(defs)
33 except Exception as ex: 26 except Exception as ex:
34 - raise configFile.ConfigFileException(segtypesConfigFile.filename, lineNum, u'Error in [options] section: %s' % str(ex)) 27 + raise exceptions.ConfigFileException(segtypesConfigFile.filename, lineNum, u'Error in [options] section: %s' % str(ex))
35 return res 28 return res
36 29
37 def parse(self, filename): 30 def parse(self, filename):
@@ -48,12 +41,12 @@ class RulesParser(object): @@ -48,12 +41,12 @@ class RulesParser(object):
48 41
49 for defs in itertools.product(*key2Defs.values()): 42 for defs in itertools.product(*key2Defs.values()):
50 key2Def = dict([(def2Key[define], define) for define in defs]) 43 key2Def = dict([(def2Key[define], define) for define in defs])
51 - fsa = segsfsa.SegmentsFSA(key2Def) 44 + nfa = rulesNFA.RulesNFA(key2Def)
52 combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations') 45 combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations')
53 combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs)) 46 combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs))
54 for rule in self._doParse(combinationEnumeratedLines, segtypesHelper): 47 for rule in self._doParse(combinationEnumeratedLines, segtypesHelper):
55 - fsa.addSegmentRule(rule)  
56 - res.append(fsa) 48 + rule.addToNFA(nfa)
  49 + res.append(nfa)
57 return res 50 return res
58 51
59 def _doParse(self, combinationEnumeratedLines, segtypesHelper): 52 def _doParse(self, combinationEnumeratedLines, segtypesHelper):
@@ -61,6 +54,12 @@ class RulesParser(object): @@ -61,6 +54,12 @@ class RulesParser(object):
61 if not line.startswith('#'): 54 if not line.startswith('#'):
62 yield self._doParseOneLine(lineNum, line, segtypesHelper) 55 yield self._doParseOneLine(lineNum, line, segtypesHelper)
63 56
  57 + def _createNewTagRule(self, segtype, lineNum, line, segtypesHelper):
  58 + if not segtypesHelper.hasSegtype(segtype):
  59 + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype))
  60 + else:
  61 + return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype))
  62 +
64 def _doParseOneLine(self, lineNum, line, segtypesHelper): 63 def _doParseOneLine(self, lineNum, line, segtypesHelper):
65 rule = Forward() 64 rule = Forward()
66 tagRule = Word(alphanums+'_') 65 tagRule = Word(alphanums+'_')
@@ -74,9 +73,21 @@ class RulesParser(object): @@ -74,9 +73,21 @@ class RulesParser(object):
74 complexRule = unaryRule ^ oneOfRule 73 complexRule = unaryRule ^ oneOfRule
75 concatRule = OneOrMore(complexRule) 74 concatRule = OneOrMore(complexRule)
76 rule << concatRule 75 rule << concatRule
  76 +
  77 + tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper))
  78 + ignoreOrthRule.setParseAction(lambda string, loc, toks: rules.IgnoreOrthRule(toks[0]))
  79 +# parenRule.setParseAction(lambda string, loc, toks: toks[0])
  80 + zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0]))
  81 + oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])]))
  82 + oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks))
  83 + concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks))
  84 +
  85 +
77 # rule << tagRule ^ ignoreOrthRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ orRule ^ concatRule ^ parenRule 86 # rule << tagRule ^ ignoreOrthRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ orRule ^ concatRule ^ parenRule
78 87
79 # tagRule.setParseAction(lambda s,l,toks: doprint(toks)) 88 # tagRule.setParseAction(lambda s,l,toks: doprint(toks))
80 # print lineNum, line 89 # print lineNum, line
81 - parsedLine = rule.parseString(line, parseAll=True) 90 + parsedRule = rule.parseString(line, parseAll=True)[0]
  91 + print parsedRule
  92 + return parsedRule
82 # print parsedLine 93 # print parsedLine
fsabuilder/morfeuszbuilder/segrules/segsfsa.py deleted
1 -'''  
2 -Created on 24 sty 2014  
3 -  
4 -@author: mlenart  
5 -'''  
6 -  
7 -class SegmentsFSAState(object):  
8 -  
9 - def __init__(self):  
10 - self.transitionsMap = {}  
11 -  
12 - def addSegmentRule(self, segmentRule):  
13 - pass  
14 -  
15 -class SegmentsFSA(object):  
16 -  
17 - def __init__(self, key2Def={}):  
18 - self.initialState = SegmentsFSAState()  
19 -  
20 - def addSegmentRule(self, segmentRule):  
21 - self.initialState.addSegmentRule(segmentRule)  
22 -  
23 - def serialize(self):  
24 - res = bytearray()  
25 - return res  
26 -  
27 -  
28 \ No newline at end of file 0 \ No newline at end of file
fsabuilder/morfeuszbuilder/segrules/test/parserTest.py
@@ -9,9 +9,11 @@ from morfeuszbuilder.segrules import rulesParser @@ -9,9 +9,11 @@ from morfeuszbuilder.segrules import rulesParser
9 from morfeuszbuilder.tagset import tagset 9 from morfeuszbuilder.tagset import tagset
10 10
11 class Test(unittest.TestCase): 11 class Test(unittest.TestCase):
  12 + print 'do test'
12 t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset')) 13 t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset'))
13 parser = rulesParser.RulesParser(t) 14 parser = rulesParser.RulesParser(t)
14 parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) 15 parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat'))
  16 + print 'done'
15 17
16 if __name__ == "__main__": 18 if __name__ == "__main__":
17 unittest.main() 19 unittest.main()
fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat
@@ -3,7 +3,6 @@ aggl=permissive strict isolated @@ -3,7 +3,6 @@ aggl=permissive strict isolated
3 praet=split composite 3 praet=split composite
4 4
5 [combinations] 5 [combinations]
6 -(dupa|dupa)  
7 #define wsz_interp (interp|kropka|dywiz)* 6 #define wsz_interp (interp|kropka|dywiz)*
8 7
9 #define moze_interp(segmenty) wsz_interp segmenty wsz_interp 8 #define moze_interp(segmenty) wsz_interp segmenty wsz_interp
fsabuilder/morfeuszbuilder/tagset/segtypes.py
@@ -4,80 +4,85 @@ Created on 17 lut 2014 @@ -4,80 +4,85 @@ Created on 17 lut 2014
4 @author: mlenart 4 @author: mlenart
5 ''' 5 '''
6 import re 6 import re
  7 +from morfeuszbuilder.utils import exceptions
7 8
8 class Segtypes(object): 9 class Segtypes(object):
9 10
10 - def __init__(self, tagset, segrulesFile): 11 + def __init__(self, tagset, segrulesConfigFile):
11 12
12 self.tagset = tagset 13 self.tagset = tagset
13 14
14 - self.segrulesConfigFile = segrulesFile 15 + self.filename = segrulesConfigFile.filename
15 16
16 self.segtype2Segnum = {} 17 self.segtype2Segnum = {}
17 self.patternsList = [] 18 self.patternsList = []
  19 + self._readLexemes(segrulesConfigFile)
  20 + self._readTags(segrulesConfigFile)
  21 +
  22 + def _validate(self, msg, lineNum, cond):
  23 + if not cond:
  24 + raise exceptions.ConfigFileException(self.filename, lineNum, msg)
  25 +
  26 + def _readTags(self, segrulesConfigFile):
  27 + for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'):
  28 + print lineNum, line
  29 + splitLine = re.split(r'\s+', line.strip())
  30 + self._validate(
  31 + u'Line in [tags] section must contain exactly two fields - segment type and tag pattern',
  32 + lineNum,
  33 + len(splitLine) == 2)
  34 + segtype, pattern = splitLine
  35 + self._validate(
  36 + u'Segment type must be a lowercase alphanumeric with optional underscores',
  37 + lineNum,
  38 + re.match(r'[a-z_]+', segtype))
  39 + self._validate(
  40 + u'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters',
  41 + lineNum,
  42 + re.match(r'[a-z_\.\:\%]+', pattern))
  43 +
  44 + if segtype in self.segtype2Segnum:
  45 + segnum = self.segtype2Segnum[segtype]
  46 + else:
  47 + segnum = len(self.segtype2Segnum)
  48 + self.segtype2Segnum[segtype] = segnum
  49 +
  50 + self.patternsList.append(SegtypePattern(None, pattern, segnum))
  51 +
  52 + def _readLexemes(self, segrulesConfigFile):
  53 + for lineNum, line in segrulesConfigFile.enumerateLinesInSection('lexemes'):
  54 + segtype, pattern = line.strip().split('\t')
  55 + self._validate(
  56 + u'Segment type must be a lowercase alphanumeric with optional underscores',
  57 + lineNum,
  58 + re.match(r'[a-z_]+', segtype))
  59 + self._validate(
  60 + u'Pattern must contain lemma and POS',
  61 + lineNum,
  62 + re.match(r'.+\:[a-z_]+', pattern, re.U))
  63 +
  64 + if segtype in self.segtype2Segnum:
  65 + segnum = self.segtype2Segnum[segtype]
  66 + else:
  67 + segnum = len(self.segtype2Segnum)
  68 + self.segtype2Segnum[segtype] = segnum
  69 +
  70 + lemma, pos = pattern.split(':')
  71 +
  72 + self.patternsList.append(SegtypePattern(lemma, pos + ':%', segnum))
18 73
19 - def readTags(self, lines):  
20 - inTags = False  
21 - for lineNum, line in enumerate(lines, start=1):  
22 - header = self._getHeaderValue(line, lineNum)  
23 - if header == 'tags':  
24 - inTags = True  
25 - elif header:  
26 - inTags = False  
27 - elif inTags:  
28 - segtype, pattern = line.strip().split('\t')  
29 - self._validate(  
30 - u'Segment type must be a lowercase alphanumeric with optional underscores',  
31 - lineNum,  
32 - re.match(r'[a-z_]+', segtype))  
33 - self._validate(  
34 - u'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters',  
35 - lineNum,  
36 - re.match(r'[a-z_\.\:\%]+', pattern))  
37 -  
38 - if segtype in self.segtype2Segnum:  
39 - segnum = self.segtype2Segnum[segtype]  
40 - else:  
41 - segnum = len(self.segtype2Segnum)  
42 - self.segtype2Segnum[segtype] = segnum  
43 -  
44 - self.patternsList.append(SegtypePattern(None, pattern, segnum)) 74 + def hasSegtype(self, segTypeString):
  75 + return segTypeString in self.segtype2Segnum
45 76
46 - def readLexemes(self, lines):  
47 - inLexemes = False  
48 - for lineNum, line in enumerate(lines, start=1):  
49 - header = self._getHeaderValue(line, lineNum)  
50 - if header == 'lexemes':  
51 - inLexemes = True  
52 - elif header:  
53 - inLexemes = False  
54 - elif inLexemes:  
55 - segtype, pattern = line.strip().split('\t')  
56 - self._validate(  
57 - u'Segment type must be a lowercase alphanumeric with optional underscores',  
58 - lineNum,  
59 - re.match(r'[a-z_]+', segtype))  
60 - self._validate(  
61 - u'Pattern must contain lemma and POS',  
62 - lineNum,  
63 - re.match(r'\w+\:[a-z_]+', pattern, re.U))  
64 -  
65 - if segtype in self.segtype2Segnum:  
66 - segnum = self.segtype2Segnum[segtype]  
67 - else:  
68 - segnum = len(self.segtype2Segnum)  
69 - self.segtype2Segnum[segtype] = segnum  
70 -  
71 - lemma, pos = pattern.split(':')  
72 -  
73 - self.patternsList.append(SegtypePattern(lemma, pos + ':%', segnum)) 77 + def getSegnum4Segtype(self, segTypeString):
  78 + return self.segtype2Segnum[segTypeString]
74 79
75 def lexeme2Segnum(self, lemma, tag): 80 def lexeme2Segnum(self, lemma, tag):
76 for p in self.patternsList: 81 for p in self.patternsList:
77 res = p.tryToMatch(lemma, tag) 82 res = p.tryToMatch(lemma, tag)
78 if res >= 0: 83 if res >= 0:
79 return res 84 return res
80 - raise SegtypesException('Cannot find segment type for given tag: %s' % tag) 85 + return None
81 86
82 class SegtypePattern(object): 87 class SegtypePattern(object):
83 88
@@ -92,11 +97,3 @@ class SegtypePattern(object): @@ -92,11 +97,3 @@ class SegtypePattern(object):
92 return self.segnum 97 return self.segnum
93 else: 98 else:
94 return -1 99 return -1
95 -  
96 -class SegtypesException(Exception):  
97 -  
98 - def __init__(self, msg):  
99 - self.msg = msg  
100 -  
101 - def __str__(self):  
102 - return u'Error in segment rules: %s' % self.msg  
fsabuilder/morfeuszbuilder/tagset/tagset.py
@@ -12,10 +12,11 @@ class Tagset(object): @@ -12,10 +12,11 @@ class Tagset(object):
12 NAMES = 2 12 NAMES = 2
13 SEP = '\t' 13 SEP = '\t'
14 14
15 - def __init__(self, filename, encoding='utf8'): 15 + def __init__(self, filename=None, encoding='utf8'):
16 self.tag2tagnum = {} 16 self.tag2tagnum = {}
17 self.name2namenum = {} 17 self.name2namenum = {}
18 - self._doInit(filename, encoding) 18 + if filename:
  19 + self._doInit(filename, encoding)
19 self.tagnum2tag = dict(map(lambda (k, v): (v, k), self.tag2tagnum.iteritems())) 20 self.tagnum2tag = dict(map(lambda (k, v): (v, k), self.tag2tagnum.iteritems()))
20 21
21 def _doInit(self, filename, encoding): 22 def _doInit(self, filename, encoding):
@@ -37,4 +38,4 @@ class Tagset(object): @@ -37,4 +38,4 @@ class Tagset(object):
37 res[tag] = int(tagNum) 38 res[tag] = int(tagNum)
38 39
39 def getTag4Tagnum(self, tagnum): 40 def getTag4Tagnum(self, tagnum):
40 - return self.tagnum2tag[tagnum]  
41 \ No newline at end of file 41 \ No newline at end of file
  42 + return self.tagnum2tag[tagnum]
fsabuilder/morfeuszbuilder/utils/configFile.py
@@ -6,6 +6,7 @@ Created on 18 lut 2014 @@ -6,6 +6,7 @@ Created on 18 lut 2014
6 6
7 import re 7 import re
8 import codecs 8 import codecs
  9 +import exceptions
9 10
10 def getHeaderValue(line, lineNum): 11 def getHeaderValue(line, lineNum):
11 m = re.match(ur'\s*\[(.*?)\]\s*(\#.*)?', line) 12 m = re.match(ur'\s*\[(.*?)\]\s*(\#.*)?', line)
@@ -25,9 +26,9 @@ class ConfigFile(object): @@ -25,9 +26,9 @@ class ConfigFile(object):
25 26
26 def _addSectionStart(self, sectionName, lineNum): 27 def _addSectionStart(self, sectionName, lineNum):
27 if not sectionName in self.sectionNames: 28 if not sectionName in self.sectionNames:
28 - raise ConfigFileException(self.filename, lineNum, 'Invalid section: %s' % sectionName) 29 + raise exceptions.ConfigFileException(self.filename, lineNum, 'Invalid section: %s' % sectionName)
29 if sectionName in self.section2Lines: 30 if sectionName in self.section2Lines:
30 - raise ConfigFileException(self.filename, lineNum, 'Duplicate section: %s' % sectionName) 31 + raise exceptions.ConfigFileException(self.filename, lineNum, 'Duplicate section: %s' % sectionName)
31 self.section2Lines[sectionName] = [] 32 self.section2Lines[sectionName] = []
32 self.currSection = sectionName 33 self.currSection = sectionName
33 34
@@ -35,7 +36,7 @@ class ConfigFile(object): @@ -35,7 +36,7 @@ class ConfigFile(object):
35 line = line.strip() 36 line = line.strip()
36 if line: 37 if line:
37 if self.currSection is None and not line.startswith('#'): 38 if self.currSection is None and not line.startswith('#'):
38 - raise ConfigFileException(self.filename, lineNum, 'Text outside of any section') 39 + raise exceptions.ConfigFileException(self.filename, lineNum, 'Text outside of any section')
39 self.section2Lines[self.currSection].append((lineNum, line)) 40 self.section2Lines[self.currSection].append((lineNum, line))
40 41
41 def _getHeaderValue(self, line, lineNum): 42 def _getHeaderValue(self, line, lineNum):
@@ -56,13 +57,3 @@ class ConfigFile(object): @@ -56,13 +57,3 @@ class ConfigFile(object):
56 self._addSectionStart(header, lineNum) 57 self._addSectionStart(header, lineNum)
57 else: 58 else:
58 self._addLine(line, lineNum) 59 self._addLine(line, lineNum)
59 -  
60 -class ConfigFileException(Exception):  
61 -  
62 - def __init__(self, filename, lineNum, msg):  
63 - self.filename = filename  
64 - self.lineNum = lineNum  
65 - self.msg = msg  
66 -  
67 - def __str__(self):  
68 - return u'%s:%d - %s' % (self.filename, self.lineNum, self.msg)  
fsabuilder/morfeuszbuilder/utils/exceptions.py 0 → 100644
  1 +'''
  2 +Created on Feb 19, 2014
  3 +
  4 +@author: lennyn
  5 +'''
  6 +
class FSABuilderException(Exception):
    '''
    Base class of the exceptions defined in this module.
    '''

    def __init__(self, msg):
        self.msg = msg

    def __str__(self):
        # %-formatting instead of +, so that a non-string msg (e.g. a wrapped
        # exception) does not raise TypeError while the error is reported
        return 'Failed to create FSA files: %s' % (self.msg,)


class SegtypesException(FSABuilderException):
    '''
    Error in segment rules; the constructor is inherited from the base class.
    '''

    def __str__(self):
        return u'Error in segment rules: %s' % (self.msg,)


class ConfigFileException(FSABuilderException):
    '''
    Error tied to a particular line of a configuration file.
    '''

    def __init__(self, filename, lineNum, msg):
        self.filename = filename
        self.lineNum = lineNum
        self.msg = msg

    def __str__(self):
        # %s for the line number too: formatting must not fail (and hide
        # the real error) when lineNum is not an integer
        return u'%s:%s - %s' % (self.filename, self.lineNum, self.msg)