Blame view

fsabuilder/morfeuszbuilder/fsa/common.py 4.56 KB
Michał Lenart authored
1
2
3
4
5
6
7
'''
Created on Nov 7, 2013

@author: mlenart
'''

import codecs
Michał Lenart authored
8
import logging
Michał Lenart authored
9
Michał Lenart authored
10
class EncodedFormWithoutPrefix(object):
    '''
    Recipe for turning fromWord into targetWord by replacing its ending.

    The two words are compared character by character from the start; the
    longest run of matching characters is the shared root. The attributes
    then describe how to get from fromWord to targetWord:

    cutLength   -- number of trailing characters of fromWord that are not
                   part of the shared root
    suffixToAdd -- the part of targetWord that follows the shared root
    casePattern -- one boolean per root character (taken from targetWord),
                   True when that character is an uppercase cased letter
    '''

    def __init__(self, fromWord, targetWord, lowercase):
        '''
        fromWord, targetWord -- unicode strings (checked with assert)
        lowercase -- when true, characters are compared case-insensitively
                     (both sides lowercased); otherwise they must be equal
        '''
        assert isinstance(fromWord, unicode)
        assert isinstance(targetWord, unicode)

        # Length of the longest common prefix; zip() stops at the shorter word.
        rootLength = 0
        for fromChar, targetChar in zip(fromWord, targetWord):
            if lowercase:
                same = fromChar.lower() == targetChar.lower()
            else:
                same = fromChar == targetChar
            if not same:
                break
            rootLength += 1

        # The root is read from targetWord, so with lowercase=True it keeps
        # the letter case of targetWord rather than that of fromWord.
        root = targetWord[:rootLength]

        self.cutLength = len(fromWord) - rootLength
        self.suffixToAdd = targetWord[rootLength:]
        # "c != c.lower()" filters out caseless characters (digits,
        # punctuation), for which c == c.upper() holds trivially.
        self.casePattern = [c == c.upper() and c != c.lower() for c in root]
Michał Lenart authored
25
Michał Lenart authored
26
class EncodedForm4Generator(object):
    '''
    Recipe for turning fromWord into targetWord that may also add a prefix.

    Every candidate treats the first prefixLength characters of targetWord
    as a literal prefix and encodes the rest of targetWord against fromWord
    with a case-sensitive EncodedFormWithoutPrefix. The candidate with the
    smallest len(suffixToAdd) + prefixLength wins; on a tie the shorter
    prefix is kept, because candidates are tried in increasing prefix
    length and only a strictly smaller cost replaces the current best.

    Attributes: cutLength and suffixToAdd (from the winning candidate) and
    prefixToAdd (the leading part of targetWord treated as a prefix).
    '''

    def __init__(self, fromWord, targetWord):
        '''
        fromWord, targetWord -- unicode strings (checked with assert)

        An empty targetWord yields no candidates, so the final assertion
        fails with AssertionError.
        '''
        assert isinstance(fromWord, unicode)
        assert isinstance(targetWord, unicode)
        bestEncodedForm = None
        bestPrefixLength = -1
        # Prefix lengths 0..4 are tried, and never the whole of targetWord.
        for prefixLength in range(min(len(targetWord), 5)):
            encodedForm = EncodedFormWithoutPrefix(fromWord, targetWord[prefixLength:], lowercase=False)
            if bestEncodedForm is None \
            or len(encodedForm.suffixToAdd) + prefixLength < len(bestEncodedForm.suffixToAdd) + bestPrefixLength:
                bestEncodedForm = encodedForm
                bestPrefixLength = prefixLength
        assert bestPrefixLength >= 0

        self.cutLength = bestEncodedForm.cutLength
        self.suffixToAdd = bestEncodedForm.suffixToAdd
        self.prefixToAdd = targetWord[:bestPrefixLength]
Michał Lenart authored
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63

class EncodedForm4Analyzer(object):
    '''
    Recipe for turning fromWord into targetWord that may also cut a prefix.

    Every candidate drops the first prefixCutLength characters of fromWord
    and encodes targetWord against the remainder with a case-insensitive
    EncodedFormWithoutPrefix. The candidate with the smallest
    len(suffixToAdd) + prefixCutLength wins; on a tie the shorter prefix
    cut is kept, because candidates are tried in increasing order and only
    a strictly smaller cost replaces the current best.

    Attributes: prefixCutLength, plus cutLength, suffixToAdd and
    casePattern copied from the winning candidate.
    '''

    def __init__(self, fromWord, targetWord):
        '''
        fromWord, targetWord -- unicode strings (checked with assert)

        An empty fromWord yields no candidates, so the final assertion
        fails with AssertionError.
        '''
        assert isinstance(fromWord, unicode)
        assert isinstance(targetWord, unicode)
        bestEncodedForm = None
        bestPrefixCutLength = -1
        # Prefix cuts 0..4 are tried, and never the whole of fromWord.
        for prefixCutLength in range(min(len(fromWord), 5)):
            encodedForm = EncodedFormWithoutPrefix(fromWord[prefixCutLength:], targetWord, lowercase=True)
            # Both sides must include their prefix cut: leaving it out of
            # the right-hand side understates the cost of the current best
            # and can reject a candidate that is cheaper overall.
            if bestEncodedForm is None \
            or len(encodedForm.suffixToAdd) + prefixCutLength < len(bestEncodedForm.suffixToAdd) + bestPrefixCutLength:
                bestEncodedForm = encodedForm
                bestPrefixCutLength = prefixCutLength
        assert bestPrefixCutLength >= 0

        self.prefixCutLength = bestPrefixCutLength
        self.cutLength = bestEncodedForm.cutLength
        self.suffixToAdd = bestEncodedForm.suffixToAdd
        self.casePattern = bestEncodedForm.casePattern
Michał Lenart authored
64
Michał Lenart authored
65
class Interpretation4Analyzer(object):
    '''
    One interpretation of a word form, stored as an orth -> base encoding.

    Instances compare equal, and hash equally, when their sort keys are
    equal (see getSortKey); typenum and qualifiers are stored but are not
    part of that key.
    '''

    def __init__(self, orth, base, tagnum, namenum, typenum, qualifiers):
        '''
        orth, base -- unicode strings; EncodedForm4Analyzer(orth, base)
                      asserts their type
        tagnum, namenum, typenum, qualifiers -- stored unchanged
        '''
        self.encodedForm = EncodedForm4Analyzer(orth, base)
        # Case flags for the characters of orth that remain after dropping
        # its last cutLength characters.
        self.orthCasePattern = [c == c.upper() and c != c.lower() for c in orth[:len(orth) - self.encodedForm.cutLength]]
        self.tagnum = tagnum
        self.namenum = namenum
        self.typenum = typenum
        self.qualifiers = qualifiers

    def getSortKey(self):
        '''Return the tuple used for ordering, equality and hashing.'''
        return (
                self.encodedForm.cutLength,
                self.encodedForm.prefixCutLength,
                tuple(self.encodedForm.suffixToAdd),
                tuple(self.encodedForm.casePattern),
                tuple(self.orthCasePattern),
                self.tagnum,
                self.namenum)

    def __eq__(self, other):
        if isinstance(other, Interpretation4Analyzer):
            return self.getSortKey() == other.getSortKey()
        else:
            return False

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so it is defined explicitly
        # to keep both operators consistent.
        return not self.__eq__(other)

    def __hash__(self):
        return hash(self.getSortKey())
Michał Lenart authored
94
95
class Interpretation4Generator(object):
    '''
    One interpretation of a lemma, stored as a base -> orth encoding.

    Instances compare equal, and hash equally, when their sort keys are
    equal (see getSortKey).
    '''

    def __init__(self, orth, base, tagnum, namenum, typenum, homonymId, qualifiers):
        '''
        orth, base -- unicode strings; EncodedForm4Generator(base, orth)
                      asserts their type. base is kept as self.lemma.
        tagnum, namenum, typenum, homonymId, qualifiers -- stored unchanged
        '''
        self.lemma = base
        self.encodedForm = EncodedForm4Generator(base, orth)
        self.tagnum = tagnum
        self.namenum = namenum
        self.typenum = typenum
        self.homonymId = homonymId
        self.qualifiers = qualifiers

    def getSortKey(self):
        '''Return the tuple used for ordering, equality and hashing.'''
        # NOTE(review): encodedForm.prefixToAdd, typenum and qualifiers are
        # not part of the key -- confirm that this is intended.
        return (
                self.homonymId,
                self.tagnum,
                self.encodedForm.cutLength,
                tuple(self.encodedForm.suffixToAdd),
                self.namenum)

    def __eq__(self, other):
        if isinstance(other, Interpretation4Generator):
            return self.getSortKey() == other.getSortKey()
        else:
            return False

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so it is defined explicitly
        # to keep both operators consistent.
        return not self.__eq__(other)

    def __hash__(self):
        return hash(self.getSortKey())

    def __unicode__(self):
        # lemma and suffixToAdd are already unicode (asserted when the
        # encoded form is built). Calling .decode('utf8') on them would
        # first encode implicitly with the ASCII codec and fail on any
        # non-ASCII character.
        return u'<%s,(%d %s),%d,%d>' % (self.lemma, self.encodedForm.cutLength, self.encodedForm.suffixToAdd, self.tagnum, self.namenum)

    def __repr__(self):
        # Python 2 requires __repr__ to return a byte string; a unicode
        # result would be encoded implicitly as ASCII and fail on
        # non-ASCII lemmas.
        return unicode(self).encode('utf8')