# -*- coding: utf-8 -*-
# Blame view of fsabuilder/fsa/common.py (2.47 KB)
# Authored by Michał Lenart
'''
Created on Nov 7, 2013

@author: mlenart
'''

import codecs
Michał Lenart authored
8
import logging
Michał Lenart authored
9
10
11

class Lemma(object):
    '''
    Plain record of the three values describing a lemma.

    The values are stored exactly as passed in; nothing is validated,
    converted or copied here.
    '''

    def __init__(self, cutLength, suffixToAdd, casePattern):
        '''
        @param cutLength: stored as self.cutLength
        @param suffixToAdd: stored as self.suffixToAdd
        @param casePattern: stored as self.casePattern (kept by reference)
        '''
        self.cutLength = cutLength
        self.suffixToAdd = suffixToAdd
        self.casePattern = casePattern
Michał Lenart authored
16
17
18

class Interpretation(object):
    '''
    A lemma description together with tag, name and type numbers.

    Instances compare and hash by getSortKey(), so they can be used as
    set members and dict keys.
    '''

    def __init__(self, orth, base, tagnum, namenum, typenum):
        '''
        Derive the Lemma from a pair of unicode strings.

        The "root" is the longest common prefix of orth and base, compared
        character by character with both sides lower-cased.  The Lemma
        then stores:
          - cutLength: number of characters of orth that follow the root,
          - suffixToAdd: the part of base that follows the root,
          - casePattern: one flag per root character (taken from base),
            True where the character equals its own upper() -- this is
            also True for characters that have no case at all.

        @param orth: unicode string
        @param base: unicode string
        @param tagnum: stored as self.tagnum
        @param namenum: stored as self.namenum
        @param typenum: stored as self.typenum
        @raise AssertionError: if orth or base is not a unicode string
        '''
        assert isinstance(orth, unicode)
        assert isinstance(base, unicode)

        # Count matching leading characters instead of concatenating them
        # one by one; the root is then just a slice of base.
        rootLength = 0
        for o, b in zip(orth, base):
            if o.lower() != b.lower():
                break
            rootLength += 1
        root = base[:rootLength]

        self.lemma = Lemma(
                           cutLength=len(orth) - rootLength,
                           suffixToAdd=base[rootLength:],
                           casePattern=[c == c.upper() for c in root])
        self.tagnum = tagnum
        self.namenum = namenum
        self.typenum = typenum

    def getSortKey(self):
        '''
        Return the tuple used for ordering, equality and hashing.

        Sequences are converted to tuples so that the key is hashable.
        Note that typenum is not part of the key.
        '''
        return (
                self.lemma.cutLength,
                tuple(self.lemma.suffixToAdd),
                tuple(self.lemma.casePattern),
                self.tagnum,
                self.namenum)

    def __eq__(self, other):
        if isinstance(other, Interpretation):
            return self.getSortKey() == other.getSortKey()
        else:
            return False

    def __ne__(self, other):
        # Python 2 does not derive != from __eq__; without this method
        # "a != b" would compare identities and could disagree with "==".
        return not self.__eq__(other)

    def __hash__(self):
        return hash(self.getSortKey())

class Tagset(object):
    '''
    Two name -> number mappings read from a text file.

    File format, as parsed by _doInit:
      - a line equal to "[TAGS]" or "[NAMES]" selects which mapping the
        following entries go to,
      - empty lines and lines starting with "#" are skipped,
      - every other line is "<number><SEP><name>"; fields after the
        second one are ignored.
    '''

    TAGS = 1
    NAMES = 2
    SEP = '\t'

    def __init__(self, filename, encoding='utf8'):
        '''
        @param filename: path of the tagset file to read
        @param encoding: text encoding of that file
        '''
        self.tag2tagnum = {}
        self.name2namenum = {}
        self._doInit(filename, encoding)
        # Diagnostic dump only; kept off stdout so that constructing a
        # Tagset does not pollute the program's output.
        logging.debug('tag2tagnum: %s', self.tag2tagnum)
        logging.debug('name2namenum: %s', self.name2namenum)

    def _doInit(self, filename, encoding):
        '''
        Fill self.tag2tagnum and self.name2namenum from the file.

        @raise AssertionError: if an entry appears before any section
            header, or a name is repeated within one section
        @raise IndexError: if an entry line contains no separator
        @raise ValueError: if the number field is not an integer
        '''
        sections = {Tagset.TAGS: self.tag2tagnum,
                    Tagset.NAMES: self.name2namenum}
        addingTo = None
        with codecs.open(filename, 'r', encoding) as f:
            for line in f:
                # codecs.open reads in binary mode, so "\r\n" endings are
                # not translated; strip both characters explicitly.
                line = line.rstrip(u'\r\n')
                if line == u'[TAGS]':
                    addingTo = Tagset.TAGS
                elif line == u'[NAMES]':
                    addingTo = Tagset.NAMES
                elif line and not line.startswith(u'#'):
                    assert addingTo in sections, \
                        'entry outside of [TAGS]/[NAMES] section: %r' % line
                    res = sections[addingTo]
                    fields = line.split(Tagset.SEP)
                    tagNum = fields[0]
                    tag = fields[1]
                    assert tag not in res, 'duplicate entry: %r' % tag
                    res[tag] = int(tagNum)