// morfeusz/IdResolverImpl.cpp (authored by Michał Lenart)

#include "IdResolverImpl.hpp"
#include "fsa/const.hpp"
#include "utils.hpp"
#include "const.hpp"
#include "deserialization/deserializationUtils.hpp"
#include "morfeusz2.h"

using namespace std;

namespace morfeusz {

    // Fills `tags` from the serialized table located at currPtr.
    // The table is read as a count (readInt16) followed by that many
    // (id via readInt16, text via readString) entries; each text is stored
    // at index `id`. `tags` always ends up with exactly 65536 slots, so ids
    // that do not occur in the input remain empty strings.
    // currPtr is handed by reference to the read helpers (presumably they
    // advance it past the consumed bytes - confirm in deserializationUtils).
    inline static void readTags(const unsigned char*& currPtr, std::vector<std::string>& tags) {
        tags.assign(65536, std::string());
        const uint16_t entriesCount = readInt16(currPtr);
        unsigned int remaining = entriesCount;
        while (remaining > 0) {
            const unsigned int slot = readInt16(currPtr);
            tags[slot] = readString(currPtr);
            --remaining;
        }
    }

    // Rebuilds mapping.string2Id from scratch so that it is the inverse of
    // mapping.id2String. Ids are visited in ascending order, so when the same
    // string occurs under several ids the highest id is the one kept.
    inline static void createReverseMapping(IdResolverImpl::IdStringMapping& mapping) {
        mapping.string2Id.clear();
        const std::vector<std::string>& strings = mapping.id2String;
        for (unsigned int id = 0; id < strings.size(); ++id) {
            mapping.string2Id[strings[id]] = id;
        }
    }

    template <class T>
    inline static const T& getFromMap(map<string, T> string2T, const string& key, const char* errMsg) {
        if (string2T.count(key) != 0) {
            return string2T.find(key)->second;
        }
        else {
            throw MorfeuszException(string(errMsg) + ": " + key);
        }
    }

    // Re-encodes every string of mapping.id2String by passing it through
    // charsetConverter->toUTF8 and then charsetConverter->fromUTF8, and
    // afterwards regenerates mapping.string2Id to match the new strings.
    // NOTE(review): both steps use the same converter; confirm this round
    // trip is what is wanted when the stored strings are currently in a
    // different encoding than the one charsetConverter handles.
    inline static void convertCharset(const CharsetConverter* charsetConverter, IdResolverImpl::IdStringMapping& mapping) {
        std::vector<std::string>& strings = mapping.id2String;
        for (unsigned int idx = 0; idx < strings.size(); ++idx) {
            const std::string asUtf8 = charsetConverter->toUTF8(strings[idx]);
            strings[idx] = charsetConverter->fromUTF8(asUtf8);
        }
        createReverseMapping(mapping);
    }

    // Default constructor: creates a resolver with an empty tagset id and
    // empty tag/name/label tables, using the shared UTF8CharsetConverter
    // instance as its charset converter.
    IdResolverImpl::IdResolverImpl()
    : tagsetId(),
    tags(),
    names(),
    labels(),
    labelsAsSets(),
    charsetConverter(&UTF8CharsetConverter::getInstance()) {

    }

    // Builds a resolver from serialized dictionary data.
    //
    // ptr              - start of the serialized data; the FSA size is read
    //                    at ptr + FSA_DATA_SIZE_OFFSET and the id tables are
    //                    expected after the FSA data.
    // charsetConverter - converter stored in this resolver and applied to all
    //                    tables via setCharsetConverter at the end.
    //
    // Reading order after the FSA: tagset id string, then the tags table, the
    // names table and the labels table (each through readTags).
    IdResolverImpl::IdResolverImpl(const unsigned char* ptr, const CharsetConverter* charsetConverter)
    : tagsetId(),
    tags(),
    names(),
    labels(),
    labelsAsSets(),
    charsetConverter(charsetConverter) {
        uint32_t fsaSize = readInt32Const(ptr + FSA_DATA_SIZE_OFFSET);
        // Skip the FSA data plus 4 further bytes
        // (presumably a size/header field preceding the tagset id - TODO confirm).
        const unsigned char* currPtr = ptr + FSA_DATA_OFFSET + fsaSize + 4;
        this->tagsetId = readString(currPtr);
        readTags(currPtr, this->tags.id2String);
        createReverseMapping(this->tags);

        readTags(currPtr, this->names.id2String);
        createReverseMapping(this->names);

        readTags(currPtr, this->labels.id2String);
        createReverseMapping(this->labels);
        // Pre-split every labels string into a set, kept under the same index.
        // This happens before the charset conversion below, so the sets hold
        // the strings exactly as they were read.
        for (unsigned int i = 0; i < this->labels.id2String.size(); i++) {
            vector<string> labelsVector = split(this->labels.id2String[i], LABELS_SEPARATOR);
            this->labelsAsSets.push_back(set<string>(labelsVector.begin(), labelsVector.end()));
        }

        setCharsetConverter(charsetConverter);
    }
    // FIXME - probably should not convert whole tagset on every setCharsetConverter method invocation.

    void IdResolverImpl::setCharsetConverter(const CharsetConverter* charsetConverter) {
        convertCharset(charsetConverter, this->tags);
        convertCharset(charsetConverter, this->names);
        convertCharset(charsetConverter, this->labels);

        this->charsetConverter = charsetConverter;
    }

    // Returns a copy of the tagset identifier held by this resolver
    // (empty for a default-constructed resolver, otherwise the string read
    // from the dictionary data by the constructor).
    const string IdResolverImpl::getTagsetId() const {
        return this->tagsetId;
    }

    // Returns the tag string stored under tagId.
    // Access is bounds-checked (vector::at), so an id outside the table
    // results in std::out_of_range.
    const string& IdResolverImpl::getTag(const int tagId) const {
        const std::vector<std::string>& tagStrings = tags.id2String;
        return tagStrings.at(tagId);
    }

    int IdResolverImpl::getTagId(const std::string& tag) const {
        return getFromMap(this->tags.string2Id, tag, "Invalid tag");
    }

    // Returns the name string stored under nameId.
    // Access is bounds-checked (vector::at), so an id outside the table
    // results in std::out_of_range.
    const string& IdResolverImpl::getName(const int nameId) const {
        const std::vector<std::string>& nameStrings = names.id2String;
        return nameStrings.at(nameId);
    }

    int IdResolverImpl::getNameId(const std::string& name) const {
        return getFromMap(this->names.string2Id, name, "Invalid name");
    }

    // Returns the labels string stored under labelsId, in its unsplit form.
    // Access is bounds-checked (vector::at), so an id outside the table
    // results in std::out_of_range.
    const string& IdResolverImpl::getLabelsAsString(int labelsId) const {
        const std::vector<std::string>& labelStrings = labels.id2String;
        return labelStrings.at(labelsId);
    }

    // Returns the set of individual labels kept in labelsAsSets under
    // labelsId. Lookup goes through at(), i.e. it is range-checked.
    const set<string>& IdResolverImpl::getLabels(int labelsId) const {
        return labelsAsSets.at(labelsId);
    }

    int IdResolverImpl::getLabelsId(const string& labelsStr) const {
        return getFromMap(this->labels.string2Id, labelsStr, "Invalid labels string");
    }

    size_t IdResolverImpl::getTagsCount() const {
        return this->tags.id2String.size();
    }

    size_t IdResolverImpl::getNamesCount() const {
        return this->names.id2String.size();
    }

    size_t IdResolverImpl::getLabelsCount() const {
        return this->labels.id2String.size();
    }

    bool IdResolverImpl::isCompatibleWith(const IdResolverImpl& other) const {
Michał Lenart authored
140
141
        return this->tagsetId == other.tagsetId
                && this->tags.id2String == other.tags.id2String 
Michał Lenart authored
142
143
144
                && this->names.id2String == other.names.id2String
                && this->labels.id2String == other.labels.id2String;
    }
}