/*
 * File:   morfeusz2.h
 * Author: mlenart
 *
 * Created on 13 June 2014, 17:28
 */

#ifndef MORFEUSZ2_H
#define	MORFEUSZ2_H

#include <cstddef>
#include <exception>
#include <list>
#include <set>
#include <string>
#include <vector>

/*
 * DLLIMPORT is the linkage decoration placed on every class declared in this
 * header.
 *
 *  - __WIN32 not defined: DLLIMPORT expands to nothing.
 *  - __WIN32 defined:     DLLIMPORT is __declspec(dllexport) when
 *                         BUILDING_MORFEUSZ evaluates to non-zero, and
 *                         __declspec(dllimport) otherwise (including when
 *                         BUILDING_MORFEUSZ is not defined at all).
 *
 * NOTE(review): only __WIN32 is tested here; a toolchain that predefines
 * _WIN32 but not __WIN32 would take the "expands to nothing" branch -
 * confirm that this is intended.
 */
#ifdef __WIN32
#  if BUILDING_MORFEUSZ
#    define DLLIMPORT __declspec (dllexport)
#  else
#    define DLLIMPORT __declspec (dllimport)
#  endif
#else
#  define DLLIMPORT
#endif

namespace morfeusz {

    // Forward declarations of the public types defined later in this header.
    // MorphInterpretation is declared with the same class-key (struct) as its
    // definition, so compilers do not report a class/struct tag mismatch.
    struct DLLIMPORT MorphInterpretation;
    class DLLIMPORT Morfeusz;
    class DLLIMPORT ResultsIterator;
    class DLLIMPORT IdResolver;
    class DLLIMPORT MorfeuszException;

    /**
     * Character encodings that can be passed to Morfeusz::setCharset.
     * Every enumerator carries an explicitly assigned numeric value.
     */
    enum Charset {
        UTF8      = 101,
        // UTF16LE, UTF16BE and UTF32 are disabled (no enumerators defined).
        ISO8859_2 = 102,
        CP1250    = 103,
        CP852     = 104
    };

    /**
     * Node numbering policies accepted by Morfeusz::setTokenNumbering.
     */
    enum TokenNumbering {
        /** Numbering starts from 0 and the counter is reset on every call to Morfeusz::analyse. */
        SEPARATE_NUMBERING   = 201,

        /** Numbering also starts from 0, but the counter is reset only by a call to Morfeusz::setTokenNumbering. */
        CONTINUOUS_NUMBERING = 202
    };

    /**
     * Letter-case policies accepted by Morfeusz::setCaseHandling.
     *
     * Note: the numeric values 101 and 102 are also used by enumerators of
     * Charset; the two enums are distinct types.
     */
    enum CaseHandling {
        /** Case-sensitive, but an interpretation that does not match case is allowed when there is no alternative. */
        CONDITIONALLY_CASE_SENSITIVE = 100,

        /** Strictly case-sensitive: every interpretation that does not match case is rejected. */
        STRICTLY_CASE_SENSITIVE      = 101,

        /** Case-insensitive: letter case is ignored. */
        IGNORE_CASE                  = 102
    };

    /**
     * Whitespace policies accepted by Morfeusz::setWhitespaceHandling.
     */
    enum WhitespaceHandling {
        /** Whitespace is ignored. */
        SKIP_WHITESPACES   = 301,

        /** Whitespace is appended to the previous MorphInterpretation. */
        APPEND_WHITESPACES = 302,

        /** Whitespace is reported as separate MorphInterpretation objects. */
        KEEP_WHITESPACES   = 303
    };

    /**
     * Performs morphological analysis (analyse methods) and synthesis (generate methods).
     *
     * It is NOT thread-safe,
     * but it is possible to use a separate Morfeusz instance for each concurrent thread.
     *
     * This class is abstract; a concrete object is obtained from createInstance().
     */
    class DLLIMPORT Morfeusz {
    public:

        /**
         * Returns a string containing the library version.
         *
         * @return library version string
         */
        static std::string getVersion();

        /**
         * Creates an actual instance of the Morfeusz class.
         * The caller is responsible for destroying it.
         *
         * @return pointer to the newly created instance
         */
        static Morfeusz* createInstance();

        virtual ~Morfeusz();

        /**
         * Analyse given text and return the results as an iterator.
         * Use this method for analysis of big texts.
         * Copies the text under the hood - use analyse(const char*) if you want to avoid this.
         *
         * NOTE(review): ownership of the returned iterator is not stated here -
         * presumably the caller destroys it; confirm.
         *
         * @param text - text for morphological analysis.
         * @return - iterator over morphological analysis results
         */
        virtual ResultsIterator* analyse(const std::string& text) const = 0;

        /**
         * Analyse given text and return the results as an iterator.
         * It does not store results for the whole text at once, so it may be
         * less memory-consuming for analysis of big texts.
         *
         * NOTE(review): ownership of the returned iterator is not stated here -
         * presumably the caller destroys it; confirm.
         *
         * @param text - text for morphological analysis. This pointer must not be deleted before the returned ResultsIterator object.
         * @return - iterator over morphological analysis results
         */
        virtual ResultsIterator* analyse(const char* text) const = 0;

        /**
         * Perform morphological analysis on a given text and put results in a vector.
         *
         * NOTE(review): whether `result` is cleared first or appended to is not
         * visible from this header - confirm.
         *
         * @param text - text to be analysed
         * @param result - results vector
         */
        virtual void analyse(const std::string& text, std::vector<MorphInterpretation>& result) const = 0;

        /**
         * Perform morphological synthesis on a given lemma and put results in a vector.
         *
         * @param lemma - lemma to synthesize forms from
         * @param result - results vector
         */
        virtual void generate(const std::string& lemma, std::vector<MorphInterpretation>& result) const = 0;

        /**
         * Perform morphological synthesis on a given lemma and put results in a vector.
         * Limit results to interpretations with the specified tag.
         *
         * @param lemma - lemma to synthesize forms from
         * @param tagId - identifier of the tag the result interpretations must have
         *                (presumably an id as used by IdResolver - confirm)
         * @param result - results vector
         */
        virtual void generate(const std::string& lemma, int tagId, std::vector<MorphInterpretation>& result) const = 0;

        /**
         * Set encoding for input and output string objects.
         *
         * @param encoding - one of the Charset values
         */
        virtual void setCharset(Charset encoding) = 0;

        /**
         * Set aggl segmentation option value.
         *
         * @param aggl - option value (the set of accepted values is not visible from this header)
         */
        virtual void setAggl(const std::string& aggl) = 0;

        /**
         * Set praet segmentation option value.
         *
         * @param praet - option value (the set of accepted values is not visible from this header)
         */
        virtual void setPraet(const std::string& praet) = 0;

        /**
         * Set case handling.
         *
         * @param caseHandling - one of the CaseHandling values
         */
        virtual void setCaseHandling(CaseHandling caseHandling) = 0;

        /**
         * Set token numbering policy.
         *
         * @param numbering - one of the TokenNumbering values
         */
        virtual void setTokenNumbering(TokenNumbering numbering) = 0;

        /**
         * Set whitespace handling.
         *
         * @param whitespaceHandling - one of the WhitespaceHandling values
         */
        virtual void setWhitespaceHandling(WhitespaceHandling whitespaceHandling) = 0;

        /**
         * Set debug option value.
         *
         * @param debug - new value of the debug option
         */
        virtual void setDebug(bool debug) = 0;

        /**
         * Get a reference to the tagset currently in use.
         *
         * @return currently used tagset
         */
        virtual const IdResolver& getIdResolver() const = 0;

        /*
         * Disabled API - the declaration below is commented out and kept for reference only.
         *
         * Set current dictionary to the one with provided name.
         *
         * This is NOT thread safe (no other thread may invoke setDictionary
         * either within this instance, or any other in the same application).
         *
         * @param dictName dictionary name
         */
//        virtual void setDictionary(const std::string& dictName) = 0;

        /**
         * List of directories where the current Morfeusz instance will look for dictionaries.
         */
        std::list<std::string> dictionarySearchPaths;


        /**
         * NOTE(review): undocumented in the original. Judging by the name it
         * selects the dictionary used by the analyse methods; whether `filename`
         * is a full path or is resolved against dictionarySearchPaths is not
         * visible from this header - confirm.
         */
        virtual void setAnalyzerDictionary(const std::string& filename) = 0;

        /**
         * NOTE(review): undocumented in the original. Judging by the name it
         * selects the dictionary used by the generate methods; how `filename`
         * is resolved is not visible from this header - confirm.
         */
        virtual void setGeneratorDictionary(const std::string& filename) = 0;

    protected:
        /**
         * Same as analyse(const char*) but copies the text under the hood.
         * Useful for wrappers to other languages.
         */
        virtual ResultsIterator* analyseWithCopy(const char* text) const = 0;
    };

    /**
     * Abstract iterator over morphological analysis results; this is the type
     * returned (by pointer) from the iterator-returning Morfeusz::analyse overloads.
     */
    class DLLIMPORT ResultsIterator {
    public:
        /** NOTE(review): presumably true while at least one more interpretation is available - confirm. */
        virtual bool hasNext() = 0;
        /**
         * Returns a const reference to a MorphInterpretation.
         * NOTE(review): presumably the element next() would return, without
         * advancing; the lifetime of the referenced object is not specified here - confirm.
         */
        virtual const MorphInterpretation& peek() = 0;
        /**
         * Returns a MorphInterpretation by value.
         * NOTE(review): presumably advances the iterator; behaviour when no
         * element is left is not specified here - confirm.
         */
        virtual MorphInterpretation next() = 0;

        /** Virtual, so a concrete iterator can be destroyed through a ResultsIterator pointer. */
        virtual ~ResultsIterator() {}
    };

    /**
     * Represents a tagset: maps between numeric identifiers and the strings
     * for tags, named entity types and labels.
     *
     * Abstract interface; the instance in use is available through
     * Morfeusz::getIdResolver().
     */
    class DLLIMPORT IdResolver {
    public:

        /**
         * Returns tag (denoted by its identifier).
         *
         * @param tagId - tag identifier in the tagset.
         * @return - the tag
         */
        virtual const std::string& getTag(const int tagId) const = 0;

        /**
         * Returns identifier for given tag.
         * Throws MorfeuszException when none exists.
         *
         * @param tag - the tag to look up
         * @return identifier for given tag
         */
        virtual int getTagId(const std::string& tag) const = 0;

        /**
         * Returns named entity type (denoted by its identifier).
         *
         * @param nameId - named entity type identifier in the tagset.
         * @return - the named entity type
         */
        virtual const std::string& getName(const int nameId) const = 0;

        /**
         * Returns identifier for given named entity.
         * Throws MorfeuszException when none exists.
         *
         * @param name - the named entity type to look up
         * @return identifier for given named entity
         */
        virtual int getNameId(const std::string& name) const = 0;

        /**
         * Returns the labels denoted by labelsId as a single string.
         * NOTE(review): the string format (separator, ordering) is not visible
         * from this header - confirm.
         */
        virtual const std::string& getLabelsAsString(int labelsId) const = 0;

        /**
         * Returns the labels denoted by labelsId as a set of strings.
         */
        virtual const std::set<std::string>& getLabels(int labelsId) const = 0;

        /**
         * Returns identifier for the given labels string.
         * NOTE(review): presumably throws MorfeuszException when none exists,
         * like getTagId and getNameId - confirm.
         */
        virtual int getLabelsId(const std::string& labelsStr) const = 0;

        /**
         * Returns number of tags this tagset contains.
         *
         * @return number of tags
         */
        virtual std::size_t getTagsCount() const = 0;

        /**
         * Returns number of named entity types this tagset contains.
         *
         * @return number of named entity types
         */
        virtual std::size_t getNamesCount() const = 0;

        /**
         * NOTE(review): undocumented in the original; by analogy with
         * getTagsCount/getNamesCount presumably the number of distinct
         * labels identifiers - confirm.
         */
        virtual std::size_t getLabelsCount() const = 0;

        /** Virtual, so a concrete resolver can be destroyed through an IdResolver pointer. */
        virtual ~IdResolver() {
        }
    };

    /**
     Analysis produces a directed acyclic graph. Its nodes are numbered
     positions in the text (points _between_ segments); each edge is one
     interpretation of the segment spanning from one node to another.  E.g.,

         {0,1,"ja","ja","ppron12:sg:nom:m1.m2.m3.f.n1.n2:pri"}
         |
         |      {1,2,"został","zostać","praet:sg:m1.m2.m3:perf"}
         |      |
       __|  ____|   __{2,3,"em","być","aglt:sg:pri:imperf:wok"}
      /  \ /     \ / \
     * Ja * został*em *
     0    1       2   3

     In the example the word 'zostałem' is split into 2 separate segments.

     MorphInterpretation describes a single edge of that graph.

     */
    struct DLLIMPORT MorphInterpretation {

        /**
         * Builds an interpretation carrying the "ign" tag
         * (meaning: "not found in the dictionary").
         */
        static MorphInterpretation createIgn(int startNode,
                                             int endNode,
                                             const std::string& orth,
                                             const std::string& lemma);

        /**
         * Builds an interpretation carrying the "sp" tag
         * (meaning: "this is a sequence of whitespaces").
         */
        static MorphInterpretation createWhitespace(int startNode,
                                                    int endNode,
                                                    const std::string& orth);

        /** True when tagId is 0. */
        bool isIgn() const {
            const int ignTagId = 0;
            return tagId == ignTagId;
        }

        /** True when tagId is 1. */
        bool isWhitespace() const {
            const int whitespaceTagId = 1;
            return tagId == whitespaceTagId;
        }

        // FIXME - to be moved elsewhere
        bool hasHomonym(const std::string& homonymId) const;

        // FIXME - to be moved elsewhere
        std::string toString(bool includeNodeNumbers) const;

        /** Number of the graph node where this edge starts. */
        int startNode;
        /** Number of the graph node where this edge ends. */
        int endNode;
        /** Segment text ("został" in the example above). */
        std::string orth;
        /** Lemma of the segment ("zostać" in the example above). */
        std::string lemma;
        /** Tag identifier; 0 and 1 are the values tested by isIgn() and isWhitespace(). */
        int tagId;
        /** Named entity type identifier - presumably resolvable with IdResolver::getName; confirm. */
        int nameId;
        /** Labels identifier - presumably resolvable with IdResolver::getLabels; confirm. */
        int labelsId;
    };

    /**
     * Exception type of the library (IdResolver::getTagId and
     * IdResolver::getNameId are documented to throw it).
     * Derives from std::exception and carries a text message.
     */
    class DLLIMPORT MorfeuszException : public std::exception {
    public:

        /**
         * @param what - message text; a copy of it is stored in this object
         */
        MorfeuszException(const std::string& what) : msg(what) {
            // The string is copied directly instead of going through
            // what.c_str(), which would rescan the text for its length and
            // keep only the part before an embedded '\0'.
        }

        virtual ~MorfeuszException() throw () {
        }

        /**
         * @return the stored message as a NUL-terminated string; the pointer
         *         refers to storage owned by this exception object
         */
        virtual const char* what() const throw () {
            return msg.c_str();
        }
    private:
        // Message text; const, so it is fixed at construction.
        const std::string msg;
    };

    /**
     * MorfeuszException subtype that adds no state or behaviour of its own.
     * NOTE(review): judging by the name it reports a malformed input file -
     * confirm against the places where it is thrown.
     */
    class DLLIMPORT FileFormatException : public MorfeuszException {
    public:

        /**
         * @param what - message text, passed unchanged to MorfeuszException
         */
        FileFormatException(const std::string& what)
        : MorfeuszException(what) {}
    };
}

#endif	/* MORFEUSZ2_H */