/* * File: morfeusz2.h * Author: mlenart * * Created on 13 czerwiec 2014, 17:28 */ #ifndef MORFEUSZ2_H #define MORFEUSZ2_H #include <vector> #include <string> #include <list> #include <set> #ifndef __WIN32 #define DLLIMPORT #else /* A Windows system. Need to define DLLIMPORT. */ #if BUILDING_MORFEUSZ #define DLLIMPORT __declspec (dllexport) #else #define DLLIMPORT __declspec (dllimport) #endif #endif namespace morfeusz { class DLLIMPORT MorphInterpretation; class DLLIMPORT Morfeusz; class DLLIMPORT ResultsIterator; class DLLIMPORT IdResolver; class DLLIMPORT MorfeuszException; enum Charset { UTF8 = 11, // UTF16LE, // UTF16BE, // UTF32, ISO8859_2 = 12, CP1250 = 13, CP852 = 14 }; enum TokenNumbering { /** * Start from 0. Reset counter for every invocation of Morfeusz::analyze */ SEPARATE_NUMBERING = 201, /** * Also start from 0. Reset counter for every invocation of Morfeusz::setTokenNumbering only */ CONTINUOUS_NUMBERING = 202 }; enum CaseHandling { /** * Case-sensitive but allows interpretations that do not match case but there are no alternatives */ CONDITIONALLY_CASE_SENSITIVE = 100, /** * Strictly case-sensitive, reject all interpretations that do not match case */ STRICTLY_CASE_SENSITIVE = 101, /** * Case-insensitive - ignores case */ IGNORE_CASE = 102 }; enum WhitespaceHandling { /** * Ignore whitespaces */ SKIP_WHITESPACES = 301, /** * Append whitespaces to previous MorphInterpretation */ APPEND_WHITESPACES = 302, /** * Whitespaces are separate MorphInterpretation objects */ KEEP_WHITESPACES = 303 }; enum MorfeuszUsage { ANALYSE_ONLY = 401, GENERATE_ONLY = 402, BOTH_ANALYSE_AND_GENERATE = 403 }; /** * Performs morphological analysis (analyze methods) and syntesis (generate methods). * * It is NOT thread-safe * but it is possible to use separate Morfeusz instance for each concurrent thread. */ class DLLIMPORT Morfeusz { public: /** * Returns a string containing library version. * @return */ static std::string getVersion(); /** * Returns a string containing default dictionary name. * @return */ static std::string getDefaultDictName(); /** * Creates actual instance of Morfeusz class. * The caller is responsible for destroying it. * * @remarks NOT THREAD-SAFE (affects ALL Morfeusz instances) * @return new instance of Morfeusz. */ static Morfeusz* createInstance(MorfeuszUsage usage); /** * Creates exact copy of Morfeusz object. * * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances). */ virtual Morfeusz* clone() const = 0; virtual ~Morfeusz(); /** * Analyze given text and return the results as iterator. * Use this method for analysis of big texts. * Copies the text under the hood - use analyze(const char*) if you want to avoid this. * * @param text - text for morphological analysis. * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances). * @return - iterator over morphological analysis results */ virtual ResultsIterator* analyse(const std::string& text) const = 0; /** * Analyze given text and return the results as iterator. * It does not store results for whole text at once, so may be less memory-consuming for analysis of big texts * * * @param text - text for morphological analysis. This pointer must not be deleted before returned ResultsIterator object. * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances). * @return - iterator over morphological analysis results */ virtual ResultsIterator* analyse(const char* text) const = 0; /** * Perform morphological analysis on a given text and put results in a vector. * * @param text - text to be analyzed * @param result - results vector * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances). */ virtual void analyse(const std::string& text, std::vector<MorphInterpretation>& result) const = 0; /** * Perform morphological synthesis on a given lemma and put results in a vector. * * @param lemma - lemma to be analyzed * @param result - results vector * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances). * @throws MorfeuszException - when lemma parameter contains whitespaces. */ virtual void generate(const std::string& lemma, std::vector<MorphInterpretation>& result) const = 0; /** * Perform morphological synthesis on a given lemma and put results in a vector. * Limit results to interpretations with the specified tag. * * @param lemma - lemma to be analyzed * @param tag - tag of result interpretations * @param result - results vector * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances). * @throws MorfeuszException - when lemma parameter contains whitespaces or tagId is outside tagset. */ virtual void generate(const std::string& lemma, int tagId, std::vector<MorphInterpretation>& result) const = 0; /** * Set encoding for input and output string objects. * * @param encoding * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances). */ virtual void setCharset(Charset encoding) = 0; /** * Select agglutination rules * * @param aggl * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances). * @throws MorfeuszException - for invalid aggl parameter. */ virtual void setAggl(const std::string& aggl) = 0; /** * Select past tense segmentation * * @param praet * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances). * @throws MorfeuszException - for invalid aggl praet parameter. */ virtual void setPraet(const std::string& praet) = 0; /** * Set case handling. * * @param caseSensitive * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances). */ virtual void setCaseHandling(CaseHandling caseHandling) = 0; /** * Set token numbering policy. * * @param numbering * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances). */ virtual void setTokenNumbering(TokenNumbering numbering) = 0; /** * Set whitespace handling. * * @param numbering * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances). */ virtual void setWhitespaceHandling(WhitespaceHandling whitespaceHandling) = 0; /** * Set debug option value. * * @param debug */ virtual void setDebug(bool debug) = 0; /** * Get reference to tagset currently being in use. * * @return currently used tagset */ virtual const IdResolver& getIdResolver() const = 0; /** * Set current dictionary to the one with provided name. * * This is NOT THREAD SAFE - no other thread may invoke setDictionary * either within this instance, or any other in the same application. * * @param dictName dictionary name * @remarks NOT THREAD-SAFE (affects ALL Morfeusz instances) * @throws MorfeuszException - when dictionary not found. * @throws std::ios_base::failure - when IO error occurred when loading given dictionary. */ virtual void setDictionary(const std::string& dictName) = 0; /** * List of paths where current Morfeusz instance will look for dictionaries. * Modifying it is NOT THREAD-SAFE. */ static std::list<std::string> dictionarySearchPaths; /** * Get available parameters for "setAggl" method. * @return */ virtual const std::set<std::string>& getAvailableAgglOptions() const = 0; /** * Get available parameters for "setPraet" method. * @return */ virtual const std::set<std::string>& getAvailablePraetOptions() const = 0; protected: /** * Same as analyze(text) but copies the text under the hood. * Useful for wrappers to other languages. */ virtual ResultsIterator* analyseWithCopy(const char* text) const = 0; }; class DLLIMPORT ResultsIterator { public: /** * * @return true iff this iterator contains more elements. */ virtual bool hasNext() = 0; /** * * @return the element, that will be returned in next next() invocation. * @throws std::out_of_range when this iterator has already reached the end. */ virtual const MorphInterpretation& peek() = 0; /** * * @return next analysis result. * @throws std::out_of_range when this iterator has already reached the end. */ virtual MorphInterpretation next() = 0; virtual ~ResultsIterator() { } }; /** * Represents a tagset */ class DLLIMPORT IdResolver { public: /** * Returns tag (denoted by its index). * * @param tagNum - tag index in the tagset. * @return - the tag * @throws std::out_of_range when invalid tagId is provided. */ virtual const std::string& getTag(const int tagId) const = 0; /** * Returns identifier for given tag. * Throws MorfeuszException when none exists. * * @return identifier for given tag * @throws MorfeuszException when invalid tag parameter is provided. */ virtual int getTagId(const std::string& tag) const = 0; /** * Returns named entity type (denoted by its index). * * @param nameNum - name index in the tagset. * @return - the named entity type * @throws std::out_of_range when invalid nameId is provided. */ virtual const std::string& getName(const int nameId) const = 0; /** * Returns identifier for given named entity. * Throws MorfeuszException when none exists. * * @return identifier for given named entity * @throws MorfeuszException when invalid name parameter is provided. */ virtual int getNameId(const std::string& name) const = 0; /** * Returns labels string for given labelsId. * * @param labelsId * @return labels as string * @throws std::out_of_range when invalid labelsId is provided. */ virtual const std::string& getLabelsAsString(int labelsId) const = 0; /** * Returns labels as set of strings for given labelsId. * @param labelsId * @return labels as set of strings * @throws std::out_of_range when invalid labelsId is provided. */ virtual const std::set<std::string>& getLabels(int labelsId) const = 0; /** * Get labelsId for given labels as string. * * @param labelsStr * @return labelsId * @throws MorfeuszException when invalid tag is provided. */ virtual int getLabelsId(const std::string& labelsStr) const = 0; /** * Returns number of tags this tagset contains. * * @return */ virtual size_t getTagsCount() const = 0; /** * Returns number of named entity types this tagset contains. * * @return */ virtual size_t getNamesCount() const = 0; /** * Returns number of different labels combinations. */ virtual size_t getLabelsCount() const = 0; virtual ~IdResolver() { } }; /** The result of analysis is a directed acyclic graph with numbered nodes representing positions in text (points _between_ segments) and edges representing interpretations of segments that span from one node to another. E.g., {0,1,"ja","ja","ppron12:sg:nom:m1.m2.m3.f.n1.n2:pri"} | | {1,2,"został","zostać","praet:sg:m1.m2.m3:perf"} | | __| ____| __{2,3,"em","być","aglt:sg:pri:imperf:wok"} / \ / \ / \ * Ja * został*em * 0 1 2 3 Note that the word 'zostałem' got broken into 2 separate segments. The structure below describes one edge of this DAG: */ struct DLLIMPORT MorphInterpretation { /** * Creates new instance with "ign" tag (meaning: "not found in the dictionary") */ static MorphInterpretation createIgn( int startNode, int endNode, const std::string& orth, const std::string& lemma); /** * Creates new instance with "sp" tag (meaning: "this is a sequence of whitespaces") */ static MorphInterpretation createWhitespace(int startNode, int endNode, const std::string& orth); /** * * @return true iff this instance represents an unknown word. */ inline bool isIgn() const { return tagId == 0; } /** * * @return true iff this instance represents a whitespace. */ inline bool isWhitespace() const { return tagId == 1; } /** * Get tag as string. * * @param morfeusz Morfeusz instance this interpretation was created by. * @return */ inline const std::string& getTag(const Morfeusz& morfeusz) const { return morfeusz.getIdResolver().getTag(this->tagId); } /** * Get name as string. * * @param morfeusz Morfeusz instance this interpretation was created by. * @return */ inline const std::string& getName(const Morfeusz& morfeusz) const { return morfeusz.getIdResolver().getName(this->nameId); } /** * Get labels as string. * * @param morfeusz Morfeusz instance this interpretation was created by. * @return */ inline const std::string& getLabelsAsString(const Morfeusz& morfeusz) const { return morfeusz.getIdResolver().getLabelsAsString(this->labelsId); } /** * Get tag as set of strings. * * @param morfeusz Morfeusz instance this interpretation was created by. * @return */ inline const std::set<std::string>& getLabels(const Morfeusz& morfeusz) const { return morfeusz.getIdResolver().getLabels(this->labelsId); } int startNode; int endNode; std::string orth; std::string lemma; int tagId; int nameId; int labelsId; }; class DLLIMPORT MorfeuszException : public std::exception { public: MorfeuszException(const std::string& what) : msg(what.c_str()) { } virtual ~MorfeuszException() throw () { } virtual const char* what() const throw () { return this->msg.c_str(); } private: const std::string msg; }; class DLLIMPORT FileFormatException : public MorfeuszException { public: FileFormatException(const std::string& what) : MorfeuszException(what) { } }; } #endif /* MORFEUSZ2_H */