Polish.hs 4.91 KB
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}


module NLP.Concraft.Polish
(
-- * Model
  C.Concraft
, C.saveModel
, C.loadModel

-- * Tagging
, tag
, marginals

-- * Analysis
, macaPar

-- * Training
, TrainConf (..)
, train

-- * Pruning
, C.prune

-- -- * Analysis
-- , anaSent
-- , reAnaPar
) where


import           Control.Applicative ((<$>))
import qualified Data.Text.Lazy as L
import qualified Data.Set as S

import qualified Data.Tagset.Positional as P
import qualified Numeric.SGD as SGD

import qualified NLP.Concraft.Morphosyntax as X
import qualified NLP.Concraft.Schema as S
import           NLP.Concraft.Schema (SchemaConf(..), entry, entryWith)
import qualified NLP.Concraft.Guess as G
import qualified NLP.Concraft.Disamb as D
import qualified NLP.Concraft as C
-- import qualified NLP.Concraft.Analysis as A

import           NLP.Concraft.Polish.Morphosyntax hiding (tag)
import           NLP.Concraft.Polish.Maca


-------------------------------------------------
-- Default configuration
-------------------------------------------------


-- | Default configuration for the guessing observation schema.
guessSchemaDefault :: SchemaConf
guessSchemaDefault = S.nullConf
    { lowPrefixesC  = entryWith [1, 2]      [0]
    , lowSuffixesC  = entryWith [1, 2]      [0]
    , knownC        = entry                 [0]
    , begPackedC    = entry                 [0] }


-- | Default configuration for the guessing observation schema.
disambSchemaDefault :: SchemaConf
disambSchemaDefault = S.nullConf
    { lowOrthC      = entry                         [-2, -1, 0, 1]
    , lowPrefixesC  = oov $ entryWith [1, 2, 3]     [0]
    , lowSuffixesC  = oov $ entryWith [1, 2, 3]     [0]
    , begPackedC    = oov $ entry                   [0] }
  where
    oov (Just body) = Just $ body { S.oovOnly = True }
    oov Nothing     = Nothing


-- | Default tiered tagging configuration.
tiersDefault :: [D.Tier]
tiersDefault =
    [tier1, tier2]
  where
    tier1 = D.Tier True $ S.fromList ["cas", "per"]
    tier2 = D.Tier False $ S.fromList
        [ "nmb", "gnd", "deg", "asp" , "ngt", "acm"
        , "acn", "ppr", "agg", "vlc", "dot" ]


-------------------------------------------------
-- Tagging
-------------------------------------------------


-- | Tag the analysed sentence.
tag :: C.Concraft -> Sent Tag -> Sent Tag
tag concraft sent =
    [ select' gs t seg
    | (seg, gs, t) <- zip3 sent gss ts ]
  where
    tagset = C.tagset concraft
    packed = packSent tagset sent
    tagged = C.tag concraft packed
    gss    = map (map showTag . S.toList . fst) tagged
    ts     = map (showTag . snd) tagged
    showTag = P.showTag tagset


-- | Tag the sentence with marginal probabilities.
marginals :: C.Concraft -> Sent Tag -> Sent Tag
marginals concraft sent
    = map (uncurry selectWMap)
    $ zip wmaps sent
  where
    tagset = C.tagset concraft
    packed = packSent tagset sent
    wmaps  = map
        (X.mapWMap showTag)
        (C.marginals concraft packed)
    showTag = P.showTag tagset


-------------------------------------------------
-- Training
-------------------------------------------------


-- | Training configuration.
data TrainConf = TrainConf {
    -- | Tagset.
      tagset    :: P.Tagset
    -- | SGD parameters.
    , sgdArgs   :: SGD.SgdArgs
    -- | Perform reanalysis.
    , reana     :: Bool
    -- | Store SGD dataset on disk.
    , onDisk    :: Bool
    -- | Numer of guessed tags for each word.
    , guessNum  :: Int
    -- | `G.r0T` parameter.
    , r0        :: G.R0T }

-- | Train concraft model.
-- TODO: It should be possible to supply the two training procedures with
-- different SGD arguments.
train
    :: TrainConf
    -> IO [SentO Tag]      -- ^ Training data
    -> IO [SentO Tag]      -- ^ Evaluation data
    -> IO C.Concraft
train TrainConf{..} train0 eval0 = do

    pool <- newMacaPool 1
    let ana = anaSent tagset pool
        train1 = map (packSentO tagset) <$> train0
        eval1  = map (packSentO tagset) <$> eval0

    if reana
        then doReana ana train1 eval1
        else noReana     train1 eval1

  where

    doReana ana   = C.reAnaTrain tagset ana guessNum guessConf disambConf
    noReana tr ev = C.train tagset guessNum guessConf disambConf 
        (map X.segs <$> tr) (map X.segs <$> ev)

    guessConf  = G.TrainConf guessSchemaDefault sgdArgs onDisk r0
    disambConf = D.TrainConf tiersDefault disambSchemaDefault sgdArgs onDisk


-------------------------------------------------
-- Re-analysis
-------------------------------------------------


-- | Analyse the given sentence with Maca.
-- anaSent :: MacaPool -> L.Text -> IO (Sent Tag)
anaSent :: P.Tagset -> MacaPool -> L.Text -> IO (X.Sent Word P.Tag)
anaSent tagset pool
    = fmap (packSent tagset . concat)
    . macaPar pool . L.toStrict


-- -- | Reanalyse the input paragraph (lazy IO).
-- reAnaPar :: P.Tagset -> [SentO Tag] -> IO [Sent Tag]
-- reAnaPar tagset inp = do
--     pool <- newMacaPool 1
--     A.reAnaPar tagset (anaSent pool) inp