Blame view

concraft/src/NLP/Concraft/Polish.hs 4.91 KB
Jan Lupa authored
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}


module NLP.Concraft.Polish
(
-- * Model
  C.Concraft
, C.saveModel
, C.loadModel

-- * Tagging
, tag
, marginals

-- * Analysis
, macaPar

-- * Training
, TrainConf (..)
, train

-- * Pruning
, C.prune

-- -- * Analysis
-- , anaSent
-- , reAnaPar
) where


import           Control.Applicative ((<$>))
import qualified Data.Text.Lazy as L
import qualified Data.Set as S

import qualified Data.Tagset.Positional as P
import qualified Numeric.SGD as SGD

import qualified NLP.Concraft.Morphosyntax as X
import qualified NLP.Concraft.Schema as S
import           NLP.Concraft.Schema (SchemaConf(..), entry, entryWith)
import qualified NLP.Concraft.Guess as G
import qualified NLP.Concraft.Disamb as D
import qualified NLP.Concraft as C
-- import qualified NLP.Concraft.Analysis as A

import           NLP.Concraft.Polish.Morphosyntax hiding (tag)
import           NLP.Concraft.Polish.Maca


-------------------------------------------------
-- Default configuration
-------------------------------------------------


-- | Default configuration for the guessing observation schema.
guessSchemaDefault :: SchemaConf
guessSchemaDefault = S.nullConf
    { lowPrefixesC  = entryWith [1, 2]      [0]
    , lowSuffixesC  = entryWith [1, 2]      [0]
    , knownC        = entry                 [0]
    , begPackedC    = entry                 [0] }


-- | Default configuration for the guessing observation schema.
disambSchemaDefault :: SchemaConf
disambSchemaDefault = S.nullConf
    { lowOrthC      = entry                         [-2, -1, 0, 1]
    , lowPrefixesC  = oov $ entryWith [1, 2, 3]     [0]
    , lowSuffixesC  = oov $ entryWith [1, 2, 3]     [0]
    , begPackedC    = oov $ entry                   [0] }
  where
    oov (Just body) = Just $ body { S.oovOnly = True }
    oov Nothing     = Nothing


-- | Default tiered tagging configuration.
tiersDefault :: [D.Tier]
tiersDefault =
    [tier1, tier2]
  where
    tier1 = D.Tier True $ S.fromList ["cas", "per"]
    tier2 = D.Tier False $ S.fromList
        [ "nmb", "gnd", "deg", "asp" , "ngt", "acm"
        , "acn", "ppr", "agg", "vlc", "dot" ]


-------------------------------------------------
-- Tagging
-------------------------------------------------


-- | Tag the analysed sentence.
tag :: C.Concraft -> Sent Tag -> Sent Tag
tag concraft sent =
    [ select' gs t seg
    | (seg, gs, t) <- zip3 sent gss ts ]
  where
    tagset = C.tagset concraft
    packed = packSent tagset sent
    tagged = C.tag concraft packed
    gss    = map (map showTag . S.toList . fst) tagged
    ts     = map (showTag . snd) tagged
    showTag = P.showTag tagset


-- | Tag the sentence with marginal probabilities.
marginals :: C.Concraft -> Sent Tag -> Sent Tag
marginals concraft sent
    = map (uncurry selectWMap)
    $ zip wmaps sent
  where
    tagset = C.tagset concraft
    packed = packSent tagset sent
    wmaps  = map
        (X.mapWMap showTag)
        (C.marginals concraft packed)
    showTag = P.showTag tagset


-------------------------------------------------
-- Training
-------------------------------------------------


-- | Training configuration.
data TrainConf = TrainConf {
    -- | Tagset.
      tagset    :: P.Tagset
    -- | SGD parameters.
    , sgdArgs   :: SGD.SgdArgs
    -- | Perform reanalysis.
    , reana     :: Bool
    -- | Store SGD dataset on disk.
    , onDisk    :: Bool
    -- | Numer of guessed tags for each word.
    , guessNum  :: Int
    -- | `G.r0T` parameter.
    , r0        :: G.R0T }

-- | Train concraft model.
-- TODO: It should be possible to supply the two training procedures with
-- different SGD arguments.
train
    :: TrainConf
    -> IO [SentO Tag]      -- ^ Training data
    -> IO [SentO Tag]      -- ^ Evaluation data
    -> IO C.Concraft
train TrainConf{..} train0 eval0 = do

    pool <- newMacaPool 1
    let ana = anaSent tagset pool
        train1 = map (packSentO tagset) <$> train0
        eval1  = map (packSentO tagset) <$> eval0

    if reana
        then doReana ana train1 eval1
        else noReana     train1 eval1

  where

    doReana ana   = C.reAnaTrain tagset ana guessNum guessConf disambConf
    noReana tr ev = C.train tagset guessNum guessConf disambConf 
        (map X.segs <$> tr) (map X.segs <$> ev)

    guessConf  = G.TrainConf guessSchemaDefault sgdArgs onDisk r0
    disambConf = D.TrainConf tiersDefault disambSchemaDefault sgdArgs onDisk


-------------------------------------------------
-- Re-analysis
-------------------------------------------------


-- | Analyse the given sentence with Maca.
-- anaSent :: MacaPool -> L.Text -> IO (Sent Tag)
anaSent :: P.Tagset -> MacaPool -> L.Text -> IO (X.Sent Word P.Tag)
anaSent tagset pool
    = fmap (packSent tagset . concat)
    . macaPar pool . L.toStrict


-- -- | Reanalyse the input paragraph (lazy IO).
-- reAnaPar :: P.Tagset -> [SentO Tag] -> IO [Sent Tag]
-- reAnaPar tagset inp = do
--     pool <- newMacaPool 1
--     A.reAnaPar tagset (anaSent pool) inp