Polish.hs
4.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
module NLP.Concraft.Polish
(
-- * Model
C.Concraft
, C.saveModel
, C.loadModel
-- * Tagging
, tag
, marginals
-- * Analysis
, macaPar
-- * Training
, TrainConf (..)
, train
-- * Pruning
, C.prune
-- -- * Analysis
-- , anaSent
-- , reAnaPar
) where
import Control.Applicative ((<$>))
import qualified Data.Text.Lazy as L
import qualified Data.Set as S
import qualified Data.Tagset.Positional as P
import qualified Numeric.SGD as SGD
import qualified NLP.Concraft.Morphosyntax as X
import qualified NLP.Concraft.Schema as S
import NLP.Concraft.Schema (SchemaConf(..), entry, entryWith)
import qualified NLP.Concraft.Guess as G
import qualified NLP.Concraft.Disamb as D
import qualified NLP.Concraft as C
-- import qualified NLP.Concraft.Analysis as A
import NLP.Concraft.Polish.Morphosyntax hiding (tag)
import NLP.Concraft.Polish.Maca
-------------------------------------------------
-- Default configuration
-------------------------------------------------
-- | Default configuration for the guessing observation schema.
guessSchemaDefault :: SchemaConf
guessSchemaDefault = S.nullConf
{ lowPrefixesC = entryWith [1, 2] [0]
, lowSuffixesC = entryWith [1, 2] [0]
, knownC = entry [0]
, begPackedC = entry [0] }
-- | Default configuration for the guessing observation schema.
disambSchemaDefault :: SchemaConf
disambSchemaDefault = S.nullConf
{ lowOrthC = entry [-2, -1, 0, 1]
, lowPrefixesC = oov $ entryWith [1, 2, 3] [0]
, lowSuffixesC = oov $ entryWith [1, 2, 3] [0]
, begPackedC = oov $ entry [0] }
where
oov (Just body) = Just $ body { S.oovOnly = True }
oov Nothing = Nothing
-- | Default tiered tagging configuration.
tiersDefault :: [D.Tier]
tiersDefault =
[tier1, tier2]
where
tier1 = D.Tier True $ S.fromList ["cas", "per"]
tier2 = D.Tier False $ S.fromList
[ "nmb", "gnd", "deg", "asp" , "ngt", "acm"
, "acn", "ppr", "agg", "vlc", "dot" ]
-------------------------------------------------
-- Tagging
-------------------------------------------------
-- | Tag the analysed sentence.
tag :: C.Concraft -> Sent Tag -> Sent Tag
tag concraft sent =
[ select' gs t seg
| (seg, gs, t) <- zip3 sent gss ts ]
where
tagset = C.tagset concraft
packed = packSent tagset sent
tagged = C.tag concraft packed
gss = map (map showTag . S.toList . fst) tagged
ts = map (showTag . snd) tagged
showTag = P.showTag tagset
-- | Tag the sentence with marginal probabilities.
marginals :: C.Concraft -> Sent Tag -> Sent Tag
marginals concraft sent
= map (uncurry selectWMap)
$ zip wmaps sent
where
tagset = C.tagset concraft
packed = packSent tagset sent
wmaps = map
(X.mapWMap showTag)
(C.marginals concraft packed)
showTag = P.showTag tagset
-------------------------------------------------
-- Training
-------------------------------------------------
-- | Training configuration.
data TrainConf = TrainConf {
-- | Tagset.
tagset :: P.Tagset
-- | SGD parameters.
, sgdArgs :: SGD.SgdArgs
-- | Perform reanalysis.
, reana :: Bool
-- | Store SGD dataset on disk.
, onDisk :: Bool
-- | Numer of guessed tags for each word.
, guessNum :: Int
-- | `G.r0T` parameter.
, r0 :: G.R0T }
-- | Train concraft model.
-- TODO: It should be possible to supply the two training procedures with
-- different SGD arguments.
train
:: TrainConf
-> IO [SentO Tag] -- ^ Training data
-> IO [SentO Tag] -- ^ Evaluation data
-> IO C.Concraft
train TrainConf{..} train0 eval0 = do
pool <- newMacaPool 1
let ana = anaSent tagset pool
train1 = map (packSentO tagset) <$> train0
eval1 = map (packSentO tagset) <$> eval0
if reana
then doReana ana train1 eval1
else noReana train1 eval1
where
doReana ana = C.reAnaTrain tagset ana guessNum guessConf disambConf
noReana tr ev = C.train tagset guessNum guessConf disambConf
(map X.segs <$> tr) (map X.segs <$> ev)
guessConf = G.TrainConf guessSchemaDefault sgdArgs onDisk r0
disambConf = D.TrainConf tiersDefault disambSchemaDefault sgdArgs onDisk
-------------------------------------------------
-- Re-analysis
-------------------------------------------------
-- | Analyse the given sentence with Maca.
-- anaSent :: MacaPool -> L.Text -> IO (Sent Tag)
anaSent :: P.Tagset -> MacaPool -> L.Text -> IO (X.Sent Word P.Tag)
anaSent tagset pool
= fmap (packSent tagset . concat)
. macaPar pool . L.toStrict
-- -- | Reanalyse the input paragraph (lazy IO).
-- reAnaPar :: P.Tagset -> [SentO Tag] -> IO [Sent Tag]
-- reAnaPar tagset inp = do
-- pool <- newMacaPool 1
-- A.reAnaPar tagset (anaSent pool) inp