Commit 4105efc5dcebea75ccaf873196895e83dcf565be
1 parent
482d7740
Revert "Added interactive versions of Concraft, MACA, and Świgra along with inst…
…allation instructions." This reverts commit 482d7740.
Showing
151 changed files
with
0 additions
and
120082 deletions
Too many changes to show.
To preserve performance only 28 of 151 files are displayed.
tools/concraft/.gitignore deleted
tools/concraft/LICENSE deleted
1 | -Copyright (c) 2013 Jakub Waszczuk | |
2 | -All rights reserved. | |
3 | - | |
4 | -Redistribution and use in source and binary forms, with or without | |
5 | -modification, are permitted provided that the following conditions | |
6 | -are met: | |
7 | - | |
8 | - * Redistributions of source code must retain the above copyright | |
9 | - notice, this list of conditions and the following disclaimer. | |
10 | - | |
11 | - * Redistributions in binary form must reproduce the above | |
12 | - copyright notice, this list of conditions and the following | |
13 | - disclaimer in the documentation and/or other materials provided | |
14 | - with the distribution. | |
15 | - | |
16 | -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
17 | -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
18 | -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
19 | -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
20 | -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
21 | -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
22 | -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
23 | -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
24 | -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
25 | -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
26 | -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
tools/concraft/README.md deleted
1 | -The instructions below apply to modern systems with the following caveats: | |
2 | -Before running stack install it might be necessary to execute stack setup in order to install the correct version of ghc. | |
3 | - | |
4 | -Maca can be installed using the instructions on nlp.pwr.wroc.pl/redmine/projects/libpltagger/wiki/InstallOnUbuntu11 with the following exceptions: | |
5 | -The package libboost-all-dev can be installed in place of libboost1.42-all-dev as the latter might be unavailable. | |
6 | -Instead of using Bartosz Zaborowski's repository one can download Morfeusz directly from | |
7 | -ppa.launchpad.net/bartosz-zaborowski/nlp/ubuntu/pool/main/m/morfeusz-sgjp/morfeusz-sgjp_0.81-1~precise_amd64.deb | |
8 | -Before installing SFST it is necessary to modify the file maca/third_party/SFST-1.2/SFST/src/fst.C | |
9 | -In lines 445 and 449 the identifier it must be replaced with iter (or any other non-conflicting identifier). | |
10 | -It might also be necessary to install ncurses using | |
11 | -sudo apt-get install libncurses5-dev | |
12 | - | |
13 | -After Concraft is successfully installed its executable can be found in ~/.local/bin | |
14 | -For an example usage of Concraft's server-client mode (in OCaml) see concraft_test.ml | |
15 | - | |
16 | -Below are the contents of the original README file. | |
17 | - | |
18 | -Concraft-pl | |
19 | -=========== | |
20 | - | |
21 | -This package provides a morphosyntactic tagger for the Polish language. | |
22 | -The tool combines the following components into a pipeline: | |
23 | -* A morphosyntactic segmentation and analysis tool [Maca][maca], | |
24 | -* A morphosyntactic disambiguation library [Concraft][concraft], | |
25 | - | |
26 | -<!--- | |
27 | -* A simple, frequency-driven lemmatiser (TODO). Until the lemmatiser component | |
28 | - is implemented, the tagger may output multiple interpretations (all related | |
29 | - to the same morphosyntactic tag, but with different lemmas) in some cases. | |
30 | ---> | |
31 | - | |
32 | -As for now, the tagger doesn't provide any lemmatisation capabilities. | |
33 | -As a result, it may output multiple interpretations (all related | |
34 | -to the same morphosyntactic tag, but with different lemmas) for some known | |
35 | -words, while for the out-of-vocabulary words it just outputs orthographic | |
36 | -forms as lemmas. | |
37 | - | |
38 | -See the [homepage][homepage] if you wish to download a pre-trained | |
39 | -model for the Polish language. | |
40 | - | |
41 | - | |
42 | -Installation | |
43 | -============ | |
44 | - | |
45 | -It is recommanded to install Concraft-pl using the | |
46 | -[Haskell Tool Stack][stack], which you will need to downoload and | |
47 | -install on your machine beforehand. Then clone this repository into | |
48 | -a local directory and use `stack` to install the library by running: | |
49 | - | |
50 | - stack install | |
51 | - | |
52 | -Unless you plan to use a custom preprocessing pipeline or run | |
53 | -[Maca][maca] on a different machine (see section | |
54 | -[Tagging analysed data](#tagging-analysed-data)), you will also need | |
55 | -the [Maca][maca] tool. A detailed [installation guide][maca-install] | |
56 | -can be found on the [Maca][maca] homepage. | |
57 | - | |
58 | - | |
59 | -Data format | |
60 | -=========== | |
61 | - | |
62 | -The current version of Concraft-pl works on a simple `plain` text format supported by | |
63 | -the [Corpus2][corpus2] tools. You will have to install these tools when you install | |
64 | -[Maca][maca] anyway, so you can use them to convert the output generated | |
65 | -by Concraft-pl to one of other formats supported by [Corpus2][corpus2]. | |
66 | - | |
67 | - | |
68 | -Training | |
69 | -======== | |
70 | - | |
71 | -If you have the training material with disambiguation annotations (stored in the | |
72 | -`plain` text format) you can train the Concraft-pl model yourself. | |
73 | - | |
74 | - concraft-pl train train.plain -e eval.plain -o model.gz | |
75 | - | |
76 | -Concraft-pl uses the [NKJP][nkjp] [morphosyntactic tagset definition](config/nkjp-tagset.cfg) | |
77 | -by default. It will also reanalyse the input data before the actual training. If you want | |
78 | -to change this behaviour, use the `--tagset` and `--noana` command-line options. | |
79 | - | |
80 | -Consider using [runtime system options][ghc-rts]. You can speed up processing | |
81 | -by making use of multiple cores by using the `-N` option. The `-s` option will | |
82 | -produce the runtime statistics, such as the time spent in the garbage collector. | |
83 | -If the program is spending too much time collecting garbage, you can try to | |
84 | -increase the allocation area size with the `-A` option. If you have a big | |
85 | -dataset and it doesn't fit in the computer memory, use the `--disk` flag. | |
86 | -For example, to train the model using four threads and 256M allocation area | |
87 | -size, run: | |
88 | - | |
89 | - concraft-pl train train.plain -e eval.plain -o model.gz +RTS -N4 -A256M -s | |
90 | - | |
91 | -Run `concraft-pl train --help` to learn more about the program arguments and | |
92 | -possible training options. | |
93 | - | |
94 | -Finally, you may consider pruning the resultant model in order to reduce its size. | |
95 | -Features with values close to 0 (in log-domain) have little effect on the modeled | |
96 | -probability and, therefore, it should be safe to discard them. | |
97 | - | |
98 | - concraft-pl prune -t 0.05 input-model.gz pruned-model.gz | |
99 | - | |
100 | - | |
101 | -Tagging | |
102 | -======= | |
103 | - | |
104 | -Once you have a Concraft-pl model you can use the following command tag `input.txt` file: | |
105 | - | |
106 | - concraft-pl tag model.gz < input.txt > output.plain | |
107 | - | |
108 | -The input file is first divided into paragraphs (the tool interprets empty lines | |
109 | -as paragraph ending markers). After that, [Maca][maca] is used to segment and analyse | |
110 | -each paragraph. Finally, [Concraft][concraft] module is used to disambiguate each | |
111 | -sentence in the [Maca][maca] output. | |
112 | - | |
113 | -With the `--marginals` option enabled, Concraft-pl will output marginal probabilities | |
114 | -corresponding to individual tags (determined on the basis of the disambiguation model) | |
115 | -instead of `disamb` markers. | |
116 | - | |
117 | -Run `concraft-pl tag --help` to learn more about possible tagging options. | |
118 | - | |
119 | - | |
120 | -Server | |
121 | -====== | |
122 | - | |
123 | -Concraft-pl provides also a client/server mode. It is handy when, for example, | |
124 | -you need to tag a large collection of small files. Loading Concraft-pl model | |
125 | -from a disk takes considerable amount of time which makes the tagging method | |
126 | -described above very slow in such a setting. | |
127 | - | |
128 | -To start the Concraft-pl server, run: | |
129 | - | |
130 | - concraft-pl server --inmodel model.gz | |
131 | - | |
132 | -You can supply a custom port number using a `--port` option. For example, | |
133 | -to run the server on the `10101` port, use the following command: | |
134 | - | |
135 | - concraft-pl server --inmodel model.gz --port 10101 | |
136 | - | |
137 | -To use the server in a multi-threaded environment, you need to specify the | |
138 | -`-N` [RTS][ghc-rts] option. A set of options which usually yields good | |
139 | -server performance is presented in the following example: | |
140 | - | |
141 | - concraft-pl server --inmodel model.gz +RTS -N -A4M -qg1 -I0 | |
142 | - | |
143 | -Run `concraft-pl server --help` to learn more about possible server-mode options. | |
144 | - | |
145 | -The client mode works just like the tagging mode. The only difference is that, | |
146 | -instead of supplying your client with a model, you need to specify the port number | |
147 | -(in case you used a custom one when starting the server; otherwise, the default | |
148 | -port number will be used). | |
149 | - | |
150 | - concraft-pl client --port 10101 < input.txt > output.plain | |
151 | - | |
152 | -Run `concraft-pl client --help` to learn more about possible client-mode options. | |
153 | - | |
154 | - | |
155 | -Tagging analysed data | |
156 | -===================== | |
157 | - | |
158 | -In some situations you might want to feed Concraft-pl with a previously | |
159 | -analysed data. Perhaps your Maca instance is installed on a different | |
160 | -machine, or maybe you want to use Concraft-pl with a custom | |
161 | -preprocessing pipeline. | |
162 | - | |
163 | -If you want to use a preprocessing pipeline significantly different from | |
164 | -the standard one (Maca), you should first train your own Concraft model. | |
165 | -To train the model on analysed data use the `--noana` training flag. | |
166 | - | |
167 | -Use the same `--noana` flag when you want to tag analysed data. | |
168 | -Input format should be the same as the output format. | |
169 | -This option is currently not supported in the client/server mode. | |
170 | - | |
171 | -*Remember to use the same preprocessing pipeline (segmentation + analysis) for both | |
172 | -training and disambiguation. Inconsistencies between training material and input | |
173 | -data may severely harm the quality of disambiguation.* | |
174 | - | |
175 | - | |
176 | -[stack]: http://docs.haskellstack.org "Haskell Tool Stack" | |
177 | -[homepage]: http://zil.ipipan.waw.pl/Concraft "Homepage" | |
178 | -[concraft]: https://github.com/kawu/concraft "Concraft" | |
179 | -[hackage-repo]: http://hackage.haskell.org/package/concraft-pl "Concraft-pl Hackage repository" | |
180 | -[maca]: http://nlp.pwr.wroc.pl/redmine/projects/libpltagger/wiki "Maca" | |
181 | -[maca-install]: http://nlp.pwr.wroc.pl/redmine/projects/libpltagger/wiki#Download-and-install-MACA "Maca installation guide" | |
182 | -[corpus2]: http://nlp.pwr.wroc.pl/redmine/projects/corpus2/wiki "Corpus2" | |
183 | -[ghc]: http://www.haskell.org/ghc "Glasgow Haskell Compiler" | |
184 | -[ghc-rts]: http://www.haskell.org/ghc/docs/latest/html/users_guide/runtime-control.html "GHC runtime system options" | |
185 | -[cabal]: http://www.haskell.org/cabal "Cabal" | |
186 | -[haskell-platform]: http://www.haskell.org/platform "Haskell Platform" | |
187 | -[nkjp]: http://nkjp.pl/index.php?page=0&lang=1 "NKJP" |
tools/concraft/changelog deleted
1 | --*-change-log-*- | |
2 | - | |
3 | -0.7.5 Aug 2016 | |
4 | - * Start using `stack` to avoid dependency problems | |
5 | - | |
6 | -0.7.4 Nov 2014 | |
7 | - * Ignore non-printable characters for compatibility with Maca | |
8 | - | |
9 | -0.7.0 Nov 2013 | |
10 | - * Output guessing results for unknown words | |
11 | - * Allow "--noana" option in the client/server mode | |
12 | - * Add option to output marginal probabilities | |
13 | - * Add separate mode for model pruning | |
14 | - | |
15 | -0.6.0 Nov 2013 | |
16 | - * Output wordform instead of "None" as a lemma for unknown words | |
17 | - | |
18 | -0.5.0 Nov 2013 | |
19 | - * Restore the orth[-2] feature in the observation schema |
tools/concraft/concraft-pl.cabal deleted
1 | -name: concraft-pl | |
2 | -version: 0.7.5 | |
3 | -synopsis: Morphological tagger for Polish | |
4 | -description: | |
5 | - A morphological tagger for Polish based on the concraft library. | |
6 | -license: BSD3 | |
7 | -license-file: LICENSE | |
8 | -cabal-version: >= 1.6 | |
9 | -copyright: Copyright (c) 2013 Jakub Waszczuk | |
10 | -author: Jakub Waszczuk | |
11 | -maintainer: waszczuk.kuba@gmail.com | |
12 | -stability: experimental | |
13 | -category: Natural Language Processing | |
14 | -homepage: http://zil.ipipan.waw.pl/Concraft | |
15 | -build-type: Simple | |
16 | - | |
17 | -extra-source-files: changelog | |
18 | - | |
19 | -data-files: config/nkjp-tagset.cfg | |
20 | - | |
21 | -library | |
22 | - hs-source-dirs: src | |
23 | - | |
24 | - build-depends: | |
25 | - base >= 4 && < 5 | |
26 | - , concraft >= 0.9 && < 0.10 | |
27 | - , tagset-positional >= 0.3 && < 0.4 | |
28 | - , sgd >= 0.3.3 && < 0.4 | |
29 | - , containers >= 0.4 && < 0.6 | |
30 | - , bytestring >= 0.9 && < 0.11 | |
31 | - , text >= 0.11 && < 1.3 | |
32 | - , aeson >= 0.6 && < 0.9 | |
33 | - , binary >= 0.5 && < 0.8 | |
34 | - , process >= 1.1 && < 1.3 | |
35 | - , mtl >= 2.0 && < 2.3 | |
36 | - , transformers >= 0.2 && < 0.5 | |
37 | - , network >= 2.3 && < 2.7 | |
38 | - , lazy-io >= 0.1 && < 0.2 | |
39 | - , split >= 0.2 && < 0.3 | |
40 | - | |
41 | - exposed-modules: | |
42 | - NLP.Concraft.Polish | |
43 | - , NLP.Concraft.Polish.Maca | |
44 | - , NLP.Concraft.Polish.Morphosyntax | |
45 | - , NLP.Concraft.Polish.Request | |
46 | - , NLP.Concraft.Polish.Server | |
47 | - | |
48 | - other-modules: | |
49 | - NLP.Concraft.Polish.Format.Plain | |
50 | - | |
51 | - ghc-options: -Wall -O2 | |
52 | - | |
53 | -source-repository head | |
54 | - type: git | |
55 | - location: https://github.com/kawu/concraft-pl.git | |
56 | - | |
57 | -executable concraft-pl | |
58 | - build-depends: | |
59 | - cmdargs >= 0.10 && < 0.11 | |
60 | - hs-source-dirs: src, tools | |
61 | - main-is: concraft-pl.hs | |
62 | - ghc-options: -Wall -O2 -threaded -rtsopts |
tools/concraft/concraft_test.ml deleted
1 | -let wait_for_server () = | |
2 | - let rec wait s a = | |
3 | - try Unix.connect s a | |
4 | - with _ -> Unix.sleep 1; wait s a in | |
5 | - let s = Unix.socket Unix.PF_INET Unix.SOCK_STREAM 6 in | |
6 | - let a = Unix.ADDR_INET (Unix.inet_addr_loopback, 10089) in | |
7 | - wait s a; | |
8 | - Unix.shutdown s Unix.SHUTDOWN_SEND; | |
9 | - Unix.close s | |
10 | - | |
11 | -let start_server () = | |
12 | - let client_out, server_out = Unix.pipe () in | |
13 | - let client_err, server_err = Unix.pipe () in | |
14 | - let pid = Unix.create_process "concraft-pl" [|"concraft-pl"; "server"; "--inmodel"; "nkjp-model-0.2.gz"|] | |
15 | - Unix.stdin server_out server_err in | |
16 | - List.iter Unix.close [client_out; server_out; client_err; server_err]; | |
17 | - wait_for_server (); | |
18 | - pid | |
19 | - | |
20 | -let stop_server pid = | |
21 | - Unix.kill pid Sys.sigint | |
22 | - | |
23 | -let tag s = | |
24 | - Unix.open_process_full ("echo \"" ^ s ^ "\" | concraft-pl client") | |
25 | - [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] | |
26 | - | |
27 | -let _ = | |
28 | - let pid = start_server () in | |
29 | - let concraft_in, concraft_out, concraft_err = tag "Ala ma kota." in | |
30 | - try | |
31 | - while true do | |
32 | - print_endline @@ input_line concraft_in | |
33 | - done | |
34 | - with End_of_file -> (); | |
35 | - try | |
36 | - while true do | |
37 | - print_endline @@ "concraft error message: " ^ input_line concraft_err | |
38 | - done | |
39 | - with End_of_file -> (); | |
40 | - ignore @@ Unix.close_process_full (concraft_in, concraft_out, concraft_err); | |
41 | - stop_server pid |
tools/concraft/config/nkjp-tagset.cfg deleted
1 | -# NKJP tagset definition | |
2 | - | |
3 | -[ATTR] | |
4 | - | |
5 | -# Attributes and their values | |
6 | -nmb = sg pl | |
7 | -cas = nom gen dat acc inst loc voc | |
8 | -gnd = m1 m2 m3 f n # n1 n2 n3 p1 p2 p3 | |
9 | -per = pri sec ter | |
10 | -deg = pos com sup | |
11 | -asp = imperf perf | |
12 | -ngt = aff neg | |
13 | -acm = congr rec | |
14 | -acn = akc nakc | |
15 | -ppr = npraep praep | |
16 | -agg = agl nagl | |
17 | -vlc = nwok wok | |
18 | -dot = pun npun | |
19 | -# sentiment = spos sneg | |
20 | - | |
21 | -[RULE] | |
22 | - | |
23 | -# Parsing rules definitions. | |
24 | - | |
25 | -adja = | |
26 | -adjp = | |
27 | -adjc = | |
28 | -conj = | |
29 | -comp = | |
30 | -interp = | |
31 | -pred = | |
32 | -xxx = | |
33 | -adv = [deg] | |
34 | -imps = asp | |
35 | -inf = asp | |
36 | -pant = asp | |
37 | -pcon = asp | |
38 | -qub = [vlc] | |
39 | -prep = cas [vlc] | |
40 | -siebie = cas | |
41 | -subst = nmb cas gnd | |
42 | -depr = nmb cas gnd | |
43 | -ger = nmb cas gnd asp ngt | |
44 | -ppron12 = nmb cas gnd per [acn] | |
45 | -ppron3 = nmb cas gnd per [acn] [ppr] | |
46 | -num = nmb cas gnd acm | |
47 | -numcol = nmb cas gnd acm | |
48 | -adj = nmb cas gnd deg | |
49 | -pact = nmb cas gnd asp ngt | |
50 | -ppas = nmb cas gnd asp ngt | |
51 | -winien = nmb gnd asp | |
52 | -praet = nmb gnd asp [agg] | |
53 | -bedzie = nmb per asp | |
54 | -fin = nmb per asp | |
55 | -impt = nmb per asp | |
56 | -aglt = nmb per asp vlc | |
57 | - | |
58 | -# `ign` is a special tag and we treat in a special way. | |
59 | -# ign = | |
60 | - | |
61 | -brev = dot | |
62 | -burk = | |
63 | -interj = | |
64 | - |
tools/concraft/eval/cv.sh deleted
1 | -# Cross-validation script. | |
2 | - | |
3 | - | |
4 | -# Root data directory | |
5 | -data=$1 | |
6 | - | |
7 | -# NKJP tagset configuration | |
8 | -nkjp_tagset=$2 | |
9 | - | |
10 | - | |
11 | -echo "" | |
12 | -echo "Data directory: "$1 | |
13 | -echo "Tagset file: "$2 | |
14 | -shift | |
15 | -shift | |
16 | -echo "Training arguments: ""$@" | |
17 | -echo "" | |
18 | - | |
19 | - | |
20 | -for i in "01" "02" "03" "04" "05" "06" "07" "08" "09" "10" | |
21 | -do | |
22 | - echo $i | |
23 | - | |
24 | - # Construct training material. | |
25 | - echo -n "" > $data/train/train$i.plain | |
26 | - for j in "01" "02" "03" "04" "05" "06" "07" "08" "09" "10" | |
27 | - do | |
28 | - if [ $i != $j ]; then | |
29 | - echo "> "$j | |
30 | - cat $data/folds/test$j.plain >> $data/train/train$i.plain | |
31 | - fi | |
32 | - done | |
33 | - | |
34 | - # Train tagger and tag eval file. | |
35 | - concraft-pl train --tagset=$nkjp_tagset $data/train/train$i.plain -e $data/folds/test$i.plain -o $data/concraft/model$i.gz "$@" | |
36 | - concraft-pl tag $data/concraft/model$i.gz < $data/text/test$i.txt > $data/tagged/test$i.plain | |
37 | - | |
38 | - echo -e "\nSTATS\n" | |
39 | - concraft-pl compare --tagset=$nkjp_tagset $data/folds/test$i.plain $data/tagged/test$i.plain | |
40 | - echo -e "\nEND STATS\n" | |
41 | - | |
42 | - # Delete the training material, we will not need it again. | |
43 | - rm $data/train/train$i.plain | |
44 | -done |
tools/concraft/makefile deleted
tools/concraft/nkjp-model-0.2.gz deleted
No preview for this file type
tools/concraft/notes/binary.txt deleted
1 | -Pamięć zajmowana przez poszczególne elementy binarnej reprezentacji modelu: | |
2 | - | |
3 | -15M ./codec.bin | |
4 | -305M ./actualModel.bin | |
5 | -202M ./featMap.bin | |
6 | -130M ./featMap-otherMap.bin | |
7 | -73M ./featMap-transMaps.bin | |
8 | -104M ./valuesVect.bin | |
9 | - | |
10 | -Sortując zgodnie z rozmiarami (tylko elemety najniższego poziomu): | |
11 | - | |
12 | -130M ./featMap-otherMap.bin | |
13 | -104M ./valuesVect.bin | |
14 | -73M ./featMap-transMaps.bin | |
15 | -15M ./codec.bin | |
16 | - | |
17 | -Trzeba wziąć pod uwagę, jakie są różnice pomiędzy rozmiarami | |
18 | -zwykłej oraz binarnej reprezentacji poszczególnych struktur danych. | |
19 | -Przykładowo, w przypadku wektora różnica ta prawdopodobnie jest | |
20 | -niewielka. W przypadku map różnica jest prawdopodobnie duża. | |
21 | - | |
22 | -Rozmiar mapy `otherMap` to 3390538 par (klucz, wartość). | |
23 | -Będziemy go oznaczali przez N. Zgodnie z | |
24 | -http://blog.johantibell.com/2011/06/memory-footprints-of-some-common-data.html | |
25 | -rozmiar takie mapy w pamięci można szacować jako (na systemie 64-bitowym): | |
26 | - | |
27 | - 6N words + size of keys & values | |
28 | - | |
29 | - 6N words = ~163 MB | |
30 | - size of keys = N * 4 words = ~108 MB | |
31 | - size of vals = N words = ~27 MB | |
32 | - | |
33 | - W sumie = ~300MB ! |
tools/concraft/notes/notes.txt deleted
1 | -Plany i pomysły na dalszy rozwój Concrafta. | |
2 | - | |
3 | - | |
4 | -[HIGH] | |
5 | - | |
6 | - | |
7 | -* Umożliwienie użytkownikowi określenia dodatkowych | |
8 | - cech schematu obserwacji, spoza zestawu oferowanego | |
9 | - przez bibliotekę podstawową. [HARD] | |
10 | - | |
11 | - | |
12 | -[MEDIUM] | |
13 | - | |
14 | - | |
15 | -* Trenowanie modelu bezpośrednio na korpusie NKJP. [EASY] | |
16 | - | |
17 | - | |
18 | -* Cześć konfiguracji należałoby przenieść do plików konfiguracyjnych. | |
19 | - Dotyczy to np. konfiguracji trenowania -- parametry SGD bez większych | |
20 | - problemów mogłyby być zdefiniowane w postaci plików konfiguracyjnych. | |
21 | - [MEDIUM] | |
22 | - | |
23 | - | |
24 | -* Powyższy pomysł jest w bardziej ogólny. Chcielibyśmy dać użytkownikowi | |
25 | - możliwość konfigurowania nie tylko parametrów SGD, ale również | |
26 | - - schematu obserwacji | |
27 | - - podziału na warstwy | |
28 | - W szczególności, użytkownik powinien mieć możliwość zdefiniowania | |
29 | - osobnego schematu obserwacji dla różnych warstw. | |
30 | - | |
31 | - PYTANIE: Czy obecnie tworzona jest jeden kodek dla wszystkich typów | |
32 | - obserwacji? Zdecydowanie powinno tak być! | |
33 | - | |
34 | - | |
35 | -* Fajnie byłoby użyć na poziomie kodeka biblioteki DAWG, dzięki czemu | |
36 | - model byłby bardziej skompresowany. [MEDIUM/HARD] | |
37 | - | |
38 | - ALE: W postaci binarnej kodek zajmuje jedyne 15MB, w porównaniu do | |
39 | - tego pamięć zajmowana przez inne części modelu jest ogromna! | |
40 | - | |
41 | - | |
42 | -* Zmiana interfejsu biblioteki podstawowej: zamiast klasy | |
43 | - Word można dodać np. "środowisko zdania". [HARD] | |
44 | - | |
45 | - | |
46 | -[LOW] | |
47 | - | |
48 | - | |
49 | -* Obsługa dodatkowych formatów. [MEDIUM] | |
50 | - <- Mało ważne chociażby dlatego, że jeśli już Maca jest dostępna, | |
51 | - to może być też wykorzystana do konwersji danych wyjściowych | |
52 | - do innego formatu. |
tools/concraft/notes/plan.txt deleted
1 | -Kroki prowadzące do opublikowania nowej wersji Concrafta w sieci. | |
2 | - | |
3 | -1. Zmergować gałąź 0.5 z gałęzią master w repozytorium https://github.com/kawu/concraft. DONE | |
4 | - | |
5 | -2. Zaktualizować opis na stronie http://zil.ipipan.waw.pl/Concraft. DONE | |
6 | - W szczególności, należy umieścić tam nowy model. DONE | |
7 | - | |
8 | -[Dodatki] | |
9 | - | |
10 | -A. Obecny model 0.5 wytrenowany jest na 90% korpusu milionowego NKJP. | |
11 | - Kiedyś później należałoby wytrenować model na całym korpusie. |
tools/concraft/src/NLP/Concraft/Polish.hs deleted
1 | -{-# LANGUAGE OverloadedStrings #-} | |
2 | -{-# LANGUAGE RecordWildCards #-} | |
3 | - | |
4 | - | |
5 | -module NLP.Concraft.Polish | |
6 | -( | |
7 | --- * Model | |
8 | - C.Concraft | |
9 | -, C.saveModel | |
10 | -, C.loadModel | |
11 | - | |
12 | --- * Tagging | |
13 | -, tag | |
14 | -, marginals | |
15 | - | |
16 | --- * Analysis | |
17 | -, macaPar | |
18 | - | |
19 | --- * Training | |
20 | -, TrainConf (..) | |
21 | -, train | |
22 | - | |
23 | --- * Pruning | |
24 | -, C.prune | |
25 | - | |
26 | --- -- * Analysis | |
27 | --- , anaSent | |
28 | --- , reAnaPar | |
29 | -) where | |
30 | - | |
31 | - | |
32 | -import Control.Applicative ((<$>)) | |
33 | -import qualified Data.Text.Lazy as L | |
34 | -import qualified Data.Set as S | |
35 | - | |
36 | -import qualified Data.Tagset.Positional as P | |
37 | -import qualified Numeric.SGD as SGD | |
38 | - | |
39 | -import qualified NLP.Concraft.Morphosyntax as X | |
40 | -import qualified NLP.Concraft.Schema as S | |
41 | -import NLP.Concraft.Schema (SchemaConf(..), entry, entryWith) | |
42 | -import qualified NLP.Concraft.Guess as G | |
43 | -import qualified NLP.Concraft.Disamb as D | |
44 | -import qualified NLP.Concraft as C | |
45 | --- import qualified NLP.Concraft.Analysis as A | |
46 | - | |
47 | -import NLP.Concraft.Polish.Morphosyntax hiding (tag) | |
48 | -import NLP.Concraft.Polish.Maca | |
49 | - | |
50 | - | |
51 | -------------------------------------------------- | |
52 | --- Default configuration | |
53 | -------------------------------------------------- | |
54 | - | |
55 | - | |
56 | --- | Default configuration for the guessing observation schema. | |
57 | -guessSchemaDefault :: SchemaConf | |
58 | -guessSchemaDefault = S.nullConf | |
59 | - { lowPrefixesC = entryWith [1, 2] [0] | |
60 | - , lowSuffixesC = entryWith [1, 2] [0] | |
61 | - , knownC = entry [0] | |
62 | - , begPackedC = entry [0] } | |
63 | - | |
64 | - | |
65 | --- | Default configuration for the guessing observation schema. | |
66 | -disambSchemaDefault :: SchemaConf | |
67 | -disambSchemaDefault = S.nullConf | |
68 | - { lowOrthC = entry [-2, -1, 0, 1] | |
69 | - , lowPrefixesC = oov $ entryWith [1, 2, 3] [0] | |
70 | - , lowSuffixesC = oov $ entryWith [1, 2, 3] [0] | |
71 | - , begPackedC = oov $ entry [0] } | |
72 | - where | |
73 | - oov (Just body) = Just $ body { S.oovOnly = True } | |
74 | - oov Nothing = Nothing | |
75 | - | |
76 | - | |
77 | --- | Default tiered tagging configuration. | |
78 | -tiersDefault :: [D.Tier] | |
79 | -tiersDefault = | |
80 | - [tier1, tier2] | |
81 | - where | |
82 | - tier1 = D.Tier True $ S.fromList ["cas", "per"] | |
83 | - tier2 = D.Tier False $ S.fromList | |
84 | - [ "nmb", "gnd", "deg", "asp" , "ngt", "acm" | |
85 | - , "acn", "ppr", "agg", "vlc", "dot" ] | |
86 | - | |
87 | - | |
88 | -------------------------------------------------- | |
89 | --- Tagging | |
90 | -------------------------------------------------- | |
91 | - | |
92 | - | |
93 | --- | Tag the analysed sentence. | |
94 | -tag :: C.Concraft -> Sent Tag -> Sent Tag | |
95 | -tag concraft sent = | |
96 | - [ select' gs t seg | |
97 | - | (seg, gs, t) <- zip3 sent gss ts ] | |
98 | - where | |
99 | - tagset = C.tagset concraft | |
100 | - packed = packSent tagset sent | |
101 | - tagged = C.tag concraft packed | |
102 | - gss = map (map showTag . S.toList . fst) tagged | |
103 | - ts = map (showTag . snd) tagged | |
104 | - showTag = P.showTag tagset | |
105 | - | |
106 | - | |
107 | --- | Tag the sentence with marginal probabilities. | |
108 | -marginals :: C.Concraft -> Sent Tag -> Sent Tag | |
109 | -marginals concraft sent | |
110 | - = map (uncurry selectWMap) | |
111 | - $ zip wmaps sent | |
112 | - where | |
113 | - tagset = C.tagset concraft | |
114 | - packed = packSent tagset sent | |
115 | - wmaps = map | |
116 | - (X.mapWMap showTag) | |
117 | - (C.marginals concraft packed) | |
118 | - showTag = P.showTag tagset | |
119 | - | |
120 | - | |
121 | -------------------------------------------------- | |
122 | --- Training | |
123 | -------------------------------------------------- | |
124 | - | |
125 | - | |
126 | --- | Training configuration. | |
127 | -data TrainConf = TrainConf { | |
128 | - -- | Tagset. | |
129 | - tagset :: P.Tagset | |
130 | - -- | SGD parameters. | |
131 | - , sgdArgs :: SGD.SgdArgs | |
132 | - -- | Perform reanalysis. | |
133 | - , reana :: Bool | |
134 | - -- | Store SGD dataset on disk. | |
135 | - , onDisk :: Bool | |
136 | - -- | Numer of guessed tags for each word. | |
137 | - , guessNum :: Int | |
138 | - -- | `G.r0T` parameter. | |
139 | - , r0 :: G.R0T } | |
140 | - | |
141 | --- | Train concraft model. | |
142 | --- TODO: It should be possible to supply the two training procedures with | |
143 | --- different SGD arguments. | |
144 | -train | |
145 | - :: TrainConf | |
146 | - -> IO [SentO Tag] -- ^ Training data | |
147 | - -> IO [SentO Tag] -- ^ Evaluation data | |
148 | - -> IO C.Concraft | |
149 | -train TrainConf{..} train0 eval0 = do | |
150 | - | |
151 | - pool <- newMacaPool 1 | |
152 | - let ana = anaSent tagset pool | |
153 | - train1 = map (packSentO tagset) <$> train0 | |
154 | - eval1 = map (packSentO tagset) <$> eval0 | |
155 | - | |
156 | - if reana | |
157 | - then doReana ana train1 eval1 | |
158 | - else noReana train1 eval1 | |
159 | - | |
160 | - where | |
161 | - | |
162 | - doReana ana = C.reAnaTrain tagset ana guessNum guessConf disambConf | |
163 | - noReana tr ev = C.train tagset guessNum guessConf disambConf | |
164 | - (map X.segs <$> tr) (map X.segs <$> ev) | |
165 | - | |
166 | - guessConf = G.TrainConf guessSchemaDefault sgdArgs onDisk r0 | |
167 | - disambConf = D.TrainConf tiersDefault disambSchemaDefault sgdArgs onDisk | |
168 | - | |
169 | - | |
170 | -------------------------------------------------- | |
171 | --- Re-analysis | |
172 | -------------------------------------------------- | |
173 | - | |
174 | - | |
175 | --- | Analyse the given sentence with Maca. | |
176 | -anaSent :: P.Tagset -> MacaPool -> L.Text -> IO (X.Sent Word P.Tag) | |
177 | -anaSent tagset pool | |
178 | - = fmap (packSent tagset . concat) | |
179 | - . macaPar pool . L.toStrict | |
180 | - | |
181 | - | |
182 | --- -- | Reanalyse the input paragraph (lazy IO). | |
183 | --- reAnaPar :: P.Tagset -> [SentO Tag] -> IO [Sent Tag] | |
184 | --- reAnaPar tagset inp = do | |
185 | --- pool <- newMacaPool 1 | |
186 | --- A.reAnaPar tagset (anaSent pool) inp |
tools/concraft/src/NLP/Concraft/Polish/Format/Plain.hs deleted
1 | -{-# LANGUAGE OverloadedStrings #-} | |
2 | -{-# LANGUAGE RecordWildCards #-} | |
3 | - | |
4 | --- | Simple format for morphosyntax representation which | |
5 | --- with no spaces within and that one of the tags indicates | |
6 | --- unknown words. | |
7 | - | |
8 | -module NLP.Concraft.Polish.Format.Plain | |
9 | -( | |
10 | --- * Parsing | |
11 | - parsePlain | |
12 | -, parsePara | |
13 | -, parseSent | |
14 | - | |
15 | --- * Printing | |
16 | -, ShowCfg (..) | |
17 | -, showPlain | |
18 | -, showPara | |
19 | -, showSent | |
20 | -) where | |
21 | - | |
22 | -import Data.Monoid (Monoid, mappend, mconcat) | |
23 | -import Data.Maybe (catMaybes) | |
24 | -import Data.List (groupBy) | |
25 | -import Data.String (IsString) | |
26 | -import qualified Data.Char as C | |
27 | -import qualified Data.Map as M | |
28 | -import qualified Data.Text as T | |
29 | -import qualified Data.Text.Lazy as L | |
30 | -import qualified Data.Text.Lazy.Builder as L | |
31 | -import qualified Data.Text.Lazy.Read as R | |
32 | -import Text.Printf (printf) | |
33 | - | |
34 | -import qualified NLP.Concraft.Morphosyntax as X | |
35 | -import NLP.Concraft.Polish.Morphosyntax | |
36 | - | |
37 | --- | Parse the text in the plain format. | |
38 | -parsePlain :: L.Text -> [[Sent Tag]] | |
39 | -parsePlain = | |
40 | - map parsePara' . groupBy f . L.splitOn "\n\n" | |
41 | - where | |
42 | - f _ xs = case L.uncons xs of | |
43 | - Nothing -> False | |
44 | - Just (x, _) -> not (C.isSpace x) | |
45 | - | |
46 | --- | Parse the paragraph in the plain format. | |
47 | -parsePara :: L.Text -> [Sent Tag] | |
48 | -parsePara = parsePara' . L.splitOn "\n\n" | |
49 | - | |
50 | --- | Parse paragraph already divided into sentence chunks. | |
51 | -parsePara' :: [L.Text] -> [Sent Tag] | |
52 | -parsePara' = map (parseSent . L.strip) . filter (not.isEmpty) | |
53 | - | |
54 | --- | Identify empty chunks of text. | |
55 | -isEmpty :: L.Text -> Bool | |
56 | -isEmpty = L.all C.isSpace | |
57 | - | |
58 | --- | Parse the sentence in the plain format. | |
59 | -parseSent :: L.Text -> Sent Tag | |
60 | -parseSent | |
61 | - = map parseWord | |
62 | - . groupBy (\_ x -> cond x) | |
63 | - . L.lines | |
64 | - where | |
65 | - cond = ("\t" `L.isPrefixOf`) | |
66 | - | |
67 | -parseWord :: [L.Text] -> Seg Tag | |
68 | -parseWord xs = Seg | |
69 | - (Word _orth _space _known) | |
70 | - _interps | |
71 | - where | |
72 | - (_orth, _space) = parseHeader (head xs) | |
73 | - ys = map parseInterp (tail xs) | |
74 | - _known = not (Nothing `elem` ys) | |
75 | - -- _interps = M.fromListWith max (catMaybes ys) | |
76 | - _interps = X.mkWMap $ catMaybes ys | |
77 | - | |
78 | -parseInterp :: L.Text -> Maybe (Interp Tag, Double) | |
79 | -parseInterp = | |
80 | - doIt . tail . L.splitOn "\t" | |
81 | - where | |
82 | - doIt [form, tag] | |
83 | - | tag == ign = Nothing | |
84 | - | otherwise = Just $ | |
85 | - (mkInterp form tag, 0) | |
86 | - doIt [form, tag, "disamb"] = | |
87 | - Just (mkInterp form tag, 1) | |
88 | - doIt [form, tag, weight] = case R.double weight of | |
89 | - Left er -> error $ "parseInterp (weight):" ++ show er | |
90 | - Right w -> Just (mkInterp form tag, fst w) | |
91 | - doIt xs = error $ "parseInterp: " ++ show xs | |
92 | - mkInterp form tag = Interp (L.toStrict form) (L.toStrict tag) | |
93 | - | |
94 | -parseHeader :: L.Text -> (T.Text, Space) | |
95 | -parseHeader xs = | |
96 | - let [_orth, space] = L.splitOn "\t" xs | |
97 | - in (L.toStrict _orth, parseSpace space) | |
98 | - | |
99 | --- TODO: Should we represent newlines and spaces in the `Space` data type? | |
100 | -parseSpace :: L.Text -> Space | |
101 | -parseSpace "none" = None | |
102 | -parseSpace "space" = Space | |
103 | -parseSpace "spaces" = Space -- Multiple spaces | |
104 | -parseSpace "newline" = NewLine | |
105 | -parseSpace "newlines" = NewLine -- Multiple newlines | |
106 | -parseSpace xs = error ("parseSpace: " ++ L.unpack xs) | |
107 | - | |
108 | ------------ | |
109 | --- Printing | |
110 | ------------ | |
111 | - | |
112 | --- | Printing configuration. | |
113 | -data ShowCfg = ShowCfg { | |
114 | - -- | Print weights instead of 'disamb' tags. | |
115 | - showWsCfg :: Bool } | |
116 | - | |
117 | --- | Show the plain data. | |
118 | -showPlain :: ShowCfg ->[[Sent Tag]] -> L.Text | |
119 | -showPlain cfg = | |
120 | - L.intercalate "\n" . map (showPara cfg) | |
121 | - | |
122 | --- | Show the paragraph. | |
123 | -showPara :: ShowCfg -> [Sent Tag] -> L.Text | |
124 | -showPara cfg = L.toLazyText . mconcat . map (\xs -> buildSent cfg xs <> "\n") | |
125 | - | |
126 | --- | Show the sentence. | |
127 | -showSent :: ShowCfg -> Sent Tag -> L.Text | |
128 | -showSent cfg xs = L.toLazyText $ buildSent cfg xs | |
129 | - | |
130 | -buildSent :: ShowCfg -> Sent Tag -> L.Builder | |
131 | -buildSent cfg = mconcat . map (buildWord cfg) | |
132 | - | |
133 | -buildWord :: ShowCfg -> Seg Tag -> L.Builder | |
134 | -buildWord cfg Seg{..} | |
135 | - = L.fromText orth <> "\t" | |
136 | - <> buildSpace space <> "\n" | |
137 | - <> buildKnown orth known | |
138 | - <> buildInterps cfg interps | |
139 | - where Word{..} = word | |
140 | - | |
141 | -buildInterps :: ShowCfg -> X.WMap (Interp Tag) -> L.Builder | |
142 | -buildInterps ShowCfg{..} interps = mconcat | |
143 | - [ "\t" <> buildBase interp <> | |
144 | - "\t" <> buildTag interp <> buildDmb dmb | |
145 | - | (interp, dmb) <- M.toList (X.unWMap interps) ] | |
146 | - where | |
147 | - buildTag = L.fromText . tag | |
148 | - buildBase = L.fromText . base | |
149 | - buildDmb = case showWsCfg of | |
150 | - True -> \x -> between "\t" "\n" | |
151 | - $ L.fromString | |
152 | - $ printf "%.3f" x | |
153 | - False -> \x -> if x > 0 | |
154 | - then "\tdisamb\n" | |
155 | - else "\n" | |
156 | - between x y z = x <> z <> y | |
157 | - | |
158 | -buildSpace :: Space -> L.Builder | |
159 | -buildSpace None = "none" | |
160 | -buildSpace Space = "space" | |
161 | -buildSpace NewLine = "newline" | |
162 | - | |
163 | -buildKnown :: T.Text -> Bool -> L.Builder | |
164 | -buildKnown _ True = "" | |
165 | -buildKnown lemma False | |
166 | - = "\t" <> L.fromText lemma | |
167 | - <> "\t" <> L.fromText ign | |
168 | - <> "\n" | |
169 | - | |
170 | - | |
171 | ------------ | |
172 | --- Utils | |
173 | ------------ | |
174 | - | |
175 | - | |
176 | --- | An infix synonym for 'mappend'. | |
177 | -(<>) :: Monoid m => m -> m -> m | |
178 | -(<>) = mappend | |
179 | -{-# INLINE (<>) #-} | |
180 | - | |
181 | - | |
182 | --- | Tag which indicates unknown words. | |
183 | -ign :: IsString a => a | |
184 | -ign = "ign" | |
185 | -{-# INLINE ign #-} |
tools/concraft/src/NLP/Concraft/Polish/Maca.hs deleted
1 | -{-# LANGUAGE OverloadedStrings #-} | |
2 | -{-# LANGUAGE RecordWildCards #-} | |
3 | - | |
4 | - | |
5 | --- | The module provides interface for the Maca analysis tool. | |
6 | --- See <http://nlp.pwr.wroc.pl/redmine/projects/libpltagger/wiki> | |
7 | --- for more information about the analyser. | |
8 | - | |
9 | - | |
10 | -module NLP.Concraft.Polish.Maca | |
11 | -( | |
12 | - MacaPool | |
13 | -, newMacaPool | |
14 | -, macaPar | |
15 | -) where | |
16 | - | |
17 | - | |
18 | -import Control.Applicative ((<$>)) | |
19 | -import Control.Monad (void, forever, guard, replicateM, unless) | |
20 | -import Control.Concurrent | |
21 | -import Control.Exception | |
22 | -import System.Process | |
23 | -import System.IO | |
24 | -import Data.Function (on) | |
25 | -import qualified Data.Char as C | |
26 | -import qualified Data.Text as T | |
27 | -import qualified Data.Text.IO as T | |
28 | -import qualified Data.Text.Lazy as L | |
29 | -import qualified Data.Text.Lazy.IO as L | |
30 | -import qualified Control.Monad.State.Strict as S | |
31 | -import qualified Control.Monad.Trans.Maybe as M | |
32 | -import Control.Monad.Trans.Class (lift) | |
33 | - | |
34 | -import NLP.Concraft.Polish.Morphosyntax hiding (restore) | |
35 | -import qualified NLP.Concraft.Polish.Format.Plain as Plain | |
36 | - | |
37 | - | |
38 | ----------------------------- | |
39 | --- Maca instance | |
40 | ----------------------------- | |
41 | - | |
42 | - | |
43 | --- TODO: We don't have to use channels here. Maximum one element | |
44 | --- should be present in the input/output channel. | |
45 | - | |
46 | - | |
47 | --- | Input channel. | |
48 | -type In = Chan T.Text | |
49 | - | |
50 | - | |
51 | --- | Output channel. | |
52 | -type Out = Chan [Sent Tag] | |
53 | - | |
54 | - | |
55 | --- | Maca communication channels. | |
56 | -newtype Maca = Maca (In, Out) | |
57 | - | |
58 | - | |
59 | --- | Run Maca instance. | |
60 | -newMaca :: IO Maca | |
61 | -newMaca = do | |
62 | - inCh <- newChan | |
63 | - outCh <- newChan | |
64 | - void $ runMacaOn inCh outCh | |
65 | - return $ Maca (inCh, outCh) | |
66 | - | |
67 | - | |
68 | --- | Run Maca server on given channels. | |
69 | --- TODO: Should check, if maca works. In particular, if morfeusz is available. | |
70 | -runMacaOn :: In -> Out -> IO ThreadId | |
71 | -runMacaOn inCh outCh = mask $ \restore -> forkIO (do | |
72 | - let cmd = "maca-analyse" | |
73 | - args = ["-q", "morfeusz-nkjp-official", "-o", "plain", "-l"] | |
74 | - (Just inh, Just outh, Just errh, pid) <- | |
75 | - createProcess (proc cmd args){ std_in = CreatePipe | |
76 | - , std_out = CreatePipe | |
77 | - , std_err = CreatePipe } | |
78 | - | |
79 | - let excHandler = do | |
80 | - let tryIO = try :: IO a -> IO (Either IOException a) | |
81 | - void $ tryIO $ do | |
82 | - err <- hGetContents errh | |
83 | - unless (all C.isSpace err) $ do | |
84 | - putStr "Maca error: " | |
85 | - putStrLn err | |
86 | - hClose inh; hClose outh; hClose errh | |
87 | - terminateProcess pid | |
88 | - waitForProcess pid | |
89 | - | |
90 | - -- TODO: Document, why LineBuffering is needed here. | |
91 | - hSetBuffering outh LineBuffering | |
92 | - flip onException excHandler $ restore $ forever $ do | |
93 | - | |
94 | - -- Take element from the input channel. | |
95 | - txt <- readChan inCh | |
96 | - -- putStr "REQUEST: " | |
97 | - -- print txt | |
98 | - | |
99 | - -- Write text to maca stdin. | |
100 | - -- TODO: Handle the "empty" case? | |
101 | - T.hPutStr inh txt; hFlush inh | |
102 | - | |
103 | - -- Read maca response and put it in the output channel. | |
104 | - writeChan outCh =<< readMacaResponse outh (textWeight txt) | |
105 | - ) | |
106 | - | |
107 | - | |
108 | -readMacaResponse :: Handle -> Int -> IO [Sent Tag] | |
109 | -readMacaResponse h n | |
110 | - | n <= 0 = return [] | |
111 | - | otherwise = do | |
112 | - x <- readMacaSent h | |
113 | - xs <- readMacaResponse h (n - sentWeight x) | |
114 | - return (x : xs) | |
115 | - | |
116 | - | |
117 | -readMacaSent :: Handle -> IO (Sent Tag) | |
118 | -readMacaSent h = | |
119 | - Plain.parseSent . L.unlines <$> getTxt | |
120 | - where | |
121 | - getTxt = do | |
122 | - x <- L.hGetLine h | |
123 | - if L.null x | |
124 | - then return [] | |
125 | - else (x:) <$> getTxt | |
126 | - | |
127 | - | |
128 | ----------------------------- | |
129 | --- Client | |
130 | ----------------------------- | |
131 | - | |
132 | - | |
133 | --- | Analyse paragraph with Maca. | |
134 | -doMacaPar :: Maca -> T.Text -> IO [Sent Tag] | |
135 | -doMacaPar (Maca (inCh, outCh)) par = do | |
136 | - let par' = T.intercalate " " (T.lines par) `T.append` "\n" | |
137 | - writeChan inCh par' | |
138 | - restoreSpaces par <$> readChan outCh | |
139 | - | |
140 | - | |
141 | --- | Restore info abouts spaces from a text and insert them | |
142 | --- to a parsed paragraph. | |
143 | -restoreSpaces :: T.Text -> [Sent Tag] -> [Sent Tag] | |
144 | -restoreSpaces par sents = | |
145 | - S.evalState (mapM onSent sents) (0, chunks) | |
146 | - where | |
147 | - -- For each space chunk in the paragraph compute | |
148 | - -- total weight of earlier chunks. | |
149 | - parts = T.groupBy ((==) `on` C.isSpace) par | |
150 | - weights = scanl1 (+) (map textWeight parts) | |
151 | - chunks = filter (T.any C.isSpace . fst) (zip parts weights) | |
152 | - | |
153 | - -- Stateful monadic computation which modifies spaces | |
154 | - -- assigned to individual segments. | |
155 | - onSent = mapM onWord | |
156 | - onWord seg = do | |
157 | - n <- addWeight seg | |
158 | - s <- popSpace n | |
159 | - let word' = (word seg) { space = s } | |
160 | - return $ seg { word = word' } | |
161 | - | |
162 | - -- Add weight of the segment to the current weight. | |
163 | - addWeight seg = S.state $ \(n, xs) -> | |
164 | - let m = n + segWeight seg | |
165 | - in (m, (m, xs)) | |
166 | - | |
167 | - -- Pop space from the stack if its weight is lower than | |
168 | - -- the current one. | |
169 | - popSpace n = fmap (maybe None id) . M.runMaybeT $ do | |
170 | - spaces <- lift $ S.gets snd | |
171 | - (sp, m) <- liftMaybe $ maybeHead spaces | |
172 | - guard $ m < n | |
173 | - lift $ S.modify $ \(n', xs) -> (n', tail xs) | |
174 | - return $ toSpace sp | |
175 | - liftMaybe = M.MaybeT . return | |
176 | - maybeHead xs = case xs of | |
177 | - (x:_) -> Just x | |
178 | - [] -> Nothing | |
179 | - | |
180 | - -- Parse strings representation of a Space. | |
181 | - toSpace x | |
182 | - | has '\n' = NewLine | |
183 | - | has ' ' = Space | |
184 | - | otherwise = None | |
185 | - where has c = maybe False (const True) (T.find (==c) x) | |
186 | - | |
187 | - | |
188 | ----------------------------- | |
189 | --- Pool | |
190 | ----------------------------- | |
191 | - | |
192 | - | |
193 | --- | A pool of Maca instances. | |
194 | -newtype MacaPool = MacaPool (Chan Maca) | |
195 | - | |
196 | - | |
197 | --- | Run Maca server. | |
198 | -newMacaPool | |
199 | - :: Int -- ^ Number of Maca instances | |
200 | - -> IO MacaPool | |
201 | -newMacaPool n = do | |
202 | - chan <- newChan | |
203 | - macas <- replicateM n newMaca | |
204 | - writeList2Chan chan macas | |
205 | - return $ MacaPool chan | |
206 | - | |
207 | - | |
208 | -popMaca :: MacaPool -> IO Maca | |
209 | -popMaca (MacaPool c) = readChan c | |
210 | - | |
211 | - | |
212 | -putMaca :: Maca -> MacaPool -> IO () | |
213 | -putMaca x (MacaPool c) = writeChan c x | |
214 | - | |
215 | - | |
216 | --- | Analyse paragraph with Maca. The function is thread-safe. As a | |
217 | --- pre-processing step, all non-printable characters are removed from | |
218 | --- the input (based on empirical observations, Maca behaves likewise). | |
219 | -macaPar :: MacaPool -> T.Text -> IO [Sent Tag] | |
220 | -macaPar pool par0 = do | |
221 | - let par = T.filter C.isPrint par0 | |
222 | - maca <- popMaca pool | |
223 | - doMacaPar maca par `finally` putMaca maca pool | |
224 | - | |
225 | - | |
226 | ------------------------------------------------------------- | |
227 | --- Weight: a number of non-space characters | |
228 | ------------------------------------------------------------- | |
229 | - | |
230 | - | |
231 | --- | A weight of a text. | |
232 | -textWeight :: T.Text -> Int | |
233 | --- textWeight = T.length . T.filter C.isAlphaNum | |
234 | -textWeight = T.length . T.filter (not . C.isSpace) | |
235 | --- textWeight = T.length . T.filter ((&&) <$> not . C.isSpace <*> C.isPrint) | |
236 | - | |
237 | - | |
238 | --- | A weight of a segment. | |
239 | -segWeight :: Seg t -> Int | |
240 | -segWeight = textWeight . orth . word | |
241 | - | |
242 | - | |
243 | --- | A weight of a sentence. | |
244 | -sentWeight :: Sent t -> Int | |
245 | -sentWeight = sum . map segWeight |
tools/concraft/src/NLP/Concraft/Polish/Morphosyntax.hs deleted
1 | -{-# LANGUAGE OverloadedStrings #-} | |
2 | -{-# LANGUAGE RecordWildCards #-} | |
3 | -{-# LANGUAGE TupleSections #-} | |
4 | - | |
5 | - | |
6 | --- | Morphosyntax data layer in Polish. | |
7 | - | |
8 | - | |
9 | -module NLP.Concraft.Polish.Morphosyntax | |
10 | -( | |
11 | --- * Tag | |
12 | - Tag | |
13 | - | |
14 | --- * Segment | |
15 | -, Seg (..) | |
16 | -, Word (..) | |
17 | -, Interp (..) | |
18 | -, Space (..) | |
19 | -, select | |
20 | -, select' | |
21 | -, selectWMap | |
22 | - | |
23 | --- * Sentence | |
24 | -, Sent | |
25 | -, SentO (..) | |
26 | -, restore | |
27 | -, withOrig | |
28 | - | |
29 | --- * Conversion | |
30 | -, packSeg | |
31 | -, packSent | |
32 | -, packSentO | |
33 | -) where | |
34 | - | |
35 | - | |
36 | -import Control.Applicative ((<$>), (<*>)) | |
37 | -import Control.Arrow (first) | |
38 | -import Data.Maybe (catMaybes) | |
39 | -import Data.Aeson | |
40 | -import Data.Binary (Binary, put, get, putWord8, getWord8) | |
41 | -import qualified Data.Aeson as Aeson | |
42 | -import qualified Data.Set as S | |
43 | -import qualified Data.Map as M | |
44 | -import qualified Data.Text as T | |
45 | -import qualified Data.Text.Lazy as L | |
46 | -import qualified Data.Tagset.Positional as P | |
47 | - | |
48 | -import qualified NLP.Concraft.Morphosyntax as X | |
49 | - | |
50 | - | |
51 | --- | A textual representation of a morphosyntactic tag. | |
52 | -type Tag = T.Text | |
53 | - | |
54 | - | |
55 | --------------------------------- | |
56 | --- Segment | |
57 | --------------------------------- | |
58 | - | |
59 | - | |
60 | --- | A segment consists of a word and a set of morphosyntactic interpretations. | |
61 | -data Seg t = Seg | |
62 | - { word :: Word | |
63 | - -- | Interpretations of the token, each interpretation annotated | |
64 | - -- with a /disamb/ Boolean value (if 'True', the interpretation | |
65 | - -- is correct within the context). | |
66 | - , interps :: X.WMap (Interp t) } | |
67 | - deriving (Show, Eq, Ord) | |
68 | - | |
69 | - | |
70 | -instance (Ord t, Binary t) => Binary (Seg t) where | |
71 | - put Seg{..} = put word >> put interps | |
72 | - get = Seg <$> get <*> get | |
73 | - | |
74 | - | |
75 | --- | A word. | |
76 | -data Word = Word | |
77 | - { orth :: T.Text | |
78 | - , space :: Space | |
79 | - , known :: Bool } | |
80 | - deriving (Show, Eq, Ord) | |
81 | - | |
82 | - | |
83 | -instance X.Word Word where | |
84 | - orth = orth | |
85 | - oov = not.known | |
86 | - | |
87 | - | |
88 | -instance ToJSON Word where | |
89 | - toJSON Word{..} = object | |
90 | - [ "orth" .= orth | |
91 | - , "space" .= space | |
92 | - , "known" .= known ] | |
93 | - | |
94 | - | |
95 | -instance FromJSON Word where | |
96 | - parseJSON (Object v) = Word | |
97 | - <$> v .: "orth" | |
98 | - <*> v .: "space" | |
99 | - <*> v .: "known" | |
100 | - parseJSON _ = error "parseJSON [Word]" | |
101 | - | |
102 | - | |
103 | -instance Binary Word where | |
104 | - put Word{..} = put orth >> put space >> put known | |
105 | - get = Word <$> get <*> get <*> get | |
106 | - | |
107 | - | |
108 | --- | A morphosyntactic interpretation. | |
109 | -data Interp t = Interp | |
110 | - { base :: T.Text | |
111 | - , tag :: t } | |
112 | - deriving (Show, Eq, Ord) | |
113 | - | |
114 | - | |
115 | -instance (Ord t, Binary t) => Binary (Interp t) where | |
116 | - put Interp{..} = put base >> put tag | |
117 | - get = Interp <$> get <*> get | |
118 | - | |
119 | - | |
120 | --- | No space, space or newline. | |
121 | --- TODO: Perhaps we should use a bit more informative data type. | |
122 | -data Space | |
123 | - = None | |
124 | - | Space | |
125 | - | NewLine | |
126 | - deriving (Show, Eq, Ord) | |
127 | - | |
128 | - | |
129 | -instance Binary Space where | |
130 | - put x = case x of | |
131 | - None -> putWord8 1 | |
132 | - Space -> putWord8 2 | |
133 | - NewLine -> putWord8 3 | |
134 | - get = getWord8 >>= \x -> return $ case x of | |
135 | - 1 -> None | |
136 | - 2 -> Space | |
137 | - _ -> NewLine | |
138 | - | |
139 | - | |
140 | -instance ToJSON Space where | |
141 | - toJSON x = Aeson.String $ case x of | |
142 | - None -> "none" | |
143 | - Space -> "space" | |
144 | - NewLine -> "newline" | |
145 | - | |
146 | - | |
147 | -instance FromJSON Space where | |
148 | - parseJSON (Aeson.String x) = return $ case x of | |
149 | - "none" -> None | |
150 | - "space" -> Space | |
151 | - "newline" -> NewLine | |
152 | - _ -> error "parseJSON [Space]" | |
153 | - parseJSON _ = error "parseJSON [Space]" | |
154 | - | |
155 | - | |
156 | --- | Select one chosen interpretation. | |
157 | -select :: Ord a => a -> Seg a -> Seg a | |
158 | -select = select' [] | |
159 | - | |
160 | - | |
161 | --- | Select multiple interpretations and one chosen interpretation. | |
162 | -select' :: Ord a => [a] -> a -> Seg a -> Seg a | |
163 | -select' ys x = selectWMap . X.mkWMap $ (x, 1) : map (,0) ys | |
164 | - | |
165 | - | |
166 | --- | Select interpretations. | |
167 | -selectWMap :: Ord a => X.WMap a -> Seg a -> Seg a | |
168 | -selectWMap wMap seg = | |
169 | - seg { interps = newInterps } | |
170 | - where | |
171 | - wSet = S.fromList . map tag . M.keys . X.unWMap . interps $ seg | |
172 | - newInterps = X.mkWMap $ | |
173 | - [ case M.lookup (tag interp) (X.unWMap wMap) of | |
174 | - Just x -> (interp, x) | |
175 | - Nothing -> (interp, 0) | |
176 | - | interp <- (M.keys . X.unWMap) (interps seg) ] | |
177 | - ++ catMaybes | |
178 | - [ if tag `S.member` wSet | |
179 | - then Nothing | |
180 | - else Just (Interp lemma tag, x) | |
181 | - | let lemma = orth $ word seg -- Default base form | |
182 | - , (tag, x) <- M.toList (X.unWMap wMap) ] | |
183 | - | |
184 | - | |
185 | --------------------------------- | |
186 | --- Sentence | |
187 | --------------------------------- | |
188 | - | |
189 | - | |
190 | --- | A sentence. | |
191 | -type Sent t = [Seg t] | |
192 | - | |
193 | - | |
194 | --- | A sentence. | |
195 | -data SentO t = SentO | |
196 | - { segs :: [Seg t] | |
197 | - , orig :: L.Text } | |
198 | - | |
199 | - | |
200 | --- | Restore textual representation of a sentence. | |
201 | --- The function is not very accurate, it could be improved | |
202 | --- if we enrich representation of a space. | |
203 | -restore :: Sent t -> L.Text | |
204 | -restore = | |
205 | - let wordStr Word{..} = [spaceStr space, orth] | |
206 | - spaceStr None = "" | |
207 | - spaceStr Space = " " | |
208 | - spaceStr NewLine = "\n" | |
209 | - in L.fromChunks . concatMap (wordStr . word) | |
210 | - | |
211 | - | |
212 | --- | Use `restore` to translate `Sent` to a `SentO`. | |
213 | -withOrig :: Sent t -> SentO t | |
214 | -withOrig s = SentO | |
215 | - { segs = s | |
216 | - , orig = restore s } | |
217 | - | |
218 | - | |
219 | ---------------------------- | |
220 | --- Conversion | |
221 | ---------------------------- | |
222 | - | |
223 | - | |
224 | --- | Convert a segment to a segment from a core library. | |
225 | -packSeg_ :: Ord a => Seg a -> X.Seg Word a | |
226 | -packSeg_ Seg{..} | |
227 | - = X.Seg word | |
228 | - $ X.mkWMap | |
229 | - $ map (first tag) | |
230 | - $ M.toList | |
231 | - $ X.unWMap interps | |
232 | - | |
233 | - | |
234 | --- | Convert a segment to a segment from a core library. | |
235 | -packSeg :: P.Tagset -> Seg Tag -> X.Seg Word P.Tag | |
236 | -packSeg tagset = X.mapSeg (P.parseTag tagset) . packSeg_ | |
237 | - | |
238 | - | |
239 | --- | Convert a sentence to a sentence from a core library. | |
240 | -packSent :: P.Tagset -> Sent Tag -> X.Sent Word P.Tag | |
241 | -packSent = map . packSeg | |
242 | - | |
243 | - | |
244 | --- | Convert a sentence to a sentence from a core library. | |
245 | -packSentO :: P.Tagset -> SentO Tag -> X.SentO Word P.Tag | |
246 | -packSentO tagset s = X.SentO | |
247 | - { segs = packSent tagset (segs s) | |
248 | - , orig = orig s } |
tools/concraft/src/NLP/Concraft/Polish/Request.hs deleted
1 | -{-# LANGUAGE RecordWildCards #-} | |
2 | -{-# LANGUAGE GeneralizedNewtypeDeriving #-} | |
3 | - | |
4 | - | |
5 | -module NLP.Concraft.Polish.Request | |
6 | -( | |
7 | --- * Request | |
8 | - Request (..) | |
9 | -, Config (..) | |
10 | --- ** Short | |
11 | -, Short (..) | |
12 | -, short | |
13 | --- ** Long | |
14 | -, Long (..) | |
15 | -, long | |
16 | -) where | |
17 | - | |
18 | - | |
19 | -import Control.Applicative ((<$>), (<*>)) | |
20 | -import qualified Control.Monad.LazyIO as LazyIO | |
21 | -import qualified Data.Char as Char | |
22 | -import qualified Data.List.Split as Split | |
23 | -import qualified Data.Text as T | |
24 | -import qualified Data.Text.Lazy as L | |
25 | -import qualified Data.Binary as B | |
26 | - | |
27 | -import NLP.Concraft.Polish | |
28 | -import NLP.Concraft.Polish.Maca | |
29 | -import NLP.Concraft.Polish.Morphosyntax hiding (tag) | |
30 | - | |
31 | - | |
32 | -------------------------------------------------- | |
33 | --- Configuration | |
34 | -------------------------------------------------- | |
35 | - | |
36 | - | |
37 | --- | A request with configuration. | |
38 | -data Request t = Request { | |
39 | - -- | The actuall request. | |
40 | - rqBody :: t | |
41 | - -- | Request configuration. | |
42 | - , rqConf :: Config } | |
43 | - | |
44 | - | |
45 | -instance B.Binary t => B.Binary (Request t) where | |
46 | - put Request{..} = B.put rqBody >> B.put rqConf | |
47 | - get = Request <$> B.get <*> B.get | |
48 | - | |
49 | - | |
50 | --- | Tagging configuration. | |
51 | -newtype Config = Config { | |
52 | - -- | Tag with marginal probabilities. | |
53 | - tagProbs :: Bool | |
54 | - } deriving (B.Binary) | |
55 | - | |
56 | - | |
57 | -------------------------------------------------- | |
58 | --- Short request | |
59 | -------------------------------------------------- | |
60 | - | |
61 | - | |
62 | --- | A short request. | |
63 | -data Short | |
64 | - = Short T.Text | |
65 | - | Par [Sent Tag] | |
66 | - | |
67 | - | |
68 | -instance B.Binary Short where | |
69 | - put (Short x) = B.putWord8 0 >> B.put x | |
70 | - put (Par x) = B.putWord8 1 >> B.put x | |
71 | - get = B.getWord8 >>= \x -> case x of | |
72 | - 0 -> Short <$> B.get | |
73 | - _ -> Par <$> B.get | |
74 | - | |
75 | - | |
76 | --- | Process the short request. | |
77 | -short :: MacaPool -> Concraft -> Request Short -> IO [Sent Tag] | |
78 | -short pool concraft Request{..} = case rqBody of | |
79 | - Short x -> map (tagit concraft) <$> macaPar pool x | |
80 | - Par x -> return $ map (tagit concraft) x | |
81 | - where | |
82 | - tagit = if tagProbs rqConf then marginals else tag | |
83 | - | |
84 | - | |
85 | -------------------------------------------------- | |
86 | --- Long request | |
87 | -------------------------------------------------- | |
88 | - | |
89 | - | |
90 | --- | A request to parse a long text. | |
91 | -data Long | |
92 | - = Long L.Text | |
93 | - | Doc [[Sent Tag]] | |
94 | - | |
95 | - | |
96 | -instance B.Binary Long where | |
97 | - put (Long x) = B.putWord8 0 >> B.put x | |
98 | - put (Doc x) = B.putWord8 1 >> B.put x | |
99 | - get = B.getWord8 >>= \x -> case x of | |
100 | - 0 -> Long <$> B.get | |
101 | - _ -> Doc <$> B.get | |
102 | - | |
103 | - | |
104 | --- | Process the long request given the processor for the | |
105 | --- short request. | |
106 | -long :: (Request Short -> IO a) -> Request Long -> IO [a] | |
107 | -long handler Request{..} = case rqBody of | |
108 | - Long inp -> | |
109 | - LazyIO.mapM f . map L.unlines | |
110 | - . Split.splitWhen (L.all Char.isSpace) | |
111 | - . L.lines $ inp | |
112 | - Doc inp -> LazyIO.mapM g inp | |
113 | - where | |
114 | - f x = handler . r $ Short $ L.toStrict x | |
115 | - g x = handler . r $ Par x | |
116 | - r x = Request {rqBody = x, rqConf = rqConf} |
tools/concraft/src/NLP/Concraft/Polish/Server.hs deleted
1 | -{-# LANGUAGE OverloadedStrings #-} | |
2 | -{-# LANGUAGE RecordWildCards #-} | |
3 | - | |
4 | - | |
5 | -module NLP.Concraft.Polish.Server | |
6 | -( | |
7 | --- * Server | |
8 | - runConcraftServer | |
9 | - | |
10 | --- * Client | |
11 | -, submit | |
12 | -) where | |
13 | - | |
14 | - | |
15 | -import Control.Applicative ((<$>)) | |
16 | -import Control.Monad (forever, void) | |
17 | -import Control.Concurrent (forkIO) | |
18 | -import System.IO (Handle, hFlush) | |
19 | -import qualified Network as N | |
20 | -import qualified Data.Binary as B | |
21 | -import qualified Data.ByteString.Lazy as BS | |
22 | - | |
23 | -import NLP.Concraft.Polish.Morphosyntax hiding (tag) | |
24 | -import NLP.Concraft.Polish.Maca | |
25 | -import qualified NLP.Concraft.Polish as C | |
26 | -import qualified NLP.Concraft.Polish.Request as R | |
27 | - | |
28 | - | |
29 | -------------------------------------------------- | |
30 | --- Server | |
31 | -------------------------------------------------- | |
32 | - | |
33 | - | |
34 | --- | Run a Concraft server on a given port. | |
35 | -runConcraftServer :: MacaPool -> C.Concraft -> N.PortID -> IO () | |
36 | -runConcraftServer pool concraft port = N.withSocketsDo $ do | |
37 | - sock <- N.listenOn port | |
38 | - forever $ sockHandler pool concraft sock | |
39 | - | |
40 | - | |
41 | --- | Read and process short requests from the socket. | |
42 | -sockHandler :: MacaPool -> C.Concraft -> N.Socket -> IO () | |
43 | -sockHandler pool concraft sock = do | |
44 | - (handle, _, _) <- N.accept sock | |
45 | - -- putStrLn "Connection established" | |
46 | - void $ forkIO $ do | |
47 | - -- putStrLn "Waiting for input..." | |
48 | - inp <- recvMsg handle | |
49 | - -- putStr "> " >> T.putStrLn inp | |
50 | - out <- R.short pool concraft inp | |
51 | - -- putStr "No. of sentences: " >> print (length out) | |
52 | - sendMsg handle out | |
53 | - | |
54 | - | |
55 | -------------------------------------------------- | |
56 | --- Client | |
57 | -------------------------------------------------- | |
58 | - | |
59 | - | |
60 | --- | Submit the given request. | |
61 | -submit :: N.HostName -> N.PortID -> R.Request R.Short -> IO [Sent Tag] | |
62 | -submit host port inp = do | |
63 | - handle <- N.connectTo host port | |
64 | - -- putStrLn "Connection established" | |
65 | - -- putStr "Send request: " >> T.putStrLn inp | |
66 | - sendMsg handle inp | |
67 | - recvMsg handle | |
68 | - | |
69 | - | |
70 | -------------------------------------------------- | |
71 | --- Communication | |
72 | -------------------------------------------------- | |
73 | - | |
74 | - | |
75 | -sendMsg :: B.Binary a => Handle -> a -> IO () | |
76 | -sendMsg h msg = do | |
77 | - let x = B.encode msg | |
78 | - n = fromIntegral $ BS.length x | |
79 | - sendInt h n | |
80 | - BS.hPut h x | |
81 | - hFlush h | |
82 | - | |
83 | - | |
84 | -recvMsg :: B.Binary a => Handle -> IO a | |
85 | -recvMsg h = do | |
86 | - n <- recvInt h | |
87 | - B.decode <$> BS.hGet h n | |
88 | - | |
89 | - | |
90 | -sendInt :: Handle -> Int -> IO () | |
91 | -sendInt h x = BS.hPut h (B.encode x) | |
92 | - | |
93 | - | |
94 | -recvInt :: Handle -> IO Int | |
95 | -recvInt h = B.decode <$> BS.hGet h 8 | |
96 | - | |
97 | - | |
98 | --- ------------------------------------------------- | |
99 | --- -- Stream binary encoding | |
100 | --- ------------------------------------------------- | |
101 | --- | |
102 | --- | |
103 | --- newtype Stream a = Stream { unstream :: [a] } | |
104 | --- | |
105 | --- | |
106 | --- instance B.Binary a => B.Binary (Stream a) where | |
107 | --- put (Stream []) = B.putWord8 0 | |
108 | --- put (Stream (x:xs)) = B.putWord8 1 >> B.put x >> B.put (Stream xs) | |
109 | --- get = error "use lazyDecodeStream insted" | |
110 | --- | |
111 | --- | |
112 | --- getMaybe :: B.Binary a => B.Get (Maybe a) | |
113 | --- getMaybe = do | |
114 | --- t <- B.getWord8 | |
115 | --- case t of | |
116 | --- 0 -> return Nothing | |
117 | --- _ -> fmap Just B.get | |
118 | --- | |
119 | --- | |
120 | --- step :: B.Binary a => (ByteString, Int64) -> Maybe (a, (ByteString, Int64)) | |
121 | --- step (xs, offset) = case B.runGetState getMaybe xs offset of | |
122 | --- (Just v, ys, newOffset) -> Just (v, (ys, newOffset)) | |
123 | --- _ -> Nothing | |
124 | --- | |
125 | --- | |
126 | --- lazyDecodeList :: B.Binary a => ByteString -> [a] | |
127 | --- lazyDecodeList xs = unfoldr step (xs, 0) | |
128 | --- | |
129 | --- | |
130 | --- lazyDecodeStream :: B.Binary a => ByteString -> Stream a | |
131 | --- lazyDecodeStream = Stream . lazyDecodeList |
tools/concraft/stack.yaml deleted
1 | -# For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md | |
2 | - | |
3 | -# Specifies the GHC version and set of packages available (e.g., lts-3.5, nightly-2015-09-21, ghc-7.10.2) | |
4 | -resolver: lts-1.15 | |
5 | - | |
6 | -# Local packages, usually specified by relative directory name | |
7 | -packages: | |
8 | -- '.' | |
9 | - | |
10 | -# Packages to be pulled from upstream that are not in the resolver (e.g., acme-missiles-0.3) | |
11 | -extra-deps: | |
12 | -- concraft-0.9.4 | |
13 | -- lazy-io-0.1.0 | |
14 | -- sgd-0.3.7 | |
15 | -- tagset-positional-0.3.0 | |
16 | -- crf-chain1-constrained-0.3.2 | |
17 | -- crf-chain2-tiers-0.2.4 | |
18 | -- data-lens-2.10.7 | |
19 | -- monad-codec-0.2.0 | |
20 | -- monad-ox-0.3.0 | |
21 | -- vector-binary-0.1.1 | |
22 | -- temporary-1.1.2.5 | |
23 | - | |
24 | -# Override default flag values for local packages and extra-deps | |
25 | -flags: {} | |
26 | - | |
27 | -# Extra package databases containing global packages | |
28 | -extra-package-dbs: [] | |
29 | - | |
30 | -# Control whether we use the GHC we find on the path | |
31 | -# system-ghc: true | |
32 | - | |
33 | -# Require a specific version of stack, using version ranges | |
34 | -# require-stack-version: -any # Default | |
35 | -# require-stack-version: >= 1.0.0 | |
36 | - | |
37 | -# Override the architecture used by stack, especially useful on Windows | |
38 | -# arch: i386 | |
39 | -# arch: x86_64 | |
40 | - | |
41 | -# Extra directories used by stack for building | |
42 | -# extra-include-dirs: [/path/to/dir] | |
43 | -# extra-lib-dirs: [/path/to/dir] |
tools/concraft/tools/concraft-pl.hs deleted
1 | -{-# LANGUAGE DeriveDataTypeable #-} | |
2 | -{-# LANGUAGE ScopedTypeVariables #-} | |
3 | -{-# LANGUAGE RecordWildCards #-} | |
4 | - | |
5 | - | |
6 | -import Control.Applicative ((<$>)) | |
7 | -import Control.Monad (unless) | |
8 | -import System.Console.CmdArgs | |
9 | -import System.IO (hFlush, stdout) | |
10 | -import qualified Network as N | |
11 | -import qualified Numeric.SGD as SGD | |
12 | -import qualified Data.Text.Lazy as L | |
13 | -import qualified Data.Text.Lazy.IO as L | |
14 | -import Data.Tagset.Positional (parseTagset) | |
15 | -import GHC.Conc (numCapabilities) | |
16 | - | |
17 | -import qualified NLP.Concraft.Morphosyntax.Accuracy as Acc | |
18 | -import qualified NLP.Concraft.Guess as Guess | |
19 | - | |
20 | -import qualified NLP.Concraft.Polish.Maca as Maca | |
21 | -import qualified NLP.Concraft.Polish as C | |
22 | -import qualified NLP.Concraft.Polish.Request as R | |
23 | -import qualified NLP.Concraft.Polish.Server as S | |
24 | -import qualified NLP.Concraft.Polish.Morphosyntax as X | |
25 | -import qualified NLP.Concraft.Polish.Format.Plain as P | |
26 | - | |
27 | -import Paths_concraft_pl (version, getDataFileName) | |
28 | -import Data.Version (showVersion) | |
29 | - | |
30 | - | |
31 | --- | Default port number. | |
32 | -portDefault :: Int | |
33 | -portDefault = 10089 | |
34 | - | |
35 | - | |
36 | ---------------------------------------- | |
37 | --- Command line options | |
38 | ---------------------------------------- | |
39 | - | |
40 | - | |
41 | --- | Data formats. | |
42 | -data Format = Plain deriving (Data, Typeable, Show) | |
43 | - | |
44 | - | |
45 | --- | A description of the Concraft-pl tool | |
46 | -concraftDesc :: String | |
47 | -concraftDesc = "Concraft-pl " ++ showVersion version | |
48 | - | |
49 | - | |
50 | -data Concraft | |
51 | - = Train | |
52 | - { trainPath :: FilePath | |
53 | - , evalPath :: Maybe FilePath | |
54 | - , format :: Format | |
55 | - , tagsetPath :: Maybe FilePath | |
56 | - , noAna :: Bool | |
57 | - -- , discardHidden :: Bool | |
58 | - , iterNum :: Double | |
59 | - , batchSize :: Int | |
60 | - , regVar :: Double | |
61 | - , gain0 :: Double | |
62 | - , tau :: Double | |
63 | - , disk :: Bool | |
64 | - , outModel :: FilePath | |
65 | - , guessNum :: Int | |
66 | - , r0 :: Guess.R0T } | |
67 | - | Tag | |
68 | - { inModel :: FilePath | |
69 | - , noAna :: Bool | |
70 | - , format :: Format | |
71 | - , marginals :: Bool } | |
72 | - -- , guessNum :: Int } | |
73 | - | Server | |
74 | - { inModel :: FilePath | |
75 | - , port :: Int } | |
76 | - | Client | |
77 | - { noAna :: Bool | |
78 | - , format :: Format | |
79 | - , marginals :: Bool | |
80 | - , host :: String | |
81 | - , port :: Int } | |
82 | - | Compare | |
83 | - { tagsetPath :: Maybe FilePath | |
84 | - , refPath :: FilePath | |
85 | - , otherPath :: FilePath | |
86 | - , format :: Format } | |
87 | - | Prune | |
88 | - { inModel :: FilePath | |
89 | - , outModel :: FilePath | |
90 | - , threshold :: Double } | |
91 | --- | ReAna | |
92 | --- { format :: Format } | |
93 | - deriving (Data, Typeable, Show) | |
94 | - | |
95 | - | |
96 | -trainMode :: Concraft | |
97 | -trainMode = Train | |
98 | - { trainPath = def &= argPos 1 &= typ "TRAIN-FILE" | |
99 | - , evalPath = def &= typFile &= help "Evaluation file" | |
100 | - , tagsetPath = def &= typFile &= help "Tagset definition file" | |
101 | - , format = enum [Plain &= help "Plain format"] | |
102 | - , noAna = False &= help "Do not perform reanalysis" | |
103 | - -- , discardHidden = False &= help "Discard hidden features" | |
104 | - , iterNum = 20 &= help "Number of SGD iterations" | |
105 | - , batchSize = 50 &= help "Batch size" | |
106 | - , regVar = 10.0 &= help "Regularization variance" | |
107 | - , gain0 = 1.0 &= help "Initial gain parameter" | |
108 | - , tau = 5.0 &= help "Initial tau parameter" | |
109 | - , disk = False &= help "Store SGD dataset on disk" | |
110 | - , outModel = def &= typFile &= help "Output Model file" | |
111 | - , guessNum = 10 &= help "Number of guessed tags for each unknown word" | |
112 | - , r0 = Guess.OovChosen &= help "R0 construction method" } | |
113 | - | |
114 | - | |
115 | -tagMode :: Concraft | |
116 | -tagMode = Tag | |
117 | - { inModel = def &= argPos 0 &= typ "MODEL-FILE" | |
118 | - , noAna = False &= help "Do not analyse input text" | |
119 | - , format = enum [Plain &= help "Plain format"] | |
120 | - , marginals = False &= help "Tag with marginal probabilities" } | |
121 | - -- , guessNum = 10 &= help "Number of guessed tags for each unknown word" } | |
122 | - | |
123 | - | |
124 | -serverMode :: Concraft | |
125 | -serverMode = Server | |
126 | - { inModel = def &= argPos 0 &= typ "MODEL-FILE" | |
127 | - , port = portDefault &= help "Port number" } | |
128 | - | |
129 | - | |
130 | -clientMode :: Concraft | |
131 | -clientMode = Client | |
132 | - { noAna = False &= help "Do not perform reanalysis" | |
133 | - , port = portDefault &= help "Port number" | |
134 | - , host = "localhost" &= help "Server host name" | |
135 | - , format = enum [Plain &= help "Plain output format"] | |
136 | - , marginals = False &= help "Tag with marginal probabilities" } | |
137 | - | |
138 | - | |
139 | -compareMode :: Concraft | |
140 | -compareMode = Compare | |
141 | - { refPath = def &= argPos 1 &= typ "REFERENCE-FILE" | |
142 | - , otherPath = def &= argPos 2 &= typ "OTHER-FILE" | |
143 | - , tagsetPath = def &= typFile &= help "Tagset definition file" | |
144 | - , format = enum [Plain &= help "Plain format"] } | |
145 | - | |
146 | - | |
147 | -pruneMode :: Concraft | |
148 | -pruneMode = Prune | |
149 | - { inModel = def &= argPos 0 &= typ "INPUT-MODEL" | |
150 | - , outModel = def &= argPos 1 &= typ "OUTPUT-MODEL" | |
151 | - , threshold = 0.05 &= | |
152 | - help "Remove disambiguation features below the threshold" } | |
153 | - | |
154 | - | |
155 | --- reAnaMode :: Concraft | |
156 | --- reAnaMode = ReAna | |
157 | --- { format = enum [Plain &= help "Plain format"] } | |
158 | - | |
159 | - | |
160 | -argModes :: Mode (CmdArgs Concraft) | |
161 | -argModes = cmdArgsMode $ modes | |
162 | - [trainMode, tagMode, serverMode, clientMode, compareMode, pruneMode] | |
163 | - &= summary concraftDesc | |
164 | - &= program "concraft-pl" | |
165 | - | |
166 | - | |
167 | ---------------------------------------- | |
168 | --- Main | |
169 | ---------------------------------------- | |
170 | - | |
171 | - | |
172 | -main :: IO () | |
173 | -main = exec =<< cmdArgsRun argModes | |
174 | - | |
175 | - | |
176 | -exec :: Concraft -> IO () | |
177 | - | |
178 | - | |
179 | -exec Train{..} = do | |
180 | - tagsetPath' <- case tagsetPath of | |
181 | - Nothing -> getDataFileName "config/nkjp-tagset.cfg" | |
182 | - Just x -> return x | |
183 | - tagset <- parseTagset tagsetPath' <$> readFile tagsetPath' | |
184 | - let train0 = parseFileO format trainPath | |
185 | - let eval0 = parseFileO' format evalPath | |
186 | - concraft <- C.train (trainConf tagset) train0 eval0 | |
187 | - unless (null outModel) $ do | |
188 | - putStrLn $ "\nSaving model in " ++ outModel ++ "..." | |
189 | - C.saveModel outModel concraft | |
190 | - where | |
191 | - sgdArgs = SGD.SgdArgs | |
192 | - { SGD.batchSize = batchSize | |
193 | - , SGD.regVar = regVar | |
194 | - , SGD.iterNum = iterNum | |
195 | - , SGD.gain0 = gain0 | |
196 | - , SGD.tau = tau } | |
197 | - trainConf tagset = C.TrainConf | |
198 | - { tagset = tagset | |
199 | - , sgdArgs = sgdArgs | |
200 | - , reana = not noAna | |
201 | - , onDisk = disk | |
202 | - , guessNum = guessNum | |
203 | - , r0 = r0 } | |
204 | - | |
205 | - | |
206 | -exec Tag{..} = do | |
207 | - cft <- C.loadModel inModel | |
208 | - pool <- Maca.newMacaPool numCapabilities | |
209 | - inp <- L.getContents | |
210 | - out <- R.long (R.short pool cft) $ rq $ if noAna | |
211 | - then R.Doc $ parseText format inp | |
212 | - else R.Long inp | |
213 | - L.putStr $ showData showCfg out | |
214 | - where | |
215 | - rq x = R.Request | |
216 | - { R.rqBody = x | |
217 | - , R.rqConf = rqConf } | |
218 | - rqConf = R.Config | |
219 | - { R.tagProbs = marginals } | |
220 | - showCfg = ShowCfg | |
221 | - { formatCfg = format | |
222 | - , showWsCfg = marginals } | |
223 | - | |
224 | - | |
225 | -exec Server{..} = do | |
226 | - putStr "Loading model..." >> hFlush stdout | |
227 | - concraft <- C.loadModel inModel | |
228 | - putStrLn " done" | |
229 | - pool <- Maca.newMacaPool numCapabilities | |
230 | - let portNum = N.PortNumber $ fromIntegral port | |
231 | - putStrLn $ "Listening on port " ++ show port | |
232 | - S.runConcraftServer pool concraft portNum | |
233 | - | |
234 | - | |
235 | -exec Client{..} = do | |
236 | - let portNum = N.PortNumber $ fromIntegral port | |
237 | - inp <- L.getContents | |
238 | - out <- R.long (S.submit host portNum) $ rq $ if noAna | |
239 | - then R.Doc $ parseText format inp | |
240 | - else R.Long inp | |
241 | - L.putStr $ showData showCfg out | |
242 | - where | |
243 | - rq x = R.Request | |
244 | - { R.rqBody = x | |
245 | - , R.rqConf = rqConf } | |
246 | - rqConf = R.Config | |
247 | - { R.tagProbs = marginals } | |
248 | - showCfg = ShowCfg | |
249 | - { formatCfg = format | |
250 | - , showWsCfg = marginals } | |
251 | - | |
252 | - | |
253 | -exec Compare{..} = do | |
254 | - tagsetPath' <- case tagsetPath of | |
255 | - Nothing -> getDataFileName "config/nkjp-tagset.cfg" | |
256 | - Just x -> return x | |
257 | - tagset <- parseTagset tagsetPath' <$> readFile tagsetPath' | |
258 | - let convert = map (X.packSeg tagset) . concat | |
259 | - xs <- convert <$> parseFile format refPath | |
260 | - ys <- convert <$> parseFile format otherPath | |
261 | - let s = Acc.weakLB tagset xs ys | |
262 | - putStrLn $ "Number of segments in reference file: " ++ show (Acc.gold s) | |
263 | - putStrLn $ "Number of correct tags: " ++ show (Acc.good s) | |
264 | - putStrLn $ "Weak accuracy lower bound: " ++ show (Acc.accuracy s) | |
265 | - | |
266 | - | |
267 | -exec Prune{..} = do | |
268 | - cft <- C.loadModel inModel | |
269 | - C.saveModel outModel $ C.prune threshold cft | |
270 | - | |
271 | - | |
272 | --- exec ReAna{..} = do | |
273 | --- inp <- parseText format <$> L.getContents | |
274 | --- out <- showData format <$> | |
275 | - | |
276 | - | |
277 | ---------------------------------------- | |
278 | --- Reading files | |
279 | ---------------------------------------- | |
280 | - | |
281 | - | |
282 | -parseFileO' :: Format -> Maybe FilePath -> IO [X.SentO X.Tag] | |
283 | -parseFileO' format path = case path of | |
284 | - Nothing -> return [] | |
285 | - Just pt -> parseFileO format pt | |
286 | - | |
287 | - | |
288 | -parseFileO :: Format -> FilePath -> IO [X.SentO X.Tag] | |
289 | -parseFileO format path = parseParaO format <$> L.readFile path | |
290 | - | |
291 | - | |
292 | -parseFile :: Format -> FilePath -> IO [X.Sent X.Tag] | |
293 | -parseFile format path = parsePara format <$> L.readFile path | |
294 | - | |
295 | - | |
296 | ---------------------------------------- | |
297 | --- Parsing text | |
298 | ---------------------------------------- | |
299 | - | |
300 | - | |
301 | --- parseTextO :: Format -> L.Text -> [[X.SentO X.Tag]] | |
302 | --- parseTextO format = map (map X.withOrig) . parseText format | |
303 | - | |
304 | - | |
305 | -parseParaO :: Format -> L.Text -> [X.SentO X.Tag] | |
306 | -parseParaO format = map X.withOrig . parsePara format | |
307 | - | |
308 | - | |
309 | ---------------------------------------- | |
310 | --- Parsing (format dependent) | |
311 | ---------------------------------------- | |
312 | - | |
313 | - | |
314 | -parseText :: Format -> L.Text -> [[X.Sent X.Tag]] | |
315 | -parseText Plain = P.parsePlain | |
316 | - | |
317 | - | |
318 | -parsePara :: Format -> L.Text -> [X.Sent X.Tag] | |
319 | -parsePara Plain = P.parsePara | |
320 | - | |
321 | - | |
322 | ---------------------------------------- | |
323 | --- Showing (format dependent) | |
324 | ---------------------------------------- | |
325 | - | |
326 | - | |
327 | -data ShowCfg = ShowCfg { | |
328 | - -- | The format used. | |
329 | - formatCfg :: Format | |
330 | - -- | Show weights? | |
331 | - , showWsCfg :: Bool } | |
332 | - | |
333 | - | |
334 | -showData :: ShowCfg -> [[X.Sent X.Tag]] -> L.Text | |
335 | -showData ShowCfg{..} = P.showPlain (P.ShowCfg {P.showWsCfg = showWsCfg}) |
tools/maca deleted
1 | -Subproject commit 57f9fa8dd51f204ce066f162c2ec201928aece3d |
tools/swigra/README.md deleted
1 | -Before using Świgra it might be necessary to compile the Prolog interface to the morphological analyzer Morfeusz. | |
2 | -First the package libmorfeusz2-dev must be downloaded from sgjp.pl/morfeusz/dopobrania.html and installed. | |
3 | -(WARNING: do NOT install the package morfeusz2 as it conflicts with the required package) | |
4 | -Then the interface can be compiled by executing | |
5 | -swipl-ld -v -shared -o morfeusz2-swi.so morfeusz2-glueswi.cc -lmorfeusz2 | |
6 | -in the parser directory. | |
7 | - | |
8 | -For an example usage of Świgra's server mode (in OCaml) see parser/swigra_test.ml | |
9 | -The demo assumes you are using swipl which you can install by executing: | |
10 | -sudo apt-get install swi-prolog | |
11 | -It also makes use of curl which can be installed with: | |
12 | -sudo apt-get install curl | |
13 | - | |
14 | -Below are the contents of the original README file. | |
15 | - | |
16 | -======================================================================== | |
17 | -Analizator składniowy Świgra | |
18 | - | |
19 | -======================================================================== | |
20 | - | |
21 | -Paczka zawiera analizator składniowy Świgra w wersji 1 (2007) i w wersji | |
22 | -rozwojowej kandydującej do nazwy Świgra 2 (2016). | |
23 | - | |
24 | -Wersja 1 stanowi komputerową realizację gramatyki przedstawionej przez | |
25 | -prof. Marka Świdzińskiego w jego książce „Gramatyka formalna języka | |
26 | -polskiego” (1992). Wersja 1 jest częścią wyników pracy doktorskiej | |
27 | -Marcina Wolińskiego opracowanej i obronionej w Instytucie Podstaw | |
28 | -Informatyki PAN w Warszawie w maju 2005. Inicjatorem i promotorem | |
29 | -pracy był Janusz S. Bień, aktualnie z Katedry Lingwistyki Formalnej | |
30 | -Uniwersytetu Warszawskiego. | |
31 | - | |
32 | -Wersja 2 reguł została napisana od nowa przez Marcina Wolińskiego przy | |
33 | -współpracy z prof. Świdzińskim. | |
34 | - | |
35 | -Runtime analizatora nazywamy roboczo Birnam (z hasłem reklamowym „We | |
36 | -bring forests to your doors”). Stanowi on interpreter gramatyk DCG | |
37 | -stosujący strategię wstępującą z zapamiętywaniem wyników pośrednich. | |
38 | -Zawiera też pewne rozszerzenia specyficzne dla implementowanej | |
39 | -gramatyki. | |
40 | - | |
41 | -Nie wszystkie pliki składające się na analizator zostały udostępnione | |
42 | -na tej samej licencji. Warunki licencji można sprawdzić w nagłówkach | |
43 | -poszczególnych plików. Większość plików składających się na te | |
44 | -programy zostaje udostępniona na licencji GNU General Public License 3 | |
45 | -(por. plik gpl-3.0.txt). Pliki umożliwiające skład drzew analizy w | |
46 | -LaTeXu zostały udostępnione na licencji LaTeX Project Public License | |
47 | -wersja 1.2, ponieważ licencja GPL nie jest zgodna ze standardowymi | |
48 | -modułami LaTeXa. Niektóre pliki pomocnicze zostały dla uproszczenia | |
49 | -uznane za dobro publiczne — autor zrzekł się praw autorskich do nich. | |
50 | -Dotyczy to przykładu użycia platformy Birnam z inną gramatyką oraz | |
51 | -kodu w języku C pozwalającego na wywołanie analizatora Morfeusz z | |
52 | -programu prologowego. | |
53 | - | |
54 | -Udostępnienie wersji 1 analizatora na licencji GPL było możliwe dzięki | |
55 | -uprzejmej zgodzie następujących osób i instytucji: | |
56 | - | |
57 | -1. prof. Marka Świdzińskiego jako autora reguł gramatycznych, | |
58 | - | |
59 | -2. dr. hab. Janusza S. Bienia, prof. UW jako kierownika zrealizowanego | |
60 | - w Instytucie Informatyki Uniwersytetu Warszawskiego projektu KBN | |
61 | - 8T11C 00213 "Zestaw testów do weryfikacji i oceny analizatorów | |
62 | - języka polskiego", w ramach którego w latach 1997-1999 powstał | |
63 | - prototyp analizatora Świgra (pod nazwą AS), | |
64 | - | |
65 | -3. dyrekcji Instytutu Podstaw Informatyki PAN - mojego pracodawcy. | |
66 | - | |
67 | -Rozwój wersji 2 był finansowany w ramach projektów: | |
68 | -• MNiSW N N104 224735 (2008–2011), | |
69 | -• Nekst http://www.ipipan.waw.pl/nekst/ (2012–2013), | |
70 | -• CLARIN-PL http://clarin-pl.eu/ (2014–2016). | |
71 | - | |
72 | -Właścicielem majątkowych praw autorskich do wersji 2 jest Instytut | |
73 | -Podstaw Informatyki Polskiej Akademii Nauk. | |
74 | - | |
75 | -======================================================================== | |
76 | -Użytkowanie w systemie Ubuntu | |
77 | -======================================================================== | |
78 | - | |
79 | -Pliki analizatora i dezambiguatora muszą znaleźć się w systemie plików | |
80 | -w takich podkatalogach, jak w paczce dystrybucyjnej. Analizator (z | |
81 | -wczepionym dezambiguatorem) uruchamia się za pomocą skryptów zawartych | |
82 | -w podkatalogu parser/. | |
83 | - | |
84 | -Najwygodniejszym sposobem interaktywnego użycia Świgry wydaje się | |
85 | -interfejs webowy, który można wywołać poleceniem | |
86 | - ./swigra -w | |
87 | -Alternatywnie można użyć interfejsu okienkowego SWI Prologu: | |
88 | - ./swigra | |
89 | -Domyślnie używana jest Świgra 2, aby wywołać archiwalną wersję Świgry | |
90 | -jeden w wywołaniu skryptu musi znaleźć się argument -1. | |
91 | - | |
92 | -Świgra używa analizatora morfologicznego Morfeusz 2 | |
93 | -(http://sgjp.pl/morfeusz), który musi być zainstalowany w systemie. | |
94 | -Dodatkową komplikację stanowi fragment kodu łączącego Morfeusza 2 z | |
95 | -SWI Prologiem (morfeusz2-glueswi.cc), który musi zostać skompilowany | |
96 | -do biblioteki dynamicznej zgodnie z opisem na początku wymienionego | |
97 | -pliku źródłowego. Dystrybucja zawiera kilka wariantów kompilatu | |
98 | -(morfeusz2-*so), w szczególności morfeusz2-swi7-64.so dla Ubuntu 16.04 | |
99 | -64-bit. W razie niezgodności należy przekompilować kod. | |
100 | - | |
101 | -W trybie wsadowym Świgra może używać Morfeusza lub pracować na | |
102 | -interpretacjach dostarczonych w formacie NKJP. | |
103 | - | |
104 | -Pakiety Ubuntu konieczne dla działania całej wsadowej ścieżki | |
105 | -przetwarzania (skrypty genparser i swigra_batch): | |
106 | - | |
107 | -swi-prolog | |
108 | -libbsd-resource-perl | |
109 | -libfile-slurp-perl | |
110 | -libxml-libxml-perl | |
111 | - | |
112 | - | |
113 | - | |
114 | -======================================================================== | |
115 | -Marcin Woliński <wolinski@ipipan.waw.pl> | |
116 | -Zespół Inżynierii Lingwistycznej | |
117 | -Instytut Podstaw Informatki PAN | |
118 | - | |
119 | -%%% Local Variables: | |
120 | -%%% coding: utf-8 | |
121 | -%%% mode: text | |
122 | -%%% End: |
tools/swigra/disambiguator-pcfg/README deleted
1 | -Katalog zawiera pliki źródłowe dezambiguatora dla lasów składniowych w formacie Świgra/Składnica oraz: | |
2 | - | |
3 | -pcfg-grammar.pkl - gramatyka pcfg slużąca do dezambiguacji | |
4 | -skrypt make-model.py służy do stworzenia nowej gamatyki | |
5 | - | |
6 | -skrypt disamb-tree.py służy do automatycznej anotacji ujednoznacznionego drzewa | |
7 | - | |
8 | -sposób użycia: | |
9 | - | |
10 | - python disamb-tree.py sciezka/do/nazwa_pliku_wejsciowego.xml | |
11 | - skrypt utworzy plik sciezka/do/nazwa_pliku_wejscioweg-disamb.xml | |
12 | 0 | \ No newline at end of file |
tools/swigra/disambiguator-pcfg/__init__.py deleted
tools/swigra/disambiguator-pcfg/__init__.pyc deleted
No preview for this file type
tools/swigra/disambiguator-pcfg/disamb-tree.py deleted
1 | -# -*- encoding: utf-8 -*- | |
2 | -__author__ = 'nika' | |
3 | - | |
4 | -from xml.sax import make_parser | |
5 | -from treeparser import TreeParser | |
6 | -import os | |
7 | -import pickle | |
8 | -import sys | |
9 | -import xml.dom.minidom | |
10 | -import codecs | |
11 | - | |
12 | -try: | |
13 | - f = sys.argv[1] | |
14 | -except IndexError: | |
15 | - print("ERROR: no filename given") | |
16 | - exit() | |
17 | - | |
18 | -# parse xml file for future processing | |
19 | -dom = xml.dom.minidom.parse(sys.argv[1]) | |
20 | -nodes = dom.getElementsByTagName("node") | |
21 | - | |
22 | -if nodes: # if there is anything to disambiguate | |
23 | - # load grammar | |
24 | -# grammar = pickle.load(open("grammars/pcfg-tfw-130718.pkl")) | |
25 | - path = os.path.join(os.path.dirname(__file__), "grammars/pcfg-tfw-150326.pkl") | |
26 | - grammar = pickle.load(open(path)) | |
27 | - # make parser, parse tree | |
28 | - parser = make_parser() | |
29 | - handler = TreeParser(f) | |
30 | - parser.setContentHandler(handler) | |
31 | - parser.parse(f) | |
32 | - tree = handler.getTree() | |
33 | - | |
34 | - # disambiguation | |
35 | - tree.act_pcfg(grammar) | |
36 | - disamb_nodes = tree.getDisambNodes() | |
37 | - | |
38 | -# update chosen nodes (if any) | |
39 | -for node in nodes: | |
40 | - if node.attributes["nid"].value in disamb_nodes: | |
41 | - node.attributes["chosen"] = "true" | |
42 | - children_all = node.getElementsByTagName("children") | |
43 | - for children in children_all: | |
44 | - chosen = True | |
45 | - for child in children.getElementsByTagName("child"): | |
46 | - if child.attributes["nid"].value not in disamb_nodes: | |
47 | - chosen = False | |
48 | - if chosen: | |
49 | - children.attributes["chosen"] = "true" | |
50 | - else: | |
51 | - node.attributes["chosen"] = "false" | |
52 | - | |
53 | -new_f = sys.argv[1].rsplit(".", 1)[0] + "-disamb.xml" | |
54 | -print "saving in :", new_f | |
55 | -open(new_f, 'w').write(codecs.encode(dom.toxml(), 'utf-8')) |
tools/swigra/disambiguator-pcfg/disamb-tree.pyc deleted
No preview for this file type