Commit ce693286e7a0b34ff6803f63002a794fc1b8f8eb
1 parent
091664b8
Interfejs webowy dla ENIAMmorphology
Showing
7 changed files
with
270 additions
and
3 deletions
morphology/.gitignore
morphology/ENIAMinflexion.ml
... | ... | @@ -233,3 +233,48 @@ let catch_get_interpretations form = |
233 | 233 | try |
234 | 234 | let result = get_interpretations form in result,"" |
235 | 235 | with e -> [], Printexc.to_string e |
236 | + | |
(* Numeric sort rank of an interpretation status; a lower rank sorts first.
   Ranks: LemmaVal 0, LemmaAlt 1, LemmNotVal 2, TokNotFound 3. *)
let int_of_status status =
  match status with
  | LemmaVal -> 0
  | LemmaAlt -> 1
  | LemmNotVal -> 2
  | TokNotFound -> 3
242 | + | |
(* Total order on statuses, obtained by comparing their ranks from
   int_of_status. Returns a negative, zero or positive integer like compare. *)
let compare_status s t =
  let rank_s = int_of_status s in
  let rank_t = int_of_status t in
  compare rank_s rank_t
245 | + | |
(* Numeric sort rank of a category tag given as a string.
   The six listed categories get ranks 0-5; every other string gets rank 6. *)
let int_of_cat cat =
  let ranks = [
    "verb", 0;
    "adv", 1;
    "adj", 2;
    "adj:grad", 3;
    "noun", 4;
    "ndm", 5] in
  try List.assoc cat ranks
  with Not_found -> 6
254 | + | |
(* Total order on category tags, obtained by comparing their ranks from
   int_of_cat. Two tags with the same rank compare as equal. *)
let compare_cat s t =
  let rank_s = int_of_cat s in
  let rank_t = int_of_cat t in
  compare rank_s rank_t
257 | + | |
(* Numeric sort rank of a star value; a lower rank sorts first.
   The cases are listed in increasing rank order. *)
let int_of_star star =
  match star with
  | Productive -> 0
  | Star -> 1
  | Ndm -> 2
  | Aux -> 3
  | Aux2 -> 4
  | Acro -> 5
  | Dial -> 6
266 | + | |
(* Total order on star values, obtained by comparing their ranks from
   int_of_star. *)
let compare_star s t =
  let rank_s = int_of_star s in
  let rank_t = int_of_star t in
  compare rank_s rank_t
269 | + | |
(* Lexicographic comparison of two interpretations. The keys are tried in
   this order and the first non-zero comparison decides:
   1. status (compare_status),
   2. the value of the "cat" tag (compare_cat),
   3. star (compare_star),
   4. lemma (polymorphic compare). *)
let compare_results s t =
  match compare_status s.status t.status with
  | 0 ->
    (match compare_cat (ENIAMmorphologyRules.get_tag s.tags "cat") (ENIAMmorphologyRules.get_tag t.tags "cat") with
     | 0 ->
       (match compare_star s.star t.star with
        | 0 -> compare s.lemma t.lemma
        | by_star -> by_star)
     | by_cat -> by_cat)
  | by_status -> by_status
278 | + | |
(* Sorts a list of interpretations with Xlist.sort, using compare_results
   as the ordering. *)
let sort_results results =
  Xlist.sort results compare_results
... | ... |
morphology/doc/model2.pdf
No preview for this file type
morphology/doc/model2.tex
... | ... | @@ -2922,9 +2922,9 @@ Poniższa tabela przedstawia liczbę reguł z podziałem na ich typy i części |
2922 | 2922 | \begin{tabular}{l|r|r|r|r|r} |
2923 | 2923 | & noun & adj & adv & verb & suma \\ |
2924 | 2924 | \hline |
2925 | -produktywne & 7269 & 1502 & 150 & 9107 & 18028 \\ | |
2925 | +produktywne & 7274 & 1502 & 150 & 9107 & 18033 \\ | |
2926 | 2926 | \hline |
2927 | -* nieproduktywne & 206+7 & 454+2 & --- & 3707 & 4376 \\ | |
2927 | +* nieproduktywne & 208 & 456 & --- & 3707 & 4371 \\ | |
2928 | 2928 | \hline |
2929 | 2929 | A obce & 1247 & --- & --- & --- & 1247 \\ |
2930 | 2930 | \hline |
... | ... | @@ -2934,7 +2934,7 @@ C akronimy & 571 & --- & --- & --- & 571 \\ |
2934 | 2934 | \hline |
2935 | 2935 | D gwarowe & 2679 & 395 & --- & 3474 & 6548 \\ |
2936 | 2936 | \hline |
2937 | -suma & 12185+7 & 2351+2 & 150 & 16288 & 30983+9 \\ | |
2937 | +suma & 12192 & 2353 & 150 & 16288 & 30983 \\ | |
2938 | 2938 | \end{tabular} |
2939 | 2939 | \end{center} |
2940 | 2940 | Wers A zawiera liczbę reguł opisujących odmianę słów o obcej ortografii, w których pierwotna postać rdzenia jest zawarta w obserwowanej formie, |
... | ... |
morphology/makefile
... | ... | @@ -51,6 +51,9 @@ interface: interface.ml |
51 | 51 | api: apiInterface.ml |
52 | 52 | $(OCAMLOPT) -o morphology.api $(OCAMLOPTFLAGS2) $^ |
53 | 53 | |
# Build the web (CGI) front end: compiles webInterface.ml into the
# executable morphology.cgi using $(OCAMLOPT) with $(OCAMLOPTFLAGS2).
web: webInterface.ml
	$(OCAMLOPT) -o morphology.cgi $(OCAMLOPTFLAGS2) $^
56 | + | |
54 | 57 | freq_test: freq_test.ml |
55 | 58 | $(OCAMLOPT) -o freq_test $(OCAMLOPTFLAGS) freq_test.ml |
56 | 59 | |
... | ... |
morphology/morphology.html
0 → 100755
1 | +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> | |
2 | +<html> | |
3 | + <head> | |
4 | + <META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=utf8"> | |
5 | + <TITLE>ENIAMmorphology</TITLE> | |
6 | + <META HTTP-EQUIV="Content-Language" CONTENT="pl"> | |
7 | + </head> | |
8 | + | |
9 | + <body> | |
10 | + <center> | |
11 | + <h1>ENIAMmorphology: analizator morfologiczny i guesser dla języka polskiego</h1> | |
12 | + <h3>Podaj słowo:</h3> | |
13 | + <form method=POST action="cgi-bin/morphology.cgi"> | |
14 | + <p><input type="text" name="text0" value="" size="100"></p> | |
15 | + <p><input type="submit" value="Analizuj" size="60"></p> | |
16 | + </form> | |
17 | +<BR><BR> | |
18 | + <h3>Opis programu</h3> | |
19 | +</center> | |
20 | + | |
21 | +<p>Dla zadanej formy program znajduje możliwe lematyzacje oraz interpretacje morfosyntaktyczne.<BR> | |
22 | +Wskazuje, które z nich znajdują się w SGJP w wersji z 2017.07.30.<BR> | |
23 | +Program analizuje słowa obce zaopatrzone w polskie końcówki fleksyjne, akronimy oraz niektóre formy gwarowe. | |
24 | + | |
25 | +<p>Program na wejściu otrzymuje pojedynczą formę. <BR> | |
26 | +Forma nie może być liczbą zapisaną za pomocą cyfr, znakiem interpunkcyjnym, ani inną sekwencją znaków<BR> | |
27 | +nie reprezentującą słowa za pomocą reguł ortograficznych.<BR> | |
28 | +Nie może ona posiadać aglutynatu, ani sufiksu trybu przypuszczającego.<BR> | |
29 | +Nie są analizowane również formy typu '2-gi'. | |
30 | + | |
31 | +<p>Na wyjściu generowana jest tabela, w której w kolejnych wierszach znajdują się możliwe interpretacje zadanej formy.<BR> | |
32 | +Kolejne kolumny tabeli zawierają: | |
33 | +<ul> | |
34 | +<li> lemat, | |
35 | +<li> interpretację morfosyntaktyczną, | |
36 | +<li> frekwencję (liczbę form z SGJP, które lematyzują się w taki sam sposób) | |
37 | +<li> status, możliwe wartości to | |
38 | +<ul> | |
39 | + <li> LemmaVal: znajduje się w SGJP, | |
40 | + <li> LemmaAlt: wyjątek - znajduje się w SGJP, nie jest objęty przez reguły morfologiczne zaimplementowane w modelu, | |
41 | + <li> LemmNotVal: interpretacja zgodna z modelem, nie zawarta w SGJP, | |
42 | + <li> TokNotFound: nie znaleziono interpretacji, | |
43 | +</ul> | |
44 | +<li> star, wartość pusta dla reguł produktywnych, pozostałe możliwe wartości to | |
45 | +<ul> | |
46 | + <li> *: interpretacja nieproduktywna, | |
47 | + <li> A: obca ortografia na styku tematu i końcówki, | |
48 | + <li> B: obca ortografia w lemacie, | |
49 | + <li> C: akronim, | |
50 | + <li> D: forma gwarowa, | |
51 | +</ul> | |
52 | +<li> atrybuty opisujące reguły morfologiczne wykorzystane do uzyskania interpretacji. | |
53 | +</ul> | |
54 | + | |
55 | +<p>Interpretacje, w których star=B są wysoce niejednoznaczne, | |
56 | +ponieważ polegają na odgadnięciu obcej końcówki tematu, | |
57 | +która nie jest widoczna w zadanej formie.<BR> | |
58 | +Atrybuty w ostatniej kolumnie tabeli są podawane po to, by | |
59 | +umożliwić budowanie tagerów wykorzystujących cechy morfologiczne. | |
60 | + | |
61 | + <center> | |
62 | +<hr align="center" size="2" width="800" /> | |
63 | +Autor: <A HREF="http://www.mimuw.edu.pl/~wjaworski">Wojciech Jaworski</A>, Szymon Rutkowski<BR> | |
64 | +Copyright © 2016-2017 Institute of Computer Science Polish Academy of Sciences<BR><BR> | |
65 | +Parser wykorzystuje następujące zasoby:<BR> | |
66 | +<A HREF="http://sgjp.pl">SGJP</A> Copyright © 2007–2017 Zygmunt Saloni, Włodzimierz Gruszczyński, Marcin Woliński, Robert Wołosz, Danuta Skowrońska<BR> | |
67 | +<P><small>W trosce o jakość generowanych lematyzacji zapytania użytkowników mogą być logowane.</small> | |
68 | +</center> | |
69 | + </body> | |
70 | +</html> | |
... | ... |
morphology/webInterface.ml
0 → 100644
1 | +(* | |
2 | + * ENIAMmorphology, a morphological analyser and a guesser for Polish | |
3 | + * Copyright (C) 2016-2017 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl> | |
4 | + * Copyright (C) 2016-2017 Institute of Computer Science Polish Academy of Sciences | |
5 | + * | |
6 | + * This program is free software: you can redistribute it and/or modify | |
7 | + * it under the terms of the GNU General Public License as published by | |
8 | + * the Free Software Foundation, either version 3 of the License, or | |
9 | + * (at your option) any later version. | |
10 | + * | |
11 | + * This program is distributed in the hope that it will be useful, | |
12 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | + * GNU General Public License for more details. | |
15 | + * | |
16 | + * You should have received a copy of the GNU General Public License | |
17 | + * along with this program. If not, see <http://www.gnu.org/licenses/>. | |
18 | + *) | |
19 | + | |
20 | +open Printf | |
21 | + | |
(* Resolves host_name and returns an Internet socket address for the first
   address of that host combined with port.
   Raises Failure when the host is unknown or when the resolver returns an
   entry with an empty address list (previously the latter surfaced as an
   unexplained Invalid_argument "index out of bounds"). *)
let get_sock_addr host_name port =
  let he =
    try Unix.gethostbyname host_name
    with Not_found -> failwith ("get_sock_addr: host " ^ host_name ^ " not found") in
  let addr = he.Unix.h_addr_list in
  if Array.length addr = 0 then
    failwith ("get_sock_addr: host " ^ host_name ^ " has no addresses")
  else
    Unix.ADDR_INET(addr.(0),port)
28 | + | |
(* Sends query to the morphology server and returns the HTML rendering of
   its answer.
   Protocol, as implemented here: the query followed by an empty line is
   written to the server, one marshalled value of type
   ENIAMinflexion.t list * string is read back, a final newline is written
   and the connection is shut down.
   Raises Failure when the connection cannot be opened; other exceptions
   raised during the exchange are propagated after the socket is closed. *)
let process_query query =
  let sock = get_sock_addr "wloczykij"(*"localhost"*) 5736 in
  let ic,oc =
    try Unix.open_connection sock
    with e -> failwith ("server connection error: " ^ Printexc.to_string e) in
  let result,msg =
    try
      Printf.fprintf oc "%s\n\n%!" query;
      (* NOTE(review): Marshal.from_channel is not type-safe; the peer at the
         address above must be a trusted server producing exactly this type. *)
      let answer = (Marshal.from_channel ic : ENIAMinflexion.t list * string) in
      Printf.fprintf oc "\n%!";
      let _ = Unix.shutdown_connection ic in
      answer
    with e ->
      (* Both channels share one descriptor; closing the output channel
         releases the socket. Errors while closing are ignored so that the
         original exception is the one reported. *)
      close_out_noerr oc;
      raise e in
  (* shutdown_connection does not close the descriptor, so close it here. *)
  close_out_noerr oc;
  ENIAMinflexion.html_of_interpretations (ENIAMinflexion.sort_results result) msg
39 | + | |
(* Reads standard input line by line until end of file and returns the lines
   as a list. Each line is prepended to the accumulator, so the list is in
   reverse reading order (last line first). *)
let get_input () =
  let rec read_all acc =
    match (try Some (input_line stdin) with End_of_file -> None) with
    | Some line -> read_all (line :: acc)
    | None -> acc in
  read_all []
48 | + | |
(* Value of a hexadecimal digit character, or None for any other character. *)
let hex_digit_value c =
  match c with
  | '0'..'9' -> Some (Char.code c - Char.code '0')
  | 'a'..'f' -> Some (Char.code c - Char.code 'a' + 10)
  | 'A'..'F' -> Some (Char.code c - Char.code 'A' + 10)
  | _ -> None

(* Decodes the URL-encoded text in query from index i up to (excluding) size,
   appending the decoded characters to buf, and returns the contents of buf.
   "%XY" with two hexadecimal digits becomes the byte with that code and "+"
   becomes a space. A "%" that is not followed by two hexadecimal digits
   (malformed or truncated escape) is copied literally instead of raising. *)
let rec translate_input_rec buf i size query =
  if i >= size then Buffer.contents buf else (
    let c,i =
      match String.get query i with
      | '%' when i + 2 < size ->
        (match hex_digit_value (String.get query (i+1)), hex_digit_value (String.get query (i+2)) with
         | Some hi, Some lo -> Char.chr (16 * hi + lo), i+3
         | _ -> '%', i+1)
      | '+' -> ' ', i+1
      | c -> c, i+1 in
    Buffer.add_char buf c;
    translate_input_rec buf i size query)

(* Extracts and decodes the value of the form field from the request body.
   query is the list of input lines; exactly one line of the form
   "text0=<url-encoded text>" is accepted.
   Raises Failure "translate_input 1" when the single line does not start
   with "text0=" (including lines shorter than that prefix) and
   Failure "translate_input 2" when the list does not have exactly one line. *)
let translate_input query =
  let prefix = "text0=" in
  let prefix_len = String.length prefix in
  match query with
    [query] ->
      (* The length check keeps String.sub from raising Invalid_argument. *)
      if String.length query >= prefix_len && String.sub query 0 prefix_len = prefix then
        let buf = Buffer.create (String.length query) in
        translate_input_rec buf prefix_len (String.length query) query
      else failwith "translate_input 1"
  | _ -> failwith "translate_input 2"
67 | + | |
68 | +(* let get_query_id () = | |
69 | + let filename = Filename.temp_file ~temp_dir:"results/web/" "page_" "" in | |
70 | +(* print_endline filename; *) | |
71 | + let n = String.length "results/web/" + String.length "page_" in | |
72 | + let id = String.sub filename n (String.length filename - n) in | |
73 | +(* print_endline id; *) | |
74 | + id *) | |
75 | + | |
(* Writes the CGI response header to standard output: the Content-type line
   followed by the empty line that ends the header section. *)
let generate_header () =
  print_string "Content-type: text/html\n";
  print_string "\n"
79 | + | |
(* Called after the page has been written; currently produces no output.
   The closing tags it used to print are kept below, commented out. *)
let generate_trailer () =
  (* Printf.printf "</BODY>\n</HTML>\n" *)
  ()
82 | + | |
(* Returns the opening part of the result page: document type, head, title
   and the query form. path is inserted verbatim in front of
   "morphology.cgi" in the action attribute of the form. *)
let page_header path =
  String.concat "" [
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">
<html>
 <head>
 <META HTTP-EQUIV=\"CONTENT-TYPE\" CONTENT=\"text/html; charset=utf8\">
 <TITLE>ENIAMmorphology</TITLE>
 <META HTTP-EQUIV=\"Content-Language\" CONTENT=\"pl\">
 </head>

 <body>
 <center>
 <h1>ENIAMmorphology: analizator morfologiczny i guessser dla języka polskiego</h1>
 <h3>Podaj słowo:</h3>
 <form method=POST action=\"";
    path;
    "morphology.cgi\">
 <p><input type=\"text\" name=\"text0\" value=\"\" size=\"100\"></p>
 <p><input type=\"submit\" value=\"Analizuj\" size=\"60\"></p>
 </form>"]
100 | + | |
(* Prints the complete result page to standard output: the page header with
   the query form, the analysed query as a heading, then content.
   query comes from the user and is HTML-escaped before being echoed, so that
   markup typed into the form cannot be injected into the page.
   content is emitted verbatim; it is expected to be HTML already. *)
let generate_webpage query content =
  (* Replaces the characters that are special in HTML text and attribute
     values with the corresponding character references. *)
  let escape_html s =
    let buf = Buffer.create (String.length s) in
    String.iter (fun c ->
      match c with
      | '&' -> Buffer.add_string buf "&amp;"
      | '<' -> Buffer.add_string buf "&lt;"
      | '>' -> Buffer.add_string buf "&gt;"
      | '"' -> Buffer.add_string buf "&quot;"
      | '\'' -> Buffer.add_string buf "&#39;"
      | c -> Buffer.add_char buf c) s;
    Buffer.contents buf in
  print_endline (page_header "");
  printf " <h3>%s</h3>
 %s
</center>
 </body>
</html>" (escape_html query) content
108 | + | |
109 | + | |
(* Prints a complete HTML page to standard output that shows the error text e
   as a heading. The text is HTML-escaped first, because exception messages
   are plain text and may contain characters that are special in HTML. *)
let generate_error_message e =
  (* Replaces the characters that are special in HTML text and attribute
     values with the corresponding character references. *)
  let escape_html s =
    let buf = Buffer.create (String.length s) in
    String.iter (fun c ->
      match c with
      | '&' -> Buffer.add_string buf "&amp;"
      | '<' -> Buffer.add_string buf "&lt;"
      | '>' -> Buffer.add_string buf "&gt;"
      | '"' -> Buffer.add_string buf "&quot;"
      | '\'' -> Buffer.add_string buf "&#39;"
      | c -> Buffer.add_char buf c) s;
    Buffer.contents buf in
  Printf.printf
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">
<html>
 <head>
 <META HTTP-EQUIV=\"CONTENT-TYPE\" CONTENT=\"text/html; charset=utf8\">
 <TITLE>ENIAMmorphology</TITLE>
 <META HTTP-EQUIV=\"Content-Language\" CONTENT=\"pl\">
 </head>

 <body>
 <center>
 <h1>ENIAMmorphology</h1>
 <h3>%s</h3>
</center>
 </body>
</html>" (escape_html e)
127 | + | |
(* Program entry point. Writes the response header, then reads the request
   body from standard input, decodes the query, asks the server for its
   analysis and prints the result page. Any exception raised on the way is
   reported as an error page instead: Failure shows its message, every other
   exception shows its Printexc rendering. The trailer is generated last. *)
let _ =
  generate_header ();
  begin
    try
      let raw_lines = get_input () in
      let query = translate_input raw_lines in
      (* let id = get_query_id () in *)
      generate_webpage query (process_query query)
    with
    | Failure msg -> generate_error_message msg
    | exn -> generate_error_message (Printexc.to_string exn)
  end;
  generate_trailer ()
140 | + | |
141 | +(* uruchamianie serwera: | |
142 | +./morphology -m -p 5736 | |
143 | +*) | |
144 | + | |
145 | +(* testowanie z linii poleceń: | |
146 | +echo "text0=profesory" | ./morphology.cgi | |
147 | +echo "text0=miauczy" | ./morphology.cgi | |
148 | +*) | |
... | ... |