Commit e8f36ddb80503c8fc4b08b4b92ae313ce7236871
1 parent
fab3e07a
Uruchomienie Świgry
Showing
7 changed files
with
90 additions
and
21 deletions
.gitignore
integration/ENIAMpreIntegration.ml
... | ... | @@ -32,6 +32,11 @@ let mate_parser_path = "../tools/mate-tools/" |
32 | 32 | let mate_in = ref stdin |
33 | 33 | let mate_out = ref stdout |
34 | 34 | |
(* Relative path of the directory from which the Swigra parser is launched. *)
let swigra_path = "../tools/swigra/parser"

(* Channels attached to the Swigra process.  They start out as the standard
   channels, which act only as placeholders until real process channels are
   stored in them. *)
let swigra_in : in_channel ref = ref stdin
let swigra_out : out_channel ref = ref stdout
let swigra_err : in_channel ref = ref stdin
39 | + | |
35 | 40 | let concraft_exists () = |
36 | 41 | let check_in, check_out, check_err = Unix.open_process_full ("command -v concraft-pl") |
37 | 42 | [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in |
... | ... | @@ -42,7 +47,7 @@ let concraft_exists () = |
42 | 47 | true |
43 | 48 | with End_of_file -> ignore @@ close_check (); false |
44 | 49 | |
45 | -let wait_for_server () = | |
50 | +let wait_for_concraft_server () = | |
46 | 51 | let rec wait s a = |
47 | 52 | try Unix.connect s a |
48 | 53 | with e -> Unix.sleep 1; wait s a in |
... | ... | @@ -52,32 +57,50 @@ let wait_for_server () = |
52 | 57 | Unix.shutdown s Unix.SHUTDOWN_SEND; |
53 | 58 | Unix.close s |
54 | 59 | |
(* [start_concraft_server m] spawns "concraft-pl server --inmodel m" as a
   child process, blocks until [wait_for_concraft_server] returns, and yields
   the child's pid.

   The child inherits our stdin; its stdout and stderr are attached to the
   write ends of two fresh pipes.  All four pipe descriptors are closed in
   this process immediately after the spawn, so nothing the server prints is
   ever read here. *)
let start_concraft_server m =
  let out_read, out_write = Unix.pipe () in
  let err_read, err_write = Unix.pipe () in
  let argv = [|"concraft-pl"; "server"; "--inmodel"; m|] in
  let pid = Unix.create_process "concraft-pl" argv Unix.stdin out_write err_write in
  List.iter Unix.close [out_read; out_write; err_read; err_write];
  wait_for_concraft_server ();
  pid
63 | 68 | |
(* [stop_concraft_server pid] asks the process [pid] to terminate by sending
   it SIGINT.  It does not wait for the process to exit. *)
let stop_concraft_server pid =
  let interrupt = Sys.sigint in
  Unix.kill pid interrupt
66 | 71 | |
(* [start_swigra_server dir] launches "./swigra -w" from inside [dir] through
   the shell and returns the triple produced by [Unix.open_process_full]:
   (channel reading the process's stdout, channel writing to its stdin,
   channel reading its stderr).

   The function blocks until the process has printed one line on stdout
   (presumably a start-up banner signalling readiness -- TODO confirm).

   [dir] is shell-quoted so paths containing spaces or metacharacters work,
   and "&&" guarantees ./swigra is not started from the wrong directory when
   the "cd" fails.

   @raise End_of_file if the process exits without printing anything.
   @raise Not_found if PATH is not set in the environment. *)
let start_swigra_server dir =
  let command = "cd " ^ Filename.quote dir ^ " && ./swigra -w" in
  let env = [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
  let (serv_in, _, _) as channels = Unix.open_process_full command env in
  (* Discard the first output line; it is only used as a synchronisation point. *)
  ignore (input_line serv_in);
  channels
77 | + | |
(* [stop_swigra_server channels] writes the line "halt." to the process's
   stdin and then closes all three channels, waiting for the process to
   terminate.  The exit status is discarded.

   [channels] is the triple returned by [Unix.open_process_full]. *)
let stop_swigra_server ((_, to_server, _) as channels) =
  output_string to_server "halt.\n";
  (* close_process_full closes (and thereby flushes) the output channel. *)
  let _status = Unix.close_process_full channels in
  ()
81 | + | |
(* [initialize ()] starts every external tool whose [*_enabled] flag is set:

   - Concraft: checks that the command and the model file exist, starts the
     server and records its pid in [concraft_server_pid];
   - Mate parser: spawns the Java process and stores its channels in
     [mate_in] / [mate_out];
   - Swigra: starts the server and stores its channels in [swigra_in],
     [swigra_out] and [swigra_err].

   @raise Failure if Concraft is enabled but the command or the model file
   is missing. *)
let initialize () =
  if !concraft_enabled then begin
    if not (concraft_exists ()) then
      failwith "The command concraft-pl is missing. Please make sure Concraft is installed properly.";
    if not (Sys.file_exists concraft_model_filename) then
      failwith "Concraft model file does not exist.";
    print_endline "Starting Concraft Server";
    concraft_server_pid := start_concraft_server concraft_model_filename;
    print_endline "Server started"
  end;
  if !mate_parser_enabled then begin
    (* Previously hard-coded as:
       java -jar ../dependencyParser/basic/mate-tools/dist/anna-3.5.jar
            -model ../dependencyParser/basic/mate-tools/examples/160622_Polish_MateParser.mdl -test *)
    let jar = mate_parser_path ^ "dist/anna-3.5.jar" in
    let model = mate_parser_path ^ "examples/160622_Polish_MateParser.mdl" in
    let parser_in, parser_out =
      Unix.open_process ("java -jar " ^ jar ^ " -model " ^ model ^ " -test") in
    mate_in := parser_in;
    mate_out := parser_out
  end;
  if !swigra_enabled then begin
    let from_server, to_server, server_errors = start_swigra_server swigra_path in
    swigra_in := from_server;
    swigra_out := to_server;
    swigra_err := server_errors
  end
80 | 100 | |
(* [stop_servers ()] shuts down the Concraft server and the Swigra server,
   each only when its [*_enabled] flag is set. *)
let stop_servers () =
  if !concraft_enabled then begin
    let pid = !concraft_server_pid in
    stop_concraft_server pid
  end;
  if !swigra_enabled then begin
    let channels = (!swigra_in, !swigra_out, !swigra_err) in
    stop_swigra_server channels
  end
81 | 104 | |
82 | 105 | let read_whole_channel c = |
83 | 106 | let r = ref [] in |
... | ... | @@ -155,18 +178,41 @@ let rec parse_mate_sentence tokens pbeg s = |
155 | 178 | print_endline "parse_mate2 5"; |
156 | 179 | DepSentence[paths1;paths2] |
157 | 180 | |
(* [curl_swigra s] POSTs the sentence [s] as the form field "q" to
   http://localhost:3333/swigra using the external "curl" command, reads and
   discards everything curl prints on stdout, then waits for curl to finish.
   The exit status is ignored.

   The "q=<sentence>" argument is passed through [Filename.quote], so a
   sentence containing an apostrophe or other shell metacharacters can
   neither break the command line nor inject shell commands.

   @raise Not_found if PATH is not set in the environment. *)
let curl_swigra s =
  let command =
    "curl 'http://localhost:3333/swigra' --data-urlencode "
    ^ Filename.quote ("q=" ^ s) in
  let env = [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
  let (curl_in, _, _) as channels = Unix.open_process_full command env in
  (* Drain stdout until EOF so curl is never blocked on a full pipe. *)
  (try
     while true do
       ignore (input_line curl_in)
     done
   with End_of_file -> ());
  ignore (Unix.close_process_full channels)
189 | + | |
(* [print_swigra_xml dir] copies the file [dir ^ "/httpd/forest-disamb.xml"]
   to standard output line by line and closes the file once the end is
   reached.

   @raise Sys_error if the file cannot be opened. *)
let print_swigra_xml dir =
  let forest = open_in (dir ^ "/httpd/forest-disamb.xml") in
  let rec copy_lines () =
    print_endline (input_line forest);
    copy_lines ()
  in
  try copy_lines ()
  with End_of_file -> close_in forest
197 | + | |
(* [parse_swigra_sentence s] sends [s] to the Swigra server, parses the
   forest file [swigra_path ^ "/httpd/forest-disamb.xml"] (presumably written
   by the server in response to the request -- TODO confirm), dumps that file
   to standard output and returns [RawSentence s].

   The parsed XML is not used yet: the sentence is returned unchanged.  The
   parse is kept so that a missing or malformed forest file is still
   reported, but its result is discarded explicitly rather than bound to an
   unused variable. *)
let parse_swigra_sentence s =
  curl_swigra s;
  ignore (Xml.parse_file (swigra_path ^ "/httpd/forest-disamb.xml"));
  print_swigra_xml swigra_path;
  RawSentence s
203 | + | |
(* Orders two (mode, sentence) pairs by their mode alone, delegating to the
   previously defined [compare_mode] on modes (this binding is not recursive
   and shadows it from here on). *)
let compare_mode left right = compare_mode (fst left) (fst right)
159 | 205 | |
160 | 206 | let rec parse_sentence mode tokens pbeg = function |
161 | 207 | RawSentence s -> |
162 | 208 | [Raw,RawSentence s] @ |
163 | 209 | (if !mate_parser_enabled && !concraft_enabled then [Mate,parse_mate_sentence tokens pbeg s] else []) @ |
164 | - (if !swigra_enabled then [Swigra,RawSentence s] else []) @ | |
210 | + (if !swigra_enabled then [Swigra,parse_swigra_sentence s] else []) @ | |
165 | 211 | (if !polfie_enabled then [POLFIE,RawSentence s] else []) |
166 | 212 | | StructSentence(paths,last) -> [mode,StructSentence(paths,last)] |
167 | 213 | | DepSentence(paths) -> [mode,DepSentence paths] |
168 | 214 | | QuotedSentences sentences -> |
169 | - let sentences =Xlist.rev_map sentences (fun p -> | |
215 | + let sentences = Xlist.rev_map sentences (fun p -> | |
170 | 216 | let sentence = parse_sentence mode tokens p.beg p.sentence in (* FIXME: p.pbeg czy pbeg *) |
171 | 217 | let sentence = match sentence with |
172 | 218 | [_,s] -> s |
... | ... |
integration/makefile
... | ... | @@ -29,6 +29,10 @@ eniam-integration.cmxa: $(SOURCES) |
29 | 29 | test: test.ml |
30 | 30 | $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml |
31 | 31 | |
32 | +# swigra_test: swigra_test.ml | |
33 | +# ocamlopt -o swigra_test $(OCAMLOPTFLAGS) swigra_test.ml | |
34 | + | |
35 | + | |
32 | 36 | .SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx |
33 | 37 | |
34 | 38 | .mll.ml: |
... | ... |
integration/test.ml
... | ... | @@ -19,16 +19,17 @@ |
19 | 19 | |
20 | 20 | let test_strings = [ |
21 | 21 | "Szpak frunie."; |
22 | - "Kot np. miauczy."; | |
22 | + (* "Kot np. miauczy."; | |
23 | 23 | "Ala ma kota."; |
24 | 24 | "Ale mają kota:"; |
25 | 25 | "Szpak frunie. Kot miauczy."; |
26 | - "Szpak powiedział: „Frunę. Kiszę.”"; | |
26 | + "Szpak powiedział: „Frunę. Kiszę.”"; *) | |
27 | 27 | ] |
28 | 28 | |
29 | 29 | let _ = |
30 | - ENIAMpreIntegration.concraft_enabled := true; | |
31 | - ENIAMpreIntegration.mate_parser_enabled := true; | |
30 | + (* ENIAMpreIntegration.concraft_enabled := true; | |
31 | + ENIAMpreIntegration.mate_parser_enabled := true; *) | |
32 | + ENIAMpreIntegration.swigra_enabled := true; | |
32 | 33 | ENIAMsubsyntax.initialize (); |
33 | 34 | ENIAMpreIntegration.initialize (); |
34 | 35 | print_endline "Testy wbudowane"; |
... | ... | @@ -50,5 +51,5 @@ let _ = |
50 | 51 | print_endline "Wpisz tekst i naciśnij ENTER, pusty tekst kończy."; |
51 | 52 | s := read_line () |
52 | 53 | done;*) |
53 | - ENIAMpreIntegration.stop_server !ENIAMpreIntegration.concraft_server_pid; | |
54 | + ENIAMpreIntegration.stop_servers (); | |
54 | 55 | () |
... | ... |
tokenizer/ENIAMpatterns.ml
... | ... | @@ -394,7 +394,9 @@ let digit_patterns4 = [ |
394 | 394 | |
395 | 395 | let url_patterns1 = [ |
396 | 396 | [L; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url")); |
397 | + [L; D "dig"; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url")); | |
397 | 398 | [L; S "."; L; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url")); |
399 | + [L; S "."; L; D "dig"; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url")); | |
398 | 400 | [L; S "."; L; S "."; L; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url")); |
399 | 401 | [L; S "."; L; S "."; L; S "."; L; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url")); |
400 | 402 | [L; S "."; L; S "."; L; S "."; L; S "."; L; S "."; O "pl"], (function l -> Dig(concat_orths2 l,"url")); |
... | ... | @@ -447,9 +449,8 @@ let url_patterns2 = [ |
447 | 449 | [L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email")); |
448 | 450 | [L; S "."; L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email")); |
449 | 451 | [L; S "."; L; S "."; L; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email")); |
450 | - [L; D "dig"; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email")); | |
452 | + [L; D "intnum"; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email")); | |
451 | 453 | [L; S "."; L; D "dig"; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email")); |
452 | - [L; S "-"; D "dig"; S "@"; D "url"], (function l -> Dig(concat_orths2 l,"email")); | |
453 | 454 | [O "http"; S ":"; S "/"; S "/"; D "url"], (function l -> Dig(concat_orths2 l,"url")); |
454 | 455 | [O "https"; S ":"; S "/"; S "/"; D "url"], (function l -> Dig(concat_orths2 l,"url")); |
455 | 456 | ] |
... | ... |
tokenizer/ENIAMtokens.ml
... | ... | @@ -381,7 +381,7 @@ let recognize_stem poss_s_beg has_sufix i letters = |
381 | 381 | else |
382 | 382 | if Xlist.size letters = 1 then |
383 | 383 | if first_capital letters then Variant[ |
384 | - Token{t with token=SmallLetter orth; weight=cs_weight; attrs=CS :: t.attrs}; | |
384 | + Token{t with token=SmallLetter(merge (lowercase_first letters)); weight=cs_weight; attrs=CS :: t.attrs}; | |
385 | 385 | Token{t with token=CapLetter(orth,merge (lowercase_first letters)); attrs=MaybeCS :: t.attrs}] |
386 | 386 | else Token{t with token=SmallLetter orth} |
387 | 387 | else |
... | ... | @@ -790,6 +790,14 @@ let rec recognize_sign_group poss_s_beg i = function |
790 | 790 | Seq[Token{empty_token_env with orth=":";beg=i;len=factor;next=i+factor;token=Interp ":"; attrs=[MaybeCS]}; |
791 | 791 | Token{empty_token_env with orth="]";beg=i+factor;len=factor;next=i+2*factor;token=Interp "]"; attrs=[MaybeCS]}]],i2,l,false |
792 | 792 | | (Sign ";") :: (Sign "]") :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign "]") :: []) l (make_lemma (";]","sinterj")) |
793 | + | (Sign ":") :: (Capital("P",_)) :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "P") :: []) l (make_lemma (":P","sinterj")) | |
794 | + | (Sign ";") :: (Capital("P",_)) :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign "P") :: []) l (make_lemma (";P","sinterj")) | |
795 | + | (Sign ":") :: (Sign "-") :: (Capital("P",_)) :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "-") :: (Sign "P") :: []) l (make_lemma (":-P","sinterj")) | |
796 | + | (Sign ";") :: (Sign "-") :: (Capital("P",_)) :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign "-") :: (Sign "P") :: []) l (make_lemma (";-P","sinterj")) | |
797 | + | (Sign ":") :: (Capital("D",_)) :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "D") :: []) l (make_lemma (":D","sinterj")) | |
798 | + | (Sign ";") :: (Capital("D",_)) :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign "D") :: []) l (make_lemma (";D","sinterj")) | |
799 | + | (Sign ":") :: (Sign "-") :: (Capital("D",_)) :: l -> create_sign_token poss_s_beg i ((Sign ":") :: (Sign "-") :: (Sign "D") :: []) l (make_lemma (":-D","sinterj")) | |
800 | + | (Sign ";") :: (Sign "-") :: (Capital("D",_)) :: l -> create_sign_token poss_s_beg i ((Sign ";") :: (Sign "-") :: (Sign "D") :: []) l (make_lemma (";-D","sinterj")) | |
793 | 801 | | (Sign "]") :: l -> create_sign_token poss_s_beg i [Sign "]"] l (Interp "]") |
794 | 802 | | (Sign "[") :: l -> create_sign_token poss_s_beg i [Sign "["] l (Interp "[") |
795 | 803 | | (Sign ":") :: l -> |
... | ... | @@ -983,7 +991,7 @@ let rec recognize_sign_group poss_s_beg i = function |
983 | 991 | | (Sign "^") :: (Sign "^") :: l -> create_sign_token poss_s_beg i [Sign "^";Sign "^"] l (make_lemma ("^^","sinterj")) |
984 | 992 | | (Sign "^") :: l -> create_sign_token poss_s_beg i [Sign "^"] l (Symbol "^") |
985 | 993 | | (Sign "|") :: l -> create_sign_token poss_s_beg i [Sign "|"] l (Symbol "|") |
986 | - | (Sign "&") :: l -> | |
994 | + | (Sign "&") :: l -> | |
987 | 995 | let t,i = create_empty_sign_token i [Sign "&"] in |
988 | 996 | Variant[Token{t with token=Symbol "&"};Token{t with token=make_lemma ("&","conj")}],i,l,false |
989 | 997 | | (Sign "=") :: l -> create_sign_token poss_s_beg i [Sign "="] l (Symbol "=") |
... | ... |
tokenizer/test.ml
... | ... | @@ -68,15 +68,21 @@ let test_strings = [ |
68 | 68 | "( Głosujmy !)"; |
69 | 69 | "„Dialog”"; *) |
70 | 70 | (* "x br." *) |
71 | - "ponad 388 tys. ludzi"; | |
72 | - "ponad 388 tys. km.2"; | |
71 | + (* "ponad 388 tys. ludzi"; | |
72 | + "ponad 388 tys. km.2"; *) | |
73 | 73 | (* "(PTTK Żyrardów, tel. 0-46 855-45-26)"; |
74 | 74 | "40-045 Katowice, ul. Astrów 7, tel. (032) 51 30 86, tel. i faks 51 86 28, 517 193, 518 609"; *) |
75 | - "przeciętnie 7,5 tys. kibiców"; | |
75 | + (* "przeciętnie 7,5 tys. kibiców"; | |
76 | 76 | "0,4mln"; |
77 | 77 | "8,8665tys."; |
78 | 78 | "70-75 tys."; |
79 | - "70-75tys."; | |
79 | + "70-75tys."; *) | |
80 | + (* "myjni \"A-Car Auto\" Myjnia"; *) | |
81 | + "nowaka@lp2.pl"; | |
82 | + (* "poufale :P) Im"; *) | |
83 | + (*"Piłsudskiego 12 A Konstancin-Jeziorna"; | |
84 | + "Mary Mary"; | |
85 | + "TTTTTTTK TTTTTTTK"; *) | |
80 | 86 | ] |
81 | 87 | |
82 | 88 | let _ = |
... | ... |