Commit e2dcc521c3ca72dd05c4b650e2fcc82b10ae77f7 (1 parent: 62409dbd)
integracja CONLL z pre i parserem
Showing 6 changed files with 85 additions and 19 deletions
.gitignore
corpora/CONLL.ml
... | ... | @@ -17,7 +17,7 @@ let string_of_sentence sentence = |
17 | 17 | RawSentence text -> failwith ("string_of_sentence: " ^ text) |
18 | 18 | | StructSentence (tokens, n) -> String.concat "\n" @@ List.map (fun x -> string_of_token x) tokens |
19 | 19 | | ORSentence (_,_,_,_) -> failwith ("string_of_sentence: ORSentence") |
20 | - | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts | |
20 | + | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts | |
21 | 21 | then pom (snd (List.find (fun (mode, s) -> mode = CONLL) alts)) |
22 | 22 | else failwith ("string_of_sentence: no CONLL mode in AltSentence") in |
23 | 23 | (if sentence.pid = "" |
... | ... | @@ -88,7 +88,7 @@ let rec establish_for_token i res text = function |
88 | 88 | h :: t -> if Xstring.check_prefix " " text |
89 | 89 | then establish_for_token (i+1) res (Xstring.cut_prefix " " text) (h :: t) |
90 | 90 | else if Xstring.check_prefix h.orth text |
91 | - then | |
91 | + then | |
92 | 92 | let n = List.length @@ Xunicode.utf8_chars_of_utf8_string h.orth in |
93 | 93 | let n_h = {h with beg = i ; len = n} in |
94 | 94 | establish_for_token (i+n) (n_h :: res) (Xstring.cut_prefix h.orth text) t |
... | ... | @@ -100,10 +100,10 @@ let rec establish_lengths text = function |
100 | 100 | | StructSentence (tokens, n) -> let pbeg, plen, rev_tokens = establish_for_token 0 [] text tokens in |
101 | 101 | pbeg, plen, StructSentence (List.rev rev_tokens, n) |
102 | 102 | | ORSentence (_,_,_,_) -> failwith ("establish_lengths: ORSentence") |
103 | - | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts | |
103 | + | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts | |
104 | 104 | then establish_lengths text (snd (List.find (fun (mode, s) -> mode = CONLL) alts)) |
105 | 105 | else failwith ("establish_lengths: no CONLL mode in AltSentence") |
106 | - | |
106 | + | |
107 | 107 | |
108 | 108 | (******************) |
109 | 109 | |
... | ... | @@ -126,16 +126,17 @@ let match_sentence sentence = |
126 | 126 | RawSentence text -> failwith ("match_sentence: " ^ text) |
127 | 127 | | StructSentence (tokens, n) -> String.concat " " @@ List.map (fun x -> x.orth) tokens |
128 | 128 | | ORSentence (_,_,_,_) -> failwith ("match_sentence: ORSentence") |
129 | - | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts | |
129 | + | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts | |
130 | 130 | then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts)) |
131 | 131 | else failwith ("match_sentence: no CONLL mode in AltSentence") in |
132 | 132 | let info_token = info_token sentence.psentence in |
133 | - try | |
133 | + (* try *) | |
134 | 134 | let id, text = StringMap.find info_map info_token in |
135 | 135 | let pbeg, plen, n_sentence = establish_lengths text sentence.psentence (* -1, -1, sentence.psentence *) in |
136 | - {pid = sentence.pid; pbeg = pbeg; plen = plen; psentence = AltSentence[Raw, RawSentence text; CONLL, n_sentence]} | |
136 | + AltText[Raw,RawText text;CONLL,StructText([StructParagraph([{pid = sentence.pid; pbeg = pbeg; plen = plen; | |
137 | + psentence = AltSentence[Raw, RawSentence text; CONLL, n_sentence]}],-1)])] | |
137 | 138 | (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *) |
138 | - with _ -> sentence | |
139 | + (* with _ -> sentence *) | |
139 | 140 | |
140 | 141 | let match_corpus corpus = |
141 | 142 | Xlist.map corpus match_sentence |
... | ... |
parser/exec.ml
... | ... | @@ -113,12 +113,20 @@ let process_text = function |
113 | 113 | | PreTypes.RawText "" -> [],0,0 |
114 | 114 | | _ -> failwith "process_text" |
115 | 115 | |
116 | -let process_query ic oc timeout test_only_flag id query max_n = | |
116 | +let rec extract_query_text = function | |
117 | + PreTypes.RawText s -> s | |
118 | + | PreTypes.AltText l -> (try extract_query_text (Xlist.assoc l PreTypes.Raw) with Not_found -> failwith "extract_query_text") | |
119 | + | _ -> failwith "extract_query_text" | |
120 | + | |
121 | +let process_query ic oc timeout test_only_flag id full_query max_n = | |
122 | + print_endline "process_query 0"; | |
123 | + let query = extract_query_text full_query in | |
117 | 124 | let result = {empty_result with query=query} in |
118 | 125 | let time1 = time_fun () in |
119 | 126 | (* Printf.fprintf oc "%s\n%!" query; *) |
120 | 127 | print_endline "process_query 1"; |
121 | - Marshal.to_channel oc (PreTypes.RawText query) []; | |
128 | + (* Marshal.to_channel oc (PreTypes.RawText query) []; *) | |
129 | + Marshal.to_channel oc query []; | |
122 | 130 | flush oc; |
123 | 131 | print_endline "process_query 2"; |
124 | 132 | let text,msg,pre_time1 = (Marshal.from_channel ic : PreTypes.text * string * float) in |
... | ... | @@ -296,7 +304,7 @@ let generate_queries_id filename timeout = |
296 | 304 | [id;query] -> id,(query,timeout) |
297 | 305 | | _ -> failwith ("generate_queries_id: " ^ line))) |
298 | 306 | |
299 | -let test_process_file filename output_filename timeout = | |
307 | +(*let test_process_file filename output_filename timeout = | |
300 | 308 | let queries = generate_queries filename timeout in |
301 | 309 | let ic,oc = Unix.open_connection (get_sock_addr Paths.pre_host Paths.pre_port) in |
302 | 310 | File.file_out output_filename (fun file -> |
... | ... | @@ -324,4 +332,4 @@ let process_file_id filename output_filename timeout = |
324 | 332 | ()); |
325 | 333 | Printf.fprintf oc "\n%!"; |
326 | 334 | let _ = Unix.shutdown_connection ic in |
327 | - () | |
335 | + ()*) | |
... | ... |
parser/makefile
1 | 1 | OCAMLC=ocamlc |
2 | 2 | OCAMLOPT=ocamlopt |
3 | 3 | OCAMLDEP=ocamldep |
4 | -INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I ../../../Dropbox/lib/latexvis -I ../lib/xt -I ../../../Dropbox/Clarin-pl/podzadania/nkjp/fold_text -I ../podzadania/morfeusz -I ../pre | |
4 | +INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I ../../../Dropbox/lib/latexvis -I ../lib/xt -I ../../../Dropbox/Clarin-pl/podzadania/nkjp/fold_text -I ../podzadania/morfeusz -I ../pre -I ../corpora | |
5 | 5 | #INCLUDES=-I +xml-light -I +xlib -I ../pre |
6 | 6 | OCAMLFLAGS=$(INCLUDES) -g |
7 | 7 | OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa latexvis.cmxa #nkjp.cmxa |
8 | 8 | #OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa xlib.cmxa |
9 | 9 | |
10 | -PRE= ../pre/paths.ml ../pre/walTypes.ml ../pre/preTypes.ml ../pre/walStringOf.ml | |
10 | +PRE= ../pre/paths.ml ../pre/walTypes.ml ../pre/preTypes.ml ../pre/walStringOf.ml ../corpora/CONLL.ml | |
11 | 11 | LCG= LCGtypes.ml LCGstringOf.ml LCGrules.ml LCGrenderer.ml LCGchart.ml LCGlatexOf.ml LCGreductions.ml LCGlexicon.ml LCGvalence.ml |
12 | 12 | #LCG= LCGtypes.ml LCGstringOf.ml LCGrules.ml LCGrenderer.ml LCGchart.ml LCGreductions.ml LCGlexicon.ml LCGvalence.ml |
13 | 13 | DISAMB= disambSelPref.ml disambLemma.ml |
... | ... |
parser/pipe.ml
... | ... | @@ -46,7 +46,7 @@ let simple_disambiguate (paths,last) = |
46 | 46 | |
47 | 47 | let lcg_process query = |
48 | 48 | let ic,oc = Unix.open_connection (get_sock_addr Paths.pre_host Paths.pre_port) in |
49 | - let result = Exec.process_query ic oc 3000. false "x" query 10 in | |
49 | + let result = Exec.process_query ic oc 3000. false "x" (PreTypes.RawText query) 10 in | |
50 | 50 | Exec.print_result stdout result; |
51 | 51 | Visualization.print_paths "results/" "paths" result.paths; |
52 | 52 | Visualization.print_paths_latex "paths" result.paths; |
... | ... | @@ -137,7 +137,7 @@ let lcg_process_file filename result_path result_name = |
137 | 137 | incr id; |
138 | 138 | let query = List.hd (Str.split (Str.regexp "\t") query) in |
139 | 139 | print_endline query; |
140 | - let result = Exec.process_query ic oc 3000. false "x" query 10 in | |
140 | + let result = Exec.process_query ic oc 3000. false "x" (PreTypes.RawText query) 10 in | |
141 | 141 | (* LCGexec.print_result stdout result; *) |
142 | 142 | if result.status = Parsed then |
143 | 143 | Visualization.print_graph2 result_path (result_name ^ string_of_int !id) query result.sem3) |
... | ... | @@ -145,14 +145,37 @@ let lcg_process_file filename result_path result_name = |
145 | 145 | (* let _ = lcg_process_file "data/testy_podstawowe_rob.txt" "results/testy_podstawowe/" "test" *) |
146 | 146 | (* let _ = lcg_process_file "data/zdania_testowe.txt" "zdania_testowe" *) |
147 | 147 | |
148 | -let _ = | |
148 | +(* let _ = | |
149 | 149 | if Array.length Sys.argv < 2 then print_endline "missing argument" else |
150 | - lcg_process Sys.argv.(1) | |
150 | + lcg_process Sys.argv.(1) *) | |
151 | 151 | |
152 | 152 | (* let _ = Exec.test_process_file "data/testy_podstawowe.txt" "results/testy_podstawowe.eff" 100. *) |
153 | 153 | (* let _ = LCGexec.test_process_file "data/sentences-składnica.txt" "results/sentences-składnica.eff" 100. *) |
154 | 154 | (* let _ = LCGexec.process_file_id "data/sentences-składnica-with-trees.tab" "results/sentences-składnica-with-trees.eff" 100. *) |
155 | 155 | |
156 | +(* Przetwarzanie korpusów w formacie CONLL *) | |
157 | + | |
158 | +let process_conll_corpus filename = | |
159 | + let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in | |
160 | + print_endline "process_conll_corpus"; | |
161 | + (* Xlist.iter corpus (fun sentence -> print_endline (CONLL.string_of_sentence sentence)); *) | |
162 | + let ic,oc = Unix.open_connection (get_sock_addr Paths.pre_host Paths.pre_port) in | |
163 | + Xlist.iter corpus (fun query -> | |
164 | + let result = Exec.process_query ic oc 3000. false "x" query 10 in | |
165 | + Exec.print_result stdout result; | |
166 | + Visualization.print_paths "results/" "paths" result.paths; | |
167 | + ()); | |
168 | + Marshal.to_channel oc (PreTypes.RawText "") []; | |
169 | + flush oc; | |
170 | + let _ = Unix.shutdown_connection ic in | |
171 | + () | |
172 | + | |
173 | +let _ = | |
174 | + process_conll_corpus "../../NLP resources/Skladnica-zaleznosciowa-mod_130121.conll"; | |
175 | + () | |
176 | + | |
177 | + | |
178 | + | |
156 | 179 | let has_pos pos (paths,_,_) = |
157 | 180 | Xlist.fold paths false (fun b (_,_,t) -> |
158 | 181 | match t.PreTypes.token with |
... | ... |
pre/preProcessing.ml
... | ... | @@ -585,6 +585,31 @@ let split_into_sentences par paths last next_id = |
585 | 585 | PreSentences.extract_sentences par (paths,last), next_id |
586 | 586 | (* [{pid="";pbeg=(-1); plen=(-1); psentence=StructSentence(paths,last,next_id)}] *) |
587 | 587 | |
588 | + let parse_conll paths = | |
589 | + let paths = PreMWE.process paths in | |
590 | + (* print_endline "a12"; *) | |
591 | + let paths = find_proper_names paths in | |
592 | + (* print_endline "a13"; *) | |
593 | + let paths = modify_weights paths in | |
594 | + let paths = PreWordnet.assign_senses paths in | |
595 | + (* print_endline "a14"; *) | |
596 | + let paths = combine_interps paths in (* FIXME: to powinno też działać dla Proper *) | |
597 | + (* print_endline "a15"; *) | |
598 | + let paths = assign_valence paths in | |
599 | + (* print_endline "a16"; *) | |
600 | + let paths = disambiguate_senses paths in | |
601 | + let paths = assign_simplified_valence paths in | |
602 | + let paths = PreSemantics.assign_semantics paths in | |
603 | + (* print_endline "a16"; *) | |
604 | + let paths = select_tokens paths in | |
605 | + (* print_endline "a17"; *) | |
606 | + (* let paths = if !single_sense_flag then single_sense paths else paths in | |
607 | + let paths = if !single_frame_flag then single_frame paths else paths in*) | |
608 | + (* let paths, next_id = add_ids paths in *) (* FIXME: jak powiązać id z connl z tymi z pre *) | |
609 | + let paths = prepare_indexes paths in | |
610 | + (* print_endline "a18"; *) | |
611 | + paths, (*next_id*) -1 | |
612 | + | |
588 | 613 | let parse_text = function |
589 | 614 | RawText query -> |
590 | 615 | print_endline query; |
... | ... | @@ -592,6 +617,14 @@ let parse_text = function |
592 | 617 | let (paths,last : PreTypes.token_record list * int), next_id = parse par in |
593 | 618 | let sentences, next_id = split_into_sentences par paths last next_id in |
594 | 619 | AltParagraph[Raw,RawParagraph par; Struct,StructParagraph(sentences,next_id)]))] |
620 | + | AltText[Raw,RawText query;CONLL,StructText[ | |
621 | + StructParagraph([{psentence = AltSentence[Raw, RawSentence text; CONLL, StructSentence(paths,last)]} as p],_)]] -> | |
622 | + let (paths,last), next_id = parse_conll (paths,last) in | |
623 | + let conll = StructText[StructParagraph([{p with psentence = AltSentence[Raw, RawSentence text; CONLL, StructSentence(paths,last)]}],next_id)] in | |
624 | + let (paths,last), next_id = parse query in | |
625 | + let sentences, next_id = split_into_sentences query paths last next_id in | |
626 | + let struct_par = StructText[StructParagraph(sentences,next_id)] in | |
627 | + AltText[Raw,RawText query; Struct, struct_par; CONLL, conll] | |
595 | 628 | | _ -> failwith "parse_text: not implemented" |
596 | 629 | |
597 | 630 | let rec main_loop in_chan out_chan = |
... | ... |