Commit e2dcc521c3ca72dd05c4b650e2fcc82b10ae77f7

Authored by Wojciech Jaworski
1 parent 62409dbd

integracja CONLL z pre i parserem

.gitignore
... ... @@ -2,4 +2,5 @@
2 2 *.cm[oix]
3 3 *.o
4 4 *.a
5   -*.cmxa
6 5 \ No newline at end of file
  6 +*.cmxa
  7 +.DS_Store
7 8 \ No newline at end of file
... ...
corpora/CONLL.ml
... ... @@ -17,7 +17,7 @@ let string_of_sentence sentence =
17 17 RawSentence text -> failwith ("string_of_sentence: " ^ text)
18 18 | StructSentence (tokens, n) -> String.concat "\n" @@ List.map (fun x -> string_of_token x) tokens
19 19 | ORSentence (_,_,_,_) -> failwith ("string_of_sentence: ORSentence")
20   - | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts
  20 + | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts
21 21 then pom (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
22 22 else failwith ("string_of_sentence: no CONLL mode in AltSentence") in
23 23 (if sentence.pid = ""
... ... @@ -88,7 +88,7 @@ let rec establish_for_token i res text = function
88 88 h :: t -> if Xstring.check_prefix " " text
89 89 then establish_for_token (i+1) res (Xstring.cut_prefix " " text) (h :: t)
90 90 else if Xstring.check_prefix h.orth text
91   - then
  91 + then
92 92 let n = List.length @@ Xunicode.utf8_chars_of_utf8_string h.orth in
93 93 let n_h = {h with beg = i ; len = n} in
94 94 establish_for_token (i+n) (n_h :: res) (Xstring.cut_prefix h.orth text) t
... ... @@ -100,10 +100,10 @@ let rec establish_lengths text = function
100 100 | StructSentence (tokens, n) -> let pbeg, plen, rev_tokens = establish_for_token 0 [] text tokens in
101 101 pbeg, plen, StructSentence (List.rev rev_tokens, n)
102 102 | ORSentence (_,_,_,_) -> failwith ("establish_lengths: ORSentence")
103   - | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts
  103 + | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts
104 104 then establish_lengths text (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
105 105 else failwith ("establish_lengths: no CONLL mode in AltSentence")
106   -
  106 +
107 107  
108 108 (******************)
109 109  
... ... @@ -126,16 +126,17 @@ let match_sentence sentence =
126 126 RawSentence text -> failwith ("match_sentence: " ^ text)
127 127 | StructSentence (tokens, n) -> String.concat " " @@ List.map (fun x -> x.orth) tokens
128 128 | ORSentence (_,_,_,_) -> failwith ("match_sentence: ORSentence")
129   - | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts
  129 + | AltSentence alts -> if List.exists (fun (mode, s) -> mode = CONLL) alts
130 130 then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
131 131 else failwith ("match_sentence: no CONLL mode in AltSentence") in
132 132 let info_token = info_token sentence.psentence in
133   - try
  133 + (* try *)
134 134 let id, text = StringMap.find info_map info_token in
135 135 let pbeg, plen, n_sentence = establish_lengths text sentence.psentence (* -1, -1, sentence.psentence *) in
136   - {pid = sentence.pid; pbeg = pbeg; plen = plen; psentence = AltSentence[Raw, RawSentence text; CONLL, n_sentence]}
  136 + AltText[Raw,RawText text;CONLL,StructText([StructParagraph([{pid = sentence.pid; pbeg = pbeg; plen = plen;
  137 + psentence = AltSentence[Raw, RawSentence text; CONLL, n_sentence]}],-1)])]
137 138 (* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *)
138   - with _ -> sentence
  139 + (* with _ -> sentence *)
139 140  
140 141 let match_corpus corpus =
141 142 Xlist.map corpus match_sentence
... ...
parser/exec.ml
... ... @@ -113,12 +113,20 @@ let process_text = function
113 113 | PreTypes.RawText "" -> [],0,0
114 114 | _ -> failwith "process_text"
115 115  
116   -let process_query ic oc timeout test_only_flag id query max_n =
  116 +let rec extract_query_text = function
  117 + PreTypes.RawText s -> s
  118 + | PreTypes.AltText l -> (try extract_query_text (Xlist.assoc l PreTypes.Raw) with Not_found -> failwith "extract_query_text")
  119 + | _ -> failwith "extract_query_text"
  120 +
  121 +let process_query ic oc timeout test_only_flag id full_query max_n =
  122 + print_endline "process_query 0";
  123 + let query = extract_query_text full_query in
117 124 let result = {empty_result with query=query} in
118 125 let time1 = time_fun () in
119 126 (* Printf.fprintf oc "%s\n%!" query; *)
120 127 print_endline "process_query 1";
121   - Marshal.to_channel oc (PreTypes.RawText query) [];
  128 + (* Marshal.to_channel oc (PreTypes.RawText query) []; *)
  129 + Marshal.to_channel oc query [];
122 130 flush oc;
123 131 print_endline "process_query 2";
124 132 let text,msg,pre_time1 = (Marshal.from_channel ic : PreTypes.text * string * float) in
... ... @@ -296,7 +304,7 @@ let generate_queries_id filename timeout =
296 304 [id;query] -> id,(query,timeout)
297 305 | _ -> failwith ("generate_queries_id: " ^ line)))
298 306  
299   -let test_process_file filename output_filename timeout =
  307 +(*let test_process_file filename output_filename timeout =
300 308 let queries = generate_queries filename timeout in
301 309 let ic,oc = Unix.open_connection (get_sock_addr Paths.pre_host Paths.pre_port) in
302 310 File.file_out output_filename (fun file ->
... ... @@ -324,4 +332,4 @@ let process_file_id filename output_filename timeout =
324 332 ());
325 333 Printf.fprintf oc "\n%!";
326 334 let _ = Unix.shutdown_connection ic in
327   - ()
  335 + ()*)
... ...
parser/makefile
1 1 OCAMLC=ocamlc
2 2 OCAMLOPT=ocamlopt
3 3 OCAMLDEP=ocamldep
4   -INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I ../../../Dropbox/lib/latexvis -I ../lib/xt -I ../../../Dropbox/Clarin-pl/podzadania/nkjp/fold_text -I ../podzadania/morfeusz -I ../pre
  4 +INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I ../../../Dropbox/lib/latexvis -I ../lib/xt -I ../../../Dropbox/Clarin-pl/podzadania/nkjp/fold_text -I ../podzadania/morfeusz -I ../pre -I ../corpora
5 5 #INCLUDES=-I +xml-light -I +xlib -I ../pre
6 6 OCAMLFLAGS=$(INCLUDES) -g
7 7 OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa latexvis.cmxa #nkjp.cmxa
8 8 #OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa xlib.cmxa
9 9  
10   -PRE= ../pre/paths.ml ../pre/walTypes.ml ../pre/preTypes.ml ../pre/walStringOf.ml
  10 +PRE= ../pre/paths.ml ../pre/walTypes.ml ../pre/preTypes.ml ../pre/walStringOf.ml ../corpora/CONLL.ml
11 11 LCG= LCGtypes.ml LCGstringOf.ml LCGrules.ml LCGrenderer.ml LCGchart.ml LCGlatexOf.ml LCGreductions.ml LCGlexicon.ml LCGvalence.ml
12 12 #LCG= LCGtypes.ml LCGstringOf.ml LCGrules.ml LCGrenderer.ml LCGchart.ml LCGreductions.ml LCGlexicon.ml LCGvalence.ml
13 13 DISAMB= disambSelPref.ml disambLemma.ml
... ...
parser/pipe.ml
... ... @@ -46,7 +46,7 @@ let simple_disambiguate (paths,last) =
46 46  
47 47 let lcg_process query =
48 48 let ic,oc = Unix.open_connection (get_sock_addr Paths.pre_host Paths.pre_port) in
49   - let result = Exec.process_query ic oc 3000. false "x" query 10 in
  49 + let result = Exec.process_query ic oc 3000. false "x" (PreTypes.RawText query) 10 in
50 50 Exec.print_result stdout result;
51 51 Visualization.print_paths "results/" "paths" result.paths;
52 52 Visualization.print_paths_latex "paths" result.paths;
... ... @@ -137,7 +137,7 @@ let lcg_process_file filename result_path result_name =
137 137 incr id;
138 138 let query = List.hd (Str.split (Str.regexp "\t") query) in
139 139 print_endline query;
140   - let result = Exec.process_query ic oc 3000. false "x" query 10 in
  140 + let result = Exec.process_query ic oc 3000. false "x" (PreTypes.RawText query) 10 in
141 141 (* LCGexec.print_result stdout result; *)
142 142 if result.status = Parsed then
143 143 Visualization.print_graph2 result_path (result_name ^ string_of_int !id) query result.sem3)
... ... @@ -145,14 +145,37 @@ let lcg_process_file filename result_path result_name =
145 145 (* let _ = lcg_process_file "data/testy_podstawowe_rob.txt" "results/testy_podstawowe/" "test" *)
146 146 (* let _ = lcg_process_file "data/zdania_testowe.txt" "zdania_testowe" *)
147 147  
148   -let _ =
  148 +(* let _ =
149 149 if Array.length Sys.argv < 2 then print_endline "missing argument" else
150   - lcg_process Sys.argv.(1)
  150 + lcg_process Sys.argv.(1) *)
151 151  
152 152 (* let _ = Exec.test_process_file "data/testy_podstawowe.txt" "results/testy_podstawowe.eff" 100. *)
153 153 (* let _ = LCGexec.test_process_file "data/sentences-składnica.txt" "results/sentences-składnica.eff" 100. *)
154 154 (* let _ = LCGexec.process_file_id "data/sentences-składnica-with-trees.tab" "results/sentences-składnica-with-trees.eff" 100. *)
155 155  
  156 +(* Przetwarzanie korpusów w formacie CONLL *)
  157 +
  158 +let process_conll_corpus filename =
  159 + let corpus = File.file_in filename (fun file -> CONLL.match_corpus (CONLL.load_corpus file)) in
  160 + print_endline "process_conll_corpus";
  161 + (* Xlist.iter corpus (fun sentence -> print_endline (CONLL.string_of_sentence sentence)); *)
  162 + let ic,oc = Unix.open_connection (get_sock_addr Paths.pre_host Paths.pre_port) in
  163 + Xlist.iter corpus (fun query ->
  164 + let result = Exec.process_query ic oc 3000. false "x" query 10 in
  165 + Exec.print_result stdout result;
  166 + Visualization.print_paths "results/" "paths" result.paths;
  167 + ());
  168 + Marshal.to_channel oc (PreTypes.RawText "") [];
  169 + flush oc;
  170 + let _ = Unix.shutdown_connection ic in
  171 + ()
  172 +
  173 +let _ =
  174 + process_conll_corpus "../../NLP resources/Skladnica-zaleznosciowa-mod_130121.conll";
  175 + ()
  176 +
  177 +
  178 +
156 179 let has_pos pos (paths,_,_) =
157 180 Xlist.fold paths false (fun b (_,_,t) ->
158 181 match t.PreTypes.token with
... ...
pre/preProcessing.ml
... ... @@ -585,6 +585,31 @@ let split_into_sentences par paths last next_id =
585 585 PreSentences.extract_sentences par (paths,last), next_id
586 586 (* [{pid="";pbeg=(-1); plen=(-1); psentence=StructSentence(paths,last,next_id)}] *)
587 587  
  588 + let parse_conll paths =
  589 + let paths = PreMWE.process paths in
  590 + (* print_endline "a12"; *)
  591 + let paths = find_proper_names paths in
  592 + (* print_endline "a13"; *)
  593 + let paths = modify_weights paths in
  594 + let paths = PreWordnet.assign_senses paths in
  595 + (* print_endline "a14"; *)
  596 + let paths = combine_interps paths in (* FIXME: to powinno też działać dla Proper *)
  597 + (* print_endline "a15"; *)
  598 + let paths = assign_valence paths in
  599 + (* print_endline "a16"; *)
  600 + let paths = disambiguate_senses paths in
  601 + let paths = assign_simplified_valence paths in
  602 + let paths = PreSemantics.assign_semantics paths in
  603 + (* print_endline "a16"; *)
  604 + let paths = select_tokens paths in
  605 + (* print_endline "a17"; *)
  606 + (* let paths = if !single_sense_flag then single_sense paths else paths in
  607 + let paths = if !single_frame_flag then single_frame paths else paths in*)
  608 + (* let paths, next_id = add_ids paths in *) (* FIXME: jak powiązać id z connl z tymi z pre *)
  609 + let paths = prepare_indexes paths in
  610 + (* print_endline "a18"; *)
  611 + paths, (*next_id*) -1
  612 +
588 613 let parse_text = function
589 614 RawText query ->
590 615 print_endline query;
... ... @@ -592,6 +617,14 @@ let parse_text = function
592 617 let (paths,last : PreTypes.token_record list * int), next_id = parse par in
593 618 let sentences, next_id = split_into_sentences par paths last next_id in
594 619 AltParagraph[Raw,RawParagraph par; Struct,StructParagraph(sentences,next_id)]))]
  620 + | AltText[Raw,RawText query;CONLL,StructText[
  621 + StructParagraph([{psentence = AltSentence[Raw, RawSentence text; CONLL, StructSentence(paths,last)]} as p],_)]] ->
  622 + let (paths,last), next_id = parse_conll (paths,last) in
  623 + let conll = StructText[StructParagraph([{p with psentence = AltSentence[Raw, RawSentence text; CONLL, StructSentence(paths,last)]}],next_id)] in
  624 + let (paths,last), next_id = parse query in
  625 + let sentences, next_id = split_into_sentences query paths last next_id in
  626 + let struct_par = StructText[StructParagraph(sentences,next_id)] in
  627 + AltText[Raw,RawText query; Struct, struct_par; CONLL, conll]
595 628 | _ -> failwith "parse_text: not implemented"
596 629  
597 630 let rec main_loop in_chan out_chan =
... ...