Commit 0b6dd720b5d555d212ecb7927d1cf27c27c9acc4

Authored by Wojciech Jaworski
1 parent 5007a782

uruchomienie MateParsera

Showing 2 changed files with 140 additions and 137 deletions
corpora/CONLL.ml
1 1 open Xstd
2 2 open PreTypes
3 3  
(* [alternative_string f mode alts] picks one alternative from the association
   list [alts] and applies [f] to it: the entry tagged with [mode] when one
   exists, otherwise the entry tagged [Struct]. The fallback lookup raises when
   [alts] has no [Struct] entry (Not_found, assuming the standard List.find). *)
4   -let alternative_string f mode alts = if List.exists (fun (m,_) -> mode = m) alts
5   - then f (snd @@ List.find (fun (m,_) -> m = mode) alts)
6   - else f (snd @@ List.find (fun (m,_) -> m = Struct) alts)
7   -
(* [string_of_token mode token id super label] renders one token as text.
   - Raw: the orth of the token.
   - Struct: not supported, fails.
   - CONLL: 10 tab-separated fields: id, orth, lemma, cat, cat, interp,
     "_", "_", beg, len.
   - Mate: 14 tab-separated fields: id, orth, lemma, lemma, cat, cat, interp,
     interp, followed by six "_" placeholders.
   - any other mode: fails with "string_of_token: ni".
   [id] is already a string here. [super] and [label] are accepted but unused.
   The local [decompose_lemma] only accepts [Lemma (a, b, c)]; it yields "_" for
   the interpretation when [c = [[]]], otherwise joins the innermost lists with
   ".", the middle level with "|" and the outer level with "][". *)
8   -let string_of_token mode token id super label =
9   - let decompose_lemma = function
10   - | Lemma(a,b,c) -> a,b,if c = [[]]
11   - then "_"
12   - else String.concat "][" @@ Xlist.map c (fun x ->
13   - String.concat "|" @@ Xlist.map x ( fun y ->
14   - String.concat "." y))
15   - | _ -> failwith ("string_of_token: not Lemma") in
16   - match mode with
17   - | Raw -> token.orth
18   - | Struct -> failwith ("function string_of_token for mode Struct is not defined")
19   - | CONLL -> let lemma,cat,interp = decompose_lemma token.token in
20   - String.concat "\t" [id;
21   - token.orth; lemma; cat; cat; interp; "_"; "_";
22   - string_of_int token.beg; string_of_int token.len]
23   - | Mate -> let lemma,cat,interp = decompose_lemma token.token in
24   - String.concat "\t" [id;
25   - token.orth; lemma; lemma; cat; cat; interp; interp; "_"; "_"; "_"; "_"; "_"; "_"]
26   - | _ -> failwith "string_of_token: ni"
27   -
(* [string_of_paths mode tokens paths] renders every entry of the [paths] array
   (index 0 included) with [string_of_token] and concatenates the results, each
   one preceded by "\n", so a non-empty result starts with a newline. The id
   printed for a row is the token index stored in the path entry. *)
28   -let string_of_paths mode tokens paths =
29   - Array.fold_left (fun acc (id,super,label) ->
30   - acc ^ "\n" ^ (string_of_token mode (ExtArray.get tokens id) (string_of_int id) super label)) "" paths
31   -
(* [string_of_sentence mode tokens sentence] renders a sentence as text.
   RawSentence yields its string only in Raw mode (otherwise ""), DepSentence is
   delegated to [string_of_paths], and AltSentence recurses into the alternative
   chosen by [alternative_string]. StructSentence and QuotedSentences are not
   supported and fail. *)
32   -let rec string_of_sentence mode tokens = function
33   - RawSentence s -> if mode = Raw then s else ""
34   - | StructSentence (_,tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*)
35   - | DepSentence (_, paths) -> string_of_paths mode tokens paths
36   - | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences")
37   - | AltSentence alts -> alternative_string (string_of_sentence mode tokens) mode alts
38   -
(* [string_of_p_record mode tokens p_record] renders the sentence of the record,
   prefixed by the record id and a newline when the id is not empty. *)
39   -let string_of_p_record mode tokens p_record =
40   - (if p_record.pid = "" then "" else p_record.pid ^ "\n") ^
41   - string_of_sentence mode tokens p_record.psentence
42   -
43   -(*let rec string_of_paragraph mode tokens = function
44   - RawParagraph s -> if mode = Raw then s else ""
45   - | StructParagraph p_records -> String.concat "\n\n" @@ Xlist.map p_records (string_of_p_record mode tokens)
46   - | AltParagraph alts -> alternative_string (string_of_paragraph mode) mode alts
47   -
48   -let rec string_of_text mode tokens = function
49   - RawText s -> if mode = Raw then s else ""
50   - | StructText (paragraphs,_) -> String.concat "\n\n" @@ Xlist.map paragraphs (string_of_paragraph mode tokens)
51   - | AltText alts -> alternative_string (string_of_text mode) mode alts*)
52   -
53   -
54   -(******************)
55   -
(* [establish_next tokens paths] fills the [next] field of the tokens in place:
   each token from index 1 to n-2 gets the [beg] of its successor, and the last
   token (index n-1) gets [beg + len]. Index 0 is not touched. [paths] is unused.
   NOTE(review): presumably Int.iter treats both bounds as inclusive, confirm
   against Xstd. With an empty [tokens] the access at index n-1 is index -1. *)
56   -let establish_next tokens paths =
57   - let n = ExtArray.size tokens in
58   - Int.iter 1 (n - 2) (fun i ->
59   - let f = ExtArray.get tokens i in
60   - let s = ExtArray.get tokens (i+1) in
61   - ExtArray.set tokens i {f with next = s.beg});
62   - let last = ExtArray.get tokens (n-1) in
63   - ExtArray.set tokens (n-1) {last with next = last.beg + last.len}
64   -
65   -
66   - (*let rec pom res = function
67   - h :: t -> let next = if res = []
68   - then h.beg+h.len
69   - else (List.hd res).beg in
70   - pom ({h with next = next} :: res) t
71   - | [] -> res in
72   - pom [] rev_tokens*)
73   -
(* [establish_for_token i text tokens entries] walks the path entries and the
   sentence [text] in parallel, writing [beg] and [len] into the tokens in place.
   [i] is the current position. A leading space in [text] advances the position
   by 100 without consuming an entry. When [text] starts with the orth of the
   current token, the token gets [beg = i] and [len = 100 * k], where k is the
   length of the list returned by Xunicode.utf8_chars_of_utf8_string for the orth
   (presumably its number of UTF-8 characters), and the walk continues after it.
   Any other prefix fails. Returns [(100, i)] with the final position once all
   entries are consumed. *)
74   -let rec establish_for_token i text tokens = function
75   - (id,_,_) :: t as l->
76   - let h = ExtArray.get tokens id in
77   - if Xstring.check_prefix " " text
78   - then establish_for_token (i+100) (Xstring.cut_prefix " " text) tokens l
79   - else if Xstring.check_prefix h.orth text
80   - then
81   - let n = (List.length @@ Xunicode.utf8_chars_of_utf8_string h.orth) * 100 in
82   - let n_h = {h with beg = i ; len = n} in
83   - ExtArray.set tokens id n_h;
84   - establish_for_token (i+n) (Xstring.cut_prefix h.orth text) tokens t
85   - else failwith ("establish_for_token :" ^ h.orth ^ " " ^ text)
86   - | [] -> 100, i
87   -
(* [establish_lengths text paths tokens] assigns positions to the tokens named by
   every path entry except the first one (dropped with List.tl), starting at
   position 100, then fills the [next] fields with [establish_next]. Returns the
   start position and the final position minus 100, used by the caller as the
   paragraph beginning and length. The function does not call itself, so the
   [rec] keyword is not needed. *)
88   -let rec establish_lengths text paths tokens =
89   - let pbeg, plen = establish_for_token 100 text tokens (List.tl (Array.to_list paths)) in
90   - establish_next tokens paths;
91   - pbeg, plen-100
92   -
93   -(******************)
94   -
(* Raised by [add_to_map] with the offending record when a record of the info
   file does not consist of exactly three lines. *)
95   -exception ErrorInfoFile of string
96   -
(* Path of the sentence info file, relative to the working directory. *)
97   -let info_file = "../corpora/info_sentences.txt"
98   -
(* Records of the info file, separated by blank lines. This is a top-level
   binding, so the file is read when the module is initialised. *)
99   -let info = Xstring.split "\n\n" @@ File.load_file_gen info_file
100   -
(* [add_to_map map info_str] splits one record into its three lines (id, text,
   info_token) and binds info_token to (id, text) in [map]. *)
101   -let add_to_map map info_str =
102   - match Xstring.split "\n" info_str with
103   - [id; text; info_token] -> StringMap.add map info_token (id, text)
104   - | _ -> raise (ErrorInfoFile info_str)
105   -
(* Map from info_token to (id, text), built from all records in [info]. *)
106   -let info_map =
107   - Xlist.fold info StringMap.empty add_to_map
108   -
(* [match_sentence (p_record, tokens)] tries to attach the raw text and id from
   [info_map] to a dependency sentence.
   The local [info_token] only accepts DepSentence: it builds the lookup key by
   joining with spaces the orths of the tokens named by all path entries except
   the first, and returns it with the paths. Every other sentence kind fails,
   and that failure is outside the [try], so it propagates to the caller.
   On a successful lookup the token positions are computed by
   [establish_lengths] and the result holds a Raw and a CONLL alternative.
   NOTE(review): [with _] catches every exception raised inside the [try], not
   only a failed lookup, so a failure in [establish_lengths] also ends in the
   fallback, which wraps the unchanged [p_record] as a CONLL-only text. *)
109   -let match_sentence (p_record,tokens) =
110   - let rec info_token s = match s with
111   - RawSentence text -> failwith ("match_sentence: " ^ text)
112   - | StructSentence (_, tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*)
113   - | DepSentence (_, paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths
114   - | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences")
115   - | AltSentence alts -> failwith ("match_sentence: AltSentence")
116   - (*if List.exists (fun (mode, s) -> mode = CONLL) alts
117   - then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
118   - else failwith ("match_sentence: no CONLL mode in AltSentence")*) in
119   - let info_token, paths = info_token p_record.psentence in
120   - try
121   - let id, text = StringMap.find info_map info_token in
122   - let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in
123   - AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len;
124   - psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence("", paths)]}]],tokens)]
125   -(* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *)
126   - with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)]
127   -
128   -let match_corpus corpus =
  (* [alternative_string f mode alts] picks one alternative from the association
     list [alts] and applies [f] to it: the entry tagged with [mode] when one
     exists, otherwise the entry tagged [PreTypes.Struct]. The fallback lookup
     raises when [alts] has no [Struct] entry (Not_found, assuming the standard
     List.find). *)
  4 +let alternative_string f mode alts = if List.exists (fun (m,_) -> mode = m) alts
  5 + then f (snd @@ List.find (fun (m,_) -> m = mode) alts)
  6 + else f (snd @@ List.find (fun (m,_) -> m = PreTypes.Struct) alts)
  7 +
  (* [string_of_token mode token conll_id super label] renders one token as text.
     - Raw: the orth of the token.
     - Struct: not supported, fails.
     - CONLL: 10 tab-separated fields: conll_id, orth, lemma, cat, cat, interp,
       "_", "_", beg, len.
     - Mate: 14 tab-separated fields: conll_id, orth, lemma, lemma, cat, cat,
       interp, interp, followed by six "_" placeholders.
     - any other mode: fails with "string_of_token: ni".
     [conll_id] is an int and is converted with string_of_int. [super] and
     [label] are accepted but unused.
     The local [decompose_lemma] only accepts [PreTypes.Lemma (a, b, c)]; it
     yields "_" for the interpretation when [c = [[]]], otherwise joins the
     innermost lists with ".", the middle level with "|" and the outer level
     with "][". The binding [t] in its fallback case is unused. *)
  8 +let string_of_token mode token conll_id super label =
  9 + let decompose_lemma = function
  10 + | PreTypes.Lemma(a,b,c) -> a,b,if c = [[]]
  11 + then "_"
  12 + else String.concat "][" @@ Xlist.map c (fun x ->
  13 + String.concat "|" @@ Xlist.map x ( fun y ->
  14 + String.concat "." y))
  15 + | t -> failwith ("string_of_token: not Lemma") in
  16 + match mode with
  17 + | PreTypes.Raw -> token.PreTypes.orth
  18 + | PreTypes.Struct -> failwith ("function string_of_token for mode Struct is not defined")
  19 + | PreTypes.CONLL -> let lemma,cat,interp = decompose_lemma token.PreTypes.token in
  20 + String.concat "\t" [string_of_int conll_id;
  21 + token.PreTypes.orth; lemma; cat; cat; interp; "_"; "_";
  22 + string_of_int token.PreTypes.beg; string_of_int token.PreTypes.len]
  23 + | PreTypes.Mate -> let lemma,cat,interp = decompose_lemma token.PreTypes.token in
  24 + String.concat "\t" [string_of_int conll_id;
  25 + token.PreTypes.orth; lemma; lemma; cat; cat; interp; interp; "_"; "_"; "_"; "_"; "_"; "_"]
  26 + | _ -> failwith "string_of_token: ni"
  27 +
  (* [string_of_paths mode tokens paths] renders the entries of [paths] from
     index 1 to the last index, one [string_of_token] row per entry, joined with
     "\n" and terminated by "\n\n". Entry 0 is skipped. The id printed for a row
     is the position of the entry in [paths], not the token index stored in it.
     NOTE(review): presumably Int.fold treats both bounds as inclusive, confirm
     against Xstd. Rows are consed in reverse, hence the List.rev. *)
  28 +let string_of_paths mode tokens paths =
  29 + let l = Int.fold 1 (Array.length paths - 1) [] (fun l conll_id ->
  30 + let id,super,label = paths.(conll_id) in
  31 + (string_of_token mode (ExtArray.get tokens id) conll_id super label) :: l) in
  32 + String.concat "\n" (List.rev l) ^ "\n\n"
  33 +
  (* [string_of_sentence mode tokens sentence] renders a sentence as text.
     RawSentence yields its string only in Raw mode (otherwise ""), DepSentence
     is delegated to [string_of_paths], and AltSentence recurses into the
     alternative chosen by [alternative_string]. StructSentence and
     QuotedSentences are not supported and fail. *)
  34 +let rec string_of_sentence mode tokens = function
  35 + RawSentence s -> if mode = Raw then s else ""
  36 + | StructSentence (_,tokens, _) -> failwith ("string_of_sentence: StructSentence") (*String.concat "\n" @@ Xlist.map tokens (fun x -> string_of_token mode x)*)
  37 + | DepSentence (_, paths) -> string_of_paths mode tokens paths
  38 + | QuotedSentences _ -> failwith ("string_of_sentence: QuotedSentences")
  39 + | AltSentence alts -> alternative_string (string_of_sentence mode tokens) mode alts
  40 +
  (* [string_of_p_record mode tokens p_record] renders the sentence of the
     record, prefixed by the record id and a newline when the id is not empty. *)
  41 +let string_of_p_record mode tokens p_record =
  42 + (if p_record.pid = "" then "" else p_record.pid ^ "\n") ^
  43 + string_of_sentence mode tokens p_record.psentence
  44 +
  45 +(*let rec string_of_paragraph mode tokens = function
  46 + RawParagraph s -> if mode = Raw then s else ""
  47 + | StructParagraph p_records -> String.concat "\n\n" @@ Xlist.map p_records (string_of_p_record mode tokens)
  48 + | AltParagraph alts -> alternative_string (string_of_paragraph mode) mode alts
  49 +
  50 +let rec string_of_text mode tokens = function
  51 + RawText s -> if mode = Raw then s else ""
  52 + | StructText (paragraphs,_) -> String.concat "\n\n" @@ Xlist.map paragraphs (string_of_paragraph mode tokens)
  53 + | AltText alts -> alternative_string (string_of_text mode) mode alts*)
  54 +
  55 +
  56 +(******************)
  57 +
  (* [establish_next tokens paths] fills the [next] field of the tokens in place:
     each token from index 1 to n-2 gets the [beg] of its successor, and the last
     token (index n-1) gets [beg + len]. Index 0 is not touched. [paths] is
     unused.
     NOTE(review): presumably Int.iter treats both bounds as inclusive, confirm
     against Xstd. With an empty [tokens] the access at index n-1 is index -1. *)
  58 +let establish_next tokens paths =
  59 + let n = ExtArray.size tokens in
  60 + Int.iter 1 (n - 2) (fun i ->
  61 + let f = ExtArray.get tokens i in
  62 + let s = ExtArray.get tokens (i+1) in
  63 + ExtArray.set tokens i {f with next = s.beg});
  64 + let last = ExtArray.get tokens (n-1) in
  65 + ExtArray.set tokens (n-1) {last with next = last.beg + last.len}
  66 +
  67 +
  68 + (*let rec pom res = function
  69 + h :: t -> let next = if res = []
  70 + then h.beg+h.len
  71 + else (List.hd res).beg in
  72 + pom ({h with next = next} :: res) t
  73 + | [] -> res in
  74 + pom [] rev_tokens*)
  75 +
  (* [establish_for_token i text tokens entries] walks the path entries and the
     sentence [text] in parallel, writing [beg] and [len] into the tokens in
     place. [i] is the current position. A leading space in [text] advances the
     position by 100 without consuming an entry. When [text] starts with the orth
     of the current token, the token gets [beg = i] and [len = 100 * k], where k
     is the length of the list returned by Xunicode.utf8_chars_of_utf8_string
     for the orth (presumably its number of UTF-8 characters), and the walk
     continues after it. Any other prefix fails. Returns [(100, i)] with the
     final position once all entries are consumed. *)
  76 +let rec establish_for_token i text tokens = function
  77 + (id,_,_) :: t as l->
  78 + let h = ExtArray.get tokens id in
  79 + if Xstring.check_prefix " " text
  80 + then establish_for_token (i+100) (Xstring.cut_prefix " " text) tokens l
  81 + else if Xstring.check_prefix h.orth text
  82 + then
  83 + let n = (List.length @@ Xunicode.utf8_chars_of_utf8_string h.orth) * 100 in
  84 + let n_h = {h with beg = i ; len = n} in
  85 + ExtArray.set tokens id n_h;
  86 + establish_for_token (i+n) (Xstring.cut_prefix h.orth text) tokens t
  87 + else failwith ("establish_for_token :" ^ h.orth ^ " " ^ text)
  88 + | [] -> 100, i
  89 +
  (* [establish_lengths text paths tokens] assigns positions to the tokens named
     by every path entry except the first one (dropped with List.tl), starting at
     position 100, then fills the [next] fields with [establish_next]. Returns
     the start position and the final position minus 100, used by the caller as
     the paragraph beginning and length. The function does not call itself, so
     the [rec] keyword is not needed. *)
  90 +let rec establish_lengths text paths tokens =
  91 + let pbeg, plen = establish_for_token 100 text tokens (List.tl (Array.to_list paths)) in
  92 + establish_next tokens paths;
  93 + pbeg, plen-100
  94 +
  95 +(******************)
  96 +
  (* Raised by [add_to_map] with the offending record when a record of the info
     file does not consist of exactly three lines. *)
  97 +exception ErrorInfoFile of string
  98 +
  (* Path of the sentence info file, relative to the working directory. *)
  99 +let info_file = "../corpora/info_sentences.txt"
  100 +
  (* Records of the info file, separated by blank lines. This is a top-level
     binding, so the file is read when the module is initialised. *)
  101 +let info = Xstring.split "\n\n" @@ File.load_file_gen info_file
  102 +
  (* [add_to_map map info_str] splits one record into its three lines (id, text,
     info_token) and binds info_token to (id, text) in [map]. *)
  103 +let add_to_map map info_str =
  104 + match Xstring.split "\n" info_str with
  105 + [id; text; info_token] -> StringMap.add map info_token (id, text)
  106 + | _ -> raise (ErrorInfoFile info_str)
  107 +
  (* Map from info_token to (id, text), built from all records in [info]. *)
  108 +let info_map =
  109 + Xlist.fold info StringMap.empty add_to_map
  110 +
  (* [match_sentence (p_record, tokens)] tries to attach the raw text and id from
     [info_map] to a dependency sentence.
     The local [info_token] only accepts DepSentence: it builds the lookup key by
     joining with spaces the orths of the tokens named by all path entries except
     the first, and returns it with the paths. Every other sentence kind fails,
     and that failure is outside the [try], so it propagates to the caller.
     On a successful lookup the token positions are computed by
     [establish_lengths] and the result holds a Raw and a CONLL alternative.
     NOTE(review): [with _] catches every exception raised inside the [try], not
     only a failed lookup, so a failure in [establish_lengths] also ends in the
     fallback, which wraps the unchanged [p_record] as a CONLL-only text. *)
  111 +let match_sentence (p_record,tokens) =
  112 + let rec info_token s = match s with
  113 + RawSentence text -> failwith ("match_sentence: " ^ text)
  114 + | StructSentence (_, tokens, n) -> failwith ("match_sentence: StructSentence") (*String.concat " " @@ List.map (fun x -> x.orth) tokens*)
  115 + | DepSentence (_, paths) -> String.concat " " @@ List.map (fun (id,_,_) -> (ExtArray.get tokens id).orth) (List.tl (Array.to_list paths)), paths
  116 + | QuotedSentences _ -> failwith ("match_sentence: QuotedSentences")
  117 + | AltSentence alts -> failwith ("match_sentence: AltSentence")
  118 + (*if List.exists (fun (mode, s) -> mode = CONLL) alts
  119 + then info_token (snd (List.find (fun (mode, s) -> mode = CONLL) alts))
  120 + else failwith ("match_sentence: no CONLL mode in AltSentence")*) in
  121 + let info_token, paths = info_token p_record.psentence in
  122 + try
  123 + let id, text = StringMap.find info_map info_token in
  124 + let beg, len = establish_lengths text paths tokens (* -1, -1, p_record.psentence *) in
  125 + AltText[Raw,RawText text;CONLL,StructText([StructParagraph[{pid = id; pbeg = beg; plen = len; pnext = beg+len;
  126 + psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence("", paths)]}]],tokens)]
  127 +(* {s_id = id; s_text = text; s_tokens = sentence.s_tokens} *)
  128 + with _ -> AltText[CONLL,StructText([StructParagraph[p_record]],tokens)]
  129 +
  (* [match_corpus corpus] applies [match_sentence] to every
     (p_record, tokens) element of [corpus] and returns the list of results. *)
  130 +let match_corpus corpus =
129 131 Xlist.map corpus match_sentence
130 132  
131 133 (******************)
... ...
parser/exec.ml
... ... @@ -198,10 +198,10 @@ let conll_parse_sentence timeout test_only_flag id paths tokens =
198 198 let result = {result with lex_time=time3 -. time2} in
199 199 try
200 200 (* print_endline "conll_parse_sentence 1"; *)
201   - LCGlatexOf.print_references "results/" "references1" references;
  201 + (* LCGlatexOf.print_references "results/" "references1" references; *)
202 202 let parsed_dep_chart = LCGchart.dep_parse dep_chart references timeout time_fun in (* uwaga: niejawna zmiana imperatywna w references *)
203 203 (* print_endline "conll_parse_sentence 2"; *)
204   - LCGlatexOf.print_references "results/" "references2" references;
  204 + (* LCGlatexOf.print_references "results/" "references2" references; *)
205 205 let time4 = time_fun () in
206 206 let result = if test_only_flag then result else {result with parsed_dep_chart=parsed_dep_chart} in
207 207 let result = {result with parse_time=time4 -. time3} in
... ... @@ -253,6 +253,10 @@ let conll_parse_sentence timeout test_only_flag id paths tokens =
253 253  
254 254 let mate_in, mate_out = Unix.open_process "java -jar ../dependencyParser/basic/mate-tools/dist/anna-3.5.jar -model ../dependencyParser/basic/mate-tools/examples/160622_Polish_MateParser.mdl -test"
255 255  
  (* [get_paths (p_record, _)] returns the paths of a record whose sentence is a
     [PreTypes.DepSentence]; any other sentence kind fails with "get_paths".
     The second component of the pair is ignored. *)
  256 +let get_paths = function
  257 + {PreTypes.psentence=PreTypes.DepSentence(_,paths)},_ -> paths
  258 + | _ -> failwith "get_paths"
  259 +
256 260 let rec parse_sentence timeout test_only_flag mode tokens = function
257 261 RawSentence s -> RawSentence s
258 262 | StructSentence(id,paths,last) ->
... ... @@ -272,19 +276,16 @@ let rec parse_sentence timeout test_only_flag mode tokens = function
272 276 let result = {empty_eniam_parse_result with status=Parsed; term=graph} in
273 277 ENIAMSentence result, next_id *)
274 278 | Mate ->
275   - (*print_endline "parse_sentence 1";
276   - let conll = CONLL.string_of_sentence PreTypes.Mate (PreTypes.StructSentence(paths,last)) in
  279 + print_endline "parse_sentence 1";
  280 + let conll = CONLL.string_of_paths PreTypes.Mate tokens paths in
277 281 print_endline "parse_sentence 2";
278   - printf "%s\n" conll;
  282 + printf "|%s|\n" conll;
279 283 Printf.fprintf mate_out "%s\n\n%!" conll;
280 284 print_endline "parse_sentence 3";
281   - let conll = CONLL.load_sentence mate_in in
282   - print_endline "parse_sentence 4";*)
283   - (*konwersja na strukturę danych*)
284   - (* FIXME: tu trzeba wstawić konwersję na tekstowy format CONLL,
285   - uruchomienie MateParser i
286   - powtórną konwersję wyniku. *)
287   - RawSentence ""
  285 + let new_paths = get_paths (CONLL.load_sentence mate_in) in
  286 + print_endline "parse_sentence 4";
  287 + let result = conll_parse_sentence timeout test_only_flag id new_paths tokens in
  288 + CONLLSentence result
288 289 | _ -> failwith "parse_sentence")
289 290 | QuotedSentences sentences ->
290 291 let sentences = Xlist.rev_map sentences (fun p ->
... ...