preSentences.ml
8.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
(*
* ENIAM: Categorial Syntactic-Semantic Parser for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Printf
open PreTypes
open Xstd
let sentence_beg_selector tokens id =
(ExtArray.get tokens id).token = Interp "<sentence>"
let sentence_mid_selector tokens id =
let t = (ExtArray.get tokens id).token in
t <> Interp "<sentence>" && t <> Interp "</sentence>"
let sentence_end_selector tokens id =
(ExtArray.get tokens id).token = Interp "</sentence>"
let sentence_command ids = Sentence ids
let find_bracket_begs tokens chart last beg_selector =
Int.fold 0 last [] (fun found lnode ->
Xlist.fold chart.(lnode) found (fun found (id,rnode) ->
if beg_selector tokens id then (id,lnode,rnode) :: found else found))
let rec find_bracket_ends tokens chart mid_selector end_selector found nodes =
if IntSet.is_empty nodes then found else
let lnode = IntSet.min_elt nodes in
let nodes = IntSet.remove nodes lnode in
let found,nodes = Xlist.fold chart.(lnode) (found,nodes) (fun (found,nodes) (id,rnode) ->
if end_selector tokens id then (id,lnode,rnode) :: found, nodes else
if mid_selector tokens id then found, IntSet.add nodes rnode else
found, nodes) in
find_bracket_ends tokens chart mid_selector end_selector found nodes
let rec find_bracket_paths tokens chart mid_selector rnode map nodes =
if IntSet.is_empty nodes then failwith "find_bracket_paths" else
let lnode = IntSet.min_elt nodes in
let ids = try IntMap.find map lnode with Not_found -> IntSet.empty in
if lnode = rnode then IntSet.to_list ids else
let nodes = IntSet.remove nodes lnode in
let map,nodes = Xlist.fold chart.(lnode) (map,nodes) (fun (map,nodes) (id,rnode) ->
if not (mid_selector tokens id) then map, nodes else
let ids = IntSet.add ids id in
let map = IntMap.add_inc map rnode ids (fun set -> IntSet.union set ids) in
let nodes = IntSet.add nodes rnode in
map,nodes) in
find_bracket_paths tokens chart mid_selector rnode map nodes
let parse_bracket_rule tokens chart last beg_selector mid_selector end_selector command =
let begs = find_bracket_begs tokens chart last beg_selector in
let found = Xlist.fold begs [] (fun found (beg_id,beg_l,beg_r) ->
let ends = find_bracket_ends tokens chart mid_selector end_selector [] (IntSet.singleton beg_r) in
if ends = [] then failwith "parse_bracket_rule: end not found" else
Xlist.fold ends found (fun found (end_id,end_l,end_r) ->
let ids = find_bracket_paths tokens chart mid_selector end_l IntMap.empty (IntSet.singleton beg_r) in
(beg_id,ids,end_id,beg_l,end_r) :: found)) in
Xlist.iter found (fun (beg_id,ids,end_id,lnode,rnode) ->
let beg_t = ExtArray.get tokens beg_id in
let end_t = ExtArray.get tokens end_id in
let id = ExtArray.add tokens {empty_token with beg=beg_t.beg; len=end_t.beg+end_t.len-beg_t.beg; next=end_t.next;
token=command (*tokens*) (beg_id::end_id::ids)} in
chart.(lnode) <- (id,rnode) :: chart.(lnode))
let find_sentences tokens chart last =
parse_bracket_rule tokens chart last sentence_beg_selector sentence_mid_selector sentence_end_selector sentence_command
(*let find_sentence_begs tokens paths =
List.rev (List.sort compare (IntSet.to_list (Xlist.fold paths IntSet.empty (fun set (id,beg,_) ->
match ExtArray.get tokens id with
{token = Interp "<sentence>"} as t -> IntSet.add set beg
| _ -> set))))
let rec find_sentence2 tokens beg1 found map = function
[] -> found
| (id,beg,next) :: l ->
let t = ExtArray.get tokens id in
if t.token = Interp "<sentence>" || not (IntMap.mem map beg) then find_sentence2 tokens beg1 found map l else
let tokens1 = IntSet.add (IntMap.find map beg) id in
if t.token = Interp "</sentence>" then find_sentence2 tokens beg1 ((beg,t.beg+t.len-beg1,t.next,tokens1) :: found) map l else
find_sentence2 tokens beg1 found (IntMap.add_inc map t.next tokens1 (fun tokens2 -> IntSet.union tokens1 tokens2)) l
let rec find_sentence tokens beg1 found = function
[] -> failwith "find_sentence"
| (id,beg,next) :: l ->
if beg = beg1 && (ExtArray.get tokens id).token = Interp "<sentence>" then
find_sentence beg (find_sentence2 tokens beg found (IntMap.add IntMap.empty next (IntSet.singleton id)) l) l
else
if beg > beg1 then found else find_sentence tokens beg1 found l
let find_sentences tokens (paths,last) =
(* print_endline (PrePaths.to_string (PrePaths.sort (paths,last))); *)
let begs = find_sentence_begs tokens paths in
Xlist.fold begs (paths,last) (fun (paths,last) beg ->
(* Printf.printf "BEG=%d\n%!" beg; *)
let paths,last = PrePaths.sort (paths,last) in
let found = find_sentence tokens beg [] paths in
let paths = Xlist.fold found paths (fun new_paths (beg,len,next,set) ->
let sentence = Xlist.fold paths [] (fun sentence t -> if IntSet.mem set t.id then t :: sentence else sentence) in
(* printf "SENTENCE beg=%d len=%d next=%d\n%!" beg len next; *)
(* print_endline (PrePaths.to_string (PrePaths.sort (sentence,beg+len))); *)
let sentence,last = PrePaths.sort (sentence,beg+len) in
let n = ExtArray.add tokens {empty_token with beg=beg; len=len; next=next; token=Sentence(sentence,last)} in
(n,beg,next) :: new_paths) in
paths, last)*)
let is_sentence = function
Sentence _ -> true
| _ -> false
let get_sentence t =
match t.token with
Sentence(paths,last) -> paths,last
| _ -> failwith "get_sentence"
let rec find_query2 found map = function
[] -> found
| t :: l ->
if not (IntMap.mem map t.beg) then find_query2 found map l else
if t.token = Interp "</query>" then find_query2 ((IntMap.find map t.beg) :: found) map l else
if not (is_sentence t.token) then find_query2 found map l else
let tokens = IntSet.add (IntMap.find map t.beg) t.id in
find_query2 found (IntMap.add_inc map t.next tokens (fun tokens2 -> IntSet.union tokens tokens2)) l
let rec find_query found = function
[] -> failwith "find_query"
| t :: l ->
if t.beg = 0 && t.token = Interp "<query>" then
find_query2 found (IntMap.add IntMap.empty t.next IntSet.empty) l
else
if t.beg > 0 then found else find_query found l
let get_raw_sentence a beg len =
if beg mod factor <> 0 then failwith ("get_raw_sentence: beg " ^ string_of_int beg) else
if len mod factor <> 0 then failwith ("get_raw_sentence: len " ^ string_of_int len) else
let buf = Buffer.create 512 in
Int.iter (beg / factor - 1) (beg / factor + len / factor - 2) (fun i ->
(* printf "%d" i; printf " %s\n%!" a.(i); *)
Buffer.add_string buf a.(i));
Buffer.contents buf
let add_struct_sentence_ids sentences =
match sentences with
[{psentence=AltSentence[Raw,s;ENIAM,StructSentence(_,paths,last)]} as p] ->
[{p with psentence=AltSentence[Raw,s;ENIAM,StructSentence("E",paths,last)]}]
| _ -> fst (Xlist.fold sentences ([],1) (fun (l,n) -> function
{psentence=AltSentence[Raw,s;ENIAM,StructSentence(_,paths,last)]} as p ->
{p with psentence=AltSentence[Raw,s;ENIAM,StructSentence("E" ^ string_of_int n,paths,last)]} :: l, n+1
| _ -> failwith "add_struct_sentence_ids"))
let extract_sentences par (paths,last) =
let par = Array.of_list (Xunicode.utf8_chars_of_utf8_string par) in
let paths,last = PrePaths.sort (paths,last) in
let found = find_query [] paths in
let pars = Xlist.fold found [] (fun pars set ->
Xlist.fold paths [] (fun sentences t -> if IntSet.mem set t.id then
let paths,last = get_sentence t in
{pid=string_of_int t.id; pbeg=t.beg; plen=t.len;
psentence=AltSentence[Raw,RawSentence (get_raw_sentence par t.beg t.len);
ENIAM,StructSentence("",paths,(*last*)10)]} :: sentences else sentences) :: pars) in (* FIXME: (*last*)10 !!!! *)
match pars with
[sentences] -> add_struct_sentence_ids sentences
| _ -> failwith "extract_sentences"