preSentences.ml
8.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
(*
* ENIAM: Categorial Syntactic-Semantic Parser for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Printf
open PreTypes
open Xstd
let find_bracket_begs tokens chart last beg_selector =
Int.fold 0 last [] (fun found lnode ->
Xlist.fold chart.(lnode) found (fun found (id,rnode) ->
if beg_selector tokens id then (id,lnode,rnode) :: found else found))
let rec find_bracket_ends tokens chart mid_selector end_selector found nodes =
if IntSet.is_empty nodes then found else
let lnode = IntSet.min_elt nodes in
let nodes = IntSet.remove nodes lnode in
let found,nodes = Xlist.fold chart.(lnode) (found,nodes) (fun (found,nodes) (id,rnode) ->
if end_selector tokens id then (id,lnode,rnode) :: found, nodes else
if mid_selector tokens id then found, IntSet.add nodes rnode else
found, nodes) in
find_bracket_ends tokens chart mid_selector end_selector found nodes
let rec find_bracket_paths tokens chart mid_selector rnode map nodes =
if IntSet.is_empty nodes then failwith "find_bracket_paths" else
let lnode = IntSet.min_elt nodes in
let ids = try IntMap.find map lnode with Not_found -> IntSet.empty in
if lnode = rnode then IntSet.to_list ids else
let nodes = IntSet.remove nodes lnode in
let map,nodes = Xlist.fold chart.(lnode) (map,nodes) (fun (map,nodes) (id,rnode) ->
if not (mid_selector tokens id) then map, nodes else
let ids = IntSet.add ids id in
let map = IntMap.add_inc map rnode ids (fun set -> IntSet.union set ids) in
let nodes = IntSet.add nodes rnode in
map,nodes) in
find_bracket_paths tokens chart mid_selector rnode map nodes
let get_raw_sentence a beg len =
let next = beg + len in
let beg = beg / factor + (if beg mod factor < 50 then 0 else 1) in
let next = next / factor + (if next mod factor < 50 then 0 else 1) in
(* if beg mod factor <> 0 then failwith ("get_raw_sentence: beg " ^ string_of_int beg) else
if len mod factor <> 0 then failwith ("get_raw_sentence: len " ^ string_of_int len) else *)
let buf = Buffer.create 512 in
(* Int.iter (beg / factor - 1) (beg / factor + len / factor - 2) (fun i -> *)
Int.iter beg (next - 1) (fun i ->
(* printf "%d" i; printf " %s\n%!" a.(i); *)
Buffer.add_string buf a.(i)(*try a.(i) with _ -> "<" ^ string_of_int i ^ ">"*));
Buffer.contents buf
let parse_bracket_rule paragraph tokens chart last beg_selector mid_selector end_selector command =
let begs = find_bracket_begs tokens chart last beg_selector in
let found = Xlist.fold begs [] (fun found (beg_id,beg_l,beg_r) ->
let ends = find_bracket_ends tokens chart mid_selector end_selector [] (IntSet.singleton beg_r) in
(* if ends = [] then failwith "parse_bracket_rule: end not found" else *)
Xlist.fold ends found (fun found (end_id,end_l,end_r) ->
let ids = find_bracket_paths tokens chart mid_selector end_l IntMap.empty (IntSet.singleton beg_r) in
(beg_id,ids,end_id,beg_l,end_r) :: found)) in
Xlist.iter found (fun (beg_id,ids,end_id,lnode,rnode) ->
let beg_t = ExtArray.get tokens beg_id in
let end_t = ExtArray.get tokens end_id in
let beg = beg_t.beg in
let len = end_t.beg+end_t.len-beg_t.beg in
let id = ExtArray.add tokens {empty_token with orth=get_raw_sentence paragraph beg len;
beg=beg; len=len; next=end_t.next;
token=command (*tokens*) (beg_id::end_id::ids)} in
chart.(lnode) <- (id,rnode) :: chart.(lnode))
let find_sentence paragraph tokens chart last =
parse_bracket_rule paragraph tokens chart last
(fun tokens id -> (ExtArray.get tokens id).token = Interp "<sentence>")
(fun tokens id ->
let t = (ExtArray.get tokens id).token in
t <> Interp "<sentence>" && t <> Interp "</sentence>")
(fun tokens id -> (ExtArray.get tokens id).token = Interp "</sentence>")
(fun ids -> Tokens("sentence",ids))
let find_quoted_sentences paragraph tokens chart last =
parse_bracket_rule paragraph tokens chart last
(fun tokens id -> (ExtArray.get tokens id).token = Interp "„s")
(fun tokens id ->
match (ExtArray.get tokens id).token with
Tokens("sentence",_) -> true
| _ -> false)
(fun tokens id -> (ExtArray.get tokens id).token = Interp "”s")
(fun ids -> Tokens("quoted_sentences",ids))
let find_query paragraph tokens chart last =
parse_bracket_rule paragraph tokens chart last
(fun tokens id -> (ExtArray.get tokens id).token = Interp "<query>")
(fun tokens id ->
match (ExtArray.get tokens id).token with
Tokens("sentence",_) -> true
| Tokens("quoted_sentences",_) -> true
| _ -> false)
(fun tokens id -> (ExtArray.get tokens id).token = Interp "</query>")
(fun ids -> Tokens("query",ids))
let extract_sentences paragraph tokens chart last =
(* let paragraph = Array.of_list (Xunicode.utf8_chars_of_utf8_string paragraph) in *)
let paths = Int.fold 0 last [] (fun paths lnode ->
Xlist.fold chart.(lnode) paths (fun paths (id,rnode) ->
(id,lnode,rnode) :: paths)) in
[{pid="xx"; pbeg=0; plen=0;
psentence=AltSentence[Raw,RawSentence paragraph;
ENIAM,StructSentence("",paths,last)]}]
(*
let is_sentence = function
Sentence _ -> true
| _ -> false
let get_sentence t =
match t.token with
Sentence(paths,last) -> paths,last
| _ -> failwith "get_sentence"
let rec find_query2 found map = function
[] -> found
| t :: l ->
if not (IntMap.mem map t.beg) then find_query2 found map l else
if t.token = Interp "</query>" then find_query2 ((IntMap.find map t.beg) :: found) map l else
if not (is_sentence t.token) then find_query2 found map l else
let tokens = IntSet.add (IntMap.find map t.beg) t.id in
find_query2 found (IntMap.add_inc map t.next tokens (fun tokens2 -> IntSet.union tokens tokens2)) l
let rec find_query found = function
[] -> failwith "find_query"
| t :: l ->
if t.beg = 0 && t.token = Interp "<query>" then
find_query2 found (IntMap.add IntMap.empty t.next IntSet.empty) l
else
if t.beg > 0 then found else find_query found l
let add_struct_sentence_ids sentences =
match sentences with
[{psentence=AltSentence[Raw,s;ENIAM,StructSentence(_,paths,last)]} as p] ->
[{p with psentence=AltSentence[Raw,s;ENIAM,StructSentence("E",paths,last)]}]
| _ -> fst (Xlist.fold sentences ([],1) (fun (l,n) -> function
{psentence=AltSentence[Raw,s;ENIAM,StructSentence(_,paths,last)]} as p ->
{p with psentence=AltSentence[Raw,s;ENIAM,StructSentence("E" ^ string_of_int n,paths,last)]} :: l, n+1
| _ -> failwith "add_struct_sentence_ids"))
let extract_sentences par (paths,last) =
let par = Array.of_list (Xunicode.utf8_chars_of_utf8_string par) in
let paths,last = PrePaths.sort (paths,last) in
let found = find_query [] paths in
let pars = Xlist.fold found [] (fun pars set ->
Xlist.fold paths [] (fun sentences t -> if IntSet.mem set t.id then
let paths,last = get_sentence t in
{pid=string_of_int t.id; pbeg=t.beg; plen=t.len;
psentence=AltSentence[Raw,RawSentence (get_raw_sentence par t.beg t.len);
ENIAM,StructSentence("",paths,(*last*)10)]} :: sentences else sentences) :: pars) in (* FIXME: (*last*)10 !!!! *)
match pars with
[sentences] -> add_struct_sentence_ids sentences
| _ -> failwith "extract_sentences"
*)