preSentences.ml
4.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
(*
* ENIAM: Categorial Syntactic-Semantic Parser for Polish
* Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
* Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*)
open Printf
open PreTypes
open Xstd
let find_sentence_begs paths =
List.rev (List.sort compare (IntSet.to_list (Xlist.fold paths IntSet.empty (fun set -> function
{token = Interp "<sentence>"} as t -> IntSet.add set t.beg
| _ -> set))))
let rec find_sentence2 beg found map = function
[] -> found
| t :: l ->
if t.token = Interp "<sentence>" || not (IntMap.mem map t.beg) then find_sentence2 beg found map l else
let tokens = IntSet.add (IntMap.find map t.beg) t.id in
if t.token = Interp "</sentence>" then find_sentence2 beg ((beg,t.beg+t.len-beg,t.next,tokens) :: found) map l else
find_sentence2 beg found (IntMap.add_inc map t.next tokens (fun tokens2 -> IntSet.union tokens tokens2)) l
let rec find_sentence beg found = function
[] -> failwith "find_sentence"
| t :: l ->
if t.beg = beg && t.token = Interp "<sentence>" then
find_sentence beg (find_sentence2 beg found (IntMap.add IntMap.empty t.next (IntSet.singleton t.id)) l) l
else
if t.beg > beg then found else find_sentence beg found l
let find_sentences (paths,last) next_id =
print_endline (PrePaths.to_string (PrePaths.sort (paths,last)));
let begs = find_sentence_begs paths in
Xlist.fold begs (paths,last,next_id) (fun (paths,last,next_id) beg ->
(* Printf.printf "BEG=%d\n%!" beg; *)
let paths,last = PrePaths.sort (paths,last) in
let found = find_sentence beg [] paths in
let paths,next_id = Xlist.fold found (paths,next_id) (fun (new_paths,next_id) (beg,len,next,set) ->
let sentence = Xlist.fold paths [] (fun sentence t -> if IntSet.mem set t.id then t :: sentence else sentence) in
(* printf "SENTENCE beg=%d len=%d next=%d\n%!" beg len next; *)
(* print_endline (PrePaths.to_string (PrePaths.sort (sentence,beg+len))); *)
let sentence,last = PrePaths.sort (sentence,beg+len) in
{empty_token with beg=beg; len=len; next=next; token=Sentence(sentence,last); id=next_id} :: new_paths, next_id+1) in
paths, last, next_id)
let is_sentence = function
Sentence _ -> true
| _ -> false
let get_sentence t =
match t.token with
Sentence(paths,last) -> paths,last
| _ -> failwith "get_sentence"
let rec find_query2 found map = function
[] -> found
| t :: l ->
if not (IntMap.mem map t.beg) then find_query2 found map l else
if t.token = Interp "</query>" then find_query2 ((IntMap.find map t.beg) :: found) map l else
if not (is_sentence t.token) then find_query2 found map l else
let tokens = IntSet.add (IntMap.find map t.beg) t.id in
find_query2 found (IntMap.add_inc map t.next tokens (fun tokens2 -> IntSet.union tokens tokens2)) l
let rec find_query found = function
[] -> failwith "find_query"
| t :: l ->
if t.beg = 0 && t.token = Interp "<query>" then
find_query2 found (IntMap.add IntMap.empty t.next IntSet.empty) l
else
if t.beg > 0 then found else find_query found l
let get_raw_sentence a beg len =
if beg mod factor <> 0 then failwith ("get_raw_sentence: beg " ^ string_of_int beg) else
if len mod factor <> 0 then failwith ("get_raw_sentence: len " ^ string_of_int len) else
let buf = Buffer.create 512 in
Int.iter (beg / factor - 1) (beg / factor + len / factor - 2) (fun i ->
printf "%d" i; printf " %s\n%!" a.(i);
Buffer.add_string buf a.(i));
Buffer.contents buf
let extract_sentences par (paths,last) =
let par = Array.of_list (Xunicode.utf8_chars_of_utf8_string par) in
let paths,last = PrePaths.sort (paths,last) in
let found = find_query [] paths in
let pars = Xlist.fold found [] (fun pars set ->
Xlist.fold paths [] (fun sentences t -> if IntSet.mem set t.id then
let paths,last = get_sentence t in
{pid=string_of_int t.id; pbeg=t.beg; plen=t.len;
psentence=AltSentence[Raw,RawSentence (get_raw_sentence par t.beg t.len);
Struct,StructSentence(paths,last)]} :: sentences else sentences) :: pars) in
match pars with
[sentences] -> sentences
| _ -> failwith "extract_sentences"