preSentences.ml 8.59 KB
(*
 *  ENIAM: Categorial Syntactic-Semantic Parser for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open Printf
open PreTypes
open Xstd

let sentence_beg_selector tokens id =
  (ExtArray.get tokens id).token = Interp "<sentence>"

let sentence_mid_selector tokens id =
  let t = (ExtArray.get tokens id).token in
  t <> Interp "<sentence>" && t <> Interp "</sentence>"

let sentence_end_selector tokens id =
  (ExtArray.get tokens id).token = Interp "</sentence>"

let sentence_command ids = Sentence ids

let find_bracket_begs tokens chart last beg_selector =
  Int.fold 0 last [] (fun found lnode ->
    Xlist.fold chart.(lnode) found (fun found (id,rnode) ->
      if beg_selector tokens id then (id,lnode,rnode) :: found else found))

let rec find_bracket_ends tokens chart mid_selector end_selector found nodes =
  if IntSet.is_empty nodes then found else
  let lnode = IntSet.min_elt nodes in
  let nodes = IntSet.remove nodes lnode in
  let found,nodes = Xlist.fold chart.(lnode) (found,nodes) (fun (found,nodes) (id,rnode) ->
    if end_selector tokens id then (id,lnode,rnode) :: found, nodes else
    if mid_selector tokens id then found, IntSet.add nodes rnode else
    found, nodes) in
  find_bracket_ends tokens chart mid_selector end_selector found nodes

let rec find_bracket_paths tokens chart mid_selector rnode map nodes =
  if IntSet.is_empty nodes then failwith "find_bracket_paths" else
  let lnode = IntSet.min_elt nodes in
  let ids = try IntMap.find map lnode with Not_found -> IntSet.empty in
  if lnode = rnode then IntSet.to_list ids else
  let nodes = IntSet.remove nodes lnode in
  let map,nodes = Xlist.fold chart.(lnode) (map,nodes) (fun (map,nodes) (id,rnode) ->
    if not (mid_selector tokens id) then map, nodes else
    let ids = IntSet.add ids id in
    let map = IntMap.add_inc map rnode ids (fun set -> IntSet.union set ids) in
    let nodes = IntSet.add nodes rnode in
    map,nodes) in
  find_bracket_paths tokens chart mid_selector rnode map nodes

let parse_bracket_rule tokens chart last beg_selector mid_selector end_selector command =
  let begs = find_bracket_begs tokens chart last beg_selector in
  let found = Xlist.fold begs [] (fun found (beg_id,beg_l,beg_r) ->
    let ends = find_bracket_ends tokens chart mid_selector end_selector [] (IntSet.singleton beg_r) in
    if ends = [] then failwith "parse_bracket_rule: end not found" else
    Xlist.fold ends found (fun found (end_id,end_l,end_r) ->
      let ids = find_bracket_paths tokens chart mid_selector end_l IntMap.empty (IntSet.singleton beg_r) in
      (beg_id,ids,end_id,beg_l,end_r) :: found)) in
  Xlist.iter found (fun (beg_id,ids,end_id,lnode,rnode) ->
    let beg_t = ExtArray.get tokens beg_id in
    let end_t = ExtArray.get tokens end_id in
    let id = ExtArray.add tokens {empty_token with beg=beg_t.beg; len=end_t.beg+end_t.len-beg_t.beg; next=end_t.next;
                                            token=command (*tokens*) (beg_id::end_id::ids)} in
    chart.(lnode) <- (id,rnode) :: chart.(lnode))

let find_sentences tokens chart last =
  parse_bracket_rule tokens chart last sentence_beg_selector sentence_mid_selector sentence_end_selector sentence_command

(*let find_sentence_begs tokens paths =
  List.rev (List.sort compare (IntSet.to_list (Xlist.fold paths IntSet.empty (fun set (id,beg,_) ->
    match ExtArray.get tokens id with
      {token = Interp "<sentence>"} as t -> IntSet.add set beg
    | _ -> set))))

let rec find_sentence2 tokens beg1 found map = function
    [] -> found
  | (id,beg,next) :: l ->
      let t = ExtArray.get tokens id in
      if t.token = Interp "<sentence>" || not (IntMap.mem map beg) then find_sentence2 tokens beg1 found map l else
      let tokens1 = IntSet.add (IntMap.find map beg) id in
      if t.token = Interp "</sentence>" then find_sentence2 tokens beg1 ((beg,t.beg+t.len-beg1,t.next,tokens1) :: found) map l else
      find_sentence2 tokens beg1 found (IntMap.add_inc map t.next tokens1 (fun tokens2 -> IntSet.union tokens1 tokens2)) l

let rec find_sentence tokens beg1 found = function
    [] -> failwith "find_sentence"
  | (id,beg,next) :: l ->
      if beg = beg1 && (ExtArray.get tokens id).token = Interp "<sentence>" then
        find_sentence beg (find_sentence2 tokens beg found (IntMap.add IntMap.empty next (IntSet.singleton id)) l) l
      else
        if beg > beg1 then found else find_sentence tokens beg1 found l

let find_sentences tokens (paths,last) =
  (* print_endline (PrePaths.to_string (PrePaths.sort (paths,last))); *)
  let begs = find_sentence_begs tokens paths in
  Xlist.fold begs (paths,last) (fun (paths,last) beg ->
    (* Printf.printf "BEG=%d\n%!" beg; *)
    let paths,last = PrePaths.sort (paths,last) in
    let found = find_sentence tokens beg [] paths in
    let paths = Xlist.fold found paths (fun new_paths (beg,len,next,set) ->
      let sentence = Xlist.fold paths [] (fun sentence t -> if IntSet.mem set t.id then t :: sentence else sentence) in
      (* printf "SENTENCE beg=%d len=%d next=%d\n%!" beg len next; *)
      (* print_endline (PrePaths.to_string (PrePaths.sort (sentence,beg+len))); *)
      let sentence,last = PrePaths.sort (sentence,beg+len) in
      let n = ExtArray.add tokens {empty_token with beg=beg; len=len; next=next; token=Sentence(sentence,last)} in
      (n,beg,next) :: new_paths) in
    paths, last)*)

let is_sentence = function
    Sentence _ -> true
  | _ -> false

let get_sentence t =
  match t.token with
    Sentence(paths,last) -> paths,last
  | _ -> failwith "get_sentence"

let rec find_query2 found map = function
    [] -> found
  | t :: l ->
      if not (IntMap.mem map t.beg) then find_query2 found map l else
      if t.token = Interp "</query>" then find_query2 ((IntMap.find map t.beg) :: found) map l else
      if not (is_sentence t.token) then find_query2 found map l else
      let tokens = IntSet.add (IntMap.find map t.beg) t.id in
      find_query2 found (IntMap.add_inc map t.next tokens (fun tokens2 -> IntSet.union tokens tokens2)) l

let rec find_query found = function
    [] -> failwith "find_query"
  | t :: l ->
      if t.beg = 0 && t.token = Interp "<query>" then
        find_query2 found (IntMap.add IntMap.empty t.next IntSet.empty) l
      else
       if t.beg > 0 then found else find_query found l

let get_raw_sentence a beg len =
  if beg mod factor <> 0 then failwith ("get_raw_sentence: beg " ^ string_of_int beg) else
  if len mod factor <> 0 then failwith ("get_raw_sentence: len " ^ string_of_int len) else
  let buf = Buffer.create 512 in
  Int.iter (beg / factor - 1) (beg / factor + len / factor - 2) (fun i ->
    (* printf "%d" i; printf " %s\n%!" a.(i); *)
    Buffer.add_string buf a.(i));
  Buffer.contents buf

let add_struct_sentence_ids sentences =
  match sentences with
    [{psentence=AltSentence[Raw,s;ENIAM,StructSentence(_,paths,last)]} as p] ->
        [{p with psentence=AltSentence[Raw,s;ENIAM,StructSentence("E",paths,last)]}]
  | _ -> fst (Xlist.fold sentences ([],1) (fun (l,n) -> function
        {psentence=AltSentence[Raw,s;ENIAM,StructSentence(_,paths,last)]} as p ->
           {p with psentence=AltSentence[Raw,s;ENIAM,StructSentence("E" ^ string_of_int n,paths,last)]} :: l, n+1
      | _ -> failwith "add_struct_sentence_ids"))

let extract_sentences par (paths,last) =
  let par = Array.of_list (Xunicode.utf8_chars_of_utf8_string par) in
  let paths,last = PrePaths.sort (paths,last) in
  let found = find_query [] paths in
  let pars = Xlist.fold found [] (fun pars set ->
    Xlist.fold paths [] (fun sentences t -> if IntSet.mem set t.id then
      let paths,last = get_sentence t in
      {pid=string_of_int t.id; pbeg=t.beg; plen=t.len;
       psentence=AltSentence[Raw,RawSentence (get_raw_sentence par t.beg t.len);
                             ENIAM,StructSentence("",paths,(*last*)10)]} :: sentences else sentences) :: pars) in (* FIXME: (*last*)10 !!!! *)
  match pars with
    [sentences] -> add_struct_sentence_ids sentences
  | _ -> failwith "extract_sentences"