preSentences.ml 4.85 KB
(*
 *  ENIAM: Categorial Syntactic-Semantic Parser for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open Printf
open PreTypes
open Xstd

let find_sentence_begs paths =
  List.rev (List.sort compare (IntSet.to_list (Xlist.fold paths IntSet.empty (fun set -> function
    {token = Interp "<sentence>"} as t -> IntSet.add set t.beg
  | _ -> set))))

let rec find_sentence2 beg found map = function
    [] -> found
  | t :: l ->
      if t.token = Interp "<sentence>" || not (IntMap.mem map t.beg) then find_sentence2 beg found map l else
      let tokens = IntSet.add (IntMap.find map t.beg) t.id in
      if t.token = Interp "</sentence>" then find_sentence2 beg ((beg,t.beg+t.len-beg,t.next,tokens) :: found) map l else
      find_sentence2 beg found (IntMap.add_inc map t.next tokens (fun tokens2 -> IntSet.union tokens tokens2)) l

let rec find_sentence beg found = function
    [] -> failwith "find_sentence"
  | t :: l ->
      if t.beg = beg && t.token = Interp "<sentence>" then
        find_sentence beg (find_sentence2 beg found (IntMap.add IntMap.empty t.next (IntSet.singleton t.id)) l) l
      else
        if t.beg > beg then found else find_sentence beg found l

let find_sentences (paths,last) next_id =
  print_endline (PrePaths.to_string (PrePaths.sort (paths,last)));
  let begs = find_sentence_begs paths in
  Xlist.fold begs (paths,last,next_id) (fun (paths,last,next_id) beg ->
    (* Printf.printf "BEG=%d\n%!" beg; *)
    let paths,last = PrePaths.sort (paths,last) in
    let found = find_sentence beg [] paths in
    let paths,next_id = Xlist.fold found (paths,next_id) (fun (new_paths,next_id) (beg,len,next,set) ->
      let sentence = Xlist.fold paths [] (fun sentence t -> if IntSet.mem set t.id then t :: sentence else sentence) in
      (* printf "SENTENCE beg=%d len=%d next=%d\n%!" beg len next; *)
      (* print_endline (PrePaths.to_string (PrePaths.sort (sentence,beg+len))); *)
      let sentence,last = PrePaths.sort (sentence,beg+len) in
      {empty_token with beg=beg; len=len; next=next; token=Sentence(sentence,last); id=next_id} :: new_paths, next_id+1) in
    paths, last, next_id)

let is_sentence = function
    Sentence _ -> true
  | _ -> false

let get_sentence t =
  match t.token with
    Sentence(paths,last) -> paths,last
  | _ -> failwith "get_sentence"

let rec find_query2 found map = function
    [] -> found
  | t :: l ->
      if not (IntMap.mem map t.beg) then find_query2 found map l else
      if t.token = Interp "</query>" then find_query2 ((IntMap.find map t.beg) :: found) map l else
      if not (is_sentence t.token) then find_query2 found map l else
      let tokens = IntSet.add (IntMap.find map t.beg) t.id in
      find_query2 found (IntMap.add_inc map t.next tokens (fun tokens2 -> IntSet.union tokens tokens2)) l

let rec find_query found = function
    [] -> failwith "find_query"
  | t :: l ->
      if t.beg = 0 && t.token = Interp "<query>" then
        find_query2 found (IntMap.add IntMap.empty t.next IntSet.empty) l
      else
       if t.beg > 0 then found else find_query found l

let get_raw_sentence a beg len =
  if beg mod factor <> 0 then failwith ("get_raw_sentence: beg " ^ string_of_int beg) else
  if len mod factor <> 0 then failwith ("get_raw_sentence: len " ^ string_of_int len) else
  let buf = Buffer.create 512 in
  Int.iter (beg / factor - 1) (beg / factor + len / factor - 2) (fun i ->
    printf "%d" i; printf " %s\n%!" a.(i);
    Buffer.add_string buf a.(i));
  Buffer.contents buf

let extract_sentences par (paths,last) =
  let par = Array.of_list (Xunicode.utf8_chars_of_utf8_string par) in
  let paths,last = PrePaths.sort (paths,last) in
  let found = find_query [] paths in
  let pars = Xlist.fold found [] (fun pars set ->
    Xlist.fold paths [] (fun sentences t -> if IntSet.mem set t.id then
      let paths,last = get_sentence t in
      {pid=string_of_int t.id; pbeg=t.beg; plen=t.len;
       psentence=AltSentence[Raw,RawSentence (get_raw_sentence par t.beg t.len);
                             Struct,StructSentence(paths,last)]} :: sentences else sentences) :: pars) in
  match pars with
    [sentences] -> sentences
  | _ -> failwith "extract_sentences"