preSentences.ml 8.07 KB
(*
 *  ENIAM: Categorial Syntactic-Semantic Parser for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open Printf
open PreTypes
open Xstd

let find_bracket_begs tokens chart last beg_selector =
  Int.fold 0 last [] (fun found lnode ->
    Xlist.fold chart.(lnode) found (fun found (id,rnode) ->
      if beg_selector tokens id then (id,lnode,rnode) :: found else found))

let rec find_bracket_ends tokens chart mid_selector end_selector found nodes =
  if IntSet.is_empty nodes then found else
  let lnode = IntSet.min_elt nodes in
  let nodes = IntSet.remove nodes lnode in
  let found,nodes = Xlist.fold chart.(lnode) (found,nodes) (fun (found,nodes) (id,rnode) ->
    if end_selector tokens id then (id,lnode,rnode) :: found, nodes else
    if mid_selector tokens id then found, IntSet.add nodes rnode else
    found, nodes) in
  find_bracket_ends tokens chart mid_selector end_selector found nodes

let rec find_bracket_paths tokens chart mid_selector rnode map nodes =
  if IntSet.is_empty nodes then failwith "find_bracket_paths" else
  let lnode = IntSet.min_elt nodes in
  let ids = try IntMap.find map lnode with Not_found -> IntSet.empty in
  if lnode = rnode then IntSet.to_list ids else
  let nodes = IntSet.remove nodes lnode in
  let map,nodes = Xlist.fold chart.(lnode) (map,nodes) (fun (map,nodes) (id,rnode) ->
    if not (mid_selector tokens id) then map, nodes else
    let ids = IntSet.add ids id in
    let map = IntMap.add_inc map rnode ids (fun set -> IntSet.union set ids) in
    let nodes = IntSet.add nodes rnode in
    map,nodes) in
  find_bracket_paths tokens chart mid_selector rnode map nodes

let get_raw_sentence a beg len =
  let next = beg + len in
  let beg = beg / factor + (if beg mod factor < 50 then 0 else 1) in
  let next = next / factor + (if next mod factor < 50 then 0 else 1) in
  (* if beg mod factor <> 0 then failwith ("get_raw_sentence: beg " ^ string_of_int beg) else
  if len mod factor <> 0 then failwith ("get_raw_sentence: len " ^ string_of_int len) else *)
  let buf = Buffer.create 512 in
  (* Int.iter (beg / factor - 1) (beg / factor + len / factor - 2) (fun i -> *)
  Int.iter beg (next - 1) (fun i ->
    (* printf "%d" i; printf " %s\n%!" a.(i); *)
    Buffer.add_string buf  a.(i)(*try a.(i) with _ -> "<" ^ string_of_int i ^ ">"*));
  Buffer.contents buf

let parse_bracket_rule paragraph tokens chart last beg_selector mid_selector end_selector command =
  let begs = find_bracket_begs tokens chart last beg_selector in
  let found = Xlist.fold begs [] (fun found (beg_id,beg_l,beg_r) ->
    let ends = find_bracket_ends tokens chart mid_selector end_selector [] (IntSet.singleton beg_r) in
    (* if ends = [] then failwith "parse_bracket_rule: end not found" else *)
    Xlist.fold ends found (fun found (end_id,end_l,end_r) ->
      let ids = find_bracket_paths tokens chart mid_selector end_l IntMap.empty (IntSet.singleton beg_r) in
      (beg_id,ids,end_id,beg_l,end_r) :: found)) in
  Xlist.iter found (fun (beg_id,ids,end_id,lnode,rnode) ->
    let beg_t = ExtArray.get tokens beg_id in
    let end_t = ExtArray.get tokens end_id in
    let beg = beg_t.beg in
    let len = end_t.beg+end_t.len-beg_t.beg in
    let id = ExtArray.add tokens {empty_token with orth=get_raw_sentence paragraph beg len;
                                                   beg=beg; len=len; next=end_t.next;
                                                   token=command (*tokens*) (beg_id::end_id::ids)} in
    chart.(lnode) <- (id,rnode) :: chart.(lnode))

let find_sentence paragraph tokens chart last =
  parse_bracket_rule paragraph tokens chart last
    (fun tokens id -> (ExtArray.get tokens id).token = Interp "<sentence>")
    (fun tokens id ->
      let t = (ExtArray.get tokens id).token in
      t <> Interp "<sentence>" && t <> Interp "</sentence>")
    (fun tokens id -> (ExtArray.get tokens id).token = Interp "</sentence>")
    (fun ids -> Tokens("sentence",ids))

let find_quoted_sentences paragraph tokens chart last =
  parse_bracket_rule paragraph tokens chart last
    (fun tokens id -> (ExtArray.get tokens id).token = Interp "„s")
    (fun tokens id ->
      match (ExtArray.get tokens id).token with
        Tokens("sentence",_) -> true
      | _ -> false)
    (fun tokens id -> (ExtArray.get tokens id).token = Interp "”s")
    (fun ids -> Tokens("quoted_sentences",ids))

let find_query paragraph tokens chart last =
  parse_bracket_rule paragraph tokens chart last
    (fun tokens id -> (ExtArray.get tokens id).token = Interp "<query>")
    (fun tokens id ->
      match (ExtArray.get tokens id).token with
        Tokens("sentence",_) -> true
      | Tokens("quoted_sentences",_) -> true
      | _ -> false)
    (fun tokens id -> (ExtArray.get tokens id).token = Interp "</query>")
    (fun ids -> Tokens("query",ids))

let extract_sentences paragraph tokens chart last =
  (* let paragraph = Array.of_list (Xunicode.utf8_chars_of_utf8_string paragraph) in *)
  let paths = Int.fold 0 last [] (fun paths lnode ->
    Xlist.fold chart.(lnode) paths (fun paths (id,rnode) ->
      (id,lnode,rnode) :: paths)) in
  [{pid="xx"; pbeg=0; plen=0;
    psentence=AltSentence[Raw,RawSentence paragraph;
                          ENIAM,StructSentence("",paths,last)]}]

(*
let is_sentence = function
    Sentence _ -> true
  | _ -> false

let get_sentence t =
  match t.token with
    Sentence(paths,last) -> paths,last
  | _ -> failwith "get_sentence"

let rec find_query2 found map = function
    [] -> found
  | t :: l ->
      if not (IntMap.mem map t.beg) then find_query2 found map l else
      if t.token = Interp "</query>" then find_query2 ((IntMap.find map t.beg) :: found) map l else
      if not (is_sentence t.token) then find_query2 found map l else
      let tokens = IntSet.add (IntMap.find map t.beg) t.id in
      find_query2 found (IntMap.add_inc map t.next tokens (fun tokens2 -> IntSet.union tokens tokens2)) l

let rec find_query found = function
    [] -> failwith "find_query"
  | t :: l ->
      if t.beg = 0 && t.token = Interp "<query>" then
        find_query2 found (IntMap.add IntMap.empty t.next IntSet.empty) l
      else
       if t.beg > 0 then found else find_query found l

let add_struct_sentence_ids sentences =
  match sentences with
    [{psentence=AltSentence[Raw,s;ENIAM,StructSentence(_,paths,last)]} as p] ->
        [{p with psentence=AltSentence[Raw,s;ENIAM,StructSentence("E",paths,last)]}]
  | _ -> fst (Xlist.fold sentences ([],1) (fun (l,n) -> function
        {psentence=AltSentence[Raw,s;ENIAM,StructSentence(_,paths,last)]} as p ->
           {p with psentence=AltSentence[Raw,s;ENIAM,StructSentence("E" ^ string_of_int n,paths,last)]} :: l, n+1
      | _ -> failwith "add_struct_sentence_ids"))

let extract_sentences par (paths,last) =
  let par = Array.of_list (Xunicode.utf8_chars_of_utf8_string par) in
  let paths,last = PrePaths.sort (paths,last) in
  let found = find_query [] paths in
  let pars = Xlist.fold found [] (fun pars set ->
    Xlist.fold paths [] (fun sentences t -> if IntSet.mem set t.id then
      let paths,last = get_sentence t in
      {pid=string_of_int t.id; pbeg=t.beg; plen=t.len;
       psentence=AltSentence[Raw,RawSentence (get_raw_sentence par t.beg t.len);
                             ENIAM,StructSentence("",paths,(*last*)10)]} :: sentences else sentences) :: pars) in (* FIXME: (*last*)10 !!!! *)
  match pars with
    [sentences] -> add_struct_sentence_ids sentences
  | _ -> failwith "extract_sentences"
*)