(* ENIAMpreIntegration.ml 6.2 KB *)
(*
 *  ENIAMintegration, a library that integrates ENIAM with other parsers.
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>, Jan Lupa, Daniel Oklesiński
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open ENIAMtokenizerTypes
open ENIAMsubsyntaxTypes

let concraft_enabled=ref false
let mate_parser_enabled=ref false
let swigra_enabled=ref false
let polfie_enabled=ref false


(*
NOTE: To use Concraft, first start its server from the command line:
concraft-pl server --inmodel ../concraft/nkjp-model-0.2.gz
*)

let read_whole_channel c =
  let r = ref [] in
  try
    while true do
      r := (input_line c) :: !r
    done;
    !r
  with End_of_file -> List.rev (!r)

let rec process_concraft_result orth lemma interp others rev = function
    [] -> List.rev ((orth,(lemma,interp) :: others) :: rev)
  | "" :: l -> process_concraft_result orth lemma interp others rev l
  | line :: l ->
      (match Xstring.split_delim "\t" line with
        [orth2;s] when s = "none" || s = "space" ->
           if orth = "" then process_concraft_result orth2 lemma interp others rev l
           else process_concraft_result orth2 "" "" [] ((orth,(lemma,interp) :: others) :: rev) l
      | ["";lemma2;interp2] -> process_concraft_result orth lemma interp ((lemma2,interp2) :: others) rev l
      | ["";lemma;interp;"disamb"] -> process_concraft_result orth lemma interp others rev l
      | _ -> failwith ("process_concraft_result: " ^ line))

let concraft_parse s =
  let concraft_in, concraft_out, concraft_err =
    Unix.open_process_full ("echo \"" ^ s ^ "\" | concraft-pl client")
      [|"PATH=" ^ Sys.getenv "PATH"; "LANG=en_GB.UTF-8"|] in
  let err_msg = String.concat "\n" (read_whole_channel concraft_err) in
  let result = read_whole_channel concraft_in in
  if err_msg <> "" then failwith err_msg else
  process_concraft_result "" "" "" [] [] result

(*let rec load_concraft_sentence white orth rev ic =
  (* print_endline "load_concraft_sentence 1"; *)
  (* print_endline ("concraft error message: " ^ input_line concraft_err); *)
  let s = input_line ic in
  (* print_endline ("load_concraft_sentence: " ^ s); *)
  if s = "" then List.rev rev else
  match Xstring.split_delim "\t" s with
    [""; lemma; interp; "disamb"] -> load_concraft_sentence "" "" ((white,orth,lemma,interp) :: rev) ic
  | [""; lemma; interp] -> load_concraft_sentence white orth rev ic
  | [orth; white] -> load_concraft_sentence white orth rev ic
  | _ -> failwith ("load_concraft_sentence: " ^ s)*)

let make_token (orth,l) =
  if l = [] then failwith "make_token 1" else
  let lemma,interp = List.hd l in
  let cat,interp = match Xstring.split ":" interp with
      cat :: l -> cat, [Xlist.map l (fun tag -> [tag])]
    | _ -> failwith ("make_token 2: " ^ orth ^ " " ^ lemma ^ " " ^ interp) in
  {empty_token_env with orth = orth; token = Lemma(lemma,cat,interp)}

let parse_mate tokens pbeg s =
  (* print_endline ("parse_mate: " ^ s); *)
  (* Printf.fprintf concraft_out "%s\n\n%!" s;
  let l = load_concraft_sentence "" "" [] concraft_in in *)
  let l = concraft_parse s in
  let l = Xlist.map l make_token in
  let l = {empty_token_env with token = Interp "<conll_root>"} :: l in
  let l = Xlist.map l (fun t -> ExtArray.add tokens t,-1,"") in
  let _ = ENIAM_CONLL.establish_lengths pbeg s l tokens in
  let dep_paths = Array.of_list l in
  (* parse_conll tokens dep_paths; *)
  dep_paths

let rec parse_mate_sentence tokens pbeg s =
  if not !concraft_enabled then RawSentence s
  else DepSentence(parse_mate tokens pbeg s)

let compare_mode (x,_) (y,_) = compare_mode x y

let rec parse_sentence mode tokens pbeg = function
    RawSentence s ->
      [Raw,RawSentence s] @
      (if !mate_parser_enabled then [Mate,parse_mate_sentence tokens pbeg s] else []) @
      (if !swigra_enabled then [Swigra,RawSentence s] else []) @
      (if !polfie_enabled then [POLFIE,RawSentence s] else [])
  | StructSentence(paths,last) -> [mode,StructSentence(paths,last)]
  | DepSentence(paths) -> [mode,DepSentence paths]
  | QuotedSentences sentences ->
      let sentences =Xlist.rev_map sentences (fun p ->
        let sentence = parse_sentence mode tokens p.beg p.sentence in (* FIXME: p.pbeg czy pbeg *)
        let sentence = match sentence with
            [_,s] -> s
          | _ -> failwith "ENIAMpreIntegration.parse_sentence" in
        {p with sentence=sentence}) in
      [mode,QuotedSentences(List.rev sentences)]
  | AltSentence l ->
      let l = List.flatten (Xlist.rev_map l (fun (mode,sentence) ->
        parse_sentence mode tokens pbeg sentence)) in
      [mode,AltSentence(Xlist.sort l compare_mode)]

let rec parse_paragraph mode tokens = function
    RawParagraph s -> RawParagraph s
  | StructParagraph sentences ->
      let sentences = Xlist.rev_map sentences (fun p ->
        let sentence = parse_sentence mode tokens p.beg p.sentence in
        let sentence = match sentence with
            [_,s] -> s
          | _ -> failwith "ENIAMpreIntegration.parse_paragraph" in
        {p with sentence=sentence}) in
      StructParagraph(List.rev sentences)
  | AltParagraph l ->
      let l = Xlist.rev_map l (fun (mode,paragraph) ->
        mode, parse_paragraph mode tokens paragraph) in
      AltParagraph(List.rev l)

let rec parse_text mode tokens = function
    RawText s -> RawText s
  | StructText paragraphs ->
      let paragraphs = Xlist.rev_map paragraphs (fun paragraph ->
        parse_paragraph mode tokens paragraph) in
      StructText(List.rev paragraphs)
  | AltText l -> AltText(Xlist.map l (fun (mode,text) ->
       mode, parse_text mode tokens text))