ENIAM_LCGlexiconParser.ml 16.8 KB
(*
 *  ENIAM_LCGlexicon is a library that provides LCG lexicon form Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This library is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Lesser General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open Xstd
open ENIAM_LCGtypes
open ENIAM_LCGlexiconTypes
open ENIAMcategoriesPL

let rec get_first n = function
    [] -> []
  | s :: l -> if n = 0 then [] else s :: (get_first (n-1) l)

let print_prefix n l =
  print_endline (String.concat " " (get_first n l))

let remove_comments line =
  try
    let n = String.index line '#' in
    String.sub line 0 n
  with Not_found -> line

let rec parse_phrase_names_rec rev = function
    "@WEIGHTS" :: tokens -> List.rev rev, "@WEIGHTS" :: tokens
  | "@LEXICON" :: tokens -> List.rev rev, "@LEXICON" :: tokens
  | t :: tokens -> parse_phrase_names_rec (t :: rev) tokens
  | [] -> failwith "parse_phrase_names_rec: unexpexted end of input"

let parse_phrase_names = function
    "@PHRASE_NAMES" :: tokens -> parse_phrase_names_rec [] tokens
  | s :: _ -> failwith ("parse_phrase_names: '@PHRASE_NAMES' expected while '" ^ s ^ "' found")
  | [] -> failwith "parse_phrase_names: unexpexted end of input"

let rec parse_weights_rec weights = function
    "@LEXICON" :: tokens -> weights, "@LEXICON" :: tokens
  | w :: "=" :: n :: tokens -> parse_weights_rec (StringMap.add weights w (float_of_string n)) tokens
  | s :: _ -> failwith ("parse_weights_rec: unexpexted token '" ^ s ^ "'")
  | [] -> failwith "parse_weights_rec: unexpexted end of input"

let parse_weights = function
    "@WEIGHTS" :: tokens -> parse_weights_rec StringMap.empty tokens
  | "@LEXICON" :: tokens -> StringMap.empty, "@LEXICON" :: tokens
  | s :: _ -> failwith ("parse_weights: '@WEIGHTS' expected while '" ^ s ^ "' found")
  | [] -> failwith "parse_weights: unexpexted end of input"

let rec split_semic found rev = function
    "lemma" :: "=" :: ";" :: l -> split_semic found (";" :: "=" :: "lemma" :: rev) l
  | ";" :: l -> split_semic (List.rev rev :: found) [] l
  | s :: l -> split_semic found (s :: rev) l
  | [] -> if rev = [] then List.rev found else List.rev ((List.rev rev) :: found)

let rec split_colon found rev = function
    "lemma" :: "=" :: ":" :: l -> split_colon found (":" :: "=" :: "lemma" :: rev) l
  | ":" :: l -> split_colon (List.rev rev :: found) [] l
  | s :: l -> split_colon found (s :: rev) l
  | [] -> if rev = [] then List.rev found else List.rev ((List.rev rev) :: found)

let rec split_comma found rev = function
    "lemma" :: "=" :: "," :: l -> split_comma found ("," :: "=" :: "lemma" :: rev) l
  | "," :: l -> split_comma (List.rev rev :: found) [] l
  | s :: l -> split_comma found (s :: rev) l
  | [] -> if rev = [] then List.rev found else List.rev ((List.rev rev) :: found)

let rec find_right_bracket rev = function
    "]" :: l -> List.rev rev, l
  | s :: l -> find_right_bracket (s :: rev) l
  | [] -> failwith "find_right_bracket"

let match_selectors = function
    s :: l -> (try selector_of_string s,l with _ -> failwith ("match_selectors: " ^ s))
  | [] -> failwith "match_selectors: empty"

let match_relation = function
  (* cat,"=" :: "=" :: l -> cat,StrictEq,l *)
  | cat,"!" :: "=" :: l -> cat,Neq,l
  | cat,"=" :: l -> cat,Eq,l
  | cat,s :: l -> failwith ("match_relation: " ^ (String.concat " " (s :: l)))
  | cat,[] -> failwith "match_relation: empty"

let rec split_mid rev = function
    [s] -> List.rev (s :: rev)
  | s :: "|" :: l -> split_mid (s :: rev) l
  | [] -> failwith "split_mid: empty"
  | l -> failwith ("split_mid: " ^ (String.concat " " l))

let rec check_value selector l =
  let vals = try SelectorMap.find selector_values selector
    with Not_found -> failwith ("check_value: invalid selector " ^ string_of_selector selector) in
  if vals = [] then () else
    Xlist.iter l (fun s ->
        if not (Xlist.mem vals s) then
          failwith ("check_value: invalid selector " ^ string_of_selector selector ^ "=" ^ s));
  l

let match_value = function
    cat,rel,[s] -> cat,rel,[s]
  | cat,rel,[] -> failwith "match_value: empty"
  | cat,rel,l -> cat,rel, check_value cat (split_mid [] l)

let parse_selectors l =
  (* print_endline s; *)
  (* let l = Xlist.map (Str.full_split (Str.regexp "|\\|,\\|=\\|!") s) (function
        Str.Text s -> s
      | Str.Delim s -> s) in *)
  let ll = split_comma [] [] l in
  let l = Xlist.rev_map ll match_selectors in
  let l = Xlist.rev_map l match_relation in
  let l = Xlist.rev_map l match_value in
  l

let manage_lemmata = function
    "lemma" :: "=" :: ":" :: "," :: tokens -> ["lemma";"=";":";","],tokens
  | "lemma" :: "=" :: ":" :: s :: "," :: tokens -> ["lemma";"=";":"^s;","],tokens
  | "lemma" :: "=" :: "<" :: "/" :: s :: "," :: tokens -> ["lemma";"=";"</"^s;","],tokens
  | tokens -> [],tokens


type syntax =
    A of string
  | B of internal_grammar_symbol
  | C of grammar_symbol
  | D of direction * grammar_symbol
  | E of (direction * grammar_symbol) list

let make_atoms phrase_names =
  SelectorMap.fold selector_values (StringSet.of_list phrase_names) (fun atoms _ l ->
      Xlist.fold l atoms StringSet.add)

let operators = StringSet.of_list [
    "*"; "+"; "/"; "|"; "\\"; "("; ")"; ","; "{"; "}"; "?"]

let find_internal_grammar_symbols atoms = function
  | "T" -> B Top
  | "1" -> C One
  | "schema" -> D(Both,Tensor[AVar "schema"])
  | "adjuncts" -> D(Both,Tensor[AVar "adjuncts"])
  | s -> if StringSet.mem selector_names s then B (AVar s) else
    if StringSet.mem atoms s then B (Atom s) else
    if StringSet.mem operators s then A s else
      failwith ("find_internal_grammar_symbols: unknown symbol " ^ s)

let rec find_tensor = function
    B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: A "*" :: B s4 :: A "*" :: B s5 :: A "*" :: B s6 :: A "*" :: B s7 :: A "*" :: B s8 :: l -> failwith "find_tensor 1"
  | B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: A "*" :: B s4 :: A "*" :: B s5 :: A "*" :: B s6 :: A "*" :: B s7 :: l -> C (Tensor[s1;s2;s3;s4;s5;s6;s7]) :: find_tensor l
  | B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: A "*" :: B s4 :: A "*" :: B s5 :: A "*" :: B s6 :: l -> C (Tensor[s1;s2;s3;s4;s5;s6]) :: find_tensor l
  | B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: A "*" :: B s4 :: A "*" :: B s5 :: l -> C (Tensor[s1;s2;s3;s4;s5]) :: find_tensor l
  | B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: A "*" :: B s4 :: l -> C (Tensor[s1;s2;s3;s4]) :: find_tensor l
  | B s1 :: A "*" :: B s2 :: A "*" :: B s3 :: l -> C (Tensor[s1;s2;s3]) :: find_tensor l
  | B s1 :: A "*" :: B s2 :: l -> C (Tensor[s1;s2]) :: find_tensor l
  | B s1 :: l -> C (Tensor[s1]) :: find_tensor l
  | A "*" :: _ -> failwith "find_tensor 2: unexpected '*'"
  | t :: l -> t :: find_tensor l
  | [] -> []

let rec find_plus = function
    C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: A "+" :: C s4 :: A "+" :: C s5 :: A "+" :: C s6 :: A "+" :: C s7 :: l -> failwith "find_plus 1"
  | C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: A "+" :: C s4 :: A "+" :: C s5 :: A "+" :: C s6 :: l -> C (Plus[s1;s2;s3;s4;s5;s6]) :: find_plus l
  | C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: A "+" :: C s4 :: A "+" :: C s5 :: l -> C (Plus[s1;s2;s3;s4;s5]) :: find_plus l
  | C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: A "+" :: C s4 :: l -> C (Plus[s1;s2;s3;s4]) :: find_plus l
  | C s1 :: A "+" :: C s2 :: A "+" :: C s3 :: l -> C (Plus[s1;s2;s3]) :: find_plus l
  | C s1 :: A "+" :: C s2 :: l -> C (Plus[s1;s2]) :: find_plus l
  | A "+" :: _ -> failwith "find_plus 2: unexpected '+'"
  | t :: l -> t :: find_plus l
  | [] -> []

let rec find_paren = function
    A "(" :: C s :: A ")" :: l -> C s :: find_paren l
  | s :: l -> s :: find_paren l
  | [] -> []

let rec find_imp = function
  | C s1 :: A "/" :: C s2 :: l -> C (Imp(s1,Forward,s2)) :: find_imp l
  | C s1 :: A "|" :: C s2 :: l -> C (Imp(s1,Both,s2)) :: find_imp l
  | C s1 :: A "\\" :: C s2 :: l -> C (Imp(s1,Backward,s2)) :: find_imp l
  | s :: l -> s :: find_imp l
  | [] -> []

let rec find_maybe = function
  | A "?" :: C s2 :: l -> C (Maybe s2) :: find_maybe l
  | A "?" :: _ -> failwith "find_maybe 1: unexpected '?'"
  | s :: l -> s :: find_maybe l
  | [] -> []

let rec find_mult_imp = function
  | A "{" :: A "/" :: C s2 :: l -> A "{" :: D (Forward,s2) :: find_mult_imp l
  | A "{" :: A "|" :: C s2 :: l -> A "{" :: D (Both,s2) :: find_mult_imp l
  | A "{" :: A "\\" :: C s2 :: l -> A "{" :: D (Backward,s2) :: find_mult_imp l
  | A "," :: A "/" :: C s2 :: l -> A "," :: D (Forward,s2) :: find_mult_imp l
  | A "," :: A "|" :: C s2 :: l -> A "," :: D (Both,s2) :: find_mult_imp l
  | A "," :: A "\\" :: C s2 :: l -> A "," :: D (Backward,s2) :: find_mult_imp l
  | A "/" :: _ -> failwith "find_mult_imp 1: unexpected '/'"
  | A "|" :: _ -> failwith "find_mult_imp 2: unexpected '|'"
  | A "\\" :: _ -> failwith "find_mult_imp 3: unexpected '\\'"
  | A "(" :: _ -> failwith "find_mult_imp 4: unexpected '('"
  | A ")" :: _ -> failwith "find_mult_imp 5: unexpected ')'"
  | s :: l -> s :: find_mult_imp l
  | [] -> []

let rec find_mult = function
    A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "," :: D(s5,t5) :: A "," :: D(s6,t6) :: A "," :: D(s7,t7) :: A "," :: D(s8,t8) :: A "," :: D(s9,t9) :: A "," :: D(s10,t10) :: A "," :: D _ :: l -> failwith "find_mult 1: to many elements in { }"
  | A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "," :: D(s5,t5) :: A "," :: D(s6,t6) :: A "," :: D(s7,t7) :: A "," :: D(s8,t8) :: A "," :: D(s9,t9) :: A "," :: D(s10,t10) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3;s4,t4;s5,t5;s6,t6;s7,t7;s8,t8;s9,t9;s10,t10] :: find_mult l
  | A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "," :: D(s5,t5) :: A "," :: D(s6,t6) :: A "," :: D(s7,t7) :: A "," :: D(s8,t8) :: A "," :: D(s9,t9) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3;s4,t4;s5,t5;s6,t6;s7,t7;s8,t8;s9,t9] :: find_mult l
  | A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "," :: D(s5,t5) :: A "," :: D(s6,t6) :: A "," :: D(s7,t7) :: A "," :: D(s8,t8) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3;s4,t4;s5,t5;s6,t6;s7,t7;s8,t8] :: find_mult l
  | A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "," :: D(s5,t5) :: A "," :: D(s6,t6) :: A "," :: D(s7,t7) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3;s4,t4;s5,t5;s6,t6;s7,t7] :: find_mult l
  | A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "," :: D(s5,t5) :: A "," :: D(s6,t6) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3;s4,t4;s5,t5;s6,t6] :: find_mult l
  | A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "," :: D(s5,t5) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3;s4,t4;s5,t5] :: find_mult l
  | A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "," :: D(s4,t4) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3;s4,t4] :: find_mult l
  | A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "," :: D(s3,t3) :: A "}" :: l -> E[s1,t1;s2,t2;s3,t3] :: find_mult l
  | A "{" :: D(s1,t1) :: A "," :: D(s2,t2) :: A "}" :: l -> E[s1,t1;s2,t2] :: find_mult l
  | A "{" :: D(s1,t1) :: A "}" :: l -> E[s1,t1] :: find_mult l
  | A "{" :: _ -> failwith "find_mult 2: unexpected '{'"
  | A "}" :: _ -> failwith "find_mult 3: unexpected '}'"
  | A "," :: _ -> failwith "find_mult 4: unexpected ','"
  | t :: l -> t :: find_mult l
  | [] -> []

let rec apply_mult = function
    C s :: E t :: l -> apply_mult (C (ImpSet(s,t)) :: l)
  | [C s] -> C s
  | _ -> failwith "apply_mult"

let parse_syntax atoms l =
  (* print_endline s; *)
  (* let l = Xlist.map (Str.full_split (Str.regexp "?\\|}\\|{\\|,\\|*\\|/\\|+\\|)\\|(\\||\\|\\") s) (function
        Str.Text s -> s
      | Str.Delim s -> s) in *)
  let l = List.rev (Xlist.rev_map l (find_internal_grammar_symbols atoms)) in
  let l = find_tensor l in
  let l = find_plus l in
  let l = find_paren l in
  let l = find_maybe l in
  let l = find_imp l in
  let l = find_paren l in
  let l = find_imp l in
  let l = find_paren l in
  let l = find_imp l in
  let l = find_paren l in
  let l = find_mult_imp l in
  let l = find_mult l in
  match apply_mult l with
    C s -> s
  | _ -> failwith "parse_syntax"

let check_quant_range cat l =
  let set = StringSet.of_list (
      try SelectorMap.find selector_values cat
      with Not_found -> failwith ("check_quant_range: " ^ string_of_selector cat)) in
  if StringSet.is_empty set then () else
    Xlist.iter l (fun v ->
        if not (StringSet.mem set v) then
          failwith ("check_quant_range: " ^ string_of_selector cat ^ "=" ^ v))

let parse_quant_range = function
    _,["0"] -> Zero
  | _,["T"] -> Top
  | _,["all_numbers"] -> ENIAM_LCGrenderer.make_quant_restriction all_numbers
  | _,["all_cases"] -> ENIAM_LCGrenderer.make_quant_restriction all_cases
  | _,["all_genders"] -> ENIAM_LCGrenderer.make_quant_restriction all_genders
  | _,["all_persons"] -> ENIAM_LCGrenderer.make_quant_restriction all_persons
  | cat,l ->
    let l = Xstring.split "&" (String.concat "" l) in
    check_quant_range cat l;
    ENIAM_LCGrenderer.make_quant_restriction l

let parse_quantifiers tokens =
  Xlist.map (split_comma [] [] tokens) (function
        cat :: "=" :: tokens ->
        let cat = selector_of_string cat in
        cat, parse_quant_range (cat,tokens)
      | t :: _ -> failwith ("parse_quantifiers: unexpected token '" ^ t ^ "'")
      | [] -> failwith "parse_quantifiers: no token")

let parse_raised tokens =
  Xlist.map (split_comma [] [] tokens) (function
        [cat] -> selector_of_string cat
      | t :: _ -> failwith ("parse_raised: unexpected token '" ^ t ^ "'")
      | [] -> failwith "parse_raised: no token")

let rec find_syntax_end rev = function
    ("BRACKET" :: _) as tokens -> List.rev rev, tokens
  | ("QUANT" :: "[" :: _) as tokens -> List.rev rev, tokens
  | ("RAISED" :: "[" :: _) as tokens -> List.rev rev, tokens
  | ("SEM" :: "[" :: _) as tokens -> List.rev rev, tokens
  | s :: tokens -> find_syntax_end (s :: rev) tokens
  | [] -> List.rev rev, []

let parse_sem_term sem_term = String.concat "" sem_term

let rec parse_rule atoms = function
    "BRACKET" :: tokens -> Bracket :: parse_rule atoms tokens
  | "QUANT" :: "[" :: tokens ->
    let quant,tokens = find_right_bracket [] tokens in
    Quant(parse_quantifiers quant) :: parse_rule atoms tokens
  | "RAISED" :: "[" :: tokens ->
    let raised,tokens = find_right_bracket [] tokens in
    Raised(parse_raised raised) :: parse_rule atoms tokens
  | "SEM" :: "[" :: tokens ->
    let sem_term,tokens = find_right_bracket [] tokens in
    Sem(parse_sem_term sem_term) :: parse_rule atoms tokens
  | [] -> []
  | tokens ->
    let syntax,tokens = find_syntax_end [] tokens in
    (* print_prefix 100 tokens; *)
    Syntax(parse_syntax atoms syntax) :: parse_rule atoms tokens

let parse_entry atoms weights tokens =
  let prefix,tokens = manage_lemmata tokens in
  let selectors, rule, weight =
    match split_colon [] [] tokens with
      [selectors;rule] -> selectors, rule, 0.
    | [selectors;rule;[weight]] -> selectors, rule,
                                   (try StringMap.find weights weight
                                    with Not_found -> failwith ("parse_entry: unknown weight symbol '" ^ weight ^ "'"))
    | _ -> failwith ("parse_entry: invalid number of ':' in entry " ^ (String.concat " " tokens)) in
  let selectors = parse_selectors (prefix @ selectors) in
  let rule = parse_rule atoms rule in
  selectors, rule, weight

let parse_lexicon atoms weights = function
    "@LEXICON" :: tokens ->
    let entries = split_semic [] [] tokens in
    List.rev (Xlist.rev_map entries (parse_entry atoms weights))
  | s :: _ -> failwith ("parse_lexicon: '@LEXICON' expected while '" ^ s ^ "' found")
  | [] -> failwith "parse_lexicon: unexpexted end of input"

let load_lexicon filename =
  let lines = File.load_lines filename in
  let lines = List.rev (Xlist.rev_map lines remove_comments) in
  let tokens = List.flatten (Xlist.rev_map lines (fun line ->
      Xlist.rev_map (Str.full_split
                       (Str.regexp "\\]\\| \\|\t\\|\r\\|\\?\\|:\\|;\\|&\\|!\\|=\\|}\\|{\\|,\\|\\*\\|/\\|\\+\\|)\\|(\\||\\|\\[\\|\\") line) (function
            Str.Text s -> s
          | Str.Delim s -> s))) in
  let tokens = Xlist.fold tokens [] (fun tokens -> function
        " " -> tokens
      | "\t" -> tokens
      | "\r" -> tokens
      | t -> t :: tokens) in
  let phrase_names,tokens = parse_phrase_names tokens in
  let atoms = make_atoms phrase_names in
  let weights,tokens = parse_weights tokens in
  let lexicon = parse_lexicon atoms weights tokens in
  lexicon