Commit 2f308cb1e0849b51145883f2b46db45c41f33574
1 parent 766cb2a4
adding resources with multi-word units
Showing 12 changed files with 27932 additions and 223 deletions.
Too many changes to show: to preserve performance, only 6 of 12 files are displayed.
morphology2/TODO
subsyntax/ENIAM_MWE.ml
... | ... | @@ -21,227 +21,197 @@ open Xstd |
21 | 21 | open ENIAMsubsyntaxTypes |
22 | 22 | open ENIAMtokenizerTypes |
23 | 23 | |
24 | -let load_dict dict filename = | |
24 | +type sel = V of string | S of string | G | |
25 | + | |
26 | +type t = | |
27 | + L of string * string * sel list | |
28 | + | O of string | |
29 | + | D of string * string | |
30 | + | |
31 | +let process_interp lemma interp = | |
32 | + match Xstring.split ":" interp with | |
33 | + cat :: interp -> L(lemma,cat,Xlist.map interp (function | |
34 | + "$c" -> S "c" | |
35 | + | "$n" -> S "n" | |
36 | + | "$g" -> S "g" | |
37 | + | "$d" -> S "d" | |
38 | + | "$C" -> S "C" | |
39 | + | "_" -> G | |
40 | + | s -> if String.get s 0 = '$' then failwith ("process_interp: " ^ s) else V s)) | |
41 | + | _ -> failwith "process_interp" | |
42 | + | |
43 | +let load_mwe_dict dict filename = | |
25 | 44 | File.fold_tab filename dict (fun dict -> function |
26 | - [orth; lemma; interp] -> | |
27 | - let s = List.hd (Str.split_delim (Str.regexp " ") orth) in | |
28 | - StringMap.add_inc dict s [orth,lemma,interp] (fun l -> (orth,lemma,interp) :: l) | |
45 | + [orths; lemma; interp] -> | |
46 | + let orths = Xstring.split " " orths in | |
47 | + if orths = [] then failwith "load_mwe_dict" else | |
48 | + let s = List.hd orths in | |
49 | + let orths = Xlist.map orths (fun s -> O s) in | |
50 | + let lemma,cat,interp = match process_interp lemma interp with | |
51 | + L(lemma,cat,interp) -> lemma,cat,interp | |
52 | + | _ -> failwith "load_mwe_dict2" in | |
53 | + StringMap.add_inc dict s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l) | |
29 | 54 | | l -> failwith ("load_mwe_dict '" ^ String.concat "\t" l ^ "'")) |
30 | 55 | |
31 | -let mwe_dict = | |
32 | - let dict = load_dict StringMap.empty brev_filename in | |
33 | - let dict = try load_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in | |
34 | -(* let dict = load_dict dict complete_entries_filename in*) | |
35 | - let dict = load_dict dict mwe_filename in | |
36 | - dict | |
37 | - | |
38 | -let preselect_dict orths dict = | |
39 | - StringSet.fold orths [] (fun rules orth -> | |
40 | - try | |
41 | - let l = StringMap.find dict orth in | |
42 | - Xlist.fold l rules (fun rules (orth,lemma,interp) -> | |
43 | - (* print_endline ("preselect_dict: " ^ orth); *) | |
44 | - let match_list = Str.split (Str.regexp " ") orth in | |
45 | - let b = Xlist.fold match_list true (fun b s -> | |
46 | - (* if not (StringSet.mem orths s) then print_endline s; *) | |
47 | - StringSet.mem orths s && b) in | |
48 | - if b then (match_list,lemma,interp) :: rules else rules) | |
49 | - with Not_found -> rules) | |
50 | - | |
51 | - | |
52 | -(* | |
53 | -type matching = { | |
54 | - prefix: tokens list; | |
55 | - matched: token_record list; | |
56 | - suffix: tokens list; | |
57 | - pattern: pat list; | |
58 | - command: token_record list -> token; | |
59 | - last: int | |
60 | - } | |
61 | - | |
62 | -let rec find_abr_pattern_tail matchings found = function | |
63 | - [] -> found | |
64 | - | token :: l -> | |
65 | - let matchings,found = Xlist.fold matchings ([],found) (fun (matchings,found) matching -> | |
66 | - match matching.pattern with | |
67 | - [pat] -> | |
68 | - let matchings = if token.beg <= matching.last then matching :: matchings else matchings in | |
69 | - if PrePatterns.match_token (pat,token.token) && token.beg = matching.last then | |
70 | - matchings, {matching with matched = token :: matching.matched; last=token.next; pattern=[]} :: found else | |
71 | - matchings, found | |
72 | - | pat :: pattern -> | |
73 | - let matchings = if token.beg <= matching.last then matching :: matchings else matchings in | |
74 | - if PrePatterns.match_token (pat,token.token) && token.beg = matching.last then | |
75 | - {matching with matched = token :: matching.matched; last=token.next; pattern=pattern} :: matchings, found else | |
76 | - matchings, found | |
77 | - | [] -> matchings, matching :: found) in | |
78 | - if matchings = [] then found else find_abr_pattern_tail matchings found l | |
79 | - | |
80 | -let rec find_abr_pattern all_matchings found = function | |
81 | - token :: l -> | |
82 | - let matchings = Xlist.fold all_matchings [] (fun matchings matching -> | |
83 | - match matching.pattern with | |
84 | - pat :: pattern -> | |
85 | - (if PrePatterns.match_token (pat,token.token) then | |
86 | - [{matching with matched = token :: matching.matched; last=token.next; pattern=pattern}] else []) @ matchings | |
87 | - | _ -> failwith "find_abr_pattern: ni") in | |
88 | - let found = if matchings = [] then found else find_abr_pattern_tail matchings found l in | |
89 | - find_abr_pattern all_matchings found l | |
90 | - | [] -> found | |
91 | - | |
92 | -let rec make_abr_orth = function | |
93 | - [] -> "" | |
94 | - | [t] -> t.orth | |
95 | - | t :: l -> if t.beg + t.len = t.next then t.orth ^ (make_abr_orth l) else t.orth ^ " " ^ (make_abr_orth l) | |
96 | - | |
97 | -let find_abr_patterns patterns tokens = | |
98 | - let found = find_abr_pattern (Xlist.map patterns (fun pattern -> | |
99 | - {prefix=[]; matched=[]; suffix=[]; pattern=pattern; command=(fun _ -> Symbol ""); last=0})) [] tokens in | |
100 | - Xlist.rev_map found (fun matching -> | |
101 | - let t1 = List.hd (List.rev matching.matched) in | |
102 | - let t2 = List.hd matching.matched in | |
103 | - t1.beg, | |
104 | - t2.beg + t2.len - t1.beg, | |
105 | - t2.next, | |
106 | - make_abr_orth (List.rev matching.matched)) | |
107 | - | |
108 | -let split_interp line gloss interp = | |
109 | - if interp = "xxx" then [gloss, "xxx"] else | |
110 | - Xlist.map (Str.split (Str.regexp " ") interp) (fun s -> | |
111 | - match Str.split (Str.regexp "|") s with | |
112 | - [lemma;interp] -> lemma, interp | |
113 | - | _ -> failwith ("bad brev entry: " ^ line)) | |
114 | - | |
115 | -let load_brev_dict () = | |
116 | - let lines = File.load_lines "data/brev_20151215.tab" in | |
117 | - List.rev (Xlist.rev_map lines (fun line -> | |
118 | - match Str.split_delim (Str.regexp "\t") line with | |
119 | - [_; orth; gloss; interp; _] -> Str.split (Str.regexp " ") orth, split_interp line gloss interp | |
120 | - | [_; orth; gloss; interp] -> Str.split (Str.regexp " ") orth, split_interp line gloss interp | |
121 | - | _ -> failwith ("load_brev_dict: " ^ line))) | |
122 | - | |
123 | -let parse_lemma lemma = | |
124 | - if lemma = ":" then lemma,"" else | |
125 | - match Str.split (Str.regexp ":") lemma with | |
126 | - [x] -> x,"" | |
127 | - | [x;y] -> x,y | |
128 | - | _ -> failwith ("parse_lemma: " ^ lemma) | |
129 | - | |
130 | -let make_orths orth beg len lexeme_postags_list = | |
131 | - let n = Xlist.size lexeme_postags_list in | |
132 | - let orth_list = | |
133 | - if n = 1 then [orth,beg,len] else | |
134 | - List.rev (Int.fold 1 n [] (fun l i -> | |
135 | - (orth ^ "_" ^ string_of_int i, | |
136 | - (if i=1 then beg else beg+len-n+i-1), | |
137 | - if i=1 then len-n+1 else 1) :: l)) in | |
138 | - List.rev (Xlist.fold (List.combine orth_list lexeme_postags_list) [] (fun orth_list ((orth,beg,len),(lemma,postags)) -> | |
139 | - (orth, fst (parse_lemma lemma), ENIAMtokens.parse_postags postags, beg, len) :: orth_list)) | |
140 | - | |
141 | -let brev_dict = load_brev_dict () | |
142 | - | |
143 | -(* FIXME: trzeba zmienić reprezentację skrótów nazw własnych: przenieść do mwe, | |
144 | - Gdy skrót jest częścią nazwy własnej powinien być dalej przetwarzalny *) | |
145 | -let process_brev paths (*tokens*) = paths | |
146 | -(* let paths = Xlist.fold brev_dict paths (fun paths (pattern,lexeme_postags_list) -> | |
147 | - let matchings_found = find_abr_patterns [Xlist.map pattern (fun pat -> O pat)] tokens in | |
148 | - Xlist.fold matchings_found paths (fun paths (beg,len,next,orth) -> | |
149 | - let orths = make_orths orth beg len lexeme_postags_list in | |
150 | - ENIAMpaths.add_path paths beg next orths)) in | |
151 | - paths*) | |
152 | - | |
153 | -let rec preselect_mwe_dict_token set = function | |
154 | - SmallLetter orth -> StringSet.add set orth | |
155 | - | CapLetter(orth,lc) -> StringSet.add set orth | |
156 | - | AllSmall orth -> StringSet.add set orth | |
157 | - | AllCap(orth,lc,lc2) -> StringSet.add set orth | |
158 | - | FirstCap(orth,lc,_,_) -> StringSet.add set orth | |
159 | - | SomeCap orth -> StringSet.add set orth | |
160 | - | Symbol orth -> StringSet.add set orth | |
161 | - | Dig(v,"dig") -> StringSet.add set v | |
162 | - | Other2 orth -> StringSet.add set orth | |
163 | - | _ -> set | |
164 | - | |
165 | -let rec preselect_mwe_dict_tokens set = function | |
166 | - Token t -> preselect_mwe_dict_token set t.token | |
167 | - | Seq l -> Xlist.fold l set preselect_mwe_dict_tokens | |
168 | - | Variant l -> Xlist.fold l set preselect_mwe_dict_tokens | |
169 | - | |
170 | -let preselect_mwe_dict mwe_dict tokens = | |
171 | - let set = Xlist.fold tokens StringSet.empty preselect_mwe_dict_tokens in | |
172 | - let set = StringSet.fold set StringSet.empty (fun set orth -> | |
173 | - try | |
174 | - let l = StringMap.find mwe_dict orth in | |
175 | - Xlist.fold l set StringSet.add | |
176 | - with Not_found -> set) in | |
177 | -(* StringSet.iter set print_endline; *) | |
178 | - StringSet.fold set [] (fun l s -> | |
179 | - match Str.split_delim (Str.regexp "\t") s with | |
180 | - [lemma; interp; sense] -> | |
181 | - (match Str.split_delim (Str.regexp ":") interp with | |
182 | - orths :: tags -> (Str.split (Str.regexp " ") orths, lemma, String.concat ":" tags, sense) :: l | |
183 | - | _ -> failwith "preselect_mwe_dict") | |
184 | - | _ -> failwith "preselect_mwe_dict") | |
185 | - | |
186 | -let simplify_lemma lemma = | |
187 | - match Str.split (Str.regexp "-") lemma with | |
188 | - [x;"1"] -> x | |
189 | - | [x;"2"] -> x | |
190 | - | [x;"3"] -> x | |
191 | - | [x;"4"] -> x | |
192 | - | [x;"5"] -> x | |
193 | - | _ -> lemma | |
194 | - | |
195 | -let mwe_dict = load_mwe_dict () | |
196 | - | |
197 | -let process_mwe paths (*tokens*) = paths | |
198 | -(* let mwe_dict = preselect_mwe_dict mwe_dict tokens in | |
199 | - let paths = Xlist.fold mwe_dict paths (fun paths (pattern,lexeme,interp,sense) -> | |
200 | - let matchings_found = find_abr_patterns [Xlist.map pattern (fun pat -> O pat)] tokens in | |
201 | - Xlist.fold matchings_found paths (fun paths (beg,len,next,orth) -> | |
202 | - let orths = make_orths orth beg len [simplify_lemma lexeme,interp] in | |
203 | - ENIAMpaths.add_path paths beg next orths)) in | |
204 | - paths*) | |
205 | -*) | |
56 | +let process_orth = function | |
57 | + [Lexer.T lemma; Lexer.B("(",")",[Lexer.T interp])] -> process_interp lemma interp | |
58 | + | [Lexer.T orth] -> O orth | |
59 | + | [Lexer.B("{","}",l); Lexer.B("(",")",[Lexer.T interp])] -> process_interp (Lexer.string_of_token_list l) interp | |
60 | + | [Lexer.B("{","}",l)] -> O(Lexer.string_of_token_list l) | |
61 | + | tokens -> failwith ("process_orth: " ^ Lexer.string_of_token_list tokens) | |
62 | + | |
63 | +let load_mwe_dict2 (dict,dict2) filename = | |
64 | + File.fold_tab filename (dict,dict2) (fun (dict,dict2) -> function | |
65 | + [orths; lemma] -> | |
66 | + (* print_endline (orths ^ "\t" ^ lemma); *) | |
67 | + let tokens = Lexer.split "(\\|)\\|{\\|}\\| " orths in | |
68 | + (* print_endline ("load_dict2 1: " ^ Lexer.string_of_token_list tokens); *) | |
69 | + let tokens = Lexer.find_brackets ["{","}";"(",")"] [] tokens in | |
70 | + (* print_endline ("load_dict2 2: " ^ Lexer.string_of_token_list tokens); *) | |
71 | + let orths = List.rev (Xlist.rev_map (Lexer.split_symbol (Lexer.T " ") [] tokens) process_orth) in | |
72 | + let tokens = Lexer.split "(\\|)\\|{\\|}" lemma in | |
73 | + (* print_endline ("load_dict2 3: " ^ Lexer.string_of_token_list tokens); *) | |
74 | + let tokens = Lexer.find_brackets ["{","}";"(",")"] [] tokens in | |
75 | + (* print_endline ("load_dict2 4: " ^ Lexer.string_of_token_list tokens); *) | |
76 | + let lemma,cat,interp = match process_orth tokens with | |
77 | + L(lemma,cat,interp) -> lemma,cat,interp | |
78 | + | _ -> failwith "load_mwe_dict2" in | |
79 | + if orths = [] then failwith "load_mwe_dict2" else | |
80 | + (match List.hd orths with | |
81 | + L(s,_,_) -> dict, StringMap.add_inc dict2 s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l) | |
82 | + | O s -> StringMap.add_inc dict s [orths,lemma,cat,interp] (fun l -> (orths,lemma,cat,interp) :: l), dict2 | |
83 | + | D _ -> failwith "load_mwe_dict2") | |
84 | + | l -> failwith ("load_mwe_dict2 '" ^ String.concat "\t" l ^ "'")) | |
85 | + | |
86 | +let mwe_dict,mwe_dict2 = | |
87 | + let dict = load_mwe_dict StringMap.empty brev_filename in | |
88 | + let dict = try load_mwe_dict dict fixed_filename with _ -> (prerr_endline ("ENIAMsubsyntax file " ^ fixed_filename ^ " not found"); dict) in | |
89 | + let dict = load_mwe_dict dict mwe_filename in | |
90 | + let dict,dict2 = load_mwe_dict2 (dict,StringMap.empty) sejf_filename in | |
91 | + let dict,dict2 = load_mwe_dict2 (dict,dict2) sejfek_filename in | |
92 | + let dict,dict2 = load_mwe_dict2 (dict,dict2) sawa_filename in | |
93 | + dict,dict2 | |
206 | 94 | |
207 | 95 | let get_orths paths = |
208 | 96 | IntMap.fold paths StringSet.empty (fun orths _ map -> |
209 | 97 | IntMap.fold map orths (fun orths _ l -> |
210 | - Xlist.fold l orths (fun orths t -> | |
98 | + TokenEnvSet.fold l orths (fun orths t -> | |
211 | 99 | StringSet.add orths (ENIAMtokens.get_orth t.token)))) |
212 | 100 | |
101 | +let get_lemmas paths = | |
102 | + IntMap.fold paths StringSet.empty (fun orths _ map -> | |
103 | + IntMap.fold map orths (fun orths _ l -> | |
104 | + TokenEnvSet.fold l orths (fun orths t -> | |
105 | + StringSet.add orths (ENIAMtokens.get_lemma t.token)))) | |
106 | + | |
213 | 107 | let get_intnum_orths paths = |
214 | 108 | IntMap.fold paths StringMap.empty (fun orths _ map -> |
215 | 109 | IntMap.fold map orths (fun orths _ l -> |
216 | - Xlist.fold l orths (fun orths t -> | |
110 | + TokenEnvSet.fold l orths (fun orths t -> | |
217 | 111 | match t.token with |
218 | 112 | Dig(lemma,"intnum") -> StringMap.add_inc orths (ENIAMtokens.get_orth t.token) (StringSet.singleton lemma) (fun set -> StringSet.add set lemma) |
219 | 113 | | _ -> orths))) |
220 | 114 | |
221 | -let rec match_path_rec map found (t:token_env) rev = function | |
222 | - [] -> (t :: rev) :: found | |
115 | +let preselect orths lemmas rules l = | |
116 | + Xlist.fold l rules (fun rules (match_list,lemma,cat,interp) -> | |
117 | + let b = Xlist.fold match_list true (fun b -> function | |
118 | + O s -> StringSet.mem orths s && b | |
119 | + | L(s,_,_) -> StringSet.mem lemmas s && b | |
120 | + | D(s,_) -> failwith "preselect") in | |
121 | + if b then (match_list,lemma,cat,interp) :: rules else rules) | |
122 | + | |
123 | +let preselect_dict orths lemmas dict rules = | |
124 | + StringSet.fold orths rules (fun rules orth -> | |
125 | + try | |
126 | + preselect orths lemmas rules (StringMap.find dict orth) | |
127 | + with Not_found -> rules) | |
128 | + | |
129 | +let preselect_dict2 orths lemmas dict2 rules = | |
130 | + StringSet.fold lemmas rules (fun rules lemma -> | |
131 | + try | |
132 | + preselect orths lemmas rules (StringMap.find dict2 lemma) | |
133 | + with Not_found -> rules) | |
134 | + | |
135 | +let add_ordnum_rules orths rules = | |
136 | + StringMap.fold orths rules (fun rules orth lemmas -> | |
137 | + StringSet.fold lemmas rules (fun rules lemma -> | |
138 | + (* Printf.printf "%s %s\n%!" orth lemma; *) | |
139 | + ([D(orth,"intnum");O "."],lemma,"ordnum",[]) :: rules)) | |
140 | + | |
141 | +let select_rules paths mwe_dict mwe_dict2 = | |
142 | + let orths = get_orths paths in | |
143 | + let lemmas = get_lemmas paths in | |
144 | + let intnum_orths = get_intnum_orths paths in | |
145 | + let rules = preselect_dict orths lemmas mwe_dict [] in | |
146 | + let rules = preselect_dict2 orths lemmas mwe_dict2 rules in | |
147 | + let rules = add_ordnum_rules intnum_orths rules in | |
148 | + rules | |
149 | + | |
150 | +let rec check_interp sels = function | |
151 | + [],[] -> true | |
152 | + | s :: interp, ["_"] :: interp2 -> check_interp sels (interp,interp2) | |
153 | + | V s :: interp, l2 :: interp2 -> if Xlist.mem l2 s then check_interp sels (interp,interp2) else false | |
154 | + | S s :: interp, l2 :: interp2 -> | |
155 | + (try | |
156 | + let l = Xlist.assoc sels s in | |
157 | + let b = Xlist.fold l false (fun b s -> Xlist.mem l2 s || b) in | |
158 | + if b then check_interp sels (interp,interp2) else false | |
159 | + with Not_found -> check_interp sels (interp,interp2)) | |
160 | + | G :: interp, l2 :: interp2 -> check_interp sels (interp,interp2) | |
161 | + | _ -> failwith "check_interp" | |
162 | + | |
163 | +let rec get_sels sels = function | |
164 | + [],[] -> sels | |
165 | + | s :: interp, ["_"] :: interp2 -> get_sels sels (interp,interp2) | |
166 | + | V s :: interp, l2 :: interp2 -> get_sels sels (interp,interp2) | |
167 | + | S s :: interp, l2 :: interp2 -> | |
168 | + (try | |
169 | + let l = Xlist.assoc sels s in | |
170 | + let sels = List.remove_assoc s sels in | |
171 | + let l = Xlist.fold l [] (fun l s -> if Xlist.mem l2 s then s :: l else l) in | |
172 | + get_sels ((s,l) :: sels) (interp,interp2) | |
173 | + with Not_found -> get_sels ((s,l2) :: sels) (interp,interp2)) | |
174 | + | G :: interp, l2 :: interp2 -> get_sels sels (interp,interp2) | |
175 | + | _ -> failwith "get_sels" | |
176 | + | |
177 | +let rec match_path_rec map found (t:token_env) sels rev = function | |
178 | + [] -> (t :: rev, sels) :: found | |
223 | 179 | | s :: l -> |
224 | 180 | let map2 = try IntMap.find map t.next with Not_found -> IntMap.empty in |
225 | 181 | let found2 = IntMap.fold map2 [] (fun found2 _ l -> |
226 | - Xlist.fold l found2 (fun found2 new_t -> | |
227 | - if ENIAMtokens.get_orth new_t.token = s then new_t :: found2 else found2)) in | |
228 | - Xlist.fold found2 found (fun found new_t -> match_path_rec map found new_t (t :: rev) l) | |
182 | + TokenEnvSet.fold l found2 (fun found2 new_t -> | |
183 | + match s,new_t.token with | |
184 | + O s, token -> if ENIAMtokens.get_orth token = s then (new_t,sels) :: found2 else found2 | |
185 | + | L(s,cat,interp), Lemma(s2,cat2,interps2) -> | |
186 | + Xlist.fold interps2 found2 (fun found2 interp2 -> | |
187 | + if s=s2 && cat=cat2 && check_interp sels (interp,interp2) then | |
188 | + (new_t,get_sels sels (interp,interp2)) :: found2 else found2) | |
189 | + | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (new_t,sels) :: found2 else found2 | |
190 | + | _ -> found2)) in | |
191 | + Xlist.fold found2 found (fun found (new_t,sels) -> match_path_rec map found new_t sels (t :: rev) l) | |
229 | 192 | |
230 | 193 | let match_path map = function |
231 | 194 | [] -> failwith "match_path" |
232 | 195 | | s :: l -> |
233 | 196 | let found = IntMap.fold map [] (fun found i map2 -> |
234 | 197 | IntMap.fold map2 found (fun found j l -> |
235 | - Xlist.fold l found (fun found t -> | |
236 | - if ENIAMtokens.get_orth t.token = s then t :: found else found))) in | |
237 | - Xlist.fold found [] (fun found t -> match_path_rec map found t [] l) | |
198 | + TokenEnvSet.fold l found (fun found t -> | |
199 | + match s,t.token with | |
200 | + O s, token -> if ENIAMtokens.get_orth token = s then (t,[]) :: found else found | |
201 | + | L(s,cat,interp), Lemma(s2,cat2,interps2) -> | |
202 | + Xlist.fold interps2 found (fun found interp2 -> | |
203 | + if s=s2 && cat=cat2 && check_interp [] (interp,interp2) then | |
204 | + (t,get_sels [] (interp,interp2)) :: found else found) | |
205 | + | D(s,cat), Dig(s2,cat2) -> if s=s2 && cat=cat2 then (t,[]) :: found else found | |
206 | + | _ -> found))) in | |
207 | + Xlist.fold found [] (fun found (t,sels) -> match_path_rec map found t sels [] l) | |
238 | 208 | |
239 | 209 | let concat_orths l = |
240 | 210 | let s = String.concat "" (Xlist.map l (fun t -> t.orth ^ (if t.beg+t.len=t.next then "" else " "))) in |
241 | 211 | let n = Xstring.size s in |
242 | 212 | if String.get s (n-1) = ' ' then String.sub s 0 (n-1) else s |
243 | 213 | |
244 | -let create_token (matching:token_env list) lemma interp = (* FIXME: problem z nazwami własnymi *) | |
214 | +let create_token (matching:token_env list) sels lemma cat interp = (* FIXME: problem z nazwami własnymi *) | |
245 | 215 | let l = List.rev matching in |
246 | 216 | let beg = (List.hd l).beg in |
247 | 217 | let t = List.hd matching in |
... | ... | @@ -251,40 +221,39 @@ let create_token (matching:token_env list) lemma interp = (* FIXME: problem z na |
251 | 221 | beg=beg; |
252 | 222 | len=len; |
253 | 223 | next=t.next; |
254 | - token=ENIAMtokens.make_lemma (lemma,interp); | |
224 | + token=Lemma(lemma,cat,[Xlist.map interp (function | |
225 | + S s -> (try Xlist.assoc sels s with Not_found -> ["_"]) | |
226 | + | V s -> [s] | |
227 | + | G -> ["_"])]); | |
255 | 228 | weight=0.; (* FIXME: dodać wagi do konkretnych reguł i uwzględnić wagi maczowanych tokenów *) |
256 | 229 | attrs=ENIAMtokens.merge_attrs l} |
257 | 230 | |
258 | 231 | let add_token paths t = |
259 | 232 | let map = try IntMap.find paths t.beg with Not_found -> IntMap.empty in |
260 | - let map = IntMap.add_inc map t.next [t] (fun l -> t :: l) in | |
233 | + let map = IntMap.add_inc map t.next (TokenEnvSet.singleton t) (fun set -> TokenEnvSet.add set t) in | |
261 | 234 | IntMap.add paths t.beg map |
262 | 235 | |
263 | -let apply_rule paths (match_list,lemma,interp) = | |
236 | +let apply_rule paths (match_list,lemma,cat,interp) = | |
264 | 237 | (* print_endline ("apply_rule: " ^ lemma); *) |
265 | 238 | let matchings_found = match_path paths match_list in |
266 | - Xlist.fold matchings_found paths (fun paths matching -> | |
239 | + Xlist.fold matchings_found paths (fun paths (matching,sels) -> | |
267 | 240 | try |
268 | - let token = create_token matching lemma interp in | |
241 | + let token = create_token matching sels lemma cat interp in | |
269 | 242 | add_token paths token |
270 | 243 | with Not_found -> paths) |
271 | 244 | |
272 | -(* FIXME: reguły dla ordnum powinny maczować część mowy a nie tylko orth *) | |
273 | -let add_ordnum_rules rules paths = | |
274 | - let orths = get_intnum_orths paths in | |
275 | - StringMap.fold orths rules (fun rules orth lemmas -> | |
276 | - StringSet.fold lemmas rules (fun rules lemma -> | |
277 | - (* Printf.printf "%s %s\n%!" orth lemma; *) | |
278 | - ([orth;"."],lemma,"ordnum") :: rules)) | |
279 | - | |
280 | 245 | let process (paths,last) = |
281 | 246 | let paths = Xlist.fold paths IntMap.empty add_token in |
282 | - let orths = get_orths paths in | |
283 | - let rules = preselect_dict orths mwe_dict in | |
284 | - let rules = add_ordnum_rules rules paths in | |
247 | + let rules = select_rules paths mwe_dict mwe_dict2 in | |
248 | + let paths = Xlist.fold rules paths apply_rule in | |
249 | + let rules = select_rules paths mwe_dict mwe_dict2 in | |
250 | + let paths = Xlist.fold rules paths apply_rule in | |
251 | + let rules = select_rules paths mwe_dict mwe_dict2 in | |
252 | + let paths = Xlist.fold rules paths apply_rule in | |
253 | + let rules = select_rules paths mwe_dict mwe_dict2 in | |
285 | 254 | let paths = Xlist.fold rules paths apply_rule in |
286 | 255 | let paths = IntMap.fold paths [] (fun paths _ map -> |
287 | 256 | IntMap.fold map paths (fun paths _ l -> |
288 | - Xlist.fold l paths (fun paths t -> | |
257 | + TokenEnvSet.fold l paths (fun paths t -> | |
289 | 258 | t :: paths))) in |
290 | 259 | ENIAMpaths.sort (paths,last) |
... | ... |
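The hunk above replaces the old orth-only MWE matching with a small pattern language: a rule is a list of patterns (O for an exact orthographic form, L for a lemma with a category and a feature pattern, D for a Dig token such as an intnum), and the shared S-slots enforce case/number/gender agreement across the matched tokens via check_interp and get_sels. Below is a minimal standalone sketch of that idea using only the OCaml standard library (no Xstd, no ENIAM modules); the phrase "biały kruk", the feature values in the test, and the simplified check function are illustrative assumptions, not quotes from the ENIAM code or resources.

(* Standalone sketch of the pattern language used by the new ENIAM_MWE.ml.
   Standard library only; not the ENIAM API. *)

type sel =
  | V of string   (* fixed feature value, e.g. "pos" *)
  | S of string   (* shared agreement slot: $c, $n, $g, $d, $C in the source *)
  | G             (* wildcard "_" *)

type pat =
  | L of string * string * sel list  (* match a Lemma: lemma, category, features *)
  | O of string                      (* match an orthographic form *)
  | D of string * string             (* match a Dig token, e.g. ("3","intnum") *)

(* "cat:f1:f2:..." -> L(lemma,cat,[...]), in the spirit of process_interp. *)
let process_interp lemma interp =
  match String.split_on_char ':' interp with
  | cat :: feats ->
      L (lemma, cat,
         List.map (function
           | "_" -> G
           | ("$c" | "$n" | "$g" | "$d" | "$C") as s -> S (String.sub s 1 1)
           | s when s <> "" && s.[0] = '$' -> failwith ("process_interp: " ^ s)
           | s -> V s)
           feats)
  | [] -> failwith "process_interp"

(* Agreement check in the spirit of check_interp: a V value must occur in the
   token's value list, an S slot must stay consistent with what it was bound
   to earlier in the match, G and "_" always succeed. *)
let rec check sels = function
  | [], [] -> true
  | _ :: pat, ["_"] :: vals -> check sels (pat, vals)
  | V v :: pat, l :: vals -> List.mem v l && check sels (pat, vals)
  | S slot :: pat, l :: vals ->
      (match List.assoc_opt slot sels with
       | Some bound -> List.exists (fun v -> List.mem v l) bound && check sels (pat, vals)
       | None -> check sels (pat, vals))
  | G :: pat, _ :: vals -> check sels (pat, vals)
  | _ -> failwith "check"

let () =
  (* Two illustrative rules: a hypothetical fixed adjective+noun phrase, and
     the shape produced by add_ordnum_rules (an intnum token followed by "."). *)
  let _fixed_phrase =
    [ process_interp "biały" "adj:$n:$c:m3:pos";
      L ("kruk", "subst", [S "n"; S "c"; V "m3"]) ] in
  let _ordnum = [ D ("3", "intnum"); O "." ] in
  (* With slot "c" already bound to gen, a gen token passes and a nom token fails. *)
  let sels = [ "c", ["gen"] ] in
  assert (check sels ([S "n"; S "c"; V "m3"], [["sg"]; ["gen"]; ["m3"]]));
  assert (not (check sels ([S "n"; S "c"; V "m3"], [["sg"]; ["nom"]; ["m3"]])));
  print_endline "agreement sketch ok"

In the real code, match_path_rec threads the accumulated slot bindings (sels) from token to token, which is why a rule like the fixed phrase above can only match token sequences whose case, number and gender agree.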
subsyntax/ENIAMsubsyntaxTypes.ml
... | ... | @@ -48,6 +48,9 @@ let brev_filename = resource_path ^ "/subsyntax/brev.tab" |
48 | 48 | let fixed_filename = resource_path ^ "/Walenty/fixed.tab" |
49 | 49 | let complete_entries_filename = resource_path ^ "/subsyntax/complete_entries.tab" |
50 | 50 | let mwe_filename = resource_path ^ "/subsyntax/mwe.tab" |
51 | +let sawa_filename = resource_path ^ "/subsyntax/SAWA.dic" | |
52 | +let sejf_filename = resource_path ^ "/subsyntax/SEJF.dic" | |
53 | +let sejfek_filename = resource_path ^ "/subsyntax/SEJFEK.dic" | |
51 | 54 | |
52 | 55 | let lemma_frequencies_filename = resource_path ^ "/subsyntax/NKJP1M-lemma-freq.tab" |
53 | 56 | |
... | ... |
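The three paths added here point at the SEJF, SEJFEK and SAWA lexicons consumed by load_mwe_dict2 above. Judging from that function's lexer pipeline (Lexer.split, find_brackets, split_symbol, process_orth), an entry has two tab-separated columns: a space-separated list of surface tokens, in which {...} groups a multi-word token and a trailing (...) attaches an interpretation, and a lemma column of the form lemma(cat:features). The sketch below is a rough, standard-library-only splitter for the first column; it only mimics the bracket handling in spirit (the real parsing goes through ENIAM's Lexer module), and both the function name split_orths and the sample entry are invented for illustration rather than quoted from the real .dic files.

(* Sketch: split the orths column of a *.dic entry into top-level chunks,
   keeping "{...}" groups and "(...)" interpretations attached to their token. *)
let split_orths s =
  let buf = Buffer.create 16 in
  let chunks = ref [] in
  let depth = ref 0 in               (* nesting level of { } and ( ) *)
  let flush () =
    if Buffer.length buf > 0 then begin
      chunks := Buffer.contents buf :: !chunks;
      Buffer.clear buf
    end in
  String.iter (fun c ->
    match c with
    | '{' | '(' -> incr depth; Buffer.add_char buf c
    | '}' | ')' -> decr depth; Buffer.add_char buf c
    | ' ' when !depth = 0 -> flush ()          (* token boundary *)
    | _ -> Buffer.add_char buf c) s;
  flush ();
  List.rev !chunks

let () =
  (* Invented entry in the SAWA/SEJF style: a braced multi-word token with an
     interpretation, followed by two plain tokens. *)
  split_orths "{Pałac Kultury}(subst:$c) i Nauki"
  |> List.iter print_endline
  (* prints:
       {Pałac Kultury}(subst:$c)
       i
       Nauki *)

process_orth then turns each chunk into one of the pattern constructors: a braced group with an interpretation becomes an L pattern over the whole multi-word form, while a plain chunk becomes an O pattern.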
subsyntax/TODO
1 | -- finish MWE recognition | |
2 | -- add MWE resources | |
3 | 1 | - recognition of MWEs from Słowosieć |
2 | +- compressing tokens that have identical lemmas (either after processing, or by compressing interpretations before recognizing MWEs) | |
4 | 3 | |
5 | 4 | - how to process num:comp |
6 | 5 | - processing of compound numerals, e.g. dwadzieścia jeden (twenty-one), jedna druga (one half) |
... | ... |
subsyntax/makefile
... | ... | @@ -3,7 +3,7 @@ OCAMLOPT=ocamlopt |
3 | 3 | OCAMLDEP=ocamldep |
4 | 4 | INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I +eniam |
5 | 5 | OCAMLFLAGS=$(INCLUDES) -g |
6 | -OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa eniam-subsyntax.cmxa | |
6 | +OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-tokenizer.cmxa eniam-morphology.cmxa #eniam-subsyntax.cmxa | |
7 | 7 | INSTALLDIR=`ocamlc -where`/eniam |
8 | 8 | |
9 | 9 | SOURCES= ENIAMsubsyntaxTypes.ml ENIAMsubsyntaxStringOf.ml ENIAMsubsyntaxHTMLof.ml ENIAMsubsyntaxXMLof.ml ENIAMsubsyntaxGraphOf.ml ENIAMpaths.ml ENIAM_MWE.ml ENIAMsentences.ml ENIAMsubsyntax.ml |
... | ... | @@ -32,8 +32,8 @@ eniam-subsyntax.cma: $(SOURCES) |
32 | 32 | eniam-subsyntax.cmxa: $(SOURCES) |
33 | 33 | ocamlopt -linkall -a -o eniam-subsyntax.cmxa $(INCLUDES) $^ |
34 | 34 | |
35 | -test: test.ml | |
36 | - $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) test.ml | |
35 | +test: $(SOURCES) test.ml | |
36 | + $(OCAMLOPT) -o test $(OCAMLOPTFLAGS) $(SOURCES) test.ml | |
37 | 37 | |
38 | 38 | interface: interface.ml |
39 | 39 | $(OCAMLOPT) -o subsyntax $(OCAMLOPTFLAGS) interface.ml |
... | ... |
subsyntax/resources/README
1 | -File NKJP1M-lemma-freq.tab in this folder was created on the basis of | |
1 | +File NKJP1M-lemma-freq.tab in this folder was created on the basis of | |
2 | 2 | |
3 | 3 | NKJP1M: the manually annotated 1-million word subcorpus sampled |
4 | 4 | from texts of a subset of the National Corpus of Polish. |
5 | 5 | version 1.2 |
6 | 6 | |
7 | -File proper_names_sgjp_polimorf_20151020.tab in this folder were created on the basis of | |
7 | +File proper_names_sgjp_polimorf_20151020.tab in this folder was created on the basis of | |
8 | 8 | |
9 | 9 | SGJP: Grammatical Dictionary of Polish, version 20151020 |
10 | 10 | Copyright © 2007–2015 Zygmunt Saloni, Włodzimierz Gruszczyński, Marcin |
11 | 11 | Woliński, Robert Wołosz, Danuta Skowrońska |
12 | 12 | |
13 | -and also on the basis of | |
13 | +and also on the basis of | |
14 | 14 | PoliMorf, version 20151020 |
15 | + | |
16 | +File SEJF.dic was created on the basis of | |
17 | +Grammatical Lexicon of Polish Phraseology | |
18 | +Copyright © Monika Czerepowicka, Agata Savary | |
19 | +Copyright © Institute of Computer Science Polish Academy of Sciences | |
20 | +The data are available under the CC BY-SA license. | |
21 | + | |
22 | +File SEJFEK.dic was created on the basis of | |
23 | +Grammatical Lexicon of Polish Economic Phraseology | |
24 | +Copyright © Filip Makowiecki, Agata Savary | |
25 | +Copyright © Institute of Computer Science Polish Academy of Sciences | |
26 | +The data are available under the CC BY-SA license. | |
27 | + | |
28 | +File SAWA.dic was created on the basis of | |
29 | +Grammatical Lexicon of Warsaw Urban Proper Names | |
30 | +Copyright © Małgorzata Marciniak, Celina Heliasz, Joanna Rabiega-Wiśniewska, Piotr Sikora, Marcin Woliński, Agata Savary | |
31 | +Copyright © Institute of Computer Science Polish Academy of Sciences | |
32 | +The data are available under the CC BY-SA license. | |
... | ... |