Blame view

pre/preWordnet.ml 7.83 KB
Wojciech Jaworski authored
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
(*
 *  ENIAM: Categorial Syntactic-Semantic Parser for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)

open Xstd
open PreTypes

let load_lu names filename = 
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  Xlist.fold l (StringMap.empty,names) (fun (lu,names) line -> 
    if String.length line = 0 then lu,names else
    if String.get line 0 = '#' then lu,names else
    match Str.split_delim (Str.regexp "\t") line with
      [id; lemma; variant] -> 
        StringMap.add_inc lu lemma [id,variant] (fun l -> (id,variant) :: l),
        StringMap.add_inc names id (lemma ^ " " ^ variant) (fun _ -> failwith "load_lu")
    | _ -> failwith ("load_lu: " ^ line))

let noun_lu,lu_names = load_lu StringMap.empty Paths.rzeczownik_filename
let verb_lu,lu_names = load_lu lu_names Paths.czasownik_filename
let adj_lu,lu_names = load_lu lu_names Paths.przymiotnik_filename

let load_synsets filename = 
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  Xlist.fold l (StringMap.empty,StringMap.empty) (fun (syn,names) line -> 
    if String.length line = 0 then (syn,names) else
    if String.get line 0 = '#' then (syn,names) else
    match Str.split_delim (Str.regexp "\t") line with
      [syn_id; lu_ids] -> 
        let lu_ids = Str.split_delim (Str.regexp " ") lu_ids in
        let syn = Xlist.fold lu_ids syn (fun syn lu_id -> 
          StringMap.add_inc syn lu_id syn_id (fun _ -> failwith ("load_synsets 1: " ^ lu_id))) in
        let lu_id = try List.hd lu_ids with _ ->  failwith ("load_synsets 2: " ^ syn_id) in
        let name = try StringMap.find lu_names lu_id with Not_found -> "syn_id: " ^ syn_id in (* nieznane synsety są z en wordnetu *)
        let names = StringMap.add_inc names syn_id name (fun _ -> failwith ("load_synsets 4: " ^ syn_id)) in
        syn,names
    | _ -> failwith ("load_synsets 5: " ^ line))

let synsets, syn_names = load_synsets Paths.synsets_filename

let load_hipero filename = 
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  Xlist.fold l StringMap.empty (fun hip line -> 
    if String.length line = 0 then hip else
    if String.get line 0 = '#' then hip else
    match Str.split_delim (Str.regexp "\t") line with
      [id; ids] -> 
        let ids = Str.split_delim (Str.regexp " ") ids in
        StringMap.add_inc hip id ids (fun _ -> failwith "load_hipero")
    | _ -> failwith ("load_hipero: " ^ line))   

let hipero = load_hipero Paths.hipero_filename

let rec get_lu_id variant = function
    (id,v) :: l -> if variant = v then id else get_lu_id variant l
  | [] -> failwith "get_lu_id"

let lu_id_of_sense sense =
  let lemma,variant = 
    match List.rev (Str.split (Str.regexp " ") sense) with
(*       [lemma] -> lemma,"" *)
    | variant :: l -> String.concat " " (List.rev l), variant
    | _ -> failwith "lu_id_of_sense 1" in
  if variant = "" then lemma else
  let l = try StringMap.find noun_lu lemma with Not_found -> failwith ("lu_id_of_sense 2: " ^ lemma) in
  get_lu_id variant l

let load_predef hipero filename = 
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  Xlist.fold l (hipero,StringSet.empty) (fun (hipero,predef) line -> 
    if String.length line = 0 then hipero,predef else
    if String.get line 0 = '#' then hipero,predef else
    match Str.split_delim (Str.regexp "\t") line with
      id :: senses -> 
        let hipero = Xlist.fold senses hipero (fun hipero sense ->
          if StringSet.mem predef sense then StringMap.add_inc hipero sense [id] (fun l -> id :: l) else
          let lu_id = lu_id_of_sense sense in
          let syn_id = try StringMap.find synsets lu_id with Not_found -> lu_id in
          StringMap.add_inc hipero syn_id [id] (fun l -> id :: l)) in
        hipero, StringSet.add predef id
    | _ -> failwith ("load_predef: " ^ line))   

let hipero,predef = load_predef hipero Paths.predef_filename

let rec get_hipero_rec found id =
  if StringSet.mem found id then found else
  let found = StringSet.add found id in
  let l = try StringMap.find hipero id with Not_found -> [] in
  Xlist.fold l found get_hipero_rec

let get_hipero lu_id =   
  let syn_id = StringMap.find synsets lu_id in
  StringSet.to_list (get_hipero_rec StringSet.empty syn_id)

let synset_name id =
  if StringSet.mem predef id then id else
  try StringMap.find syn_names id with Not_found -> failwith "synset_name"

let rename_sense sense =
  let lu_id = lu_id_of_sense sense in
  StringMap.find synsets lu_id

let load_proper_classes filename =
  let l = Str.split_delim (Str.regexp "\n") (File.load_file filename) in
  Xlist.fold l StringMap.empty (fun map line -> 
    if String.length line = 0 then map else
    if String.get line 0 = '#' then map else
    match Str.split_delim (Str.regexp "\t") line with
      id :: senses -> 
        let senses = Xlist.map senses (fun sense ->
          match List.rev (Str.split (Str.regexp " ") sense) with
            weight :: l -> String.concat " " (List.rev l), (try float_of_string weight with _ -> failwith "load_proper_classes 2")
          | _ -> failwith "load_proper_classes 4") in
        let senses = Xlist.map senses (fun (sense,weight) ->
          let sense = if sense = "antroponim 1" then "nazwa własna 1" else sense in
          let sense = if sense = "godzina 4" then "godzina 3" else sense in
(*           print_endline sense; *)
          let lu_id = lu_id_of_sense sense in
          sense,Xlist.map (get_hipero lu_id) synset_name,weight) in
        StringMap.add_inc map id senses (fun _ -> failwith ("load_proper_classes 3: " ^ id))
    | _ -> failwith ("load_proper_classes: " ^ line))   

let proper_classes = load_proper_classes Paths.proper_classes_filename

let simplify_pos = function
    "subst" -> "noun"
  | "depr" -> "noun"
  | "adj" -> "adj"
  | "adja" -> "adj"
  | "adjc" -> "adj"
  | "adjp" -> "adj"
  | "ger" -> "verb"
  | "pact" -> "verb"
  | "ppas" -> "verb"
  | "fin" -> "verb"
  | "bedzie" -> "verb"
  | "praet" -> "verb"
  | "winien" -> "verb"
  | "impt" -> "verb"
  | "imps" -> "verb"
  | "inf" -> "verb"
  | "pcon" -> "verb"
  | "pant" -> "verb"
  | "pred" -> "verb"
  | s -> s

let assign_senses (paths,last) = (* FIXME: sensy zawierające 'się' *)
  List.rev (Xlist.rev_map paths (fun t ->
    match t.token with
      Lemma(lemma,pos,_) -> 
        (*if pos = "ppron12" || pos = "ppron3" || pos = "siebie" then {t with senses=[lemma,["0"],0.]} else*) (* FIXME: ustalić co z zaimkami *)
        let lu = match simplify_pos pos with
            "noun" -> noun_lu
          | "adj" -> adj_lu
          | "verb" -> verb_lu
          | _ -> StringMap.empty in        
        if StringMap.mem lu lemma then
          let l = StringMap.find lu lemma in
          let l = Xlist.rev_map l (fun (id,variant) -> 
            lemma ^ " " ^ variant, Xlist.map (get_hipero id) synset_name, log10 (1. /. (try float_of_string variant with _ -> 3.))) in
          {t with senses=l}
        else t
    | Proper(_,_,_,senses) ->  
        {t with senses=List.flatten (Xlist.rev_map senses (fun sense ->
          try StringMap.find proper_classes sense with Not_found -> failwith ("assign_senses: " ^ sense)))}
    | _ -> t)), last