Blame view

pre/preProcessing.ml 7.57 KB
Wojciech Jaworski authored
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
(*
 *  ENIAM: Categorial Syntactic-Semantic Parser for Polish
 *  Copyright (C) 2016 Wojciech Jaworski <wjaworski atSPAMfree mimuw dot edu dot pl>
 *  Copyright (C) 2016 Institute of Computer Science Polish Academy of Sciences
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *)
Wojciech Jaworski authored
20
(*
Wojciech Jaworski authored
21
let parse query =
Wojciech Jaworski authored
22
(*   print_endline "a1"; *)
Wojciech Jaworski authored
23
  let l = Xunicode.classified_chars_of_utf8_string query in
Wojciech Jaworski authored
24
(*   print_endline "a2"; *)
Wojciech Jaworski authored
25
  let l = PreTokenizer.tokenize l in
Wojciech Jaworski authored
26
27
28
29
30
31
32
33
34
35
36
37
38
(*   print_endline "a3"; *)
  let l = PrePatterns.normalize_tokens [] l in
(*   print_endline "a4"; *)
  let l = PrePatterns.find_replacement_patterns l in
(*   print_endline "a5"; *)
  let l = PrePatterns.remove_spaces [] l in
  let l = PrePatterns.find_abr_patterns PreAcronyms.abr_patterns l in
  let l = PrePatterns.normalize_tokens [] l in
(*   print_endline "a6"; *)
  let paths = PrePaths.translate_into_paths l in
(*   print_endline "a7"; *)
  let paths = PrePaths.lemmatize paths in
(*   print_endline "a8"; *)
Wojciech Jaworski authored
39
  let paths,_ = PreMWE.process paths in
Wojciech Jaworski authored
40
41
42
43
44
(*   print_endline "a12"; *)
  let paths = find_proper_names paths in
(*   print_endline "a13"; *)
  let paths = modify_weights paths in
  let paths = translate_digs paths in
Wojciech Jaworski authored
45
  let paths = assign_senses paths in
Wojciech Jaworski authored
46
47
(*   print_endline "a14"; *)
  let paths = assign_valence paths in
Wojciech Jaworski authored
48
49
(*   print_endline "a15"; *)
  let paths = combine_interps paths in
Wojciech Jaworski authored
50
(*   print_endline "a16"; *)
Wojciech Jaworski authored
51
  let paths = disambiguate_senses paths in
Wojciech Jaworski authored
52
53
54
  let paths = assign_simplified_valence paths in
  let paths = PreSemantics.assign_semantics paths in
(*   print_endline "a16"; *)
Wojciech Jaworski authored
55
  let paths = select_tokens paths in
Wojciech Jaworski authored
56
57
58
(*   print_endline "a17"; *)
(*  let paths = if !single_sense_flag then single_sense paths else paths in
  let paths = if !single_frame_flag then single_frame paths else paths in*)
Wojciech Jaworski authored
59
60
  (*let paths, next_id = add_ids paths next_id in
  let paths = prepare_indexes paths in*)
Wojciech Jaworski authored
61
(*   print_endline "a18"; *)
Wojciech Jaworski authored
62
  paths(*, next_id*)
Wojciech Jaworski authored
63
(*     print_endline (PrePaths.to_string paths);     *)
Wojciech Jaworski authored
64
65
(*   let paths =
    if PrePaths.no_possible_path (PrePaths.map paths PreLemmatization.remove_postags) then
Wojciech Jaworski authored
66
67
68
      PrePaths.map paths process_ign
    else paths in
  let paths = PrePaths.map paths PreLemmatization.remove_postags in
Wojciech Jaworski authored
69
  let paths = PreCaseShift.manage_lower_upper_case paths in (* FIXME: needlessly upper-cases the first token (adjectives, particles, etc.) *)
Wojciech Jaworski authored
70
71
  let paths = PreLemmatization.combine_interps paths in
(*     print_endline (PrePaths.to_string paths);     *)*)
Wojciech Jaworski authored
72
Wojciech Jaworski authored
73
let parse_conll tokens dep_paths = (* FIXME: check whether the order of the elements of paths is preserved !!! *)
Wojciech Jaworski authored
74
  let paths = List.rev (Int.fold 1 (Array.length dep_paths - 1) [] (fun paths conll_id ->
Wojciech Jaworski authored
75
76
    let id,_,_ = dep_paths.(conll_id) in
    ExtArray.get tokens id :: paths)) in
Wojciech Jaworski authored
77
  (* print_endline "a12"; *)
Wojciech Jaworski authored
78
  let paths = find_proper_names paths in
Wojciech Jaworski authored
79
  (*   print_endline "a13"; *)
Wojciech Jaworski authored
80
81
  let paths = modify_weights paths in
  let paths = PreWordnet.assign_senses paths in
Wojciech Jaworski authored
82
  (*   print_endline "a14"; *)
Wojciech Jaworski authored
83
    (* let paths = combine_interps paths in (* FIXME: this should also work for Proper *) *)
Wojciech Jaworski authored
84
  (*   print_endline "a15"; *)
Wojciech Jaworski authored
85
  let paths = assign_valence paths in
Wojciech Jaworski authored
86
  (*   print_endline "a16"; *)
Wojciech Jaworski authored
87
88
89
  let paths = disambiguate_senses paths in
  let paths = assign_simplified_valence paths in
  let paths = PreSemantics.assign_semantics paths in
Wojciech Jaworski authored
90
  (*   print_endline "a16"; *)
Wojciech Jaworski authored
91
  let _ = Xlist.fold paths 1 (fun conll_id t ->
Wojciech Jaworski authored
92
93
    let id,_,_ = dep_paths.(conll_id) in
    ExtArray.set tokens id t;
Wojciech Jaworski authored
94
95
    conll_id + 1) in
  ()
Wojciech Jaworski authored
96
Wojciech Jaworski authored
97
98
let parse_text = function
    RawText query ->
Wojciech Jaworski authored
99
100
101
102
      let text,tokens = ENIAMsubsyntax.parse_text query in
      let text = ENIAMpreIntegration.parse_text ENIAMsubsyntaxTypes.Struct tokens text in
      let lex_sems = ENIAMlexSemantics.assign tokens text in
      text,tokens,lex_sems
Wojciech Jaworski authored
103
  | AltText[Raw,RawText query;CONLL,StructText([
Wojciech Jaworski authored
104
            StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence dep_paths]} as p]],tokens)] ->
Wojciech Jaworski authored
105
106
        parse_conll tokens dep_paths;
        let paths = parse query in
Wojciech Jaworski authored
107
        let sentences = PreSentences.split_into_sentences "" query tokens paths in
Wojciech Jaworski authored
108
        let m_dep_paths = Array.map (fun (id,_,_) -> id,-1,"") dep_paths in
Wojciech Jaworski authored
109
        let conll = StructParagraph[{p with psentence = AltSentence([Raw, RawSentence text; CONLL, DepSentence dep_paths]
Wojciech Jaworski authored
110
          @ if Paths.config.Paths.mate_parser_enabled then [Mate, DepSentence m_dep_paths] else [])}] in
Wojciech Jaworski authored
111
        AltText[Raw,RawText query; Struct, StructText([
Wojciech Jaworski authored
112
          AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]],tokens)]
Wojciech Jaworski authored
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
  | _ -> failwith "parse_text: not implemented"*)

open ENIAMsubsyntaxTypes

(** [parse_text (text, tokens)] pre-processes one query and returns the
    triple [(text, tokens, lex_sems)].

    Two input shapes are accepted:
    - [RawText query]: the incoming token array is ignored; the text and a
      fresh token array come from [ENIAMsubsyntax.parse_text], the text is
      then passed through [ENIAMpreIntegration.parse_text].
    - an [AltText] holding the raw query plus a single CONLL paragraph with
      one dependency sentence: the incoming [tokens] are reused, the query is
      re-parsed with [ENIAMsubsyntax.parse] and split into sentences, and the
      CONLL paragraph is kept as an alternative (with an extra [Mate]
      variant when [Paths.config.Paths.mate_parser_enabled] is set).

    In both cases [lex_sems] is computed by [ENIAMlexSemantics.assign].

    @raise Failure ["parse_text: not implemented"] for any other input. *)
let parse_text input =
  match input with
  | (RawText query, _) ->
      let parsed_text, tokens = ENIAMsubsyntax.parse_text query in
      let text =
        ENIAMpreIntegration.parse_text ENIAMsubsyntaxTypes.Struct tokens parsed_text in
      (text, tokens, ENIAMlexSemantics.assign tokens text)
  | (AltText [Raw, RawText query;
              CONLL, StructText [
                StructParagraph [
                  {psentence = AltSentence [Raw, RawSentence text;
                                            CONLL, DepSentence dep_paths]} as p]]],
     tokens) ->
      (* Optional Mate variant: same token ids, with head set to -1 and an
         empty label for every entry. *)
      let mate_variant =
        if Paths.config.Paths.mate_parser_enabled then
          [Mate, DepSentence (Array.map (fun (id, _, _) -> (id, -1, "")) dep_paths)]
        else
          [] in
      let conll_variants =
        [Raw, RawSentence text; CONLL, DepSentence dep_paths] @ mate_variant in
      let conll = StructParagraph [{p with psentence = AltSentence conll_variants}] in
      let paths = ENIAMsubsyntax.parse query in
      let sentences = ENIAMsentences.split_into_sentences "" query tokens paths in
      let paragraph =
        AltParagraph [Raw, RawParagraph query;
                      ENIAM, StructParagraph sentences;
                      CONLL, conll] in
      let text = AltText [Raw, RawText query; Struct, StructText [paragraph]] in
      (text, tokens, ENIAMlexSemantics.assign tokens text)
  | _ -> failwith "parse_text: not implemented"
Wojciech Jaworski authored
135
Wojciech Jaworski authored
136
(** [main_loop in_chan out_chan] serves pre-processing requests on one
    connection until the session ends.

    Each request is a marshalled pair [(text, tokens)] read from [in_chan].
    The session ends, and the function returns [()], when either
    - the request text is [RawText ""] (the termination sentinel), or
    - [in_chan] is at end of file, i.e. the peer closed the connection
      without sending the sentinel. Previously this case escaped as an
      uncaught [End_of_file] exception.

    For every other request a marshalled 5-tuple
    [(text, tokens, lex_sems, error_message, elapsed)] is written to
    [out_chan] and flushed:
    - on success [error_message] is [""] and [elapsed] is the time spent in
      [parse_text], measured with [Unix.gettimeofday];
    - if anything inside the processing step raises, the reply carries
      [RawText ""], one-element placeholder arrays, the exception rendered by
      [Printexc.to_string], and [0.] as the elapsed time. *)
let rec main_loop in_chan out_chan =
  let request =
    try
      Some (Marshal.from_channel in_chan
            : text * ENIAMtokenizerTypes.token_record ExtArray.t)
    with End_of_file -> None in
  match request with
  | None -> ()                      (* peer hung up: end the session quietly *)
  | Some (RawText "", _) -> ()      (* explicit termination sentinel *)
  | Some query ->
      (try
         let utime0 = Unix.gettimeofday () in
         let text,tokens,lex_sems = parse_text query in
         let utime2 = Unix.gettimeofday () in
         Marshal.to_channel out_chan (text,tokens,lex_sems,"",utime2 -. utime0) []
       with e ->
         (* Report the failure to the client instead of killing the session. *)
         Marshal.to_channel out_chan (
           RawText "",
           ExtArray.make 1 ENIAMtokenizerTypes.empty_token,
           ExtArray.make 1 ENIAMlexSemanticsTypes.empty_lex_sem,
           Printexc.to_string e,
           0.) []);
      flush out_chan;
      main_loop in_chan out_chan
Wojciech Jaworski authored
164
Wojciech Jaworski authored
165
166
167
168
(* let _ = main_loop stdin stdout *)

(** Address the pre-processing server listens on: every local interface
    ([Unix.inet_addr_any]) at port [Paths.pre_port]. *)
let sockaddr =
  let port = Paths.pre_port in
  Unix.ADDR_INET (Unix.inet_addr_any, port)
Wojciech Jaworski authored
169
(* Program entry point: compact the heap, announce readiness on stdout, then
   hand control to [Unix.establish_server], which runs [main_loop] for the
   connections accepted on [sockaddr]. *)
let () =
  begin
    Gc.compact ();
    print_endline "Ready!";
    Unix.establish_server main_loop sockaddr
  end