plWordnet.ml 31.7 KB
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652
(********************************************************)
(*                                                      *)
(*  Copyright 2014 Wojciech Jaworski.                   *)
(*                                                      *)
(*  All rights reserved.                                *)
(*                                                      *)
(********************************************************)

open Xstd

(*let nexus_path = "/home/yacheu/Dokumenty/Badania/Jezyk i Umysl/Przetwarzanie Języka Naturalnego/zasoby/"
let toshiba_ub_path = "/home/wjaworski/Dokumenty/zasoby/"

let get_host_name () =
  let chan = Unix.open_process_in "uname -n" in
  input_line chan

let zasoby_path =
  match get_host_name () with
    "nexus" -> nexus_path
  | "toshiba-UB" -> toshiba_ub_path
(*   | "mozart" -> "." *)
  | s -> failwith ("unknown host: " ^ s)*)

let zasoby_path = "../../NLP resources/"

let plwordnet_filename = zasoby_path ^ "Słowosieć/plwordnet-3.0.xml"

type lu = {lu_id: int; lu_name: string; lu_pos: string; lu_tagcount: string; lu_domain: string; lu_desc: string;
          lu_workstate: string; lu_source: string; lu_variant: string; lu_syn: int}

type syn = {syn_workstate: string; syn_split: string; syn_owner: string; syn_definition: string;
          syn_desc: string; syn_abstract: string; syn_units: (int * lu) list; syn_pos: string; syn_no_hipo: int; syn_domain: string}

type rels = {r_parent: int; r_child: int; r_relation: int; r_valid: string; r_owner: string}

type rt = {rt_type: string; rt_reverse: int; rt_name: string; rt_description: string;
          rt_posstr: string; rt_display: string; rt_shortcut: string; rt_autoreverse: string; rt_pwn: string; rt_tests: (string * string) list}

let empty_lu = {lu_id=(-1); lu_name=""; lu_pos=""; lu_tagcount=""; lu_domain=""; lu_desc="";
          lu_workstate=""; lu_source=""; lu_variant=""; lu_syn=(-1)}

let process_unit = function
    Xml.Element("unit-id",[],[Xml.PCData s]) -> int_of_string s, empty_lu
  | node -> failwith ("process_unit " ^ (Xml.to_string node))

let process_tests = function
    Xml.Element("test",["text",text;"pos",pos],[]) -> text,pos
  | node -> failwith ("process_tests " ^ (Xml.to_string node))


(* funkcja zwraca:
lexical-unit map - wiąże leksemy z identyfikatorami
synset map
lexicalrelations
synsetrelations
relationtypes map
*)
let process_entry (lumap,synmap,lr,sr,rtmap) = function
    Xml.Element("lexical-unit",["id",id;"name",name;"pos",pos;"tagcount",tagcount;"domain",domain;"workstate",workstate;
                                "source",source;"variant",variant],[]) ->
        let lumap = IntMap.add_inc lumap (int_of_string id) {lu_id=int_of_string id; lu_name=name; lu_pos=pos; lu_tagcount=tagcount; lu_domain=domain; lu_desc="";
          lu_workstate=workstate; lu_source=source; lu_variant=variant; lu_syn=(-1)} (fun _ -> failwith "process_entry 2") in
        lumap,synmap,lr,sr,rtmap
  | Xml.Element("lexical-unit",["id",id;"name",name;"pos",pos;"tagcount",tagcount;"domain",domain;"desc",desc;"workstate",workstate;
                                "source",source;"variant",variant],[]) ->
        let lumap = IntMap.add_inc lumap (int_of_string id) {lu_id=int_of_string id; lu_name=name; lu_pos=pos; lu_tagcount=tagcount; lu_domain=domain; lu_desc=desc;
          lu_workstate=workstate; lu_source=source; lu_variant=variant; lu_syn=(-1)} (fun _ -> failwith "process_entry 3") in
        lumap,synmap,lr,sr,rtmap
  | Xml.Element("synset",["id",id;"workstate",workstate;"split",split;"owner",owner;"definition",definition;"desc",desc;
                                "abstract",abstract],units) ->
        let units = Xlist.map units process_unit in
        let synmap = IntMap.add_inc synmap (int_of_string id) {syn_workstate=workstate; syn_split=split; syn_owner=owner; syn_definition=definition;
          syn_desc=desc; syn_abstract=abstract; syn_units=units; syn_pos=""; syn_no_hipo=0; syn_domain=""} (fun _ -> failwith "process_entry 4") in
        lumap,synmap,lr,sr,rtmap
  | Xml.Element("synset",["id",id;"workstate",workstate;"split",split;"owner",owner;"desc",desc;
                                "abstract",abstract],units) ->
        let units = Xlist.map units process_unit in
        let synmap = IntMap.add_inc synmap (int_of_string id) {syn_workstate=workstate; syn_split=split; syn_owner=owner; syn_definition="";
          syn_desc=desc; syn_abstract=abstract; syn_units=units; syn_pos=""; syn_no_hipo=0; syn_domain=""} (fun _ -> failwith "process_entry 4") in
        lumap,synmap,lr,sr,rtmap
  | Xml.Element("lexicalrelations",["parent",parent;"child",child;"relation",relation;"valid",valid;"owner",owner],[]) ->
        let lr = {r_parent=int_of_string parent; r_child=int_of_string child; r_relation=int_of_string relation; r_valid=valid; r_owner=owner} :: lr in
        lumap,synmap,lr,sr,rtmap
  | Xml.Element("synsetrelations",["parent",parent;"child",child;"relation",relation;"valid",valid;"owner",owner],[]) ->
        let sr = {r_parent=int_of_string parent; r_child=int_of_string child; r_relation=int_of_string relation; r_valid=valid; r_owner=owner} :: sr in
        lumap,synmap,lr,sr,rtmap
  | Xml.Element("relationtypes",["id",id;"type",typ;"reverse",reverse;"name",name;"description",description;
                                 "posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
                                 "pwn",pwn],tests) ->
        let tests = Xlist.map tests process_tests in
        let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=int_of_string reverse; rt_name=name; rt_description=description;
          rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
          (fun _ -> failwith "process_entry 5") in
        lumap,synmap,lr,sr,rtmap
  | Xml.Element("relationtypes",["id",id;"type",typ;"name",name;"description",description;
                                 "posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
                                 "pwn",pwn],tests) ->
        let tests = Xlist.map tests process_tests in
        let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=(-1); rt_name=name; rt_description=description;
          rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
          (fun _ -> failwith "process_entry 5") in
        lumap,synmap,lr,sr,rtmap
  | Xml.Element("relationtypes",["id",id;"type",typ;"parent",parent;"reverse",reverse;"name",name;"description",description;
                                 "posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
                                 "pwn",pwn],tests) ->
        let tests = Xlist.map tests process_tests in
        let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=int_of_string reverse; rt_name=name; rt_description=description;
          rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
          (fun _ -> failwith "process_entry 5") in
        lumap,synmap,lr,sr,rtmap
  | Xml.Element("relationtypes",["id",id;"type",typ;"parent",parent;"name",name;"description",description;
                                 "posstr",posstr;"display",display;"shortcut",shortcut;"autoreverse",autoreverse;
                                 "pwn",pwn],tests) ->
        let tests = Xlist.map tests process_tests in
        let rtmap = IntMap.add_inc rtmap (int_of_string id) {rt_type=typ; rt_reverse=(-1); rt_name=name; rt_description=description;
          rt_posstr=posstr; rt_display=display; rt_shortcut=shortcut; rt_autoreverse=autoreverse; rt_pwn=pwn; rt_tests=tests}
          (fun _ -> failwith "process_entry 5") in
        lumap,synmap,lr,sr,rtmap
  | node -> print_endline (Xml.to_string node); failwith "process_entry 1"

let load_data filename =
  match try Xml.parse_file filename with Xml.Error e -> failwith ("load_data Xml.Error " ^ Xml.error e) with
    Xml.Element("array-list",_,entries) ->
      Xlist.fold entries (IntMap.empty,IntMap.empty,[],[],IntMap.empty) process_entry
  | node -> failwith ("load_data " ^ (Xml.to_string node))

let merge_lu_syn lumap synmap =
  IntMap.map synmap (fun syn ->
    let units = Xlist.map syn.syn_units (fun (id,_) -> id, IntMap.find lumap id) in
    let pos = match StringSet.to_list (Xlist.fold units StringSet.empty (fun set (_,lu) ->
                 StringSet.add set lu.lu_pos)) with
        [] -> failwith "merge_lu_syn: empty synset"
      | [pos] -> pos
      | _ -> failwith "merge_lu_syn: inconsistent pos" in
    {syn with syn_units=units; syn_pos=pos})

let merge_lu_syn2 lumap synmap =
  IntMap.fold synmap lumap (fun lumap syn_id syn ->
    Xlist.fold syn.syn_units lumap (fun lumap (id,_) ->
      let lu = try IntMap.find lumap id with Not_found -> failwith "merge_lu_syn2" in
      if lu.lu_syn <> -1 then failwith "merge_lu_syn2" else
      IntMap.add lumap id {lu with lu_syn=syn_id}))

let create_relation_map rel_id rels =
  Xlist.fold rels Graph.empty (fun graph r ->
    if r.r_relation = rel_id then
      Graph.add graph r.r_parent r.r_child
    else graph)

let assign_no_hipo synmap hipo =
  IntMap.mapi synmap (fun id syn ->
    {syn with syn_no_hipo=IntSet.size (Graph.get_children_ids hipo IntSet.empty id)})

let select_pos synmap pos =
  IntMap.fold synmap IntSet.empty (fun selected id syn ->
    if syn.syn_pos = pos then IntSet.add selected id else selected)

let select_big_synsets synmap threshold =
  IntMap.fold synmap IntSet.empty (fun selected id syn ->
    if syn.syn_no_hipo >= threshold then IntSet.add selected id else selected)

let lu_name lu =
  lu.lu_name ^ "-" ^ lu.lu_variant

let syn_name syn =
  String.concat ", " (Xlist.map syn.syn_units (fun (_,lu) -> lu_name lu))

let syn_name_single syn =
  if syn.syn_units = [] then "empty" else
  lu_name (snd (List.hd syn.syn_units))

let print_synset_names filename synmap selected =
  File.file_out filename (fun file ->
    IntSet.iter selected (fun id ->
      let syn = IntMap.find synmap id in
      let names = syn_name syn in
      Printf.fprintf file "%d\t%d\t%s\t%s\n" syn.syn_no_hipo id syn.syn_pos names))


(**************************************************)


(*
let string_of_tests tests =
  String.concat " " (Xlist.map tests (fun (t,p) -> "(" ^ t ^ "," ^ p ^ ")"))

let string_of_units units =
  String.concat " " (Xlist.map units fst)

let string_of_lu lu =
  Printf.sprintf "\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\"" lu.lu_name lu.lu_pos lu.lu_tagcount lu.lu_domain
    lu.lu_desc lu.lu_workstate lu.lu_source lu.lu_variant

let string_of_syn syn =
  Printf.sprintf "\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\"" syn.syn_workstate syn.syn_split
    syn.syn_owner syn.syn_definition syn.syn_desc syn.syn_abstract (string_of_units syn.syn_units)

let string_of_rt rt =
  Printf.sprintf "\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\";\"%s\"" rt.rt_type rt.rt_reverse rt.rt_name rt.rt_description rt.rt_posstr
    rt.rt_display rt.rt_shortcut rt.rt_autoreverse rt.rt_pwn (string_of_tests rt.rt_tests)

let lu_names = ["name"; "pos"; "tagcount"; "domain"; "desc"; "workstate"; "source"; "variant"]
let syn_names = ["workstate"; "split"; "owner"; "definition"; "desc"; "abstract"; "units"]
let rt_names = ["type"; "reverse"; "name"; "description"; "posstr"; "display"; "shortcut"; "autoreverse"; "pwn"; "tests"]
let rel_names = ["parent"; "child"; "valid"; "owner"]

let print_lu_map filename lumap =
  File.file_out filename (fun file ->
    Printf.fprintf file "id;%s\n" (String.concat ";" lu_names);
    StringMap.iter lumap (fun id lu ->
      Printf.fprintf file "%s;%s\n" id (string_of_lu lu)))

let print_syn_map filename synmap =
  File.file_out filename (fun file ->
    Printf.fprintf file "id;%s\n" (String.concat ";" syn_names);
    StringMap.iter synmap (fun id syn ->
      Printf.fprintf file "%s;%s\n" id (string_of_syn syn)))

let print_rt_map filename rel_count rtmap =
  File.file_out filename (fun file ->
    Printf.fprintf file "id;quantity;%s\n" (String.concat ";" rt_names);
    StringMap.iter rtmap (fun id rt ->
      Printf.fprintf file "%s;%d;%s\n" id (try StringQMap.find rel_count id with Not_found -> 0) (string_of_rt rt)))

let print_rels filename rel_id rels =
  File.file_out filename (fun file ->
    Printf.fprintf file "%s\n" (String.concat ";" rel_names);
    Xlist.iter rels (fun r ->
      if r.r_relation = rel_id then
        Printf.fprintf file "%s;%s;%s;%s\n" r.r_parent r.r_child r.r_valid r.r_owner))


let print_stringqmap file name qmap =
  Printf.fprintf file "%s\n" name;
  StringQMap.iter qmap (fun k v ->
    Printf.fprintf file "%6d %s\n" v k)

let check_lexical_units_fields lumap =
  let pos = StringMap.fold lumap StringQMap.empty (fun pos _ lu ->
    StringQMap.add pos lu.lu_pos) in
  print_stringqmap stdout "wartości lu_pos" pos;
  (* let tagcount = StringMap.fold lumap StringQMap.empty (fun tagcount _ lu ->
    StringQMap.add tagcount lu.lu_tagcount) in
  print_stringqmap stdout "wartości lu_tagcount" tagcount; *)
  let domain = StringMap.fold lumap StringQMap.empty (fun domain _ lu ->
    StringQMap.add domain lu.lu_domain) in
  print_stringqmap stdout "wartości lu_domain" domain;
  let desc = StringMap.fold lumap StringQMap.empty (fun desc _ lu ->
    StringQMap.add desc lu.lu_desc) in
  print_stringqmap stdout "wartości lu_desc" desc;
  let workstate = StringMap.fold lumap StringQMap.empty (fun workstate _ lu ->
    StringQMap.add workstate lu.lu_workstate) in
  print_stringqmap stdout "wartości lu_workstate" workstate;
  let source = StringMap.fold lumap StringQMap.empty (fun source _ lu ->
    StringQMap.add source lu.lu_source) in
  print_stringqmap stdout "wartości lu_source" source;
  ()

let check_lu_syn_consistency lumap synmap =
  let set = StringMap.fold lumap StringSet.empty (fun set id _ ->
    if StringSet.mem set id then failwith "check_lu_syn_consistency 1" else
    StringSet.add set id) in
  let set = StringMap.fold synmap set (fun set _ syn ->
    Xlist.fold syn.syn_units set (fun set (id,_) ->
      if not (StringSet.mem set id) then failwith "check_lu_syn_consistency 2" else
      StringSet.remove set id)) in
  if not (StringSet.is_empty set) then failwith "check_lu_syn_consistency 3" else
  ()

let count_relations qmap rtmap rels =
  Xlist.fold rels qmap (fun qmap rel ->
    if not (StringMap.mem rtmap rel.r_relation) then print_endline ("unknown relation: " ^ rel.r_relation);
    StringQMap.add qmap rel.r_relation)

let print_lu_relation filename lumap attr =
  File.file_out filename (fun file ->
    StringMap.iter attr (fun id1 l ->
      Xlist.iter l (fun id2 ->
        let lu1 = StringMap.find lumap id1 in
        let lu2 = StringMap.find lumap id2 in
        Printf.fprintf file "%s\t%s\n" (lu_name lu1) (lu_name lu2))))

let print_syn_relation filename synmap attr =
  File.file_out filename (fun file ->
    StringMap.iter attr (fun id1 l ->
      Xlist.iter l (fun id2 ->
        let syn1 = StringMap.find synmap id1 in
        let syn2 = StringMap.find synmap id2 in
        Printf.fprintf file "%s\t%s\n" (syn_name syn1) (syn_name syn2))))

let print_lu_relations path lumap lr =
  let relations = Xlist.fold lr StringMap.empty (fun relations r ->
    let map = try StringMap.find relations r.r_relation with Not_found -> StringMap.empty in
    let map = StringMap.add_inc map r.r_parent [r.r_child] (fun l -> r.r_child :: l) in
    StringMap.add relations r.r_relation map) in
  StringMap.iter relations (fun id relation ->
    print_lu_relation (path ^ id ^ ".tab") lumap relation)

let print_syn_relations path synmap sr =
  let relations = Xlist.fold sr StringMap.empty (fun relations r ->
    let map = try StringMap.find relations r.r_relation with Not_found -> StringMap.empty in
    let map = StringMap.add_inc map r.r_parent [r.r_child] (fun l -> r.r_child :: l) in
    StringMap.add relations r.r_relation map) in
  StringMap.iter relations (fun id relation ->
    print_syn_relation (path ^ id ^ ".tab") synmap relation)

let create_relation_map_lex lu_syn rel_id rels =
  Xlist.fold rels Graph.empty (fun graph r ->
    if r.r_relation = rel_id then
      let parent = StringMap.find lu_syn r.r_parent in
      let child = StringMap.find lu_syn r.r_child in
      Graph.add graph (int_of_string r.r_parent) (int_of_string r.r_child)
    else graph)

let test_reverse_rel ra rb =
  StringMap.iter ra (fun ida l ->
    Xlist.iter l (fun idb ->
      try
        let l = StringMap.find rb idb in
        if Xlist.mem l ida then ()
        else print_endline ("test_reverse_rel a: " ^ ida)
      with Not_found -> print_endline ("test_reverse_rel b: " ^ idb)))

let pwn_pos = ["czasownik pwn"; "przymiotnik pwn"; "przysłówek pwn"; "rzeczownik pwn"]

let remove_pwn synmap =
  StringMap.fold synmap StringMap.empty (fun synmap id syn ->
    if Xlist.mem pwn_pos syn.syn_pos then synmap else StringMap.add synmap id syn)

(*let get_maximal_not_isolated_synsets synmap hipero hipo =
  let set = get_maximal_synsets synmap hipero in
  let set = StringSet.fold set StringSet.empty (fun set id ->
    if StringMap.mem hipo id then StringSet.add set id else set) in
  set*)

let rev_relation map =
  StringMap.fold map StringMap.empty (fun map id l ->
    Xlist.fold l map (fun map id2 ->
      StringMap.add_inc map id2 [id] (fun l -> id :: l)))

let sum_relation map map2 =
  StringMap.fold map2 map (fun map id l ->
    StringMap.add_inc map id l (fun l2 -> l @ l2))


    let synset_name synmap lumap id =
      let syn = try StringMap.find synmap id with Not_found -> failwith "synset_name" in
      let names = Xlist.map syn.syn_units (fun (luid,_) ->
        let lu = try StringMap.find lumap luid with Not_found -> failwith "synset_name" in
        lu.lu_name ^ " " ^ lu.lu_variant) in
      String.concat ", " names

(*    let synset_name_with_args walenty synmap lumap id =
      let syn = try StringMap.find synmap id with Not_found -> failwith "synset_name_with_args" in
      let names = Xlist.map syn.syn_units (fun luid ->
        let lu = try StringMap.find lumap luid with Not_found -> failwith "synset_name_with_args" in
        lu.lu_name, lu.lu_variant) in
      String.concat ", " (Xlist.map names (fun (name,variant) ->
        let args = try StringSet.to_list (StringMap.find walenty name) with Not_found -> [] in
        if args = [] then name ^ " " ^ variant else
        name ^ " " ^ variant ^ " <B>" ^ String.concat " " args ^ "</B>"))
*)
    let is_synset_abstract synmap id =
      let syn = try StringMap.find synmap id with Not_found -> failwith "is_synset_abstract" in
      match syn.syn_abstract with
        "true" -> true
      | "false" -> false
      | _ -> failwith "is_synset_abstract"

(*
let count_relation_map rmap =
  let qmap = StringMap.fold rmap IntQMap.empty (fun qmap _ l ->
    IntQMap.add qmap (Xlist.size l)) in
  IntQMap.iter qmap (fun len q ->
    Printf.printf "%6d %2d\n" q len)
*)
let rec create_spaces n =
  if n = 0 then "" else " " ^ (create_spaces (n - 1))

let rec is_synset_abstract2_rec tree synmap id visited =
  if is_synset_abstract synmap id then true,visited else
  if StringSet.mem visited id then false,visited else
  let visited = StringSet.add visited id in
  Xlist.fold (try StringMap.find tree id with Not_found -> []) (false,visited) (fun (b,visited) id2 ->
    let v,visited = is_synset_abstract2_rec tree synmap id2 visited in
    v || b, visited)

let is_synset_abstract2 tree synmap id =
  fst (is_synset_abstract2_rec tree synmap id StringSet.empty)

let rec print_subtree_rec file treshold tree synmap lumap level abs_parent id visited =
  if StringSet.mem visited id then (
    if is_synset_abstract2 tree synmap id || StringSet.size (get_subtree tree id StringSet.empty) >= treshold || abs_parent then
      Printf.fprintf file "%s%d VISITED %s%s\n" (create_spaces level)
        (StringSet.size (get_subtree tree id StringSet.empty))
        (if is_synset_abstract synmap id then "*" else "")
        (synset_name synmap lumap id);
    visited
  ) else (
    if is_synset_abstract2 tree synmap id || StringSet.size (get_subtree tree id StringSet.empty) >= treshold || abs_parent then
      Printf.fprintf file "%s%d %s%s\n" (create_spaces level)
        (StringSet.size (get_subtree tree id StringSet.empty))
        (if is_synset_abstract synmap id then "*" else "")
        (synset_name synmap lumap id);
    Xlist.fold (try StringMap.find tree id with Not_found -> []) (StringSet.add visited id) (fun visited id2 ->
      print_subtree_rec file treshold tree synmap lumap (level+2) (is_synset_abstract synmap id) id2 visited))

let print_subtree filename treshold tree synmap lumap id =
  File.file_out filename (fun file ->
    ignore(print_subtree_rec file treshold tree synmap lumap 0 true id StringSet.empty))

let rec create_subtree_xml tree synmap lumap id visited =
  let name = synset_name synmap lumap id in
  let size = StringSet.size (get_subtree tree id StringSet.empty) in
  let abstract = if is_synset_abstract synmap id then ["abstract","true"] else [] in
  if StringSet.mem visited id then
    Xml.Element("node",("name",name) :: ("visited","true") :: ("size",string_of_int size) :: abstract,[]), visited
  else
    let l,visited = Xlist.fold (try StringMap.find tree id with Not_found -> []) ([],StringSet.add visited id) (fun (l,visited) id2 ->
      let xml,visited = create_subtree_xml tree synmap lumap id2 visited in
      xml :: l, visited) in
    Xml.Element("node",("name",name) :: ("size",string_of_int size) :: abstract,l),visited

let print_subtree_xml filename tree synmap lumap id =
  File.file_out filename (fun file ->
    let xml,_ = create_subtree_xml tree synmap lumap id StringSet.empty in
    Printf.fprintf file "%s" (Xml.to_string_fmt xml))

let rec find_connected_components_rec rel threshold synmap set conn id =
  (* if id = "28358" then print_endline "find_connected_components_rec 1"; *)
  if StringSet.mem set id || (StringMap.find synmap id).syn_no_hipo < threshold then conn,set else (
  (* if id = "28358" then print_endline "find_connected_components_rec 2"; *)
  let conn = StringSet.add conn id in
  let set = StringSet.add set id in
  let l = try StringMap.find rel id with Not_found -> [] in
  (* if id = "28358" then Printf.printf "find_connected_components_rec 3: |conn|=%d\n%!" (StringSet.size conn); *)
  Xlist.fold l (conn,set) (fun (conn,set) id ->
    find_connected_components_rec rel threshold synmap set conn id))

let find_connected_components rel threshold synmap =
  let l,_ = StringMap.fold rel ([],StringSet.empty) (fun (l,set) id _ ->
    if StringSet.mem set id then l,set else
    let conn,set = find_connected_components_rec rel threshold synmap set StringSet.empty id in
    (* if StringSet.mem conn "28358" then print_endline "find_connected_components 1"; *)
    conn :: l, set) in
  l

let has_syn_above_threshold synmap threshold conn =
  StringSet.fold conn false (fun b id ->
    if (StringMap.find synmap id).syn_no_hipo >= threshold then true else b)

let remove_conn l id =
  Xlist.fold l [] (fun l conn ->
    if StringSet.mem conn id then l else conn :: l)

let select_conn l id =
  Xlist.fold l [] (fun l conn ->
    if StringSet.mem conn id then conn :: l else l)

let print_hipo_graph path name threshold synmap hipo conn =
  ignore (Xlist.fold conn 1 (fun n conn ->
    let name = name ^ "_" ^ string_of_int n in
    if has_syn_above_threshold synmap threshold conn then (
    File.file_out (path ^ name ^ ".gv") (fun file ->
      Printf.fprintf file "digraph G {\n  node [shape=box]\n";(*  "rankdir = LR\n";*)
      StringMap.iter synmap (fun id syn ->
        if StringSet.mem conn id && syn.syn_no_hipo >= threshold then
          Printf.fprintf file "  %s [label=\"%s\\n%d\"]\n" id (syn_name_single syn) syn.syn_no_hipo);
      StringMap.iter hipo (fun id1 l ->
        if StringSet.mem conn id1 && (StringMap.find synmap id1).syn_no_hipo >= threshold then
          Xlist.iter l (fun id2 ->
            if (StringMap.find synmap id2).syn_no_hipo >= threshold then
          Printf.fprintf file "  %s -> %s\n" id1 id2));
      Printf.fprintf file "}\n");
    Sys.chdir path;
    ignore (Sys.command ("dot -Tpng " ^ name ^ ".gv -o " ^ name ^ ".png"));
    Sys.chdir "..";
    n+1) else n))



(*
let rec create_spaces_html n =
  if n = 0 then "" else "&nbsp;&nbsp;&nbsp;" ^ (create_spaces_html (n - 1))

let create_args_map walenty synmap lumap =
  StringMap.fold synmap StringMap.empty (fun map id syn ->
    let names = Xlist.map syn.syn_units (fun luid ->
      let lu = try StringMap.find lumap luid with Not_found -> failwith "find_args" in
      lu.lu_name) in
    let args = Xlist.fold names StringSet.empty (fun set name ->
      try
        StringSet.union set (StringMap.find walenty name)
      with Not_found -> set) in
    if StringSet.is_empty args then map else
    StringMap.add map id args)

let rec create_args_tree_map_rec tree argmap id map =
  if StringMap.mem map id then map,StringMap.find map id else
  let args = try StringMap.find argmap id with Not_found -> StringSet.empty in
  let map,args = Xlist.fold (try StringMap.find tree id with Not_found -> []) (map,args) (fun (map,args) id2 ->
    let map2,args2 = create_args_tree_map_rec tree argmap id2 map in
    map2, StringSet.union args args2) in
  StringMap.add map id args, args

let create_args_tree_map tree argmap id =
  fst (create_args_tree_map_rec tree argmap id StringMap.empty)

let string_of_args argmap argtreemap id =
  let args = StringSet.to_list (try StringMap.find argtreemap id with Not_found -> StringSet.empty) in
  let local_args = try StringMap.find argmap id with Not_found -> StringSet.empty in
  let args = Xlist.map args (fun arg ->
    if StringSet.mem local_args arg then "<B>" ^ arg ^ "</B>"
    else arg) in
  String.concat " " args

let string_of_args2 argmap id =
  let args = StringSet.to_list (try StringMap.find argmap id with Not_found -> StringSet.empty) in
  "<B>" ^ String.concat " " args ^ "</B>"

(*let find_args walenty tree synmap lumap id =
  let syn = try StringMap.find synmap id with Not_found -> failwith "find_args" in
  let names = Xlist.map syn.syn_units (fun luid ->
    let lu = try StringMap.find lumap luid with Not_found -> failwith "find_args" in
    lu.lu_name) in
  let args = Xlist.fold names StringSet.empty (fun set name ->
    try
      StringSet.union set (StringMap.find walenty name)
    with Not_found -> set) in
  String.concat ", " (StringSet.to_list args)  *)

let rec print_arg_subtree_rec file argmap argtreemap tree synmap lumap level abs_parent id visited =
  if StringSet.mem visited id then (
    if is_synset_abstract2 tree synmap id || StringSet.size (get_subtree tree id StringSet.empty) >= 100 || abs_parent then
      Printf.fprintf file "%s<FONT SIZE=2>%s</FONT> %d VISITED %s%s %s<BR>\n" (create_spaces_html level) id
        (StringSet.size (get_subtree tree id StringSet.empty))
        (if is_synset_abstract synmap id then "*" else "")
        (synset_name synmap lumap id)
        (string_of_args argmap argtreemap id);
    visited
  ) else (
    if is_synset_abstract2 tree synmap id || StringSet.size (get_subtree tree id StringSet.empty) >= 100 || abs_parent then
      Printf.fprintf file "%s<FONT SIZE=2>%s</FONT> %d %s%s %s<BR>\n" (create_spaces_html level) id
        (StringSet.size (get_subtree tree id StringSet.empty))
        (if is_synset_abstract synmap id then "*" else "")
        (synset_name synmap lumap id)
        (string_of_args argmap argtreemap id);
    Xlist.fold (try StringMap.find tree id with Not_found -> []) (StringSet.add visited id) (fun visited id2 ->
      print_arg_subtree_rec file argmap argtreemap tree synmap lumap (level+2) (is_synset_abstract synmap id) id2 visited))

let print_html_header file =
  Printf.fprintf file "<HTML><HEAD><META HTTP-EQUIV=\"CONTENT-TYPE\" CONTENT=\"text/html; charset=utf8\"></HEAD><BODY>\n"

let print_html_trailer file =
  Printf.fprintf file "</BODY></HTML>\n"

let print_arg_subtree filename argmap argtreemap tree synmap lumap id =
  File.file_out filename (fun file ->
    print_html_header file;
    ignore(print_arg_subtree_rec file argmap argtreemap tree synmap lumap 0 true id StringSet.empty);
    print_html_trailer file)

let has_interesting_arg argtreemap id =
  try
    StringSet.size (StringMap.find argtreemap id) > 0
  with Not_found -> false

let rec print_arg_subtree_rec2 file argmap argtreemap walenty tree synmap lumap level abs_parent id visited =
  if StringSet.mem visited id then (
    if has_interesting_arg argtreemap id then
      Printf.fprintf file "%s<FONT SIZE=2>%s</FONT> %d VISITED %s%s<BR>\n" (create_spaces_html level) id
        (StringSet.size (get_subtree tree id StringSet.empty))
        (if is_synset_abstract synmap id then "*" else "")
        (synset_name_with_args walenty synmap lumap id)(*
        (string_of_args2 argmap id)*);
    visited
  ) else (
    if has_interesting_arg argtreemap id then
      Printf.fprintf file "%s<FONT SIZE=2>%s</FONT> %d %s%s<BR>\n" (create_spaces_html level) id
        (StringSet.size (get_subtree tree id StringSet.empty))
        (if is_synset_abstract synmap id then "*" else "")
        (synset_name_with_args walenty synmap lumap id)(*
        (string_of_args2 argmap id)*);
    Xlist.fold (try StringMap.find tree id with Not_found -> []) (StringSet.add visited id) (fun visited id2 ->
      print_arg_subtree_rec2 file argmap argtreemap walenty tree synmap lumap (level+2) (is_synset_abstract synmap id) id2 visited))

let print_arg_subtree2 filename argmap argtreemap walenty tree synmap lumap id =
  File.file_out filename (fun file ->
    print_html_header file;
    ignore(print_arg_subtree_rec2 file argmap argtreemap walenty tree synmap lumap 0 true id StringSet.empty);
    print_html_trailer file)

let compare_lu_id (x,_,_) (y,_,_) = compare (int_of_string x)  (int_of_string y)

let print_lexical_units path lumap =
  let map = StringMap.fold lumap StringMap.empty (fun map id lu ->
    StringMap.add_inc map lu.lu_pos [id,lu.lu_name,lu.lu_variant] (fun l -> (id,lu.lu_name,lu.lu_variant) :: l)) in
  StringMap.iter map (fun pos l ->
    File.file_out (path ^ pos ^ ".tab") (fun file ->
      Xlist.iter (Xlist.sort l compare_lu_id) (fun (id,lemma,variant) ->
        Printf.fprintf file "%s\t%s\t%s\n" id lemma variant)))
*)
let print_lexical_units_full path lumap =
  let map = StringMap.fold lumap StringMap.empty (fun map id lu ->
    StringMap.add_inc map lu.lu_pos [id,lu] (fun l -> (id,lu) :: l)) in
  StringMap.iter map (fun pos l ->
    File.file_out (path ^ pos ^ ".tab") (fun file ->
      Xlist.iter l (fun (id,lu) ->
        Printf.fprintf file "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" id lu.lu_name lu.lu_tagcount lu.lu_domain lu.lu_desc lu.lu_workstate lu.lu_source lu.lu_variant)))
(*
let print_synsets filename synmap =
  File.file_out filename (fun file ->
    StringMap.iter synmap (fun id syn ->
      Printf.fprintf file "%s\t%s\n" id (String.concat " " syn.syn_units)))

let print_hipero filename hipero =
  File.file_out filename (fun file ->
    StringMap.iter hipero (fun id l ->
      Printf.fprintf file "%s\t%s\n" id (String.concat " " l)))
*)

(*let xml_fold filename s f =
  File.file_in filename (fun file  ->
    let scanbuf = Scanf.Scanning.from_function (fun () -> input_char file) in
    let (*size*)_ =
      try
	Scanf.bscanf scanbuf "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<array-list owner=\"\" date=\"2013-07-24 11.45.32\" version=\"WordnetLoom 1.64.0\">\n"
      with
	_ -> failwith ("xml_fold " ^ filename ^ ": invalid header") in
    let lex = Lexing.from_function (fun buf len -> input file buf 0 len) in
    let par = XmlParser.make () in
    XmlParser.prove par false;
    XmlParser.check_eof par false;
    let r = ref s in
    let i = ref 0 in
    try
      while true do
	try
	  let graph = (*of_xml*) (XmlParser.parse par (XmlParser.SLexbuf lex)) in
	  r := f (!r) graph;
	  incr i;
	with Parsing.Parse_error -> failwith ("xml_fold " ^ filename ^ ": Parse_error")
      done;
      !r
    with
      Xml.Error e -> (
        print_endline ("xml_fold Xml.Error " ^ Xml.error e);
        Printf.printf "%d\n" (!i);
	(*if !i = size then*) !r
	(*else failwith ("xml_fold " ^ filename ^ ": invalid file contents")*)))*)
*)