Commit 25c2221c352fa384a1578c9dc6d155e5391ba4b2
1 parent
cf09ae51
obsługa concrafta
Showing 5 changed files with 76 additions and 5 deletions
pre/.gitignore
pre/concraft_test.ml
0 → 100644
1 | + | |
2 | +(* let concraft_in, concraft_out, concraft_err = Unix.open_process_full "../../../.local/bin/concraft-pl tag ../concraft/nkjp-model-0.2.gz" [| |] *) | |
3 | +let concraft_in, concraft_out, concraft_err = | |
4 | + Unix.open_process_full "concraft-pl tag ../concraft/nkjp-model-0.2.gz" | |
5 | + [|"PATH=" ^ Sys.getenv "PATH"|] | |
6 | + | |
7 | +let _ = | |
8 | + print_endline "out"; | |
9 | + Printf.fprintf concraft_out "Ala ma kota.\n\n%!"; | |
10 | + print_endline "in"; | |
11 | + print_endline ("concraft error message: " ^ input_line concraft_err); | |
12 | + () | |
... | ... |
pre/makefile
1 | 1 | OCAMLC=ocamlc |
2 | 2 | OCAMLOPT=ocamlopt |
3 | 3 | OCAMLDEP=ocamldep |
4 | -INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I ../morphology -I ../parser | |
4 | +INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I ../morphology -I ../parser -I ../corpora | |
5 | 5 | OCAMLFLAGS=$(INCLUDES) -g |
6 | 6 | OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa inflexion.cmxa |
7 | 7 | INSTALLDIR=`ocamlc -where` |
8 | 8 | |
9 | 9 | WAL= paths.ml walTypes.ml walStringOf.ml preTypes.ml preWordnet.ml walParser.ml walTEI.ml walFrames.ml |
10 | -PRE= preTokenizer.ml preAcronyms.ml prePatterns.ml prePaths.ml preMWE.ml preSemantics.ml preSentences.ml preProcessing.ml | |
10 | +PRE= preTokenizer.ml preAcronyms.ml prePatterns.ml prePaths.ml preMWE.ml preSemantics.ml preSentences.ml ../corpora/CONLL.ml preProcessing.ml | |
11 | 11 | |
12 | 12 | all: |
13 | 13 | $(OCAMLOPT) -o pre $(OCAMLOPTFLAGS) $(WAL) $(PRE) |
14 | 14 | |
15 | +concraft_test: concraft_test.ml | |
16 | + $(OCAMLOPT) -o concraft_test $(OCAMLOPTFLAGS) concraft_test.ml | |
17 | + | |
15 | 18 | .SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx |
16 | 19 | |
17 | 20 | .mll.ml: |
... | ... | @@ -33,4 +36,4 @@ all: |
33 | 36 | $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< |
34 | 37 | |
35 | 38 | clean: |
36 | - rm -f *~ *.cm[oix] *.o pre | |
39 | + rm -f *~ *.cm[oix] *.o pre concraft_test | |
... | ... |
pre/preProcessing.ml
... | ... | @@ -604,13 +604,68 @@ let parse_conll tokens dep_paths = |
604 | 604 | conll_id + 1) in |
605 | 605 | () |
606 | 606 | |
607 | +(* let concraft_in, concraft_out, concraft_err = Unix.open_process_full "../../../.local/bin/concraft-pl tag ../concraft/nkjp-model-0.2.gz" [| |] *) | |
608 | +(*let concraft_in, concraft_out, concraft_err = | |
609 | + Unix.open_process_full "concraft-pl tag ../concraft/nkjp-model-0.2.gz" | |
610 | + [|"PATH=" ^ Sys.getenv "PATH"|] | |
611 | + | |
612 | +let rec load_concraft_sentence white orth rev ic = | |
613 | + (* print_endline "load_concraft_sentence 1"; *) | |
614 | + (* print_endline ("concraft error message: " ^ input_line concraft_err); *) | |
615 | + let s = input_line ic in | |
616 | + (* print_endline ("load_concraft_sentence: " ^ s); *) | |
617 | + if s = "" then List.rev rev else | |
618 | + match Xstring.split_delim "\t" s with | |
619 | + [""; lemma; interp; "disamb"] -> load_concraft_sentence "" "" ((white,orth,lemma,interp) :: rev) ic | |
620 | + | [""; lemma; interp] -> load_concraft_sentence white orth rev ic | |
621 | + | [orth; white] -> load_concraft_sentence white orth rev ic | |
622 | + | _ -> failwith ("load_concraft_sentence: " ^ s) | |
623 | + | |
624 | +let make_token (white,orth,lemma,interp) = | |
625 | + let cat,interp = match Xstring.split ":" interp with | |
626 | + cat :: l -> cat, [Xlist.map l (fun tag -> [tag])] | |
627 | + | _ -> failwith "make_token" in | |
628 | + {empty_token with orth = orth; token = Lemma(lemma,cat,interp)} | |
629 | + | |
630 | +let parse_mate tokens pbeg s = | |
631 | + (* print_endline ("parse_mate: " ^ s); *) | |
632 | + Printf.fprintf concraft_out "%s\n\n%!" s; | |
633 | + let l = load_concraft_sentence "" "" [] concraft_in in | |
634 | + let l = Xlist.map l make_token in | |
635 | + let l = {empty_token with token = Interp "<conll_root>"} :: l in | |
636 | + let l = Xlist.map l (fun t -> ExtArray.add tokens t,-1,"") in | |
637 | + let _ = CONLL.establish_for_token pbeg s tokens (List.tl l) in | |
638 | + let dep_paths = Array.of_list l in | |
639 | + parse_conll tokens dep_paths; | |
640 | + dep_paths | |
641 | + | |
642 | +let rec parse_mate_sentence tokens mode pbeg = function | |
643 | + RawSentence s -> if mode <> Mate then RawSentence s else DepSentence (parse_mate tokens pbeg s) | |
644 | + | StructSentence(paths,last) -> StructSentence(paths,last) | |
645 | + | DepSentence(paths) -> DepSentence(paths) | |
646 | + | QuotedSentences sentences -> | |
647 | + QuotedSentences(Xlist.map sentences (fun p -> | |
648 | + {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; pfile_prefix=p.PreTypes.pfile_prefix; | |
649 | + psentence=parse_mate_sentence tokens mode pbeg p.PreTypes.psentence})) | |
650 | + | AltSentence l -> AltSentence(Xlist.map l (fun (mode,sentence) -> | |
651 | + mode, parse_mate_sentence tokens mode pbeg sentence)) | |
652 | + | |
653 | +let parse_mate_sentences tokens sentences = | |
654 | + Xlist.map sentences (fun p -> | |
655 | + {p with psentence=parse_mate_sentence tokens Struct p.pbeg p.psentence})*) | |
656 | + | |
607 | 657 | let parse_text = function |
608 | 658 | RawText query -> |
659 | + (* print_endline ("parse_text: " ^ query); *) | |
609 | 660 | let tokens = ExtArray.make 100 empty_token in |
610 | 661 | let _ = ExtArray.add tokens empty_token in (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? *) |
611 | 662 | let paragraphs = Xlist.map (Xstring.split "\n" query) (fun paragraph -> |
612 | 663 | let paths = parse paragraph in |
664 | + (* print_endline "parse_text 1"; *) | |
613 | 665 | let sentences = PreSentences.split_into_sentences paragraph tokens paths in |
666 | + (* print_endline "parse_text 2"; *) | |
667 | + (* let sentences = parse_mate_sentences tokens sentences in *) | |
668 | + (* print_endline "parse_text 3"; *) | |
614 | 669 | AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) in |
615 | 670 | AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs, tokens)] |
616 | 671 | | AltText[Raw,RawText query;CONLL,StructText([ |
... | ... | @@ -633,9 +688,9 @@ let rec main_loop in_chan out_chan = |
633 | 688 | (try |
634 | 689 | (* let time0 = Sys.time () in *) |
635 | 690 | let utime0 = Unix.gettimeofday () in |
636 | - (* print_endline "main_loop 3a"; *) | |
691 | + (* print_endline "main_loop 3a"; *) | |
637 | 692 | let text = parse_text query in |
638 | - (* print_endline "main_loop 4a"; *) | |
693 | + (* print_endline "main_loop 4a"; *) | |
639 | 694 | (* let time2 = Sys.time () in *) |
640 | 695 | let utime2 = Unix.gettimeofday () in |
641 | 696 | (* Printf.printf "time=%f utime=%f\n%!" (time2 -. time0) (utime2 -. utime0); *) |
... | ... |
swigra/parser/morfeusz2-swi.so
0 → 100755
No preview for this file type