diff --git a/pre/.gitignore b/pre/.gitignore index ecca61d..a2d009d 100644 --- a/pre/.gitignore +++ b/pre/.gitignore @@ -1 +1,2 @@ pre +concraft_test diff --git a/pre/concraft_test.ml b/pre/concraft_test.ml new file mode 100644 index 0000000..3acaf27 --- /dev/null +++ b/pre/concraft_test.ml @@ -0,0 +1,12 @@ + +(* let concraft_in, concraft_out, concraft_err = Unix.open_process_full "../../../.local/bin/concraft-pl tag ../concraft/nkjp-model-0.2.gz" [| |] *) +let concraft_in, concraft_out, concraft_err = + Unix.open_process_full "concraft-pl tag ../concraft/nkjp-model-0.2.gz" + [|"PATH=" ^ Sys.getenv "PATH"|] + +let _ = + print_endline "out"; + Printf.fprintf concraft_out "Ala ma kota.\n\n%!"; + print_endline "in"; + print_endline ("concraft error message: " ^ input_line concraft_err); + () diff --git a/pre/makefile b/pre/makefile index afb101a..3965aae 100755 --- a/pre/makefile +++ b/pre/makefile @@ -1,17 +1,20 @@ OCAMLC=ocamlc OCAMLOPT=ocamlopt OCAMLDEP=ocamldep -INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I ../morphology -I ../parser +INCLUDES=-I +xml-light -I +xlib -I +zip -I +bz2 -I ../morphology -I ../parser -I ../corpora OCAMLFLAGS=$(INCLUDES) -g OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa inflexion.cmxa INSTALLDIR=`ocamlc -where` WAL= paths.ml walTypes.ml walStringOf.ml preTypes.ml preWordnet.ml walParser.ml walTEI.ml walFrames.ml -PRE= preTokenizer.ml preAcronyms.ml prePatterns.ml prePaths.ml preMWE.ml preSemantics.ml preSentences.ml preProcessing.ml +PRE= preTokenizer.ml preAcronyms.ml prePatterns.ml prePaths.ml preMWE.ml preSemantics.ml preSentences.ml ../corpora/CONLL.ml preProcessing.ml all: $(OCAMLOPT) -o pre $(OCAMLOPTFLAGS) $(WAL) $(PRE) +concraft_test: concraft_test.ml + $(OCAMLOPT) -o concraft_test $(OCAMLOPTFLAGS) concraft_test.ml + .SUFFIXES: .mll .mly .ml .mli .cmo .cmi .cmx .mll.ml: @@ -33,4 +36,4 @@ all: $(OCAMLOPT) $(OCAMLOPTFLAGS) -c $< clean: - rm -f *~ *.cm[oix] *.o pre + rm -f *~ *.cm[oix] *.o pre concraft_test diff --git a/pre/preProcessing.ml b/pre/preProcessing.ml index db2f025..7fa70cd 100644 --- a/pre/preProcessing.ml +++ b/pre/preProcessing.ml @@ -604,13 +604,68 @@ let parse_conll tokens dep_paths = conll_id + 1) in () +(* let concraft_in, concraft_out, concraft_err = Unix.open_process_full "../../../.local/bin/concraft-pl tag ../concraft/nkjp-model-0.2.gz" [| |] *) +(*let concraft_in, concraft_out, concraft_err = + Unix.open_process_full "concraft-pl tag ../concraft/nkjp-model-0.2.gz" + [|"PATH=" ^ Sys.getenv "PATH"|] + +let rec load_concraft_sentence white orth rev ic = + (* print_endline "load_concraft_sentence 1"; *) + (* print_endline ("concraft error message: " ^ input_line concraft_err); *) + let s = input_line ic in + (* print_endline ("load_concraft_sentence: " ^ s); *) + if s = "" then List.rev rev else + match Xstring.split_delim "\t" s with + [""; lemma; interp; "disamb"] -> load_concraft_sentence "" "" ((white,orth,lemma,interp) :: rev) ic + | [""; lemma; interp] -> load_concraft_sentence white orth rev ic + | [orth; white] -> load_concraft_sentence white orth rev ic + | _ -> failwith ("load_concraft_sentence: " ^ s) + +let make_token (white,orth,lemma,interp) = + let cat,interp = match Xstring.split ":" interp with + cat :: l -> cat, [Xlist.map l (fun tag -> [tag])] + | _ -> failwith "make_token" in + {empty_token with orth = orth; token = Lemma(lemma,cat,interp)} + +let parse_mate tokens pbeg s = + (* print_endline ("parse_mate: " ^ s); *) + Printf.fprintf concraft_out "%s\n\n%!" s; + let l = load_concraft_sentence "" "" [] concraft_in in + let l = Xlist.map l make_token in + let l = {empty_token with token = Interp "<conll_root>"} :: l in + let l = Xlist.map l (fun t -> ExtArray.add tokens t,-1,"") in + let _ = CONLL.establish_for_token pbeg s tokens (List.tl l) in + let dep_paths = Array.of_list l in + parse_conll tokens dep_paths; + dep_paths + +let rec parse_mate_sentence tokens mode pbeg = function + RawSentence s -> if mode <> Mate then RawSentence s else DepSentence (parse_mate tokens pbeg s) + | StructSentence(paths,last) -> StructSentence(paths,last) + | DepSentence(paths) -> DepSentence(paths) + | QuotedSentences sentences -> + QuotedSentences(Xlist.map sentences (fun p -> + {pid=p.PreTypes.pid; pbeg=p.PreTypes.pbeg; plen=p.PreTypes.plen; pnext=p.PreTypes.pnext; pfile_prefix=p.PreTypes.pfile_prefix; + psentence=parse_mate_sentence tokens mode pbeg p.PreTypes.psentence})) + | AltSentence l -> AltSentence(Xlist.map l (fun (mode,sentence) -> + mode, parse_mate_sentence tokens mode pbeg sentence)) + +let parse_mate_sentences tokens sentences = + Xlist.map sentences (fun p -> + {p with psentence=parse_mate_sentence tokens Struct p.pbeg p.psentence})*) + let parse_text = function RawText query -> + (* print_endline ("parse_text: " ^ query); *) let tokens = ExtArray.make 100 empty_token in let _ = ExtArray.add tokens empty_token in (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? *) let paragraphs = Xlist.map (Xstring.split "\n" query) (fun paragraph -> let paths = parse paragraph in + (* print_endline "parse_text 1"; *) let sentences = PreSentences.split_into_sentences paragraph tokens paths in + (* print_endline "parse_text 2"; *) + (* let sentences = parse_mate_sentences tokens sentences in *) + (* print_endline "parse_text 3"; *) AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) in AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs, tokens)] | AltText[Raw,RawText query;CONLL,StructText([ @@ -633,9 +688,9 @@ let rec main_loop in_chan out_chan = (try (* let time0 = Sys.time () in *) let utime0 = Unix.gettimeofday () in - (* print_endline "main_loop 3a"; *) + (* print_endline "main_loop 3a"; *) let text = parse_text query in - (* print_endline "main_loop 4a"; *) + (* print_endline "main_loop 4a"; *) (* let time2 = Sys.time () in *) let utime2 = Unix.gettimeofday () in (* Printf.printf "time=%f utime=%f\n%!" (time2 -. time0) (utime2 -. utime0); *) diff --git a/swigra/parser/morfeusz2-swi.so b/swigra/parser/morfeusz2-swi.so new file mode 100755 index 0000000..e263e1c --- /dev/null +++ b/swigra/parser/morfeusz2-swi.so