Commit bc99576298bc25ab6e1962b0e95aab6c70f494e8
1 parent
4c6f5e84
rozpoczęcie dostosowywania preprocessingu w formacie CoNLL
Showing 1 changed file with 20 additions and 20 deletions
pre/preProcessing.ml
... | ... | @@ -582,9 +582,7 @@ let parse query (*next_id*) = |
582 | 582 | let paths = PreLemmatization.combine_interps paths in |
583 | 583 | (* print_endline (PrePaths.to_string paths); *)*) |
584 | 584 | |
585 | -(* let parse_conll paths next_id = | |
586 | - (* print_endline "a11"; *) | |
587 | - let paths = PreMWE.process paths in | |
585 | +let parse_conll tokens paths next_id = | |
588 | 586 | (* print_endline "a12"; *) |
589 | 587 | let paths = find_proper_names paths in |
590 | 588 | (* print_endline "a13"; *) |
... | ... | @@ -608,7 +606,7 @@ let parse query (*next_id*) = |
608 | 606 | let paths = prepare_indexes paths in |
609 | 607 | let paths = PrePaths.sort paths in |
610 | 608 | (* print_endline "a18"; *) |
611 | - paths, next_id *) | |
609 | + paths, next_id | |
612 | 610 | |
613 | 611 | let make_ids tokens paths = |
614 | 612 | Xlist.rev_map paths (fun t -> |
... | ... | @@ -621,32 +619,34 @@ let make_chart paths last = |
621 | 619 | chart.(beg) <- (id,next) :: chart.(beg)); |
622 | 620 | chart |
623 | 621 | |
622 | +let split_into_sentences paths = | |
623 | + let paths = make_ids tokens paths in | |
624 | + let paths,last = PreSentences.prepare_indexes paths in | |
625 | + let chart = make_chart paths last in | |
626 | + let par = Array.of_list ([""] @ Xunicode.utf8_chars_of_utf8_string paragraph @ [""]) in | |
627 | + PreSentences.find_sentence par tokens chart last; | |
628 | + PreSentences.find_quoted_sentences par tokens chart last; | |
629 | + PreSentences.find_query par tokens chart last; | |
630 | + PreSentences.extract_sentences tokens chart last | |
631 | + | |
624 | 632 | let parse_text = function |
625 | 633 | RawText query -> |
626 | 634 | let tokens = ExtArray.make 100 empty_token in |
627 | 635 | let _ = ExtArray.add tokens empty_token in (* id=0 jest zarezerwowane dla pro; FIXME: czy to jest jeszcze aktualne? *) |
628 | 636 | let paragraphs = Xlist.map (Xstring.split "\n" query) (fun paragraph -> |
629 | 637 | let paths,_ = parse paragraph in |
630 | - let paths = make_ids tokens paths in | |
631 | - let paths,last = PreSentences.prepare_indexes paths in | |
632 | - let chart = make_chart paths last in | |
633 | - let par = Array.of_list ([""] @ Xunicode.utf8_chars_of_utf8_string paragraph @ [""]) in | |
634 | - PreSentences.find_sentence par tokens chart last; | |
635 | - PreSentences.find_quoted_sentences par tokens chart last; | |
636 | - PreSentences.find_query par tokens chart last; | |
637 | - let sentences = PreSentences.extract_sentences tokens chart last in | |
638 | + let sentences = split_into_sentences tokens paths in | |
638 | 639 | AltParagraph[Raw,RawParagraph paragraph; Struct,StructParagraph sentences]) in |
639 | 640 | AltText[Raw,RawText query; Struct,StructText(List.rev paragraphs, tokens)] |
640 | -(* | AltText[Raw,RawText query;CONLL,StructText([ | |
641 | - StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, StructSentence(_,paths,last)]} as p]],_)] -> | |
642 | - let (cpaths,clast), next_id = parse_conll (paths,last) first_id in | |
643 | - let (paths,last), next_id = parse query next_id in | |
644 | - let sentences, next_id = split_into_sentences query paths last next_id in | |
641 | + | AltText[Raw,RawText query;CONLL,StructText([ | |
642 | + StructParagraph[{psentence = AltSentence[Raw, RawSentence text; CONLL, DepSentence(_,dep_paths)]} as p]],tokens)] -> | |
643 | + parse_conll tokens paths; | |
644 | + let paths,_ = parse query in | |
645 | + let sentences = split_into_sentences tokens paths in | |
645 | 646 | let conll = StructParagraph[{p with psentence = AltSentence[Raw, RawSentence text; |
646 | - Mate, StructSentence("M",cpaths,clast); CONLL, StructSentence("C",cpaths,clast)]}] in | |
647 | - (* print_endline "parse_text 6"; *) | |
647 | + Mate, StructSentence("M",dep_paths); CONLL, StructSentence("C",dep_paths)]}] in | |
648 | 648 | AltText[Raw,RawText query; Struct, StructText([ |
649 | - AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]],next_id)]*) | |
649 | + AltParagraph[Raw,RawParagraph query; ENIAM, StructParagraph sentences; CONLL, conll]],tokens)] | |
650 | 650 | | _ -> failwith "parse_text: not implemented" |
651 | 651 | |
652 | 652 | let rec main_loop in_chan out_chan = |
... | ... |