Commit a5c6bd60568948f147fa18d4b06ced8431f08dce
1 parent
60c228bb
refactoring
Showing
71 changed files
with
31203 additions
and
169 deletions
Too many changes to show.
To preserve performance only 21 of 71 files are displayed.
disambiguation/ENIAM_EdgeScore.ml deleted
1 | -open Xstd | |
2 | -open ENIAM_LCGtypes | |
3 | -open Yojson | |
4 | - | |
5 | -module MST_Model : sig | |
6 | - type mst_model = { | |
7 | - typeAlphabet: int StringMap.t; | |
8 | - dataAlphabet: int StringMap.t; | |
9 | - parameters: float array} | |
10 | - val read_model: string -> mst_model | |
11 | - val empty: mst_model | |
12 | - exception MalformedModelJson | |
13 | -end | |
14 | -= struct | |
15 | - type mst_model = { | |
16 | - typeAlphabet: int StringMap.t; | |
17 | - dataAlphabet: int StringMap.t; | |
18 | - parameters: float array} | |
19 | - | |
20 | - let empty = {typeAlphabet = StringMap.empty; | |
21 | - dataAlphabet = StringMap.empty; | |
22 | - parameters = Array.make 0 0.0} | |
23 | - exception MalformedModelJson | |
24 | - | |
25 | - let construct_data_alphabet keys = | |
26 | - let counter = ref 0 in | |
27 | - let map = ref StringMap.empty in | |
28 | - let length = Array.length keys in | |
29 | - for i = 0 to length -1 do | |
30 | - map := StringMap.add !map keys.(i) !counter; | |
31 | - counter := !counter + 1; | |
32 | - done; | |
33 | - !map | |
34 | - | |
35 | - let construct_type_alphabet = construct_data_alphabet | |
36 | - | |
37 | - let read_model fname = | |
38 | - let data = Basic.from_file fname in | |
39 | - let open Yojson.Basic.Util in | |
40 | - let unwrapList = function | |
41 | - `List l -> l | |
42 | - | _ -> raise MalformedModelJson in | |
43 | - let dataA = data |> member "dataAlphabet" |> unwrapList |> filter_string in | |
44 | - let typeA = data |> member "typeAlphabet" |> unwrapList |> filter_string in | |
45 | - let params = data |> member "parameters" |> unwrapList |> filter_float in | |
46 | - {typeAlphabet = Array.of_list typeA |> construct_type_alphabet; | |
47 | - dataAlphabet = Array.of_list dataA |> construct_data_alphabet; | |
48 | - parameters = Array.of_list params} | |
49 | -end | |
50 | -open MST_Model | |
51 | - | |
52 | -let model = ref MST_Model.empty | |
53 | - | |
54 | -let initialize () = | |
55 | - model := MST_Model.read_model "dep.model.json"; | |
56 | - () | |
57 | - | |
58 | -exception UnsupportedLinearTerm of linear_term | |
59 | -exception EmptyVariant | |
60 | - | |
61 | -let add_feature str (fv: IntSet.t) = | |
62 | - if StringMap.mem !model.dataAlphabet str then | |
63 | - IntSet.add fv (StringMap.find !model.dataAlphabet str) | |
64 | - else | |
65 | - fv | |
66 | - | |
67 | -let score_fv (fv:IntSet.t) = | |
68 | - IntSet.fold fv 0.0 (fun score i -> score +. !model.parameters.(i)) | |
69 | - | |
70 | -let apply_features features fv = | |
71 | - List.fold_left (|>) fv features | |
72 | - | |
73 | -let add_linear_features f_type (obs: string array) first second distStr fv = | |
74 | - fv | |
75 | - | |
76 | -let add_two_obs_features prefix item1F1 item1F2 item2F1 item2F2 distStr fv = | |
77 | - let add_diststr str = [str; str^"*"^distStr] in | |
78 | - let flist = List.map ((^) prefix)[ | |
79 | - "2FF1="^item1F1; | |
80 | - "2FF1="^item1F1^" "^item1F2; | |
81 | - "2FF1="^item1F1^" "^item1F2^" "^item2F2; | |
82 | - "2FF1="^item1F1^" "^item1F2^" "^item2F2^" "^item2F1; | |
83 | - "2FF2="^item1F1^" "^item2F1; | |
84 | - "2FF3="^item1F1^" "^item2F2; | |
85 | - "2FF4="^item1F2^" "^item2F1^" "^item2F2; | |
86 | - "2FF5="^item1F2^" "^item2F2; | |
87 | - "2FF6="^item2F1^" "^item2F2; | |
88 | - "2FF7="^item1F2; | |
89 | - "2FF8="^item2F1; | |
90 | - "2FF9="^item2F2; | |
91 | - ] in | |
92 | - let funs = List.map (add_feature) (List.flatten (List.map add_diststr flist)) in | |
93 | - apply_features funs fv | |
94 | - | |
95 | -type disamb_info = { | |
96 | - tree: linear_term array | |
97 | -} | |
98 | - | |
99 | -let score_edge (data: disamb_info) (parent: node) (child: node) = | |
100 | - let fv = IntSet.empty in | |
101 | - let fv = add_two_obs_features "HC" | |
102 | - parent.orth parent.pos child.orth child.pos "" fv in | |
103 | - score_fv fv | |
104 | - | |
105 | -let rec fill_dep_edges_array | |
106 | - (data: disamb_info) parent (scores: float IntMap.t) = | |
107 | - function | |
108 | - Dot -> scores | |
109 | - | Ref i -> (match data.tree.(i) with | |
110 | - Node child -> IntMap.add scores i (score_edge data parent child) | |
111 | - | _ as x -> raise (UnsupportedLinearTerm x)) | |
112 | - | Tuple l -> List.fold_left (fill_dep_edges_array data parent) scores l | |
113 | - | Variant (_, l) -> List.fold_left | |
114 | - (fill_dep_edges_array data parent) | |
115 | - scores (List.map snd l) | |
116 | - | _ as x -> raise (UnsupportedLinearTerm x) | |
117 | - | |
118 | -let rec disambiguate_args edge_scores = | |
119 | - function | |
120 | - Dot -> Dot, 0.0 | |
121 | - | Ref i -> Ref i, IntMap.find edge_scores i | |
122 | - | Tuple l -> | |
123 | - let (terms, scores) = | |
124 | - List.map (disambiguate_args edge_scores) l |> List.split in | |
125 | - let num = List.length scores |> float_of_int in | |
126 | - Tuple terms, (List.fold_left (+.) 0.0 scores) /. num | |
127 | - | Variant (lab, l) -> | |
128 | - let (lbs, terms) = List.split l in | |
129 | - let new_terms_scores = List.map (disambiguate_args edge_scores) terms in | |
130 | - let select_best (term, score) (new_term, new_score) = | |
131 | - if new_score > score then | |
132 | - new_term, new_score | |
133 | - else | |
134 | - term, score in | |
135 | - List.fold_left select_best (List.hd new_terms_scores) (List.tl new_terms_scores) | |
136 | - | _ as x -> raise (UnsupportedLinearTerm x) | |
137 | - | |
138 | -(* dezambiguacja argumentów pojedynczego wierzchołka algorytmem zachłannym *) | |
139 | -let disambiguate_node (data: disamb_info) parentI = | |
140 | - let parent = match data.tree.(parentI) with | |
141 | - Node node -> node | |
142 | - | _ as x -> raise (UnsupportedLinearTerm x) in | |
143 | - let edge_scores = fill_dep_edges_array | |
144 | - data parent IntMap.empty (parent.args) in | |
145 | - let (new_term, _) = disambiguate_args edge_scores (parent.args) in | |
146 | - Node {parent with args = new_term} | |
147 | - | |
148 | -let disambiguate_tree tree = | |
149 | - let tree2 = Array.copy tree in | |
150 | - let data : disamb_info = {tree = tree} in | |
151 | - let update parentI _ = | |
152 | - (let new_term = disambiguate_node data parentI in | |
153 | - tree2.(parentI) <- new_term;) in | |
154 | - Array.iteri update tree; tree2 |
disambiguation/ENIAMmstDisambiguation.ml
0 → 100644
1 | +open Xstd | |
2 | +open ENIAM_LCGtypes | |
3 | +open ENIAMmstModel | |
4 | +open ENIAMmstFeatures | |
5 | + | |
6 | +let initialize () = | |
7 | + MST_Model.initialize "dep.model.json"; | |
8 | + () | |
9 | + | |
10 | +exception UnsupportedLinearTerm of linear_term | |
11 | +exception EmptyVariant | |
12 | + | |
13 | +let rec fill_dep_edges_array | |
14 | + (data: disamb_info) parent (scores: float IntMap.t) = | |
15 | + function | |
16 | + Dot -> scores | |
17 | + | Ref i -> IntMap.add scores i (score_edge data parent data.tree.(i)) | |
18 | + | Tuple l -> List.fold_left (fill_dep_edges_array data parent) scores l | |
19 | + | Variant (_, l) -> List.fold_left | |
20 | + (fill_dep_edges_array data parent) | |
21 | + scores (List.map snd l) | |
22 | + | _ as x -> raise (UnsupportedLinearTerm x) | |
23 | + | |
24 | +let rec disambiguate_args edge_scores = | |
25 | + function | |
26 | + Dot -> Dot, 0.0 | |
27 | + | Ref i -> Ref i, IntMap.find edge_scores i | |
28 | + | Tuple l -> | |
29 | + let (terms, scores) = | |
30 | + List.map (disambiguate_args edge_scores) l |> List.split in | |
31 | + let num = List.length scores |> float_of_int in | |
32 | + Tuple terms, (List.fold_left (+.) 0.0 scores) /. num | |
33 | + | Variant (lab, l) -> | |
34 | + let (lbs, terms) = List.split l in | |
35 | + let new_terms_scores = List.map (disambiguate_args edge_scores) terms in | |
36 | + let select_best (term, score) (new_term, new_score) = | |
37 | + if new_score >= score then | |
38 | + new_term, new_score | |
39 | + else | |
40 | + term, score in | |
41 | + List.fold_left select_best (List.hd new_terms_scores) (List.tl new_terms_scores) | |
42 | + | _ as x -> raise (UnsupportedLinearTerm x) | |
43 | + | |
44 | +(* dezambiguacja argumentów pojedynczego wierzchołka algorytmem zachłannym *) | |
45 | +let disambiguate_node (data: disamb_info) parent = | |
46 | + let edge_scores = fill_dep_edges_array | |
47 | + data parent IntMap.empty (parent.args) in | |
48 | + let (new_term, _) = disambiguate_args edge_scores (parent.args) in | |
49 | + {parent with args = new_term} | |
50 | + | |
51 | +let disambiguate_tree (tree: linear_term array) = | |
52 | + let extract_node = (function | |
53 | + Node node -> node | |
54 | + | _ as x -> UnsupportedLinearTerm x |> raise) in | |
55 | + let data : disamb_info = {tree = Array.map extract_node tree} in | |
56 | + let disambiguate term = Node (extract_node term |> disambiguate_node data) in | |
57 | + Array.map disambiguate tree | |
... | ... |
disambiguation/ENIAMmstFeatures.ml
0 → 100644
1 | +open ENIAMmstModel | |
2 | +open ENIAM_LCGtypes | |
3 | + | |
4 | +type disamb_info = { | |
5 | + tree: node array | |
6 | +} | |
7 | + | |
8 | +let apply_features features fv = | |
9 | + List.fold_left (|>) fv features | |
10 | + | |
11 | +let add_linear_features f_type (obs: string array) first second distStr fv = | |
12 | + fv | |
13 | + | |
14 | +let add_two_obs_features prefix item1F1 item1F2 item2F1 item2F2 distStr fv = | |
15 | + let add_diststr str = [str; str^"*"^distStr] in | |
16 | + let flist = List.map ((^) prefix)[ | |
17 | + "2FF1="^item1F1; | |
18 | + "2FF1="^item1F1^" "^item1F2; | |
19 | + "2FF1="^item1F1^" "^item1F2^" "^item2F2; | |
20 | + "2FF1="^item1F1^" "^item1F2^" "^item2F2^" "^item2F1; | |
21 | + "2FF2="^item1F1^" "^item2F1; | |
22 | + "2FF3="^item1F1^" "^item2F2; | |
23 | + "2FF4="^item1F2^" "^item2F1^" "^item2F2; | |
24 | + "2FF5="^item1F2^" "^item2F2; | |
25 | + "2FF6="^item2F1^" "^item2F2; | |
26 | + "2FF7="^item1F2; | |
27 | + "2FF8="^item2F1; | |
28 | + "2FF9="^item2F2; | |
29 | + ] in | |
30 | + let funs = List.map (MST_Model.add_feature) (List.flatten (List.map add_diststr flist)) in | |
31 | + apply_features funs fv | |
32 | + | |
33 | +let score_edge (data: disamb_info) (parent: node) (child: node) = | |
34 | + let fv = MST_Model.empty_fv in | |
35 | + let fv = add_two_obs_features "HC" | |
36 | + parent.orth parent.pos child.orth child.pos "" fv in | |
37 | + MST_Model.score_fv fv | |
... | ... |
disambiguation/ENIAMmstModel.ml
0 → 100644
1 | +open Yojson | |
2 | +open Xstd | |
3 | + | |
4 | + | |
5 | +module MST_Model : sig | |
6 | + type mst_model | |
7 | + type feature_vector_t | |
8 | + exception MalformedModelJson | |
9 | + | |
10 | + val read_model: string -> mst_model | |
11 | + val initialize: string -> unit | |
12 | + val add_feature: string -> feature_vector_t -> feature_vector_t | |
13 | + val score_fv: feature_vector_t -> float | |
14 | + val empty_fv: feature_vector_t | |
15 | +end | |
16 | += struct | |
17 | + type feature_vector_t = IntSet.t | |
18 | + | |
19 | + type mst_model = { | |
20 | + typeAlphabet: int StringMap.t; | |
21 | + dataAlphabet: int StringMap.t; | |
22 | + parameters: float array} | |
23 | + | |
24 | + | |
25 | + exception MalformedModelJson | |
26 | + | |
27 | + let model = ref {typeAlphabet = StringMap.empty; | |
28 | + dataAlphabet = StringMap.empty; | |
29 | + parameters = Array.make 0 0.0} | |
30 | + | |
31 | + let empty_fv = IntSet.empty | |
32 | + | |
33 | + let add_feature str (fv: feature_vector_t) = | |
34 | + if StringMap.mem !model.dataAlphabet str then | |
35 | + IntSet.add fv (StringMap.find !model.dataAlphabet str) | |
36 | + else | |
37 | + fv | |
38 | + | |
39 | + let score_fv (fv: feature_vector_t) = | |
40 | + IntSet.fold fv 0.0 (fun score i -> score +. !model.parameters.(i)) | |
41 | + | |
42 | + let construct_data_alphabet keys = | |
43 | + let counter = ref 0 in | |
44 | + let map = ref StringMap.empty in | |
45 | + let length = Array.length keys in | |
46 | + for i = 0 to length -1 do | |
47 | + map := StringMap.add !map keys.(i) !counter; | |
48 | + counter := !counter + 1; | |
49 | + done; | |
50 | + !map | |
51 | + | |
52 | + let construct_type_alphabet = construct_data_alphabet | |
53 | + | |
54 | + let read_model fname = | |
55 | + let data = Basic.from_file fname in | |
56 | + try | |
57 | + let open Yojson.Basic.Util in | |
58 | + let unwrapList = function | |
59 | + `List l -> l | |
60 | + | _ -> raise MalformedModelJson in | |
61 | + let dataA = data |> member "dataAlphabet" |> unwrapList |> filter_string in | |
62 | + let typeA = data |> member "typeAlphabet" |> unwrapList |> filter_string in | |
63 | + let params = data |> member "parameters" |> unwrapList |> filter_float in | |
64 | + {typeAlphabet = Array.of_list typeA |> construct_type_alphabet; | |
65 | + dataAlphabet = Array.of_list dataA |> construct_data_alphabet; | |
66 | + parameters = Array.of_list params} | |
67 | + with | |
68 | + _ -> raise MalformedModelJson | |
69 | + | |
70 | + let initialize fname = | |
71 | + model := read_model fname; | |
72 | + () | |
73 | +end | |
... | ... |
disambiguation/makefile
... | ... | @@ -6,27 +6,27 @@ OCAMLFLAGS=$(INCLUDES) -g |
6 | 6 | OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-lcg-parser.cmxa yojson.cmx |
7 | 7 | INSTALLDIR=`ocamlc -where`/eniam |
8 | 8 | |
9 | -SOURCES= ENIAM_EdgeScore.ml | |
9 | +SOURCES=ENIAMmstModel.ml ENIAMmstFeatures.ml ENIAMmstDisambiguation.ml | |
10 | 10 | |
11 | -all: eniam-edge-score.cma eniam-edge-score.cmxa | |
11 | +all: eniam-mst-disambiguation.cma eniam-mst-disambiguation.cmxa | |
12 | 12 | |
13 | 13 | install: all |
14 | 14 | mkdir -p $(INSTALLDIR) |
15 | - cp eniam-edge-score.cmxa eniam-edge-score.a eniam-edge-score.cma $(INSTALLDIR) | |
16 | - cp ENIAM_EdgeScore.cmi $(INSTALLDIR) | |
17 | - cp ENIAM_EdgeScore.cmx $(INSTALLDIR) | |
15 | + cp eniam-mst-disambiguation.cmxa eniam-mst-disambiguation.a eniam-mst-disambiguation.cma $(INSTALLDIR) | |
16 | + cp ENIAMmstDisambiguation.cmi ENIAMmstModel.cmi ENIAMmstFeatures.cmi $(INSTALLDIR) | |
17 | + cp ENIAMmstDisambiguation.cmx ENIAMmstModel.cmx ENIAMmstFeatures.cmx $(INSTALLDIR) | |
18 | 18 | |
19 | 19 | install-local: all |
20 | 20 | mkdir -p $(INSTALLDIR) |
21 | - cp eniam-edge-score.cmxa eniam-edge-score.a eniam-edge-score.cma $(INSTALLDIR) | |
22 | - cp ENIAM_EdgeScore.cmi $(INSTALLDIR) | |
23 | - cp ENIAM_EdgeScore.cmx $(INSTALLDIR) | |
21 | + cp eniam-mst-disambiguation.cmxa eniam-mst-disambiguation.a eniam-mst-disambiguation.cma $(INSTALLDIR) | |
22 | + cp ENIAMmstDisambiguation.cmi ENIAMmstModel.cmi ENIAMmstFeatures.cmi $(INSTALLDIR) | |
23 | + cp ENIAMmstDisambiguation.cmx ENIAMmstModel.cmx ENIAMmstFeatures.cmx $(INSTALLDIR) | |
24 | 24 | |
25 | -eniam-edge-score.cma: $(SOURCES) | |
26 | - ocamlc -linkall -a -o eniam-edge-score.cma $(OCAMLFLAGS) $^ | |
25 | +eniam-mst-disambiguation.cma: $(SOURCES) | |
26 | + ocamlc -linkall -a -o eniam-mst-disambiguation.cma $(OCAMLFLAGS) $^ | |
27 | 27 | |
28 | -eniam-edge-score.cmxa: $(SOURCES) | |
29 | - ocamlopt -linkall -a -o eniam-edge-score.cmxa $(INCLUDES) $^ | |
28 | +eniam-mst-disambiguation.cmxa: $(SOURCES) | |
29 | + ocamlopt -linkall -a -o eniam-mst-disambiguation.cmxa $(INCLUDES) $^ | |
30 | 30 | |
31 | 31 | test: test.ml |
32 | 32 | mkdir -p results |
... | ... |
disambiguation/mstparser/.gitignore
0 → 100644
disambiguation/mstparser/ALT_README
0 → 100644
1 | +Introduction | |
2 | +============ | |
3 | + | |
4 | +This file contains the configuration and build instructions for using | |
5 | +Apache Ant (http://ant.apache.org) to build MSTParser, and for setting | |
6 | +up your environment to use the scripts in the mstparser/bin | |
7 | +directory. All the instructions in the original README file should | |
8 | +continue to work as before -- this document describes an optional way | |
9 | +of compiling and using MSTParser with a few more bells and whistles. | |
10 | + | |
11 | + | |
12 | +Configuring your environment variables | |
13 | +====================================== | |
14 | + | |
15 | +The easiest thing to do is to set the environment variables JAVA_HOME | |
16 | +and MSTPARSER_DIR to the relevant locations on your system. Set JAVA_HOME | |
17 | +to match the top level directory containing the Java installation you | |
18 | +want to use. Note that version 1.5 of the Java 2 SDK is required. | |
19 | + | |
20 | +For example, on Windows: | |
21 | + | |
22 | +C:\> set JAVA_HOME=C:\jdk1.5.0_04 | |
23 | + | |
24 | +or on Unix: | |
25 | + | |
26 | +% setenv JAVA_HOME /usr/local/java | |
27 | + (csh) | |
28 | +> JAVA_HOME=/usr/java; export JAVA_HOME | |
29 | + (ksh, bash) | |
30 | + | |
31 | +On Windows, to get these settings to persist, it's actually easiest to | |
32 | +set your environment variables through the System Properties from the | |
33 | +Control Panel. For example, under WinXP, go to Control Panel, click on | |
34 | +System Properties, choose the Advanced tab, click on Environment | |
35 | +Variables, and add your settings in the User variables area. | |
36 | + | |
37 | +Next, likewise set MSTPARSER_DIR to be the top level directory where you | |
38 | +unzipped the download. In Unix, type 'pwd' in the directory where | |
39 | +this file is and use the path given to you by the shell as | |
40 | +MSTPARSER_DIR. You can set this in the same manner as for JAVA_HOME | |
41 | +above. | |
42 | + | |
43 | +Next, add the directory MSTPARSER_DIR/bin to your path. For example, you | |
44 | +can set the path in your .bashrc file as follows: | |
45 | + | |
46 | +export PATH=$PATH:$MSTPARSER_DIR/bin | |
47 | + | |
48 | +Once you have taken care of these three things, you should be able to | |
49 | +build and use MSTParser. | |
50 | + | |
51 | + | |
52 | +Building the system | |
53 | +=================== | |
54 | + | |
55 | +The MSTParser build system is based on Apache Ant. | |
56 | +Ant is a little but very handy tool that uses a build file written in | |
57 | +XML (build.xml) as building instructions. | |
58 | + | |
59 | +To build the code, first make sure your current working | |
60 | +directory is where the build.xml file is located. Then type: | |
61 | + | |
62 | + sh build.sh (Unix) | |
63 | + | |
64 | +If everything is right and all the required packages are visible, this | |
65 | +action will generate a file called mstparser.jar in the ./output | |
66 | +directory, and Java class files in ./output/classes. | |
67 | + | |
68 | + | |
69 | +Build targets | |
70 | +============= | |
71 | + | |
72 | +These are the meaningful targets for the main build file: | |
73 | + | |
74 | + package --> generates the openccg.jar file (default) | |
75 | + compile --> compiles the source code | |
76 | + javadoc --> generates the API documentation | |
77 | + clean --> cleans up the compilation directory | |
78 | + | |
79 | +There are also build files in each sample grammar directory. | |
80 | + | |
81 | +To learn the details of what each target does, read the build.xml file. | |
82 | +It is quite understandable. | |
83 | + | |
84 | + | |
85 | +Trying it out | |
86 | +============= | |
87 | + | |
88 | +If you've managed to configure and build the system, you should be | |
89 | +able to run mstparser as described in the README, but without some of | |
90 | +the extra classpath and memory options, and you should be able to do | |
91 | +so from anywhere on your directory system. | |
92 | + | |
93 | +If you have trouble starting up any of the scripts, make sure you have set | |
94 | +the environment variables properly, and that the scripts (located in | |
95 | +mstparser/bin) call the right shell environment (top-line of the | |
96 | +script; to solve the problem, either comment out this line or correct | |
97 | +the path). | |
98 | + | |
99 | +Here's a brief description of some of the scripts: | |
100 | + | |
101 | +1. The shell script mst_parse.sh is just a simple wrapper that allows | |
102 | +you to do this: | |
103 | + | |
104 | +> mst_parse.sh \ | |
105 | + train train-file:data/train.ulab model-name:dep.model \ | |
106 | + test test-file:data/test.ulab output-file:out.txt \ | |
107 | + eval gold-file:data/test.ulab | |
108 | + | |
109 | +instead of this (as described in the readme): | |
110 | + | |
111 | +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \ | |
112 | + train train-file:data/train.ulab model-name:dep.model \ | |
113 | + test test-file:data/test.ulab output-file:out.txt \ | |
114 | + eval gold-file:data/test.ulab | |
115 | + | |
116 | + | |
117 | +[NOTE: actually, if you want to run MSTParser the latter way and you | |
118 | +build MSTParser using Ant (via build.sh), then your Java class files | |
119 | +will be contained in ./output/classes rather than ./mstparser, so the | |
120 | +classpath would need to be "./output/classes:lib/trove.jar". The | |
121 | +mst_parse.sh script takes care of this and sets the classpath | |
122 | +appropriately.] | |
123 | + | |
124 | +2. The shell script mst_score.sh is just an easy way to call upon the | |
125 | +main method of the mstparser.DependencyEvaluator class. Call it as | |
126 | +such: | |
127 | + | |
128 | +> mst_score.sh <gold_standard_dependency_file> <parser_output_dependency_file> <format> | |
129 | + | |
130 | +where <format> is either MST or CONLL (default CONLL). Here's a concrete example: | |
131 | + | |
132 | +> mst_score.sh data/portuguese/floresta_test.conll testbed/my_floresta_parses.conll CONLL | |
133 | + | |
134 | + | |
135 | +See the following site for more information on the CONLL format (as | |
136 | +well as other dependency parsers and info, etc): | |
137 | + | |
138 | +http://nextens.uvt.nl/~conll/ | |
139 | + | |
140 | +3. The Python program mst_experiment.py is more involved wrapper that | |
141 | +allows one to easily do a randomized ten-fold while improving | |
142 | +performance on development data. It manages the various files that | |
143 | +MSTParser produces and keeps them tightly contained into a single | |
144 | +output directory. It also has hooks for using a part-of-speech tagger | |
145 | +(see pos_tag.py, which calls on the OpenNLP POS Tagger and assumes it | |
146 | +is installed). | |
147 | + | |
148 | +You can see the options by running: | |
149 | + | |
150 | +> mst_experiment.py --help | |
151 | + | |
152 | +Here's an example to do an 8-fold cross-validation using the | |
153 | +non-projective algorithm: | |
154 | + | |
155 | +> mst_experiment.py -f 8 -o experiment1 -d non-proj data/train.lab | |
156 | + | |
157 | +If you find it useful, that's great -- but before diving into it, you | |
158 | +should be aware that you may have to hack the Python a bit for your | |
159 | +own needs. It is actually put together from a program previously used to | |
160 | +interface with the Bikel parser, so there may be some extraneous | |
161 | +options and code hanging around. | |
162 | + | |
163 | +Note: Do NOT ask Ryan for any help with this Python script. Direct any | |
164 | +questions to Jason Baldridge instead (see email below), and even then | |
165 | +don't count on a rapid response. | |
166 | + | |
167 | +4. The Python script pos_tag.py calls on an unreleased version of the | |
168 | +OpenNLP tagger, and is left here only as an example that might help | |
169 | +you develop a similar script for other taggers. If you want the | |
170 | +OpenNLP tagger, let Jason know and he will consider packaging it up | |
171 | +with the parser more cleanly. | |
172 | + | |
173 | +5. The Python script mst2conll.py can be used to convert your existing | |
174 | +MST format files to CONLL format. The script conll2mst.py converts | |
175 | +CONLL formatted files into MST format. | |
176 | + | |
177 | +6. The Python script create_baseline.py creates a right or left | |
178 | +linking baseline. Default is to create left linking -- use the option | |
179 | +-r for right linking. Currently, it uses MST format for input and | |
180 | +output, so you'll need to do some conversion if you have dependency | |
181 | +files in CONLL format. (This script was adapted from one written by | |
182 | +Ben Wing.) | |
183 | + | |
184 | + | |
185 | + | |
186 | +Bug Reports | |
187 | +=========== | |
188 | + | |
189 | +See the original README for bug reporting for the system | |
190 | +itself. Report problems with the Ant build setup, the Python scripts, | |
191 | +or these instructions to Jason Baldridge (jasonbaldridge@gmail.com). | |
192 | + | |
193 | +Also note: if you use Windows and are having problems, you are on your | |
194 | +own. | |
195 | + | |
196 | + | |
197 | +Special Note | |
198 | +============ | |
199 | + | |
200 | +Parts of these instructions and some of the directory structure are | |
201 | +based on the OpenCCG (openccg.sf.net) project and the JDOM project | |
202 | +(www.jdom.org). | |
203 | + | |
204 | + | |
... | ... |
disambiguation/mstparser/ALT_TESTBED
0 → 100644
1 | +=NOTE: This file is for developers -- don't let it confuse you if are | |
2 | +just giving MSTParser a spin. Check out README and ALT_README instead. | |
3 | + | |
4 | +To test that changes to the code have not messed up previous results, | |
5 | +do the following. | |
6 | + | |
7 | + | |
8 | +--------------------------------------------------------------------- | |
9 | +1. Parse English in MST format: | |
10 | + | |
11 | +Run the parser as such: | |
12 | + | |
13 | +> mst_parse.sh format:MST train train-file:data/train.lab model-name:testbed/model test test-file:data/test.lab output-file:testbed/my_english_parses.mst eval gold-file:data/test.lab | |
14 | + | |
15 | +Score the results: | |
16 | + | |
17 | +> mst_score.sh data/test.lab testbed/my_english_parses.mst MST > testbed/my_english_score.txt | |
18 | + | |
19 | +Then compare "english_parses.mst" to "my_english_parses.mst" and "english_score.txt" to | |
20 | +"my_english_score.txt" -- they should be the same. (diff them) | |
21 | + | |
22 | + | |
23 | +--------------------------------------------------------------------- | |
24 | +2. Parse Portuguese in CONLL format: | |
25 | + | |
26 | +> mst_parse.sh format:CONLL train train-file:data/portuguese/floresta_train.conll model-name:testbed/model test test-file:data/portuguese/floresta_test.conll output-file:testbed/my_floresta_parses.conll eval gold-file:data/portuguese/floresta_test.conll | |
27 | + | |
28 | +Score the results: | |
29 | + | |
30 | +> mst_score.sh data/portuguese/floresta_test.conll testbed/my_floresta_parses.conll CONLL > testbed/my_floresta_score.txt | |
31 | + | |
32 | +Compare as with English on the obvious file names. | |
33 | + | |
34 | + | |
35 | +--------------------------------------------------------------------- | |
36 | +3. Parse English with second order model. | |
37 | + | |
38 | +Run the parser as such: | |
39 | + | |
40 | +> mst_parse.sh format:MST train train-file:data/train.lab model-name:testbed/model test test-file:data/test.lab output-file:testbed/my_english_parses_order2.mst eval gold-file:data/test.lab order:2 | |
41 | + | |
42 | +Score the results: | |
43 | + | |
44 | +> mst_score.sh data/test.lab testbed/my_english_parses_order2.mst MST > testbed/my_english_score_order2.txt | |
45 | + | |
46 | +Compare with english_score_order2.txt. | |
47 | + | |
48 | +--------------------------------------------------------------------- | |
49 | +4. Parse Portuguese in CONLL format with second order model: | |
50 | + | |
51 | +> mst_parse.sh train train-file:data/portuguese/floresta_train.conll test test-file:data/portuguese/floresta_test.conll output-file:out.txt eval gold-file:data/portuguese/floresta_test.conll order:2 decode-type:non-proj | |
52 | + | |
... | ... |
disambiguation/mstparser/CHANGES
0 → 100644
1 | +----------------------------------------------------------------------- | |
2 | +v0.5.1 | |
3 | + | |
4 | +- Issue 10 - loadModel() method from DepdendencyParser should also be | |
5 | + able to receive an InputStream | |
6 | +- Issue 9 - Add a method to DependencyParser which return the Parse | |
7 | + Trees | |
8 | +- Issue 7 - Update source folder at ant script | |
9 | +- Issue 6 - Change visibility of some methods and attributes to | |
10 | + facilitate wrapping | |
11 | +- Issue 2 - Convert project to maven | |
12 | + | |
13 | +----------------------------------------------------------------------- | |
14 | +v0.5.0 | |
15 | + | |
16 | + UNKNOWN | |
17 | + | |
18 | +----------------------------------------------------------------------- | |
19 | +v0.4.3b | |
20 | + | |
21 | +- Fixed bug: DependencyInstance serialization was not handling the | |
22 | + feats. This caused errors when using the non-projective decoder with | |
23 | + second order. (JMB 4-APR-07) | |
24 | + | |
25 | +----------------------------------------------------------------------- | |
26 | +v0.4.3 | |
27 | + | |
28 | +- Forest files are created in the tmp directory. Without this, two | |
29 | + instances of MSTParser being run on the same data set would | |
30 | + overwrite each other's feature forest files. Also, the forest files | |
31 | + created in tmp are deleted when the Java VM exits. (JMB, 21-JAN-07). | |
32 | + | |
33 | +- Separated out the standard sentential parsing features from extra | |
34 | + features used for discourse parsing. (JMB, 23-MAR-07) | |
35 | + | |
36 | +- Created ParserOptions so that it is easier to pass various options | |
37 | + between the parser and the pipes. (JMB, 23-MAR-07) | |
38 | + | |
39 | +- Fixed bug in serialization of DependencyInstances -- lemmas were not | |
40 | + being written out, and this caused the 2nd order stuff to | |
41 | + crash. (JMB 23-MAR-07) | |
42 | + | |
43 | + | |
44 | +----------------------------------------------------------------------- | |
45 | +v0.4.2 | |
46 | + | |
47 | +- Results have improved slightly over previous testbed results. This | |
48 | + may be due to the fact that FeatureVector.dotProduct would have got | |
49 | + -1 return values on keys not held in the TIntDoubleHashMap for the | |
50 | + second vector in the previous version of Trove. Now that Trove | |
51 | + returns 0, this is actually the right behavior in this case. Another | |
52 | + possible explanation is that there is some minor change in the | |
53 | + features which are generated. Since the output has changed so | |
54 | + little, and for the better, I'll leave it at that for now. The | |
55 | + testbed results and output have been updated to reflect the current | |
56 | + version. (JMB, 17-JAN-07) | |
57 | + | |
58 | +- Uncommented a line in DependencyPipe that removed some features from | |
59 | + the parsing models in the previous release. (Need to come up with a | |
60 | + better way of defining different pipes!) (JMB, 17-JAN-07) | |
61 | + | |
62 | +- Changed the FeatureVector implementation to be a TLinkedList of | |
63 | + Feature objects, with two optional sub-FeatureVectors contained | |
64 | + within. This supports fast concatenation of two FeatureVectors since | |
65 | + it is no longer necessary to copy entire lists. Also, rather than | |
66 | + explicitly negating features for the getDistVector() method, a | |
67 | + boolean value is set that can optionally indicate the second | |
68 | + sub-FeatureVector as negated. The logic of the other methods then | |
69 | + preserves the negation (and negation with negation). Again, this | |
70 | + means we don't have to make copies for this operation. These changes | |
72 | + led sped up training by a factor of 2 to 4 (depending on the | |
72 | + number of features used in the parsing model) and parsing by up to | |
73 | + 1.5 times. (JMB, 17-JAN-07) | |
74 | + | |
75 | +- Updated to Trove v1.1b5. Changed default return value of | |
76 | + TObjectIntHashMap to be -1 rather than 0, so it is important to use | |
77 | + the included trove.jar rather than downloading and using one from | |
78 | + the Trove project. (Note: I tried to update to v2.0a2, but the test | |
79 | + suites broke with that version. Attempts to sort out the problem | |
80 | + were unsuccessful, so V1.1b5 will just have to do for now.) (JMB, | |
81 | + 16-JAN-07) | |
82 | + | |
83 | +- Removed addIfNotPresent boolean from lookupIndex in Alphabet since | |
84 | + it isn't used in MSTParser and it incurs an extra method call and | |
85 | + boolean check on a very common method. (JMB, 16-JAN-07) | |
86 | + | |
87 | +- Added support for relational features, which hold between two | |
88 | + utterances. These features are defined as an NxN matrix (N=number of | |
89 | + parsing units) below the main CoNLL format declarations. This is | |
90 | + mainly introduced for discourse parsing to allow for features like | |
91 | + whether two parsing units are in the same sentence or paragraph, or | |
92 | + if they both contain references to the same entity. It can be | |
93 | + ignored for sentence parsing -- everything continues to work as | |
94 | + before. (The distance between two units is an example of such a | |
95 | + feature in sentence parsing, but this can be computed on the fly, so | |
96 | + it isn't necessary to use such a matrix.) (JMB, 14-JAN-07) | |
97 | + | |
98 | + | |
99 | +----------------------------------------------------------------------- | |
100 | +v0.4.0 | |
101 | + | |
102 | +- Cleaned up Pipes considerably; eg, Pipe2O doesn't replicate so much | |
103 | + code from Pipe. Many of the createFeatureVector methods were renamed | |
104 | + to things like addCoreFeatures. (JMB) | |
105 | + | |
106 | +- If one uses MST format, the creation of posA and the | |
107 | + 5-character-substring features now are put into dependency instances | |
108 | + in MSTReader as the coarse pos tags and lemmas, respectively. Then | |
109 | + in the feature extraction code, rather than creating posA etc on the | |
110 | + fly, it just references those fields in the dependency | |
111 | + instance. That way, if you use conll format, you get to use lemma | |
112 | + and coarse tag values supplied by the annotations. (JMB) | |
113 | + | |
114 | +- Can utilize the FEAT1|FEAT2|...|FEATN field of the CONLL format to | |
115 | + allow abitrary features. See addCoreFeatures() in the DependencyPipe | |
116 | + class. (JMB) | |
117 | + | |
118 | +----------------------------------------------------------------------- | |
119 | +v0.2.2 | |
120 | + | |
121 | +- MSTParser now works with both MST and CONLL formats. Pipes are now | |
122 | + passed a parameter for which format they use, and they call upon | |
123 | + Readers and Writers that know how to handle each format. CONLL is | |
124 | + the default format. (JMB) | |
125 | + | |
126 | +- Added a subset of the Portuguese data from CONLL to test the CONLL | |
127 | + format and to have another data set for the testbed. See TESTBED | |
128 | + (JMB) | |
129 | + | |
130 | +- Included an Ant build system that does some nice things, but which | |
131 | + can be ignored if make is preferred. Highlights of the additional | |
132 | + capabilities are: (1) class files are put in a location | |
133 | + (./output/classes) separate from the .java files; (2) you can get | |
134 | + javadocs (./doc/api) by running "sh build.sh javadoc"; (3) you can | |
135 | + make a release with "sh build.sh release" You don't need to install | |
136 | + anything extra (ant.jar is in ./lib); the only additional steps | |
137 | + needed to use the Ant build setup is to set the JAVA_HOME and | |
138 | + MSTPARSER_DIR environment variables appropriately. (JMB) | |
139 | + | |
140 | + | |
141 | + | |
142 | + | |
... | ... |
disambiguation/mstparser/LICENSE
0 โ 100644
1 | +This software is Copyright (C) 2005 University of Pennsylvania and | |
2 | +this software is Copyright (C) 2002, 2003 University of Massachusetts | |
3 | +Amherst, Department of Computer Science, and is licensed under the | |
4 | +terms of the Common Public License, Version 1.0 or (at your option) | |
5 | +any subsequent version. | |
6 | + | |
7 | +The license is approved by the Open Source Initiative, and is available | |
8 | +from their website at http://www.opensource.org. | |
9 | + | |
10 | +===================== | |
11 | + | |
12 | +Common Public License Version 1.0 | |
13 | + | |
14 | +THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS COMMON | |
15 | +PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF | |
16 | +THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. | |
17 | + | |
18 | +1. DEFINITIONS | |
19 | + | |
20 | +"Contribution" means: | |
21 | + | |
22 | +a) in the case of the initial Contributor, the initial code and | |
23 | +documentation distributed under this Agreement, and | |
24 | + | |
25 | +b) in the case of each subsequent Contributor: | |
26 | + | |
27 | +i) changes to the Program, and | |
28 | + | |
29 | +ii) additions to the Program; | |
30 | + | |
31 | +where such changes and/or additions to the Program originate from and | |
32 | +are distributed by that particular Contributor. A Contribution | |
33 | +'originates' from a Contributor if it was added to the Program by such | |
34 | +Contributor itself or anyone acting on such Contributor's | |
35 | +behalf. Contributions do not include additions to the Program which: | |
36 | +(i) are separate modules of software distributed in conjunction with | |
37 | +the Program under their own license agreement, and (ii) are not | |
38 | +derivative works of the Program. | |
39 | + | |
40 | +"Contributor" means any person or entity that distributes the Program. | |
41 | + | |
42 | +"Licensed Patents " mean patent claims licensable by a Contributor | |
43 | +which are necessarily infringed by the use or sale of its Contribution | |
44 | +alone or when combined with the Program. | |
45 | + | |
46 | +"Program" means the Contributions distributed in accordance with this | |
47 | +Agreement. | |
48 | + | |
49 | +"Recipient" means anyone who receives the Program under this | |
50 | +Agreement, including all Contributors. | |
51 | + | |
52 | +2. GRANT OF RIGHTS | |
53 | + | |
54 | +a) Subject to the terms of this Agreement, each Contributor hereby | |
55 | +grants Recipient a non-exclusive, worldwide, royalty-free copyright | |
56 | +license to reproduce, prepare derivative works of, publicly display, | |
57 | +publicly perform, distribute and sublicense the Contribution of such | |
58 | +Contributor, if any, and such derivative works, in source code and | |
59 | +object code form. | |
60 | + | |
61 | +b) Subject to the terms of this Agreement, each Contributor hereby | |
62 | +grants Recipient a non-exclusive, worldwide, royalty-free patent | |
63 | +license under Licensed Patents to make, use, sell, offer to sell, | |
64 | +import and otherwise transfer the Contribution of such Contributor, if | |
65 | +any, in source code and object code form. This patent license shall | |
66 | +apply to the combination of the Contribution and the Program if, at | |
67 | +the time the Contribution is added by the Contributor, such addition | |
68 | +of the Contribution causes such combination to be covered by the | |
69 | +Licensed Patents. The patent license shall not apply to any other | |
70 | +combinations which include the Contribution. No hardware per se is | |
71 | +licensed hereunder. | |
72 | + | |
73 | +c) Recipient understands that although each Contributor grants the | |
74 | +licenses to its Contributions set forth herein, no assurances are | |
75 | +provided by any Contributor that the Program does not infringe the | |
76 | +patent or other intellectual property rights of any other entity. Each | |
77 | +Contributor disclaims any liability to Recipient for claims brought by | |
78 | +any other entity based on infringement of intellectual property rights | |
79 | +or otherwise. As a condition to exercising the rights and licenses | |
80 | +granted hereunder, each Recipient hereby assumes sole responsibility | |
81 | +to secure any other intellectual property rights needed, if any. For | |
82 | +example, if a third party patent license is required to allow | |
83 | +Recipient to distribute the Program, it is Recipient's responsibility | |
84 | +to acquire that license before distributing the Program. | |
85 | + | |
86 | +d) Each Contributor represents that to its knowledge it has sufficient | |
87 | +copyright rights in its Contribution, if any, to grant the copyright | |
88 | +license set forth in this Agreement. | |
89 | + | |
90 | +3. REQUIREMENTS | |
91 | + | |
92 | +A Contributor may choose to distribute the Program in object code form | |
93 | +under its own license agreement, provided that: | |
94 | + | |
95 | +a) it complies with the terms and conditions of this Agreement; and | |
96 | + | |
97 | +b) its license agreement: | |
98 | + | |
99 | +i) effectively disclaims on behalf of all Contributors all warranties | |
100 | +and conditions, express and implied, including warranties or | |
101 | +conditions of title and non-infringement, and implied warranties or | |
102 | +conditions of merchantability and fitness for a particular purpose; | |
103 | + | |
104 | +ii) effectively excludes on behalf of all Contributors all liability | |
105 | +for damages, including direct, indirect, special, incidental and | |
106 | +consequential damages, such as lost profits; | |
107 | + | |
108 | +iii) states that any provisions which differ from this Agreement are | |
109 | +offered by that Contributor alone and not by any other party; and | |
110 | + | |
111 | +iv) states that source code for the Program is available from such | |
112 | +Contributor, and informs licensees how to obtain it in a reasonable | |
113 | +manner on or through a medium customarily used for software exchange. | |
114 | + | |
115 | +When the Program is made available in source code form: | |
116 | + | |
117 | +a) it must be made available under this Agreement; and | |
118 | + | |
119 | +b) a copy of this Agreement must be included with each copy of the | |
120 | +Program. | |
121 | + | |
122 | +Contributors may not remove or alter any copyright notices contained | |
123 | +within the Program. | |
124 | + | |
125 | +Each Contributor must identify itself as the originator of its | |
126 | +Contribution, if any, in a manner that reasonably allows subsequent | |
127 | +Recipients to identify the originator of the Contribution. | |
128 | + | |
129 | +4. COMMERCIAL DISTRIBUTION | |
130 | + | |
131 | +Commercial distributors of software may accept certain | |
132 | +responsibilities with respect to end users, business partners and the | |
133 | +like. While this license is intended to facilitate the commercial use | |
134 | +of the Program, the Contributor who includes the Program in a | |
135 | +commercial product offering should do so in a manner which does not | |
136 | +create potential liability for other Contributors. Therefore, if a | |
137 | +Contributor includes the Program in a commercial product offering, | |
138 | +such Contributor ("Commercial Contributor") hereby agrees to defend | |
139 | +and indemnify every other Contributor ("Indemnified Contributor") | |
140 | +against any losses, damages and costs (collectively "Losses") arising | |
141 | +from claims, lawsuits and other legal actions brought by a third party | |
142 | +against the Indemnified Contributor to the extent caused by the acts | |
143 | +or omissions of such Commercial Contributor in connection with its | |
144 | +distribution of the Program in a commercial product offering. The | |
145 | +obligations in this section do not apply to any claims or Losses | |
146 | +relating to any actual or alleged intellectual property | |
147 | +infringement. In order to qualify, an Indemnified Contributor must: a) | |
148 | +promptly notify the Commercial Contributor in writing of such claim, | |
149 | +and b) allow the Commercial Contributor to control, and cooperate with | |
150 | +the Commercial Contributor in, the defense and any related settlement | |
151 | +negotiations. The Indemnified Contributor may participate in any such | |
152 | +claim at its own expense. | |
153 | + | |
154 | +For example, a Contributor might include the Program in a commercial | |
155 | +product offering, Product X. That Contributor is then a Commercial | |
156 | +Contributor. If that Commercial Contributor then makes performance | |
157 | +claims, or offers warranties related to Product X, those performance | |
158 | +claims and warranties are such Commercial Contributor's responsibility | |
159 | +alone. Under this section, the Commercial Contributor would have to | |
160 | +defend claims against the other Contributors related to those | |
161 | +performance claims and warranties, and if a court requires any other | |
162 | +Contributor to pay any damages as a result, the Commercial Contributor | |
163 | +must pay those damages. | |
164 | + | |
165 | +5. NO WARRANTY | |
166 | + | |
167 | +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS | |
168 | +PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
169 | +KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY | |
170 | +WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY | |
171 | +OR FITNESS FOR A PARTICULAR PURPOSE. Each Recipient is solely | |
172 | +responsible for determining the appropriateness of using and | |
173 | +distributing the Program and assumes all risks associated with its | |
174 | +exercise of rights under this Agreement, including but not limited to | |
175 | +the risks and costs of program errors, compliance with applicable | |
176 | +laws, damage to or loss of data, programs or equipment, and | |
177 | +unavailability or interruption of operations. | |
178 | + | |
179 | +6. DISCLAIMER OF LIABILITY | |
180 | + | |
181 | +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR | |
182 | +ANY CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, | |
183 | +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING | |
184 | +WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF | |
185 | +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
186 | +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR | |
187 | +DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS GRANTED | |
188 | +HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. | |
189 | + | |
190 | +7. GENERAL | |
191 | + | |
192 | +If any provision of this Agreement is invalid or unenforceable under | |
193 | +applicable law, it shall not affect the validity or enforceability of | |
194 | +the remainder of the terms of this Agreement, and without further | |
195 | +action by the parties hereto, such provision shall be reformed to the | |
196 | +minimum extent necessary to make such provision valid and enforceable. | |
197 | + | |
198 | +If Recipient institutes patent litigation against a Contributor with | |
199 | +respect to a patent applicable to software (including a cross-claim or | |
200 | +counterclaim in a lawsuit), then any patent licenses granted by that | |
201 | +Contributor to such Recipient under this Agreement shall terminate as | |
202 | +of the date such litigation is filed. In addition, if Recipient | |
203 | +institutes patent litigation against any entity (including a | |
204 | +cross-claim or counterclaim in a lawsuit) alleging that the Program | |
205 | +itself (excluding combinations of the Program with other software or | |
206 | +hardware) infringes such Recipient's patent(s), then such Recipient's | |
207 | +rights granted under Section 2(b) shall terminate as of the date such | |
208 | +litigation is filed. | |
209 | + | |
210 | +All Recipient's rights under this Agreement shall terminate if it | |
211 | +fails to comply with any of the material terms or conditions of this | |
212 | +Agreement and does not cure such failure in a reasonable period of | |
213 | +time after becoming aware of such noncompliance. If all Recipient's | |
214 | +rights under this Agreement terminate, Recipient agrees to cease use | |
215 | +and distribution of the Program as soon as reasonably | |
216 | +practicable. However, Recipient's obligations under this Agreement and | |
217 | +any licenses granted by Recipient relating to the Program shall | |
218 | +continue and survive. | |
219 | + | |
220 | +Everyone is permitted to copy and distribute copies of this Agreement, | |
221 | +but in order to avoid inconsistency the Agreement is copyrighted and | |
222 | +may only be modified in the following manner. The Agreement Steward | |
223 | +reserves the right to publish new versions (including revisions) of | |
224 | +this Agreement from time to time. No one other than the Agreement | |
225 | +Steward has the right to modify this Agreement. IBM is the initial | |
226 | +Agreement Steward. IBM may assign the responsibility to serve as the | |
227 | +Agreement Steward to a suitable separate entity. Each new version of | |
228 | +the Agreement will be given a distinguishing version number. The | |
229 | +Program (including Contributions) may always be distributed subject to | |
230 | +the version of the Agreement under which it was received. In addition, | |
231 | +after a new version of the Agreement is published, Contributor may | |
232 | +elect to distribute the Program (including its Contributions) under | |
233 | +the new version. Except as expressly stated in Sections 2(a) and 2(b) | |
234 | +above, Recipient receives no rights or licenses to the intellectual | |
235 | +property of any Contributor under this Agreement, whether expressly, | |
236 | +by implication, estoppel or otherwise. All rights in the Program not | |
237 | +expressly granted under this Agreement are reserved. | |
238 | + | |
239 | +This Agreement is governed by the laws of the State of New York and | |
240 | +the intellectual property laws of the United States of America. No | |
241 | +party to this Agreement will bring a legal action under this Agreement | |
242 | +more than one year after the cause of action arose. Each party waives | |
243 | +its rights to a jury trial in any resulting litigation. | |
... | ... |
disambiguation/mstparser/README
0 โ 100644
1 | +------------------------- | |
2 | +MSTParser version 0.5.0 | |
3 | +------------------------- | |
4 | + | |
5 | +This is the main README. See ALT_README for some extra utilities and | |
6 | +an alternative build process to the one described in this README. The | |
7 | +package has been modified by Jason Baldridge -- this version should | |
8 | +produce the same results as Ryan McDonald's previous releases, but it | |
9 | +has been made more flexible and configurable in the input formats it | |
10 | +accepts (both MST and CoNLL) and in the way features are declared (see | |
11 | +the DependencyPipe class). | |
12 | + | |
13 | +------------------------- | |
14 | + | |
15 | + | |
16 | +The following package contains a java implementation of the dependency | |
17 | +parsers described in: | |
18 | + | |
19 | +Non-Projective Dependency Parsing using Spanning Tree Algorithms | |
20 | +R. McDonald, F. Pereira, K. Ribarov and J. Hajic | |
21 | +HLT-EMNLP, 2005 | |
22 | + | |
23 | +Online Large-Margin Training of Dependency Parsers | |
24 | +R. McDonald, K. Crammer and F. Pereira | |
25 | +ACL, 2005 | |
26 | + | |
27 | +Online Learning of Approximate Dependency Parsing Algorithms | |
28 | +R. McDonald and F. Pereira | |
29 | +EACL, 2006 | |
30 | + | |
31 | +In addition, the parsers in this package can also learn and produce typed | |
32 | +dependency trees (i.e. trees with edge labels). | |
33 | + | |
34 | +The parser should work with Java 1.4 and 1.5 | |
35 | + | |
36 | +If there are any problems running the parser then email: ryantm@cis.upenn.edu | |
37 | +I will only respond to questions not answered in this README. | |
38 | + | |
39 | + | |
40 | +---------------- | |
41 | +Contents | |
42 | +---------------- | |
43 | + | |
44 | +1. Compiling | |
45 | + | |
46 | +2. Example of usage | |
47 | + | |
48 | +3. Running the parser | |
49 | + a. Input data format | |
50 | + b. Training a parser | |
51 | + c. Running a trained model on new data | |
52 | + d. Evaluating output | |
53 | + | |
54 | +4. Memory/Disk space and performance issues | |
55 | + | |
56 | +5. Reproducing results in HLT-EMNLP and ACL papers | |
57 | + | |
58 | + | |
59 | +---------------- | |
60 | +1. Compiling | |
61 | +---------------- | |
62 | + | |
63 | +To compile the code, first unzip/tar the downloaded file: | |
64 | + | |
65 | +> gunzip mstparser.tar.gz | |
66 | +> tar -xvf mstparser.tar | |
67 | +> cd MSTParser | |
68 | + | |
69 | +Next, run the following command | |
70 | + | |
71 | +> javac -classpath ".:lib/trove.jar" mstparser/DependencyParser.java | |
72 | + | |
73 | +This will compile the package. | |
74 | + | |
75 | + | |
76 | +--------------------- | |
77 | +2. Example Usage | |
78 | +--------------------- | |
79 | + | |
80 | +In the directory data/ there are examples of training and testing data. Data | |
81 | +format is described in the next section. | |
82 | + | |
83 | +train.ulab/test.ulab | |
84 | +- training and testing data with unlabeled trees | |
85 | + | |
86 | +train.lab/test.lab | |
87 | +- training and testing data with labeled trees | |
88 | + | |
89 | +To run an unlabeled parser type: | |
90 | + | |
91 | +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \ | |
92 | + train train-file:data/train.ulab model-name:dep.model \ | |
93 | + test test-file:data/test.ulab output-file:out.txt \ | |
94 | + eval gold-file:data/test.ulab format:MST | |
95 | + | |
96 | +This will train a parser on the training data, run it on the testing data and | |
97 | +evaluate the output against the gold standard. The results from running the | |
98 | +parser are in the file out.txt and the trained model in dep.model. | |
99 | + | |
100 | +To train a labeled parser run the same command but use the labeled training | |
101 | +and testing files. | |
102 | + | |
103 | + | |
104 | +------------------------- | |
105 | +3. Running the Parser | |
106 | +------------------------- | |
107 | + | |
108 | +------------------------- | |
109 | +3a. Input data format | |
110 | +------------------------- | |
111 | + | |
112 | +**************************** NOTE ********************************** | |
113 | +The parser now uses CONLL format as a default. Note the inclusion of | |
114 | +the format:MST option in the instructions below, which differ from the | |
115 | +instructions in previous versions (v0.2 and before). If you wish to | |
116 | +run the parser on CONLL formatted files, use format:CONLL or just | |
117 | +don't include the format option. | |
118 | +******************************************************************** | |
119 | + | |
120 | +Example data sets are given in the data/ directory. | |
121 | + | |
122 | +Each sentence in the data is represented by 3 or 4 lines and sentences are | |
123 | +space separated. The general format is: | |
124 | + | |
125 | +w1 w2 ... wn | |
126 | +p1 p2 ... pn | |
127 | +l1 l2 ... ln | |
128 | +d1 d2 ... dn | |
129 | + | |
130 | +.... | |
131 | + | |
132 | + | |
133 | +Where, | |
134 | +- w1 ... wn are the n words of the sentence (tab deliminated) | |
135 | +- p1 ... pn are the POS tags for each word | |
136 | +- l1 ... ln are the labels of the incoming edge to each word | |
137 | +- d1 ... dn are integers representing the position of each word's parent | |
138 | + | |
139 | +For example, the sentence "John hit the ball" would be: | |
140 | + | |
141 | +John hit the ball | |
142 | +N V D N | |
143 | +SBJ ROOT MOD OBJ | |
144 | +2 0 4 2 | |
145 | + | |
146 | +Note that hit's parent is indexed by 0 since it is the root. | |
147 | + | |
148 | +If you wish to only train or test an unlabeled parser, then simply leave out | |
149 | +the third line for each sentence, e.g., | |
150 | + | |
151 | +John hit the ball | |
152 | +N V D N | |
153 | +2 0 4 2 | |
154 | + | |
155 | +The parser will automatically detect that it should produce unlabeled trees. | |
156 | + | |
157 | +Note that this format is the same for training AND for running the parser on | |
158 | +new data. Of course, you may not always know the gold standard. In this case, | |
159 | +just substitute lines 3 (the edge labels) and lines 4 (the parent indexes) with | |
160 | +dummy values. The parser just ignores these values and produces its own. | |
161 | + | |
162 | + | |
163 | +---------------------------- | |
164 | +3b. Training the parser | |
165 | +---------------------------- | |
166 | + | |
167 | +If you have a set of labeled data, first place it in the format described | |
168 | +above. | |
169 | + | |
170 | +If your training data is in a file train.txt, you can then run the command: | |
171 | + | |
172 | +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \ | |
173 | + train train-file:train.txt format:MST | |
174 | + | |
175 | +This will train a parser with all the default properties. Additional | |
176 | +properties can be described with the following flags: | |
177 | + | |
178 | +train | |
179 | +- if present then parser will train a new model | |
180 | + | |
181 | +train-file:file.txt | |
182 | +- use data in file.txt to train the parser | |
183 | + | |
184 | +model-name:model.name | |
185 | +- store trained model in file called model.name | |
186 | + | |
187 | +iters:numIters | |
188 | +- Run training algorithm for numIters epochs, default is 10 | |
189 | + | |
190 | +decode-type:type | |
191 | +- type is either "proj" or "non-proj", e.g. decode-type:proj | |
192 | +- Default is "proj" | |
193 | +- "proj" use the projective parsing algorithm during training | |
194 | + - i.e. The Eisner algorithm | |
195 | +- "non-proj" use the non-projective parsing algorithm during training | |
196 | + - i.e. The Chu-Liu-Edmonds algorithm | |
197 | + | |
198 | +training-k:K | |
199 | +- Specifies the k-best parse set size to create constraints during training | |
200 | +- Default is 1 | |
201 | +- For non-projective parsing algorithm, k-best decoding is approximate | |
202 | + | |
203 | +loss-type:type | |
204 | +- type is either "punc" or "nopunc", e.g. loss-type:punc | |
205 | +- Default is "punc" | |
206 | +- "punc" include punctuation in hamming loss calculation | |
207 | +- "nopunc" do not include punctuation in hamming loss calculation | |
208 | + | |
209 | +create-forest:cf | |
210 | +- cf is either "true" or "false" | |
211 | +- Default is "true" | |
212 | +- If create-forest is false, it will not create the training parse forest (see | |
213 | + section 4). It assumes it has been created. | |
214 | +- This flag is useful if you are training many models on the same data and | |
215 | + features but using different parameters (e.g. training iters, decoding type). | |
216 | + | |
217 | +order:ord | |
218 | +- ord is either 1 or 2 | |
219 | +- Default is 1 | |
220 | +- Specifies the order/scope of features. 1 only has features over single edges | |
221 | + and 2 has features over pairs of adjacent edges in the tree. | |
222 | + | |
223 | +format:FORMAT | |
224 | +- FORMAT is either MST or CONLL | |
225 | +- Default is CONLL | |
226 | +- Specifies the input/output format. MST is the format used by | |
227 | + MSTParser until version 0.2.1. CONLL is the format used in the | |
228 | + CONLL-X shared task (see http://nextens.uvt.nl/~conll/). | |
229 | + | |
230 | +------------------------------------------------ | |
231 | +3c. Running a trained model on new data | |
232 | +------------------------------------------------ | |
233 | + | |
234 | +This section assumes you have trained a model and it is stored in dep.model. | |
235 | + | |
236 | +First, format your data properly (section 3a). | |
237 | + | |
238 | +It should be noted that the parser assumes both words and POS tags. To | |
239 | +generate POS tags for your data I suggest using the Ratnaparkhi POS tagger | |
240 | +or another tagger of your choice. | |
241 | + | |
242 | +The parser also assumes that the edge label and parent index lines are | |
243 | +in the input. However, these can just be artificially inserted (e.g. with lines | |
244 | +of "LAB ... LAB" and "0 ... 0") since the parser will produce these lines | |
245 | +as output. | |
246 | + | |
247 | +If the data is in a file called test.txt, run the command: | |
248 | + | |
249 | +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \ | |
250 | + test model-name:dep.model test-file:test.txt output-file:out.txt format:MST | |
251 | + | |
252 | +This will create an output file "out.txt" with the predictions of the parser. | |
253 | +Other properties can be defined with the following flags: | |
254 | + | |
255 | +test | |
256 | +- If included a trained parser will be run on the testing data | |
257 | + | |
258 | +test-file:file.txt | |
259 | +- The file containing the data to run the parser on | |
260 | + | |
261 | +model-name:model.name | |
262 | +- The name of the stored model to be used | |
263 | + | |
264 | +output-file:out.txt | |
265 | +- The result of running the parser on the new data | |
266 | + | |
267 | +decode-type:type | |
268 | +- See section 3b. | |
269 | + | |
270 | +order:ord | |
271 | +- See section 3b. THIS NEEDS TO HAVE THE SAME VALUE OF THE TRAINED MODEL!! | |
272 | + | |
273 | +format:FORMAT | |
274 | +- See section 3b. | |
275 | + | |
276 | +Note that if you train a labeled model, you should only run it expecting | |
277 | +labeled output (e.g. the test data should have 4 lines per sentence). | |
278 | +And if you train an unlabeled model, you should only run it expecting | |
279 | +unlabeled output (e.g. the test data should have 3 lines per sentence). | |
280 | + | |
281 | + | |
282 | +------------------------ | |
283 | +3d. Evaluating Output | |
284 | +------------------------ | |
285 | + | |
286 | +This section describes a simple class for evaluating the output of | |
287 | +the parser against a gold standard. | |
288 | + | |
289 | +Assume you have a gold standard, say test.txt and the output of the parser | |
290 | +say out.txt, then run the following command: | |
291 | + | |
292 | +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \ | |
293 | + eval gold-file:test.txt output-file:out.txt MST | |
294 | + | |
295 | +This will return both labeled and unlabeled accuracy (if the data sets contain | |
296 | +labeled trees) as well as complete sentence accuracy, again labeled and | |
297 | +unlabeled. | |
298 | + | |
299 | +If your data is in CONLL format instead of MST format (pre-v0.2.1), | |
300 | +then replace MST by CONLL in the above command, or just leave it off | |
301 | +-- it defaults to CONLL. | |
302 | + | |
303 | +We should note that currently this evaluation script includes all punctuation. | |
304 | +In future releases we will modify this class to allow for the evaluation to | |
305 | +ignore punctuation, which is standard for English (Yamada and Matsumoto 03). | |
306 | + | |
307 | + | |
308 | +--------------------------------------------- | |
309 | +4. Memory/Disk space and performance issues | |
310 | +--------------------------------------------- | |
311 | + | |
312 | +This parser is memory and disk space intensive. | |
313 | + | |
314 | +MEMORY ISSUES | |
315 | + | |
316 | +Remember to always run java with the flag -Xmx1800m to use all available | |
317 | +memory for the heap. For 64-bit machines use an even larger value, say | |
318 | +-Xmx8000m. | |
319 | + | |
320 | +Training a model on the WSJ can be done easily on a 32-bit machine. | |
321 | +It should also be possible to train a model on the entire Prague Dependency | |
322 | +Treebank on a 32-bit machine (I have done it), but I make no guarantees. | |
323 | + | |
324 | +DISK ISSUES | |
325 | + | |
326 | +To make training quicker we store the entire parse forest on disk, ala | |
327 | +Clark and Curran 04. This can be very large, up to and over 20GB!! Be aware | |
328 | +of this fact. | |
329 | + | |
330 | +If you train using a file called train.txt, the forest will be stored in | |
331 | +a file called train.txt.forest. If disk space is an issue you can remove this | |
332 | +file immediately after training (it is not needed to run the parser on new data). | |
333 | + | |
334 | +However, sometimes it is good to keep this file around. Particularly, if you | |
335 | +are retraining a model on the same data and feature space but want to try | |
336 | +different training settings. By using the create-forest:false flag, you | |
337 | +can avoid having to recreate this file (which can take some time). | |
338 | + | |
339 | +PERFORMANCE ISSUES | |
340 | + | |
341 | +Once a model has been trained, running the model on new data is pretty quick. | |
342 | +However, as with all discriminative trained parsers, it does take some time | |
343 | +to train a parser. On a two-year-old 32-bit machine it will take 10-15 hours | |
344 | +to train a model on the entire Penn Treebank and around 24-30 hours to train | |
345 | +a model on the Prague Dependency Treebank. Newer machines or 64-bit machines | |
346 | +are of course much quicker. | |
347 | + | |
348 | + | |
349 | +------------------------------------------------------- | |
350 | +5. Reproducing results in HLT-EMNLP and ACL papers | |
351 | +------------------------------------------------------- | |
352 | + | |
353 | +To reproduce the English results in McDonald et al. ACL 2005, | |
354 | + | |
355 | +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \ | |
356 | + train train-file:train.wsj model-name:eng.wsj.model \ | |
357 | + training-k:5 loss-type:nopunc decode-type:proj \ | |
358 | + test test-file:test.wsj output-file:out.txt \ | |
359 | + eval gold-file:test.wsj format:MST | |
360 | + | |
361 | +This assumes that train.wsj is section 02-21 of the Penn Treebank formatted | |
362 | +above and dependencies extracted using the head-rules of Yamada and Matsumoto. | |
363 | +See Joakim Nivre's tool set at: | |
364 | + http://w3.msi.vxu.se/~nivre/research/Penn2Malt.html | |
365 | +for a tool set to convert the WSJ to dependencies using the Yamada and | |
366 | +Matsumoto head rules. | |
367 | + | |
368 | +test.wsj is section 23 of the WSJ converted as above. Furthermore, POS tags are | |
369 | +supplied using Adwait Ratniparkhi's MXPOST tool-kit trained on sections 02-21. | |
370 | +This can be found at: | |
371 | +http://www.cogsci.ed.ac.uk/~jamesc/taggers/MXPOST.html | |
372 | + | |
373 | +Note that the evaluation will be slightly off from the results reported. This | |
374 | +is because the evaluation scripts include punctuation. If you modify the | |
375 | +evaluation script to discount punctuation, results will align. | |
376 | + | |
377 | + | |
378 | +To reproduce the Czech results in McDonald et al. HLT-EMNLP 2005, | |
379 | + | |
380 | +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \ | |
381 | + train train-file:train.pdt model-name:czech.pdt.model \ | |
382 | + training-k:1 loss-type:punc decode-type:non-proj \ | |
383 | + test test-file:test.pdt output-file:out.txt \ | |
384 | + eval gold-file:test.pdt format:MST | |
385 | + | |
386 | +This assumes train.pdt and test.pdt are the training and testing sections | |
387 | +of the Prague Dependency Treebank v1.0 formatted above. We use the | |
388 | +automatically assigned POS tags that have been reduced (see paper). | |
... | ... |
disambiguation/mstparser/bin/conll2mst.py
0 โ 100644
1 | +#! /usr/bin/python | |
2 | + | |
3 | +import sys; | |
4 | + | |
5 | +# Open File | |
6 | +f = open(sys.argv[1],'rt'); | |
7 | + | |
8 | +wrds = ""; pos = ""; labs = ""; par = ""; | |
9 | + | |
10 | +for line in f: | |
11 | + | |
12 | + sent = line.split(); | |
13 | + | |
14 | + if len(sent) > 0: | |
15 | + wrds += sent[1] + "\t"; | |
16 | + pos += sent[4] + "\t"; | |
17 | + labs += sent[7] + "\t"; | |
18 | + par += sent[6] + "\t"; | |
19 | + else: | |
20 | + print wrds; wrds = ""; | |
21 | + print pos; pos = ""; | |
22 | + print labs; labs = ""; | |
23 | + print par; par = ""; | |
24 | + print ""; | |
25 | + | |
26 | +f.close(); | |
27 | + | |
... | ... |
disambiguation/mstparser/bin/create_baseline.py
0 โ 100644
1 | +#!/usr/bin/python | |
2 | +import re | |
3 | +import optparse | |
4 | +import fileinput | |
5 | +import sys | |
6 | + | |
7 | +########################################################################### | |
8 | +# | |
9 | +# Command-line options and usage | |
10 | +# | |
11 | +########################################################################### | |
12 | + | |
13 | +usage = """%prog [OPTIONS] FILE ... | |
14 | + | |
15 | +Convert from one dependency style to another. | |
16 | +Use -f FROM and -t TO to specify the input and output formats. | |
17 | +""" | |
18 | + | |
19 | +parser = optparse.OptionParser(usage=usage) | |
20 | + | |
21 | +parser.add_option("-r", "--rightward", action="store_true", | |
22 | + default=False, | |
23 | + help="""Create right-linking baseline.""") | |
24 | + | |
25 | +parser.add_option("-d", "--default-relation", action="store", | |
26 | + default="Elaboration", | |
27 | + help="Pick default relation.", | |
28 | + metavar="RELATION") | |
29 | + | |
30 | +def transform_meta_chars(string): | |
31 | + return string.replace(",","+comma+") | |
32 | +def untransform_meta_chars(string): | |
33 | + return string.replace("+comma+",",") | |
34 | + | |
35 | +## Output dependencies for one sentence | |
36 | +def output_one_sentence(deps): | |
37 | + accum = [[], [], [], [], []] | |
38 | + for dep in deps: | |
39 | + for num in xrange(len(dep)): | |
40 | + accum[num].append(dep[num]) | |
41 | + accum = ["\t".join([str(x) for x in y]) for y in accum] | |
42 | + print "\n".join(accum[1:]) | |
43 | ||
44 | + | |
45 | + | |
46 | +## Get options | |
47 | + | |
48 | +(options, args) = parser.parse_args() | |
49 | + | |
50 | +## Process file(s) | |
51 | + | |
52 | +lines = fileinput.input(args) | |
53 | + | |
54 | +deps = [] | |
55 | + | |
56 | +## Read input | |
57 | + | |
58 | +sentence_info = [] | |
59 | +for line in lines: | |
60 | + line = line.strip() | |
61 | + if not line: | |
62 | + num_words = len(sentence_info[0]) | |
63 | + baseline_deps = range(num_words) | |
64 | + if options.rightward: | |
65 | + baseline_deps.pop(0) | |
66 | + baseline_deps.pop(0) | |
67 | + baseline_deps += [num_words, 0] | |
68 | + | |
69 | + sentence_info[2] = [options.default_relation]*num_words | |
70 | + sentence_info[3] = baseline_deps | |
71 | + | |
72 | + try: | |
73 | + for i in xrange(len(sentence_info[0])): | |
74 | + deps.append([i+1]+[row[i] for row in sentence_info]) | |
75 | + except: | |
76 | + #print sentence_info | |
77 | + print "\n".join([len(x) for x in sentence_info]) | |
78 | + sys.exit(0) | |
79 | + | |
80 | + #print deps | |
81 | + output_one_sentence(deps) | |
82 | + deps = [] | |
83 | + sentence_info = [] | |
84 | + else: | |
85 | + sentence_info.append(line.split()) | |
86 | + | |
... | ... |
disambiguation/mstparser/bin/mst-env
0 โ 100644
disambiguation/mstparser/bin/mst2conll.py
0 โ 100644
1 | +#! /usr/bin/python | |
2 | + | |
3 | +import sys; | |
4 | + | |
5 | +# Open File | |
6 | +f = open(sys.argv[1],'rt'); | |
7 | + | |
8 | +wrds = ""; | |
9 | +pos = ""; | |
10 | +labs = ""; | |
11 | +par = ""; | |
12 | + | |
13 | +for line in f: | |
14 | + | |
15 | + if len(line.strip()) == 0: | |
16 | + w = wrds.split(); p = pos.split(); l = labs.split(); pa = par.split(); | |
17 | + cnt = 1; | |
18 | + for t in w: | |
19 | + print str(cnt) + "\t" + t + "\t" + t + "\t" + p[cnt-1] + "\t" + p[cnt-1] + "\t_\t" + pa[cnt-1] + "\t" + l[cnt-1]; | |
20 | + cnt += 1; | |
21 | + print ""; | |
22 | + wrds = ""; pos = ""; labs = ""; par = ""; | |
23 | + elif len(wrds) == 0: | |
24 | + wrds = line; | |
25 | + elif len(pos) == 0: | |
26 | + pos = line; | |
27 | + elif len(labs) == 0: | |
28 | + labs = line; | |
29 | + else: | |
30 | + par = line; | |
31 | + | |
32 | +f.close(); | |
33 | + | |
... | ... |
disambiguation/mstparser/bin/mst_experiment.py
0 โ 100644
1 | +#!/usr/bin/python | |
2 | + | |
3 | +import os | |
4 | +import sys | |
5 | +import optparse | |
6 | + | |
7 | +## Check that MSTPARSER_DIR environment variable is set and get it | |
8 | +global mstparser_dir | |
9 | +mstparser_dir = '' | |
10 | +if os.environ.has_key('MSTPARSER_DIR'): | |
11 | + mstparser_dir = os.environ['MSTPARSER_DIR'] | |
12 | +else: | |
13 | + print "Please set the MSTPARSER_DIR environment variable to where you have the MSTParser installed." | |
14 | + exit(1) | |
15 | + | |
16 | + | |
17 | +########################################################################### | |
18 | +# | |
19 | +# Run a single fold. This could actually be not a "fold" per se, but | |
20 | +# actually explicitly provided training and test files. | |
21 | +# | |
22 | +########################################################################### | |
23 | + | |
24 | +def create_tag_train_file (source_file, formatted_file): | |
25 | + | |
26 | + output = file(formatted_file, "w") | |
27 | + | |
28 | + input = file(source_file) | |
29 | + line = input.readline() | |
30 | + while not(line == ""): | |
31 | + words = line.strip().split("\t") | |
32 | + line = input.readline() | |
33 | + tags = line.strip().split("\t") | |
34 | + | |
35 | + # the splitting takes care of word+stem representations like biliyor+bil | |
36 | + merged = [words[i].split("+")[0]+"_"+tags[i].replace("_", "+us+") \ | |
37 | + for i in range(len(words))] | |
38 | + | |
39 | + output.write(" ".join(merged)+"\n") | |
40 | + | |
41 | + input.readline() # eat up labels | |
42 | + input.readline() # eat up dependencies | |
43 | + input.readline() # eat blank line | |
44 | + line = input.readline() # read words of next sentence | |
45 | + | |
46 | + output.close() | |
47 | + | |
48 | + | |
49 | +def run_single_train_and_test(options, train_filename, | |
50 | + test_filename, output_filename, args): | |
51 | + | |
52 | + | |
53 | + realtest_filename = test_filename | |
54 | + # Tag the test sentences if requested | |
55 | + if options.tag_source == "OTK_Tagger": | |
56 | + print " Tagging test sentences..." | |
57 | + | |
58 | + tag_train_filename = train_filename+".tagged" | |
59 | + | |
60 | + create_tag_train_file(train_filename, tag_train_filename) | |
61 | + | |
62 | + tagged_filename = test_filename+".tagged.tmp" | |
63 | + tag_command = "python %s/bin/pos_tag.py -o %s %s %s %s" \ | |
64 | + % (mstparser_dir, | |
65 | + options.output_dir, | |
66 | + tag_train_filename, | |
67 | + test_filename, | |
68 | + tagged_filename) | |
69 | + | |
70 | + #print >> argfile, tag_command | |
71 | + if options.verbose: | |
72 | + print tag_command | |
73 | + os.system(tag_command) | |
74 | + #os.system(tag_command+' |tee --append '+options.output_dir+'/tag.out 2>&1') | |
75 | + else: | |
76 | + os.system(tag_command+' &>/dev/null') | |
77 | + #os.system(tag_command+' >> '+options.output_dir+'/tag.out 2>&1') | |
78 | + | |
79 | + | |
80 | + tag_lines = [] | |
81 | + counter = 0 | |
82 | + for line in file(tagged_filename): | |
83 | + if counter % 2 == 1: | |
84 | + tag_lines.append(line) | |
85 | + counter += 1 | |
86 | + | |
87 | + realtest_filename = test_filename+".tagged" | |
88 | + output = file(realtest_filename, "w") | |
89 | + counter = 0 | |
90 | + for line in file(test_filename): | |
91 | + if counter % 5 == 1: | |
92 | + output.write(tag_lines[(counter-1)/5]) | |
93 | + else: | |
94 | + output.write(line) | |
95 | + counter += 1 | |
96 | + | |
97 | + output.close() | |
98 | + | |
99 | + | |
100 | + # Train the parser | |
101 | + print " Training and evaluating..." | |
102 | + | |
103 | + train_command = 'mst_parse.sh train train-file:%s model-name:%s/dep.model decode-type:%s test test-file:%s output-file:%s %s' % (train_filename, options.output_dir, options.decoder_type, realtest_filename, output_filename, " ".join(args[1:])) | |
104 | + | |
105 | + if options.verbose: | |
106 | + print train_command | |
107 | + os.system(train_command) | |
108 | + else: | |
109 | + os.system(train_command+' &>/dev/null') | |
110 | + | |
111 | + | |
112 | +###################### END FUNCTION DEFINITIONS ######################## | |
113 | + | |
114 | + | |
115 | +## Get options | |
116 | + | |
117 | +opt_parser = optparse.OptionParser() | |
118 | +opt_parser.add_option("-l", "--language", action="store", default='Unspecified', | |
119 | + help="use configurations specific to LANGUAGE", | |
120 | + metavar="LANGUAGE") | |
121 | +opt_parser.add_option("-e", "--eval_file", action="store", default='Generated', | |
122 | + help="Read evaluation sentences from FILE. Using this option means that cross-validation will not be used.", | |
123 | + metavar="FILE") | |
124 | +opt_parser.add_option("-d", "--decoder_type", action="store", | |
125 | + choices=['proj', 'non-proj'], | |
126 | + default="proj", | |
127 | + help="Use a projective or non-projective algorithm.E", | |
128 | + metavar="FILE") | |
129 | +opt_parser.add_option("-o", "--output_dir", action="store", default='output', | |
130 | + help="save parser output to DIR", | |
131 | + metavar="DIR") | |
132 | +opt_parser.add_option("-f", "--num_folds", action="store", default=10, | |
133 | + help="The number of folds to use in cross-validation (Default=10).", | |
134 | + metavar="NUM") | |
135 | +opt_parser.add_option("-v", "--verbose", action="store_true", default=False, | |
136 | + help="be verbose") | |
137 | + | |
138 | +opt_parser.add_option("-t", "--tag_source", choices=['Gold','OTK_Tagger'], | |
139 | + default='Gold', | |
140 | + help="use tags from Gold standard or from a tagger (Gold (default), OTK_Tagger)", | |
141 | + metavar="SOURCE") | |
142 | + | |
143 | +(options, args) = opt_parser.parse_args() | |
144 | + | |
145 | +#Convert from FP to Int | |
146 | +options.num_folds = int(options.num_folds) | |
147 | + | |
148 | +# Check that the requested output directory doesn't exist and isn't a | |
149 | +# file. If it's okay, create the directory. | |
150 | +output_dir = options.output_dir | |
151 | +if os.path.isdir(output_dir): | |
152 | + os.system("rm -rf %s" % output_dir) | |
153 | +elif os.path.isfile(output_dir): | |
154 | + raise OSError("A file with the same name as the desired dir, " \ | |
155 | + "'%s', already exists." % output_dir) | |
156 | +os.makedirs(output_dir) | |
157 | + | |
158 | + | |
159 | +# This file accumulates the results across all folds. | |
160 | +model_output_filename = output_dir+"/model_out" | |
161 | +os.system('touch %s' % model_output_filename) | |
162 | + | |
163 | +## Process files | |
164 | + | |
165 | +train_filename = args[0] | |
166 | + | |
167 | +# This file accumulates the gold dependencies across all folds. | |
168 | +gold_deps_filename = output_dir+"/gold.deps" | |
169 | + | |
170 | +if options.eval_file == "Generated": | |
171 | + | |
172 | + num_folds = int(options.num_folds) | |
173 | + | |
174 | + print "Running a %d-fold evaluation on file %s" \ | |
175 | + % (num_folds, train_filename) | |
176 | ||
177 | + | |
178 | + # Align parses with their corresponding sentences and assign a | |
179 | + # partition id to them. | |
180 | + | |
181 | + train_file = file(train_filename) | |
182 | + | |
183 | + examples = [] | |
184 | + | |
185 | + next_example = train_file.readline() | |
186 | + | |
187 | + counter = 0 | |
188 | + while next_example: | |
189 | + partition = counter % num_folds | |
190 | + | |
191 | + elements = [] | |
192 | + while next_example and next_example != "\n": | |
193 | + elements += next_example | |
194 | + next_example = train_file.readline() | |
195 | + | |
196 | + examples.append((partition, elements)) | |
197 | + | |
198 | + next_example = train_file.readline() | |
199 | + | |
200 | + counter += 1 | |
201 | + | |
202 | + | |
203 | + # Close the sentences file and delete it. (It was either copied or | |
204 | + # generated, so it's okay.) | |
205 | + train_file.close() | |
206 | + | |
207 | + # Train/test on each partion | |
208 | + | |
209 | + gold_deps = open(gold_deps_filename,"w") | |
210 | + | |
211 | + # Run each fold. The output from each fold is appended to gold.deps | |
212 | + # and model.deps | |
213 | + #for test_partition in range(1): | |
214 | + for test_partition in range(num_folds): | |
215 | + | |
216 | + print "Fold",test_partition | |
217 | + | |
218 | + train_filename = output_dir+"/train" | |
219 | + train_set = open(train_filename, "w") | |
220 | + | |
221 | + test_filename = output_dir+"/test" | |
222 | + test_set = open(test_filename, "w") | |
223 | + | |
224 | + counter = 0 | |
225 | + for ex in examples: | |
226 | + if ex[0] == test_partition: | |
227 | + test_set.write("".join(ex[1])+"\n") | |
228 | + gold_deps.write("".join(ex[1])+"\n") | |
229 | + else: | |
230 | + train_set.write("".join(ex[1])+"\n") | |
231 | + | |
232 | + counter += 1 | |
233 | + | |
234 | + train_set.close() | |
235 | + test_set.close() | |
236 | + | |
237 | + # Run the fold. | |
238 | + output_filename = output_dir+"/output" | |
239 | + run_single_train_and_test(options, train_filename, test_filename, output_filename, args) | |
240 | + | |
241 | + # Pile this fold's output onto the accumulating result file. | |
242 | + os.system('cat %s >> %s' % (output_filename, model_output_filename)) | |
243 | + | |
244 | + gold_deps.flush() | |
245 | + | |
246 | + gold_deps.close() | |
247 | + | |
248 | +else: | |
249 | + os.system('cp %s %s' %(options.eval_file, gold_deps_filename)) | |
250 | + | |
251 | + run_single_train_and_test(options, train_filename, gold_deps_filename, model_output_filename, args) | |
252 | + | |
253 | + | |
254 | +################## EVALUATION ################### | |
255 | + | |
256 | +print "Evaluating. If anything here dies, you can still look at the output files in the directory '%s'." % (output_dir) | |
257 | + | |
258 | +# Get dependency results. | |
259 | + | |
260 | +os.system("mst_score.sh %s %s" % (gold_deps_filename, model_output_filename)) | |
261 | + | |
... | ... |
disambiguation/mstparser/bin/mst_parse.sh
0 โ 100644
disambiguation/mstparser/bin/mst_score.sh
0 โ 100644
disambiguation/mstparser/bin/pos_tag.py
0 โ 100644
1 | +#!/usr/bin/python | |
2 | + | |
3 | +import os | |
4 | +import sys | |
5 | +import optparse | |
6 | + | |
7 | +import tagging_util | |
8 | + | |
9 | +## Check that DBPARSER_DIR environment variable is set and get it | |
10 | +dbparser_dir = '' | |
11 | +if os.environ.has_key('DBPARSER_DIR'): | |
12 | + dbparser_dir = os.environ['DBPARSER_DIR'] | |
13 | +else: | |
14 | + print "Please set the DBPARSER_DIR environment variable to where you have Dan Bikel's parser installed." | |
15 | + exit(1) | |
16 | + | |
17 | +## Check that OPENNLP_DIR environment variable is set and get it | |
18 | +otk_dir = '' | |
19 | +if os.environ.has_key('OPENNLP_DIR'): | |
20 | + otk_dir = os.environ['OPENNLP_DIR'] | |
21 | +else: | |
22 | + print "Please set the OPENNLP_DIR environment variable to where you have the OpenNLP Toolkit installed." | |
23 | + exit(1) | |
24 | + | |
25 | + | |
26 | +## Get options | |
27 | + | |
28 | +opt_parser = optparse.OptionParser() | |
29 | +opt_parser.add_option("-o", "--output-dir", action="store", default='output', | |
30 | + help="save tagger output to DIR", | |
31 | + metavar="DIR") | |
32 | +opt_parser.add_option("-v", "--verbose", action="store_true", default=False, | |
33 | + help="be verbose") | |
34 | + | |
35 | +(options, args) = opt_parser.parse_args() | |
36 | + | |
37 | +verbose = options.verbose | |
38 | + | |
39 | +output_dir = options.output_dir | |
40 | +if os.path.isfile(output_dir): | |
41 | + raise OSError("A file with the same name as the desired dir, " \ | |
42 | + "'%s', already exists." % output_dir) | |
43 | +elif not(os.path.isdir(output_dir)): | |
44 | + os.makedirs(output_dir) | |
45 | + | |
46 | + | |
47 | +## Process files | |
48 | + | |
49 | +adwait_tagged_filename = args[0] | |
50 | +test_sentences = file(args[1]) | |
51 | +output_file = open(args[2], "w") | |
52 | + | |
53 | +# Use the gold trees to produce tagged sentences in Adwait's format | |
54 | +# with underscore separator. | |
55 | +# | |
56 | +# Note: any underscores in the tags themselves will be converted to | |
57 | +# +us+ metacharacters. These get unconverted at the end. | |
58 | +#os.system("python %s/python/parse_to_sentence.py -t -f Adwait -s -d %s > %s" | |
59 | +# % (dbparser_dir, tree_filename, adwait_tagged_filename)) | |
60 | + | |
61 | +model_filename = output_dir+"/model.bin.gz" | |
62 | + | |
63 | + | |
64 | +# Make a tag dictionary | |
65 | +tag_dictionary_filename = output_dir+"/tag_dict" | |
66 | +os.system("python %s/python/create_tag_dictionary.py -s _ %s > %s" | |
67 | + % (dbparser_dir, adwait_tagged_filename, tag_dictionary_filename)) | |
68 | + | |
69 | +# Train the tagger | |
70 | +os.system("%s/bin/otk_train_tagger.sh -dict %s %s %s &> /dev/null" | |
71 | + % (otk_dir, tag_dictionary_filename, adwait_tagged_filename, model_filename)) | |
72 | + | |
73 | +sentences_to_tag_filename = output_dir+"/to_tag.txt" | |
74 | + | |
75 | +# Strip off the parens that are used in the input to parser | |
76 | +to_tag_file = open(sentences_to_tag_filename, "w") | |
77 | +counter = 0 | |
78 | +for sentence in test_sentences: | |
79 | + if counter % 5 == 0: | |
80 | + clean = "\t".join([x.split("+")[0] for x in sentence.strip().split("\t")]) | |
81 | + to_tag_file.write(clean+"\n") | |
82 | + counter += 1 | |
83 | +to_tag_file.close() | |
84 | + | |
85 | +tagged_filename = output_dir+"/tagged.txt" | |
86 | + | |
87 | +# Run the tagger | |
88 | +os.system("%s/bin/otk_run_tagger.sh -dict %s -tag_dict %s %s %s > %s" | |
89 | + % (otk_dir, tag_dictionary_filename, tag_dictionary_filename, | |
90 | + sentences_to_tag_filename, model_filename, tagged_filename)) | |
91 | + | |
92 | + | |
93 | +# Convert tagger output to MST format. Unconvert the +us+ | |
94 | +# metachars back to underscores too (using tagging_util.de_metatize()). | |
95 | +for tagged_sent in file(tagged_filename): | |
96 | + words = [] | |
97 | + tags = [] | |
98 | + for word_tag in tagged_sent.split(): | |
99 | + (word,tag) = tagging_util.split_item(word_tag, "_") | |
100 | + words.append(word) | |
101 | + tags.append(tagging_util.de_metatize(tag,"_","+us+")) | |
102 | + output_file.write("\t".join(words)+"\n") | |
103 | + output_file.write("\t".join(tags)+"\n") | |
104 | + | |
105 | +output_file.close() | |
... | ... |
disambiguation/mstparser/build.sh
0 โ 100644
1 | +#!/bin/sh | |
2 | + | |
3 | +echo | |
4 | +echo "MST Parser Build System" | |
5 | +echo "-------------------" | |
6 | +echo | |
7 | + | |
8 | +if [ "$JAVA_HOME" = "" ] ; then | |
9 | + echo "ERROR: JAVA_HOME not found in your environment." | |
10 | + echo | |
11 | + echo "Please, set the JAVA_HOME variable in your environment to match the" | |
12 | + echo "location of the Java Virtual Machine you want to use." | |
13 | + exit 1 | |
14 | +fi | |
15 | + | |
16 | +if [ `echo $OSTYPE | grep -n cygwin` ]; then | |
17 | + PS=";" | |
18 | +else | |
19 | + PS=":" | |
20 | +fi | |
21 | + | |
22 | +LOCALCLASSPATH=$JAVA_HOME/lib/tools.jar | |
23 | +# add in the dependency .jar files | |
24 | +DIRLIBS=lib/*.jar | |
25 | +for i in ${DIRLIBS} | |
26 | +do | |
27 | + if [ "$i" != "${DIRLIBS}" ] ; then | |
28 | + LOCALCLASSPATH=$LOCALCLASSPATH${PS}"$i" | |
29 | + fi | |
30 | +done | |
31 | +ANT_HOME=./lib | |
32 | + | |
33 | +echo Building with classpath $LOCALCLASSPATH | |
34 | +echo | |
35 | + | |
36 | +echo Starting Ant... | |
37 | +echo | |
38 | + | |
39 | +$JAVA_HOME/bin/java -Dant.home=$ANT_HOME -classpath $LOCALCLASSPATH org.apache.tools.ant.Main $* | |
... | ... |
disambiguation/mstparser/build.xml
0 โ 100644
1 | +<!-- $Id: build.xml 138 2013-09-10 10:02:43Z wyldfire $ --> | |
2 | +<!-- Copyright (C) 2007 Ryan McDonald --> | |
3 | +<project default="compile" basedir="."> | |
4 | + | |
5 | + <!-- =================================================================== --> | |
6 | + <!-- Initialization target --> | |
7 | + <!-- =================================================================== --> | |
8 | + <target name="init"> | |
9 | + <tstamp/> | |
10 | + <property name="Name" value="MSTParser"/> | |
11 | + <property name="name" value="mstparser"/> | |
12 | + <property name="year" value="2013"/> | |
13 | + <property name="version" value="0.5.1"/> | |
14 | + | |
15 | + <echo message="----------- ${Name} ${version} [${year}] ------------"/> | |
16 | + | |
17 | + <property name="debug" value="on"/> | |
18 | + <property name="optimize" value="off"/> | |
19 | + <property name="deprecation" value="on"/> | |
20 | + | |
21 | + <property name="src.dir" value="./src/main/java"/> | |
22 | + <property name="lib.dir" value="./lib"/> | |
23 | + <property name="packages" value="mstparser.*"/> | |
24 | + | |
25 | + <property name="build.dir" value="./output"/> | |
26 | + <property name="build.dest" value="./output/classes"/> | |
27 | + <property name="build.javadocs" value="./docs/api"/> | |
28 | + | |
29 | + <filter token="year" value="${year}"/> | |
30 | + <filter token="version" value="${version}"/> | |
31 | + <filter token="date" value="${TODAY}"/> | |
32 | + <filter token="log" value="true"/> | |
33 | + <filter token="verbose" value="true"/> | |
34 | + | |
35 | + <path id="build.classpath"> | |
36 | + <fileset dir="${lib.dir}/"> | |
37 | + <include name="*.jar"/> | |
38 | + </fileset> | |
39 | + </path> | |
40 | + </target> | |
41 | + | |
42 | + | |
43 | + <!-- =================================================================== --> | |
44 | + <!-- Help on usage --> | |
45 | + <!-- =================================================================== --> | |
46 | + <target name="usage"> | |
47 | + <echo message=""/> | |
48 | + <echo message=""/> | |
49 | + <echo message="MST Parser build file"/> | |
50 | + <echo message="-------------------------------------------------------------"/> | |
51 | + <echo message=""/> | |
52 | + <echo message=" Available targets are:"/> | |
53 | + <echo message=""/> | |
54 | + <echo message=" package --> generates the mstparser.jar file"/> | |
55 | + <echo message=" compile --> compiles the source code (default)"/> | |
56 | + <echo message=" javadoc --> generates the API documentation"/> | |
57 | + <echo message=" clean --> cleans up the compilation directory"/> | |
58 | + <echo message=""/> | |
59 | + <echo message=" See the comments inside the build.xml file for more details."/> | |
60 | + <echo message="-------------------------------------------------------------"/> | |
61 | + <echo message=""/> | |
62 | + <echo message=""/> | |
63 | + </target> | |
64 | + | |
65 | + | |
66 | + <!-- =================================================================== --> | |
67 | + <!-- Prepares the build directories --> | |
68 | + <!-- =================================================================== --> | |
69 | + <target name="prepare" depends="init"> | |
70 | + <!-- create directories --> | |
71 | + <mkdir dir="${build.dir}"/> | |
72 | + <mkdir dir="${build.dest}"/> | |
73 | + </target> | |
74 | + | |
75 | + | |
76 | + <!-- =================================================================== --> | |
77 | + <!-- Compiles the source directory --> | |
78 | + <!-- =================================================================== --> | |
79 | + <target name="compile" | |
80 | + depends="prepare" | |
81 | + description="compiles the source code (default)"> | |
82 | + <javac srcdir="${src.dir}" | |
83 | + destdir="${build.dest}" | |
84 | + debug="${debug}" | |
85 | + deprecation="${deprecation}" | |
86 | + classpathref="build.classpath" | |
87 | + optimize="${optimize}"> | |
88 | + <!-- <compilerarg line="-Xlint:unchecked"/> --> | |
89 | + </javac> | |
90 | + </target> | |
91 | + | |
92 | + | |
93 | + <!-- =================================================================== --> | |
94 | + <!-- Creates the class package --> | |
95 | + <!-- =================================================================== --> | |
96 | + <target name="package" | |
97 | + depends="compile" | |
98 | + description="generates the mstparser.jar file"> | |
99 | + <jar jarfile="${build.dir}/${name}.jar"> | |
100 | + <fileset dir="${build.dest}" includes="**"/> | |
101 | + </jar> | |
102 | + </target> | |
103 | + | |
104 | + | |
105 | + <!-- =================================================================== --> | |
106 | + <!-- Creates the release file --> | |
107 | + <!-- =================================================================== --> | |
108 | + <target name="release" depends="clean,cleandocs"> | |
109 | + <tar tarfile="${name}-${version}-src.tar" | |
110 | + basedir="../" | |
111 | + includes="${name}/**" | |
112 | + excludes="**/CVS **/*forest testbed/my*" /> | |
113 | + <gzip src="${name}-${version}-src.tar" | |
114 | + zipfile="../${name}-${version}-src.tgz" /> | |
115 | + <delete file="${name}-${version}-src.tar" /> | |
116 | + </target> | |
117 | + | |
118 | + <!-- =================================================================== --> | |
119 | + <!-- Creates the homepage --> | |
120 | + <!-- =================================================================== --> | |
121 | + <target name="homepage" | |
122 | + depends="init,javadoc" | |
123 | + description="generates the API documentation"> | |
124 | + <tar tarfile="${name}-homepage.tar" | |
125 | + basedir="./docs/" | |
126 | + includes="**" | |
127 | + excludes="**/CVS" /> | |
128 | + <gzip src="${name}-homepage.tar" | |
129 | + zipfile="${build.dir}/${name}-homepage.tgz" /> | |
130 | + <delete file="${name}-homepage.tar" /> | |
131 | + </target> | |
132 | + | |
133 | + | |
134 | + <!-- =================================================================== --> | |
135 | + <!-- Creates the API documentation --> | |
136 | + <!-- =================================================================== --> | |
137 | + <target name="javadoc" depends="prepare"> | |
138 | + <mkdir dir="${build.javadocs}"/> | |
139 | + <javadoc packagenames="${packages}" | |
140 | + sourcepath="${src.dir}" | |
141 | + destdir="${build.javadocs}" | |
142 | + author="true" | |
143 | + version="true" | |
144 | + use="true" | |
145 | + splitindex="true" | |
146 | + noindex="false" | |
147 | + windowtitle="${name}" | |
148 | + doctitle="The ${Name} API v${version}" | |
149 | + bottom="Copyright © ${year} Ryan McDonald and Jason Baldridge. All Rights Reserved." | |
150 | + /> | |
151 | + </target> | |
152 | + | |
153 | + | |
154 | + <!-- =================================================================== --> | |
155 | + <!-- Cleans targets --> | |
156 | + <!-- =================================================================== --> | |
157 | + <target name="clean" | |
158 | + depends="init" | |
159 | + description="cleans up the directory"> | |
160 | + <delete dir="${build.dir}"/> | |
161 | + <delete file="${lib.dir}/${name}.jar" /> | |
162 | + </target> | |
163 | + | |
164 | + <target name="cleandocs" depends="init" description="cleans up the API docs directory"> | |
165 | + <delete dir="${build.javadocs}"/> | |
166 | + </target> | |
167 | + | |
168 | +</project> | |
169 | + | |
170 | +<!-- End of file --> | |
... | ... |