Commit a5c6bd60568948f147fa18d4b06ced8431f08dce

Authored by adamm
1 parent 60c228bb

refactoring

Showing 71 changed files with 31203 additions and 169 deletions

Too many changes to show.

To preserve performance, only 21 of 71 files are displayed.

disambiguation/ENIAM_EdgeScore.ml deleted
1   -open Xstd
2   -open ENIAM_LCGtypes
3   -open Yojson
4   -
5   -module MST_Model : sig
6   - type mst_model = {
7   - typeAlphabet: int StringMap.t;
8   - dataAlphabet: int StringMap.t;
9   - parameters: float array}
10   - val read_model: string -> mst_model
11   - val empty: mst_model
12   - exception MalformedModelJson
13   -end
14   -= struct
15   - type mst_model = {
16   - typeAlphabet: int StringMap.t;
17   - dataAlphabet: int StringMap.t;
18   - parameters: float array}
19   -
20   - let empty = {typeAlphabet = StringMap.empty;
21   - dataAlphabet = StringMap.empty;
22   - parameters = Array.make 0 0.0}
23   - exception MalformedModelJson
24   -
25   - let construct_data_alphabet keys =
26   - let counter = ref 0 in
27   - let map = ref StringMap.empty in
28   - let length = Array.length keys in
29   - for i = 0 to length -1 do
30   - map := StringMap.add !map keys.(i) !counter;
31   - counter := !counter + 1;
32   - done;
33   - !map
34   -
35   - let construct_type_alphabet = construct_data_alphabet
36   -
37   - let read_model fname =
38   - let data = Basic.from_file fname in
39   - let open Yojson.Basic.Util in
40   - let unwrapList = function
41   - `List l -> l
42   - | _ -> raise MalformedModelJson in
43   - let dataA = data |> member "dataAlphabet" |> unwrapList |> filter_string in
44   - let typeA = data |> member "typeAlphabet" |> unwrapList |> filter_string in
45   - let params = data |> member "parameters" |> unwrapList |> filter_float in
46   - {typeAlphabet = Array.of_list typeA |> construct_type_alphabet;
47   - dataAlphabet = Array.of_list dataA |> construct_data_alphabet;
48   - parameters = Array.of_list params}
49   -end
50   -open MST_Model
51   -
52   -let model = ref MST_Model.empty
53   -
54   -let initialize () =
55   - model := MST_Model.read_model "dep.model.json";
56   - ()
57   -
58   -exception UnsupportedLinearTerm of linear_term
59   -exception EmptyVariant
60   -
61   -let add_feature str (fv: IntSet.t) =
62   - if StringMap.mem !model.dataAlphabet str then
63   - IntSet.add fv (StringMap.find !model.dataAlphabet str)
64   - else
65   - fv
66   -
67   -let score_fv (fv:IntSet.t) =
68   - IntSet.fold fv 0.0 (fun score i -> score +. !model.parameters.(i))
69   -
70   -let apply_features features fv =
71   - List.fold_left (|>) fv features
72   -
73   -let add_linear_features f_type (obs: string array) first second distStr fv =
74   - fv
75   -
76   -let add_two_obs_features prefix item1F1 item1F2 item2F1 item2F2 distStr fv =
77   - let add_diststr str = [str; str^"*"^distStr] in
78   - let flist = List.map ((^) prefix)[
79   - "2FF1="^item1F1;
80   - "2FF1="^item1F1^" "^item1F2;
81   - "2FF1="^item1F1^" "^item1F2^" "^item2F2;
82   - "2FF1="^item1F1^" "^item1F2^" "^item2F2^" "^item2F1;
83   - "2FF2="^item1F1^" "^item2F1;
84   - "2FF3="^item1F1^" "^item2F2;
85   - "2FF4="^item1F2^" "^item2F1^" "^item2F2;
86   - "2FF5="^item1F2^" "^item2F2;
87   - "2FF6="^item2F1^" "^item2F2;
88   - "2FF7="^item1F2;
89   - "2FF8="^item2F1;
90   - "2FF9="^item2F2;
91   - ] in
92   - let funs = List.map (add_feature) (List.flatten (List.map add_diststr flist)) in
93   - apply_features funs fv
94   -
95   -type disamb_info = {
96   - tree: linear_term array
97   -}
98   -
99   -let score_edge (data: disamb_info) (parent: node) (child: node) =
100   - let fv = IntSet.empty in
101   - let fv = add_two_obs_features "HC"
102   - parent.orth parent.pos child.orth child.pos "" fv in
103   - score_fv fv
104   -
105   -let rec fill_dep_edges_array
106   - (data: disamb_info) parent (scores: float IntMap.t) =
107   - function
108   - Dot -> scores
109   - | Ref i -> (match data.tree.(i) with
110   - Node child -> IntMap.add scores i (score_edge data parent child)
111   - | _ as x -> raise (UnsupportedLinearTerm x))
112   - | Tuple l -> List.fold_left (fill_dep_edges_array data parent) scores l
113   - | Variant (_, l) -> List.fold_left
114   - (fill_dep_edges_array data parent)
115   - scores (List.map snd l)
116   - | _ as x -> raise (UnsupportedLinearTerm x)
117   -
118   -let rec disambiguate_args edge_scores =
119   - function
120   - Dot -> Dot, 0.0
121   - | Ref i -> Ref i, IntMap.find edge_scores i
122   - | Tuple l ->
123   - let (terms, scores) =
124   - List.map (disambiguate_args edge_scores) l |> List.split in
125   - let num = List.length scores |> float_of_int in
126   - Tuple terms, (List.fold_left (+.) 0.0 scores) /. num
127   - | Variant (lab, l) ->
128   - let (lbs, terms) = List.split l in
129   - let new_terms_scores = List.map (disambiguate_args edge_scores) terms in
130   - let select_best (term, score) (new_term, new_score) =
131   - if new_score > score then
132   - new_term, new_score
133   - else
134   - term, score in
135   - List.fold_left select_best (List.hd new_terms_scores) (List.tl new_terms_scores)
136   - | _ as x -> raise (UnsupportedLinearTerm x)
137   -
138   -(* disambiguation of a single node's arguments using a greedy algorithm *)
139   -let disambiguate_node (data: disamb_info) parentI =
140   - let parent = match data.tree.(parentI) with
141   - Node node -> node
142   - | _ as x -> raise (UnsupportedLinearTerm x) in
143   - let edge_scores = fill_dep_edges_array
144   - data parent IntMap.empty (parent.args) in
145   - let (new_term, _) = disambiguate_args edge_scores (parent.args) in
146   - Node {parent with args = new_term}
147   -
148   -let disambiguate_tree tree =
149   - let tree2 = Array.copy tree in
150   - let data : disamb_info = {tree = tree} in
151   - let update parentI _ =
152   - (let new_term = disambiguate_node data parentI in
153   - tree2.(parentI) <- new_term;) in
154   - Array.iteri update tree; tree2
disambiguation/ENIAMmstDisambiguation.ml 0 → 100644
  1 +open Xstd
  2 +open ENIAM_LCGtypes
  3 +open ENIAMmstModel
  4 +open ENIAMmstFeatures
  5 +
  6 +let initialize () =
  7 + MST_Model.initialize "dep.model.json";
  8 + ()
  9 +
  10 +exception UnsupportedLinearTerm of linear_term
  11 +exception EmptyVariant
  12 +
  13 +let rec fill_dep_edges_array
  14 + (data: disamb_info) parent (scores: float IntMap.t) =
  15 + function
  16 + Dot -> scores
  17 + | Ref i -> IntMap.add scores i (score_edge data parent data.tree.(i))
  18 + | Tuple l -> List.fold_left (fill_dep_edges_array data parent) scores l
  19 + | Variant (_, l) -> List.fold_left
  20 + (fill_dep_edges_array data parent)
  21 + scores (List.map snd l)
  22 + | _ as x -> raise (UnsupportedLinearTerm x)
  23 +
  24 +let rec disambiguate_args edge_scores =
  25 + function
  26 + Dot -> Dot, 0.0
  27 + | Ref i -> Ref i, IntMap.find edge_scores i
  28 + | Tuple l ->
  29 + let (terms, scores) =
  30 + List.map (disambiguate_args edge_scores) l |> List.split in
  31 + let num = List.length scores |> float_of_int in
  32 + Tuple terms, (List.fold_left (+.) 0.0 scores) /. num
  33 + | Variant (lab, l) ->
  34 + let (lbs, terms) = List.split l in
  35 + let new_terms_scores = List.map (disambiguate_args edge_scores) terms in
  36 + let select_best (term, score) (new_term, new_score) =
  37 + if new_score >= score then
  38 + new_term, new_score
  39 + else
  40 + term, score in
  41 + List.fold_left select_best (List.hd new_terms_scores) (List.tl new_terms_scores)
  42 + | _ as x -> raise (UnsupportedLinearTerm x)
  43 +
  44 +(* disambiguation of a single node's arguments using a greedy algorithm *)
  45 +let disambiguate_node (data: disamb_info) parent =
  46 + let edge_scores = fill_dep_edges_array
  47 + data parent IntMap.empty (parent.args) in
  48 + let (new_term, _) = disambiguate_args edge_scores (parent.args) in
  49 + {parent with args = new_term}
  50 +
  51 +let disambiguate_tree (tree: linear_term array) =
  52 + let extract_node = (function
  53 + Node node -> node
  54 + | _ as x -> UnsupportedLinearTerm x |> raise) in
  55 + let data : disamb_info = {tree = Array.map extract_node tree} in
  56 + let disambiguate term = Node (extract_node term |> disambiguate_node data) in
  57 + Array.map disambiguate tree
... ...
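
A minimal driver sketch for the new module (hypothetical code, not part of
this commit; it assumes the ENIAM libraries above are installed and that
dep.model.json sits in the working directory). Note the greedy step:
disambiguate_args averages scores over Tuple components and keeps only the
best-scoring alternative of each Variant.

    (* Hypothetical usage sketch for ENIAMmstDisambiguation. *)
    open ENIAM_LCGtypes

    let () =
      (* Loads dep.model.json, the path hard-coded in initialize above. *)
      ENIAMmstDisambiguation.initialize ();
      (* In practice the tree comes from the LCG parser; an empty tree
         keeps the sketch self-contained. *)
      let tree : linear_term array = [||] in
      let _disambiguated = ENIAMmstDisambiguation.disambiguate_tree tree in
      ()
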
disambiguation/ENIAMmstFeatures.ml 0 → 100644
  1 +open ENIAMmstModel
  2 +open ENIAM_LCGtypes
  3 +
  4 +type disamb_info = {
  5 + tree: node array
  6 +}
  7 +
  8 +let apply_features features fv =
  9 + List.fold_left (|>) fv features
  10 +
  11 +let add_linear_features f_type (obs: string array) first second distStr fv =
  12 + fv
  13 +
  14 +let add_two_obs_features prefix item1F1 item1F2 item2F1 item2F2 distStr fv =
  15 + let add_diststr str = [str; str^"*"^distStr] in
  16 + let flist = List.map ((^) prefix)[
  17 + "2FF1="^item1F1;
  18 + "2FF1="^item1F1^" "^item1F2;
  19 + "2FF1="^item1F1^" "^item1F2^" "^item2F2;
  20 + "2FF1="^item1F1^" "^item1F2^" "^item2F2^" "^item2F1;
  21 + "2FF2="^item1F1^" "^item2F1;
  22 + "2FF3="^item1F1^" "^item2F2;
  23 + "2FF4="^item1F2^" "^item2F1^" "^item2F2;
  24 + "2FF5="^item1F2^" "^item2F2;
  25 + "2FF6="^item2F1^" "^item2F2;
  26 + "2FF7="^item1F2;
  27 + "2FF8="^item2F1;
  28 + "2FF9="^item2F2;
  29 + ] in
  30 + let funs = List.map (MST_Model.add_feature) (List.flatten (List.map add_diststr flist)) in
  31 + apply_features funs fv
  32 +
  33 +let score_edge (data: disamb_info) (parent: node) (child: node) =
  34 + let fv = MST_Model.empty_fv in
  35 + let fv = add_two_obs_features "HC"
  36 + parent.orth parent.pos child.orth child.pos "" fv in
  37 + MST_Model.score_fv fv
... ...
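
To make the feature templates concrete, here is a sketch (hypothetical
head/child values) of the strings add_two_obs_features produces under the
"HC" prefix used by score_edge; add_diststr additionally emits each template
with a "*" ^ distStr suffix.

    (* For parent orth="ma", pos="fin" and child orth="kota", pos="subst"
       (hypothetical inputs), the 2FF1/2FF2/2FF9 templates expand to: *)
    let _example_features = [
      "HC2FF1=ma";                (* head orth *)
      "HC2FF1=ma fin";            (* + head pos *)
      "HC2FF1=ma fin subst";      (* + child pos *)
      "HC2FF1=ma fin subst kota"; (* + child orth *)
      "HC2FF2=ma kota";           (* head orth, child orth *)
      "HC2FF9=subst";             (* child pos alone *)
    ] (* 2FF3..2FF8 follow the same pattern. *)
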
disambiguation/ENIAMmstModel.ml 0 → 100644
  1 +open Yojson
  2 +open Xstd
  3 +
  4 +
  5 +module MST_Model : sig
  6 + type mst_model
  7 + type feature_vector_t
  8 + exception MalformedModelJson
  9 +
  10 + val read_model: string -> mst_model
  11 + val initialize: string -> unit
  12 + val add_feature: string -> feature_vector_t -> feature_vector_t
  13 + val score_fv: feature_vector_t -> float
  14 + val empty_fv: feature_vector_t
  15 +end
  16 += struct
  17 + type feature_vector_t = IntSet.t
  18 +
  19 + type mst_model = {
  20 + typeAlphabet: int StringMap.t;
  21 + dataAlphabet: int StringMap.t;
  22 + parameters: float array}
  23 +
  24 +
  25 + exception MalformedModelJson
  26 +
  27 + let model = ref {typeAlphabet = StringMap.empty;
  28 + dataAlphabet = StringMap.empty;
  29 + parameters = Array.make 0 0.0}
  30 +
  31 + let empty_fv = IntSet.empty
  32 +
  33 + let add_feature str (fv: feature_vector_t) =
  34 + if StringMap.mem !model.dataAlphabet str then
  35 + IntSet.add fv (StringMap.find !model.dataAlphabet str)
  36 + else
  37 + fv
  38 +
  39 + let score_fv (fv: feature_vector_t) =
  40 + IntSet.fold fv 0.0 (fun score i -> score +. !model.parameters.(i))
  41 +
  42 + let construct_data_alphabet keys =
  43 + let counter = ref 0 in
  44 + let map = ref StringMap.empty in
  45 + let length = Array.length keys in
  46 + for i = 0 to length -1 do
  47 + map := StringMap.add !map keys.(i) !counter;
  48 + counter := !counter + 1;
  49 + done;
  50 + !map
  51 +
  52 + let construct_type_alphabet = construct_data_alphabet
  53 +
  54 + let read_model fname =
  55 + let data = Basic.from_file fname in
  56 + try
  57 + let open Yojson.Basic.Util in
  58 + let unwrapList = function
  59 + `List l -> l
  60 + | _ -> raise MalformedModelJson in
  61 + let dataA = data |> member "dataAlphabet" |> unwrapList |> filter_string in
  62 + let typeA = data |> member "typeAlphabet" |> unwrapList |> filter_string in
  63 + let params = data |> member "parameters" |> unwrapList |> filter_float in
  64 + {typeAlphabet = Array.of_list typeA |> construct_type_alphabet;
  65 + dataAlphabet = Array.of_list dataA |> construct_data_alphabet;
  66 + parameters = Array.of_list params}
  67 + with
  68 + _ -> raise MalformedModelJson
  69 +
  70 + let initialize fname =
  71 + model := read_model fname;
  72 + ()
  73 +end
... ...
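
read_model expects a JSON object with string lists "dataAlphabet" and
"typeAlphabet" plus a float list "parameters"; construct_data_alphabet
numbers each string by its position, so score_fv is simply the sum of the
weights of the alphabet features present in the vector. A round-trip sketch
(hypothetical file name and contents):

    open ENIAMmstModel

    (* tiny.model.json (hypothetical):
       {"dataAlphabet": ["HC2FF1=ma", "HC2FF7=fin"],
        "typeAlphabet": [],
        "parameters": [0.5, -0.25]} *)
    let _score_demo () =
      MST_Model.initialize "tiny.model.json";
      MST_Model.empty_fv
      |> MST_Model.add_feature "HC2FF1=ma" (* known: index 0, weight 0.5 *)
      |> MST_Model.add_feature "unseen=x"  (* not in dataAlphabet: dropped *)
      |> MST_Model.score_fv                (* evaluates to 0.5 *)
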
disambiguation/makefile
... ... @@ -6,27 +6,27 @@ OCAMLFLAGS=$(INCLUDES) -g
6 6 OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-lcg-parser.cmxa yojson.cmx
7 7 INSTALLDIR=`ocamlc -where`/eniam
8 8  
9   -SOURCES= ENIAM_EdgeScore.ml
  9 +SOURCES=ENIAMmstModel.ml ENIAMmstFeatures.ml ENIAMmstDisambiguation.ml
10 10  
11   -all: eniam-edge-score.cma eniam-edge-score.cmxa
  11 +all: eniam-mst-disambiguation.cma eniam-mst-disambiguation.cmxa
12 12  
13 13 install: all
14 14 mkdir -p $(INSTALLDIR)
15   - cp eniam-edge-score.cmxa eniam-edge-score.a eniam-edge-score.cma $(INSTALLDIR)
16   - cp ENIAM_EdgeScore.cmi $(INSTALLDIR)
17   - cp ENIAM_EdgeScore.cmx $(INSTALLDIR)
  15 + cp eniam-mst-disambiguation.cmxa eniam-mst-disambiguation.a eniam-mst-disambiguation.cma $(INSTALLDIR)
  16 + cp ENIAMmstDisambiguation.cmi ENIAMmstModel.cmi ENIAMmstFeatures.cmi $(INSTALLDIR)
  17 + cp ENIAMmstDisambiguation.cmx ENIAMmstModel.cmx ENIAMmstFeatures.cmx $(INSTALLDIR)
18 18  
19 19 install-local: all
20 20 mkdir -p $(INSTALLDIR)
21   - cp eniam-edge-score.cmxa eniam-edge-score.a eniam-edge-score.cma $(INSTALLDIR)
22   - cp ENIAM_EdgeScore.cmi $(INSTALLDIR)
23   - cp ENIAM_EdgeScore.cmx $(INSTALLDIR)
  21 + cp eniam-mst-disambiguation.cmxa eniam-mst-disambiguation.a eniam-mst-disambiguation.cma $(INSTALLDIR)
  22 + cp ENIAMmstDisambiguation.cmi ENIAMmstModel.cmi ENIAMmstFeatures.cmi $(INSTALLDIR)
  23 + cp ENIAMmstDisambiguation.cmx ENIAMmstModel.cmx ENIAMmstFeatures.cmx $(INSTALLDIR)
24 24  
25   -eniam-edge-score.cma: $(SOURCES)
26   - ocamlc -linkall -a -o eniam-edge-score.cma $(OCAMLFLAGS) $^
  25 +eniam-mst-disambiguation.cma: $(SOURCES)
  26 + ocamlc -linkall -a -o eniam-mst-disambiguation.cma $(OCAMLFLAGS) $^
27 27  
28   -eniam-edge-score.cmxa: $(SOURCES)
29   - ocamlopt -linkall -a -o eniam-edge-score.cmxa $(INCLUDES) $^
  28 +eniam-mst-disambiguation.cmxa: $(SOURCES)
  29 + ocamlopt -linkall -a -o eniam-mst-disambiguation.cmxa $(INCLUDES) $^
30 30  
31 31 test: test.ml
32 32 mkdir -p results
... ...
disambiguation/mstparser/.gitignore 0 → 100644
  1 +.idea/
  2 +out/
  3 +*.model
  4 +*.iml
  5 +*.json
... ...
disambiguation/mstparser/ALT_README 0 → 100644
  1 +Introduction
  2 +============
  3 +
  4 +This file contains the configuration and build instructions for using
  5 +Apache Ant (http://ant.apache.org) to build MSTParser, and for setting
  6 +up your environment to use the scripts in the mstparser/bin
  7 +directory. All the instructions in the original README file should
  8 +continue to work as before -- this document describes an optional way
  9 +of compiling and using MSTParser with a few more bells and whistles.
  10 +
  11 +
  12 +Configuring your environment variables
  13 +======================================
  14 +
  15 +The easiest thing to do is to set the environment variables JAVA_HOME
  16 +and MSTPARSER_DIR to the relevant locations on your system. Set JAVA_HOME
  17 +to match the top level directory containing the Java installation you
  18 +want to use. Note that version 1.5 of the Java 2 SDK is required.
  19 +
  20 +For example, on Windows:
  21 +
  22 +C:\> set JAVA_HOME=C:\jdk1.5.0_04
  23 +
  24 +or on Unix:
  25 +
  26 +% setenv JAVA_HOME /usr/local/java
  27 + (csh)
  28 +> JAVA_HOME=/usr/java; export JAVA_HOME
  29 + (ksh, bash)
  30 +
  31 +On Windows, to get these settings to persist, it's actually easiest to
  32 +set your environment variables through the System Properties from the
  33 +Control Panel. For example, under WinXP, go to Control Panel, click on
  34 +System Properties, choose the Advanced tab, click on Environment
  35 +Variables, and add your settings in the User variables area.
  36 +
  37 +Next, likewise set MSTPARSER_DIR to be the top level directory where you
  38 +unzipped the download. In Unix, type 'pwd' in the directory where
  39 +this file is and use the path given to you by the shell as
  40 +MSTPARSER_DIR. You can set this in the same manner as for JAVA_HOME
  41 +above.
  42 +
  43 +Next, add the directory MSTPARSER_DIR/bin to your path. For example, you
  44 +can set the path in your .bashrc file as follows:
  45 +
  46 +export PATH=$PATH:$MSTPARSER_DIR/bin
  47 +
  48 +Once you have taken care of these three things, you should be able to
  49 +build and use MSTParser.
  50 +
  51 +
  52 +Building the system
  53 +===================
  54 +
  55 +The MSTParser build system is based on Apache Ant.
  56 +Ant is a little but very handy tool that uses a build file written in
  57 +XML (build.xml) as building instructions.
  58 +
  59 +To build the code, first make sure your current working
  60 +directory is where the build.xml file is located. Then type:
  61 +
  62 + sh build.sh (Unix)
  63 +
  64 +If everything is right and all the required packages are visible, this
  65 +action will generate a file called mstparser.jar in the ./output
  66 +directory, and Java class files in ./output/classes.
  67 +
  68 +
  69 +Build targets
  70 +=============
  71 +
  72 +These are the meaningful targets for the main build file:
  73 +
  74 + package --> generates the openccg.jar file (default)
  75 + compile --> compiles the source code
  76 + javadoc --> generates the API documentation
  77 + clean --> cleans up the compilation directory
  78 +
  79 +There are also build files in each sample grammar directory.
  80 +
  81 +To learn the details of what each target does, read the build.xml file.
  82 +It is quite understandable.
  83 +
  84 +
  85 +Trying it out
  86 +=============
  87 +
  88 +If you've managed to configure and build the system, you should be
  89 +able to run mstparser as described in the README, but without some of
  90 +the extra classpath and memory options, and you should be able to do
  91 +so from anywhere on your directory system.
  92 +
  93 +If you have trouble starting up any of the scripts, make sure you have set
  94 +the environment variables properly, and that the scripts (located in
  95 +mstparser/bin) call the right shell environment (top-line of the
  96 +script; to solve the problem, either comment out this line or correct
  97 +the path).
  98 +
  99 +Here's a brief description of some of the scripts:
  100 +
  101 +1. The shell script mst_parse.sh is just a simple wrapper that allows
  102 +you to do this:
  103 +
  104 +> mst_parse.sh \
  105 + train train-file:data/train.ulab model-name:dep.model \
  106 + test test-file:data/test.ulab output-file:out.txt \
  107 + eval gold-file:data/test.ulab
  108 +
  109 +instead of this (as described in the readme):
  110 +
  111 +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \
  112 + train train-file:data/train.ulab model-name:dep.model \
  113 + test test-file:data/test.ulab output-file:out.txt \
  114 + eval gold-file:data/test.ulab
  115 +
  116 +
  117 +[NOTE: actually, if you want to run MSTParser the latter way and you
  118 +build MSTParser using Ant (via build.sh), then your Java class files
  119 +will be contained in ./output/classes rather than ./mstparser, so the
  120 +classpath would need to be "./output/classes:lib/trove.jar". The
  121 +mst_parse.sh script takes care of this and sets the classpath
  122 +appropriately.]
  123 +
  124 +2. The shell script mst_score.sh is just an easy way to call upon the
  125 +main method of the mstparser.DependencyEvaluator class. Call it as
  126 +such:
  127 +
  128 +> mst_score.sh <gold_standard_dependency_file> <parser_output_dependency_file> <format>
  129 +
  130 +where <format> is either MST or CONLL (default CONLL). Here's a concrete example:
  131 +
  132 +> mst_score.sh data/portuguese/floresta_test.conll testbed/my_floresta_parses.conll CONLL
  133 +
  134 +
  135 +See the following site for more information on the CONLL format (as
  136 +well as other dependency parsers and info, etc):
  137 +
  138 +http://nextens.uvt.nl/~conll/
  139 +
  140 +3. The Python program mst_experiment.py is a more involved wrapper that
  141 +allows one to easily do a randomized ten-fold cross-validation while improving
  142 +performance on development data. It manages the various files that
  143 +MSTParser produces and keeps them tightly contained into a single
  144 +output directory. It also has hooks for using a part-of-speech tagger
  145 +(see pos_tag.py, which calls on the OpenNLP POS Tagger and assumes it
  146 +is installed).
  147 +
  148 +You can see the options by running:
  149 +
  150 +> mst_experiment.py --help
  151 +
  152 +Here's an example to do an 8-fold cross-validation using the
  153 +non-projective algorithm:
  154 +
  155 +> mst_experiment.py -f 8 -o experiment1 -d non-proj data/train.lab
  156 +
  157 +If you find it useful, that's great -- but before diving into it, you
  158 +should be aware that you may have to hack the Python a bit for your
  159 +own needs. It is actually put together from a program previously used to
  160 +interface with the Bikel parser, so there may be some extraneous
  161 +options and code hanging around.
  162 +
  163 +Note: Do NOT ask Ryan for any help with this Python script. Direct any
  164 +questions to Jason Baldridge instead (see email below), and even then
  165 +don't count on a rapid response.
  166 +
  167 +4. The Python script pos_tag.py calls on an unreleased version of the
  168 +OpenNLP tagger, and is left here only as an example that might help
  169 +you develop a similar script for other taggers. If you want the
  170 +OpenNLP tagger, let Jason know and he will consider packaging it up
  171 +with the parser more cleanly.
  172 +
  173 +5. The Python script mst2conll.py can be used to convert your existing
  174 +MST format files to CONLL format. The script conll2mst.py converts
  175 +CONLL formatted files into MST format.
  176 +
  177 +6. The Python script create_baseline.py creates a right or left
  178 +linking baseline. Default is to create left linking -- use the option
  179 +-r for right linking. Currently, it uses MST format for input and
  180 +output, so you'll need to do some conversion if you have dependency
  181 +files in CONLL format. (This script was adapted from one written by
  182 +Ben Wing.)
  183 +
  184 +
  185 +
  186 +Bug Reports
  187 +===========
  188 +
  189 +See the original README for bug reporting for the system
  190 +itself. Report problems with the Ant build setup, the Python scripts,
  191 +or these instructions to Jason Baldridge (jasonbaldridge@gmail.com).
  192 +
  193 +Also note: if you use Windows and are having problems, you are on your
  194 +own.
  195 +
  196 +
  197 +Special Note
  198 +============
  199 +
  200 +Parts of these instructions and some of the directory structure are
  201 +based on the OpenCCG (openccg.sf.net) project and the JDOM project
  202 +(www.jdom.org).
  203 +
  204 +
... ...
disambiguation/mstparser/ALT_TESTBED 0 → 100644
  1 +=NOTE: This file is for developers -- don't let it confuse you if you are
  2 +just giving MSTParser a spin. Check out README and ALT_README instead.
  3 +
  4 +To test that changes to the code have not messed up previous results,
  5 +do the following.
  6 +
  7 +
  8 +---------------------------------------------------------------------
  9 +1. Parse English in MST format:
  10 +
  11 +Run the parser as such:
  12 +
  13 +> mst_parse.sh format:MST train train-file:data/train.lab model-name:testbed/model test test-file:data/test.lab output-file:testbed/my_english_parses.mst eval gold-file:data/test.lab
  14 +
  15 +Score the results:
  16 +
  17 +> mst_score.sh data/test.lab testbed/my_english_parses.mst MST > testbed/my_english_score.txt
  18 +
  19 +Then compare "english_parses.mst" to "my_english_parses.mst" and "english_score.txt" to
  20 +"my_english_score.txt" -- they should be the same. (diff them)
  21 +
  22 +
  23 +---------------------------------------------------------------------
  24 +2. Parse Portuguese in CONLL format:
  25 +
  26 +> mst_parse.sh format:CONLL train train-file:data/portuguese/floresta_train.conll model-name:testbed/model test test-file:data/portuguese/floresta_test.conll output-file:testbed/my_floresta_parses.conll eval gold-file:data/portuguese/floresta_test.conll
  27 +
  28 +Score the results:
  29 +
  30 +> mst_score.sh data/portuguese/floresta_test.conll testbed/my_floresta_parses.conll CONLL > testbed/my_floresta_score.txt
  31 +
  32 +Compare as with English on the obvious file names.
  33 +
  34 +
  35 +---------------------------------------------------------------------
  36 +3. Parse English with second order model.
  37 +
  38 +Run the parser as such:
  39 +
  40 +> mst_parse.sh format:MST train train-file:data/train.lab model-name:testbed/model test test-file:data/test.lab output-file:testbed/my_english_parses_order2.mst eval gold-file:data/test.lab order:2
  41 +
  42 +Score the results:
  43 +
  44 +> mst_score.sh data/test.lab testbed/my_english_parses_order2.mst MST > testbed/my_english_score_order2.txt
  45 +
  46 +Compare with english_score_order2.txt.
  47 +
  48 +---------------------------------------------------------------------
  49 +4. Parse Portuguese in CONLL format with second order model:
  50 +
  51 +> mst_parse.sh train train-file:data/portuguese/floresta_train.conll test test-file:data/portuguese/floresta_test.conll output-file:out.txt eval gold-file:data/portuguese/floresta_test.conll order:2 decode-type:non-proj
  52 +
... ...
disambiguation/mstparser/CHANGES 0 → 100644
  1 +-----------------------------------------------------------------------
  2 +v0.5.1
  3 +
  4 +- Issue 10 - loadModel() method from DependencyParser should also be
  5 + able to receive an InputStream
  6 +- Issue 9 - Add a method to DependencyParser which returns the Parse
  7 + Trees
  8 +- Issue 7 - Update source folder at ant script
  9 +- Issue 6 - Change visibility of some methods and attributes to
  10 + facilitate wrapping
  11 +- Issue 2 - Convert project to maven
  12 +
  13 +-----------------------------------------------------------------------
  14 +v0.5.0
  15 +
  16 + UNKNOWN
  17 +
  18 +-----------------------------------------------------------------------
  19 +v0.4.3b
  20 +
  21 +- Fixed bug: DependencyInstance serialization was not handling the
  22 + feats. This caused errors when using the non-projective decoder with
  23 + second order. (JMB 4-APR-07)
  24 +
  25 +-----------------------------------------------------------------------
  26 +v0.4.3
  27 +
  28 +- Forest files are created in the tmp directory. Without this, two
  29 + instances of MSTParser being run on the same data set would
  30 + overwrite each other's feature forest files. Also, the forest files
  31 + created in tmp are deleted when the Java VM exits. (JMB, 21-JAN-07).
  32 +
  33 +- Separated out the standard sentential parsing features from extra
  34 + features used for discourse parsing. (JMB, 23-MAR-07)
  35 +
  36 +- Created ParserOptions so that it is easier to pass various options
  37 + between the parser and the pipes. (JMB, 23-MAR-07)
  38 +
  39 +- Fixed bug in serialization of DependencyInstances -- lemmas were not
  40 + being written out, and this caused the 2nd order stuff to
  41 + crash. (JMB 23-MAR-07)
  42 +
  43 +
  44 +-----------------------------------------------------------------------
  45 +v0.4.2
  46 +
  47 +- Results have improved slightly over previous testbed results. This
  48 + may be due to the fact that FeatureVector.dotProduct would have got
  49 + -1 return values on keys not held in the TIntDoubleHashMap for the
  50 + second vector in the previous version of Trove. Now that Trove
  51 + returns 0, this is actually the right behavior in this case. Another
  52 + possible explanation is that there is some minor change in the
  53 + features which are generated. Since the output has changed so
  54 + little, and for the better, I'll leave it at that for now. The
  55 + testbed results and output have been updated to reflect the current
  56 + version. (JMB, 17-JAN-07)
  57 +
  58 +- Uncommented a line in DependencyPipe that removed some features from
  59 + the parsing models in the previous release. (Need to come up with a
  60 + better way of defining different pipes!) (JMB, 17-JAN-07)
  61 +
  62 +- Changed the FeatureVector implementation to be a TLinkedList of
  63 + Feature objects, with two optional sub-FeatureVectors contained
  64 + within. This supports fast concatenation of two FeatureVectors since
  65 + it is no longer necessary to copy entire lists. Also, rather than
  66 + explicitly negating features for the getDistVector() method, a
  67 + boolean value is set that can optionally indicate the second
  68 + sub-FeatureVector as negated. The logic of the other methods then
  69 + preserves the negation (and negation with negation). Again, this
  70 + means we don't have to make copies for this operation. These changes
  71 + led sped up training by a factor of 2 to 4 (depedending on the
  72 + number of features used in the parsing model) and parsing by up to
  73 + 1.5 times. (JMB, 17-JAN-07)
  74 +
  75 +- Updated to Trove v1.1b5. Changed default return value of
  76 + TObjectIntHashMap to be -1 rather than 0, so it is important to use
  77 + the included trove.jar rather than downloading and using one from
  78 + the Trove project. (Note: I tried to update to v2.0a2, but the test
  79 + suites broke with that version. Attempts to sort out the problem
  80 + were unsuccessful, so V1.1b5 will just have to do for now.) (JMB,
  81 + 16-JAN-07)
  82 +
  83 +- Removed addIfNotPresent boolean from lookupIndex in Alphabet since
  84 + it isn't used in MSTParser and it incurs an extra method call and
  85 + boolean check on a very common method. (JMB, 16-JAN-07)
  86 +
  87 +- Added support for relational features, which hold between two
  88 + utterances. These features are defined as an NxN matrix (N=number of
  89 + parsing units) below the main CoNLL format declarations. This is
  90 + mainly introduced for discourse parsing to allow for features like
  91 + whether two parsing units are in the same sentence or paragraph, or
  92 + if they both contain references to the same entity. It can be
  93 + ignored for sentence parsing -- everything continues to work as
  94 + before. (The distance between two units is an example of such a
  95 + feature in sentence parsing, but this can be computed on the fly, so
  96 + it isn't necessary to use such a matrix.) (JMB, 14-JAN-07)
  97 +
  98 +
  99 +-----------------------------------------------------------------------
  100 +v0.4.0
  101 +
  102 +- Cleaned up Pipes considerably; eg, Pipe2O doesn't replicate so much
  103 + code from Pipe. Many of the createFeatureVector methods were renamed
  104 + to things like addCoreFeatures. (JMB)
  105 +
  106 +- If one uses MST format, the creation of posA and the
  107 + 5-character-substring features now are put into dependency instances
  108 + in MSTReader as the coarse POS tags and lemmas, respectively. Then
  109 + in the feature extraction code, rather than creating posA etc on the
  110 + fly, it just references those fields in the dependency
  111 + instance. That way, if you use conll format, you get to use lemma
  112 + and coarse tag values supplied by the annotations. (JMB)
  113 +
  114 +- Can utilize the FEAT1|FEAT2|...|FEATN field of the CONLL format to
  115 + allow arbitrary features. See addCoreFeatures() in the DependencyPipe
  116 + class. (JMB)
  117 +
  118 +-----------------------------------------------------------------------
  119 +v0.2.2
  120 +
  121 +- MSTParser now works with both MST and CONLL formats. Pipes are now
  122 + passed a parameter for which format they use, and they call upon
  123 + Readers and Writers that know how to handle each format. CONLL is
  124 + the default format. (JMB)
  125 +
  126 +- Added a subset of the Portuguese data from CONLL to test the CONLL
  127 + format and to have another data set for the testbed. See TESTBED
  128 + (JMB)
  129 +
  130 +- Included an Ant build system that does some nice things, but which
  131 + can be ignored if make is preferred. Highlights of the additional
  132 + capabilities are: (1) class files are put in a location
  133 + (./output/classes) separate from the .java files; (2) you can get
  134 + javadocs (./doc/api) by running "sh build.sh javadoc"; (3) you can
  135 + make a release with "sh build.sh release". You don't need to install
  136 + anything extra (ant.jar is in ./lib); the only additional steps
  137 + needed to use the Ant build setup is to set the JAVA_HOME and
  138 + MSTPARSER_DIR environment variables appropriately. (JMB)
  139 +
  140 +
  141 +
  142 +
... ...
disambiguation/mstparser/LICENSE 0 → 100644
  1 +This software is Copyright (C) 2005 University of Pennsylvania and
  2 +this software is Copyright (C) 2002, 2003 University of Massachusetts
  3 +Amherst, Department of Computer Science, and is licensed under the
  4 +terms of the Common Public License, Version 1.0 or (at your option)
  5 +any subsequent version.
  6 +
  7 +The license is approved by the Open Source Initiative, and is available
  8 +from their website at http://www.opensource.org.
  9 +
  10 +=====================
  11 +
  12 +Common Public License Version 1.0
  13 +
  14 +THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS COMMON
  15 +PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF
  16 +THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
  17 +
  18 +1. DEFINITIONS
  19 +
  20 +"Contribution" means:
  21 +
  22 +a) in the case of the initial Contributor, the initial code and
  23 +documentation distributed under this Agreement, and
  24 +
  25 +b) in the case of each subsequent Contributor:
  26 +
  27 +i) changes to the Program, and
  28 +
  29 +ii) additions to the Program;
  30 +
  31 +where such changes and/or additions to the Program originate from and
  32 +are distributed by that particular Contributor. A Contribution
  33 +'originates' from a Contributor if it was added to the Program by such
  34 +Contributor itself or anyone acting on such Contributor's
  35 +behalf. Contributions do not include additions to the Program which:
  36 +(i) are separate modules of software distributed in conjunction with
  37 +the Program under their own license agreement, and (ii) are not
  38 +derivative works of the Program.
  39 +
  40 +"Contributor" means any person or entity that distributes the Program.
  41 +
  42 +"Licensed Patents " mean patent claims licensable by a Contributor
  43 +which are necessarily infringed by the use or sale of its Contribution
  44 +alone or when combined with the Program.
  45 +
  46 +"Program" means the Contributions distributed in accordance with this
  47 +Agreement.
  48 +
  49 +"Recipient" means anyone who receives the Program under this
  50 +Agreement, including all Contributors.
  51 +
  52 +2. GRANT OF RIGHTS
  53 +
  54 +a) Subject to the terms of this Agreement, each Contributor hereby
  55 +grants Recipient a non-exclusive, worldwide, royalty-free copyright
  56 +license to reproduce, prepare derivative works of, publicly display,
  57 +publicly perform, distribute and sublicense the Contribution of such
  58 +Contributor, if any, and such derivative works, in source code and
  59 +object code form.
  60 +
  61 +b) Subject to the terms of this Agreement, each Contributor hereby
  62 +grants Recipient a non-exclusive, worldwide, royalty-free patent
  63 +license under Licensed Patents to make, use, sell, offer to sell,
  64 +import and otherwise transfer the Contribution of such Contributor, if
  65 +any, in source code and object code form. This patent license shall
  66 +apply to the combination of the Contribution and the Program if, at
  67 +the time the Contribution is added by the Contributor, such addition
  68 +of the Contribution causes such combination to be covered by the
  69 +Licensed Patents. The patent license shall not apply to any other
  70 +combinations which include the Contribution. No hardware per se is
  71 +licensed hereunder.
  72 +
  73 +c) Recipient understands that although each Contributor grants the
  74 +licenses to its Contributions set forth herein, no assurances are
  75 +provided by any Contributor that the Program does not infringe the
  76 +patent or other intellectual property rights of any other entity. Each
  77 +Contributor disclaims any liability to Recipient for claims brought by
  78 +any other entity based on infringement of intellectual property rights
  79 +or otherwise. As a condition to exercising the rights and licenses
  80 +granted hereunder, each Recipient hereby assumes sole responsibility
  81 +to secure any other intellectual property rights needed, if any. For
  82 +example, if a third party patent license is required to allow
  83 +Recipient to distribute the Program, it is Recipient's responsibility
  84 +to acquire that license before distributing the Program.
  85 +
  86 +d) Each Contributor represents that to its knowledge it has sufficient
  87 +copyright rights in its Contribution, if any, to grant the copyright
  88 +license set forth in this Agreement.
  89 +
  90 +3. REQUIREMENTS
  91 +
  92 +A Contributor may choose to distribute the Program in object code form
  93 +under its own license agreement, provided that:
  94 +
  95 +a) it complies with the terms and conditions of this Agreement; and
  96 +
  97 +b) its license agreement:
  98 +
  99 +i) effectively disclaims on behalf of all Contributors all warranties
  100 +and conditions, express and implied, including warranties or
  101 +conditions of title and non-infringement, and implied warranties or
  102 +conditions of merchantability and fitness for a particular purpose;
  103 +
  104 +ii) effectively excludes on behalf of all Contributors all liability
  105 +for damages, including direct, indirect, special, incidental and
  106 +consequential damages, such as lost profits;
  107 +
  108 +iii) states that any provisions which differ from this Agreement are
  109 +offered by that Contributor alone and not by any other party; and
  110 +
  111 +iv) states that source code for the Program is available from such
  112 +Contributor, and informs licensees how to obtain it in a reasonable
  113 +manner on or through a medium customarily used for software exchange.
  114 +
  115 +When the Program is made available in source code form:
  116 +
  117 +a) it must be made available under this Agreement; and
  118 +
  119 +b) a copy of this Agreement must be included with each copy of the
  120 +Program.
  121 +
  122 +Contributors may not remove or alter any copyright notices contained
  123 +within the Program.
  124 +
  125 +Each Contributor must identify itself as the originator of its
  126 +Contribution, if any, in a manner that reasonably allows subsequent
  127 +Recipients to identify the originator of the Contribution.
  128 +
  129 +4. COMMERCIAL DISTRIBUTION
  130 +
  131 +Commercial distributors of software may accept certain
  132 +responsibilities with respect to end users, business partners and the
  133 +like. While this license is intended to facilitate the commercial use
  134 +of the Program, the Contributor who includes the Program in a
  135 +commercial product offering should do so in a manner which does not
  136 +create potential liability for other Contributors. Therefore, if a
  137 +Contributor includes the Program in a commercial product offering,
  138 +such Contributor ("Commercial Contributor") hereby agrees to defend
  139 +and indemnify every other Contributor ("Indemnified Contributor")
  140 +against any losses, damages and costs (collectively "Losses") arising
  141 +from claims, lawsuits and other legal actions brought by a third party
  142 +against the Indemnified Contributor to the extent caused by the acts
  143 +or omissions of such Commercial Contributor in connection with its
  144 +distribution of the Program in a commercial product offering. The
  145 +obligations in this section do not apply to any claims or Losses
  146 +relating to any actual or alleged intellectual property
  147 +infringement. In order to qualify, an Indemnified Contributor must: a)
  148 +promptly notify the Commercial Contributor in writing of such claim,
  149 +and b) allow the Commercial Contributor to control, and cooperate with
  150 +the Commercial Contributor in, the defense and any related settlement
  151 +negotiations. The Indemnified Contributor may participate in any such
  152 +claim at its own expense.
  153 +
  154 +For example, a Contributor might include the Program in a commercial
  155 +product offering, Product X. That Contributor is then a Commercial
  156 +Contributor. If that Commercial Contributor then makes performance
  157 +claims, or offers warranties related to Product X, those performance
  158 +claims and warranties are such Commercial Contributor's responsibility
  159 +alone. Under this section, the Commercial Contributor would have to
  160 +defend claims against the other Contributors related to those
  161 +performance claims and warranties, and if a court requires any other
  162 +Contributor to pay any damages as a result, the Commercial Contributor
  163 +must pay those damages.
  164 +
  165 +5. NO WARRANTY
  166 +
  167 +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS
  168 +PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  169 +KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY
  170 +WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY
  171 +OR FITNESS FOR A PARTICULAR PURPOSE. Each Recipient is solely
  172 +responsible for determining the appropriateness of using and
  173 +distributing the Program and assumes all risks associated with its
  174 +exercise of rights under this Agreement, including but not limited to
  175 +the risks and costs of program errors, compliance with applicable
  176 +laws, damage to or loss of data, programs or equipment, and
  177 +unavailability or interruption of operations.
  178 +
  179 +6. DISCLAIMER OF LIABILITY
  180 +
  181 +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR
  182 +ANY CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT,
  183 +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING
  184 +WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF
  185 +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  186 +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR
  187 +DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS GRANTED
  188 +HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
  189 +
  190 +7. GENERAL
  191 +
  192 +If any provision of this Agreement is invalid or unenforceable under
  193 +applicable law, it shall not affect the validity or enforceability of
  194 +the remainder of the terms of this Agreement, and without further
  195 +action by the parties hereto, such provision shall be reformed to the
  196 +minimum extent necessary to make such provision valid and enforceable.
  197 +
  198 +If Recipient institutes patent litigation against a Contributor with
  199 +respect to a patent applicable to software (including a cross-claim or
  200 +counterclaim in a lawsuit), then any patent licenses granted by that
  201 +Contributor to such Recipient under this Agreement shall terminate as
  202 +of the date such litigation is filed. In addition, if Recipient
  203 +institutes patent litigation against any entity (including a
  204 +cross-claim or counterclaim in a lawsuit) alleging that the Program
  205 +itself (excluding combinations of the Program with other software or
  206 +hardware) infringes such Recipient's patent(s), then such Recipient's
  207 +rights granted under Section 2(b) shall terminate as of the date such
  208 +litigation is filed.
  209 +
  210 +All Recipient's rights under this Agreement shall terminate if it
  211 +fails to comply with any of the material terms or conditions of this
  212 +Agreement and does not cure such failure in a reasonable period of
  213 +time after becoming aware of such noncompliance. If all Recipient's
  214 +rights under this Agreement terminate, Recipient agrees to cease use
  215 +and distribution of the Program as soon as reasonably
  216 +practicable. However, Recipient's obligations under this Agreement and
  217 +any licenses granted by Recipient relating to the Program shall
  218 +continue and survive.
  219 +
  220 +Everyone is permitted to copy and distribute copies of this Agreement,
  221 +but in order to avoid inconsistency the Agreement is copyrighted and
  222 +may only be modified in the following manner. The Agreement Steward
  223 +reserves the right to publish new versions (including revisions) of
  224 +this Agreement from time to time. No one other than the Agreement
  225 +Steward has the right to modify this Agreement. IBM is the initial
  226 +Agreement Steward. IBM may assign the responsibility to serve as the
  227 +Agreement Steward to a suitable separate entity. Each new version of
  228 +the Agreement will be given a distinguishing version number. The
  229 +Program (including Contributions) may always be distributed subject to
  230 +the version of the Agreement under which it was received. In addition,
  231 +after a new version of the Agreement is published, Contributor may
  232 +elect to distribute the Program (including its Contributions) under
  233 +the new version. Except as expressly stated in Sections 2(a) and 2(b)
  234 +above, Recipient receives no rights or licenses to the intellectual
  235 +property of any Contributor under this Agreement, whether expressly,
  236 +by implication, estoppel or otherwise. All rights in the Program not
  237 +expressly granted under this Agreement are reserved.
  238 +
  239 +This Agreement is governed by the laws of the State of New York and
  240 +the intellectual property laws of the United States of America. No
  241 +party to this Agreement will bring a legal action under this Agreement
  242 +more than one year after the cause of action arose. Each party waives
  243 +its rights to a jury trial in any resulting litigation.
... ...
disambiguation/mstparser/README 0 → 100644
  1 +-------------------------
  2 +MSTParser version 0.5.0
  3 +-------------------------
  4 +
  5 +This is the main README. See ALT_README for some extra utilities and
  6 +an alternative build process to the one described in this README. The
  7 +package has been modified by Jason Baldridge -- this version should
  8 +produce the same results as Ryan McDonald's previous releases, but it
  9 +has been made more flexible and configurable in the input formats it
  10 +accepts (both MST and CoNLL) and in the way features are declared (see
  11 +the DependencyPipe class).
  12 +
  13 +-------------------------
  14 +
  15 +
  16 +The following package contains a java implementation of the dependency
  17 +parsers described in:
  18 +
  19 +Non-Projective Dependency Parsing using Spanning Tree Algorithms
  20 +R. McDonald, F. Pereira, K. Ribarov and J. Hajic
  21 +HLT-EMNLP, 2005
  22 +
  23 +Online Large-Margin Training of Dependency Parsers
  24 +R. McDonald, K. Crammer and F. Pereira
  25 +ACL, 2005
  26 +
  27 +Online Learning of Approximate Dependency Parsing Algorithms
  28 +R. McDonald and F. Pereira
  29 +EACL, 2006
  30 +
  31 +In addition, the parsers in this package can also learn and produce typed
  32 +dependency trees (i.e. trees with edge labels).
  33 +
  34 +The parser should work with Java 1.4 and 1.5
  35 +
  36 +If there are any problems running the parser then email: ryantm@cis.upenn.edu
  37 +I will only respond to questions not answered in this README.
  38 +
  39 +
  40 +----------------
  41 +Contents
  42 +----------------
  43 +
  44 +1. Compiling
  45 +
  46 +2. Example of usage
  47 +
  48 +3. Running the parser
  49 + a. Input data format
  50 + b. Training a parser
  51 + c. Running a trained model on new data
  52 + d. Evaluating output
  53 +
  54 +4. Memory/Disk space and performance issues
  55 +
  56 +5. Reproducing results in HLT-EMNLP and ACL papers
  57 +
  58 +
  59 +----------------
  60 +1. Compiling
  61 +----------------
  62 +
  63 +To compile the code, first unzip/tar the downloaded file:
  64 +
  65 +> gunzip mstparser.tar.gz
  66 +> tar -xvf mstparser.tar
  67 +> cd MSTParser
  68 +
  69 +Next, run the following command
  70 +
  71 +> javac -classpath ".:lib/trove.jar" mstparser/DependencyParser.java
  72 +
  73 +This will compile the package.
  74 +
  75 +
  76 +---------------------
  77 +2. Example Usage
  78 +---------------------
  79 +
  80 +In the directory data/ there are examples of training and testing data. Data
  81 +format is described in the next section.
  82 +
  83 +train.ulab/test.ulab
  84 +- training and testing data with unlabeled trees
  85 +
  86 +train.lab/test.lab
  87 +- training and testing data with labeled trees
  88 +
  89 +To run an unlabeled parser type:
  90 +
  91 +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \
  92 + train train-file:data/train.ulab model-name:dep.model \
  93 + test test-file:data/test.ulab output-file:out.txt \
  94 + eval gold-file:data/test.ulab format:MST
  95 +
  96 +This will train a parser on the training data, run it on the testing data and
  97 +evaluate the output against the gold standard. The results from running the
  98 +parser are in the file out.txt and the trained model in dep.model.
  99 +
  100 +To train a labeled parser, run the same command but use the labeled training
  101 +and testing files.
  102 +
  103 +
  104 +-------------------------
  105 +3. Running the Parser
  106 +-------------------------
  107 +
  108 +-------------------------
  109 +3a. Input data format
  110 +-------------------------
  111 +
  112 +**************************** NOTE **********************************
  113 +The parser now uses CONLL format as a default. Note the inclusion of
  114 +the format:MST option in the instructions below, which differ from the
  115 +instructions in previous versions (v0.2 and before). If you wish to
  116 +run the parser on CONLL formatted files, use format:CONLL or just
  117 +don't include the format option.
  118 +********************************************************************
  119 +
  120 +Example data sets are given in the data/ directory.
  121 +
  122 +Each sentence in the data is represented by 3 or 4 lines and sentences are
  123 +space separated. The general format is:
  124 +
  125 +w1 w2 ... wn
  126 +p1 p2 ... pn
  127 +l1 l2 ... ln
  128 +d1 d2 ... dn
  129 +
  130 +....
  131 +
  132 +
  133 +Where,
  134 +- w1 ... wn are the n words of the sentence (tab delimited)
  135 +- p1 ... pn are the POS tags for each word
  136 +- l1 ... ln are the labels of the incoming edge to each word
  137 +- d1 ... dn are integers representing the position of each word's parent
  138 +
  139 +For example, the sentence "John hit the ball" would be:
  140 +
  141 +John hit the ball
  142 +N V D N
  143 +SBJ ROOT MOD OBJ
  144 +2 0 4 2
  145 +
  146 +Note that hit's parent is indexed by 0 since it is the root.
  147 +
  148 +If you wish to only train or test an unlabeled parser, then simply leave out
  149 +the third line for each sentence, e.g.,
  150 +
  151 +John hit the ball
  152 +N V D N
  153 +2 0 4 2
  154 +
  155 +The parser will automatically detect that it should produce unlabeled trees.
  156 +
  157 +Note that this format is the same for training AND for running the parser on
  158 +new data. Of course, you may not always know the gold standard. In this case,
  159 +just substitute line 3 (the edge labels) and line 4 (the parent indexes) with
  160 +dummy values. The parser just ignores these values and produces its own.
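
For the ENIAM side of this commit, a small OCaml sketch (a hypothetical
helper, not part of MSTParser) that reads one unlabeled sentence block in
the tab-delimited format described above:

    (* Parse one 3-line (unlabeled) MST sentence block into
       (word, POS, head-index) triples. Hypothetical helper;
       needs OCaml >= 4.04 for String.split_on_char. *)
    let read_mst_sentence words_line pos_line heads_line =
      let split = String.split_on_char '\t' in
      let words = split words_line in
      let tags = split pos_line in
      let heads = List.map int_of_string (split heads_line) in
      List.map2 (fun w (t, h) -> (w, t, h)) words (List.combine tags heads)

    (* The "John hit the ball" example above:
       read_mst_sentence "John\thit\tthe\tball" "N\tV\tD\tN" "2\t0\t4\t2" *)
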
  161 +
  162 +
  163 +----------------------------
  164 +3b. Training the parser
  165 +----------------------------
  166 +
  167 +If you have a set of labeled data, first place it in the format described
  168 +above.
  169 +
  170 +If your training data is in a file train.txt, you can then run the command:
  171 +
  172 +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \
  173 + train train-file:train.txt format:MST
  174 +
  175 +This will train a parser with all the default properties. Additional
  176 +properties can be described with the following flags:
  177 +
  178 +train
  179 +- if present then parser will train a new model
  180 +
  181 +train-file:file.txt
  182 +- use data in file.txt to train the parser
  183 +
  184 +model-name:model.name
  185 +- store trained model in file called model.name
  186 +
  187 +iters:numIters
  188 +- Run training algorithm for numIters epochs, default is 10
  189 +
  190 +decode-type:type
  191 +- type is either "proj" or "non-proj", e.g. decode-type:proj
  192 +- Default is "proj"
  193 +- "proj" use the projective parsing algorithm during training
  194 + - i.e. The Eisner algorithm
  195 +- "non-proj" use the non-projective parsing algorithm during training
  196 + - i.e. The Chu-Liu-Edmonds algorithm
  197 +
  198 +training-k:K
  199 +- Specifies the k-best parse set size to create constraints during training
  200 +- Default is 1
  201 +- For non-projective parsing algorithm, k-best decoding is approximate
  202 +
  203 +loss-type:type
  204 +- type is either "punc" or "nopunc", e.g. loss-type:punc
  205 +- Default is "punc"
  206 +- "punc" include punctuation in hamming loss calculation
  207 +- "nopunc" do not include punctuation in hamming loss calculation
  208 +
  209 +create-forest:cf
  210 +- cf is either "true" or "false"
  211 +- Default is "true"
  212 +- If create-forest is false, it will not create the training parse forest (see
  213 + section 4). It assumes it has been created.
  214 +- This flag is useful if you are training many models on the same data and
  215 + features but using different parameters (e.g. training iters, decoding type).
  216 +
  217 +order:ord
  218 +- ord is either 1 or 2
  219 +- Default is 1
  220 +- Specifies the order/scope of features. 1 only has features over single edges
  221 + and 2 has features over pairs of adjacent edges in the tree.
  222 +
  223 +format:FORMAT
  224 +- FORMAT is either MST or CONLL
  225 +- Default is CONLL
  226 +- Specifies the input/output format. MST is the format used by
  227 + MSTParser until version 0.2.1. CONLL is the format used in the
  228 + CONLL-X shared task (see http://nextens.uvt.nl/~conll/).
  229 +
  230 +------------------------------------------------
  231 +3c. Running a trained model on new data
  232 +------------------------------------------------
  233 +
  234 +This section assumes you have trained a model and it is stored in dep.model.
  235 +
  236 +First, format your data properly (section 3a).
  237 +
  238 +It should be noted that the parser assumes both words and POS tags. To
  239 +generate POS tags for your data, I suggest using the Ratnaparkhi POS tagger
  240 +or another tagger of your choice.
  241 +
  242 +The parser also assumes that the edge label and parent index lines are
  243 +in the input. However, these can just be artificially inserted (e.g. with lines
  244 +of "LAB ... LAB" and "0 ... 0") since the parser will produce these lines
  245 +as output.
  246 +
  247 +If the data is in a file called test.txt, run the command:
  248 +
  249 +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \
  250 + test model-name:dep.model test-file:test.txt output-file:out.txt format:MST
  251 +
  252 +This will create an output file "out.txt" with the predictions of the parser.
  253 +Other properties can be defined with the following flags:
  254 +
  255 +test
  256 +- If included a trained parser will be run on the testing data
  257 +
  258 +test-file:file.txt
  259 +- The file containing the data to run the parser on
  260 +
  261 +model-name:model.name
  262 +- The name of the stored model to be used
  263 +
  264 +output-file:out.txt
  265 +- The result of running the parser on the new data
  266 +
  267 +decode-type:type
  268 +- See section 3b.
  269 +
  270 +order:ord
  271 +- See section 3b. THIS NEEDS TO HAVE THE SAME VALUE AS THE TRAINED MODEL!!
  272 +
  273 +format:FORMAT
  274 +- See section 3b.
  275 +
  276 +Note that if you train a labeled model, you should only run it expecting
  277 +labeled output (e.g. the test data should have 4 lines per sentence).
  278 +And if you train an unlabeled model, you should only run it expecting
  279 +unlabeled output (e.g. the test data should have 3 lines per sentence).
  280 +
  281 +
  282 +------------------------
  283 +3d. Evaluating Output
  284 +------------------------
  285 +
  286 +This section describes a simple class for evaluating the output of
  287 +the parser against a gold standard.
  288 +
  289 +Assume you have a gold standard, say test.txt, and the output of the parser,
  290 +say out.txt. Then run the following command:
  291 +
  292 +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \
  293 + eval gold-file:test.txt output-file:out.txt format:MST
  294 +
  295 +This will return both labeled and unlabeled accuracy (if the data sets contain
  296 +labeled trees) as well as complete sentence accuracy, again labeled and
  297 +unlabeled.
  298 +
  299 +If your data is in CONLL format instead of MST format (pre-v0.2.1),
  300 +then replace MST by CONLL in the above command, or just leave it off
  301 +-- it defaults to CONLL.
  302 +
  303 +We should note that currently this evaluation script includes all punctuation.
  304 +In future releases we will modify this class to allow the evaluation to
  305 +ignore punctuation, which is standard for English (Yamada and Matsumoto 03).
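
In the meantime, the check is easy to script yourself. Below is a minimal
sketch (not part of the distribution) that computes unlabeled accuracy over
two labeled MST-format files while skipping punctuation; it assumes five
lines per sentence (words, tags, labels, heads, blank line) and treats a
token as punctuation when its gold POS tag contains no alphanumeric
characters:

#!/usr/bin/python
# Hypothetical helper: unlabeled accuracy ignoring punctuation.
# Usage: score_nopunc.py gold.txt out.txt
import sys

def sentences(fname):
    lines = [l.rstrip("\n") for l in open(fname)]
    for i in xrange(0, len(lines), 5):
        # lines[i] = words, lines[i+1] = POS tags, lines[i+3] = heads
        yield lines[i+1].split(), lines[i+3].split()

def is_punc(tag):
    return not [c for c in tag if c.isalnum()]

correct = total = 0
for (gold_pos, gold_head), (_, pred_head) in \
        zip(sentences(sys.argv[1]), sentences(sys.argv[2])):
    for tag, g, p in zip(gold_pos, gold_head, pred_head):
        if is_punc(tag):
            continue
        total += 1
        if g == p:
            correct += 1

print "unlabeled accuracy (no punc): %.4f" % (correct / float(total))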
  306 +
  307 +
  308 +---------------------------------------------
  309 +4. Memory/Disk space and performance issues
  310 +---------------------------------------------
  311 +
  312 +This parser is memory and disk space intensive.
  313 +
  314 +MEMORY ISSUES
  315 +
  316 +Remember to always run java with the flag -Xmx1800m to use all available
  317 +memory for the heap. For 64-bit machines use an even larger value, say
  318 +-Xmx8000m.
  319 +
  320 +Training a model on the WSJ can be done easily on a 32-bit machine.
  321 +It should also be possible to train a model on the entire Prague Dependency
  322 +Treebank on a 32-bit machine (I have done it), but I make no guarantees.
  323 +
  324 +DISK ISSUES
  325 +
  326 +To make training quicker we store the entire parse forest on disk, a la
  327 +Clark and Curran 04. This can be very large, up to and over 20GB!! Be aware
  328 +of this fact.
  329 +
  330 +If you train using a file called train.txt, the forest will be stored in
  331 +a file called train.txt.forest. If disk space is an issue you can remove this
  332 +file immediately after training (it is not needed to run the parser on new data).
  333 +
  334 +However, sometimes it is good to keep this file around. Particularly, if you
  335 +are retraining a model on the same data and feature space but want to try
  336 +different training settings. By using the create-forest:false flag, you
  337 +can avoid having to recreate this file (which can take some time).
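
For example, once an initial run has produced train.txt.forest, a second
experiment with different settings (the names and values are illustrative)
might look like:

> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \
  train train-file:train.txt model-name:dep.model.k5 \
  training-k:5 create-forest:false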
  338 +
  339 +PERFORMANCE ISSUES
  340 +
  341 +Once a model has been trained, running the model on new data is pretty quick.
  342 +However, as with all discriminative trained parsers, it does take some time
  343 +to train a parser. On a two-year-old 32-bit machine it will take 10-15 hours
  344 +to train a model on the entire Penn Treebank and around 24-30 hours to train
  345 +a model on the Prague Dependency Treebank. Newer machines or 64-bit machines
  346 +are of course much quicker.
  347 +
  348 +
  349 +-------------------------------------------------------
  350 +5. Reproducing results in HLT-EMNLP and ACL papers
  351 +-------------------------------------------------------
  352 +
  353 +To reproduce the English results in McDonald et al. ACL 2005,
  354 +
  355 +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \
  356 + train train-file:train.wsj model-name:eng.wsj.model \
  357 + training-k:5 loss-type:nopunc decode-type:proj \
  358 + test test-file:test.wsj output-file:out.txt \
  359 + eval gold-file:test.wsj format:MST
  360 +
  361 +This assumes that train.wsj is sections 02-21 of the Penn Treebank formatted
  362 +as above, with dependencies extracted using the head rules of Yamada and
  363 +Matsumoto. See Joakim Nivre's Penn2Malt tool at:
  364 + http://w3.msi.vxu.se/~nivre/research/Penn2Malt.html
  365 +to convert the WSJ to dependencies using these head rules.
  367 +
  368 +test.wsj is section 23 of the WSJ converted as above. Furthermore, POS tags are
  369 +supplied using Adwait Ratnaparkhi's MXPOST toolkit trained on sections 02-21.
  370 +This can be found at:
  371 +http://www.cogsci.ed.ac.uk/~jamesc/taggers/MXPOST.html
  372 +
  373 +Note that the evaluation will be slightly off from the results reported. This
  374 +is because the evaluation scripts include punctuation. If you modify the
  375 +evaluation script to discount punctuation, results will align.
  376 +
  377 +
  378 +To reproduce the Czech results in McDonald et al. HLT-EMNLP 2005,
  379 +
  380 +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \
  381 + train train-file:train.pdt model-name:czech.pdt.model \
  382 + training-k:1 loss-type:punc decode-type:non-proj \
  383 + test test-file:test.pdt output-file:out.txt \
  384 + eval gold-file:test.pdt format:MST
  385 +
  386 +This assumes train.pdt and test.pdt are the training and testing sections
  387 +of the Prague Dependency Treebank v1.0 formatted above. We use the
  388 +automatically assigned POS tags that have been reduced (see paper).
... ...
disambiguation/mstparser/bin/conll2mst.py 0 → 100644
  1 +#! /usr/bin/python
  2 +
  3 +# Convert CONLL-X dependency data (one token per line) to MST format:
  4 +# four tab-separated lines per sentence holding the words, POS tags,
  5 +# dependency labels and head indices (CONLL columns FORM, POSTAG,
  6 +# DEPREL and HEAD).
  7 +
  8 +import sys
  9 +
  10 +# Open the CONLL input file
  11 +f = open(sys.argv[1], 'rt')
  12 +
  13 +wrds = ""; pos = ""; labs = ""; par = ""
  14 +
  15 +for line in f:
  16 +    sent = line.split()
  17 +    if len(sent) > 0:
  18 +        wrds += sent[1] + "\t"
  19 +        pos += sent[4] + "\t"
  20 +        labs += sent[7] + "\t"
  21 +        par += sent[6] + "\t"
  22 +    else:
  23 +        # A blank line ends the sentence: emit the four accumulated lines.
  24 +        print wrds; wrds = ""
  25 +        print pos; pos = ""
  26 +        print labs; labs = ""
  27 +        print par; par = ""
  28 +        print ""
  29 +
  30 +f.close()
  31 +
... ...
disambiguation/mstparser/bin/create_baseline.py 0 → 100644
  1 +#!/usr/bin/python
  2 +import re
  3 +import optparse
  4 +import fileinput
  5 +import sys
  6 +
  7 +###########################################################################
  8 +#
  9 +# Command-line options and usage
  10 +#
  11 +###########################################################################
  12 +
  13 +usage = """%prog [OPTIONS] FILE ...
  14 +
  15 +Create a left-linking (default) or right-linking dependency baseline
  16 +from data in MST format, assigning every word the default relation.
  17 +"""
  18 +
  19 +parser = optparse.OptionParser(usage=usage)
  20 +
  21 +parser.add_option("-r", "--rightward", action="store_true",
  22 +                  default=False,
  23 +                  help="""Create right-linking baseline.""")
  24 +
  25 +parser.add_option("-d", "--default-relation", action="store",
  26 +                  default="Elaboration",
  27 +                  help="Pick default relation.",
  28 +                  metavar="RELATION")
  29 +
  30 +def transform_meta_chars(string):
  31 +    return string.replace(",", "+comma+")
  32 +
  33 +def untransform_meta_chars(string):
  34 +    return string.replace("+comma+", ",")
  35 +
  36 +## Output dependencies for one sentence
  37 +def output_one_sentence(deps):
  38 +    # Transpose the per-token rows back into the four MST-format lines
  39 +    # (words, POS tags, labels, heads); the token ids in column 0 are
  40 +    # dropped.
  41 +    accum = [[], [], [], [], []]
  42 +    for dep in deps:
  43 +        for num in xrange(len(dep)):
  44 +            accum[num].append(dep[num])
  45 +    accum = ["\t".join([str(x) for x in y]) for y in accum]
  46 +    print "\n".join(accum[1:])
  47 +    print
  48 +
  49 +
  50 +## Get options
  51 +
  52 +(options, args) = parser.parse_args()
  53 +
  54 +## Process file(s)
  55 +
  56 +lines = fileinput.input(args)
  57 +
  58 +deps = []
  59 +
  60 +## Read input
  61 +
  62 +sentence_info = []
  63 +for line in lines:
  64 +    line = line.strip()
  65 +    if not line:
  66 +        num_words = len(sentence_info[0])
  67 +        # Leftward baseline: word i attaches to word i-1 (the first word
  68 +        # attaches to the root, 0).
  69 +        baseline_deps = range(num_words)
  70 +        if options.rightward:
  71 +            # Rightward baseline: word i attaches to word i+1, and the
  72 +            # last word attaches to the root (0).
  73 +            baseline_deps.pop(0)
  74 +            baseline_deps.pop(0)
  75 +            baseline_deps += [num_words, 0]
  76 +
  77 +        sentence_info[2] = [options.default_relation] * num_words
  78 +        sentence_info[3] = baseline_deps
  79 +
  80 +        try:
  81 +            for i in xrange(len(sentence_info[0])):
  82 +                deps.append([i+1] + [row[i] for row in sentence_info])
  83 +        except IndexError:
  84 +            # Ragged input: report the line lengths and bail out.
  85 +            print "\n".join([str(len(x)) for x in sentence_info])
  86 +            sys.exit(0)
  87 +
  88 +        output_one_sentence(deps)
  89 +        deps = []
  90 +        sentence_info = []
  91 +    else:
  92 +        sentence_info.append(line.split())
  93 +
... ...
disambiguation/mstparser/bin/mst-env 0 → 100644
  1 +#!/bin/sh
  2 +# sets MST environment variables
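# Source this from the other bin/ scripts (e.g. mst_parse.sh, mst_score.sh);
# it assumes MSTPARSER_DIR and JAVA_HOME are set in the environment.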
  3 +MST_LIB=$MSTPARSER_DIR/lib
  4 +DIRLIBS=$MST_LIB/trove.jar
  5 +CP=${MSTPARSER_DIR}/output/classes:${DIRLIBS}
  6 +JAVA=$JAVA_HOME/bin/java
  7 +JAVA_CMD="$JAVA -Xmx1800m -classpath $CP "
... ...
disambiguation/mstparser/bin/mst2conll.py 0 → 100644
  1 +#! /usr/bin/python
  2 +
  3 +# Convert MST format (four lines per sentence: words, POS tags, labels,
  4 +# head indices) back to CONLL-style rows. Assumes labeled input; the
  5 +# remaining CONLL columns are filled with "_".
  6 +
  7 +import sys
  8 +
  9 +# Open the MST input file
  10 +f = open(sys.argv[1], 'rt')
  11 +
  12 +wrds = ""
  13 +pos = ""
  14 +labs = ""
  15 +par = ""
  16 +
  17 +for line in f:
  18 +    if len(line.strip()) == 0:
  19 +        # A blank line ends the sentence: print one row per token.
  20 +        w = wrds.split(); p = pos.split(); l = labs.split(); pa = par.split()
  21 +        cnt = 1
  22 +        for t in w:
  23 +            print str(cnt) + "\t" + t + "\t" + t + "\t" + p[cnt-1] + "\t" \
  24 +                + p[cnt-1] + "\t_\t" + pa[cnt-1] + "\t" + l[cnt-1]
  25 +            cnt += 1
  26 +        print ""
  27 +        wrds = ""; pos = ""; labs = ""; par = ""
  28 +    elif len(wrds) == 0:
  29 +        wrds = line
  30 +    elif len(pos) == 0:
  31 +        pos = line
  32 +    elif len(labs) == 0:
  33 +        labs = line
  34 +    else:
  35 +        par = line
  36 +
  37 +f.close()
  38 +
... ...
disambiguation/mstparser/bin/mst_experiment.py 0 → 100644
  1 +#!/usr/bin/python
  2 +
  3 +import os
  4 +import sys
  5 +import optparse
  6 +
  7 +## Check that MSTPARSER_DIR environment variable is set and get it
  8 +mstparser_dir = ''
  10 +if os.environ.has_key('MSTPARSER_DIR'):
  11 + mstparser_dir = os.environ['MSTPARSER_DIR']
  12 +else:
  13 + print "Please set the MSTPARSER_DIR environment variable to where you have the MSTParser installed."
  14 + exit(1)
  15 +
  16 +
  17 +###########################################################################
  18 +#
  19 +# Run a single fold. This could actually be not a "fold" per se, but
  20 +# actually explicitly provided training and test files.
  21 +#
  22 +###########################################################################
  23 +
  24 +def create_tag_train_file (source_file, formatted_file):
  25 +
  26 + output = file(formatted_file, "w")
  27 +
  28 + input = file(source_file)
  29 + line = input.readline()
  30 + while not(line == ""):
  31 + words = line.strip().split("\t")
  32 + line = input.readline()
  33 + tags = line.strip().split("\t")
  34 +
  35 + # the splitting takes care of word+stem representations like biliyor+bil
  36 + merged = [words[i].split("+")[0]+"_"+tags[i].replace("_", "+us+") \
  37 + for i in range(len(words))]
  38 +
  39 + output.write(" ".join(merged)+"\n")
  40 +
  41 + input.readline() # eat up labels
  42 + input.readline() # eat up dependencies
  43 + input.readline() # eat blank line
  44 + line = input.readline() # read words of next sentence
  45 +
  46 + output.close()
  47 +
  48 +
  49 +def run_single_train_and_test(options, train_filename,
  50 + test_filename, output_filename, args):
  51 +
  52 +
  53 + realtest_filename = test_filename
  54 + # Tag the test sentences if requested
  55 + if options.tag_source == "OTK_Tagger":
  56 + print " Tagging test sentences..."
  57 +
  58 + tag_train_filename = train_filename+".tagged"
  59 +
  60 + create_tag_train_file(train_filename, tag_train_filename)
  61 +
  62 + tagged_filename = test_filename+".tagged.tmp"
  63 + tag_command = "python %s/bin/pos_tag.py -o %s %s %s %s" \
  64 + % (mstparser_dir,
  65 + options.output_dir,
  66 + tag_train_filename,
  67 + test_filename,
  68 + tagged_filename)
  69 +
  70 + #print >> argfile, tag_command
  71 + if options.verbose:
  72 + print tag_command
  73 + os.system(tag_command)
  74 + #os.system(tag_command+' |tee --append '+options.output_dir+'/tag.out 2>&1')
  75 + else:
  76 + os.system(tag_command+' &>/dev/null')
  77 + #os.system(tag_command+' >> '+options.output_dir+'/tag.out 2>&1')
  78 +
  79 +
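 # pos_tag.py writes alternating word/tag lines; collect the tag lines.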
  80 + tag_lines = []
  81 + counter = 0
  82 + for line in file(tagged_filename):
  83 + if counter % 2 == 1:
  84 + tag_lines.append(line)
  85 + counter += 1
  86 +
  87 + realtest_filename = test_filename+".tagged"
  88 + output = file(realtest_filename, "w")
  89 + counter = 0
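 # Each sentence spans five lines (words, tags, labels, heads, blank);
 # replace the tag line (second of each group) with the tagger's output.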
  90 + for line in file(test_filename):
  91 + if counter % 5 == 1:
  92 + output.write(tag_lines[(counter-1)/5])
  93 + else:
  94 + output.write(line)
  95 + counter += 1
  96 +
  97 + output.close()
  98 +
  99 +
  100 + # Train the parser
  101 + print " Training and evaluating..."
  102 +
  103 + train_command = 'mst_parse.sh train train-file:%s model-name:%s/dep.model decode-type:%s test test-file:%s output-file:%s %s' % (train_filename, options.output_dir, options.decoder_type, realtest_filename, output_filename, " ".join(args[1:]))
  104 +
  105 + if options.verbose:
  106 + print train_command
  107 + os.system(train_command)
  108 + else:
  109 + os.system(train_command+' &>/dev/null')
  110 +
  111 +
  112 +###################### END FUNCTION DEFINITIONS ########################
  113 +
  114 +
  115 +## Get options
  116 +
  117 +opt_parser = optparse.OptionParser()
  118 +opt_parser.add_option("-l", "--language", action="store", default='Unspecified',
  119 + help="use configurations specific to LANGUAGE",
  120 + metavar="LANGUAGE")
  121 +opt_parser.add_option("-e", "--eval_file", action="store", default='Generated',
  122 + help="Read evaluation sentences from FILE. Using this option means that cross-validation will not be used.",
  123 + metavar="FILE")
  124 +opt_parser.add_option("-d", "--decoder_type", action="store",
  125 + choices=['proj', 'non-proj'],
  126 + default="proj",
  127 + help="Use a projective or non-projective algorithm.E",
  128 + metavar="FILE")
  129 +opt_parser.add_option("-o", "--output_dir", action="store", default='output',
  130 + help="save parser output to DIR",
  131 + metavar="DIR")
  132 +opt_parser.add_option("-f", "--num_folds", action="store", default=10,
  133 + help="The number of folds to use in cross-validation (Default=10).",
  134 + metavar="NUM")
  135 +opt_parser.add_option("-v", "--verbose", action="store_true", default=False,
  136 + help="be verbose")
  137 +
  138 +opt_parser.add_option("-t", "--tag_source", choices=['Gold','OTK_Tagger'],
  139 + default='Gold',
  140 + help="use tags from Gold standard or from a tagger (Gold (default), OTK_Tagger)",
  141 + metavar="SOURCE")
  142 +
  143 +(options, args) = opt_parser.parse_args()
  144 +
  145 +# Convert num_folds from its command-line string form to an int
  146 +options.num_folds = int(options.num_folds)
  147 +
  148 +# Remove the requested output directory if it already exists; refuse to
  149 +# continue if a file stands in its way. Then create the directory.
  150 +output_dir = options.output_dir
  151 +if os.path.isdir(output_dir):
  152 + os.system("rm -rf %s" % output_dir)
  153 +elif os.path.isfile(output_dir):
  154 + raise OSError("A file with the same name as the desired dir, " \
  155 + "'%s', already exists." % output_dir)
  156 +os.makedirs(output_dir)
  157 +
  158 +
  159 +# This file accumulates the results across all folds.
  160 +model_output_filename = output_dir+"/model_out"
  161 +os.system('touch %s' % model_output_filename)
  162 +
  163 +## Process files
  164 +
  165 +train_filename = args[0]
  166 +
  167 +# This file accumulates the gold dependencies across all folds.
  168 +gold_deps_filename = output_dir+"/gold.deps"
  169 +
  170 +if options.eval_file == "Generated":
  171 +
  172 + num_folds = int(options.num_folds)
  173 +
  174 + print "Running a %d-fold evaluation on file %s" \
  175 + % (num_folds, train_filename)
  176 + print
  177 +
  178 + # Align parses with their corresponding sentences and assign a
  179 + # partition id to them.
  180 +
  181 + train_file = file(train_filename)
  182 +
  183 + examples = []
  184 +
  185 + next_example = train_file.readline()
  186 +
  187 + counter = 0
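 # Deal sentences out round-robin: sentence i goes to partition i mod num_folds.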
  188 + while next_example:
  189 + partition = counter % num_folds
  190 +
  191 + elements = []
  192 + while next_example and next_example != "\n":
  193 + elements.append(next_example)
  194 + next_example = train_file.readline()
  195 +
  196 + examples.append((partition, elements))
  197 +
  198 + next_example = train_file.readline()
  199 +
  200 + counter += 1
  201 +
  202 +
  203 + # Done reading the training sentences.
  205 + train_file.close()
  206 +
  207 + # Train/test on each partition
  208 +
  209 + gold_deps = open(gold_deps_filename,"w")
  210 +
  211 + # Run each fold. The output from each fold is appended to gold.deps
  212 + # and model.deps
  213 + #for test_partition in range(1):
  214 + for test_partition in range(num_folds):
  215 +
  216 + print "Fold",test_partition
  217 +
  218 + train_filename = output_dir+"/train"
  219 + train_set = open(train_filename, "w")
  220 +
  221 + test_filename = output_dir+"/test"
  222 + test_set = open(test_filename, "w")
  223 +
  224 + counter = 0
  225 + for ex in examples:
  226 + if ex[0] == test_partition:
  227 + test_set.write("".join(ex[1])+"\n")
  228 + gold_deps.write("".join(ex[1])+"\n")
  229 + else:
  230 + train_set.write("".join(ex[1])+"\n")
  231 +
  232 + counter += 1
  233 +
  234 + train_set.close()
  235 + test_set.close()
  236 +
  237 + # Run the fold.
  238 + output_filename = output_dir+"/output"
  239 + run_single_train_and_test(options, train_filename, test_filename, output_filename, args)
  240 +
  241 + # Pile this fold's output onto the accumulating result file.
  242 + os.system('cat %s >> %s' % (output_filename, model_output_filename))
  243 +
  244 + gold_deps.flush()
  245 +
  246 + gold_deps.close()
  247 +
  248 +else:
  249 + os.system('cp %s %s' %(options.eval_file, gold_deps_filename))
  250 +
  251 + run_single_train_and_test(options, train_filename, gold_deps_filename, model_output_filename, args)
  252 +
  253 +
  254 +################## EVALUATION ###################
  255 +
  256 +print "Evaluating. If anything here dies, you can still look at the output files in the directory '%s'." % (output_dir)
  257 +
  258 +# Get dependency results.
  259 +
  260 +os.system("mst_score.sh %s %s" % (gold_deps_filename, model_output_filename))
  261 +
... ...
disambiguation/mstparser/bin/mst_parse.sh 0 → 100644
  1 +#!/bin/sh
  2 +. mst-env
  3 +$JAVA_CMD mstparser.DependencyParser $@
... ...
disambiguation/mstparser/bin/mst_score.sh 0 → 100644
  1 +#!/bin/sh
  2 +. mst-env
  3 +$JAVA_CMD mstparser.DependencyEvaluator $@
... ...
disambiguation/mstparser/bin/pos_tag.py 0 → 100644
  1 +#!/usr/bin/python
  2 +
  3 +import os
  4 +import sys
  5 +import optparse
  6 +
  7 +import tagging_util
  8 +
  9 +## Check that DBPARSER_DIR environment variable is set and get it
  10 +dbparser_dir = ''
  11 +if os.environ.has_key('DBPARSER_DIR'):
  12 + dbparser_dir = os.environ['DBPARSER_DIR']
  13 +else:
  14 + print "Please set the DBPARSER_DIR environment variable to where you have Dan Bikel's parser installed."
  15 + exit(1)
  16 +
  17 +## Check that OPENNLP_DIR environment variable is set and get it
  18 +otk_dir = ''
  19 +if os.environ.has_key('OPENNLP_DIR'):
  20 + otk_dir = os.environ['OPENNLP_DIR']
  21 +else:
  22 + print "Please set the OPENNLP_DIR environment variable to where you have the OpenNLP Toolkit installed."
  23 + exit(1)
  24 +
  25 +
  26 +## Get options
  27 +
  28 +opt_parser = optparse.OptionParser()
  29 +opt_parser.add_option("-o", "--output-dir", action="store", default='output',
  30 + help="save tagger output to DIR",
  31 + metavar="DIR")
  32 +opt_parser.add_option("-v", "--verbose", action="store_true", default=False,
  33 + help="be verbose")
  34 +
  35 +(options, args) = opt_parser.parse_args()
  36 +
  37 +verbose = options.verbose
  38 +
  39 +output_dir = options.output_dir
  40 +if os.path.isfile(output_dir):
  41 + raise OSError("A file with the same name as the desired dir, " \
  42 + "'%s', already exists." % output_dir)
  43 +elif not(os.path.isdir(output_dir)):
  44 + os.makedirs(output_dir)
  45 +
  46 +
  47 +## Process files
  48 +
  49 +adwait_tagged_filename = args[0]
  50 +test_sentences = file(args[1])
  51 +output_file = open(args[2], "w")
  52 +
  53 +# Use the gold trees to produce tagged sentences in Adwait's format
  54 +# with underscore separator.
  55 +#
  56 +# Note: any underscores in the tags themselves will be converted to
  57 +# +us+ metacharacters. These get unconverted at the end.
  58 +#os.system("python %s/python/parse_to_sentence.py -t -f Adwait -s -d %s > %s"
  59 +# % (dbparser_dir, tree_filename, adwait_tagged_filename))
  60 +
  61 +model_filename = output_dir+"/model.bin.gz"
  62 +
  63 +
  64 +# Make a tag dictionary
  65 +tag_dictionary_filename = output_dir+"/tag_dict"
  66 +os.system("python %s/python/create_tag_dictionary.py -s _ %s > %s"
  67 + % (dbparser_dir, adwait_tagged_filename, tag_dictionary_filename))
  68 +
  69 +# Train the tagger
  70 +os.system("%s/bin/otk_train_tagger.sh -dict %s %s %s &> /dev/null"
  71 + % (otk_dir, tag_dictionary_filename, adwait_tagged_filename, model_filename))
  72 +
  73 +sentences_to_tag_filename = output_dir+"/to_tag.txt"
  74 +
  75 +# Keep only the surface form on each word line (drop any +stem suffix)
  76 +to_tag_file = open(sentences_to_tag_filename, "w")
  77 +counter = 0
  78 +for sentence in test_sentences:
  79 + if counter % 5 == 0:
  80 + clean = "\t".join([x.split("+")[0] for x in sentence.strip().split("\t")])
  81 + to_tag_file.write(clean+"\n")
  82 + counter += 1
  83 +to_tag_file.close()
  84 +
  85 +tagged_filename = output_dir+"/tagged.txt"
  86 +
  87 +# Run the tagger
  88 +os.system("%s/bin/otk_run_tagger.sh -dict %s -tag_dict %s %s %s > %s"
  89 + % (otk_dir, tag_dictionary_filename, tag_dictionary_filename,
  90 + sentences_to_tag_filename, model_filename, tagged_filename))
  91 +
  92 +
  93 +# Convert tagger output to MST format. Unconvert the +us+
  94 +# metachars back to underscores too (using tagging_util.de_metatize()).
  95 +for tagged_sent in file(tagged_filename):
  96 + words = []
  97 + tags = []
  98 + for word_tag in tagged_sent.split():
  99 + (word,tag) = tagging_util.split_item(word_tag, "_")
  100 + words.append(word)
  101 + tags.append(tagging_util.de_metatize(tag,"_","+us+"))
  102 + output_file.write("\t".join(words)+"\n")
  103 + output_file.write("\t".join(tags)+"\n")
  104 +
  105 +output_file.close()
... ...
disambiguation/mstparser/build.sh 0 → 100644
  1 +#!/bin/sh
  2 +
  3 +echo
  4 +echo "MST Parser Build System"
  5 +echo "-------------------"
  6 +echo
  7 +
  8 +if [ "$JAVA_HOME" = "" ] ; then
  9 + echo "ERROR: JAVA_HOME not found in your environment."
  10 + echo
  11 + echo "Please, set the JAVA_HOME variable in your environment to match the"
  12 + echo "location of the Java Virtual Machine you want to use."
  13 + exit 1
  14 +fi
  15 +
  16 +if [ `echo $OSTYPE | grep -n cygwin` ]; then
  17 + PS=";"
  18 +else
  19 + PS=":"
  20 +fi
  21 +
  22 +LOCALCLASSPATH=$JAVA_HOME/lib/tools.jar
  23 +# add in the dependency .jar files
  24 +DIRLIBS=lib/*.jar
  25 +for i in ${DIRLIBS}
  26 +do
  27 + if [ "$i" != "${DIRLIBS}" ] ; then
  28 + LOCALCLASSPATH=$LOCALCLASSPATH${PS}"$i"
  29 + fi
  30 +done
  31 +ANT_HOME=./lib
  32 +
  33 +echo Building with classpath $LOCALCLASSPATH
  34 +echo
  35 +
  36 +echo Starting Ant...
  37 +echo
  38 +
  39 +$JAVA_HOME/bin/java -Dant.home=$ANT_HOME -classpath $LOCALCLASSPATH org.apache.tools.ant.Main $*
... ...
disambiguation/mstparser/build.xml 0 → 100644
  1 +<!-- $Id: build.xml 138 2013-09-10 10:02:43Z wyldfire $ -->
  2 +<!-- Copyright (C) 2007 Ryan McDonald -->
  3 +<project default="compile" basedir=".">
  4 +
  5 + <!-- =================================================================== -->
  6 + <!-- Initialization target -->
  7 + <!-- =================================================================== -->
  8 + <target name="init">
  9 + <tstamp/>
  10 + <property name="Name" value="MSTParser"/>
  11 + <property name="name" value="mstparser"/>
  12 + <property name="year" value="2013"/>
  13 + <property name="version" value="0.5.1"/>
  14 +
  15 + <echo message="----------- ${Name} ${version} [${year}] ------------"/>
  16 +
  17 + <property name="debug" value="on"/>
  18 + <property name="optimize" value="off"/>
  19 + <property name="deprecation" value="on"/>
  20 +
  21 + <property name="src.dir" value="./src/main/java"/>
  22 + <property name="lib.dir" value="./lib"/>
  23 + <property name="packages" value="mstparser.*"/>
  24 +
  25 + <property name="build.dir" value="./output"/>
  26 + <property name="build.dest" value="./output/classes"/>
  27 + <property name="build.javadocs" value="./docs/api"/>
  28 +
  29 + <filter token="year" value="${year}"/>
  30 + <filter token="version" value="${version}"/>
  31 + <filter token="date" value="${TODAY}"/>
  32 + <filter token="log" value="true"/>
  33 + <filter token="verbose" value="true"/>
  34 +
  35 + <path id="build.classpath">
  36 + <fileset dir="${lib.dir}/">
  37 + <include name="*.jar"/>
  38 + </fileset>
  39 + </path>
  40 + </target>
  41 +
  42 +
  43 + <!-- =================================================================== -->
  44 + <!-- Help on usage -->
  45 + <!-- =================================================================== -->
  46 + <target name="usage">
  47 + <echo message=""/>
  48 + <echo message=""/>
  49 + <echo message="MST Parser build file"/>
  50 + <echo message="-------------------------------------------------------------"/>
  51 + <echo message=""/>
  52 + <echo message=" Available targets are:"/>
  53 + <echo message=""/>
  54 + <echo message=" package --> generates the mstparser.jar file"/>
  55 + <echo message=" compile --> compiles the source code (default)"/>
  56 + <echo message=" javadoc --> generates the API documentation"/>
  57 + <echo message=" clean --> cleans up the compilation directory"/>
  58 + <echo message=""/>
  59 + <echo message=" See the comments inside the build.xml file for more details."/>
  60 + <echo message="-------------------------------------------------------------"/>
  61 + <echo message=""/>
  62 + <echo message=""/>
  63 + </target>
  64 +
  65 +
  66 + <!-- =================================================================== -->
  67 + <!-- Prepares the build directories -->
  68 + <!-- =================================================================== -->
  69 + <target name="prepare" depends="init">
  70 + <!-- create directories -->
  71 + <mkdir dir="${build.dir}"/>
  72 + <mkdir dir="${build.dest}"/>
  73 + </target>
  74 +
  75 +
  76 + <!-- =================================================================== -->
  77 + <!-- Compiles the source directory -->
  78 + <!-- =================================================================== -->
  79 + <target name="compile"
  80 + depends="prepare"
  81 + description="compiles the source code (default)">
  82 + <javac srcdir="${src.dir}"
  83 + destdir="${build.dest}"
  84 + debug="${debug}"
  85 + deprecation="${deprecation}"
  86 + classpathref="build.classpath"
  87 + optimize="${optimize}">
  88 + <!-- <compilerarg line="-Xlint:unchecked"/> -->
  89 + </javac>
  90 + </target>
  91 +
  92 +
  93 + <!-- =================================================================== -->
  94 + <!-- Creates the class package -->
  95 + <!-- =================================================================== -->
  96 + <target name="package"
  97 + depends="compile"
  98 + description="generates the mstparser.jar file">
  99 + <jar jarfile="${build.dir}/${name}.jar">
  100 + <fileset dir="${build.dest}" includes="**"/>
  101 + </jar>
  102 + </target>
  103 +
  104 +
  105 + <!-- =================================================================== -->
  106 + <!-- Creates the release file -->
  107 + <!-- =================================================================== -->
  108 + <target name="release" depends="clean,cleandocs">
  109 + <tar tarfile="${name}-${version}-src.tar"
  110 + basedir="../"
  111 + includes="${name}/**"
  112 + excludes="**/CVS **/*forest testbed/my*" />
  113 + <gzip src="${name}-${version}-src.tar"
  114 + zipfile="../${name}-${version}-src.tgz" />
  115 + <delete file="${name}-${version}-src.tar" />
  116 + </target>
  117 +
  118 + <!-- =================================================================== -->
  119 + <!-- Creates the homepage -->
  120 + <!-- =================================================================== -->
  121 + <target name="homepage"
  122 + depends="init,javadoc"
  123 + description="generates the API documentation">
  124 + <tar tarfile="${name}-homepage.tar"
  125 + basedir="./docs/"
  126 + includes="**"
  127 + excludes="**/CVS" />
  128 + <gzip src="${name}-homepage.tar"
  129 + zipfile="${build.dir}/${name}-homepage.tgz" />
  130 + <delete file="${name}-homepage.tar" />
  131 + </target>
  132 +
  133 +
  134 + <!-- =================================================================== -->
  135 + <!-- Creates the API documentation -->
  136 + <!-- =================================================================== -->
  137 + <target name="javadoc" depends="prepare">
  138 + <mkdir dir="${build.javadocs}"/>
  139 + <javadoc packagenames="${packages}"
  140 + sourcepath="${src.dir}"
  141 + destdir="${build.javadocs}"
  142 + author="true"
  143 + version="true"
  144 + use="true"
  145 + splitindex="true"
  146 + noindex="false"
  147 + windowtitle="${name}"
  148 + doctitle="The ${Name} API v${version}"
  149 + bottom="Copyright &#169; ${year} Ryan McDonald and Jason Baldridge. All Rights Reserved."
  150 + />
  151 + </target>
  152 +
  153 +
  154 + <!-- =================================================================== -->
  155 + <!-- Cleans targets -->
  156 + <!-- =================================================================== -->
  157 + <target name="clean"
  158 + depends="init"
  159 + description="cleans up the directory">
  160 + <delete dir="${build.dir}"/>
  161 + <delete file="${lib.dir}/${name}.jar" />
  162 + </target>
  163 +
  164 + <target name="cleandocs" depends="init" description="cleans up the API docs directory">
  165 + <delete dir="${build.javadocs}"/>
  166 + </target>
  167 +
  168 +</project>
  169 +
  170 +<!-- End of file -->
... ...