Commit a5c6bd60568948f147fa18d4b06ced8431f08dce

Authored by adamm
1 parent 60c228bb

refactoring

Showing 71 changed files with 31203 additions and 169 deletions

Too many changes to show.

To preserve performance, only 21 of 71 files are displayed.

disambiguation/ENIAM_EdgeScore.ml deleted
1   -open Xstd
2   -open ENIAM_LCGtypes
3   -open Yojson
4   -
5   -module MST_Model : sig
6   - type mst_model = {
7   - typeAlphabet: int StringMap.t;
8   - dataAlphabet: int StringMap.t;
9   - parameters: float array}
10   - val read_model: string -> mst_model
11   - val empty: mst_model
12   - exception MalformedModelJson
13   -end
14   -= struct
15   - type mst_model = {
16   - typeAlphabet: int StringMap.t;
17   - dataAlphabet: int StringMap.t;
18   - parameters: float array}
19   -
20   - let empty = {typeAlphabet = StringMap.empty;
21   - dataAlphabet = StringMap.empty;
22   - parameters = Array.make 0 0.0}
23   - exception MalformedModelJson
24   -
25   - let construct_data_alphabet keys =
26   - let counter = ref 0 in
27   - let map = ref StringMap.empty in
28   - let length = Array.length keys in
29   - for i = 0 to length -1 do
30   - map := StringMap.add !map keys.(i) !counter;
31   - counter := !counter + 1;
32   - done;
33   - !map
34   -
35   - let construct_type_alphabet = construct_data_alphabet
36   -
37   - let read_model fname =
38   - let data = Basic.from_file fname in
39   - let open Yojson.Basic.Util in
40   - let unwrapList = function
41   - `List l -> l
42   - | _ -> raise MalformedModelJson in
43   - let dataA = data |> member "dataAlphabet" |> unwrapList |> filter_string in
44   - let typeA = data |> member "typeAlphabet" |> unwrapList |> filter_string in
45   - let params = data |> member "parameters" |> unwrapList |> filter_float in
46   - {typeAlphabet = Array.of_list typeA |> construct_type_alphabet;
47   - dataAlphabet = Array.of_list dataA |> construct_data_alphabet;
48   - parameters = Array.of_list params}
49   -end
50   -open MST_Model
51   -
52   -let model = ref MST_Model.empty
53   -
54   -let initialize () =
55   - model := MST_Model.read_model "dep.model.json";
56   - ()
57   -
58   -exception UnsupportedLinearTerm of linear_term
59   -exception EmptyVariant
60   -
61   -let add_feature str (fv: IntSet.t) =
62   - if StringMap.mem !model.dataAlphabet str then
63   - IntSet.add fv (StringMap.find !model.dataAlphabet str)
64   - else
65   - fv
66   -
67   -let score_fv (fv:IntSet.t) =
68   - IntSet.fold fv 0.0 (fun score i -> score +. !model.parameters.(i))
69   -
70   -let apply_features features fv =
71   - List.fold_left (|>) fv features
72   -
73   -let add_linear_features f_type (obs: string array) first second distStr fv =
74   - fv
75   -
76   -let add_two_obs_features prefix item1F1 item1F2 item2F1 item2F2 distStr fv =
77   - let add_diststr str = [str; str^"*"^distStr] in
78   - let flist = List.map ((^) prefix)[
79   - "2FF1="^item1F1;
80   - "2FF1="^item1F1^" "^item1F2;
81   - "2FF1="^item1F1^" "^item1F2^" "^item2F2;
82   - "2FF1="^item1F1^" "^item1F2^" "^item2F2^" "^item2F1;
83   - "2FF2="^item1F1^" "^item2F1;
84   - "2FF3="^item1F1^" "^item2F2;
85   - "2FF4="^item1F2^" "^item2F1^" "^item2F2;
86   - "2FF5="^item1F2^" "^item2F2;
87   - "2FF6="^item2F1^" "^item2F2;
88   - "2FF7="^item1F2;
89   - "2FF8="^item2F1;
90   - "2FF9="^item2F2;
91   - ] in
92   - let funs = List.map (add_feature) (List.flatten (List.map add_diststr flist)) in
93   - apply_features funs fv
94   -
95   -type disamb_info = {
96   - tree: linear_term array
97   -}
98   -
99   -let score_edge (data: disamb_info) (parent: node) (child: node) =
100   - let fv = IntSet.empty in
101   - let fv = add_two_obs_features "HC"
102   - parent.orth parent.pos child.orth child.pos "" fv in
103   - score_fv fv
104   -
105   -let rec fill_dep_edges_array
106   - (data: disamb_info) parent (scores: float IntMap.t) =
107   - function
108   - Dot -> scores
109   - | Ref i -> (match data.tree.(i) with
110   - Node child -> IntMap.add scores i (score_edge data parent child)
111   - | _ as x -> raise (UnsupportedLinearTerm x))
112   - | Tuple l -> List.fold_left (fill_dep_edges_array data parent) scores l
113   - | Variant (_, l) -> List.fold_left
114   - (fill_dep_edges_array data parent)
115   - scores (List.map snd l)
116   - | _ as x -> raise (UnsupportedLinearTerm x)
117   -
118   -let rec disambiguate_args edge_scores =
119   - function
120   - Dot -> Dot, 0.0
121   - | Ref i -> Ref i, IntMap.find edge_scores i
122   - | Tuple l ->
123   - let (terms, scores) =
124   - List.map (disambiguate_args edge_scores) l |> List.split in
125   - let num = List.length scores |> float_of_int in
126   - Tuple terms, (List.fold_left (+.) 0.0 scores) /. num
127   - | Variant (lab, l) ->
128   - let (lbs, terms) = List.split l in
129   - let new_terms_scores = List.map (disambiguate_args edge_scores) terms in
130   - let select_best (term, score) (new_term, new_score) =
131   - if new_score > score then
132   - new_term, new_score
133   - else
134   - term, score in
135   - List.fold_left select_best (List.hd new_terms_scores) (List.tl new_terms_scores)
136   - | _ as x -> raise (UnsupportedLinearTerm x)
137   -
138   -(* disambiguation of a single node's arguments using a greedy algorithm *)
139   -let disambiguate_node (data: disamb_info) parentI =
140   - let parent = match data.tree.(parentI) with
141   - Node node -> node
142   - | _ as x -> raise (UnsupportedLinearTerm x) in
143   - let edge_scores = fill_dep_edges_array
144   - data parent IntMap.empty (parent.args) in
145   - let (new_term, _) = disambiguate_args edge_scores (parent.args) in
146   - Node {parent with args = new_term}
147   -
148   -let disambiguate_tree tree =
149   - let tree2 = Array.copy tree in
150   - let data : disamb_info = {tree = tree} in
151   - let update parentI _ =
152   - (let new_term = disambiguate_node data parentI in
153   - tree2.(parentI) <- new_term;) in
154   - Array.iteri update tree; tree2
disambiguation/ENIAMmstDisambiguation.ml 0 → 100644
  1 +open Xstd
  2 +open ENIAM_LCGtypes
  3 +open ENIAMmstModel
  4 +open ENIAMmstFeatures
  5 +
  6 +let initialize () =
  7 + MST_Model.initialize "dep.model.json";
  8 + ()
  9 +
  10 +exception UnsupportedLinearTerm of linear_term
  11 +exception EmptyVariant
  12 +
  13 +let rec fill_dep_edges_array
  14 + (data: disamb_info) parent (scores: float IntMap.t) =
  15 + function
  16 + Dot -> scores
  17 + | Ref i -> IntMap.add scores i (score_edge data parent data.tree.(i))
  18 + | Tuple l -> List.fold_left (fill_dep_edges_array data parent) scores l
  19 + | Variant (_, l) -> List.fold_left
  20 + (fill_dep_edges_array data parent)
  21 + scores (List.map snd l)
  22 + | _ as x -> raise (UnsupportedLinearTerm x)
  23 +
  24 +let rec disambiguate_args edge_scores =
  25 + function
  26 + Dot -> Dot, 0.0
  27 + | Ref i -> Ref i, IntMap.find edge_scores i
  28 + | Tuple l ->
  29 + let (terms, scores) =
  30 + List.map (disambiguate_args edge_scores) l |> List.split in
  31 + let num = List.length scores |> float_of_int in
  32 + Tuple terms, (List.fold_left (+.) 0.0 scores) /. num
  33 + | Variant (lab, l) ->
  34 + let (lbs, terms) = List.split l in
  35 + let new_terms_scores = List.map (disambiguate_args edge_scores) terms in
  36 + let select_best (term, score) (new_term, new_score) =
  37 + if new_score >= score then
  38 + new_term, new_score
  39 + else
  40 + term, score in
  41 + List.fold_left select_best (List.hd new_terms_scores) (List.tl new_terms_scores)
  42 + | _ as x -> raise (UnsupportedLinearTerm x)
  43 +
  44 +(* disambiguation of a single node's arguments using a greedy algorithm *)
  45 +let disambiguate_node (data: disamb_info) parent =
  46 + let edge_scores = fill_dep_edges_array
  47 + data parent IntMap.empty (parent.args) in
  48 + let (new_term, _) = disambiguate_args edge_scores (parent.args) in
  49 + {parent with args = new_term}
  50 +
  51 +let disambiguate_tree (tree: linear_term array) =
  52 + let extract_node = (function
  53 + Node node -> node
  54 + | _ as x -> UnsupportedLinearTerm x |> raise) in
  55 + let data : disamb_info = {tree = Array.map extract_node tree} in
  56 + let disambiguate term = Node (extract_node term |> disambiguate_node data) in
  57 + Array.map disambiguate tree
... ...
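
A minimal driver sketch for the new module (hypothetical code, not part of
this commit; it assumes the ENIAM libraries above are installed and that
dep.model.json sits in the working directory). Note the greedy step:
disambiguate_args averages scores over Tuple components and keeps only the
best-scoring alternative of each Variant.

    (* Hypothetical usage sketch for ENIAMmstDisambiguation. *)
    open ENIAM_LCGtypes

    let () =
      (* Loads dep.model.json, the path hard-coded in initialize above. *)
      ENIAMmstDisambiguation.initialize ();
      (* In practice the tree comes from the LCG parser; an empty tree
         keeps the sketch self-contained. *)
      let tree : linear_term array = [||] in
      let _disambiguated = ENIAMmstDisambiguation.disambiguate_tree tree in
      ()
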
disambiguation/ENIAMmstFeatures.ml 0 → 100644
  1 +open ENIAMmstModel
  2 +open ENIAM_LCGtypes
  3 +
  4 +type disamb_info = {
  5 + tree: node array
  6 +}
  7 +
  8 +let apply_features features fv =
  9 + List.fold_left (|>) fv features
  10 +
  11 +let add_linear_features f_type (obs: string array) first second distStr fv =
  12 + fv
  13 +
  14 +let add_two_obs_features prefix item1F1 item1F2 item2F1 item2F2 distStr fv =
  15 + let add_diststr str = [str; str^"*"^distStr] in
  16 + let flist = List.map ((^) prefix)[
  17 + "2FF1="^item1F1;
  18 + "2FF1="^item1F1^" "^item1F2;
  19 + "2FF1="^item1F1^" "^item1F2^" "^item2F2;
  20 + "2FF1="^item1F1^" "^item1F2^" "^item2F2^" "^item2F1;
  21 + "2FF2="^item1F1^" "^item2F1;
  22 + "2FF3="^item1F1^" "^item2F2;
  23 + "2FF4="^item1F2^" "^item2F1^" "^item2F2;
  24 + "2FF5="^item1F2^" "^item2F2;
  25 + "2FF6="^item2F1^" "^item2F2;
  26 + "2FF7="^item1F2;
  27 + "2FF8="^item2F1;
  28 + "2FF9="^item2F2;
  29 + ] in
  30 + let funs = List.map (MST_Model.add_feature) (List.flatten (List.map add_diststr flist)) in
  31 + apply_features funs fv
  32 +
  33 +let score_edge (data: disamb_info) (parent: node) (child: node) =
  34 + let fv = MST_Model.empty_fv in
  35 + let fv = add_two_obs_features "HC"
  36 + parent.orth parent.pos child.orth child.pos "" fv in
  37 + MST_Model.score_fv fv
... ...
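
To make the feature templates concrete, here is a sketch (hypothetical
head/child values) of the strings add_two_obs_features produces under the
"HC" prefix used by score_edge; add_diststr additionally emits each template
with a "*" ^ distStr suffix.

    (* For parent orth="ma", pos="fin" and child orth="kota", pos="subst"
       (hypothetical inputs), the 2FF1/2FF2/2FF9 templates expand to: *)
    let _example_features = [
      "HC2FF1=ma";                (* head orth *)
      "HC2FF1=ma fin";            (* + head pos *)
      "HC2FF1=ma fin subst";      (* + child pos *)
      "HC2FF1=ma fin subst kota"; (* + child orth *)
      "HC2FF2=ma kota";           (* head orth, child orth *)
      "HC2FF9=subst";             (* child pos alone *)
    ] (* 2FF3..2FF8 follow the same pattern. *)
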
disambiguation/ENIAMmstModel.ml 0 → 100644
  1 +open Yojson
  2 +open Xstd
  3 +
  4 +
  5 +module MST_Model : sig
  6 + type mst_model
  7 + type feature_vector_t
  8 + exception MalformedModelJson
  9 +
  10 + val read_model: string -> mst_model
  11 + val initialize: string -> unit
  12 + val add_feature: string -> feature_vector_t -> feature_vector_t
  13 + val score_fv: feature_vector_t -> float
  14 + val empty_fv: feature_vector_t
  15 +end
  16 += struct
  17 + type feature_vector_t = IntSet.t
  18 +
  19 + type mst_model = {
  20 + typeAlphabet: int StringMap.t;
  21 + dataAlphabet: int StringMap.t;
  22 + parameters: float array}
  23 +
  24 +
  25 + exception MalformedModelJson
  26 +
  27 + let model = ref {typeAlphabet = StringMap.empty;
  28 + dataAlphabet = StringMap.empty;
  29 + parameters = Array.make 0 0.0}
  30 +
  31 + let empty_fv = IntSet.empty
  32 +
  33 + let add_feature str (fv: feature_vector_t) =
  34 + if StringMap.mem !model.dataAlphabet str then
  35 + IntSet.add fv (StringMap.find !model.dataAlphabet str)
  36 + else
  37 + fv
  38 +
  39 + let score_fv (fv: feature_vector_t) =
  40 + IntSet.fold fv 0.0 (fun score i -> score +. !model.parameters.(i))
  41 +
  42 + let construct_data_alphabet keys =
  43 + let counter = ref 0 in
  44 + let map = ref StringMap.empty in
  45 + let length = Array.length keys in
  46 + for i = 0 to length -1 do
  47 + map := StringMap.add !map keys.(i) !counter;
  48 + counter := !counter + 1;
  49 + done;
  50 + !map
  51 +
  52 + let construct_type_alphabet = construct_data_alphabet
  53 +
  54 + let read_model fname =
  55 + let data = Basic.from_file fname in
  56 + try
  57 + let open Yojson.Basic.Util in
  58 + let unwrapList = function
  59 + `List l -> l
  60 + | _ -> raise MalformedModelJson in
  61 + let dataA = data |> member "dataAlphabet" |> unwrapList |> filter_string in
  62 + let typeA = data |> member "typeAlphabet" |> unwrapList |> filter_string in
  63 + let params = data |> member "parameters" |> unwrapList |> filter_float in
  64 + {typeAlphabet = Array.of_list typeA |> construct_type_alphabet;
  65 + dataAlphabet = Array.of_list dataA |> construct_data_alphabet;
  66 + parameters = Array.of_list params}
  67 + with
  68 + _ -> raise MalformedModelJson
  69 +
  70 + let initialize fname =
  71 + model := read_model fname;
  72 + ()
  73 +end
... ...
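
read_model expects a JSON object with string lists "dataAlphabet" and
"typeAlphabet" plus a float list "parameters"; construct_data_alphabet
numbers each string by its position, so score_fv is simply the sum of the
weights of the alphabet features present in the vector. A round-trip sketch
(hypothetical file name and contents):

    open ENIAMmstModel

    (* tiny.model.json (hypothetical):
       {"dataAlphabet": ["HC2FF1=ma", "HC2FF7=fin"],
        "typeAlphabet": [],
        "parameters": [0.5, -0.25]} *)
    let _score_demo () =
      MST_Model.initialize "tiny.model.json";
      MST_Model.empty_fv
      |> MST_Model.add_feature "HC2FF1=ma" (* known: index 0, weight 0.5 *)
      |> MST_Model.add_feature "unseen=x"  (* not in dataAlphabet: dropped *)
      |> MST_Model.score_fv                (* evaluates to 0.5 *)
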
disambiguation/makefile
... ... @@ -6,27 +6,27 @@ OCAMLFLAGS=$(INCLUDES) -g
6 6 OCAMLOPTFLAGS=$(INCLUDES) unix.cmxa xml-light.cmxa str.cmxa nums.cmxa zip.cmxa bz2.cmxa xlib.cmxa eniam-lcg-parser.cmxa yojson.cmx
7 7 INSTALLDIR=`ocamlc -where`/eniam
8 8  
9   -SOURCES= ENIAM_EdgeScore.ml
  9 +SOURCES=ENIAMmstModel.ml ENIAMmstFeatures.ml ENIAMmstDisambiguation.ml
10 10  
11   -all: eniam-edge-score.cma eniam-edge-score.cmxa
  11 +all: eniam-mst-disambiguation.cma eniam-mst-disambiguation.cmxa
12 12  
13 13 install: all
14 14 mkdir -p $(INSTALLDIR)
15   - cp eniam-edge-score.cmxa eniam-edge-score.a eniam-edge-score.cma $(INSTALLDIR)
16   - cp ENIAM_EdgeScore.cmi $(INSTALLDIR)
17   - cp ENIAM_EdgeScore.cmx $(INSTALLDIR)
  15 + cp eniam-mst-disambiguation.cmxa eniam-mst-disambiguation.a eniam-mst-disambiguation.cma $(INSTALLDIR)
  16 + cp ENIAMmstDisambiguation.cmi ENIAMmstModel.cmi ENIAMmstFeatures.cmi $(INSTALLDIR)
  17 + cp ENIAMmstDisambiguation.cmx ENIAMmstModel.cmx ENIAMmstFeatures.cmx $(INSTALLDIR)
18 18  
19 19 install-local: all
20 20 mkdir -p $(INSTALLDIR)
21   - cp eniam-edge-score.cmxa eniam-edge-score.a eniam-edge-score.cma $(INSTALLDIR)
22   - cp ENIAM_EdgeScore.cmi $(INSTALLDIR)
23   - cp ENIAM_EdgeScore.cmx $(INSTALLDIR)
  21 + cp eniam-mst-disambiguation.cmxa eniam-mst-disambiguation.a eniam-mst-disambiguation.cma $(INSTALLDIR)
  22 + cp ENIAMmstDisambiguation.cmi ENIAMmstModel.cmi ENIAMmstFeatures.cmi $(INSTALLDIR)
  23 + cp ENIAMmstDisambiguation.cmx ENIAMmstModel.cmx ENIAMmstFeatures.cmx $(INSTALLDIR)
24 24  
25   -eniam-edge-score.cma: $(SOURCES)
26   - ocamlc -linkall -a -o eniam-edge-score.cma $(OCAMLFLAGS) $^
  25 +eniam-mst-disambiguation.cma: $(SOURCES)
  26 + ocamlc -linkall -a -o eniam-mst-disambiguation.cma $(OCAMLFLAGS) $^
27 27  
28   -eniam-edge-score.cmxa: $(SOURCES)
29   - ocamlopt -linkall -a -o eniam-edge-score.cmxa $(INCLUDES) $^
  28 +eniam-mst-disambiguation.cmxa: $(SOURCES)
  29 + ocamlopt -linkall -a -o eniam-mst-disambiguation.cmxa $(INCLUDES) $^
30 30  
31 31 test: test.ml
32 32 mkdir -p results
... ...
disambiguation/mstparser/.gitignore 0 → 100644
  1 +.idea/
  2 +out/
  3 +*.model
  4 +*.iml
  5 +*.json
... ...
disambiguation/mstparser/ALT_README 0 → 100644
  1 +Introduction
  2 +============
  3 +
  4 +This file contains the configuration and build instructions for using
  5 +Apache Ant (http://ant.apache.org) to build MSTParser, and for setting
  6 +up your environment to use the scripts in the mstparser/bin
  7 +directory. All the instructions in the original README file should
  8 +continue to work as before -- this document describes an optional way
  9 +of compiling and using MSTParser with a few more bells and whistles.
  10 +
  11 +
  12 +Configuring your environment variables
  13 +======================================
  14 +
  15 +The easiest thing to do is to set the environment variables JAVA_HOME
  16 +and MSTPARSER_DIR to the relevant locations on your system. Set JAVA_HOME
  17 +to match the top level directory containing the Java installation you
  18 +want to use. Note that version 1.5 of the Java 2 SDK is required.
  19 +
  20 +For example, on Windows:
  21 +
  22 +C:\> set JAVA_HOME=C:\jdk1.5.0_04
  23 +
  24 +or on Unix:
  25 +
  26 +% setenv JAVA_HOME /usr/local/java
  27 + (csh)
  28 +> JAVA_HOME=/usr/java; export JAVA_HOME
  29 + (ksh, bash)
  30 +
  31 +On Windows, to get these settings to persist, it's actually easiest to
  32 +set your environment variables through the System Properties from the
  33 +Control Panel. For example, under WinXP, go to Control Panel, click on
  34 +System Properties, choose the Advanced tab, click on Environment
  35 +Variables, and add your settings in the User variables area.
  36 +
  37 +Next, likewise set MSTPARSER_DIR to be the top level directory where you
  38 +unzipped the download. In Unix, type 'pwd' in the directory where
  39 +this file is and use the path given to you by the shell as
  40 +MSTPARSER_DIR. You can set this in the same manner as for JAVA_HOME
  41 +above.
  42 +
  43 +Next, add the directory MSTPARSER_DIR/bin to your path. For example, you
  44 +can set the path in your .bashrc file as follows:
  45 +
  46 +export PATH=$PATH:$MSTPARSER_DIR/bin
  47 +
  48 +Once you have taken care of these three things, you should be able to
  49 +build and use MSTParser.
  50 +
  51 +
  52 +Building the system
  53 +===================
  54 +
  55 +The MSTParser build system is based on Apache Ant.
  56 +Ant is a little but very handy tool that uses a build file written in
  57 +XML (build.xml) as building instructions.
  58 +
  59 +To build the code, first make sure your current working
  60 +directory is where the build.xml file is located. Then type:
  61 +
  62 + sh build.sh (Unix)
  63 +
  64 +If everything is right and all the required packages are visible, this
  65 +action will generate a file called mstparser.jar in the ./output
  66 +directory, and Java class files in ./output/classes.
  67 +
  68 +
  69 +Build targets
  70 +=============
  71 +
  72 +These are the meaningful targets for the main build file:
  73 +
  74 + package --> generates the openccg.jar file (default)
  75 + compile --> compiles the source code
  76 + javadoc --> generates the API documentation
  77 + clean --> cleans up the compilation directory
  78 +
  79 +There are also build files in each sample grammar directory.
  80 +
  81 +To learn the details of what each target does, read the build.xml file.
  82 +It is quite understandable.
  83 +
  84 +
  85 +Trying it out
  86 +=============
  87 +
  88 +If you've managed to configure and build the system, you should be
  89 +able to run mstparser as described in the README, but without some of
  90 +the extra classpath and memory options, and you should be able to do
  91 +so from anywhere on your directory system.
  92 +
  93 +If you have trouble starting up any of the scripts, make sure you have set
  94 +the environment variables properly, and that the scripts (located in
  95 +mstparser/bin) call the right shell environment (top-line of the
  96 +script; to solve the problem, either comment out this line or correct
  97 +the path).
  98 +
  99 +Here's a brief description of some of the scripts:
  100 +
  101 +1. The shell script mst_parse.sh is just a simple wrapper that allows
  102 +you to do this:
  103 +
  104 +> mst_parse.sh \
  105 + train train-file:data/train.ulab model-name:dep.model \
  106 + test test-file:data/test.ulab output-file:out.txt \
  107 + eval gold-file:data/test.ulab
  108 +
  109 +instead of this (as described in the readme):
  110 +
  111 +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \
  112 + train train-file:data/train.ulab model-name:dep.model \
  113 + test test-file:data/test.ulab output-file:out.txt \
  114 + eval gold-file:data/test.ulab
  115 +
  116 +
  117 +[NOTE: actually, if you want to run MSTParser the latter way and you
  118 +build MSTParser using Ant (via build.sh), then your Java class files
  119 +will be contained in ./output/classes rather than ./mstparser, so the
  120 +classpath would need to be "./output/classes:lib/trove.jar". The
  121 +mst_parse.sh script takes care of this and sets the classpath
  122 +appropriately.]
  123 +
  124 +2. The shell script mst_score.sh is just an easy way to call upon the
  125 +main method of the mstparser.DependencyEvaluator class. Call it as
  126 +such:
  127 +
  128 +> mst_score.sh <gold_standard_dependency_file> <parser_output_dependency_file> <format>
  129 +
  130 +where <format> is either MST or CONLL (default CONLL). Here's a concrete example:
  131 +
  132 +> mst_score.sh data/portuguese/floresta_test.conll testbed/my_floresta_parses.conll CONLL
  133 +
  134 +
  135 +See the following site for more information on the CONLL format (as
  136 +well as other dependency parsers and info, etc):
  137 +
  138 +http://nextens.uvt.nl/~conll/
  139 +
  140 +3. The Python program mst_experiment.py is a more involved wrapper that
  141 +allows one to easily do a randomized ten-fold cross-validation while improving
  142 +performance on development data. It manages the various files that
  143 +MSTParser produces and keeps them tightly contained into a single
  144 +output directory. It also has hooks for using a part-of-speech tagger
  145 +(see pos_tag.py, which calls on the OpenNLP POS Tagger and assumes it
  146 +is installed).
  147 +
  148 +You can see the options by running:
  149 +
  150 +> mst_experiment.py --help
  151 +
  152 +Here's an example to do an 8-fold cross-validation using the
  153 +non-projective algorithm:
  154 +
  155 +> mst_experiment.py -f 8 -o experiment1 -d non-proj data/train.lab
  156 +
  157 +If you find it useful, that's great -- but before diving into it, you
  158 +should be aware that you may have to hack the Python a bit for your
  159 +own needs. It is actually put together from a program previously used to
  160 +interface with the Bikel parser, so there may be some extraneous
  161 +options and code hanging around.
  162 +
  163 +Note: Do NOT ask Ryan for any help with this Python script. Direct any
  164 +questions to Jason Baldridge instead (see email below), and even then
  165 +don't count on a rapid response.
  166 +
  167 +4. The Python script pos_tag.py calls on an unreleased version of the
  168 +OpenNLP tagger, and is left here only as an example that might help
  169 +you develop a similar script for other taggers. If you want the
  170 +OpenNLP tagger, let Jason know and he will consider packaging it up
  171 +with the parser more cleanly.
  172 +
  173 +5. The Python script mst2conll.py can be used to convert your existing
  174 +MST format files to CONLL format. The script conll2mst.py converts
  175 +CONLL formatted files into MST format.
  176 +
  177 +6. The Python script create_baseline.py creates a right or left
  178 +linking baseline. Default is to create left linking -- use the option
  179 +-r for right linking. Currently, it uses MST format for input and
  180 +output, so you'll need to do some conversion if you have dependency
  181 +files in CONLL format. (This script was adapted from one written by
  182 +Ben Wing.)
  183 +
  184 +
  185 +
  186 +Bug Reports
  187 +===========
  188 +
  189 +See the original README for bug reporting for the system
  190 +itself. Report problems with the Ant build setup, the Python scripts,
  191 +or these instructions to Jason Baldridge (jasonbaldridge@gmail.com).
  192 +
  193 +Also note: if you use Windows and are having problems, you are on your
  194 +own.
  195 +
  196 +
  197 +Special Note
  198 +============
  199 +
  200 +Parts of these instructions and some of the directory structure are
  201 +based on the OpenCCG (openccg.sf.net) project and the JDOM project
  202 +(www.jdom.org).
  203 +
  204 +
... ...
disambiguation/mstparser/ALT_TESTBED 0 → 100644
  1 +=NOTE: This file is for developers -- don't let it confuse you if you are
  2 +just giving MSTParser a spin. Check out README and ALT_README instead.
  3 +
  4 +To test that changes to the code have not messed up previous results,
  5 +do the following.
  6 +
  7 +
  8 +---------------------------------------------------------------------
  9 +1. Parse English in MST format:
  10 +
  11 +Run the parser as such:
  12 +
  13 +> mst_parse.sh format:MST train train-file:data/train.lab model-name:testbed/model test test-file:data/test.lab output-file:testbed/my_english_parses.mst eval gold-file:data/test.lab
  14 +
  15 +Score the results:
  16 +
  17 +> mst_score.sh data/test.lab testbed/my_english_parses.mst MST > testbed/my_english_score.txt
  18 +
  19 +Then compare "english_parses.mst" to "my_english_parses.mst" and "english_score.txt" to
  20 +"my_english_score.txt" -- they should be the same. (diff them)
  21 +
  22 +
  23 +---------------------------------------------------------------------
  24 +2. Parse Portuguese in CONLL format:
  25 +
  26 +> mst_parse.sh format:CONLL train train-file:data/portuguese/floresta_train.conll model-name:testbed/model test test-file:data/portuguese/floresta_test.conll output-file:testbed/my_floresta_parses.conll eval gold-file:data/portuguese/floresta_test.conll
  27 +
  28 +Score the results:
  29 +
  30 +> mst_score.sh data/portuguese/floresta_test.conll testbed/my_floresta_parses.conll CONLL > testbed/my_floresta_score.txt
  31 +
  32 +Compare as with English on the obvious file names.
  33 +
  34 +
  35 +---------------------------------------------------------------------
  36 +3. Parse English with second order model.
  37 +
  38 +Run the parser as such:
  39 +
  40 +> mst_parse.sh format:MST train train-file:data/train.lab model-name:testbed/model test test-file:data/test.lab output-file:testbed/my_english_parses_order2.mst eval gold-file:data/test.lab order:2
  41 +
  42 +Score the results:
  43 +
  44 +> mst_score.sh data/test.lab testbed/my_english_parses_order2.mst MST > testbed/my_english_score_order2.txt
  45 +
  46 +Compare with english_score_order2.txt.
  47 +
  48 +---------------------------------------------------------------------
  49 +4. Parse Portuguese in CONLL format with second order model:
  50 +
  51 +> mst_parse.sh train train-file:data/portuguese/floresta_train.conll test test-file:data/portuguese/floresta_test.conll output-file:out.txt eval gold-file:data/portuguese/floresta_test.conll order:2 decode-type:non-proj
  52 +
... ...
disambiguation/mstparser/CHANGES 0 → 100644
  1 +-----------------------------------------------------------------------
  2 +v0.5.1
  3 +
  4 +- Issue 10 - loadModel() method from DependencyParser should also be
  5 + able to receive an InputStream
  6 +- Issue 9 - Add a method to DependencyParser which returns the Parse
  7 + Trees
  8 +- Issue 7 - Update source folder at ant script
  9 +- Issue 6 - Change visibility of some methods and attributes to
  10 + facilitate wrapping
  11 +- Issue 2 - Convert project to maven
  12 +
  13 +-----------------------------------------------------------------------
  14 +v0.5.0
  15 +
  16 + UNKNOWN
  17 +
  18 +-----------------------------------------------------------------------
  19 +v0.4.3b
  20 +
  21 +- Fixed bug: DependencyInstance serialization was not handling the
  22 + feats. This caused errors when using the non-projective decoder with
  23 + second order. (JMB 4-APR-07)
  24 +
  25 +-----------------------------------------------------------------------
  26 +v0.4.3
  27 +
  28 +- Forest files are created in the tmp directory. Without this, two
  29 + instances of MSTParser being run on the same data set would
  30 + overwrite each other's feature forest files. Also, the forest files
  31 + created in tmp are deleted when the Java VM exits. (JMB, 21-JAN-07).
  32 +
  33 +- Separated out the standard sentential parsing features from extra
  34 + features used for discourse parsing. (JMB, 23-MAR-07)
  35 +
  36 +- Created ParserOptions so that it is easier to pass various options
  37 + between the parser and the pipes. (JMB, 23-MAR-07)
  38 +
  39 +- Fixed bug in serialization of DependencyInstances -- lemmas were not
  40 + being written out, and this caused the 2nd order stuff to
  41 + crash. (JMB 23-MAR-07)
  42 +
  43 +
  44 +-----------------------------------------------------------------------
  45 +v0.4.2
  46 +
  47 +- Results have improved slightly over previous testbed results. This
  48 + may be due to the fact that FeatureVector.dotProduct would have got
  49 + -1 return values on keys not held in the TIntDoubleHashMap for the
  50 + second vector in the previous version of Trove. Now that Trove
  51 + returns 0, this is actually the right behavior in this case. Another
  52 + possible explanation is that there is some minor change in the
  53 + features which are generated. Since the output has changed so
  54 + little, and for the better, I'll leave it at that for now. The
  55 + testbed results and output have been updated to reflect the current
  56 + version. (JMB, 17-JAN-07)
  57 +
  58 +- Uncommented a line in DependencyPipe that removed some features from
  59 + the parsing models in the previous release. (Need to come up with a
  60 + better way of defining different pipes!) (JMB, 17-JAN-07)
  61 +
  62 +- Changed the FeatureVector implementation to be a TLinkedList of
  63 + Feature objects, with two optional sub-FeatureVectors contained
  64 + within. This supports fast concatenation of two FeatureVectors since
  65 + it is no longer necessary to copy entire lists. Also, rather than
  66 + explicitly negating features for the getDistVector() method, a
  67 + boolean value is set that can optionally indicate the second
  68 + sub-FeatureVector as negated. The logic of the other methods then
  69 + preserves the negation (and negation with negation). Again, this
  70 + means we don't have to make copies for this operation. These changes
  71 + led sped up training by a factor of 2 to 4 (depedending on the
  72 + number of features used in the parsing model) and parsing by up to
  73 + 1.5 times. (JMB, 17-JAN-07)
  74 +
  75 +- Updated to Trove v1.1b5. Changed default return value of
  76 + TObjectIntHashMap to be -1 rather than 0, so it is important to use
  77 + the included trove.jar rather than downloading and using one from
  78 + the Trove project. (Note: I tried to update to v2.0a2, but the test
  79 + suites broke with that version. Attempts to sort out the problem
  80 + were unsuccessful, so V1.1b5 will just have to do for now.) (JMB,
  81 + 16-JAN-07)
  82 +
  83 +- Removed addIfNotPresent boolean from lookupIndex in Alphabet since
  84 + it isn't used in MSTParser and it incurs an extra method call and
  85 + boolean check on a very common method. (JMB, 16-JAN-07)
  86 +
  87 +- Added support for relational features, which hold between two
  88 + utterances. These features are defined as an NxN matrix (N=number of
  89 + parsing units) below the main CoNLL format declarations. This is
  90 + mainly introduced for discourse parsing to allow for features like
  91 + whether two parsing units are in the same sentence or paragraph, or
  92 + if they both contain references to the same entity. It can be
  93 + ignored for sentence parsing -- everything continues to work as
  94 + before. (The distance between two units is an example of such a
  95 + feature in sentence parsing, but this can be computed on the fly, so
  96 + it isn't necessary to use such a matrix.) (JMB, 14-JAN-07)
  97 +
  98 +
  99 +-----------------------------------------------------------------------
  100 +v0.4.0
  101 +
  102 +- Cleaned up Pipes considerably; eg, Pipe2O doesn't replicate so much
  103 + code from Pipe. Many of the createFeatureVector methods were renamed
  104 + to things like addCoreFeatures. (JMB)
  105 +
  106 +- If one uses MST format, the creation of posA and the
  107 + 5-character-substring features now are put into dependency instances
  108 + in MSTReader as the coarse POS tags and lemmas, respectively. Then
  109 + in the feature extraction code, rather than creating posA etc on the
  110 + fly, it just references those fields in the dependency
  111 + instance. That way, if you use conll format, you get to use lemma
  112 + and coarse tag values supplied by the annotations. (JMB)
  113 +
  114 +- Can utilize the FEAT1|FEAT2|...|FEATN field of the CONLL format to
  115 + allow arbitrary features. See addCoreFeatures() in the DependencyPipe
  116 + class. (JMB)
  117 +
  118 +-----------------------------------------------------------------------
  119 +v0.2.2
  120 +
  121 +- MSTParser now works with both MST and CONLL formats. Pipes are now
  122 + passed a parameter for which format they use, and they call upon
  123 + Readers and Writers that know how to handle each format. CONLL is
  124 + the default format. (JMB)
  125 +
  126 +- Added a subset of the Portuguese data from CONLL to test the CONLL
  127 + format and to have another data set for the testbed. See TESTBED
  128 + (JMB)
  129 +
  130 +- Included an Ant build system that does some nice things, but which
  131 + can be ignored if make is preferred. Highlights of the additional
  132 + capabilities are: (1) class files are put in a location
  133 + (./output/classes) separate from the .java files; (2) you can get
  134 + javadocs (./doc/api) by running "sh build.sh javadoc"; (3) you can
  135 + make a release with "sh build.sh release". You don't need to install
  136 + anything extra (ant.jar is in ./lib); the only additional steps
  137 + needed to use the Ant build setup is to set the JAVA_HOME and
  138 + MSTPARSER_DIR environment variables appropriately. (JMB)
  139 +
  140 +
  141 +
  142 +
... ...
disambiguation/mstparser/LICENSE 0 → 100644
  1 +This software is Copyright (C) 2005 University of Pennsylvania and
  2 +this software is Copyright (C) 2002, 2003 University of Massachusetts
  3 +Amherst, Department of Computer Science, and is licensed under the
  4 +terms of the Common Public License, Version 1.0 or (at your option)
  5 +any subsequent version.
  6 +
  7 +The license is approved by the Open Source Initiative, and is available
  8 +from their website at http://www.opensource.org.
  9 +
  10 +=====================
  11 +
  12 +Common Public License Version 1.0
  13 +
  14 +THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS COMMON
  15 +PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF
  16 +THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
  17 +
  18 +1. DEFINITIONS
  19 +
  20 +"Contribution" means:
  21 +
  22 +a) in the case of the initial Contributor, the initial code and
  23 +documentation distributed under this Agreement, and
  24 +
  25 +b) in the case of each subsequent Contributor:
  26 +
  27 +i) changes to the Program, and
  28 +
  29 +ii) additions to the Program;
  30 +
  31 +where such changes and/or additions to the Program originate from and
  32 +are distributed by that particular Contributor. A Contribution
  33 +'originates' from a Contributor if it was added to the Program by such
  34 +Contributor itself or anyone acting on such Contributor's
  35 +behalf. Contributions do not include additions to the Program which:
  36 +(i) are separate modules of software distributed in conjunction with
  37 +the Program under their own license agreement, and (ii) are not
  38 +derivative works of the Program.
  39 +
  40 +"Contributor" means any person or entity that distributes the Program.
  41 +
  42 +"Licensed Patents " mean patent claims licensable by a Contributor
  43 +which are necessarily infringed by the use or sale of its Contribution
  44 +alone or when combined with the Program.
  45 +
  46 +"Program" means the Contributions distributed in accordance with this
  47 +Agreement.
  48 +
  49 +"Recipient" means anyone who receives the Program under this
  50 +Agreement, including all Contributors.
  51 +
  52 +2. GRANT OF RIGHTS
  53 +
  54 +a) Subject to the terms of this Agreement, each Contributor hereby
  55 +grants Recipient a non-exclusive, worldwide, royalty-free copyright
  56 +license to reproduce, prepare derivative works of, publicly display,
  57 +publicly perform, distribute and sublicense the Contribution of such
  58 +Contributor, if any, and such derivative works, in source code and
  59 +object code form.
  60 +
  61 +b) Subject to the terms of this Agreement, each Contributor hereby
  62 +grants Recipient a non-exclusive, worldwide, royalty-free patent
  63 +license under Licensed Patents to make, use, sell, offer to sell,
  64 +import and otherwise transfer the Contribution of such Contributor, if
  65 +any, in source code and object code form. This patent license shall
  66 +apply to the combination of the Contribution and the Program if, at
  67 +the time the Contribution is added by the Contributor, such addition
  68 +of the Contribution causes such combination to be covered by the
  69 +Licensed Patents. The patent license shall not apply to any other
  70 +combinations which include the Contribution. No hardware per se is
  71 +licensed hereunder.
  72 +
  73 +c) Recipient understands that although each Contributor grants the
  74 +licenses to its Contributions set forth herein, no assurances are
  75 +provided by any Contributor that the Program does not infringe the
  76 +patent or other intellectual property rights of any other entity. Each
  77 +Contributor disclaims any liability to Recipient for claims brought by
  78 +any other entity based on infringement of intellectual property rights
  79 +or otherwise. As a condition to exercising the rights and licenses
  80 +granted hereunder, each Recipient hereby assumes sole responsibility
  81 +to secure any other intellectual property rights needed, if any. For
  82 +example, if a third party patent license is required to allow
  83 +Recipient to distribute the Program, it is Recipient's responsibility
  84 +to acquire that license before distributing the Program.
  85 +
  86 +d) Each Contributor represents that to its knowledge it has sufficient
  87 +copyright rights in its Contribution, if any, to grant the copyright
  88 +license set forth in this Agreement.
  89 +
  90 +3. REQUIREMENTS
  91 +
  92 +A Contributor may choose to distribute the Program in object code form
  93 +under its own license agreement, provided that:
  94 +
  95 +a) it complies with the terms and conditions of this Agreement; and
  96 +
  97 +b) its license agreement:
  98 +
  99 +i) effectively disclaims on behalf of all Contributors all warranties
  100 +and conditions, express and implied, including warranties or
  101 +conditions of title and non-infringement, and implied warranties or
  102 +conditions of merchantability and fitness for a particular purpose;
  103 +
  104 +ii) effectively excludes on behalf of all Contributors all liability
  105 +for damages, including direct, indirect, special, incidental and
  106 +consequential damages, such as lost profits;
  107 +
  108 +iii) states that any provisions which differ from this Agreement are
  109 +offered by that Contributor alone and not by any other party; and
  110 +
  111 +iv) states that source code for the Program is available from such
  112 +Contributor, and informs licensees how to obtain it in a reasonable
  113 +manner on or through a medium customarily used for software exchange.
  114 +
  115 +When the Program is made available in source code form:
  116 +
  117 +a) it must be made available under this Agreement; and
  118 +
  119 +b) a copy of this Agreement must be included with each copy of the
  120 +Program.
  121 +
  122 +Contributors may not remove or alter any copyright notices contained
  123 +within the Program.
  124 +
  125 +Each Contributor must identify itself as the originator of its
  126 +Contribution, if any, in a manner that reasonably allows subsequent
  127 +Recipients to identify the originator of the Contribution.
  128 +
  129 +4. COMMERCIAL DISTRIBUTION
  130 +
  131 +Commercial distributors of software may accept certain
  132 +responsibilities with respect to end users, business partners and the
  133 +like. While this license is intended to facilitate the commercial use
  134 +of the Program, the Contributor who includes the Program in a
  135 +commercial product offering should do so in a manner which does not
  136 +create potential liability for other Contributors. Therefore, if a
  137 +Contributor includes the Program in a commercial product offering,
  138 +such Contributor ("Commercial Contributor") hereby agrees to defend
  139 +and indemnify every other Contributor ("Indemnified Contributor")
  140 +against any losses, damages and costs (collectively "Losses") arising
  141 +from claims, lawsuits and other legal actions brought by a third party
  142 +against the Indemnified Contributor to the extent caused by the acts
  143 +or omissions of such Commercial Contributor in connection with its
  144 +distribution of the Program in a commercial product offering. The
  145 +obligations in this section do not apply to any claims or Losses
  146 +relating to any actual or alleged intellectual property
  147 +infringement. In order to qualify, an Indemnified Contributor must: a)
  148 +promptly notify the Commercial Contributor in writing of such claim,
  149 +and b) allow the Commercial Contributor to control, and cooperate with
  150 +the Commercial Contributor in, the defense and any related settlement
  151 +negotiations. The Indemnified Contributor may participate in any such
  152 +claim at its own expense.
  153 +
  154 +For example, a Contributor might include the Program in a commercial
  155 +product offering, Product X. That Contributor is then a Commercial
  156 +Contributor. If that Commercial Contributor then makes performance
  157 +claims, or offers warranties related to Product X, those performance
  158 +claims and warranties are such Commercial Contributor's responsibility
  159 +alone. Under this section, the Commercial Contributor would have to
  160 +defend claims against the other Contributors related to those
  161 +performance claims and warranties, and if a court requires any other
  162 +Contributor to pay any damages as a result, the Commercial Contributor
  163 +must pay those damages.
  164 +
  165 +5. NO WARRANTY
  166 +
  167 +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS
  168 +PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  169 +KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY
  170 +WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY
  171 +OR FITNESS FOR A PARTICULAR PURPOSE. Each Recipient is solely
  172 +responsible for determining the appropriateness of using and
  173 +distributing the Program and assumes all risks associated with its
  174 +exercise of rights under this Agreement, including but not limited to
  175 +the risks and costs of program errors, compliance with applicable
  176 +laws, damage to or loss of data, programs or equipment, and
  177 +unavailability or interruption of operations.
  178 +
  179 +6. DISCLAIMER OF LIABILITY
  180 +
  181 +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR
  182 +ANY CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT,
  183 +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING
  184 +WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF
  185 +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  186 +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OR
  187 +DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS GRANTED
  188 +HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
  189 +
  190 +7. GENERAL
  191 +
  192 +If any provision of this Agreement is invalid or unenforceable under
  193 +applicable law, it shall not affect the validity or enforceability of
  194 +the remainder of the terms of this Agreement, and without further
  195 +action by the parties hereto, such provision shall be reformed to the
  196 +minimum extent necessary to make such provision valid and enforceable.
  197 +
  198 +If Recipient institutes patent litigation against a Contributor with
  199 +respect to a patent applicable to software (including a cross-claim or
  200 +counterclaim in a lawsuit), then any patent licenses granted by that
  201 +Contributor to such Recipient under this Agreement shall terminate as
  202 +of the date such litigation is filed. In addition, if Recipient
  203 +institutes patent litigation against any entity (including a
  204 +cross-claim or counterclaim in a lawsuit) alleging that the Program
  205 +itself (excluding combinations of the Program with other software or
  206 +hardware) infringes such Recipient's patent(s), then such Recipient's
  207 +rights granted under Section 2(b) shall terminate as of the date such
  208 +litigation is filed.
  209 +
  210 +All Recipient's rights under this Agreement shall terminate if it
  211 +fails to comply with any of the material terms or conditions of this
  212 +Agreement and does not cure such failure in a reasonable period of
  213 +time after becoming aware of such noncompliance. If all Recipient's
  214 +rights under this Agreement terminate, Recipient agrees to cease use
  215 +and distribution of the Program as soon as reasonably
  216 +practicable. However, Recipient's obligations under this Agreement and
  217 +any licenses granted by Recipient relating to the Program shall
  218 +continue and survive.
  219 +
  220 +Everyone is permitted to copy and distribute copies of this Agreement,
  221 +but in order to avoid inconsistency the Agreement is copyrighted and
  222 +may only be modified in the following manner. The Agreement Steward
  223 +reserves the right to publish new versions (including revisions) of
  224 +this Agreement from time to time. No one other than the Agreement
  225 +Steward has the right to modify this Agreement. IBM is the initial
  226 +Agreement Steward. IBM may assign the responsibility to serve as the
  227 +Agreement Steward to a suitable separate entity. Each new version of
  228 +the Agreement will be given a distinguishing version number. The
  229 +Program (including Contributions) may always be distributed subject to
  230 +the version of the Agreement under which it was received. In addition,
  231 +after a new version of the Agreement is published, Contributor may
  232 +elect to distribute the Program (including its Contributions) under
  233 +the new version. Except as expressly stated in Sections 2(a) and 2(b)
  234 +above, Recipient receives no rights or licenses to the intellectual
  235 +property of any Contributor under this Agreement, whether expressly,
  236 +by implication, estoppel or otherwise. All rights in the Program not
  237 +expressly granted under this Agreement are reserved.
  238 +
  239 +This Agreement is governed by the laws of the State of New York and
  240 +the intellectual property laws of the United States of America. No
  241 +party to this Agreement will bring a legal action under this Agreement
  242 +more than one year after the cause of action arose. Each party waives
  243 +its rights to a jury trial in any resulting litigation.
... ...
disambiguation/mstparser/README 0 → 100644
  1 +-------------------------
  2 +MSTParser version 0.5.0
  3 +-------------------------
  4 +
  5 +This is the main README. See ALT_README for some extra utilities and
  6 +an alternative build process to the one described in this README. The
  7 +package has been modified by Jason Baldridge -- this version should
  8 +produce the same results as Ryan McDonald's previous releases, but it
  9 +has been made more flexible and configurable in the input formats it
  10 +accepts (both MST and CoNLL) and in the way features are declared (see
  11 +the DependencyPipe class).
  12 +
  13 +-------------------------
  14 +
  15 +
  16 +The following package contains a java implementation of the dependency
  17 +parsers described in:
  18 +
  19 +Non-Projective Dependency Parsing using Spanning Tree Algorithms
  20 +R. McDonald, F. Pereira, K. Ribarov and J. Hajic
  21 +HLT-EMNLP, 2005
  22 +
  23 +Online Large-Margin Training of Dependency Parsers
  24 +R. McDonald, K. Crammer and F. Pereira
  25 +ACL, 2005
  26 +
  27 +Online Learning of Approximate Dependency Parsing Algorithms
  28 +R. McDonald and F. Pereira
  29 +EACL, 2006
  30 +
  31 +In addition, the parsers in this package can also learn and produce typed
  32 +dependency trees (i.e. trees with edge labels).
  33 +
  34 +The parser should work with Java 1.4 and 1.5
  35 +
  36 +If there are any problems running the parser then email: ryantm@cis.upenn.edu
  37 +I will only respond to questions not answered in this README.
  38 +
  39 +
  40 +----------------
  41 +Contents
  42 +----------------
  43 +
  44 +1. Compiling
  45 +
  46 +2. Example of usage
  47 +
  48 +3. Running the parser
  49 + a. Input data format
  50 + b. Training a parser
  51 + c. Running a trained model on new data
  52 + d. Evaluating output
  53 +
  54 +4. Memory/Disk space and performance issues
  55 +
  56 +5. Reproducing results in HLT-EMNLP and ACL papers
  57 +
  58 +
  59 +----------------
  60 +1. Compiling
  61 +----------------
  62 +
  63 +To compile the code, first unzip/tar the downloaded file:
  64 +
  65 +> gunzip mstparser.tar.gz
  66 +> tar -xvf mstparser.tar
  67 +> cd MSTParser
  68 +
  69 +Next, run the following command
  70 +
  71 +> javac -classpath ".:lib/trove.jar" mstparser/DependencyParser.java
  72 +
  73 +This will compile the package.
  74 +
  75 +
  76 +---------------------
  77 +2. Example Usage
  78 +---------------------
  79 +
  80 +In the directory data/ there are examples of training and testing data. Data
  81 +format is described in the next section.
  82 +
  83 +train.ulab/test.ulab
  84 +- training and testing data with unlabeled trees
  85 +
  86 +train.lab/test.lab
  87 +- training and testing data with labeled trees
  88 +
  89 +To run an unlabeled parser type:
  90 +
  91 +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \
  92 + train train-file:data/train.ulab model-name:dep.model \
  93 + test test-file:data/test.ulab output-file:out.txt \
  94 + eval gold-file:data/test.ulab format:MST
  95 +
  96 +This will train a parser on the training data, run it on the testing data and
  97 +evaluate the output against the gold standard. The results from running the
  98 +parser are in the file out.txt and the trained model in dep.model.
  99 +
  100 +To train a labeled parser, run the same command but use the labeled training
  101 +and testing files.
  102 +
  103 +
  104 +-------------------------
  105 +3. Running the Parser
  106 +-------------------------
  107 +
  108 +-------------------------
  109 +3a. Input data format
  110 +-------------------------
  111 +
  112 +**************************** NOTE **********************************
  113 +The parser now uses CONLL format as a default. Note the inclusion of
  114 +the format:MST option in the instructions below, which differ from the
  115 +instructions in previous versions (v0.2 and before). If you wish to
  116 +run the parser on CONLL formatted files, use format:CONLL or just
  117 +don't include the format option.
  118 +********************************************************************
  119 +
  120 +Example data sets are given in the data/ directory.
  121 +
  122 +Each sentence in the data is represented by 3 or 4 lines and sentences are
  123 +space separated. The general format is:
  124 +
  125 +w1 w2 ... wn
  126 +p1 p2 ... pn
  127 +l1 l2 ... ln
  128 +d1 d2 ... dn
  129 +
  130 +....
  131 +
  132 +
  133 +Where,
  134 +- w1 ... wn are the n words of the sentence (tab delimited)
  135 +- p1 ... pn are the POS tags for each word
  136 +- l1 ... ln are the labels of the incoming edge to each word
  137 +- d1 ... dn are integers representing the position of each word's parent
  138 +
  139 +For example, the sentence "John hit the ball" would be:
  140 +
  141 +John hit the ball
  142 +N V D N
  143 +SBJ ROOT MOD OBJ
  144 +2 0 4 2
  145 +
  146 +Note that hit's parent is indexed by 0 since it is the root.
  147 +
  148 +If you wish to only train or test an unlabeled parser, then simply leave out
  149 +the third line for each sentence, e.g.,
  150 +
  151 +John hit the ball
  152 +N V D N
  153 +2 0 4 2
  154 +
  155 +The parser will automatically detect that it should produce unlabeled trees.
  156 +
  157 +Note that this format is the same for training AND for running the parser on
  158 +new data. Of course, you may not always know the gold standard. In this case,
  159 +just substitute line 3 (the edge labels) and line 4 (the parent indexes) with
  160 +dummy values. The parser just ignores these values and produces its own.
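
For the ENIAM side of this commit, a small OCaml sketch (a hypothetical
helper, not part of MSTParser) that reads one unlabeled sentence block in
the tab-delimited format described above:

    (* Parse one 3-line (unlabeled) MST sentence block into
       (word, POS, head-index) triples. Hypothetical helper;
       needs OCaml >= 4.04 for String.split_on_char. *)
    let read_mst_sentence words_line pos_line heads_line =
      let split = String.split_on_char '\t' in
      let words = split words_line in
      let tags = split pos_line in
      let heads = List.map int_of_string (split heads_line) in
      List.map2 (fun w (t, h) -> (w, t, h)) words (List.combine tags heads)

    (* The "John hit the ball" example above:
       read_mst_sentence "John\thit\tthe\tball" "N\tV\tD\tN" "2\t0\t4\t2" *)
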
  161 +
  162 +
  163 +----------------------------
  164 +3b. Training the parser
  165 +----------------------------
  166 +
  167 +If you have a set of labeled data, first place it in the format described
  168 +above.
  169 +
  170 +If your training data is in a file train.txt, you can then run the command:
  171 +
  172 +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \
  173 + train train-file:train.txt format:MST
  174 +
  175 +This will train a parser with all the default properties. Additional
  176 +properties can be described with the following flags:
  177 +
  178 +train
  179 +- if present then parser will train a new model
  180 +
  181 +train-file:file.txt
  182 +- use data in file.txt to train the parser
  183 +
  184 +model-name:model.name
  185 +- store trained model in file called model.name
  186 +
  187 +iters:numIters
  188 +- Run training algorithm for numIters epochs, default is 10
  189 +
  190 +decode-type:type
  191 +- type is either "proj" or "non-proj", e.g. decode-type:proj
  192 +- Default is "proj"
  193 +- "proj" use the projective parsing algorithm during training
  194 + - i.e. The Eisner algorithm
  195 +- "non-proj" use the non-projective parsing algorithm during training
  196 + - i.e. The Chu-Liu-Edmonds algorithm
  197 +
  198 +training-k:K
  199 +- Specifies the k-best parse set size to create constraints during training
  200 +- Default is 1
  201 +- For non-projective parsing algorithm, k-best decoding is approximate
  202 +
  203 +loss-type:type
  204 +- type is either "punc" or "nopunc", e.g. loss-type:punc
  205 +- Default is "punc"
  206 +- "punc" include punctuation in hamming loss calculation
  207 +- "nopunc" do not include punctuation in hamming loss calculation
  208 +
  209 +create-forest:cf
  210 +- cf is either "true" or "false"
  211 +- Default is "true"
  212 +- If create-forest is false, it will not create the training parse forest (see
  213 + section 4). It assumes it has been created.
  214 +- This flag is useful if you are training many models on the same data and
  215 + features but using different parameters (e.g. training iters, decoding type).
  216 +
  217 +order:ord
  218 +- ord is either 1 or 2
  219 +- Default is 1
  220 +- Specifies the order/scope of features. 1 only has features over single edges
  221 + and 2 has features over pairs of adjacent edges in the tree.
  222 +
  223 +format:FORMAT
  224 +- FORMAT is either MST or CONLL
  225 +- Default is CONLL
  226 +- Specifies the input/output format. MST is the format used by
  227 + MSTParser until version 0.2.1. CONLL is the format used in the
  228 + CONLL-X shared task (see http://nextens.uvt.nl/~conll/).
  229 +
  230 +------------------------------------------------
  231 +3c. Running a trained model on new data
  232 +------------------------------------------------
  233 +
  234 +This section assumes you have trained a model and it is stored in dep.model.
  235 +
  236 +First, format your data properly (section 3a).
  237 +
  238 +It should be noted that the parser assumes both words and POS tags. To
  239 +generate POS tags for your data, I suggest using the Ratnaparkhi POS tagger
  240 +or another tagger of your choice.
  241 +
  242 +The parser also assumes that the edge label and parent index lines are
  243 +in the input. However, these can just be artificially inserted (e.g. with lines
  244 +of "LAB ... LAB" and "0 ... 0") since the parser will produce these lines
  245 +as output.
  246 +
  247 +If the data is in a file called test.txt, run the command:
  248 +
  249 +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \
  250 + test model-name:dep.model test-file:test.txt output-file:out.txt format:MST
  251 +
  252 +This will create an output file "out.txt" with the predictions of the parser.
  253 +Other properties can be defined with the following flags:
  254 +
  255 +test
  256 +- If included a trained parser will be run on the testing data
  257 +
  258 +test-file:file.txt
  259 +- The file containing the data to run the parser on
  260 +
  261 +model-name:model.name
  262 +- The name of the stored model to be used
  263 +
  264 +output-file:out.txt
  265 +- The result of running the parser on the new data
  266 +
  267 +decode-type:type
  268 +- See section 3b.
  269 +
  270 +order:ord
  271 +- See section 3b. THIS NEEDS TO HAVE THE SAME VALUE AS THE TRAINED MODEL!!
  272 +
  273 +format:FORMAT
  274 +- See section 3b.
  275 +
  276 +Note that if you train a labeled model, you should only run it expecting
  277 +labeled output (e.g. the test data should have 4 lines per sentence).
  278 +And if you train an unlabeled model, you should only run it expecting
  279 +unlabeled output (e.g. the test data should have 3 lines per sentence).
  280 +
  281 +
  282 +------------------------
  283 +3d. Evaluating Output
  284 +------------------------
  285 +
  286 +This section describes a simple class for evaluating the output of
  287 +the parser against a gold standard.
  288 +
  289 +Assume you have a gold standard, say test.txt, and the output of the parser,
  290 +say out.txt. Then run the following command:
  291 +
  292 +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \
  293 + eval gold-file:test.txt output-file:out.txt format:MST
  294 +
  295 +This will return both labeled and unlabeled accuracy (if the data sets contain
  296 +labeled trees) as well as complete sentence accuracy, again labeled and
  297 +unlabeled.
  298 +
  299 +If your data is in CONLL format instead of MST format (pre-v0.2.1),
  300 +then replace MST by CONLL in the above command, or just leave it off
  301 +-- it defaults to CONLL.
  302 +
  303 +We should note that currently this evaluation script includes all punctuation.
  304 +In future releases we will modify this class to allow the evaluation to
  305 +ignore punctuation, which is standard for English (Yamada and Matsumoto 03).
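
In the meantime, the check is easy to script yourself. Below is a minimal
sketch (not part of the distribution) that computes unlabeled accuracy over
two labeled MST-format files while skipping punctuation; it assumes five
lines per sentence (words, tags, labels, heads, blank line) and treats a
token as punctuation when its gold POS tag contains no alphanumeric
characters:

#!/usr/bin/python
# Hypothetical helper: unlabeled accuracy ignoring punctuation.
# Usage: score_nopunc.py gold.txt out.txt
import sys

def sentences(fname):
    lines = [l.rstrip("\n") for l in open(fname)]
    for i in xrange(0, len(lines), 5):
        # lines[i] = words, lines[i+1] = POS tags, lines[i+3] = heads
        yield lines[i+1].split(), lines[i+3].split()

def is_punc(tag):
    return not [c for c in tag if c.isalnum()]

correct = total = 0
for (gold_pos, gold_head), (_, pred_head) in \
        zip(sentences(sys.argv[1]), sentences(sys.argv[2])):
    for tag, g, p in zip(gold_pos, gold_head, pred_head):
        if is_punc(tag):
            continue
        total += 1
        if g == p:
            correct += 1

print "unlabeled accuracy (no punc): %.4f" % (correct / float(total))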
  306 +
  307 +
  308 +---------------------------------------------
  309 +4. Memory/Disk space and performance issues
  310 +---------------------------------------------
  311 +
  312 +This parser is memory and disk space intensive.
  313 +
  314 +MEMORY ISSUES
  315 +
  316 +Remember to always run java with the flag -Xmx1800m to use all available
  317 +memory for the heap. For 64-bit machines use an even larger value, say
  318 +-Xmx8000m.
  319 +
  320 +Training a model on the WSJ can be done easily on a 32-bit machine.
  321 +It should also be possible to train a model on the entire Prague Dependency
  322 +Treebank on a 32-bit machine (I have done it), but I make no guarantees.
  323 +
  324 +DISK ISSUES
  325 +
  326 +To make training quicker we store the entire parse forest on disk, a la
  327 +Clark and Curran 04. This can be very large, up to and over 20GB!! Be aware
  328 +of this fact.
  329 +
  330 +If you train using a file called train.txt, the forest will be stored in
  331 +a file called train.txt.forest. If disk space is an issue you can remove this
  332 +file immediately after training (it is not needed to run the parser on new data).
  333 +
  334 +However, sometimes it is good to keep this file around. Particularly, if you
  335 +are retraining a model on the same data and feature space but want to try
  336 +different training settings. By using the create-forest:false flag, you
  337 +can avoid having to recreate this file (which can take some time).
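
For example, once an initial run has produced train.txt.forest, a second
experiment with different settings (the names and values are illustrative)
might look like:

> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \
  train train-file:train.txt model-name:dep.model.k5 \
  training-k:5 create-forest:false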
  338 +
  339 +PERFORMANCE ISSUES
  340 +
  341 +Once a model has been trained, running the model on new data is pretty quick.
  342 +However, as with all discriminative trained parsers, it does take some time
  343 +to train a parser. On a two-year-old 32-bit machine it will take 10-15 hours
  344 +to train a model on the entire Penn Treebank and around 24-30 hours to train
  345 +a model on the Prague Dependency Treebank. Newer machines or 64-bit machines
  346 +are of course much quicker.
  347 +
  348 +
  349 +-------------------------------------------------------
  350 +5. Reproducing results in HLT-EMNLP and ACL papers
  351 +-------------------------------------------------------
  352 +
  353 +To reproduce the English results in McDonald et al. ACL 2005,
  354 +
  355 +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \
  356 + train train-file:train.wsj model-name:eng.wsj.model \
  357 + training-k:5 loss-type:nopunc decode-type:proj \
  358 + test test-file:test.wsj output-file:out.txt \
  359 + eval gold-file:test.wsj format:MST
  360 +
  361 +This assumes that train.wsj is sections 02-21 of the Penn Treebank formatted
  362 +as above, with dependencies extracted using the head rules of Yamada and
  363 +Matsumoto. See Joakim Nivre's Penn2Malt tool at:
  364 + http://w3.msi.vxu.se/~nivre/research/Penn2Malt.html
  365 +to convert the WSJ to dependencies using these head rules.
  367 +
  368 +test.wsj is section 23 of the WSJ converted as above. Furthermore, POS tags are
  369 +supplied using Adwait Ratnaparkhi's MXPOST toolkit trained on sections 02-21.
  370 +This can be found at:
  371 +http://www.cogsci.ed.ac.uk/~jamesc/taggers/MXPOST.html
  372 +
  373 +Note that the evaluation will be slightly off from the results reported. This
  374 +is because the evaluation scripts include punctuation. If you modify the
  375 +evaluation script to discount punctuation, results will align.
  376 +
  377 +
  378 +To reproduce the Czech results in McDonald et al. HLT-EMNLP 2005,
  379 +
  380 +> java -classpath ".:lib/trove.jar" -Xmx1800m mstparser.DependencyParser \
  381 + train train-file:train.pdt model-name:czech.pdt.model \
  382 + training-k:1 loss-type:punc decode-type:non-proj \
  383 + test test-file:test.pdt output-file:out.txt \
  384 + eval gold-file:test.pdt format:MST
  385 +
  386 +This assumes train.pdt and test.pdt are the training and testing sections
  387 +of the Prague Dependency Treebank v1.0 formatted above. We use the
  388 +automatically assigned POS tags that have been reduced (see paper).
... ...
disambiguation/mstparser/bin/conll2mst.py 0 → 100644
  1 +#! /usr/bin/python
  2 +
  3 +# Convert CONLL-X dependency data (one token per line) to MST format:
  4 +# four tab-separated lines per sentence holding the words, POS tags,
  5 +# dependency labels and head indices (CONLL columns FORM, POSTAG,
  6 +# DEPREL and HEAD).
  7 +
  8 +import sys
  9 +
  10 +# Open the CONLL input file
  11 +f = open(sys.argv[1], 'rt')
  12 +
  13 +wrds = ""; pos = ""; labs = ""; par = ""
  14 +
  15 +for line in f:
  16 +    sent = line.split()
  17 +    if len(sent) > 0:
  18 +        wrds += sent[1] + "\t"
  19 +        pos += sent[4] + "\t"
  20 +        labs += sent[7] + "\t"
  21 +        par += sent[6] + "\t"
  22 +    else:
  23 +        # A blank line ends the sentence: emit the four accumulated lines.
  24 +        print wrds; wrds = ""
  25 +        print pos; pos = ""
  26 +        print labs; labs = ""
  27 +        print par; par = ""
  28 +        print ""
  29 +
  30 +f.close()
  31 +
... ...
disambiguation/mstparser/bin/create_baseline.py 0 → 100644
  1 +#!/usr/bin/python
  2 +import re
  3 +import optparse
  4 +import fileinput
  5 +import sys
  6 +
  7 +###########################################################################
  8 +#
  9 +# Command-line options and usage
  10 +#
  11 +###########################################################################
  12 +
  13 +usage = """%prog [OPTIONS] FILE ...
  14 +
  15 +Create a left-linking (default) or right-linking dependency baseline
  16 +from data in MST format, assigning every word the default relation.
  17 +"""
  18 +
  19 +parser = optparse.OptionParser(usage=usage)
  20 +
  21 +parser.add_option("-r", "--rightward", action="store_true",
  22 +                  default=False,
  23 +                  help="""Create right-linking baseline.""")
  24 +
  25 +parser.add_option("-d", "--default-relation", action="store",
  26 +                  default="Elaboration",
  27 +                  help="Pick default relation.",
  28 +                  metavar="RELATION")
  29 +
  30 +def transform_meta_chars(string):
  31 +    return string.replace(",", "+comma+")
  32 +
  33 +def untransform_meta_chars(string):
  34 +    return string.replace("+comma+", ",")
  35 +
  36 +## Output dependencies for one sentence
  37 +def output_one_sentence(deps):
  38 +    # Transpose the per-token rows back into the four MST-format lines
  39 +    # (words, POS tags, labels, heads); the token ids in column 0 are
  40 +    # dropped.
  41 +    accum = [[], [], [], [], []]
  42 +    for dep in deps:
  43 +        for num in xrange(len(dep)):
  44 +            accum[num].append(dep[num])
  45 +    accum = ["\t".join([str(x) for x in y]) for y in accum]
  46 +    print "\n".join(accum[1:])
  47 +    print
  48 +
  49 +
  50 +## Get options
  51 +
  52 +(options, args) = parser.parse_args()
  53 +
  54 +## Process file(s)
  55 +
  56 +lines = fileinput.input(args)
  57 +
  58 +deps = []
  59 +
  60 +## Read input
  61 +
  62 +sentence_info = []
  63 +for line in lines:
  64 +    line = line.strip()
  65 +    if not line:
  66 +        num_words = len(sentence_info[0])
  67 +        # Leftward baseline: word i attaches to word i-1 (the first word
  68 +        # attaches to the root, 0).
  69 +        baseline_deps = range(num_words)
  70 +        if options.rightward:
  71 +            # Rightward baseline: word i attaches to word i+1, and the
  72 +            # last word attaches to the root (0).
  73 +            baseline_deps.pop(0)
  74 +            baseline_deps.pop(0)
  75 +            baseline_deps += [num_words, 0]
  76 +
  77 +        sentence_info[2] = [options.default_relation] * num_words
  78 +        sentence_info[3] = baseline_deps
  79 +
  80 +        try:
  81 +            for i in xrange(len(sentence_info[0])):
  82 +                deps.append([i+1] + [row[i] for row in sentence_info])
  83 +        except IndexError:
  84 +            # Ragged input: report the line lengths and bail out.
  85 +            print "\n".join([str(len(x)) for x in sentence_info])
  86 +            sys.exit(0)
  87 +
  88 +        output_one_sentence(deps)
  89 +        deps = []
  90 +        sentence_info = []
  91 +    else:
  92 +        sentence_info.append(line.split())
  93 +
... ...
disambiguation/mstparser/bin/mst-env 0 → 100644
  1 +#!/bin/sh
  2 +# sets MST environment variables
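# Source this from the other bin/ scripts (e.g. mst_parse.sh, mst_score.sh);
# it assumes MSTPARSER_DIR and JAVA_HOME are set in the environment.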
  3 +MST_LIB=$MSTPARSER_DIR/lib
  4 +DIRLIBS=$MST_LIB/trove.jar
  5 +CP=${MSTPARSER_DIR}/output/classes:${DIRLIBS}
  6 +JAVA=$JAVA_HOME/bin/java
  7 +JAVA_CMD="$JAVA -Xmx1800m -classpath $CP "
... ...
disambiguation/mstparser/bin/mst2conll.py 0 → 100644
  1 +#! /usr/bin/python
  2 +
  3 +# Convert MST format (four lines per sentence: words, POS tags, labels,
  4 +# head indices) back to CONLL-style rows. Assumes labeled input; the
  5 +# remaining CONLL columns are filled with "_".
  6 +
  7 +import sys
  8 +
  9 +# Open the MST input file
  10 +f = open(sys.argv[1], 'rt')
  11 +
  12 +wrds = ""
  13 +pos = ""
  14 +labs = ""
  15 +par = ""
  16 +
  17 +for line in f:
  18 +    if len(line.strip()) == 0:
  19 +        # A blank line ends the sentence: print one row per token.
  20 +        w = wrds.split(); p = pos.split(); l = labs.split(); pa = par.split()
  21 +        cnt = 1
  22 +        for t in w:
  23 +            print str(cnt) + "\t" + t + "\t" + t + "\t" + p[cnt-1] + "\t" \
  24 +                + p[cnt-1] + "\t_\t" + pa[cnt-1] + "\t" + l[cnt-1]
  25 +            cnt += 1
  26 +        print ""
  27 +        wrds = ""; pos = ""; labs = ""; par = ""
  28 +    elif len(wrds) == 0:
  29 +        wrds = line
  30 +    elif len(pos) == 0:
  31 +        pos = line
  32 +    elif len(labs) == 0:
  33 +        labs = line
  34 +    else:
  35 +        par = line
  36 +
  37 +f.close()
  38 +
... ...
disambiguation/mstparser/bin/mst_experiment.py 0 → 100644
  1 +#!/usr/bin/python
  2 +
  3 +import os
  4 +import sys
  5 +import optparse
  6 +
  7 +## Check that MSTPARSER_DIR environment variable is set and get it
  8 +mstparser_dir = ''
  10 +if os.environ.has_key('MSTPARSER_DIR'):
  11 + mstparser_dir = os.environ['MSTPARSER_DIR']
  12 +else:
  13 + print "Please set the MSTPARSER_DIR environment variable to where you have the MSTParser installed."
  14 + exit(1)
  15 +
  16 +
  17 +###########################################################################
  18 +#
  19 +# Run a single fold. This could actually be not a "fold" per se, but
  20 +# actually explicitly provided training and test files.
  21 +#
  22 +###########################################################################
  23 +
  24 +def create_tag_train_file (source_file, formatted_file):
  25 +
  26 + output = file(formatted_file, "w")
  27 +
  28 + input = file(source_file)
  29 + line = input.readline()
  30 + while not(line == ""):
  31 + words = line.strip().split("\t")
  32 + line = input.readline()
  33 + tags = line.strip().split("\t")
  34 +
  35 + # the splitting takes care of word+stem representations like biliyor+bil
  36 + merged = [words[i].split("+")[0]+"_"+tags[i].replace("_", "+us+") \
  37 + for i in range(len(words))]
  38 +
  39 + output.write(" ".join(merged)+"\n")
  40 +
  41 + input.readline() # eat up labels
  42 + input.readline() # eat up dependencies
  43 + input.readline() # eat blank line
  44 + line = input.readline() # read words of next sentence
  45 +
  46 + output.close()
  47 +
  48 +
  49 +def run_single_train_and_test(options, train_filename,
  50 + test_filename, output_filename, args):
  51 +
  52 +
  53 + realtest_filename = test_filename
  54 + # Tag the test sentences if requested
  55 + if options.tag_source == "OTK_Tagger":
  56 + print " Tagging test sentences..."
  57 +
  58 + tag_train_filename = train_filename+".tagged"
  59 +
  60 + create_tag_train_file(train_filename, tag_train_filename)
  61 +
  62 + tagged_filename = test_filename+".tagged.tmp"
  63 + tag_command = "python %s/bin/pos_tag.py -o %s %s %s %s" \
  64 + % (mstparser_dir,
  65 + options.output_dir,
  66 + tag_train_filename,
  67 + test_filename,
  68 + tagged_filename)
  69 +
  70 + #print >> argfile, tag_command
  71 + if options.verbose:
  72 + print tag_command
  73 + os.system(tag_command)
  74 + #os.system(tag_command+' |tee --append '+options.output_dir+'/tag.out 2>&1')
  75 + else:
  76 + os.system(tag_command+' &>/dev/null')
  77 + #os.system(tag_command+' >> '+options.output_dir+'/tag.out 2>&1')
  78 +
  79 +
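 # pos_tag.py writes alternating word/tag lines; collect the tag lines.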
  80 + tag_lines = []
  81 + counter = 0
  82 + for line in file(tagged_filename):
  83 + if counter % 2 == 1:
  84 + tag_lines.append(line)
  85 + counter += 1
  86 +
  87 + realtest_filename = test_filename+".tagged"
  88 + output = file(realtest_filename, "w")
  89 + counter = 0
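 # Each sentence spans five lines (words, tags, labels, heads, blank);
 # replace the tag line (second of each group) with the tagger's output.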
  90 + for line in file(test_filename):
  91 + if counter % 5 == 1:
  92 + output.write(tag_lines[(counter-1)/5])
  93 + else:
  94 + output.write(line)
  95 + counter += 1
  96 +
  97 + output.close()
  98 +
  99 +
  100 + # Train the parser
  101 + print " Training and evaluating..."
  102 +
  103 + train_command = 'mst_parse.sh train train-file:%s model-name:%s/dep.model decode-type:%s test test-file:%s output-file:%s %s' % (train_filename, options.output_dir, options.decoder_type, realtest_filename, output_filename, " ".join(args[1:]))
  104 +
  105 + if options.verbose:
  106 + print train_command
  107 + os.system(train_command)
  108 + else:
  109 + os.system(train_command+' &>/dev/null')
  110 +
  111 +
  112 +###################### END FUNCTION DEFINITIONS ########################
  113 +
  114 +
  115 +## Get options
  116 +
  117 +opt_parser = optparse.OptionParser()
  118 +opt_parser.add_option("-l", "--language", action="store", default='Unspecified',
  119 + help="use configurations specific to LANGUAGE",
  120 + metavar="LANGUAGE")
  121 +opt_parser.add_option("-e", "--eval_file", action="store", default='Generated',
  122 + help="Read evaluation sentences from FILE. Using this option means that cross-validation will not be used.",
  123 + metavar="FILE")
  124 +opt_parser.add_option("-d", "--decoder_type", action="store",
  125 + choices=['proj', 'non-proj'],
  126 + default="proj",
  127 + help="Use a projective or non-projective algorithm.E",
  128 + metavar="FILE")
  129 +opt_parser.add_option("-o", "--output_dir", action="store", default='output',
  130 + help="save parser output to DIR",
  131 + metavar="DIR")
  132 +opt_parser.add_option("-f", "--num_folds", action="store", default=10,
  133 + help="The number of folds to use in cross-validation (Default=10).",
  134 + metavar="NUM")
  135 +opt_parser.add_option("-v", "--verbose", action="store_true", default=False,
  136 + help="be verbose")
  137 +
  138 +opt_parser.add_option("-t", "--tag_source", choices=['Gold','OTK_Tagger'],
  139 + default='Gold',
  140 + help="use tags from Gold standard or from a tagger (Gold (default), OTK_Tagger)",
  141 + metavar="SOURCE")
  142 +
  143 +(options, args) = opt_parser.parse_args()
  144 +
  145 +# Convert num_folds from its command-line string form to an int
  146 +options.num_folds = int(options.num_folds)
  147 +
  148 +# Remove the requested output directory if it already exists; refuse to
  149 +# continue if a file stands in its way. Then create the directory.
  150 +output_dir = options.output_dir
  151 +if os.path.isdir(output_dir):
  152 + os.system("rm -rf %s" % output_dir)
  153 +elif os.path.isfile(output_dir):
  154 + raise OSError("A file with the same name as the desired dir, " \
  155 + "'%s', already exists." % output_dir)
  156 +os.makedirs(output_dir)
  157 +
  158 +
  159 +# This file accumulates the results across all folds.
  160 +model_output_filename = output_dir+"/model_out"
  161 +os.system('touch %s' % model_output_filename)
  162 +
  163 +## Process files
  164 +
  165 +train_filename = args[0]
  166 +
  167 +# This file accumulates the gold dependencies across all folds.
  168 +gold_deps_filename = output_dir+"/gold.deps"
  169 +
  170 +if options.eval_file == "Generated":
  171 +
  172 + num_folds = int(options.num_folds)
  173 +
  174 + print "Running a %d-fold evaluation on file %s" \
  175 + % (num_folds, train_filename)
  176 + print
  177 +
  178 + # Align parses with their corresponding sentences and assign a
  179 + # partition id to them.
  180 +
  181 + train_file = file(train_filename)
  182 +
  183 + examples = []
  184 +
  185 + next_example = train_file.readline()
  186 +
  187 + counter = 0
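 # Deal sentences out round-robin: sentence i goes to partition i mod num_folds.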
  188 + while next_example:
  189 + partition = counter % num_folds
  190 +
  191 + elements = []
  192 + while next_example and next_example != "\n":
  193 + elements.append(next_example)
  194 + next_example = train_file.readline()
  195 +
  196 + examples.append((partition, elements))
  197 +
  198 + next_example = train_file.readline()
  199 +
  200 + counter += 1
  201 +
  202 +
  203 + # Done reading the training sentences.
  205 + train_file.close()
  206 +
  207 + # Train/test on each partition
  208 +
  209 + gold_deps = open(gold_deps_filename,"w")
  210 +
  211 + # Run each fold. The output from each fold is appended to gold.deps
  212 + # and model.deps
  213 + #for test_partition in range(1):
  214 + for test_partition in range(num_folds):
  215 +
  216 + print "Fold",test_partition
  217 +
  218 + train_filename = output_dir+"/train"
  219 + train_set = open(train_filename, "w")
  220 +
  221 + test_filename = output_dir+"/test"
  222 + test_set = open(test_filename, "w")
  223 +
  224 + counter = 0
  225 + for ex in examples:
  226 + if ex[0] == test_partition:
  227 + test_set.write("".join(ex[1])+"\n")
  228 + gold_deps.write("".join(ex[1])+"\n")
  229 + else:
  230 + train_set.write("".join(ex[1])+"\n")
  231 +
  232 + counter += 1
  233 +
  234 + train_set.close()
  235 + test_set.close()
  236 +
  237 + # Run the fold.
  238 + output_filename = output_dir+"/output"
  239 + run_single_train_and_test(options, train_filename, test_filename, output_filename, args)
  240 +
  241 + # Pile this fold's output onto the accumulating result file.
  242 + os.system('cat %s >> %s' % (output_filename, model_output_filename))
  243 +
  244 + gold_deps.flush()
  245 +
  246 + gold_deps.close()
  247 +
  248 +else:
  249 + os.system('cp %s %s' %(options.eval_file, gold_deps_filename))
  250 +
  251 + run_single_train_and_test(options, train_filename, gold_deps_filename, model_output_filename, args)
  252 +
  253 +
  254 +################## EVALUATION ###################
  255 +
  256 +print "Evaluating. If anything here dies, you can still look at the output files in the directory '%s'." % (output_dir)
  257 +
  258 +# Get dependency results.
  259 +
  260 +os.system("mst_score.sh %s %s" % (gold_deps_filename, model_output_filename))
  261 +
... ...
disambiguation/mstparser/bin/mst_parse.sh 0 → 100644
  1 +#!/bin/sh
  2 +. mst-env
  3 +$JAVA_CMD mstparser.DependencyParser $@
... ...
disambiguation/mstparser/bin/mst_score.sh 0 → 100644
  1 +#!/bin/sh
  2 +. mst-env
  3 +$JAVA_CMD mstparser.DependencyEvaluator $@
... ...
disambiguation/mstparser/bin/pos_tag.py 0 → 100644
  1 +#!/usr/bin/python
  2 +
  3 +import os
  4 +import sys
  5 +import optparse
  6 +
  7 +import tagging_util
  8 +
  9 +## Check that DBPARSER_DIR environment variable is set and get it
  10 +dbparser_dir = ''
  11 +if os.environ.has_key('DBPARSER_DIR'):
  12 + dbparser_dir = os.environ['DBPARSER_DIR']
  13 +else:
  14 + print "Please set the DBPARSER_DIR environment variable to where you have Dan Bikel's parser installed."
  15 + exit(1)
  16 +
  17 +## Check that OPENNLP_DIR environment variable is set and get it
  18 +otk_dir = ''
  19 +if os.environ.has_key('OPENNLP_DIR'):
  20 + otk_dir = os.environ['OPENNLP_DIR']
  21 +else:
  22 + print "Please set the OPENNLP_DIR environment variable to where you have the OpenNLP Toolkit installed."
  23 + exit(1)
  24 +
  25 +
  26 +## Get options
  27 +
  28 +opt_parser = optparse.OptionParser()
  29 +opt_parser.add_option("-o", "--output-dir", action="store", default='output',
  30 + help="save tagger output to DIR",
  31 + metavar="DIR")
  32 +opt_parser.add_option("-v", "--verbose", action="store_true", default=False,
  33 + help="be verbose")
  34 +
  35 +(options, args) = opt_parser.parse_args()
  36 +
  37 +verbose = options.verbose
  38 +
  39 +output_dir = options.output_dir
  40 +if os.path.isfile(output_dir):
  41 + raise OSError("A file with the same name as the desired dir, " \
  42 + "'%s', already exists." % output_dir)
  43 +elif not(os.path.isdir(output_dir)):
  44 + os.makedirs(output_dir)
  45 +
  46 +
  47 +## Process files
  48 +
  49 +adwait_tagged_filename = args[0]
  50 +test_sentences = file(args[1])
  51 +output_file = open(args[2], "w")
  52 +
  53 +# Use the gold trees to produce tagged sentences in Adwait's format
  54 +# with underscore separator.
  55 +#
  56 +# Note: any underscores in the tags themselves will be converted to
  57 +# +us+ metacharacters. These get unconverted at the end.
  58 +#os.system("python %s/python/parse_to_sentence.py -t -f Adwait -s -d %s > %s"
  59 +# % (dbparser_dir, tree_filename, adwait_tagged_filename))
  60 +
  61 +model_filename = output_dir+"/model.bin.gz"
  62 +
  63 +
  64 +# Make a tag dictionary
  65 +tag_dictionary_filename = output_dir+"/tag_dict"
  66 +os.system("python %s/python/create_tag_dictionary.py -s _ %s > %s"
  67 + % (dbparser_dir, adwait_tagged_filename, tag_dictionary_filename))
  68 +
  69 +# Train the tagger
  70 +os.system("%s/bin/otk_train_tagger.sh -dict %s %s %s &> /dev/null"
  71 + % (otk_dir, tag_dictionary_filename, adwait_tagged_filename, model_filename))
  72 +
  73 +sentences_to_tag_filename = output_dir+"/to_tag.txt"
  74 +
  75 +# Keep only the surface form on each word line (drop any +stem suffix)
  76 +to_tag_file = open(sentences_to_tag_filename, "w")
  77 +counter = 0
  78 +for sentence in test_sentences:
  79 + if counter % 5 == 0:
  80 + clean = "\t".join([x.split("+")[0] for x in sentence.strip().split("\t")])
  81 + to_tag_file.write(clean+"\n")
  82 + counter += 1
  83 +to_tag_file.close()
  84 +
  85 +tagged_filename = output_dir+"/tagged.txt"
  86 +
  87 +# Run the tagger
  88 +os.system("%s/bin/otk_run_tagger.sh -dict %s -tag_dict %s %s %s > %s"
  89 + % (otk_dir, tag_dictionary_filename, tag_dictionary_filename,
  90 + sentences_to_tag_filename, model_filename, tagged_filename))
  91 +
  92 +
  93 +# Convert tagger output to MST format. Unconvert the +us+
  94 +# metachars back to underscores too (using tagging_util.de_metatize()).
  95 +for tagged_sent in file(tagged_filename):
  96 + words = []
  97 + tags = []
  98 + for word_tag in tagged_sent.split():
  99 + (word,tag) = tagging_util.split_item(word_tag, "_")
  100 + words.append(word)
  101 + tags.append(tagging_util.de_metatize(tag,"_","+us+"))
  102 + output_file.write("\t".join(words)+"\n")
  103 + output_file.write("\t".join(tags)+"\n")
  104 +
  105 +output_file.close()
... ...
disambiguation/mstparser/build.sh 0 → 100644
  1 +#!/bin/sh
  2 +
  3 +echo
  4 +echo "MST Parser Build System"
  5 +echo "-------------------"
  6 +echo
  7 +
  8 +if [ "$JAVA_HOME" = "" ] ; then
  9 + echo "ERROR: JAVA_HOME not found in your environment."
  10 + echo
  11 + echo "Please, set the JAVA_HOME variable in your environment to match the"
  12 + echo "location of the Java Virtual Machine you want to use."
  13 + exit 1
  14 +fi
  15 +
  16 +if [ `echo $OSTYPE | grep -n cygwin` ]; then
  17 + PS=";"
  18 +else
  19 + PS=":"
  20 +fi
  21 +
  22 +LOCALCLASSPATH=$JAVA_HOME/lib/tools.jar
  23 +# add in the dependency .jar files
  24 +DIRLIBS=lib/*.jar
  25 +for i in ${DIRLIBS}
  26 +do
  27 + if [ "$i" != "${DIRLIBS}" ] ; then
  28 + LOCALCLASSPATH=$LOCALCLASSPATH${PS}"$i"
  29 + fi
  30 +done
  31 +ANT_HOME=./lib
  32 +
  33 +echo Building with classpath $LOCALCLASSPATH
  34 +echo
  35 +
  36 +echo Starting Ant...
  37 +echo
  38 +
  39 +$JAVA_HOME/bin/java -Dant.home=$ANT_HOME -classpath $LOCALCLASSPATH org.apache.tools.ant.Main $*
... ...
disambiguation/mstparser/build.xml 0 → 100644
  1 +<!-- $Id: build.xml 138 2013-09-10 10:02:43Z wyldfire $ -->
  2 +<!-- Copyright (C) 2007 Ryan McDonald -->
  3 +<project default="compile" basedir=".">
  4 +
  5 + <!-- =================================================================== -->
  6 + <!-- Initialization target -->
  7 + <!-- =================================================================== -->
  8 + <target name="init">
  9 + <tstamp/>
  10 + <property name="Name" value="MSTParser"/>
  11 + <property name="name" value="mstparser"/>
  12 + <property name="year" value="2013"/>
  13 + <property name="version" value="0.5.1"/>
  14 +
  15 + <echo message="----------- ${Name} ${version} [${year}] ------------"/>
  16 +
  17 + <property name="debug" value="on"/>
  18 + <property name="optimize" value="off"/>
  19 + <property name="deprecation" value="on"/>
  20 +
  21 + <property name="src.dir" value="./src/main/java"/>
  22 + <property name="lib.dir" value="./lib"/>
  23 + <property name="packages" value="mstparser.*"/>
  24 +
  25 + <property name="build.dir" value="./output"/>
  26 + <property name="build.dest" value="./output/classes"/>
  27 + <property name="build.javadocs" value="./docs/api"/>
  28 +
  29 + <filter token="year" value="${year}"/>
  30 + <filter token="version" value="${version}"/>
  31 + <filter token="date" value="${TODAY}"/>
  32 + <filter token="log" value="true"/>
  33 + <filter token="verbose" value="true"/>
  34 +
  35 + <path id="build.classpath">
  36 + <fileset dir="${lib.dir}/">
  37 + <include name="*.jar"/>
  38 + </fileset>
  39 + </path>
  40 + </target>
  41 +
  42 +
  43 + <!-- =================================================================== -->
  44 + <!-- Help on usage -->
  45 + <!-- =================================================================== -->
  46 + <target name="usage">
  47 + <echo message=""/>
  48 + <echo message=""/>
  49 + <echo message="MST Parser build file"/>
  50 + <echo message="-------------------------------------------------------------"/>
  51 + <echo message=""/>
  52 + <echo message=" Available targets are:"/>
  53 + <echo message=""/>
  54 + <echo message=" package --> generates the mstparser.jar file"/>
  55 + <echo message=" compile --> compiles the source code (default)"/>
  56 + <echo message=" javadoc --> generates the API documentation"/>
  57 + <echo message=" clean --> cleans up the compilation directory"/>
  58 + <echo message=""/>
  59 + <echo message=" See the comments inside the build.xml file for more details."/>
  60 + <echo message="-------------------------------------------------------------"/>
  61 + <echo message=""/>
  62 + <echo message=""/>
  63 + </target>
  64 +
  65 +
  66 + <!-- =================================================================== -->
  67 + <!-- Prepares the build directories -->
  68 + <!-- =================================================================== -->
  69 + <target name="prepare" depends="init">
  70 + <!-- create directories -->
  71 + <mkdir dir="${build.dir}"/>
  72 + <mkdir dir="${build.dest}"/>
  73 + </target>
  74 +
  75 +
  76 + <!-- =================================================================== -->
  77 + <!-- Compiles the source directory -->
  78 + <!-- =================================================================== -->
  79 + <target name="compile"
  80 + depends="prepare"
  81 + description="compiles the source code (default)">
  82 + <javac srcdir="${src.dir}"
  83 + destdir="${build.dest}"
  84 + debug="${debug}"
  85 + deprecation="${deprecation}"
  86 + classpathref="build.classpath"
  87 + optimize="${optimize}">
  88 + <!-- <compilerarg line="-Xlint:unchecked"/> -->
  89 + </javac>
  90 + </target>
  91 +
  92 +
  93 + <!-- =================================================================== -->
  94 + <!-- Creates the class package -->
  95 + <!-- =================================================================== -->
  96 + <target name="package"
  97 + depends="compile"
  98 + description="generates the mstparser.jar file">
  99 + <jar jarfile="${build.dir}/${name}.jar">
  100 + <fileset dir="${build.dest}" includes="**"/>
  101 + </jar>
  102 + </target>
  103 +
  104 +
  105 + <!-- =================================================================== -->
  106 + <!-- Creates the release file -->
  107 + <!-- =================================================================== -->
  108 + <target name="release" depends="clean,cleandocs">
  109 + <tar tarfile="${name}-${version}-src.tar"
  110 + basedir="../"
  111 + includes="${name}/**"
  112 + excludes="**/CVS **/*forest testbed/my*" />
  113 + <gzip src="${name}-${version}-src.tar"
  114 + zipfile="../${name}-${version}-src.tgz" />
  115 + <delete file="${name}-${version}-src.tar" />
  116 + </target>
  117 +
  118 + <!-- =================================================================== -->
  119 + <!-- Creates the homepage -->
  120 + <!-- =================================================================== -->
  121 + <target name="homepage"
  122 + depends="init,javadoc"
  123 + description="generates the API documentation">
  124 + <tar tarfile="${name}-homepage.tar"
  125 + basedir="./docs/"
  126 + includes="**"
  127 + excludes="**/CVS" />
  128 + <gzip src="${name}-homepage.tar"
  129 + zipfile="${build.dir}/${name}-homepage.tgz" />
  130 + <delete file="${name}-homepage.tar" />
  131 + </target>
  132 +
  133 +
  134 + <!-- =================================================================== -->
  135 + <!-- Creates the API documentation -->
  136 + <!-- =================================================================== -->
  137 + <target name="javadoc" depends="prepare">
  138 + <mkdir dir="${build.javadocs}"/>
  139 + <javadoc packagenames="${packages}"
  140 + sourcepath="${src.dir}"
  141 + destdir="${build.javadocs}"
  142 + author="true"
  143 + version="true"
  144 + use="true"
  145 + splitindex="true"
  146 + noindex="false"
  147 + windowtitle="${name}"
  148 + doctitle="The ${Name} API v${version}"
  149 + bottom="Copyright &#169; ${year} Ryan McDonald and Jason Baldridge. All Rights Reserved."
  150 + />
  151 + </target>
  152 +
  153 +
  154 + <!-- =================================================================== -->
  155 + <!-- Cleans targets -->
  156 + <!-- =================================================================== -->
  157 + <target name="clean"
  158 + depends="init"
  159 + description="cleans up the directory">
  160 + <delete dir="${build.dir}"/>
  161 + <delete file="${lib.dir}/${name}.jar" />
  162 + </target>
  163 +
  164 + <target name="cleandocs" depends="init" description="cleans up the API docs directory">
  165 + <delete dir="${build.javadocs}"/>
  166 + </target>
  167 +
  168 +</project>
  169 +
  170 +<!-- End of file -->
... ...