diff --git a/dependencyParser/basic/mate-tools/.classpath b/dependencyParser/basic/mate-tools/.classpath
new file mode 100644
index 0000000..8092159
--- /dev/null
+++ b/dependencyParser/basic/mate-tools/.classpath
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" path="src"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
+	<classpathentry kind="lib" path="lib/trove-2.0.4.jar"/>
+	<classpathentry kind="lib" path="lib/commons-math-2.2.jar"/>
+	<classpathentry kind="output" path="classes"/>
+</classpath>
diff --git a/dependencyParser/mate-tools/.externalToolBuilders/New_Builder.launch b/dependencyParser/basic/mate-tools/.externalToolBuilders/New_Builder.launch
index eca73f7..eca73f7 100644
--- a/dependencyParser/mate-tools/.externalToolBuilders/New_Builder.launch
+++ b/dependencyParser/basic/mate-tools/.externalToolBuilders/New_Builder.launch
diff --git a/dependencyParser/mate-tools/.externalToolBuilders/ana.launch b/dependencyParser/basic/mate-tools/.externalToolBuilders/ana.launch
index 09df90d..09df90d 100644
--- a/dependencyParser/mate-tools/.externalToolBuilders/ana.launch
+++ b/dependencyParser/basic/mate-tools/.externalToolBuilders/ana.launch
diff --git a/dependencyParser/mate-tools/.project b/dependencyParser/basic/mate-tools/.project
index f813b9e..f813b9e 100644
--- a/dependencyParser/mate-tools/.project
+++ b/dependencyParser/basic/mate-tools/.project
diff --git a/dependencyParser/mate-tools/build.xml b/dependencyParser/basic/mate-tools/build.xml
index c558279..c558279 100644
--- a/dependencyParser/mate-tools/build.xml
+++ b/dependencyParser/basic/mate-tools/build.xml
diff --git a/dependencyParser/mate-tools/lib/commons-math-2.2.jar b/dependencyParser/basic/mate-tools/lib/commons-math-2.2.jar
index b29a39c..b29a39c 100644
--- a/dependencyParser/mate-tools/lib/commons-math-2.2.jar
+++ b/dependencyParser/basic/mate-tools/lib/commons-math-2.2.jar
diff --git 
a/dependencyParser/mate-tools/lib/trove-2.0.4.jar b/dependencyParser/basic/mate-tools/lib/trove-2.0.4.jar index cb1c8f1..cb1c8f1 100644 --- a/dependencyParser/mate-tools/lib/trove-2.0.4.jar +++ b/dependencyParser/basic/mate-tools/lib/trove-2.0.4.jar diff --git a/dependencyParser/basic/mate-tools/src/decoder/ParallelDecoder.java b/dependencyParser/basic/mate-tools/src/decoder/ParallelDecoder.java new file mode 100755 index 0000000..0dd1c18 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/decoder/ParallelDecoder.java @@ -0,0 +1,155 @@ +package decoder; + +import is2.data.Closed; +import is2.data.DataF; +import is2.data.Edges; +import is2.data.Open; + +import java.util.ArrayList; +import java.util.concurrent.Callable; + +/** + * @author Bernd Bohnet, 30.08.2009 + * + * This class implements a parallel feature extractor. + */ +final public class ParallelDecoder implements Callable<Object> +{ + // some constants + private static final float INIT_BEST = (-1.0F / 0.0F); + private static final boolean[] DIR ={false,true}; + + // the data space of the weights for a dependency tree + final private DataF x; + + private short[] pos; + + private Open O[][][][]; + private Closed C[][][][] ; + + private int n; + + boolean done=false; + public boolean waiting =false; + + /** + * Initialize the parallel decoder. + * + * @param pos part-of-speech + * @param d data + * @param edges part-of-speech edge mapping + * @param o open spans + * @param c closed spans + * @param n number of words + */ + public ParallelDecoder(short[] pos, DataF d, Open o[][][][], Closed c[][][][], int n) { + + this.pos =pos; + this.x =d; + + this.O=o; + this.C=c; + this.n=n; + } + + + private static class DSet { short w1,w2;} + + @Override + public Object call() { + + while (true){ + + DSet set = get(); + if (done && set==null) break; + + if (set ==null) return null; + + short s=set.w1, t=set.w2; + + for(short dir =1;dir>=0;dir--) { + + short[] labs = (dir==1) ? 
Edges.get(pos[s],pos[t], false):Edges.get(pos[t],pos[s], true); + + O[s][t][dir] = new Open[labs.length]; + for (int l = O[s][t][dir].length - 1; l >= 0; l--) { + + double tRP = INIT_BEST; + + Closed tL = null, tR = null; + + for (int r = s; r < t; r++) { + + if (s == 0 && r != 0) continue; + + double tLPr = INIT_BEST,tRPr = INIT_BEST; + Closed tLCld = null, tRCld = null; + + if (r == s) tLPr = dir==1 ? x.sib[s][t][s][0][l] : x.gra[t][s][s][1 ][l]; + else + for (int i = s + 1; i <= r; i++) + if (((dir==1 ? x.sib[s][t][i][0][l] : x.gra[t][s][i][1][l]) + C[s][r][1][i].p) > tLPr) { + tLPr = ((dir==1 ? x.sib[s][t][i][0][l] : x.gra[t][s][i][1][l]) + C[s][r][1][i].p);tLCld = C[s][r][1][i];} + + if (r == t-1) tRPr = dir==1 ? x.gra[s][t][s][0][l] : x.sib[t][s][s][1][l]; + else + for (int i = r + 1; i < t; i++) + if (((dir == 1 ? x.gra[s][t][i][0][l] : x.sib[t][s][i][1][l]) + C[r+1][t][0][i].p) > tRPr) { + tRPr = ((dir==1?x.gra[s][t][i][0][l]:x.sib[t][s][i][1][l]) + C[r+1][t][0][i].p); tRCld=C[r + 1][t][0][i];} + + if (tLPr + tRPr > tRP) {tRP = tLPr + tRPr; tL = tLCld;tR = tRCld;} + } + O[s][t][dir][l] = new Open(s, t, dir, labs[l],tL, tR, + (float) ( tRP+((dir==1)?x.pl[s][t]: x.pl[t][s]) + ((dir==1)? x.lab[s][t][labs[l]][0]:x.lab[t][s][labs[l]][1]))); + } + } + C[s][t][1] = new Closed[n]; C[s][t][0] = new Closed[n]; + + for (int m = s ; m <= t; m++) { + for(boolean d : DIR) { + if ((d && m!=s)||!d && (m!=t && s!=0)) { + + // create closed structure + + double top = INIT_BEST; + + Open tU = null; Closed tL = null; + int numLabels =O[(d ? s : m)][(d ? m : t)][d?1:0].length; + + //for (int l = numLabels-1; l >=0; l--) { + for (int l = 0; l < numLabels; l++) { + + Open hi = O[(d ? s : m)][(d ? 
m : t)][d?1:0][l]; + for (int amb = m + (d?1:-1); amb != (d?t:s) + (d?1:-1); amb += (d?1:-1)) { + + if ((hi.p + C[d?m:s][d?t:m][d?1:0][amb].p +x.gra[d?s:t][m][amb][d?0:1][l]) > top) { + top = (hi.p + C[d?m:s][d?t:m][d?1:0][amb].p +x.gra[d?s:t][m][amb][(d?0:1)][l]); tU = hi; tL=C[d?m:s][d?t:m][d?1:0][amb];} + } + + if ((m == (d ? t : s)) && (hi.p + x.gra[d?s:t][m][d?s:t][(d ? 0 :1)][l]) > top) { + top = (hi.p + x.gra[(d ? s : t)][m][d?s:t][d?0:1][l]); tU = hi; tL = null;} + } + C[s][t][d?1:0][m] = new Closed(s, t, m, d?1:0,tU,tL,(float) top); + } + } + } + } + return null; + } + + public static ArrayList<DSet> sets = new ArrayList<DSet>(); + + static synchronized private DSet get() { + synchronized (sets) { + if (sets.size()==0) return null; + return sets.remove(sets.size()-1); + } + } + + public static void add(short w1, short w2){ + DSet ds =new DSet(); + ds.w1=w1; + ds.w2=w2; + sets.add(ds); + } +} diff --git a/dependencyParser/basic/mate-tools/src/decoder/ParallelRearrangeNBest.java b/dependencyParser/basic/mate-tools/src/decoder/ParallelRearrangeNBest.java new file mode 100755 index 0000000..493917b --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/decoder/ParallelRearrangeNBest.java @@ -0,0 +1,136 @@ +package decoder; + +import is2.data.DataF; +import is2.data.Edges; +import is2.data.Parse; +import is2.data.ParseNBest; + +import java.util.ArrayList; +import java.util.concurrent.Callable; + +import extractors.Extractor; + +/** + * @author Dr. Bernd Bohnet, 30.08.2009 + * + * This class implements a parallel edge rearrangement for non-projective parsing; + * The linear method was first suggest by Rayn McDonald et. al. 2005. 
+ */ +final public class ParallelRearrangeNBest implements Callable<Object> { + + // new parent child combination to explore + final static class PA { + final float p; + final short ch, pa; + + float best; + + + + public PA(float p2, short ch2, short pa2) { p=p2; ch=ch2;pa=pa2;} + } + + // list of parent child combinations + private static ArrayList<PA> parents = new ArrayList<PA>(); + + // some data from the dependency tree + private short[] pos; + private DataF x; + private boolean[][] isChild ; + public short[] heads,types; + private float lastNBest; + private float best; // best so far + private float threshold; + private Extractor extractor; + + + /** + * Initialize the parallel rearrange thread + * + * @param isChild2 is a child + * @param edgesC the part-of-speech edge mapping + * @param pos the part-of-speech + * @param x the data + * @param lastNBest + * @param s the heads + * @param ts the types + */ + public ParallelRearrangeNBest(short[] pos , DataF x, Parse p, float lastNBest, Extractor extractor, float best, float threshold) { + + + heads=p.heads; + + types= p.labels; + + isChild = new boolean[heads.length][heads.length]; + + for(int i = 1, l1=1; i < heads.length; i++,l1=i) + while((l1= heads[l1]) != -1) isChild[l1][i] = true; + + + this.lastNBest =lastNBest; + this.pos =pos; + this.x=x; + + this.extractor = extractor; + this.best=best; + this.threshold = threshold; + } + + public ArrayList<ParseNBest> parses = new ArrayList<ParseNBest>(); + + @Override + public Object call() { + + // check the list of new possible parents and children for a better combination + for(int ch = 1; ch < heads.length; ch++) { + for(short pa = 0; pa < heads.length; pa++) { + if(ch == pa || pa == heads[ch] || isChild[ch][pa]) continue; + + short oldP = heads[ch], oldT = types[ch]; + heads[ch]=pa; + + short[] labels = Edges.get(pos[pa], pos[ch],ch<pa); + + for(int l=0;l<labels.length;l++) { + + types[ch]=labels[l]; + float p_new = extractor.encode3(pos, heads, types, x); + + 
if (p_new<lastNBest || ((best+this.threshold)>p_new)) continue; + + ParseNBest p = new ParseNBest(); + p.signature(heads, types); + p.f1=p_new; + parses.add(p); + } + + // change back + heads[ch]= oldP; types[ch]=oldT; + + // consider changes to labels only + labels = Edges.get(pos[oldP], pos[ch],ch<oldP); + + for(int l=0;l<labels.length;l++) { + + types[ch]=labels[l]; + float p_new = (float) extractor.encode3(pos, heads, types, x); + + // optimization: add only if larger than smallest of n-best + if (p_new<lastNBest || ((best+this.threshold)>p_new)) continue; + + ParseNBest p = new ParseNBest(); + p.signature(heads, types); + p.f1=p_new; + parses.add(p); + } + + heads[ch]= oldP; types[ch]=oldT; + } + } + return parses; + } + + + +} diff --git a/dependencyParser/basic/mate-tools/src/decoder/ParallelRearrangeNBest2.java b/dependencyParser/basic/mate-tools/src/decoder/ParallelRearrangeNBest2.java new file mode 100644 index 0000000..a25b392 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/decoder/ParallelRearrangeNBest2.java @@ -0,0 +1,151 @@ +package decoder; + +import is2.data.DataF; +import is2.data.Edges; +import is2.data.Parse; +import is2.data.ParseNBest; + +import java.util.ArrayList; +import java.util.concurrent.Callable; + +import decoder.ParallelRearrangeNBest.PA; + +import extractors.Extractor; + +/** + * @author Dr. Bernd Bohnet, 30.08.2009 + * + * This class implements a parallel edge rearrangement for non-projective parsing; + * The linear method was first suggest by Rayn McDonald et. al. 2005. 
+ */ +final public class ParallelRearrangeNBest2 implements Callable<Object> { + + // new parent child combination to explore + final static class PA { + final float p; + final short ch, pa; + + + public short[] heads,types; + + public PA(Parse p, short ch2, short pa2) { + this.p =(float)p.f1; + heads =p.heads; + types=p.labels; + ch=ch2;pa=pa2; + + } + } + + // list of parent child combinations + private static ArrayList<PA> parents = new ArrayList<PA>(); + + // some data from the dependency tree + private short[] pos; + private DataF x; + private float lastNBest; + private float threshold; + private Extractor extractor; + + + /** + * Initialize the parallel rearrange thread + * @param pos the part-of-speech + * @param x the data + * @param lastNBest + * @param isChild2 is a child + * @param edgesC the part-of-speech edge mapping + * @param s the heads + * @param ts the types + */ + public ParallelRearrangeNBest2(short[] pos , DataF x, float lastNBest, Extractor extractor, float threshold) { + + + + this.lastNBest =lastNBest; + this.pos =pos; + this.x=x; + + this.extractor = extractor; + this.threshold = threshold; + } + + public ArrayList<ParseNBest> parses = new ArrayList<ParseNBest>(); + + @Override + public Object call() { + + try { + + while(true) { + PA p = getPA(); + + if (p==null) return parses; + + short oldP = p.heads[p.ch], oldT = p.types[p.ch]; + p.heads[p.ch]=p.pa; + + short[] labels = Edges.get(pos[p.pa], pos[p.ch],p.ch<p.pa); + + for(int l=0;l<labels.length;l++) { + + p.types[p.ch]=labels[l]; + float p_new = extractor.encode3(pos, p.heads, p.types, x); + + if (p_new<lastNBest || ((p.p+this.threshold)>p_new)) continue; + + ParseNBest x = new ParseNBest(); + x.signature(p.heads, p.types); + x.f1=p_new; + parses.add(x); + } + + // change back + p.heads[p.ch]= oldP; p.types[p.ch]=oldT; + + // consider changes to labels only + labels = Edges.get(pos[oldP], pos[p.ch],p.ch<oldP); + + for(int l=0;l<labels.length;l++) { + + p.types[p.ch]=labels[l]; + float 
p_new = (float) extractor.encode3(pos, p.heads, p.types, x); + + // optimization: add only if larger than smallest of n-best + if (p_new<lastNBest || ((p.p+this.threshold)>p_new)) continue; + + ParseNBest x = new ParseNBest(); + x.signature(p.heads, p.types); + x.f1=p_new; + parses.add(x); + } + + p.heads[p.ch]= oldP; p.types[p.ch]=oldT; + } + } catch(Exception e) { + e.printStackTrace(); + } + return parses; + } + + /** + * Add a child-parent combination which are latter explored for rearrangement + * + * @param p2 + * @param ch2 + * @param pa + */ + public static void add(Parse p, short ch2, short pa) { + parents.add(new PA(p,ch2,pa)); + } + + public static PA getPA() { + synchronized(parents) { + if (parents.size()==0) return null; + return parents.remove(parents.size()-1); + } + } + + + +} diff --git a/dependencyParser/basic/mate-tools/src/examples/DependencyParser.java b/dependencyParser/basic/mate-tools/src/examples/DependencyParser.java new file mode 100644 index 0000000..c41a101 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/examples/DependencyParser.java @@ -0,0 +1,92 @@ +package examples; + + +import is2.data.InstancesTagger; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; +import is2.lemmatizer.Lemmatizer; +import is2.lemmatizer.MFO; +import is2.parser.Parser; +import is2.tag.Tagger; +//import org.apache.log4j.Logger; + +import java.io.File; +import java.util.Arrays; + +/** + * Dependency parsing + * + * @author B. 
Piwowarski <benjamin@bpiwowar.net> + * @date 10/10/12 + */ +//@TaskDescription(name = "dependency-parser", project = "mate-tools") +public class DependencyParser { + // final static private Logger LOGGER = Logger.getLogger(DependencyParser.class); + //@Argument(name = "lemmatizer", required = true, checkers = IOChecker.Readable.class) + File lemmatizerFile; + + //@Argument(name = "tagger", required = true) + File taggerFile; + + //@Argument(name = "parser", required = true) + File parserFile; + + //@Override + public int execute() throws Throwable { + + // Load lemmatizer + //LOGGER.info("Loading lemmatizer"); + // true = do uppercase lemmatization + Lemmatizer lemmatizer = new Lemmatizer(lemmatizerFile.getAbsolutePath()); + + // Load tagger + //LOGGER.info("Loading tagger"); + Tagger tagger = new Tagger(taggerFile.getAbsolutePath()); + + // Load parser + //LOGGER.info("Loading parser"); + Parser parser = new Parser(parserFile.getAbsolutePath()); + + + // Sentences to parse + String sentences[] = new String[]{ + "Airfields have been constructed on a number of the islands .", + "Private investment has even made an increasingly modern ferry fleet possible .", + "Politically , the 1990s have been relatively quite times for the islands ." 
+ }; + + CONLLReader09 reader = new CONLLReader09(CONLLReader09.NO_NORMALIZE); + + for (String sentence : sentences) { + // Prepare the sentence + InstancesTagger instanceTagger = new InstancesTagger(); + instanceTagger.init(1, new MFO()); + + String[] split = sentence.split("\\s+"); + String[] splitRoot = new String[split.length+1]; + System.arraycopy(split, 0, splitRoot, 1, split.length); + splitRoot[0] = CONLLReader09.ROOT; + + SentenceData09 instance = new SentenceData09(); + instance.init(splitRoot); + + reader.insert(instanceTagger, instance); + + SentenceData09 result = lemmatizer.apply(instance); + tagger.apply(result); + result = parser.parse(result, parser.params, false, parser.options); + + + // Output + System.out.println(Arrays.toString(result.forms)); + System.out.println(Arrays.toString(result.plemmas)); + System.out.println(Arrays.toString(result.ppos)); + System.out.println(Arrays.toString(result.pheads)); + System.out.println(Arrays.toString(result.plabels)); + System.out.println(); + + } + + return 0; + } +} diff --git a/dependencyParser/basic/mate-tools/src/examples/FullPipelineSpanish.java b/dependencyParser/basic/mate-tools/src/examples/FullPipelineSpanish.java new file mode 100644 index 0000000..a255595 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/examples/FullPipelineSpanish.java @@ -0,0 +1,98 @@ +package examples; + +import is2.data.SentenceData09; +import is2.io.CONLLWriter09; +import is2.lemmatizer.Lemmatizer; + +import is2.parser.Parser; +import is2.tag.Tagger; +import is2.tools.Tool; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.StringTokenizer; + +/** + * @author Bernd Bohnet, 13.09.2010 + * + * Illustrates the application the full pipeline: lemmatizer, morphologic, tagger, and parser + */ +public class FullPipelineSpanish { + + + // shows how to parse a sentences and call the tools + public static void main(String[] args) throws IOException { + + // Create a data container for a sentence + 
SentenceData09 i = new SentenceData09(); + + if (args.length==1) { // input might be a sentence: "This is another test ." + StringTokenizer st = new StringTokenizer(args[0]); + ArrayList<String> forms = new ArrayList<String>(); + + forms.add("<root>"); + while(st.hasMoreTokens()) forms.add(st.nextToken()); + + i.init(forms.toArray(new String[0])); + + } else { + // provide a default sentence: Haus has a mutated vowel + i.init(new String[] {"<root>","También","estuve","emocionado","pero","no","pude","imaginar","mi","vida","sin","la", + "gente","tan","intima","a","mí","."}); + + } + + // lemmatizing + + System.out.println("\nReading the model of the lemmatizer"); + Tool lemmatizer = new Lemmatizer("models/lemma-spa.model"); // create a lemmatizer + + System.out.println("Applying the lemmatizer"); + lemmatizer.apply(i); + + System.out.print(i.toString()); + System.out.print("Lemmata: "); for (String l : i.plemmas) System.out.print(l+" "); System.out.println(); + + // morphologic tagging + + System.out.println("\nReading the model of the morphologic tagger"); + is2.mtag.Tagger morphTagger = new is2.mtag.Tagger("models/mtag-spa.model"); + + System.out.println("\nApplying the morpholoigc tagger"); + morphTagger.apply(i); + + System.out.print(i.toString()); + System.out.print("Morph: "); for (String f : i.pfeats) System.out.print(f+" "); System.out.println(); + + // part-of-speech tagging + + System.out.println("\nReading the model of the part-of-speech tagger"); + Tool tagger = new Tagger("models/tag-spa.model"); + + System.out.println("\nApplying the part-of-speech tagger"); + tagger.apply(i); + + System.out.print(i.toString()); + System.out.print("Part-of-Speech tags: "); for (String p : i.ppos) System.out.print(p+" "); System.out.println(); + + // parsing + + System.out.println("\nReading the model of the dependency parser"); + Tool parser = new Parser("models/prs-spa.model"); + + System.out.println("\nApplying the parser"); + parser.apply(i); + + 
System.out.println(i.toString()); + + // write the result to a file + + CONLLWriter09 writer = new is2.io.CONLLWriter09("example-out.txt"); + + writer.write(i, CONLLWriter09.NO_ROOT); + writer.finishWriting(); + + } + + +} diff --git a/dependencyParser/basic/mate-tools/src/examples/FullPipelineTest.java b/dependencyParser/basic/mate-tools/src/examples/FullPipelineTest.java new file mode 100644 index 0000000..c8f992a --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/examples/FullPipelineTest.java @@ -0,0 +1,110 @@ +package examples; + + +import is2.data.InstancesTagger; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; +import is2.io.CONLLWriter09; +import is2.lemmatizer.Lemmatizer; +import is2.lemmatizer.MFO; +import is2.parser.Parser; +import is2.tag.Tagger; +//import org.apache.log4j.Logger; + +import java.io.File; +import java.util.Arrays; + +/** + * Dependency parsing + * + * @author B. Piwowarski <benjamin@bpiwowar.net> + * @date 10/10/12 + */ +//@TaskDescription(name = "dependency-parser", project = "mate-tools") +public class FullPipelineTest { + // final static private Logger LOGGER = Logger.getLogger(DependencyParser.class); + //@Argument(name = "lemmatizer", required = true, checkers = IOChecker.Readable.class) + public File lemmatizerFile; + + //@Argument(name = "tagger", required = true) + public File taggerFile; + + public File mtaggerFile; + + //@Argument(name = "parser", required = true) + public File parserFile; + + //@Override + public int execute(String source, String target) throws Throwable { + + // Load lemmatizer + //LOGGER.info("Loading lemmatizer"); + // true = do uppercase lemmatization + Lemmatizer lemmatizer = new Lemmatizer(lemmatizerFile.getAbsolutePath()); + + // Load tagger + //LOGGER.info("Loading tagger"); + Tagger tagger = new Tagger(taggerFile.getAbsolutePath()); + + is2.mtag.Tagger mtagger = new is2.mtag.Tagger(mtaggerFile.getAbsolutePath()); + + // Load parser + //LOGGER.info("Loading parser"); + Parser 
parser = new Parser(parserFile.getAbsolutePath()); + + + CONLLReader09 reader = new CONLLReader09(source); + CONLLWriter09 writer = new CONLLWriter09(target); + + int count=0; + while (true) { + // Prepare the sentence + InstancesTagger is = new InstancesTagger(); + is.init(1, new MFO()); + + SentenceData09 instance= reader.getNext(is); + if (instance ==null) break; + SentenceData09 result = null; +try { + + System.out.print("\b\b\b\b"+count); + result= lemmatizer.apply(instance); + + result = tagger.apply(result); + result= mtagger.apply(result); + result = parser.apply(result); + + count++; +} catch(Exception e) { + + System.out.println("error"+result); + System.out.println("error"+instance); + e.printStackTrace(); + break; +} + + // Output + writer.write(result); + + } + writer.finishWriting(); + return 0; + } + + public static void main(String args[]) throws Throwable { + + if (args.length<3) { + System.out.println("lemmatizer-model tagger-model parser-model source target"); + System.exit(0); + } + FullPipelineTest p = new FullPipelineTest(); + p.lemmatizerFile = new File(args[0]); + p.taggerFile = new File(args[1]); + p.mtaggerFile = new File(args[2]); + p.parserFile = new File(args[3]); + + p.execute(args[4], args[5]); + + } + +} diff --git a/dependencyParser/basic/mate-tools/src/examples/MorphTagger.java b/dependencyParser/basic/mate-tools/src/examples/MorphTagger.java new file mode 100644 index 0000000..0088426 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/examples/MorphTagger.java @@ -0,0 +1,79 @@ +package examples; + +import is2.data.SentenceData09; +import is2.lemmatizer.Lemmatizer; +import is2.lemmatizer.Options; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.StringTokenizer; + +/** + * @author Bernd Bohnet, 13.09.2010 + * + * Illustrates the application of some components: lemmatizer, tagger, and parser + */ +public class MorphTagger { + + + /** + * How to lemmatize a sentences? 
+ */ + public static void main(String[] args) throws IOException { + + + // Create a data container for a sentence + SentenceData09 i = new SentenceData09(); + + if (args.length==1) { // input might be a sentence: "This is another test ." + StringTokenizer st = new StringTokenizer(args[0]); + ArrayList<String> forms = new ArrayList<String>(); + + forms.add("<root>"); + while(st.hasMoreTokens()) forms.add(st.nextToken()); + + i.init(forms.toArray(new String[0])); + + } else { + // provide a default sentence + i.init(new String[] {"<root>","Häuser","hat","ein","Umlaut","."}); + } + + //print the forms + for (String l : i.forms) System.out.println("forms : "+l); + + // tell the lemmatizer the location of the model + is2.lemmatizer.Options optsLemmatizer = new Options(new String[] {"-model","models/lemma-ger.model"}); + + // create a lemmatizer + Lemmatizer lemmatizer = new Lemmatizer(optsLemmatizer.modelName); + + // lemmatize a sentence; the result is stored in the stenenceData09 i + lemmatizer.apply(i); + + + // output the lemmata + for (String l : i.plemmas) System.out.println("lemma : "+l); + + + is2.mtag.Options morphologicTaggerOptions = new is2.mtag.Options(new String[] {"-model","models/mtag-ger.model"}); + + is2.mtag.Tagger mt = new is2.mtag.Tagger(morphologicTaggerOptions); + + try { + + + // SentenceData09 snt = is2.mtag.Main.out(i.forms, lemmata); + + SentenceData09 snt = mt.apply(i); + for(String f : snt.pfeats) System.out.println("feats "+f); + + } catch(Exception e){ + e.printStackTrace(); + } + + + } + + +} diff --git a/dependencyParser/basic/mate-tools/src/examples/ParseOnly.java b/dependencyParser/basic/mate-tools/src/examples/ParseOnly.java new file mode 100755 index 0000000..cec31dd --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/examples/ParseOnly.java @@ -0,0 +1,50 @@ +package examples; + +import is2.data.SentenceData09; +import is2.parser.Options; +import is2.parser.Parser; + + +public class ParseOnly { + + public static void 
main(String[] args) { + + if (args.length ==0) { + plain(); + } + + } + + /** + * This example shows how to parse a sentence. + */ + public static void plain() { + + // initialize the options + String[] opts ={"-model","models/prs-eng-x.model"}; + Options options = new Options(opts); + + // create a parser + Parser parser = new Parser(options); + + // Create a data container for a sentence + SentenceData09 i = new SentenceData09(); + + // Provide the sentence + i.init(new String[] {"<root>","This","is","a","test","."}); + i.setPPos(new String[]{"<root-POS>","DT","VBZ","DT","NN","."}); + + // parse the sentence + SentenceData09 out = parser.apply(i); + + // output the sentence and dependency tree + System.out.println(out.toString()); + + // Get the parsing results + out.getLabels(); + out.getParents(); + + } + + +} diff --git a/dependencyParser/basic/mate-tools/src/examples/Pipeline.java b/dependencyParser/basic/mate-tools/src/examples/Pipeline.java new file mode 100644 index 0000000..e55869d --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/examples/Pipeline.java @@ -0,0 +1,95 @@ +package examples; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.StringTokenizer; + +import is2.data.SentenceData09; +import is2.lemmatizer.Lemmatizer; +import is2.parser.Options; +import is2.parser.Parser; +import is2.tag.Tagger; + +/** + * @author Bernd Bohnet, 13.09.2010 + * + * Illustrates the application of some components: lemmatizer, tagger, and parser + */ +public class Pipeline { + + + // how to parse a sentences and call the tools + public static void main(String[] args) throws IOException { + + + // Create a data container for a sentence + SentenceData09 i = new SentenceData09(); + + if (args.length==1) { // input might be a sentence: "This is another test ." 
+ StringTokenizer st = new StringTokenizer(args[0]); + ArrayList<String> forms = new ArrayList<String>(); + + forms.add("<root>"); + while(st.hasMoreTokens()) forms.add(st.nextToken()); + + i.init(forms.toArray(new String[0])); + + } else { + // provide a default sentence + i.init(new String[] {"<root>","This","is","a","test","."}); + } + + //print the forms + for (String l : i.forms) System.out.println("form : "+l); + + // tell the lemmatizer the location of the model + is2.lemmatizer.Options optsLemmatizer = new is2.lemmatizer.Options(new String[] {"-model","models/lemma-eng.model"}); + + // create a lemmatizer + Lemmatizer lemmatizer = new Lemmatizer(optsLemmatizer.modelName); + + // lemmatize a sentence; the result is stored in the stenenceData09 i + i = lemmatizer.apply(i); + + + // output the lemmata + for (String l : i.plemmas) System.out.println("lemma : "+l); + + // tell the tagger the location of the model + is2.tag.Options optsTagger = new is2.tag.Options(new String[]{"-model","models/tag-eng.model"}); + Tagger tagger = new Tagger(optsTagger); + + + +// String pos[] =tagger.tag(i.forms, i.lemmas); +// i.setPPos(pos); + + + SentenceData09 tagged = tagger.tag(i); + for (String p : tagged.ppos) System.out.println("pos "+p); + + + + // initialize the options + Options optsParser = new Options(new String[]{"-model","models/prs-eng-x.model"}); + + // create a parser + Parser parser = new Parser(optsParser); + + // parse the sentence (you get a copy of the input i) + SentenceData09 parse = parser.apply(tagged); + + System.out.println(parse.toString()); + + // create some trash on the hard drive :-) + is2.io.CONLLWriter09 writer = new is2.io.CONLLWriter09("example-out.txt"); + + writer.write(i); + writer.finishWriting(); + } + + + + +} diff --git a/dependencyParser/basic/mate-tools/src/extractors/Extractor.java b/dependencyParser/basic/mate-tools/src/extractors/Extractor.java new file mode 100644 index 0000000..327895d --- /dev/null +++ 
b/dependencyParser/basic/mate-tools/src/extractors/Extractor.java @@ -0,0 +1,59 @@ +/** + * + */ +package extractors; + +import is2.data.Cluster; +import is2.data.DataF; +import is2.data.FV; +import is2.data.IFV; +import is2.data.Instances; + +/** + * @author Dr. Bernd Bohnet, 29.04.2011 + * + * + */ +public interface Extractor { + + + /** + * Initializes the Extractor general parts + */ + public void initStat(); + + /** + * Initializes the Extractor specific parts + */ + public void init(); + + public int basic(short[] pos, int[] forms, int w1, int w2, Cluster cluster, IFV f); + + public void firstm(Instances is, int i, int w1, int w2, int j, Cluster cluster, long[] svs); + + public void siblingm(Instances is, int i, short[] pos, int[] forms, + int[] lemmas, short[][] feats, int w1, int w2, int g, int j, + Cluster cluster, long[] svs, int n); + + public void gcm(Instances is, int i, int w1, int w2, int g, int j, Cluster cluster, long[] svs); + + public int getType(); + + public FV encodeCat(Instances is, int n, short[] pos, int[] is2, + int[] is3, short[] heads, short[] labels, short[][] s, Cluster cl, + FV pred); + + public void setMaxForm(int integer); + + /** + * @return + */ + public int getMaxForm(); + + + public float encode3(short[] pos, short[] heads, short[] labs, DataF x); + + + + +} diff --git a/dependencyParser/basic/mate-tools/src/extractors/ExtractorClusterStacked.java b/dependencyParser/basic/mate-tools/src/extractors/ExtractorClusterStacked.java new file mode 100755 index 0000000..79a44ca --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/extractors/ExtractorClusterStacked.java @@ -0,0 +1,958 @@ +package extractors; + + +import is2.data.Cluster; +import is2.data.D4; +import is2.data.DataF; +import is2.data.Edges; +import is2.data.FV; +import is2.data.IFV; +import is2.data.Instances; +import is2.data.Long2IntInterface; +import is2.data.MFB; +import is2.util.DB; + + + +final public class ExtractorClusterStacked implements Extractor { + + 
public static int s_rel,s_word,s_type,s_dir,s_dist,s_feat,s_child,s_spath,s_lpath,s_pos; + + + final D4 d0 ,dl1,dl2, dwr,dr,dwwp,dw,dwp,dlf,d3lp, d2lp,d2pw,d2pp ; + + public final Long2IntInterface li; + + public ExtractorClusterStacked(Long2IntInterface li) { + + this.initFeatures(); + this.li=li; + d0 = new D4(li);dl1 = new D4(li);dl2 = new D4(li); + dwr = new D4(li); + dr = new D4(li); + dwwp = new D4(li); + + dw = new D4(li); + dwp = new D4(li); + + dlf = new D4(li); + d3lp = new D4(li); d2lp = new D4(li); d2pw = new D4(li); d2pp = new D4(li); + + } + + public void initStat() { + + + MFB mf = new MFB(); + s_rel = mf.getFeatureCounter().get(REL).intValue(); + s_pos = mf.getFeatureCounter().get(POS).intValue(); + s_word = mf.getFeatureCounter().get(WORD).intValue(); + s_type = mf.getFeatureCounter().get(TYPE).intValue();//mf.getFeatureBits(); + s_dir = mf.getFeatureCounter().get(DIR); + la = mf.getValue(DIR, LA); + ra = mf.getValue(DIR, RA); + s_dist = mf.getFeatureCounter().get(DIST);//mf.getFeatureBits(DIST); + s_feat = mf.getFeatureCounter().get(FEAT);//mf.getFeatureBits(Pipe.FEAT); + s_spath = mf.getFeatureCounter().get(Cluster.SPATH)==null?0:mf.getFeatureCounter().get(Cluster.SPATH);//mf.getFeatureBits(Cluster.SPATH); + s_lpath = mf.getFeatureCounter().get(Cluster.LPATH)==null?0:mf.getFeatureCounter().get(Cluster.LPATH);//mf.getFeatureBits(Cluster.LPATH); + } + + public void init(){ + // DB.println("init"); + d0.a0 = s_type;d0.a1 = s_pos;d0.a2 = s_pos;d0.a3 = s_pos;d0.a4 = s_pos;d0.a5 = s_pos;d0.a6 = s_pos;d0.a7 = s_pos; + dl1.a0 = s_type;dl1.a1 = s_rel; dl1.a2 = s_pos;dl1.a3 = s_pos; dl1.a4 = s_pos; dl1.a5 = s_pos; dl1.a6 = s_pos; dl1.a7 = s_pos; + dl2.a0 = s_type;dl2.a1 = s_rel;dl2.a2 = s_word;dl2.a3 = s_pos;dl2.a4 = s_pos;dl2.a5 = s_pos;dl2.a6 = s_pos;dl2.a7 = s_pos; + dwp.a0 = s_type; dwp.a1 = s_rel; dwp.a2 = s_word; dwp.a3 = s_pos; dwp.a4 = s_pos; dwp.a5 = s_word; + dwwp.a0 = s_type; dwwp.a1 = s_rel; dwwp.a2 = s_word; dwwp.a3 = s_word; dwwp.a4 = s_pos; 
dwwp.a5 = s_word; + dlf.a0 = s_type;dlf.a1 = s_rel; dlf.a2 = s_pos;dlf.a3 = s_pos; dlf.a4 = s_feat; dlf.a5 = s_feat; dlf.a6 = s_pos; dlf.a7 = s_pos; + d3lp.a0 = s_type; d3lp.a1 = s_rel; d3lp.a2 = s_lpath; d3lp.a3 = s_lpath; d3lp.a4 = s_lpath; d3lp.a5 = s_word; d3lp.a6 = s_spath; d3lp.a7 = s_spath; + d2lp.a0 = s_type; d2lp.a1 = s_rel; d2lp.a2 = s_lpath; d2lp.a3 = s_lpath; d2lp.a4 = s_word; d2lp.a5 = s_word; //d3lp.a6 = s_spath; d3lp.a7 = s_spath; + d2pw.a0 = s_type; d2pw.a1 = s_rel; d2pw.a2 = s_lpath; d2pw.a3 = s_lpath; d2pw.a4 = s_word; d2pw.a5 = s_word; //d3lp.a6 = s_spath; d3lp.a7 = s_spath; + d2pp.a0 = s_type; d2pp.a1 = s_rel; d2pp.a2 = s_lpath; d2pp.a3 = s_lpath; d2pp.a4 = s_pos; d2pp.a5 = s_pos; //d3lp.a6 = s_spath; d3lp.a7 = s_spath; + } + + + public int basic(short[] pposs, int[] form, int p, int d, Cluster cluster, IFV f) + { + + d0.clean(); dl1.clean(); dl2.clean(); dwp.clean(); dwwp.clean(); dlf.clean(); d3lp.clean(); + + d3lp.clean(); d2lp.clean();d2pw.clean(); d2pp.clean(); + + int n=1; + int dir= (p < d)? ra:la; + d0.v0= n++; d0.v1=pposs[p]; d0.v2=pposs[d]; //d0.stop=4; + int end= (p >= d ? p : d); + int start = (p >= d ? d : p) + 1; + + for(int i = start ; i <end ; i++) { + d0.v3=pposs[i]; + d0.cz4(); + d0.csa(s_dir,dir,f); + } + return n; + } + + + public void firstm(Instances is, int i, + int prnt, int dpnt, int label, Cluster cluster, long[] f) + { + + + //short[] pposs, int[] form, int[] lemmas, short[][] feats + for(int k=0;k<f.length;k++) f[k]=0; + + short[] pposs = is.pposs[i]; + int[] form =is.forms[i]; + short[][] feats = is.feats[i]; + + + int pF = form[prnt],dF = form[dpnt]; + int pL = is.plemmas[i][prnt],dL = is.plemmas[i][dpnt]; + int pP = pposs[prnt],dP = pposs[dpnt]; + + int prntLS = pF==-1?-1:cluster.getLP(pF), chldLS = dF==-1?-1:cluster.getLP(dF); + + final int dir= (prnt < dpnt)? 
ra:la; + + if (pF>maxForm) pF=-1; + if (pL>maxForm) pL=-1; + + if (dF>maxForm) dF=-1; + if (dL>maxForm) dL=-1; + + + int n=3,c=0; + + dl2.v1=label; + dl2.v0= n++; dl2.v2=pF; dl2.v3=dP; dl2.cz4(); f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++; dl2.cz3(); f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++; dl2.v2=dF; dl2.v3=pP; dl2.cz4(); f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++; dl2.cz3(); f[c++]=dl2.csa(s_dir,dir); + + + dwwp.v1=label; + dwwp.v0= n++; dwwp.v2=pF; dwwp.v3=dF; dwwp.cz4(); f[c++]=dwwp.csa(s_dir,dir); + + dl1.v1=label; + dl1.v0= n++; dl1.v2=dP; dl1.cz3(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=pP; dl1.cz3(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v3=dP; dl1.cz4(); f[c++]=dl1.csa(s_dir,dir); + + int pPm1 = prnt > 0 ? pposs[prnt - 1] : s_str, dPm1 = dpnt > 0 ? pposs[dpnt - 1] : s_str; + int pPp1 = prnt < pposs.length - 1 ? pposs[prnt + 1]:s_end, dPp1 = dpnt < pposs.length - 1 ? pposs[dpnt + 1]:s_end; + + int pPm2 = prnt > 1 ? pposs[prnt - 2] : s_str, dPm2 = dpnt > 1 ? pposs[dpnt - 2] : s_str; + int pPp2 = prnt < pposs.length - 2 ? pposs[prnt + 2]:s_end, dPp2 = dpnt < pposs.length - 2 ? pposs[dpnt + 2]:s_end; + + int pFm1 = prnt > 0 ? form[prnt - 1] : s_stwrd, dFm1 = dpnt > 0 ? form[dpnt - 1] : s_stwrd; + int pFp1 = prnt < form.length - 1 ? form[prnt + 1]:s_stwrd, dFp1 = dpnt < form.length - 1 ? 
form[dpnt + 1]:s_stwrd; + + + + dl1.v0= n++;dl1.v2=pP; dl1.v3=pPp1; dl1.v4=dP;dl1.v5=dPp1; dl1.cz6(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v5=dPm1; dl1.cz6(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v3=pPm1; dl1.cz6(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v5=dPp1; dl1.cz6(); f[n++]=dl1.csa(s_dir,dir); + + + dl1.v0= n++; dl1.v3=pPm1; dl1.cz5(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v3=dPm1; dl1.cz5(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v3=dPp1; dl1.cz5(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v3=pPp1; dl1.cz5(); f[n++]=dl1.csa(s_dir,dir); + + dl1.v0= n++;dl1.v2=pP; dl1.v3=pPp2; dl1.v4=dP;dl1.v5=dPp2; dl1.cz6(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v5=dPm2; dl1.cz6(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v3=pPm2; dl1.cz6(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v5=dPp2; dl1.cz6(); f[n++]=dl1.csa(s_dir,dir); + + dl1.v0= n++; dl1.v3=pPm2; dl1.cz5(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v3=dPm2; dl1.cz5(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v3=dPp2; dl1.cz5(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v3=pPp2; dl1.cz5(); f[n++]=dl1.csa(s_dir,dir); + + + + dl2.v0= n++; dl2.v3=dFm1; dl2.v3=pPp1;dl2.v4=pP; dl2.cz5(); f[n++]=dl2.getVal(); + dl2.v0= n++; dl2.v3=dFp1; dl2.v3=pPm1; dl2.cz5(); f[n++]=dl2.getVal(); + dl2.v0= n++; dl2.v3=pFm1; dl2.v3=dPp1;dl2.v4=dP; dl2.cz5(); f[n++]=dl2.getVal(); + dl2.v0= n++; dl2.v3=pFp1; dl2.v3=dPm1; dl2.cz5(); f[n++]=dl2.getVal(); + + + dl2.v0= n++; dl2.v3=dFm1; dl2.v3=dPm2;dl2.v4=pP; dl2.cz5(); f[n++]=dl2.getVal(); + dl2.v0= n++; dl2.v3=dFp1; dl2.v3=dPp2; dl2.cz5(); f[n++]=dl2.getVal(); + dl2.v0= n++; dl2.v3=pFm1; dl2.v3=pPm2;dl2.v4=dP; dl2.cz5(); f[n++]=dl2.getVal(); + dl2.v0= n++; dl2.v3=pFp1; dl2.v3=pPp2; dl2.cz5(); f[n++]=dl2.getVal(); + + + dwwp.v0= n++; dwwp.v2=pF; dwwp.v3=dF; dwwp.v4=dP; dwwp.cz5(); f[n++]=dwwp.csa(s_dir,dir); + dwwp.v0= n++; dwwp.v2=pF; dwwp.v3=dF; dwwp.v4=pP; dwwp.cz5(); f[n++]=dwwp.csa(s_dir,dir); + dwwp.v0= n++; 
dwwp.v2=dF; dwwp.v3=pF; dwwp.v4=pP; dwwp.v4=dP; dwwp.cz6(); f[n++]=dwwp.csa(s_dir,dir); + + + + // lemmas + + dl2.v1=label; + dl2.v0= n++; dl2.v2=pL; dl2.v3=dP; dl2.cz4(); f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++; dl2.cz3(); f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++; dl2.v2=dL; dl2.v3=pP; dl2.cz4(); f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++; dl2.cz3(); f[c++]=dl2.csa(s_dir,dir); + + + dwwp.v1=label; + dwwp.v0= n++; dwwp.v2=pL; dwwp.v3=dL; dwwp.cz4(); f[c++]=dwwp.csa(s_dir,dir); + + dwp.v1= label; + dwp.v0=n++;dwp.v2=dL; dwp.v3=pP;dwp.v4=dP;dwp.v5=pL; dwp.cz6(); f[c++]=dwp.csa(s_dir,dir); + dwp.v0=n++;dwp.cz5(); f[c++]=dwp.csa(s_dir,dir); + + dwp.v0=n++;dwp.v2=pL; dwp.cz5(); f[c++]=dwp.csa(s_dir,dir); + dwwp.v0= n++; dwwp.v2=pL; dwwp.v3=dL; dwwp.v4=dP; dwwp.cz5(); f[c++]=dwwp.csa(s_dir,dir); + dwwp.v0= n++; dwwp.v4=pP; dwwp.cz5(); f[c++]=dwwp.csa(s_dir,dir); + + + // cluster + + d2pw.v1=label; + d2pw.v0=n++; d2pw.v2=prntLS; d2pw.v3=chldLS; d2pw.cz4(); f[c++]=d2pw.csa(s_dir,dir); + d2pw.v0=n++; d2pw.v4=pF; d2pw.cz5(); f[c++]=d2pw.csa(s_dir,dir); + d2pw.v0=n++; d2pw.v4=dF; d2pw.cz5(); f[c++]=d2pw.csa(s_dir,dir); + d2pw.v0=n++; d2pw.v5=pF; d2pw.cz6(); f[c++]=d2pw.csa(s_dir,dir); + + + d2pp.v1=label; + d2pp.v0=n++; d2pp.v2=prntLS; d2pp.v3=chldLS; d2pp.cz4(); f[c++]=d2pp.csa(s_dir,dir); + d2pp.v0=n++; d2pp.v4=pP; d2pp.cz5(); f[c++]=d2pp.csa(s_dir,dir); + d2pp.v0=n++; d2pp.v4=dP; d2pp.cz5(); f[c++]=d2pp.csa(s_dir,dir); + d2pp.v0=n++; d2pp.v5=pP; d2pp.cz6(); f[c++]=d2pp.csa(s_dir,dir); + + + short[] prel = is.plabels[i]; + short[] phead = is.pheads[i]; + + + //take those in for stacking + // dl2.v1=label; + // dl2.v0= n++;dl2.v2=prel[dpnt];dl2.v3=pP;dl2.v4=dP; dl2.v5=prnt==phead[dpnt]?1:2; dl2.cz6(); f[c++]=dl2.csa(s_dir,dir); + // dl2.v0= n++;dl2.v2=pP;dl2.v3=dP; dl2.v4=prnt==phead[dpnt]?1:2; dl2.cz5(); f[c++]=dl2.csa(s_dir,dir); + + + + if (feats==null) return; + + short[] featsP =feats[prnt], featsD =feats[dpnt]; + dlf.v0= n++; dlf.v1=label; dlf.v2=pP; dlf.v3=dP; + 
extractFeat(f, c, dir, featsP, featsD); + + return; + } + + + + public void gcm(Instances is , int i, int p, int d, int gc, int label,Cluster cluster, long[] f) { + + for(int k=0;k<f.length;k++) f[k]=0; + + short[] pos= is.pposs[i]; + int[] forms=is.forms[i]; + int[] lemmas=is.plemmas[i]; + short[][] feats=is.feats[i]; + + int pP = pos[p], dP = pos[d]; + int prntF = forms[p], chldF = forms[d]; + int prntL = lemmas[p], chldL = lemmas[d]; + int prntLS = prntF==-1?-1:cluster.getLP(prntF), chldLS = chldF==-1?-1:cluster.getLP(chldF); + + int gP = gc != -1 ? pos[gc] : s_str; + int gcF = gc != -1 ? forms[gc] : s_stwrd; + int gcL = gc != -1 ? lemmas[gc] : s_stwrd; + int gcLS = (gc != -1) && (gcF!=-1) ? cluster.getLP(gcF) : s_stwrd; + + if (prntF>maxForm) prntF=-1; + if (prntL>maxForm) prntL=-1; + + if (chldF>maxForm) chldF=-1; + if (chldL>maxForm) chldL=-1; + + if (gcF>maxForm) gcF=-1; + if (gcL>maxForm) gcL=-1; + + + int dir= (p < d)? ra:la, dir_gra =(d < gc)? ra:la; + + int n=84,c=0; + + //dl1.v023(); + dl1.v1=label; + dl1.v0= n++; dl1.v2=pP; dl1.v3=dP;dl1.v4=gP; dl1.cz5(); dl1.cs(s_dir,dir);f[c++]=dl1.csa(s_dir,dir_gra); + dl1.v0= n++; dl1.v2=pP; dl1.v3=gP; dl1.cz4();dl1.cs(s_dir,dir);f[c++]=dl1.csa(s_dir,dir_gra); + dl1.v0= n++; dl1.v2=dP; dl1.cz4(); dl1.cs(s_dir,dir);f[c++]=dl1.csa(s_dir,dir_gra); + + dwwp.v1=label; + dwwp.v0= n++; dwwp.v2=prntF; dwwp.v3=gcF; + dwwp.cz4(); dwwp.cs(s_dir,dir);f[c++]=dwwp.csa(s_dir,dir_gra); + + dwwp.v0= n++; dwwp.v2=chldF; dwwp.v3=gcF; + dwwp.cz4(); dwwp.cs(s_dir,dir);f[c++]=dwwp.csa(s_dir,dir_gra); + + dwp.v1=label; + dwp.v0= n++; dwp.v2=gcF; dwp.v3=pP; + dwp.cz4(); dwp.cs(s_dir,dir);f[c++]=dwp.csa(s_dir,dir_gra); + + dwp.v0= n++; dwp.v2=gcF; dwp.v3=dP; + dwp.cz4(); dwp.cs(s_dir,dir);f[c++]=dwp.csa(s_dir,dir_gra); + + dwp.v0= n++; dwp.v2=prntF; dwp.v3=gP; + dwp.cz4(); dwp.cs(s_dir,dir);f[c++]=dwp.csa(s_dir,dir_gra); + + dwp.v0= n++; dwp.v2=chldF; dwp.v3=gP; + dwp.cz4(); dwp.cs(s_dir,dir); f[c++]=dwp.csa(s_dir,dir_gra); + + + // lemma 
+ + dwwp.v0= n++; dwwp.v2=prntL; dwwp.v3=gcL; + dwwp.cz4();dwwp.cs(s_dir,dir);f[c++]=dwwp.csa(s_dir,dir_gra); + + dwwp.v0= n++; dwwp.v2=chldL; dwwp.v3=gcL; + dwwp.cz4(); dwwp.cs(s_dir,dir);f[c++]=dwwp.csa(s_dir,dir_gra); + + dwp.v0= n++; dwp.v2=gcL; dwp.v3=pP; + dwp.cz4(); dwp.cs(s_dir,dir);f[c++]=dwp.csa(s_dir,dir_gra); + + dwp.v0= n++; dwp.v2=gcL; dwp.v3=dP; + dwp.cz4(); dwp.cs(s_dir,dir);f[c++]=dwp.csa(s_dir,dir_gra); + + dwp.v0= n++; dwp.v2=prntL; dwp.v3=gP; + dwp.cz4(); dwp.cs(s_dir,dir);f[c++]=dwp.csa(s_dir,dir_gra); + + dwp.v0= n++; dwp.v2=chldL; dwp.v3=gP; + dwp.cz4(); dwp.cs(s_dir,dir); f[c++]=dwp.csa(s_dir,dir_gra); + + + // clusters + + d2lp.v1= label; + d2lp.v0= n++; d2lp.v2=prntLS; d2lp.v3=gcLS; d2lp.cz4(); d2lp.cs(s_dir,dir);f[c++]=d2lp.csa(s_dir,dir_gra);// f.add(li.l2i(l)); + d2lp.v0= n++; d2lp.v2=chldLS; d2lp.v3=gcLS; d2lp.cz4(); d2lp.cs(s_dir,dir);f[c++]=d2lp.csa(s_dir,dir_gra); + d3lp.v0= n++; d3lp.v1= label; d3lp.v2=prntLS; d3lp.v3=chldLS; d3lp.v4=gcLS; d3lp.cz5(); d3lp.cs(s_dir,dir);f[c++]=d3lp.csa(s_dir,dir_gra); + + //_f83; + d2lp.v0= n++; d2lp.v2=prntLS; d2lp.v3=chldLS; d2lp.v4=gcF; d2lp.cz5(); f[c++]=d2lp.csa(s_dir,dir); + d2lp.v0= n++; d2lp.v2=prntLS; d2lp.v3=gcLS; d2lp.v4=chldF; d2lp.cz5(); f[c++]=d2lp.csa(s_dir,dir); + d2lp.v0= n++; d2lp.v2=chldLS; d2lp.v3=gcLS; d2lp.v4=prntF; d2lp.cz5(); f[c++]=d2lp.csa(s_dir,dir); + + d2pp.v1= label; + d2pp.v0= n++; d2pp.v2=prntLS; d2pp.v3=chldLS; d2pp.v4=gP; d2pp.cz5(); f[c++]=d2pp.csa(s_dir,dir); + d2pp.v0= n++; d2pp.v2=prntLS; d2pp.v3=gcLS; d2pp.v4=dP; d2pp.cz5(); f[c++]=d2pp.csa(s_dir,dir); + d2pp.v0= n++; d2pp.v2=chldLS; d2pp.v3=gcLS; d2pp.v4=pP; d2pp.cz5(); f[c++]=d2pp.csa(s_dir,dir); + + + + // linear features + + int prntPm1 = p != 0 ? pos[p - 1] : s_str; // parent-pos-minus1 + int chldPm1 = d - 1 >=0 ? pos[d - 1] : s_str; // child-pos-minus1 + int prntPp1 = p != pos.length - 1 ? pos[p + 1] : s_end; + int chldPp1 = d != pos.length - 1 ? pos[d + 1] : s_end; + + int gcPm1 = gc > 0 ? 
pos[gc - 1] : s_str; + int gcPp1 = gc < pos.length - 1 ? pos[gc + 1] : s_end; + + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcPp1;dl1.v4=dP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcPm1;dl1.v4=dP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=dP;dl1.v4=chldPp1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=dP;dl1.v4=chldPm1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcPp1;dl1.v4=chldPm1;dl1.v5=dP;dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gcPm1; dl1.v3=gP;dl1.v4=chldPm1;dl1.v5=dP; dl1.cz6();f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcPp1;dl1.v4=dP;dl1.v5=chldPp1; dl1.cz6();f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gcPm1; dl1.v3=gP;dl1.v4=dP;dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcPp1;dl1.v4=pP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcPm1;dl1.v4=pP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=pP;dl1.v4=prntPp1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=pP;dl1.v4=prntPm1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcPp1;dl1.v4=prntPm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gcPm1; dl1.v3=gP;dl1.v4=prntPm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcPp1;dl1.v4=pP;dl1.v5=prntPp1; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gcPm1; dl1.v3=gP; dl1.v4=pP; dl1.v5=prntPp1;dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + + + int pLSp1 = p != pos.length - 1 ? forms[p + 1]==-1?-1:cluster.getLP(forms[p + 1]): _cend; + int cLSp1 = d != pos.length - 1 ? forms[d + 1] ==-1?-1:cluster.getLP(forms[d + 1]):_cend; + int gcLSp1 = gc < pos.length -1 ? forms[gc + 1] ==-1?-1:cluster.getLP(forms[gc + 1]) : s_end; + + int pLSm1 = p != 0 ? lemmas[p - 1]==-1?-1:cluster.getLP(lemmas[p - 1]): _cstr; + int cLSm1 = d - 1 >=0 ? 
lemmas[d - 1] ==-1?-1:cluster.getLP(lemmas[d - 1]):_cstr; + int gcLSm1 = gc > 0 ? lemmas[gc - 1] ==-1?-1:cluster.getLP(lemmas[gc - 1]) : _cstr; + + + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcLSp1;dl1.v4=dP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcLSm1;dl1.v4=dP; dl1.cz5();f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=dP;dl1.v4=cLSp1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=dP;dl1.v4=cLSm1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcLSp1;dl1.v4=cLSm1;dl1.v5=dP;dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gcLSm1; dl1.v3=gP;dl1.v4=cLSm1;dl1.v5=dP; dl1.cz6();f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcLSp1;dl1.v4=dP;dl1.v5=cLSp1; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=cLSm1; dl1.v3=gP;dl1.v4=dP;dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcLSp1;dl1.v4=pP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcLSm1;dl1.v4=pP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=pP;dl1.v4=pLSp1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=pP;dl1.v4=pLSm1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcLSp1;dl1.v4=pLSm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gcLSm1; dl1.v3=gP;dl1.v4=pLSm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcLSp1;dl1.v4=pP;dl1.v5=pLSp1; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gcLSm1; dl1.v3=gP; dl1.v4=pP; dl1.v5=pLSp1;dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + + + + short[] prel = is.plabels[i],phead=is.pheads[i]; + + int g = p==phead[d]?1:2 ; + if (gc>=0) g += d==phead[gc]?4:8; + + int gr = gc==-1?s_relend:prel[gc]; + + // take those in for stacking + /* + dl2.v1=label; + dl2.v0= n++;dl2.v2=prel[d];dl2.v3=g;dl2.v4=gP;dl2.v5=dP;dl2.cz6();f[c++]=dl2.csa(s_dir,dir); + dl2.v0= 
n++;dl2.v2=prel[d];dl2.v3=g;dl2.v4=gP;dl2.v5=pP;dl2.cz6();f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++;dl2.v2=prel[d];dl2.v3=g;dl2.v4=gP;dl2.v5=pP;dl2.v6=dP;dl2.cz7();f[c++]=dl2.csa(s_dir,dir); + + dl2.v0= n++;dl2.v2=gr;dl2.v3=g;dl2.v4=gP;dl2.v5=dP;dl2.cz6();f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++;dl2.v2=gr;dl2.v3=g;dl2.v4=gP;dl2.v5=pP;dl2.cz6();f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++;dl2.v2=gr;dl2.v3=g;dl2.v4=gP;dl2.v5=pP;dl2.v6=dP;dl2.cz7();f[c++]=dl2.csa(s_dir,dir); + +*/ + if (feats==null) return; + + short[] featsP =feats[d]; + short[] featsD =gc!=-1?feats[gc]:null; + + dlf.v0= n++; dlf.v1=label; dlf.v2=gP; dlf.v3=dP; + extractFeat(f, c, dir, featsP, featsD); + return; + } + + + public void siblingm(Instances is , int i,short pos[], int forms[], int[] lemmas, short[][] feats, int prnt, int d, int sblng, int label, Cluster cluster, long[] f, int v) + { + + for(int k=0;k<f.length;k++) f[k]=0; + + int pP = pos[prnt], dP = pos[d]; + int prntF = forms[prnt],chldF = forms[d]; + int prntL = lemmas[prnt], chldL = lemmas[d]; + int prntLS = prntF==-1?-1:cluster.getLP(prntF), chldLS = chldF==-1?-1:cluster.getLP(chldF); + + int sP = sblng!=-1 ? pos[sblng] : s_str, sblF = sblng!=-1 ? forms[sblng] : s_stwrd, sblL = sblng!=-1 ? lemmas[sblng] : s_stwrd; + + int sblLS = (sblng != -1)&&(sblF!=-1) ? cluster.getLP(sblF) : s_stwrd; + + + int dir= (prnt < d)? 
ra:la; + + int abs = Math.abs(prnt-d); + + final int dist; + if (abs > 10)dist=d10;else if (abs>5) dist=d5;else if( abs==5)dist=d4;else if (abs==4)dist=d3;else if (abs==3)dist=d2; + else if (abs==2)dist=d1; else dist=di0; + + int n=147; + + if (prntF>maxForm) prntF=-1; + if (prntL>maxForm) prntL=-1; + + if (chldF>maxForm) chldF=-1; + if (chldL>maxForm) chldL=-1; + + if (sblF>maxForm) sblF=-1; + if (sblL>maxForm) sblL=-1; + + + dl1.v0= n++; dl1.v1=label;dl1.v2=pP; dl1.v3=dP;dl1.v4=sP; dl1.cz5(); f[0]=dl1.csa(s_dir,dir);f[1]=dl1.csa(s_dist,dist); + dl1.v0= n++; dl1.v3=sP; dl1.cz4(); f[2]=dl1.csa(s_dir,dir); f[3]=dl1.csa(s_dist,dist); + dl1.v0= n++; dl1.v2=dP;dl1.cz4(); f[4]=dl1.csa(s_dir,dir); f[5]=dl1.csa(s_dist,dist); + + // sibling only could be tried + dwwp.v1=label; + dwwp.v0= n++; dwwp.v2=prntF; dwwp.v3=sblF; dwwp.cz4(); f[6]=dwwp.csa(s_dir,dir); f[7]=dwwp.csa(s_dist,dist); + dwwp.v0= n++; dwwp.v2=chldF; dwwp.cz4(); f[8]=dwwp.csa(s_dir,dir); f[9]=dwwp.csa(s_dist,dist); + dwp.v0= n++; dwp.v1=label; dwp.v2=sblF; dwp.v3=pP; dwp.cz4(); f[10]=dwp.csa(s_dir,dir); f[11]=dwp.csa(s_dist,dist); + dwp.v0= n++; /*dwp.v1=label; */dwp.v3=dP; dwp.cz4(); f[12]=dwp.csa(s_dir,dir); f[13]=dwp.csa(s_dist,dist); + dwp.v0= n++; /*dwp.v1=label;*/ dwp.v2=prntF; dwp.v3=sP; dwp.cz4(); f[14]=dwp.csa(s_dir,dir); f[15]=dwp.csa(s_dist,dist); + dwp.v0= n++; /*dwp.v1=label;*/ dwp.v2=chldF; dwp.cz4(); f[16]=dwp.csa(s_dir,dir); f[17]=dwp.csa(s_dist,dist); + + //lemmas + dwwp.v0= n++; dwwp.v2=prntL; dwwp.v3=sblL; dwwp.cz4(); f[18]=dwwp.csa(s_dir,dir); + dwwp.v0= n++; dwwp.v2=chldL; dwwp.cz4(); f[19]=dwwp.csa(s_dir,dir); f[20]=dwwp.csa(s_dist,dist); + dwp.v0= n++; /*dwp.v1=label;*/ dwp.v2=sblL; dwp.v3=pP; dwp.cz4(); f[21]=dwp.csa(s_dir,dir); f[22]=dwp.csa(s_dist,dist); + dwp.v0= n++; /*dwp.v1=label; */ dwp.v3=dP; dwp.cz4(); f[23]=dwp.csa(s_dir,dir);f[24]=dwp.csa(s_dist,dist); + dwp.v0= n++; /*dwp.v1=label;*/ dwp.v2=prntL; dwp.v3=sP; dwp.cz4(); f[25]=dwp.csa(s_dir,dir); f[26]=dwp.csa(s_dist,dist); 
+ dwp.v0= n++; /*dwp.v1=label;*/ dwp.v2=chldL; dwp.cz4(); f[27]=dwp.csa(s_dir,dir);f[28]=dwp.csa(s_dist,dist); + + + // clusters + + d2lp.v1=label; + d2lp.v0= n++; d2lp.v2=prntLS; d2lp.v3=sblLS; d2lp.cz4(); f[29]=d2lp.csa(s_dir,dir); + d2lp.v0= n++; d2lp.v2=chldLS; d2lp.v3=sblLS; d2lp.cz4(); f[30]=d2lp.csa(s_dir,dir); f[31]=d2lp.csa(s_dist,dist); + + d3lp.v1= label; + d3lp.v0= n++; d3lp.v2=prntLS; d3lp.v3=chldLS; d3lp.v4=sblLS;d3lp.cz5(); f[32]=d3lp.csa(s_dir,dir); + + d2lp.v0= n++; d2lp.v2=prntLS; d2lp.v3=chldLS; d2lp.v4=sblF; d2lp.cz5(); f[33]=d2lp.csa(s_dir,dir); f[34]=d2lp.csa(s_dist,dist); + d2lp.v0= n++; d2lp.v2=prntLS; d2lp.v3=sblLS; d2lp.v4=chldF; d2lp.cz5(); f[35]=d2lp.csa(s_dir,dir); f[36]=d2lp.csa(s_dist,dist); + d2lp.v0= n++; d2lp.v2=chldLS; d2lp.v3=sblLS; d2lp.v4=prntF; d2lp.cz5(); f[37]=d2lp.csa(s_dir,dir); f[38]=d2lp.csa(s_dist,dist); + + d2pp.v1=label; + d2pp.v0= n++; d2pp.v2=prntLS; d2pp.v3=chldLS; d2pp.v4=sP; d2pp.cz5(); f[39]=d2pp.csa(s_dir,dir); f[40]=d2pp.csa(s_dist,dist); + d2pp.v0= n++; d2pp.v2=prntLS; d2pp.v3=sblLS; d2pp.v4=dP; d2pp.cz5(); f[41]=d2pp.csa(s_dir,dir); f[42]=d2pp.csa(s_dist,dist); + d2pp.v0= n++; d2pp.v2=chldLS; d2pp.v3=sblLS; d2pp.v4=pP; d2pp.cz5(); f[43]=d2pp.csa(s_dir,dir); f[44]=d2pp.csa(s_dist,dist); + + + int prntPm1 = prnt!=0 ? pos[prnt-1] : s_str; + int chldPm1 = d-1>=0 ? pos[d-1] : s_str; + int prntPp1 = prnt!=pos.length-1 ? pos[prnt+1] : s_end; + int chldPp1 = d!=pos.length-1 ? pos[d+1] : s_end; + + // sibling part of speech minus and plus 1 + int sblPm1 = sblng>0 ? pos[sblng-1]:s_str; + int sblPp1 = sblng<pos.length-1 ? 
pos[sblng + 1]:s_end; + + dl1.v0=n++; dl1.v2=sP; dl1.v3=sblPp1;dl1.v4=pP; dl1.cz5(); f[45]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sblPm1;dl1.v4=pP; dl1.cz5(); f[46]=dl1.csa(s_dir,dir);// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sP; dl1.v3=pP;dl1.v4=prntPp1;dl1.cz5(); f[47]=dl1.csa(s_dir,dir);// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sP; dl1.v3=pP;dl1.v4=prntPm1; dl1.cz5(); f[48]=dl1.csa(s_dir,dir);// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sblPp1;dl1.v4=prntPm1;dl1.v5=pP; dl1.cz6(); f[49]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sblPm1; dl1.v3=sP;dl1.v4=prntPm1;dl1.v5=pP;dl1.cz6(); f[50]=dl1.csa(s_dir,dir);// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sblPp1;dl1.v4=pP;dl1.v5=prntPp1; dl1.cz6(); f[51]=dl1.csa(s_dir,dir);// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sblPm1; dl1.v3=sP; dl1.v4=pP;dl1.v5=prntPp1; dl1.cz6(); f[52]=dl1.csa(s_dir,dir);// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sblPp1;dl1.v4=dP; dl1.cz5(); f[53]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sblPm1;dl1.v4=dP; dl1.cz5(); f[54]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=dP;dl1.v4=chldPp1;dl1.cz5(); f[55]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=dP;dl1.v4=chldPm1; dl1.cz5(); f[56]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sblPp1;dl1.v4=chldPm1;dl1.v5=dP; dl1.cz6(); f[57]=dl1.csa(s_dir,dir);// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sblPm1; dl1.v3=sP;dl1.v4=chldPm1;dl1.v5=dP;dl1.cz6(); f[58]=dl1.csa(s_dir,dir); + dl1.v0= n++;dl1.v2=sP; dl1.v3=sblPp1;dl1.v4=dP;dl1.v5=chldPp1;dl1.cz6();f[59]=dl1.csa(s_dir,dir);// f.add(li.l2i(l)); + dl1.v0= n++; dl1.v2=sblPm1; dl1.v3=sP;dl1.v4=dP;dl1.v5=chldPp1;dl1.cz6(); f[60]=dl1.csa(s_dir,dir); + + int c=61; + + int pLSp1 = prnt != pos.length - 1 ? forms[prnt + 1]==-1?-1:cluster.getLP(forms[prnt + 1]): _cend; + int cLSp1 = d != pos.length - 1 ? forms[d + 1] ==-1?-1:cluster.getLP(forms[d + 1]):_cend; + int sLSp1 = sblng < pos.length -1 ? 
forms[sblng + 1] ==-1?-1:cluster.getLP(forms[sblng + 1]) : _cend; + + int pLSm1 = prnt!=0 ? forms[prnt - 1]==-1?-1:cluster.getLP(forms[prnt - 1]): _cstr; + int cLSm1 = d-1>=0 ? forms[d - 1] ==-1?-1:cluster.getLP(forms[d - 1]):_cstr; + int sLSm1 = sblng>0 ? forms[sblng - 1] ==-1?-1:cluster.getLP(forms[sblng - 1]):_cstr; + + //int c=61; + + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=pP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSm1;dl1.v4=pP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=pP;dl1.v4=pLSp1;dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=pP;dl1.v4=pLSm1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=pLSm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP;dl1.v4=pLSm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=pP;dl1.v5=pLSp1; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP; dl1.v4=pP;dl1.v5=pLSp1; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=dP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSm1;dl1.v4=dP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=dP;dl1.v4=cLSp1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++;dl1.v2=sP; dl1.v3=dP;dl1.v4=cLSm1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSm1;dl1.v4=cLSm1;dl1.v5=dP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP;dl1.v4=cLSm1;dl1.v5=dP;dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++;dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=dP;dl1.v5=cLSp1;dl1.cz6();f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP;dl1.v4=dP;dl1.v5=cLSp1; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + + + + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=pP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSm1;dl1.v4=pP; dl1.cz5(); 
f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=pP;dl1.v4=pLSp1;dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=pP;dl1.v4=pLSm1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=pLSm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP;dl1.v4=pLSm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=pP;dl1.v5=pLSp1; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP; dl1.v4=pP;dl1.v5=pLSp1; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=dP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSm1;dl1.v4=dP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=dP;dl1.v4=cLSp1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++;dl1.v2=sP; dl1.v3=dP;dl1.v4=cLSm1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSm1;dl1.v4=cLSm1;dl1.v5=dP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP;dl1.v4=cLSm1;dl1.v5=dP;dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++;dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=dP;dl1.v5=cLSp1;dl1.cz6();f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=sLSm1; dl1.v3=sP;dl1.v4=dP;dl1.v5=cLSp1; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + + // take those in for stacking + + /* + short[] prel = is.plabels[i],phead=is.pheads[i]; + + int g = prnt==phead[d]?1:2 ; + if (sblng>=0) g += prnt==phead[sblng]?4:8; + + int gr = sblng==-1?s_relend:prel[sblng]; + + + dl2.v0= n++;dl2.v2=prel[d];dl2.v3=g;dl2.v4=sP;dl2.v5=dP;dl2.cz6();f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++;dl2.v2=prel[d];dl2.v3=g;dl2.v4=sP;dl2.v5=pP;dl2.cz6();f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++;dl2.v2=prel[d];dl2.v3=g;dl2.v4=sP;dl2.v5=pP;dl2.v6=dP;dl2.cz7();f[c++]=dl2.csa(s_dir,dir); + + dl2.v0= n++;dl2.v2=gr;dl2.v3=g;dl2.v4=sP;dl2.v5=dP;dl2.cz6();f[c++]=dl2.csa(s_dir,dir); + dl2.v0= 
n++;dl2.v2=gr;dl2.v3=g;dl2.v4=sP;dl2.v5=pP;dl2.cz6();f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++;dl2.v2=gr;dl2.v3=g;dl2.v4=sP;dl2.v5=pP;dl2.v6=dP;dl2.cz7();f[c++]=dl2.csa(s_dir,dir); +*/ + + if (feats==null) return; + + int cnt=c; + + short[] featsP =feats[d]; + short[] featsSbl =sblng!=-1?feats[sblng]:null; + + dlf.v0= n++; dlf.v1=label; dlf.v2=sP; dlf.v3=dP; + + + cnt = extractFeat(f, cnt ,dir, featsP, featsSbl); + + featsP =feats[prnt]; + featsSbl =sblng!=-1?feats[sblng]:null; + + dlf.v0= n++; dlf.v1=label; dlf.v2=pP; dlf.v3=sP; + if (featsP!=null && featsSbl!=null) { + for(short i1=0;i1<featsP.length;i1++) { + for(short i2=0;i2<featsSbl.length;i2++) { + dlf.v4=featsP[i1]; dlf.v5=featsSbl[i2]; + dlf.cz6(); f[cnt++]=dlf.csa(s_dir,prnt<sblng?1:2); + } + } + } else if (featsP==null && featsSbl!=null) { + + for(short i2=0;i2<featsSbl.length;i2++) { + dlf.v4=nofeat; dlf.v5=featsSbl[i2]; + dlf.cz6(); f[cnt++]=dlf.csa(s_dir,dir); + } + + } else if (featsP!=null && featsSbl==null) { + + for(short i1=0;i1<featsP.length;i1++) { + dlf.v4=featsP[i1]; dlf.v5=nofeat; + dlf.cz6(); f[cnt++]=dlf.csa(s_dir,dir); + } + } + + return; + } + + private int extractFeat(long[] f, int cnt, int dir, short[] featsP, short[] featsD) { + if (featsP!=null && featsD!=null) { + for(short i1=0;i1<featsP.length;i1++) { + for(short i2=0;i2<featsD.length;i2++) { + dlf.v4=featsP[i1]; dlf.v5=featsD[i2]; + dlf.cz6(); f[cnt++]=dlf.csa(s_dir,dir); + } + } + } else if (featsP==null && featsD!=null) { + + for(short i2=0;i2<featsD.length;i2++) { + dlf.v4=nofeat; dlf.v5=featsD[i2]; + dlf.cz6(); f[cnt++]=dlf.csa(s_dir,dir); + + } + } else if (featsP!=null && featsD==null) { + + for(short i1=0;i1<featsP.length;i1++) { + dlf.v4=featsP[i1]; dlf.v5=nofeat; + dlf.cz6(); f[cnt++]=dlf.csa(s_dir,dir); + + } + } + return cnt; + } + + public IFV encodeCat2(Instances is, int ic, short pposs[], int forms[], int[] lemmas, short[] heads, short[] types, short feats[][], + Cluster cluster, IFV f, Long2IntInterface li) { + + + 
long[] svs = new long[250]; + + for (int i = 1; i < heads.length; i++) { + + + int n =basic(pposs, forms, heads[i], i, cluster, f); + firstm(is, ic, heads[i], i, types[i], cluster,svs); + for(int k=0;k<svs.length;k++) f.add(li.l2i(svs[k])); + + int ch,cmi,cmo; + if (heads[i] < i) { + ch = rightmostRight(heads, heads[i], i); + cmi = leftmostLeft(heads, i, heads[i]); + cmo = rightmostRight(heads, i, heads.length); + + } else { + ch = leftmostLeft(heads, heads[i], i); + cmi = rightmostRight(heads, i, heads[i]); + cmo = leftmostLeft(heads, i, 0); + } + + siblingm(is,ic,pposs, forms,lemmas, feats, heads[i], i, ch,types[i], cluster, svs,n); + for(int k=0;k<svs.length;k++) f.add(li.l2i(svs[k])); + + + gcm(is, ic,heads[i],i,cmi, types[i], cluster, svs); + for(int k=0;k<svs.length;k++) f.add(li.l2i(svs[k])); + + gcm(is, ic, heads[i],i,cmo, types[i], cluster, svs); + for(int k=0;k<svs.length;k++)f.add(li.l2i(svs[k])); + } + + return f; + } + + public FV encodeCat(Instances is, int ic, short pposs[], int forms[], int[] lemmas, short[] heads, short[] types, short feats[][], Cluster cluster, FV f) { + + + long[] svs = new long[250]; + + for (int i = 1; i < heads.length; i++) { + + + int n =basic(pposs, forms, heads[i], i, cluster, f); + firstm(is, ic, heads[i], i, types[i], cluster,svs); + for(int k=0;k<svs.length;k++) dl1.map(f,svs[k]); + + int ch,cmi,cmo; + if (heads[i] < i) { + ch = rightmostRight(heads, heads[i], i); + cmi = leftmostLeft(heads, i, heads[i]); + cmo = rightmostRight(heads, i, heads.length); + + } else { + ch = leftmostLeft(heads, heads[i], i); + cmi = rightmostRight(heads, i, heads[i]); + cmo = leftmostLeft(heads, i, 0); + } + + siblingm(is,ic,pposs, forms,lemmas, feats, heads[i], i, ch,types[i], cluster, svs,n); + for(int k=0;k<svs.length;k++) dl1.map(f,svs[k]); + + + gcm(is, ic,heads[i],i,cmi, types[i], cluster, svs); + for(int k=0;k<svs.length;k++) dl1.map(f,svs[k]); + + gcm(is, ic, heads[i],i,cmo, types[i], cluster, svs); + for(int k=0;k<svs.length;k++) 
dl1.map(f,svs[k]); + } + + return f; + } + + + public float encode3(short[] pos, short heads[] , short[] types, DataF d2) { + + double v = 0; + for (int i = 1; i < heads.length; i++) { + + int dir= (heads[i] < i)? 0:1; + + v += d2.pl[heads[i]][i]; + v += d2.lab[heads[i]][i][types[i]][dir]; + + boolean left = i<heads[i]; + short[] labels = Edges.get(pos[heads[i]], pos[i], left); + int lid=-1; + for(int k=0;k<labels.length;k++) if (types[i]== labels[k]) {lid= k;break;} + + int ch,cmi,cmo; + if (heads[i] < i) { + ch = rightmostRight(heads, heads[i], i); + cmi = leftmostLeft(heads, i, heads[i]); + cmo = rightmostRight(heads, i, heads.length); + + if (ch==-1) ch=heads[i]; + if (cmi==-1) cmi=heads[i]; + if (cmo==-1) cmo=heads[i]; + + } else { + ch = leftmostLeft(heads, heads[i], i); + cmi = rightmostRight(heads, i, heads[i]); + cmo = leftmostLeft(heads, i, 0); + + if (ch==-1) ch=i; + if (cmi==-1) cmi=i; + if (cmo==-1) cmo=i; + } + v += d2.sib[heads[i]][i][ch][dir][lid]; + v += d2.gra[heads[i]][i][cmi][dir][lid]; + v += d2.gra[heads[i]][i][cmo][dir][lid]; + } + return (float)v; + } + + /** + * Provide the scores of the edges + * @param pos + * @param heads + * @param types + * @param edgesScores + * @param d2 + * @return + */ + public static float encode3(short[] pos, short heads[] , short[] types, float[] edgesScores, DataF d2) { + + double v = 0; + for (int i = 1; i < heads.length; i++) { + + int dir= (heads[i] < i)? 
0:1; + + edgesScores[i] = d2.pl[heads[i]][i]; + edgesScores[i] += d2.lab[heads[i]][i][types[i]][dir]; + + boolean left = i<heads[i]; + short[] labels = Edges.get(pos[heads[i]], pos[i], left); + int lid=-1; + for(int k=0;k<labels.length;k++) if (types[i]== labels[k]) {lid= k;break;} + + int ch,cmi,cmo; + if (heads[i] < i) { + ch = rightmostRight(heads, heads[i], i); + cmi = leftmostLeft(heads, i, heads[i]); + cmo = rightmostRight(heads, i, heads.length); + + if (ch==-1) ch=heads[i]; + if (cmi==-1) cmi=heads[i]; + if (cmo==-1) cmo=heads[i]; + + } else { + ch = leftmostLeft(heads, heads[i], i); + cmi = rightmostRight(heads, i, heads[i]); + cmo = leftmostLeft(heads, i, 0); + + if (ch==-1) ch=i; + if (cmi==-1) cmi=i; + if (cmo==-1) cmo=i; + } + edgesScores[i] += d2.sib[heads[i]][i][ch][dir][lid]; + edgesScores[i] += d2.gra[heads[i]][i][cmi][dir][lid]; + edgesScores[i] += d2.gra[heads[i]][i][cmo][dir][lid]; + v+=edgesScores[i]; + } + return (float)v; + } + + + private static int rightmostRight(short[] heads, int head, int max) { + int rightmost = -1; + for (int i = head + 1; i < max; i++) if (heads[i] == head) rightmost = i; + + return rightmost; + } + + private static int leftmostLeft(short[] heads, int head, int min) { + int leftmost = -1; + for (int i = head - 1; i > min; i--) if (heads[i] == head) leftmost = i; + return leftmost; + } + + public static final String REL = "REL",END = "END",STR = "STR",LA = "LA",RA = "RA"; + + private static int ra,la; + private static int s_str; + private static int s_end, _cend,_cstr, s_stwrd,s_relend; + + protected static final String TYPE = "TYPE",DIR = "D"; + public static final String POS = "POS"; + protected static final String DIST = "DIST",MID = "MID", FEAT="F"; + + private static final String _0 = "0",_4 = "4",_3 = "3", _2 = "2",_1 = "1",_5 = "5",_10 = "10"; + + private static int di0, d4,d3,d2,d1,d5,d10; + + + private static final String WORD = "WORD",STWRD = "STWRD", STPOS = "STPOS"; + + + + private static int nofeat; + + + 
public static int maxForm; + + + /** + * Initialize the features. + * @param maxFeatures + */ + static public void initFeatures() { + + + MFB mf = new MFB(); + mf.register(POS, MID); + s_str = mf.register(POS, STR); + s_end = mf.register(POS, END); + + s_relend = mf.register(REL, END); + + _cstr= mf.register(Cluster.SPATH,STR); + _cend=mf.register(Cluster.SPATH,END); + + + mf.register(TYPE, POS); + + s_stwrd=mf.register(WORD,STWRD); + mf.register(POS,STPOS); + + la = mf.register(DIR, LA); + ra = mf.register(DIR, RA); + + // mf.register(TYPE, CHAR); + + mf.register(TYPE, FEAT); + nofeat=mf.register(FEAT, "NOFEAT"); + + for(int k=0;k<215;k++) mf.register(TYPE, "F"+k); + + + di0=mf.register(DIST, _0); + d1=mf.register(DIST, _1); + d2=mf.register(DIST, _2); + d3=mf.register(DIST, _3); + d4=mf.register(DIST, _4); + d5=mf.register(DIST, _5); + // d5l=mf.register(DIST, _5l); + d10=mf.register(DIST, _10); + + + } + + /* (non-Javadoc) + * @see extractors.Extractor#getType() + */ + @Override + public int getType() { + + return s_type; + } + + /* (non-Javadoc) + * @see extractors.Extractor#setMaxForm(int) + */ + @Override + public void setMaxForm(int max) { + maxForm = max; + } + + /* (non-Javadoc) + * @see extractors.Extractor#getMaxForm() + */ + @Override + public int getMaxForm() { + return maxForm; + } + + + +} diff --git a/dependencyParser/basic/mate-tools/src/extractors/ExtractorClusterStackedR2.java b/dependencyParser/basic/mate-tools/src/extractors/ExtractorClusterStackedR2.java new file mode 100644 index 0000000..de82f42 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/extractors/ExtractorClusterStackedR2.java @@ -0,0 +1,937 @@ +package extractors; + + +import java.util.Arrays; + +import is2.data.Cluster; +import is2.data.D4; +import is2.data.DataF; +import is2.data.Edges; +import is2.data.FV; +import is2.data.IFV; +import is2.data.Instances; +import is2.data.Long2IntInterface; +import is2.data.MFB; +import is2.util.DB; + + + +final public class 
ExtractorClusterStackedR2 implements Extractor { + + public static int s_rel,s_word,s_type,s_dir,s_dist,s_feat,s_child,s_spath,s_lpath,s_pos; + + MFB mf; + + final D4 d0 ,dl1,dl2, dwr,dr,dwwp,dw,dwp,dlf,d3lp, d2lp,d2pw,d2pp ; + + public final Long2IntInterface li; + + public ExtractorClusterStackedR2(Long2IntInterface li) { + + initFeatures(); + this.li=li; + d0 = new D4(li);dl1 = new D4(li);dl2 = new D4(li); + dwr = new D4(li); + dr = new D4(li); + dwwp = new D4(li); + + dw = new D4(li); + dwp = new D4(li); + + dlf = new D4(li); + d3lp = new D4(li); d2lp = new D4(li); d2pw = new D4(li); d2pp = new D4(li); + + } + + public void initStat() { + + + mf = new MFB(); + s_rel = mf.getFeatureCounter().get(REL).intValue(); + s_pos = mf.getFeatureCounter().get(POS).intValue(); + s_word = mf.getFeatureCounter().get(WORD).intValue(); + s_type = mf.getFeatureCounter().get(TYPE).intValue();//mf.getFeatureBits(); + s_dir = mf.getFeatureCounter().get(DIR); + la = mf.getValue(DIR, LA); + ra = mf.getValue(DIR, RA); + s_dist = mf.getFeatureCounter().get(DIST);//mf.getFeatureBits(DIST); + s_feat = mf.getFeatureCounter().get(FEAT);//mf.getFeatureBits(Pipe.FEAT); + s_spath = mf.getFeatureCounter().get(Cluster.SPATH)==null?0:mf.getFeatureCounter().get(Cluster.SPATH);//mf.getFeatureBits(Cluster.SPATH); + s_lpath = mf.getFeatureCounter().get(Cluster.LPATH)==null?0:mf.getFeatureCounter().get(Cluster.LPATH);//mf.getFeatureBits(Cluster.LPATH); + } + + public void init(){ + // DB.println("init"); + d0.a0 = s_type;d0.a1 = s_pos;d0.a2 = s_pos;d0.a3 = s_pos;d0.a4 = s_pos;d0.a5 = s_pos;d0.a6 = s_pos;d0.a7 = s_pos; + dl1.a0 = s_type;dl1.a1 = s_rel; dl1.a2 = s_pos;dl1.a3 = s_pos; dl1.a4 = s_pos; dl1.a5 = s_pos; dl1.a6 = s_pos; dl1.a7 = s_pos; + dl2.a0 = s_type;dl2.a1 = s_rel;dl2.a2 = s_word;dl2.a3 = s_pos;dl2.a4 = s_pos;dl2.a5 = s_pos;dl2.a6 = s_pos;dl2.a7 = s_pos; + dwp.a0 = s_type; dwp.a1 = s_rel; dwp.a2 = s_word; dwp.a3 = s_pos; dwp.a4 = s_pos; dwp.a5 = s_word; + dwwp.a0 = s_type; dwwp.a1 = 
s_rel; dwwp.a2 = s_word; dwwp.a3 = s_word; dwwp.a4 = s_pos; dwwp.a5 = s_word; + dlf.a0 = s_type;dlf.a1 = s_rel; dlf.a2 = s_pos;dlf.a3 = s_pos; dlf.a4 = s_feat; dlf.a5 = s_feat; dlf.a6 = s_pos; dlf.a7 = s_pos; + d3lp.a0 = s_type; d3lp.a1 = s_rel; d3lp.a2 = s_lpath; d3lp.a3 = s_lpath; d3lp.a4 = s_lpath; d3lp.a5 = s_word; d3lp.a6 = s_spath; d3lp.a7 = s_spath; + d2lp.a0 = s_type; d2lp.a1 = s_rel; d2lp.a2 = s_lpath; d2lp.a3 = s_lpath; d2lp.a4 = s_word; d2lp.a5 = s_word; //d3lp.a6 = s_spath; d3lp.a7 = s_spath; + d2pw.a0 = s_type; d2pw.a1 = s_rel; d2pw.a2 = s_lpath; d2pw.a3 = s_lpath; d2pw.a4 = s_word; d2pw.a5 = s_word; //d3lp.a6 = s_spath; d3lp.a7 = s_spath; + d2pp.a0 = s_type; d2pp.a1 = s_rel; d2pp.a2 = s_lpath; d2pp.a3 = s_lpath; d2pp.a4 = s_pos; d2pp.a5 = s_pos; //d3lp.a6 = s_spath; d3lp.a7 = s_spath; + } + + + public int basic(short[] pposs, int[] form, int p, int d, Cluster cluster, IFV f) + { + + d0.clean(); dl1.clean(); dl2.clean(); dwp.clean(); dwwp.clean(); dlf.clean(); d3lp.clean(); + + d3lp.clean(); d2lp.clean();d2pw.clean(); d2pp.clean(); + + int n=1; + int dir= (p < d)? ra:la; + // d0.v0= n; d0.v1=pposs[p]; d0.v2=pposs[d]; //d0.stop=4; + int end= (p >= d ? p : d); + int start = (p >= d ? 
d : p) + 1; + + StringBuilder s = new StringBuilder(end-start); + int[] x = new int[end-start]; + int c=0; + for(int i = start ; i <end ; i++) { + //d0.v3=pposs[i]; + //d0.cz4(); + //d0.csa(s_dir,dir,f); +// s.append((char)pposs[i]); + x[c++] =pposs[i]; + } + + Arrays.sort(x); + for(int i = 0;i<x.length ; i++) { + if (i==0 || x[i]!=x[i-1] ) s.append(x[i]); + } + int v = mf.register("px", s.toString()); + + dwp.v0 = n++; dwp.v1 = 1;dwp.v2 = v; dwp.v3 = pposs[p]; dwp.v4 = pposs[d]; dwp.cz5(); dwp.csa(s_dir,dir,f); + + return n; + } + + + public void firstm(Instances is, int i, + int prnt, int dpnt, int label, Cluster cluster, long[] f) + { + + + //short[] pposs, int[] form, int[] lemmas, short[][] feats + for(int k=0;k<f.length;k++) f[k]=0; + + short[] pposs = is.pposs[i]; + int[] form =is.forms[i]; + short[][] feats = is.feats[i]; + + + int pF = form[prnt],dF = form[dpnt]; + int pL = is.plemmas[i][prnt],dL = is.plemmas[i][dpnt]; + int pP = pposs[prnt],dP = pposs[dpnt]; + + int prntLS = pF==-1?-1:cluster.getLP(pF), chldLS = dF==-1?-1:cluster.getLP(dF); + + final int dir= (prnt < dpnt)? ra:la; + + if (pF>maxForm) pF=-1; + if (pL>maxForm) pL=-1; + + if (dF>maxForm) dF=-1; + if (dL>maxForm) dL=-1; + + + int n=3,c=0; + + dl2.v1=label; + dl2.v0= n++; dl2.v2=pF; dl2.v3=dP; dl2.cz4(); f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++; dl2.cz3(); f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++; dl2.v2=dF; dl2.v3=pP; dl2.cz4(); f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++; dl2.cz3(); f[c++]=dl2.csa(s_dir,dir); + + + dwwp.v1=label; + dwwp.v0= n++; dwwp.v2=pF; dwwp.v3=dF; dwwp.cz4(); f[c++]=dwwp.csa(s_dir,dir); + + dl1.v1=label; + dl1.v0= n++; dl1.v2=dP; dl1.cz3(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=pP; dl1.cz3(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v3=dP; dl1.cz4(); f[c++]=dl1.csa(s_dir,dir); + + int pPm1 = prnt > 0 ? pposs[prnt - 1] : s_str, dPm1 = dpnt > 0 ? pposs[dpnt - 1] : s_str; + int pPp1 = prnt < pposs.length - 1 ? 
pposs[prnt + 1]:s_end, dPp1 = dpnt < pposs.length - 1 ? pposs[dpnt + 1]:s_end; + + int pPm2 = prnt > 1 ? pposs[prnt - 2] : s_str, dPm2 = dpnt > 1 ? pposs[dpnt - 2] : s_str; + int pPp2 = prnt < pposs.length - 2 ? pposs[prnt + 2]:s_end, dPp2 = dpnt < pposs.length - 2 ? pposs[dpnt + 2]:s_end; + + int pFm1 = prnt > 0 ? form[prnt - 1] : s_stwrd, dFm1 = dpnt > 0 ? form[dpnt - 1] : s_stwrd; + int pFp1 = prnt < form.length - 1 ? form[prnt + 1]:s_stwrd, dFp1 = dpnt < form.length - 1 ? form[dpnt + 1]:s_stwrd; + + + + dl1.v0= n++;dl1.v2=pP; dl1.v3=pPp1; dl1.v4=dP;dl1.v5=dPp1; dl1.cz6(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v5=dPm1; dl1.cz6(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v3=pPm1; dl1.cz6(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v5=dPp1; dl1.cz6(); f[n++]=dl1.csa(s_dir,dir); + + + dl1.v0= n++; dl1.v3=pPm1; dl1.cz5(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v3=dPm1; dl1.cz5(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v3=dPp1; dl1.cz5(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v3=pPp1; dl1.cz5(); f[n++]=dl1.csa(s_dir,dir); + + dl1.v0= n++;dl1.v2=pP; dl1.v3=pPp2; dl1.v4=dP;dl1.v5=dPp2; dl1.cz6(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v5=dPm2; dl1.cz6(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v3=pPm2; dl1.cz6(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v5=dPp2; dl1.cz6(); f[n++]=dl1.csa(s_dir,dir); + + dl1.v0= n++; dl1.v3=pPm2; dl1.cz5(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v3=dPm2; dl1.cz5(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v3=dPp2; dl1.cz5(); f[n++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v3=pPp2; dl1.cz5(); f[n++]=dl1.csa(s_dir,dir); + + + + dl2.v0= n++; dl2.v3=dFm1; dl2.v3=pPp1;dl2.v4=pP; dl2.cz5(); f[n++]=dl2.getVal(); + dl2.v0= n++; dl2.v3=dFp1; dl2.v3=pPm1; dl2.cz5(); f[n++]=dl2.getVal(); + dl2.v0= n++; dl2.v3=pFm1; dl2.v3=dPp1;dl2.v4=dP; dl2.cz5(); f[n++]=dl2.getVal(); + dl2.v0= n++; dl2.v3=pFp1; dl2.v3=dPm1; dl2.cz5(); f[n++]=dl2.getVal(); + + + dl2.v0= n++; dl2.v3=dFm1; 
dl2.v3=dPm2;dl2.v4=pP; dl2.cz5(); f[n++]=dl2.getVal(); + dl2.v0= n++; dl2.v3=dFp1; dl2.v3=dPp2; dl2.cz5(); f[n++]=dl2.getVal(); + dl2.v0= n++; dl2.v3=pFm1; dl2.v3=pPm2;dl2.v4=dP; dl2.cz5(); f[n++]=dl2.getVal(); + dl2.v0= n++; dl2.v3=pFp1; dl2.v3=pPp2; dl2.cz5(); f[n++]=dl2.getVal(); + + + dwwp.v0= n++; dwwp.v2=pF; dwwp.v3=dF; dwwp.v4=dP; dwwp.cz5(); f[n++]=dwwp.csa(s_dir,dir); + dwwp.v0= n++; dwwp.v2=pF; dwwp.v3=dF; dwwp.v4=pP; dwwp.cz5(); f[n++]=dwwp.csa(s_dir,dir); + dwwp.v0= n++; dwwp.v2=dF; dwwp.v3=pF; dwwp.v4=pP; dwwp.v4=dP; dwwp.cz6(); f[n++]=dwwp.csa(s_dir,dir); + + + + // lemmas + + dl2.v1=label; + dl2.v0= n++; dl2.v2=pL; dl2.v3=dP; dl2.cz4(); f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++; dl2.cz3(); f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++; dl2.v2=dL; dl2.v3=pP; dl2.cz4(); f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++; dl2.cz3(); f[c++]=dl2.csa(s_dir,dir); + + + dwwp.v1=label; + dwwp.v0= n++; dwwp.v2=pL; dwwp.v3=dL; dwwp.cz4(); f[c++]=dwwp.csa(s_dir,dir); + + dwp.v1= label; + dwp.v0=n++;dwp.v2=dL; dwp.v3=pP;dwp.v4=dP;dwp.v5=pL; dwp.cz6(); f[c++]=dwp.csa(s_dir,dir); + dwp.v0=n++;dwp.cz5(); f[c++]=dwp.csa(s_dir,dir); + + dwp.v0=n++;dwp.v2=pL; dwp.cz5(); f[c++]=dwp.csa(s_dir,dir); + dwwp.v0= n++; dwwp.v2=pL; dwwp.v3=dL; dwwp.v4=dP; dwwp.cz5(); f[c++]=dwwp.csa(s_dir,dir); + dwwp.v0= n++; dwwp.v4=pP; dwwp.cz5(); f[c++]=dwwp.csa(s_dir,dir); + + + // cluster + + d2pw.v1=label; + d2pw.v0=n++; d2pw.v2=prntLS; d2pw.v3=chldLS; d2pw.cz4(); f[c++]=d2pw.csa(s_dir,dir); + d2pw.v0=n++; d2pw.v4=pF; d2pw.cz5(); f[c++]=d2pw.csa(s_dir,dir); + d2pw.v0=n++; d2pw.v4=dF; d2pw.cz5(); f[c++]=d2pw.csa(s_dir,dir); + d2pw.v0=n++; d2pw.v5=pF; d2pw.cz6(); f[c++]=d2pw.csa(s_dir,dir); + + + d2pp.v1=label; + d2pp.v0=n++; d2pp.v2=prntLS; d2pp.v3=chldLS; d2pp.cz4(); f[c++]=d2pp.csa(s_dir,dir); + d2pp.v0=n++; d2pp.v4=pP; d2pp.cz5(); f[c++]=d2pp.csa(s_dir,dir); + d2pp.v0=n++; d2pp.v4=dP; d2pp.cz5(); f[c++]=d2pp.csa(s_dir,dir); + d2pp.v0=n++; d2pp.v5=pP; d2pp.cz6(); f[c++]=d2pp.csa(s_dir,dir); + + + 
short[] prel = is.plabels[i]; + short[] phead = is.pheads[i]; + + + //take those in for stacking + // dl2.v1=label; + // dl2.v0= n++;dl2.v2=prel[dpnt];dl2.v3=pP;dl2.v4=dP; dl2.v5=prnt==phead[dpnt]?1:2; dl2.cz6(); f[c++]=dl2.csa(s_dir,dir); + // dl2.v0= n++;dl2.v2=pP;dl2.v3=dP; dl2.v4=prnt==phead[dpnt]?1:2; dl2.cz5(); f[c++]=dl2.csa(s_dir,dir); + + + + if (feats==null) return; + + short[] featsP =feats[prnt], featsD =feats[dpnt]; + dlf.v0= n++; dlf.v1=label; dlf.v2=pP; dlf.v3=dP; + extractFeat(f, c, dir, featsP, featsD); + + return; + } + + + + public void gcm(Instances is , int i, int p, int d, int gc, int label,Cluster cluster, long[] f) { + + for(int k=0;k<f.length;k++) f[k]=0; + + short[] pos= is.pposs[i]; + int[] forms=is.forms[i]; + int[] lemmas=is.plemmas[i]; + short[][] feats=is.feats[i]; + + int pP = pos[p], dP = pos[d]; + int prntF = forms[p], chldF = forms[d]; + int prntL = lemmas[p], chldL = lemmas[d]; + int prntLS = prntF==-1?-1:cluster.getLP(prntF), chldLS = chldF==-1?-1:cluster.getLP(chldF); + + int gP = gc != -1 ? pos[gc] : s_str; + int gcF = gc != -1 ? forms[gc] : s_stwrd; + int gcL = gc != -1 ? lemmas[gc] : s_stwrd; + int gcLS = (gc != -1) && (gcF!=-1) ? cluster.getLP(gcF) : s_stwrd; + + if (prntF>maxForm) prntF=-1; + if (prntL>maxForm) prntL=-1; + + if (chldF>maxForm) chldF=-1; + if (chldL>maxForm) chldL=-1; + + if (gcF>maxForm) gcF=-1; + if (gcL>maxForm) gcL=-1; + + + int dir= (p < d)? ra:la, dir_gra =(d < gc)? 
ra:la; + + int n=84,c=0; + + //dl1.v023(); + dl1.v1=label; + dl1.v0= n++; dl1.v2=pP; dl1.v3=dP;dl1.v4=gP; dl1.cz5(); dl1.cs(s_dir,dir);f[c++]=dl1.csa(s_dir,dir_gra); + dl1.v0= n++; dl1.v2=pP; dl1.v3=gP; dl1.cz4();dl1.cs(s_dir,dir);f[c++]=dl1.csa(s_dir,dir_gra); + dl1.v0= n++; dl1.v2=dP; dl1.cz4(); dl1.cs(s_dir,dir);f[c++]=dl1.csa(s_dir,dir_gra); + + dwwp.v1=label; + dwwp.v0= n++; dwwp.v2=prntF; dwwp.v3=gcF; + dwwp.cz4(); dwwp.cs(s_dir,dir);f[c++]=dwwp.csa(s_dir,dir_gra); + + dwwp.v0= n++; dwwp.v2=chldF; dwwp.v3=gcF; + dwwp.cz4(); dwwp.cs(s_dir,dir);f[c++]=dwwp.csa(s_dir,dir_gra); + + dwp.v1=label; + dwp.v0= n++; dwp.v2=gcF; dwp.v3=pP; + dwp.cz4(); dwp.cs(s_dir,dir);f[c++]=dwp.csa(s_dir,dir_gra); + + dwp.v0= n++; dwp.v2=gcF; dwp.v3=dP; + dwp.cz4(); dwp.cs(s_dir,dir);f[c++]=dwp.csa(s_dir,dir_gra); + + dwp.v0= n++; dwp.v2=prntF; dwp.v3=gP; + dwp.cz4(); dwp.cs(s_dir,dir);f[c++]=dwp.csa(s_dir,dir_gra); + + dwp.v0= n++; dwp.v2=chldF; dwp.v3=gP; + dwp.cz4(); dwp.cs(s_dir,dir); f[c++]=dwp.csa(s_dir,dir_gra); + + + // lemma + + dwwp.v0= n++; dwwp.v2=prntL; dwwp.v3=gcL; + dwwp.cz4();dwwp.cs(s_dir,dir);f[c++]=dwwp.csa(s_dir,dir_gra); + + dwwp.v0= n++; dwwp.v2=chldL; dwwp.v3=gcL; + dwwp.cz4(); dwwp.cs(s_dir,dir);f[c++]=dwwp.csa(s_dir,dir_gra); + + dwp.v0= n++; dwp.v2=gcL; dwp.v3=pP; + dwp.cz4(); dwp.cs(s_dir,dir);f[c++]=dwp.csa(s_dir,dir_gra); + + dwp.v0= n++; dwp.v2=gcL; dwp.v3=dP; + dwp.cz4(); dwp.cs(s_dir,dir);f[c++]=dwp.csa(s_dir,dir_gra); + + dwp.v0= n++; dwp.v2=prntL; dwp.v3=gP; + dwp.cz4(); dwp.cs(s_dir,dir);f[c++]=dwp.csa(s_dir,dir_gra); + + dwp.v0= n++; dwp.v2=chldL; dwp.v3=gP; + dwp.cz4(); dwp.cs(s_dir,dir); f[c++]=dwp.csa(s_dir,dir_gra); + + + // clusters + + d2lp.v1= label; + d2lp.v0= n++; d2lp.v2=prntLS; d2lp.v3=gcLS; d2lp.cz4(); d2lp.cs(s_dir,dir);f[c++]=d2lp.csa(s_dir,dir_gra);// f.add(li.l2i(l)); + d2lp.v0= n++; d2lp.v2=chldLS; d2lp.v3=gcLS; d2lp.cz4(); d2lp.cs(s_dir,dir);f[c++]=d2lp.csa(s_dir,dir_gra); + d3lp.v0= n++; d3lp.v1= label; d3lp.v2=prntLS; 
d3lp.v3=chldLS; d3lp.v4=gcLS; d3lp.cz5(); d3lp.cs(s_dir,dir);f[c++]=d3lp.csa(s_dir,dir_gra); + + //_f83; + d2lp.v0= n++; d2lp.v2=prntLS; d2lp.v3=chldLS; d2lp.v4=gcF; d2lp.cz5(); f[c++]=d2lp.csa(s_dir,dir); + d2lp.v0= n++; d2lp.v2=prntLS; d2lp.v3=gcLS; d2lp.v4=chldF; d2lp.cz5(); f[c++]=d2lp.csa(s_dir,dir); + d2lp.v0= n++; d2lp.v2=chldLS; d2lp.v3=gcLS; d2lp.v4=prntF; d2lp.cz5(); f[c++]=d2lp.csa(s_dir,dir); + + d2pp.v1= label; + d2pp.v0= n++; d2pp.v2=prntLS; d2pp.v3=chldLS; d2pp.v4=gP; d2pp.cz5(); f[c++]=d2pp.csa(s_dir,dir); + d2pp.v0= n++; d2pp.v2=prntLS; d2pp.v3=gcLS; d2pp.v4=dP; d2pp.cz5(); f[c++]=d2pp.csa(s_dir,dir); + d2pp.v0= n++; d2pp.v2=chldLS; d2pp.v3=gcLS; d2pp.v4=pP; d2pp.cz5(); f[c++]=d2pp.csa(s_dir,dir); + + + + // linear features + + int prntPm1 = p != 0 ? pos[p - 1] : s_str; // parent-pos-minus1 + int chldPm1 = d - 1 >=0 ? pos[d - 1] : s_str; // child-pos-minus1 + int prntPp1 = p != pos.length - 1 ? pos[p + 1] : s_end; + int chldPp1 = d != pos.length - 1 ? pos[d + 1] : s_end; + + int gcPm1 = gc > 0 ? pos[gc - 1] : s_str; + int gcPp1 = gc < pos.length - 1 ? 
pos[gc + 1] : s_end; + + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcPp1;dl1.v4=dP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcPm1;dl1.v4=dP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=dP;dl1.v4=chldPp1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=dP;dl1.v4=chldPm1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcPp1;dl1.v4=chldPm1;dl1.v5=dP;dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gcPm1; dl1.v3=gP;dl1.v4=chldPm1;dl1.v5=dP; dl1.cz6();f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcPp1;dl1.v4=dP;dl1.v5=chldPp1; dl1.cz6();f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gcPm1; dl1.v3=gP;dl1.v4=dP;dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcPp1;dl1.v4=pP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcPm1;dl1.v4=pP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=pP;dl1.v4=prntPp1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=pP;dl1.v4=prntPm1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcPp1;dl1.v4=prntPm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gcPm1; dl1.v3=gP;dl1.v4=prntPm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcPp1;dl1.v4=pP;dl1.v5=prntPp1; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gcPm1; dl1.v3=gP; dl1.v4=pP; dl1.v5=prntPp1;dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + + + int pLSp1 = p != pos.length - 1 ? forms[p + 1]==-1?-1:cluster.getLP(forms[p + 1]): _cend; + int cLSp1 = d != pos.length - 1 ? forms[d + 1] ==-1?-1:cluster.getLP(forms[d + 1]):_cend; + int gcLSp1 = gc < pos.length -1 ? forms[gc + 1] ==-1?-1:cluster.getLP(forms[gc + 1]) : s_end; + + int pLSm1 = p != 0 ? lemmas[p - 1]==-1?-1:cluster.getLP(lemmas[p - 1]): _cstr; + int cLSm1 = d - 1 >=0 ? 
lemmas[d - 1] ==-1?-1:cluster.getLP(lemmas[d - 1]):_cstr; + int gcLSm1 = gc > 0 ? lemmas[gc - 1] ==-1?-1:cluster.getLP(lemmas[gc - 1]) : _cstr; + + + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcLSp1;dl1.v4=dP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcLSm1;dl1.v4=dP; dl1.cz5();f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=dP;dl1.v4=cLSp1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=dP;dl1.v4=cLSm1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcLSp1;dl1.v4=cLSm1;dl1.v5=dP;dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gcLSm1; dl1.v3=gP;dl1.v4=cLSm1;dl1.v5=dP; dl1.cz6();f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcLSp1;dl1.v4=dP;dl1.v5=cLSp1; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=cLSm1; dl1.v3=gP;dl1.v4=dP;dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcLSp1;dl1.v4=pP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcLSm1;dl1.v4=pP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=pP;dl1.v4=pLSp1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=pP;dl1.v4=pLSm1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcLSp1;dl1.v4=pLSm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gcLSm1; dl1.v3=gP;dl1.v4=pLSm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gP; dl1.v3=gcLSp1;dl1.v4=pP;dl1.v5=pLSp1; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=gcLSm1; dl1.v3=gP; dl1.v4=pP; dl1.v5=pLSp1;dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + + + + short[] prel = is.plabels[i],phead=is.pheads[i]; + + int g = p==phead[d]?1:2 ; + if (gc>=0) g += d==phead[gc]?4:8; + + int gr = gc==-1?s_relend:prel[gc]; + + // take those in for stacking + /* + dl2.v1=label; + dl2.v0= n++;dl2.v2=prel[d];dl2.v3=g;dl2.v4=gP;dl2.v5=dP;dl2.cz6();f[c++]=dl2.csa(s_dir,dir); + dl2.v0= 
n++;dl2.v2=prel[d];dl2.v3=g;dl2.v4=gP;dl2.v5=pP;dl2.cz6();f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++;dl2.v2=prel[d];dl2.v3=g;dl2.v4=gP;dl2.v5=pP;dl2.v6=dP;dl2.cz7();f[c++]=dl2.csa(s_dir,dir); + + dl2.v0= n++;dl2.v2=gr;dl2.v3=g;dl2.v4=gP;dl2.v5=dP;dl2.cz6();f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++;dl2.v2=gr;dl2.v3=g;dl2.v4=gP;dl2.v5=pP;dl2.cz6();f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++;dl2.v2=gr;dl2.v3=g;dl2.v4=gP;dl2.v5=pP;dl2.v6=dP;dl2.cz7();f[c++]=dl2.csa(s_dir,dir); + +*/ + if (feats==null) return; + + short[] featsP =feats[d]; + short[] featsD =gc!=-1?feats[gc]:null; + + dlf.v0= n++; dlf.v1=label; dlf.v2=gP; dlf.v3=dP; + extractFeat(f, c, dir, featsP, featsD); + return; + } + + + public void siblingm(Instances is , int i,short pos[], int forms[], int[] lemmas, short[][] feats, int prnt, int d, int sblng, int label, Cluster cluster, long[] f, int v) + { + + for(int k=0;k<f.length;k++) f[k]=0; + + int pP = pos[prnt], dP = pos[d]; + int prntF = forms[prnt],chldF = forms[d]; + int prntL = lemmas[prnt], chldL = lemmas[d]; + int prntLS = prntF==-1?-1:cluster.getLP(prntF), chldLS = chldF==-1?-1:cluster.getLP(chldF); + + int sP = sblng!=-1 ? pos[sblng] : s_str, sblF = sblng!=-1 ? forms[sblng] : s_stwrd, sblL = sblng!=-1 ? lemmas[sblng] : s_stwrd; + + int sblLS = (sblng != -1)&&(sblF!=-1) ? cluster.getLP(sblF) : s_stwrd; + + + int dir= (prnt < d)? 
ra:la; + + int abs = Math.abs(prnt-d); + + final int dist; + if (abs > 10)dist=d10;else if (abs>5) dist=d5;else if( abs==5)dist=d4;else if (abs==4)dist=d3;else if (abs==3)dist=d2; + else if (abs==2)dist=d1; else dist=di0; + + int n=147; + + if (prntF>maxForm) prntF=-1; + if (prntL>maxForm) prntL=-1; + + if (chldF>maxForm) chldF=-1; + if (chldL>maxForm) chldL=-1; + + if (sblF>maxForm) sblF=-1; + if (sblL>maxForm) sblL=-1; + + + dl1.v0= n++; dl1.v1=label;dl1.v2=pP; dl1.v3=dP;dl1.v4=sP; dl1.cz5(); f[0]=dl1.csa(s_dir,dir);f[1]=dl1.csa(s_dist,dist); + dl1.v0= n++; dl1.v3=sP; dl1.cz4(); f[2]=dl1.csa(s_dir,dir); f[3]=dl1.csa(s_dist,dist); + dl1.v0= n++; dl1.v2=dP;dl1.cz4(); f[4]=dl1.csa(s_dir,dir); f[5]=dl1.csa(s_dist,dist); + + // sibling only could be tried + dwwp.v1=label; + dwwp.v0= n++; dwwp.v2=prntF; dwwp.v3=sblF; dwwp.cz4(); f[6]=dwwp.csa(s_dir,dir); f[7]=dwwp.csa(s_dist,dist); + dwwp.v0= n++; dwwp.v2=chldF; dwwp.cz4(); f[8]=dwwp.csa(s_dir,dir); f[9]=dwwp.csa(s_dist,dist); + dwp.v0= n++; dwp.v1=label; dwp.v2=sblF; dwp.v3=pP; dwp.cz4(); f[10]=dwp.csa(s_dir,dir); f[11]=dwp.csa(s_dist,dist); + dwp.v0= n++; /*dwp.v1=label; */dwp.v3=dP; dwp.cz4(); f[12]=dwp.csa(s_dir,dir); f[13]=dwp.csa(s_dist,dist); + dwp.v0= n++; /*dwp.v1=label;*/ dwp.v2=prntF; dwp.v3=sP; dwp.cz4(); f[14]=dwp.csa(s_dir,dir); f[15]=dwp.csa(s_dist,dist); + dwp.v0= n++; /*dwp.v1=label;*/ dwp.v2=chldF; dwp.cz4(); f[16]=dwp.csa(s_dir,dir); f[17]=dwp.csa(s_dist,dist); + + //lemmas + dwwp.v0= n++; dwwp.v2=prntL; dwwp.v3=sblL; dwwp.cz4(); f[18]=dwwp.csa(s_dir,dir); + dwwp.v0= n++; dwwp.v2=chldL; dwwp.cz4(); f[19]=dwwp.csa(s_dir,dir); f[20]=dwwp.csa(s_dist,dist); + dwp.v0= n++; /*dwp.v1=label;*/ dwp.v2=sblL; dwp.v3=pP; dwp.cz4(); f[21]=dwp.csa(s_dir,dir); f[22]=dwp.csa(s_dist,dist); + dwp.v0= n++; /*dwp.v1=label; */ dwp.v3=dP; dwp.cz4(); f[23]=dwp.csa(s_dir,dir);f[24]=dwp.csa(s_dist,dist); + dwp.v0= n++; /*dwp.v1=label;*/ dwp.v2=prntL; dwp.v3=sP; dwp.cz4(); f[25]=dwp.csa(s_dir,dir); f[26]=dwp.csa(s_dist,dist); 
+ dwp.v0= n++; /*dwp.v1=label;*/ dwp.v2=chldL; dwp.cz4(); f[27]=dwp.csa(s_dir,dir);f[28]=dwp.csa(s_dist,dist); + + + // clusters + + d2lp.v1=label; + d2lp.v0= n++; d2lp.v2=prntLS; d2lp.v3=sblLS; d2lp.cz4(); f[29]=d2lp.csa(s_dir,dir); + d2lp.v0= n++; d2lp.v2=chldLS; d2lp.v3=sblLS; d2lp.cz4(); f[30]=d2lp.csa(s_dir,dir); f[31]=d2lp.csa(s_dist,dist); + + d3lp.v1= label; + d3lp.v0= n++; d3lp.v2=prntLS; d3lp.v3=chldLS; d3lp.v4=sblLS;d3lp.cz5(); f[32]=d3lp.csa(s_dir,dir); + + d2lp.v0= n++; d2lp.v2=prntLS; d2lp.v3=chldLS; d2lp.v4=sblF; d2lp.cz5(); f[33]=d2lp.csa(s_dir,dir); f[34]=d2lp.csa(s_dist,dist); + d2lp.v0= n++; d2lp.v2=prntLS; d2lp.v3=sblLS; d2lp.v4=chldF; d2lp.cz5(); f[35]=d2lp.csa(s_dir,dir); f[36]=d2lp.csa(s_dist,dist); + d2lp.v0= n++; d2lp.v2=chldLS; d2lp.v3=sblLS; d2lp.v4=prntF; d2lp.cz5(); f[37]=d2lp.csa(s_dir,dir); f[38]=d2lp.csa(s_dist,dist); + + d2pp.v1=label; + d2pp.v0= n++; d2pp.v2=prntLS; d2pp.v3=chldLS; d2pp.v4=sP; d2pp.cz5(); f[39]=d2pp.csa(s_dir,dir); f[40]=d2pp.csa(s_dist,dist); + d2pp.v0= n++; d2pp.v2=prntLS; d2pp.v3=sblLS; d2pp.v4=dP; d2pp.cz5(); f[41]=d2pp.csa(s_dir,dir); f[42]=d2pp.csa(s_dist,dist); + d2pp.v0= n++; d2pp.v2=chldLS; d2pp.v3=sblLS; d2pp.v4=pP; d2pp.cz5(); f[43]=d2pp.csa(s_dir,dir); f[44]=d2pp.csa(s_dist,dist); + + + int prntPm1 = prnt!=0 ? pos[prnt-1] : s_str; + int chldPm1 = d-1>=0 ? pos[d-1] : s_str; + int prntPp1 = prnt!=pos.length-1 ? pos[prnt+1] : s_end; + int chldPp1 = d!=pos.length-1 ? pos[d+1] : s_end; + + // sibling part of speech minus and plus 1 + int sblPm1 = sblng>0 ? pos[sblng-1]:s_str; + int sblPp1 = sblng<pos.length-1 ? 
pos[sblng + 1]:s_end; + + dl1.v0=n++; dl1.v2=sP; dl1.v3=sblPp1;dl1.v4=pP; dl1.cz5(); f[45]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sblPm1;dl1.v4=pP; dl1.cz5(); f[46]=dl1.csa(s_dir,dir);// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sP; dl1.v3=pP;dl1.v4=prntPp1;dl1.cz5(); f[47]=dl1.csa(s_dir,dir);// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sP; dl1.v3=pP;dl1.v4=prntPm1; dl1.cz5(); f[48]=dl1.csa(s_dir,dir);// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sblPp1;dl1.v4=prntPm1;dl1.v5=pP; dl1.cz6(); f[49]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sblPm1; dl1.v3=sP;dl1.v4=prntPm1;dl1.v5=pP;dl1.cz6(); f[50]=dl1.csa(s_dir,dir);// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sblPp1;dl1.v4=pP;dl1.v5=prntPp1; dl1.cz6(); f[51]=dl1.csa(s_dir,dir);// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sblPm1; dl1.v3=sP; dl1.v4=pP;dl1.v5=prntPp1; dl1.cz6(); f[52]=dl1.csa(s_dir,dir);// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sblPp1;dl1.v4=dP; dl1.cz5(); f[53]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sblPm1;dl1.v4=dP; dl1.cz5(); f[54]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=dP;dl1.v4=chldPp1;dl1.cz5(); f[55]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=dP;dl1.v4=chldPm1; dl1.cz5(); f[56]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sblPp1;dl1.v4=chldPm1;dl1.v5=dP; dl1.cz6(); f[57]=dl1.csa(s_dir,dir);// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sblPm1; dl1.v3=sP;dl1.v4=chldPm1;dl1.v5=dP;dl1.cz6(); f[58]=dl1.csa(s_dir,dir); + dl1.v0= n++;dl1.v2=sP; dl1.v3=sblPp1;dl1.v4=dP;dl1.v5=chldPp1;dl1.cz6();f[59]=dl1.csa(s_dir,dir);// f.add(li.l2i(l)); + dl1.v0= n++; dl1.v2=sblPm1; dl1.v3=sP;dl1.v4=dP;dl1.v5=chldPp1;dl1.cz6(); f[60]=dl1.csa(s_dir,dir); + + int c=61; + + int pLSp1 = prnt != pos.length - 1 ? forms[prnt + 1]==-1?-1:cluster.getLP(forms[prnt + 1]): _cend; + int cLSp1 = d != pos.length - 1 ? forms[d + 1] ==-1?-1:cluster.getLP(forms[d + 1]):_cend; + int sLSp1 = sblng < pos.length -1 ? 
forms[sblng + 1] ==-1?-1:cluster.getLP(forms[sblng + 1]) : _cend; + + int pLSm1 = prnt!=0 ? forms[prnt - 1]==-1?-1:cluster.getLP(forms[prnt - 1]): _cstr; + int cLSm1 = d-1>=0 ? forms[d - 1] ==-1?-1:cluster.getLP(forms[d - 1]):_cstr; + int sLSm1 = sblng>0 ? forms[sblng - 1] ==-1?-1:cluster.getLP(forms[sblng - 1]):_cstr; + + //int c=61; + + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=pP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSm1;dl1.v4=pP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=pP;dl1.v4=pLSp1;dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=pP;dl1.v4=pLSm1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=pLSm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP;dl1.v4=pLSm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=pP;dl1.v5=pLSp1; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP; dl1.v4=pP;dl1.v5=pLSp1; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=dP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSm1;dl1.v4=dP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=dP;dl1.v4=cLSp1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++;dl1.v2=sP; dl1.v3=dP;dl1.v4=cLSm1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSm1;dl1.v4=cLSm1;dl1.v5=dP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP;dl1.v4=cLSm1;dl1.v5=dP;dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++;dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=dP;dl1.v5=cLSp1;dl1.cz6();f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP;dl1.v4=dP;dl1.v5=cLSp1; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + + + + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=pP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSm1;dl1.v4=pP; dl1.cz5(); 
f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=pP;dl1.v4=pLSp1;dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=pP;dl1.v4=pLSm1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=pLSm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP;dl1.v4=pLSm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=pP;dl1.v5=pLSp1; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP; dl1.v4=pP;dl1.v5=pLSp1; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=dP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSm1;dl1.v4=dP; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=dP;dl1.v4=cLSp1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++;dl1.v2=sP; dl1.v3=dP;dl1.v4=cLSm1; dl1.cz5(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSm1;dl1.v4=cLSm1;dl1.v5=dP; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP;dl1.v4=cLSm1;dl1.v5=dP;dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++;dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=dP;dl1.v5=cLSp1;dl1.cz6();f[c++]=dl1.csa(s_dir,dir); + dl1.v0= n++; dl1.v2=sLSm1; dl1.v3=sP;dl1.v4=dP;dl1.v5=cLSp1; dl1.cz6(); f[c++]=dl1.csa(s_dir,dir); + + // take those in for stacking + + /* + short[] prel = is.plabels[i],phead=is.pheads[i]; + + int g = prnt==phead[d]?1:2 ; + if (sblng>=0) g += prnt==phead[sblng]?4:8; + + int gr = sblng==-1?s_relend:prel[sblng]; + + + dl2.v0= n++;dl2.v2=prel[d];dl2.v3=g;dl2.v4=sP;dl2.v5=dP;dl2.cz6();f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++;dl2.v2=prel[d];dl2.v3=g;dl2.v4=sP;dl2.v5=pP;dl2.cz6();f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++;dl2.v2=prel[d];dl2.v3=g;dl2.v4=sP;dl2.v5=pP;dl2.v6=dP;dl2.cz7();f[c++]=dl2.csa(s_dir,dir); + + dl2.v0= n++;dl2.v2=gr;dl2.v3=g;dl2.v4=sP;dl2.v5=dP;dl2.cz6();f[c++]=dl2.csa(s_dir,dir); + dl2.v0= 
n++;dl2.v2=gr;dl2.v3=g;dl2.v4=sP;dl2.v5=pP;dl2.cz6();f[c++]=dl2.csa(s_dir,dir); + dl2.v0= n++;dl2.v2=gr;dl2.v3=g;dl2.v4=sP;dl2.v5=pP;dl2.v6=dP;dl2.cz7();f[c++]=dl2.csa(s_dir,dir); +*/ + + if (feats==null) return; + + int cnt=c; + + short[] featsP =feats[d]; + short[] featsSbl =sblng!=-1?feats[sblng]:null; + + dlf.v0= n++; dlf.v1=label; dlf.v2=sP; dlf.v3=dP; + + + cnt = extractFeat(f, cnt ,dir, featsP, featsSbl); + + featsP =feats[prnt]; + featsSbl =sblng!=-1?feats[sblng]:null; + + dlf.v0= n++; dlf.v1=label; dlf.v2=pP; dlf.v3=sP; + if (featsP!=null && featsSbl!=null) { + for(short i1=0;i1<featsP.length;i1++) { + for(short i2=0;i2<featsSbl.length;i2++) { + dlf.v4=featsP[i1]; dlf.v5=featsSbl[i2]; + dlf.cz6(); f[cnt++]=dlf.csa(s_dir,prnt<sblng?1:2); + } + } + } else if (featsP==null && featsSbl!=null) { + + for(short i2=0;i2<featsSbl.length;i2++) { + dlf.v4=nofeat; dlf.v5=featsSbl[i2]; + dlf.cz6(); f[cnt++]=dlf.csa(s_dir,dir); + } + + } else if (featsP!=null && featsSbl==null) { + + for(short i1=0;i1<featsP.length;i1++) { + dlf.v4=featsP[i1]; dlf.v5=nofeat; + dlf.cz6(); f[cnt++]=dlf.csa(s_dir,dir); + } + } + + return; + } + + private int extractFeat(long[] f, int cnt, int dir, short[] featsP, short[] featsD) { + if (featsP!=null && featsD!=null) { + for(short i1=0;i1<featsP.length;i1++) { + for(short i2=0;i2<featsD.length;i2++) { + dlf.v4=featsP[i1]; dlf.v5=featsD[i2]; + dlf.cz6(); f[cnt++]=dlf.csa(s_dir,dir); + } + } + } else if (featsP==null && featsD!=null) { + + for(short i2=0;i2<featsD.length;i2++) { + dlf.v4=nofeat; dlf.v5=featsD[i2]; + dlf.cz6(); f[cnt++]=dlf.csa(s_dir,dir); + + } + } else if (featsP!=null && featsD==null) { + + for(short i1=0;i1<featsP.length;i1++) { + dlf.v4=featsP[i1]; dlf.v5=nofeat; + dlf.cz6(); f[cnt++]=dlf.csa(s_dir,dir); + + } + } + return cnt; + } + + + public FV encodeCat(Instances is, int ic, short pposs[], int forms[], int[] lemmas, short[] heads, short[] types, short feats[][], Cluster cluster, FV f) { + + + long[] svs = new 
long[250]; + + for (int i = 1; i < heads.length; i++) { + + + int n =basic(pposs, forms, heads[i], i, cluster, f); + + firstm(is, ic, heads[i], i, types[i], cluster,svs); + for(int k=0;k<svs.length;k++) dl1.map(f,svs[k]); + + int ch,cmi,cmo; + if (heads[i] < i) { + ch = rightmostRight(heads, heads[i], i); + cmi = leftmostLeft(heads, i, heads[i]); + cmo = rightmostRight(heads, i, heads.length); + + } else { + ch = leftmostLeft(heads, heads[i], i); + cmi = rightmostRight(heads, i, heads[i]); + cmo = leftmostLeft(heads, i, 0); + } + + siblingm(is,ic,pposs, forms,lemmas, feats, heads[i], i, ch,types[i], cluster, svs,n); + for(int k=0;k<svs.length;k++) dl1.map(f,svs[k]); + + + gcm(is, ic,heads[i],i,cmi, types[i], cluster, svs); + for(int k=0;k<svs.length;k++) dl1.map(f,svs[k]); + + gcm(is, ic, heads[i],i,cmo, types[i], cluster, svs); + for(int k=0;k<svs.length;k++) dl1.map(f,svs[k]); + } + + return f; + } + + + public float encode3(short[] pos, short heads[] , short[] types, DataF d2) { + + double v = 0; + for (int i = 1; i < heads.length; i++) { + + int dir= (heads[i] < i)? 
0:1; + + v += d2.pl[heads[i]][i]; + v += d2.lab[heads[i]][i][types[i]][dir]; + + boolean left = i<heads[i]; + short[] labels = Edges.get(pos[heads[i]], pos[i], left); + int lid=-1; + for(int k=0;k<labels.length;k++) if (types[i]== labels[k]) {lid= k;break;} + + int ch,cmi,cmo; + if (heads[i] < i) { + ch = rightmostRight(heads, heads[i], i); + cmi = leftmostLeft(heads, i, heads[i]); + cmo = rightmostRight(heads, i, heads.length); + + if (ch==-1) ch=heads[i]; + if (cmi==-1) cmi=heads[i]; + if (cmo==-1) cmo=heads[i]; + + } else { + ch = leftmostLeft(heads, heads[i], i); + cmi = rightmostRight(heads, i, heads[i]); + cmo = leftmostLeft(heads, i, 0); + + if (ch==-1) ch=i; + if (cmi==-1) cmi=i; + if (cmo==-1) cmo=i; + } + v += d2.sib[heads[i]][i][ch][dir][lid]; + v += d2.gra[heads[i]][i][cmi][dir][lid]; + v += d2.gra[heads[i]][i][cmo][dir][lid]; + } + return (float)v; + } + + /** + * Provide the scores of the edges + * @param pos + * @param heads + * @param types + * @param edgesScores + * @param d2 + * @return + */ + public static float encode3(short[] pos, short heads[] , short[] types, float[] edgesScores, DataF d2) { + + double v = 0; + for (int i = 1; i < heads.length; i++) { + + int dir= (heads[i] < i)? 
0:1; + + edgesScores[i] = d2.pl[heads[i]][i]; + edgesScores[i] += d2.lab[heads[i]][i][types[i]][dir]; + + boolean left = i<heads[i]; + short[] labels = Edges.get(pos[heads[i]], pos[i], left); + int lid=-1; + for(int k=0;k<labels.length;k++) if (types[i]== labels[k]) {lid= k;break;} + + int ch,cmi,cmo; + if (heads[i] < i) { + ch = rightmostRight(heads, heads[i], i); + cmi = leftmostLeft(heads, i, heads[i]); + cmo = rightmostRight(heads, i, heads.length); + + if (ch==-1) ch=heads[i]; + if (cmi==-1) cmi=heads[i]; + if (cmo==-1) cmo=heads[i]; + + } else { + ch = leftmostLeft(heads, heads[i], i); + cmi = rightmostRight(heads, i, heads[i]); + cmo = leftmostLeft(heads, i, 0); + + if (ch==-1) ch=i; + if (cmi==-1) cmi=i; + if (cmo==-1) cmo=i; + } + edgesScores[i] += d2.sib[heads[i]][i][ch][dir][lid]; + edgesScores[i] += d2.gra[heads[i]][i][cmi][dir][lid]; + edgesScores[i] += d2.gra[heads[i]][i][cmo][dir][lid]; + v+=edgesScores[i]; + } + return (float)v; + } + + + private static int rightmostRight(short[] heads, int head, int max) { + int rightmost = -1; + for (int i = head + 1; i < max; i++) if (heads[i] == head) rightmost = i; + + return rightmost; + } + + private static int leftmostLeft(short[] heads, int head, int min) { + int leftmost = -1; + for (int i = head - 1; i > min; i--) if (heads[i] == head) leftmost = i; + return leftmost; + } + + public static final String REL = "REL",END = "END",STR = "STR",LA = "LA",RA = "RA"; + + private static int ra,la; + private static int s_str; + private static int s_end, _cend,_cstr, s_stwrd,s_relend; + + protected static final String TYPE = "TYPE",DIR = "D", FEAT="F"; + public static final String POS = "POS"; + protected static final String DIST = "DIST",MID = "MID"; + + private static final String _0 = "0",_4 = "4",_3 = "3", _2 = "2",_1 = "1",_5 = "5",_10 = "10"; + + private static int di0, d4,d3,d2,d1,d5,d10; + + + private static final String WORD = "WORD",STWRD = "STWRD", STPOS = "STPOS"; + + + + private static int nofeat; + + + 
private static int maxForm; + + + /** + * Initialize the features. + * @param maxFeatures + */ + static public void initFeatures() { + + + MFB mf = new MFB(); + mf.register(POS, MID); + s_str = mf.register(POS, STR); + s_end = mf.register(POS, END); + + s_relend = mf.register(REL, END); + + _cstr= mf.register(Cluster.SPATH,STR); + _cend=mf.register(Cluster.SPATH,END); + + + mf.register(TYPE, POS); + + s_stwrd=mf.register(WORD,STWRD); + mf.register(POS,STPOS); + + la = mf.register(DIR, LA); + ra = mf.register(DIR, RA); + + // mf.register(TYPE, CHAR); + + mf.register(TYPE, FEAT); + nofeat=mf.register(FEAT, "NOFEAT"); + + for(int k=0;k<215;k++) mf.register(TYPE, "F"+k); + + + di0=mf.register(DIST, _0); + d1=mf.register(DIST, _1); + d2=mf.register(DIST, _2); + d3=mf.register(DIST, _3); + d4=mf.register(DIST, _4); + d5=mf.register(DIST, _5); + // d5l=mf.register(DIST, _5l); + d10=mf.register(DIST, _10); + + + } + + /* (non-Javadoc) + * @see extractors.Extractor#getType() + */ + @Override + public int getType() { + return s_type; + } + + /* (non-Javadoc) + * @see extractors.Extractor#setMaxForm(java.lang.Integer) + */ + @Override + public void setMaxForm(int max) { + maxForm = max; + } + + /* (non-Javadoc) + * @see extractors.Extractor#getMaxForm() + */ + @Override + public int getMaxForm() { + return maxForm; + } + + + +} diff --git a/dependencyParser/basic/mate-tools/src/extractors/ExtractorFactory.java b/dependencyParser/basic/mate-tools/src/extractors/ExtractorFactory.java new file mode 100644 index 0000000..9543111 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/extractors/ExtractorFactory.java @@ -0,0 +1,44 @@ +/** + * + */ +package extractors; + +import is2.data.Long2IntInterface; + +/** + * @author Dr. 
Bernd Bohnet, 29.04.2011 + * + * + */ +public class ExtractorFactory { + + public static final int StackedClustered = 4; + public static final int StackedClusteredR2 = 5; + + + private int type=-1; + + /** + * @param stackedClusteredR22 + */ + public ExtractorFactory(int t) { + type=t; + } + + /** + * @param stackedClusteredR22 + * @param l2i + * @return + */ + public Extractor getExtractor(Long2IntInterface l2i) { + switch(type) + { + case StackedClustered: + return new ExtractorClusterStacked(l2i); + case StackedClusteredR2: + return new ExtractorClusterStackedR2(l2i); + } + return null; + } + +} diff --git a/dependencyParser/basic/mate-tools/src/extractors/ExtractorReranker.java b/dependencyParser/basic/mate-tools/src/extractors/ExtractorReranker.java new file mode 100644 index 0000000..2761f26 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/extractors/ExtractorReranker.java @@ -0,0 +1,621 @@ +package extractors; + + +import is2.data.Cluster; +import is2.data.D4; +import is2.data.Instances; +import is2.data.Long2IntInterface; +import is2.data.MFB; +import is2.data.ParseNBest; +import is2.util.DB; + +import java.util.Arrays; + + + +final public class ExtractorReranker { + + public static int s_rel,s_word,s_type,s_dir,s_dist,s_feat,s_child,s_spath,s_lpath,s_pos; + public static int d0,d1,d2,d3,d4,d5,d10; + + MFB mf; + + final D4 dl1,dl2, dwr,dr,dwwp,dw,dwp,dlf,d3lp, d2lp,d2pw,d2pp ; + + public final Long2IntInterface li; + + public ExtractorReranker(Long2IntInterface li) { + this.li=li; + dl1 = new D4(li);dl2 = new D4(li); + dwr = new D4(li); + dr = new D4(li); + dwwp = new D4(li); + + dw = new D4(li); + dwp = new D4(li); + + dlf = new D4(li); + d3lp = new D4(li); d2lp = new D4(li); d2pw = new D4(li); d2pp = new D4(li); + + } + + public static void initStat() { + DB.println("init called "); + MFB mf = new MFB(); + s_rel = mf.getFeatureCounter().get(REL).intValue();; + s_pos = mf.getFeatureCounter().get(POS).intValue(); + s_word = 
mf.getFeatureCounter().get(WORD).intValue(); + s_type = mf.getFeatureCounter().get(TYPE).intValue();//mf.getFeatureBits(); + s_dir = mf.getFeatureCounter().get(DIR); + la = mf.getValue(DIR, LA); + ra = mf.getValue(DIR, RA); + s_dist = mf.getFeatureCounter().get(DIST);//mf.getFeatureBits(DIST); + s_feat = mf.getFeatureCounter().get(FEAT);//mf.getFeatureBits(Pipe.FEAT); + s_spath = mf.getFeatureCounter().get(Cluster.SPATH)==null?0:mf.getFeatureCounter().get(Cluster.SPATH);//mf.getFeatureBits(Cluster.SPATH); + s_lpath = mf.getFeatureCounter().get(Cluster.LPATH)==null?0:mf.getFeatureCounter().get(Cluster.LPATH);//mf.getFeatureBits(Cluster.LPATH); + } + + public void init(){ + mf = new MFB(); + + dl1.a0 = s_type;dl1.a1 = 3; dl1.a2 = s_pos;dl1.a3 = s_pos; dl1.a4 = s_pos; dl1.a5 = s_pos; dl1.a6 = s_pos; dl1.a7 = s_pos; + dl2.a0 = s_type;dl2.a1 = 3;dl2.a2 = s_rel;dl2.a3 = s_rel;dl2.a4 = s_rel;dl2.a5 = s_rel;dl2.a6 = s_rel;dl2.a7 = s_rel;dl2.a8 = s_rel; dl2.a9 = s_rel; + dwp.a0 = s_type; dwp.a1 = 3; dwp.a2 = s_word; dwp.a3 = s_rel; dwp.a4 = s_rel; dwp.a5 = s_rel;dwp.a6 = s_rel;dwp.a7 = s_rel; + dwwp.a0 = s_type; dwwp.a1 = 3; dwwp.a2 = s_word; dwwp.a3 = s_word; dwwp.a4 = s_pos; dwwp.a5 = s_word;dwwp.a6 = s_pos;dwwp.a7 = s_pos; + } + + + + + + + public static final String REL = "REL",END = "END",STR = "STR",LA = "LA",RA = "RA", FEAT="F"; + + private static int ra,la; + private static int s_str; + private static int s_end, _cend,_cstr, s_stwrd,s_relend; + + protected static final String TYPE = "TYPE",DIR = "D"; + public static final String POS = "POS"; + protected static final String DIST = "DIST",MID = "MID"; + + private static final String _0 = "0",_4 = "4",_3 = "3", _2 = "2",_1 = "1",_5 = "5",_10 = "10"; + + + + private static final String WORD = "WORD",STWRD = "STWRD", STPOS = "STPOS"; + + + + private static int nofeat; + + + public static int maxForm; + + + final public static int _FC =60; + + + /** + * Initialize the features. 
+ * @param maxFeatures + */ + static public void initFeatures() { + + + MFB mf = new MFB(); + mf.register(POS, MID); + s_str = mf.register(POS, STR); + s_end = mf.register(POS, END); + + s_relend = mf.register(REL, END); + + _cstr= mf.register(Cluster.SPATH,STR); + _cend=mf.register(Cluster.SPATH,END); + + + mf.register(TYPE, POS); + + s_stwrd=mf.register(WORD,STWRD); + mf.register(POS,STPOS); + + la = mf.register(DIR, LA); + ra = mf.register(DIR, RA); + + // mf.register(TYPE, CHAR); + + mf.register(TYPE, FEAT); + nofeat=mf.register(FEAT, "NOFEAT"); + + for(int k=0;k<60;k++) mf.register(TYPE, "F"+k); + + + d0 =mf.register(DIST, _0); + d1= mf.register(DIST, _1); + d2 =mf.register(DIST, _2); + d3= mf.register(DIST, _3); + d4= mf.register(DIST, _4); + d5= mf.register(DIST, _5); + // d5l=mf.register(DIST, _5l); + d10= mf.register(DIST, _10); + + + } + + /** + * @param is + * @param n + * @param parseNBest + * @param vs + */ + public void extractFeatures3(Instances is, int i, ParseNBest parse, int rank, long[] v) { + + int f=1,n=0; + + for(short k= 0; k<is.length(i)-1;k++) { + + short[] chld = children(parse.heads,k); + + f=2; + + int fm = is.forms[i][k]; + int hh = k!=0? is.pposs[i][parse.heads[k]]:s_end; + int h = is.pposs[i][k]; + int hrel = parse.labels[k]; + int hhrel = k!=0? parse.labels[parse.heads[k]]:s_relend; + int hhf = k!=0? 
is.forms[i][parse.heads[k]]:s_stwrd; + + + + int rlast = chld.length>0?parse.labels[chld[chld.length-1]]:s_relend; + + int [] rels = new int[chld.length]; + int [] pss = new int[chld.length]; + for(int j=0;j<chld.length;j++) { + rels[j] = parse.labels[chld[j]]; + pss[j] = is.pposs[i][chld[j]]; + } + + StringBuilder rl = new StringBuilder(chld.length); + StringBuilder psl = new StringBuilder(chld.length); + for(int j=0;j<chld.length;j++) { + rl.append((char)rels[j]); + psl.append((char)pss[j]); + } + + int rli = mf.register("rli", rl.toString()); + int pli = mf.register("pli", psl.toString()); + + dwwp.v0=f++; dwwp.v2=rli; dwwp.cz3(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.cz3(); v[n++]=dwwp.getVal(); + + dwwp.v0=f++; dwwp.v2=rli; dwwp.v3=h;dwwp.cz4(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=h;dwwp.cz4(); v[n++]=dwwp.getVal(); + + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=hh; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=rli; dwwp.v3=hh; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=rli; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=fm; dwwp.v3=rli; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=fm; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + + + + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=rli; dwwp.v4=hh; dwwp.cz5(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=rli; dwwp.v4=hh; dwwp.v5=h; dwwp.cz6(); v[n++]=dwwp.getVal(); + + dwp.v0= f++; dwp.v2=rli; dwp.v3=hrel;dwp.v4=hh; dwp.v5=h; dwp.cz6(); v[n++]=dwp.getVal(); + + Arrays.sort(rels); + Arrays.sort(pss); + + rl = new StringBuilder(chld.length); + psl = new StringBuilder(chld.length); + for(int j=0;j<chld.length;j++) { + rl.append((char)rels[j]); + psl.append((char)pss[j]); + } + rli = mf.register("rli", rl.toString()); + pli = mf.register("pli", psl.toString()); + + + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=rli; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; 
dwwp.v2=fm; dwwp.v3=rli; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=fm; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + + dwwp.v0=f++; dwwp.v2=rli; dwwp.v3=h;dwwp.cz4(); v[n++]=dwwp.getVal(); + + dl1.v0= f++; dl1.v2=h; dl1.v3=hrel; dl1.v4=hhrel;dl1.v5=hh; dl1.v6=rlast; dl1.cz6(); v[n++]=dl1.getVal(); + dwp.v0= f++; dwp.v2=fm; dwp.v3=hrel; dwp.v4=hh; dwp.cz5(); v[n++]=dwp.getVal(); + dwp.v0= f++; dwp.v2=hhf; dwp.v3=hrel; dwp.v4=hh; dwp.v5=h; dwp.cz6(); v[n++]=dwp.getVal(); + + dwwp.v0=f++; dwwp.v2=fm; dwwp.v3=hhf; dwwp.v4=hrel; dwwp.v5=hhrel; dwwp.cz6(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=h; dwwp.v3=hhf; dwwp.v4=hrel; dwwp.v5=hhrel; dwwp.cz6(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=fm; dwwp.v3=hh; dwwp.v4=hrel; dwwp.v5=hhrel; dwwp.cz6(); v[n++]=dwwp.getVal(); + + dwwp.v0=f++; dwwp.v2=fm; dwwp.v3=hhf; dwwp.v4=h; dwwp.v5=hh; dwwp.cz6(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=h; dwwp.v3=hhf; dwwp.v4=hrel; dwwp.v5=hh; dwwp.cz6(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=fm; dwwp.v3=hh; dwwp.v4=h; dwwp.v5=hrel; dwwp.cz6(); v[n++]=dwwp.getVal(); + + + // dl1.v0= f++; dl1.v2=h;dl1.v3=hrel; dl1.v4=hhrel;dl1.v5=hh; dl1.v6=hhhrel;dl1.v7=hhh; dl1.v8=rlast; dl1.cz9(); v[n++]=dl1.getVal(); +// dl1.v0= f++; dl1.v2=h;dl1.v3=hrel; dl1.v4=hhrel;dl1.v5=hh; dl1.v6=hhhrel;dl1.v7=hhh; dl1.v8=rlast; dl1.cz9(); v[n++]=dl1.getVal(); + // dl1.v0= f++; dl1.v2=h;dl1.v3=hrel; dl1.v4=dir;dl1.v5=hh; dl1.v6=hhh;dl1.v7=rlast; dl1.v8=r1; dl1.cz9(); v[n++]=dl1.getVal(); + // dl1.v0= f++; dl1.v2=h;dl1.v3=hh; dl1.v4=hhh;dl1.v5=hrel; dl1.cz6(); v[n++]=dl1.getVal(); + + + short hp = parse.heads[k]; + short[] hchld = hp==-1?new short[0]:children(parse.heads,hp); + + int [] hrels = new int[hchld.length]; + int [] hpss = new int[hchld.length]; + for(int j=0;j<hchld.length;j++) { + hrels[j] = parse.labels[hchld[j]]; + hpss[j] = is.pposs[i][hchld[j]]; + } + + + StringBuilder hrl = new StringBuilder(hchld.length); + StringBuilder hpsl = new 
StringBuilder(hchld.length); + for(int j=0;j<hchld.length;j++) { + hrl.append((char)hrels[j]); + hpsl.append((char)hpss[j]); + } + int hrli = mf.register("rli", hrl.toString()); + int hpli = mf.register("pli", hpsl.toString()); + + dwwp.v0=f++; dwwp.v2=hpli; dwwp.v3=hrli; dwwp.cz4(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=fm; dwwp.v3=hrli; dwwp.cz4(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=hpli; dwwp.v3=fm; dwwp.cz4(); v[n++]=dwwp.getVal(); + + dwwp.v0=f++; dwwp.v2=hpli; dwwp.v3=rli; dwwp.v4=hrel;dwwp.v5=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=hrli;dwwp.v4=hrel;dwwp.v5=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=hpli; dwwp.v3=hpli;dwwp.v4=hrel;dwwp.v5=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + + + + } + + v[n]=Integer.MIN_VALUE; + } + + /** + * This works seem works well with n-best n=8 (88.858074) , n=10 (88.836884), n=12 (88.858) + * n=14 (88.913417) n=16 (88.79546) n=20 (88.80621) n 50 (88.729364) + * 1-best: 88.749605 + * + * @param is + * @param i + * @param parse + * @param rank + * @param v + * @param cluster + */ + public void extractFeatures(Instances is, int i, ParseNBest parse, int rank, long[] v, Cluster cluster) { + + // mf.getValue(REL, "SB"); + + int f=1,n=0; + + for(short k= 0; k<is.length(i)-1;k++) { + + short[] chld = children(parse.heads,k); + + int abs = Math.abs(parse.heads[k]-k); + final int dist; + if (abs > 10)dist=d10;else if (abs>5) dist=d5;else if( abs==5)dist=d4;else if (abs==4)dist=d3;else if (abs==3)dist=d2; + else if (abs==2)dist=d1; else dist=d0; + + + f=2; + + int fm = is.forms[i][k]; + int hh = k!=0? is.pposs[i][parse.heads[k]]:s_end; + int h = is.pposs[i][k]; + int hrel = parse.labels[k];//is.labels[i][k]; + int hhrel = k!=0? parse.labels[parse.heads[k]]:s_relend; + int hhf = k!=0? 
is.forms[i][parse.heads[k]]:s_stwrd; + + int r1 = chld.length>0?parse.labels[chld[0]]:s_relend; + int rlast = chld.length>0?parse.labels[chld[chld.length-1]]:s_relend; + + int [] rels = new int[chld.length]; + int [] pss = new int[chld.length]; + int [] cls = new int[chld.length]; + + int[] rc = new int[30]; // 20 was a good length + + for(int j=0;j<chld.length;j++) { + rels[j] = parse.labels[chld[j]]; + if (rels[j]<rc.length) rc[rels[j]]++; + pss[j] = is.pposs[i][chld[j]]; +// cls[j] = is.forms[i][chld[j]]==-1?0:cluster.getLP(is.forms[i][chld[j]]); +// cls[j] = cls[j]==-1?0:cls[j]; + } + + StringBuilder rl = new StringBuilder(chld.length); + StringBuilder psl = new StringBuilder(chld.length); + StringBuilder csl = new StringBuilder(chld.length); + for(int j=0;j<chld.length;j++) { + rl.append((char)rels[j]); + psl.append((char)pss[j]); +// csl.append((char)cls[j]); + } + + int rli = mf.register("rli", rl.toString()); + int pli = mf.register("pli", psl.toString()); +// int cli = mf.register("cli", csl.toString()); + + + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=rli; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=fm; dwwp.v3=rli; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=fm; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + // dwwp.v0=f++; dwwp.v2=cli; dwwp.v3=fm; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + + dwwp.v0=f++; dwwp.v2=rli; dwwp.cz3(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.cz3(); v[n++]=dwwp.getVal(); + //dwwp.v0=f++; dwwp.v2=cli; dwwp.cz3(); v[n++]=dwwp.getVal(); + + // dwwp.v0=f++; dwwp.v2=cli;dwwp.v3=h; dwwp.cz4(); v[n++]=dwwp.getVal(); + + for(int j=1;j<rc.length;j++) { + dwwp.v0=f++; dwwp.v2=rc[j]==0?1:rc[j]==1?2:3; dwwp.v3=j; dwwp.cz4(); v[n++]=dwwp.getVal();// + } + + dwwp.v0=f++; dwwp.v2=rli; dwwp.v3=h;dwwp.cz4(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=h;dwwp.cz4(); v[n++]=dwwp.getVal(); + + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=hh; dwwp.v4=h; dwwp.cz5(); 
v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=rli; dwwp.v3=hh; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=rli; dwwp.v4=hh; dwwp.cz5(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=rli; dwwp.v4=hh; dwwp.v5=h; dwwp.cz6(); v[n++]=dwwp.getVal(); + + dwp.v0= f++; dwp.v2=rli; dwp.v3=hrel;dwp.v4=hh; dwp.v5=h; dwp.cz6(); v[n++]=dwp.getVal(); + + //dwwp.v0=f++; dwwp.v2=h; dwwp.v3=hh; dwwp.v4=dist; dwwp.cz5(); v[n++]=dwwp.getVal(); + + Arrays.sort(rels); + Arrays.sort(pss); + + rl = new StringBuilder(chld.length); + psl = new StringBuilder(chld.length); + for(int j=0;j<chld.length;j++) { + rl.append((char)rels[j]); + psl.append((char)pss[j]); + } + rli = mf.register("rli", rl.toString()); + pli = mf.register("pli", psl.toString()); + + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=rli; dwwp.v4=1; dwwp.v5=h; dwwp.cz6(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=fm; dwwp.v3=rli; dwwp.v4=1; dwwp.v5=h; dwwp.cz6(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=fm; dwwp.v4=1; dwwp.v5=h; dwwp.cz6(); v[n++]=dwwp.getVal(); + + dwwp.v0=f++; dwwp.v2=rli; dwwp.v3=h;dwwp.cz4(); v[n++]=dwwp.getVal(); + + dl1.v0= f++; dl1.v2=h; dl1.v3=hrel; dl1.v4=hhrel;dl1.v5=hh; dl1.v6=rlast; dl1.cz6(); v[n++]=dl1.getVal(); + dwp.v0= f++; dwp.v2=fm; dwp.v3=hrel; dwp.v4=hh; dwp.cz5(); v[n++]=dwp.getVal(); + dwp.v0= f++; dwp.v2=hhf; dwp.v3=hrel;dwp.v4=hh; dwp.v5=h; dwp.cz6(); v[n++]=dwp.getVal(); + } + + v[n]=Integer.MIN_VALUE; + } + + /** + + * Works well! + * @param is + * @param i + * @param parse + * @param rank + * @param v + */ + public void extractFeatures6(Instances is, int i, ParseNBest parse, int rank, long[] v) { + + // mf.getValue(REL, "SB"); + + int f=1,n=0; + + for(short k= 0; k<is.length(i)-1;k++) { + + short[] chld = children(parse.heads,k); + + f=2; + + int fm = is.forms[i][k]; + int hh = k!=0? is.pposs[i][parse.heads[k]]:s_end; + int h = is.pposs[i][k]; + int hrel = parse.labels[k];//is.labels[i][k]; + int hhrel = k!=0? 
parse.labels[parse.heads[k]]:s_relend; + int hhf = k!=0? is.forms[i][parse.heads[k]]:s_stwrd; + + int r1 = chld.length>0?parse.labels[chld[0]]:s_relend; + int rlast = chld.length>0?parse.labels[chld[chld.length-1]]:s_relend; + + int [] rels = new int[chld.length]; + int [] pss = new int[chld.length]; + + int[] rc = new int[30]; // 20 was a good length + + for(int j=0;j<chld.length;j++) { + rels[j] = parse.labels[chld[j]]; + if (rels[j]<rc.length) rc[rels[j]]++; + // if (rels[j]==sb) numSB++; + pss[j] = is.pposs[i][chld[j]]; + } + + StringBuilder rl = new StringBuilder(chld.length); + StringBuilder psl = new StringBuilder(chld.length); + for(int j=0;j<chld.length;j++) { + rl.append((char)rels[j]); + psl.append((char)pss[j]); + } + + int rli = mf.register("rli", rl.toString()); + int pli = mf.register("pli", psl.toString()); + + + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=rli; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=fm; dwwp.v3=rli; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=fm; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + + dwwp.v0=f++; dwwp.v2=rli; dwwp.cz3(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.cz3(); v[n++]=dwwp.getVal(); + + for(int j=1;j<rc.length;j++) { + dwwp.v0=f++; dwwp.v2=rc[j]==0?1:rc[j]==1?2:3; dwwp.v3=j; dwwp.cz4(); v[n++]=dwwp.getVal();// + } + + dwwp.v0=f++; dwwp.v2=rli; dwwp.v3=h;dwwp.cz4(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=h;dwwp.cz4(); v[n++]=dwwp.getVal(); + + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=hh; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=rli; dwwp.v3=hh; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=rli; dwwp.v4=hh; dwwp.cz5(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=rli; dwwp.v4=hh; dwwp.v5=h; dwwp.cz6(); v[n++]=dwwp.getVal(); + + dwp.v0= f++; dwp.v2=rli; dwp.v3=hrel;dwp.v4=hh; dwp.v5=h; dwp.cz6(); v[n++]=dwp.getVal(); + + + Arrays.sort(rels); + Arrays.sort(pss); 
+ + rl = new StringBuilder(chld.length); + psl = new StringBuilder(chld.length); + for(int j=0;j<chld.length;j++) { + rl.append((char)rels[j]); + psl.append((char)pss[j]); + } + rli = mf.register("rli", rl.toString()); + pli = mf.register("pli", psl.toString()); + + + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=rli; dwwp.v4=1; dwwp.v5=h; dwwp.cz6(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=fm; dwwp.v3=rli; dwwp.v4=1; dwwp.v5=h; dwwp.cz6(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=fm; dwwp.v4=1; dwwp.v5=h; dwwp.cz6(); v[n++]=dwwp.getVal(); + + dwwp.v0=f++; dwwp.v2=rli; dwwp.v3=h;dwwp.cz4(); v[n++]=dwwp.getVal(); + + dl1.v0= f++; dl1.v2=h; dl1.v3=hrel; dl1.v4=hhrel;dl1.v5=hh; dl1.v6=rlast; dl1.cz6(); v[n++]=dl1.getVal(); + dwp.v0= f++; dwp.v2=fm; dwp.v3=hrel; dwp.v4=hh; dwp.cz5(); v[n++]=dwp.getVal(); + dwp.v0= f++; dwp.v2=hhf; dwp.v3=hrel;dwp.v4=hh; dwp.v5=h; dwp.cz6(); v[n++]=dwp.getVal(); + + } + + v[n]=Integer.MIN_VALUE; + } + + + + public void extractFeatures2(Instances is, int i, ParseNBest parse, int rank, long[] v) { + + + + int f=1,n=0; + + for(short k= 0; k<is.length(i)-1;k++) { + + short[] chld = children(parse.heads,k); + + f=2; + + int fm = is.forms[i][k]; + int hh = k!=0? is.pposs[i][parse.heads[k]]:s_end; + int h = is.pposs[i][k]; + int hrel = parse.labels[k];//is.labels[i][k]; + int hhrel = k!=0? parse.labels[parse.heads[k]]:s_relend; + int hhf = k!=0? 
is.forms[i][parse.heads[k]]:s_stwrd; + + int r1 = chld.length>0?parse.labels[chld[0]]:s_relend; + int rlast = chld.length>0?parse.labels[chld[chld.length-1]]:s_relend; + + int [] rels = new int[chld.length]; + int [] pss = new int[chld.length]; + + + + for(int j=0;j<chld.length;j++) { + rels[j] = parse.labels[chld[j]]; + pss[j] = is.pposs[i][chld[j]]; + } + + StringBuilder rl = new StringBuilder(chld.length); + StringBuilder psl = new StringBuilder(chld.length); + for(int j=0;j<chld.length;j++) { + rl.append((char)rels[j]); + psl.append((char)pss[j]); + } + + int rli = mf.register("rli", rl.toString()); + int pli = mf.register("pli", psl.toString()); + + + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=rli; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=fm; dwwp.v3=rli; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=fm; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + + dwwp.v0=f++; dwwp.v2=rli; dwwp.cz3(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.cz3(); v[n++]=dwwp.getVal(); + + dwwp.v0=f++; dwwp.v2=rli; dwwp.v3=h;dwwp.cz4(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=h;dwwp.cz4(); v[n++]=dwwp.getVal(); + + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=hh; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=rli; dwwp.v3=hh; dwwp.v4=h; dwwp.cz5(); v[n++]=dwwp.getVal(); + + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=rli; dwwp.v4=hh; dwwp.cz5(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=rli; dwwp.v4=hh; dwwp.v5=h; dwwp.cz6(); v[n++]=dwwp.getVal(); + + dwp.v0= f++; dwp.v2=rli; dwp.v3=hrel;dwp.v4=hh; dwp.v5=h; dwp.cz6(); v[n++]=dwp.getVal(); + + + Arrays.sort(rels); + Arrays.sort(pss); + + rl = new StringBuilder(chld.length); + psl = new StringBuilder(chld.length); + for(int j=0;j<chld.length;j++) { + rl.append((char)rels[j]); + psl.append((char)pss[j]); + } + rli = mf.register("rli", rl.toString()); + pli = mf.register("pli", psl.toString()); + + + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=rli; 
dwwp.v4=1; dwwp.v5=h; dwwp.cz6(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=fm; dwwp.v3=rli; dwwp.v4=1; dwwp.v5=h; dwwp.cz6(); v[n++]=dwwp.getVal(); + dwwp.v0=f++; dwwp.v2=pli; dwwp.v3=fm; dwwp.v4=1; dwwp.v5=h; dwwp.cz6(); v[n++]=dwwp.getVal(); + + dwwp.v0=f++; dwwp.v2=rli; dwwp.v3=h;dwwp.cz4(); v[n++]=dwwp.getVal(); + + dl1.v0= f++; dl1.v2=h; dl1.v3=hrel; dl1.v4=hhrel;dl1.v5=hh; dl1.v6=rlast; dl1.cz6(); v[n++]=dl1.getVal(); + dwp.v0= f++; dwp.v2=fm; dwp.v3=hrel; dwp.v4=hh; dwp.cz5(); v[n++]=dwp.getVal(); + dwp.v0= f++; dwp.v2=hhf; dwp.v3=hrel;dwp.v4=hh; dwp.v5=h; dwp.cz6(); v[n++]=dwp.getVal(); + + } + + v[n]=Integer.MIN_VALUE; + } + + + + /** + * @param parse + * @param k + * @return + */ + private short[] children(short[] heads, short h) { + + int c=0; + for(int k=0;k<heads.length;k++) if (heads[k] ==h ) c++; + + short[] clds = new short[c]; + c=0; + for(int k=0;k<heads.length;k++) if (heads[k] ==h ) clds[c++]=(short)k; + return clds; + } + + + +} diff --git a/dependencyParser/basic/mate-tools/src/extractors/ParallelExtract.java b/dependencyParser/basic/mate-tools/src/extractors/ParallelExtract.java new file mode 100755 index 0000000..a2ef72c --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/extractors/ParallelExtract.java @@ -0,0 +1,194 @@ +package extractors; + +import is2.data.Cluster; +import is2.data.DataF; +import is2.data.Edges; +import is2.data.F2SF; +import is2.data.FV; +import is2.data.Instances; +import is2.data.Long2IntInterface; + +import java.util.ArrayList; +import java.util.concurrent.Callable; + + +/** + * @author Bernd Bohnet, 30.08.2009 + * + * This class implements a parallel feature extractor. 
+ */ +final public class ParallelExtract implements Callable<Object> +{ + // the data space of the weights for a dependency tree + final DataF d; + + // the data extractor does the actual work + final Extractor extractor; + + private Instances is; + private int i; + + private F2SF para; + + private Cluster cluster; + + private Long2IntInterface li; + + public ParallelExtract(Extractor e, Instances is, int i, DataF d, F2SF para,Cluster cluster, Long2IntInterface li) { + + this.is =is; + extractor=e; + this.d =d; + this.i=i; + this.para=para; + this.cluster = cluster; + this.li=li; + } + + + public static class DSet { + int w1,w2; + } + + public Object call() { + + try { + + F2SF f= para; + + + short[] pos=is.pposs[i]; + int[] forms=is.forms[i]; + int[] lemmas=is.plemmas[i]; + short[][] feats=is.feats[i]; + int length = pos.length; + + long[] svs = new long[250]; + + int type=extractor.getType(); + + while (true) { + + DSet set = get(); + if (set ==null) break; + + int w1=set.w1; + int w2=set.w2; + + f.clear(); + int n =extractor.basic(pos, forms, w1, w2,cluster, f); + d.pl[w1][w2]=f.getScoreF(); + + short[] labels = Edges.get(pos[w1], pos[w2],false); + float[][] lab = d.lab[w1][w2]; + + extractor.firstm(is, i, w1, w2, 0, cluster, svs); + + if (labels!=null) { + + + for (int l = labels.length - 1; l >= 0; l--) { + + short label = labels[l]; + + f.clear(); + for(int k=svs.length-1;k>=0;k--) if (svs[k]>0) f.add(li.l2i(svs[k]+label*type)); + lab[label][0]=f.getScoreF(); + } + } + + labels = Edges.get(pos[w1], pos[w2],true); + + if (labels!=null) { + + for (int l = labels.length - 1; l >= 0; l--) { + + int label = labels[l]; + f.clear(); + for(int k=svs.length-1;k>=0;k--) if (svs[k]>0) f.add(li.l2i(svs[k]+label*type)); + lab[label][1]=f.getScoreF(); + } + } + + int s = w1<w2 ? w1 : w2; + int e = w1<w2 ? w2 : w1; + + int sg = w1<w2 ? w1 : 0; + int eg = w1<w2 ? 
length : w1+1; + + + for(int m=s;m<e;m++) { + for(int dir=0;dir<2;dir++) { + labels = Edges.get(pos[w1], pos[w2],dir==1); + float lab2[]= new float[labels.length]; + + int g = (m==s||e==m) ? -1 : m; + + + extractor.siblingm(is,i,pos,forms,lemmas,feats, w1, w2, g, 0, cluster, svs,n); + + for (int l = labels.length - 1; l >= 0; l--) { + + int label = labels[l]; + f.clear(); + + for(int k=svs.length-1;k>=0;k--) { + if (svs[k]>0) f.add(li.l2i(svs[k]+label*type)); + } + lab2[l] = (float)f.score;//f.getScoreF(); + } + d.sib[w1][w2][m][dir]=lab2; + } + } + + for(int m=sg;m<eg;m++) { + for(int dir=0;dir<2;dir++) { + labels = Edges.get(pos[w1], pos[w2],dir==1); + float[] lab2 = new float[labels.length]; + + int g = (m==s||e==m) ? -1 : m; + + extractor.gcm(is, i, w1,w2,g, 0, cluster, svs); + + for (int l = labels.length - 1; l >= 0; l--) { + + int label = labels[l]; + + f.clear(); + for(int k=svs.length-1;k>=0;k--) { + if (svs[k]>0) f.add(li.l2i(svs[k]+label*type)); + } + lab2[l] = f.getScoreF(); + } + d.gra[w1][w2][m][dir] =lab2; + } + } + + } + } catch(Exception e ) { + e.printStackTrace(); + } + return null; + } + + + static ArrayList<DSet> sets = new ArrayList<DSet>(); + + private DSet get() { + + synchronized (sets) { + if (sets.size()==0) return null; + return sets.remove(sets.size()-1); + } + } + static public void add(int w1, int w2){ + DSet ds =new DSet(); + ds.w1=w1; + ds.w2=w2; + sets.add(ds); + } + + + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/data/Closed.java b/dependencyParser/basic/mate-tools/src/is2/data/Closed.java new file mode 100755 index 0000000..378d0c6 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/Closed.java @@ -0,0 +1,31 @@ +package is2.data; + + + +final public class Closed { + + public double p; + short b,e,m; + byte dir; + + Closed d; + Open u; + + public Closed(short s, short t, int m, int dir,Open u, Closed d, float score) { + this.b = s; + this.e = t; + this.m = (short)m; + this.dir = (byte)dir; + 
this.u=u; + this.d =d; + p=score; + } + + + public void create(Parse parse) { + if (u != null) u.create(parse); + if (d != null) d.create(parse); + } +} + + diff --git a/dependencyParser/basic/mate-tools/src/is2/data/Cluster.java b/dependencyParser/basic/mate-tools/src/is2/data/Cluster.java new file mode 100644 index 0000000..485713d --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/Cluster.java @@ -0,0 +1,158 @@ +/** + * + */ +package is2.data; + + + +import is2.util.DB; + +import java.io.BufferedReader; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; + +/** + * @author Dr. Bernd Bohnet, 28.10.2010 + * + * + */ +final public class Cluster { + + public static final String LPATH = "LP"; + public static final String SPATH = "SP"; + + // [word][p] p = [0:long-path | 1:short-path] + final private short[][] word2path; + + public Cluster() { + word2path =new short[0][0]; + } + + /** + * @param clusterFile + * @param mf + * + */ + public Cluster(String clusterFile, IEncoderPlus mf, int ls) { + + final String REGEX = "\t"; + + // register words + try { + BufferedReader inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(clusterFile),"UTF-8"),32768); + + int cnt=0; + String line; + while ((line =inputReader.readLine())!=null) { + + cnt++; + try { + String[] split = line.split(REGEX); + mf.register(SPATH, split[0].length()<ls?split[0]:split[0].substring(0,ls)); + mf.register(LPATH, split[0]); + mf.register(PipeGen.WORD, split[1]); + } catch(Exception e) { + System.out.println("Error in cluster line "+cnt+" error: "+e.getMessage()); + } + } + System.out.println("read number of clusters "+cnt); + inputReader.close(); + + } catch (Exception e) { + e.printStackTrace(); + } + + word2path = new short[mf.getFeatureCounter().get(PipeGen.WORD)][2]; + + + // insert words + try { + String line; + BufferedReader inputReader = new 
BufferedReader(new InputStreamReader(new FileInputStream(clusterFile),"UTF-8"),32768); + + while ((line =inputReader.readLine())!=null) { + + String[] split = line.split(REGEX); + int wd = mf.getValue(PipeGen.WORD, split[1]); + word2path[wd][0] = (short)mf.getValue(SPATH, split[0].length()<ls?split[0]:split[0].substring(0,ls)); + word2path[wd][1] = (short)mf.getValue(LPATH, split[0]); + } + inputReader.close(); + int fill=0; + for(int l = 0; l<word2path.length; l++ ){ + if (word2path[l][0]!=0) fill++; + } + /* + for(int l = 0; l<word2path.length; l++ ){ + if (word2path[l][1]!=0) fillL++; + if (word2path[l][1]<-1) System.out.println("lower "+word2path[l][1]); + } + */ + System.out.println("filled "+fill+" of "+word2path.length); + + } catch (Exception e) { + e.printStackTrace(); + } + } + + /** + * Read the cluster + * @param dos + * @throws IOException + */ + public Cluster(DataInputStream dis) throws IOException { + + word2path = new short[dis.readInt()][2]; + for(int i =0;i<word2path.length;i++) { + word2path[i][0]=dis.readShort(); + word2path[i][1]=dis.readShort(); + } + DB.println("Read cluster with "+word2path.length+" words "); + } + + /** + * Write the cluster + * @param dos + * @throws IOException + */ + public void write(DataOutputStream dos) throws IOException { + + dos.writeInt(word2path.length); + for(short[] i : word2path) { + dos.writeShort(i[0]); + dos.writeShort(i[1]); + } + + } + + /** + * @param form the id of a word form + * @return the short path to the word form in the cluster + + final public int getSP(int form) { + if (word2path.length<form) return -1; + return word2path[form][0]; + } + */ + /** + * get the long path to a word form in the cluster + * @param form the id of a word form + * @return the long path to the word + */ + final public int getLP(int form) { + if (word2path.length<=form || word2path[form].length<=0) return -1; + return word2path[form][0]==0?-1:word2path[form][0]; + } + + final public int getLP(int form, int l) { + if 
(word2path.length<form) return -1; + return word2path[form][l]==0?-1:word2path[form][l]; + } + + final public int size() { + return word2path.length; + } +} diff --git a/dependencyParser/basic/mate-tools/src/is2/data/D4.java b/dependencyParser/basic/mate-tools/src/is2/data/D4.java new file mode 100644 index 0000000..8be3df2 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/D4.java @@ -0,0 +1,191 @@ +/** + * + */ +package is2.data; + +import is2.util.DB; + +/** + * @author Dr. Bernd Bohnet, 30.10.2010 + * + * This class computes the mapping of features to the weight vector. + */ +final public class D4 extends DX { + private long shift; + private long h; + + + private final Long2IntInterface _li; + public D4(Long2IntInterface li) { + _li=li; + } + + + final public void clean() { + v0=0;v1=0;v2=0;v3=0;v4=0;v5=0;v6=0;v7=0;v8=0; + shift=0;h=0; + } + + final public void cz3(){ + if (v0<0||v1<0||v2<0) { h=-1;return;} + + h= v0+v1*(shift =a0)+(long)v2*(shift *=a1); + shift *=a2; + } + + final public long c3(){ + if (v0<0||v1<0||v2<0) { h=-1;return h;} + + h= v0+v1*(shift =a0)+(long)v2*(shift *=a1); + shift *=a2; + return h; + } + + final public void cz4(){ + if (v0<0||v1<0||v2<0||v3<0) {h=-1;return;} + + h =v0+v1*(shift =a0)+v2*(shift *=a1)+v3*(shift *=a2); + shift *=a3; + } + + final public long c4(){ + if (v0<0||v1<0||v2<0||v3<0) {h=-1;return h;} + + h =v0+v1*(shift =a0)+v2*(shift *=a1)+v3*(shift *=a2); + shift *=a3; + return h; + } + + + final public void cz5(){ + + if (v0<0||v1<0||v2<0||v3<0||v4<0) {h=-1;return;} + + h =v0+v1*(shift =a0)+v2*(shift *=a1)+v3*(shift*=a2)+v4*(shift*=a3); + shift*=a4; + + } + + final public long c5(){ + + if (v0<0||v1<0||v2<0||v3<0||v4<0) {h=-1;return h;} + + h =v0+v1*(shift =a0)+v2*(shift *=a1)+v3*(shift *=a2)+v4*(shift*=a3); + shift*=a4; + return h; + } + + + final public void cz6(){ + + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0) {h=-1; return;} + + h =v0+v1*(shift =a0)+v2*(shift *=a1)+v3*(shift *=a2); + h 
+=v4*(shift*=a3)+v5*(shift*=a4); + shift*=a5; + } + + final public long c6(){ + + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0) {h=-1; return h;} + + h =v0+v1*(shift =a0)+v2*(shift *=a1)+v3*(shift *=a2); + h +=v4*(shift*=a3)+v5*(shift*=a4); + shift*=a5; + return h; + } + + + final public long cs(int b, int v) { + if (h<0) {h=-1; return h;} + + h += v*shift; + shift *=b; + return h; + + } + + final public void csa(int b, int v, IFV f) { + if (h<0) {h=-1; return;} + + h += v*shift; + shift *=b; + f.add(_li.l2i(h)); + } + + final public long csa(int b, int v) { + if (h<0) {h=-1; return-1; } + + h += v*shift; + shift *=b; + return h; + } + + public final long getVal(){ + return h; + } + + public final void map(IFV f, long l){ + if (l>0) f.add(this._li.l2i(l)); + } + + /** + * @param f + */ + final public void add(IFV f) { + f.add(_li.l2i(h)); + } + + final public void cz7() { + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0||v6<0) {h=-1; return;} + + h =v0+v1*(shift =a0)+v2*(shift *=a1)+v3*(shift *=a2); + h +=v4*(shift*=a3)+v5*(shift*=a4)+v6*(shift*=a5); + shift*=a6; + + } + + final public long c7() { + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0||v6<0) {h=-1; return h;} + + h =v0+v1*(shift =a0)+v2*(shift *=a1)+v3*(shift *=a2); + h +=v4*(shift*=a3)+v5*(shift*=a4)+v6*(shift*=a5); + shift*=a6; + return h; + } + + /** + * + */ + final public void cz8() { + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0||v6<0||v7<0) {h=-1; return;} + + h =v0+v1*(shift =a0)+v2*(shift *=a1)+v3*(shift *=a2); + h +=v4*(shift*=a3)+v5*(shift*=a4)+v6*(shift*=a5)+v7*(shift*=a6); + shift*=a7; + } + + final public void cz9() { + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0||v6<0||v7<0||v8<0) {h=-1; return;} + + h =v0+v1*(shift =a0)+v2*(shift *=a1)+v3*(shift *=a2); + h +=v4*(shift*=a3)+v5*(shift*=a4)+v6*(shift*=a5)+v7*(shift*=a6)+v8*(shift*=a7); + shift*=a8; + } + + + /* (non-Javadoc) + * @see is2.data.DX#computeLabeValue(short, short) + */ + @Override + public int computeLabeValue(int label, int shift) { + return label*shift; + } + 
+ + public void fix() { + + } + + +} \ No newline at end of file diff --git a/dependencyParser/basic/mate-tools/src/is2/data/D6.java b/dependencyParser/basic/mate-tools/src/is2/data/D6.java new file mode 100644 index 0000000..3694249 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/D6.java @@ -0,0 +1,197 @@ +/** + * + */ +package is2.data; + +import is2.util.DB; + +/** + * @author Dr. Bernd Bohnet, 30.10.2010 + * + * This class computes the mapping of features to the weight vector. + */ +final public class D6 extends DX { + private long shift; + private long h; + + + private final Long2IntInterface _li; + public D6(Long2IntInterface li) { + _li=li; + } + + boolean fixed =false; + + public void fix() { + + if (fixed) { + DB.println("warning: already fixed"); + // return; + } + + long t0= 1, t1=a0, t2=t1*a1, t3=t2*a2,t4=t3*a3, t5=t4*a4,t6=t5*a5, t7=t6*a6, t8=t7*a7, t9=t8*a8; + + + + + a0=t0;a1=t1;a2=t2;a3=t3;a4=t4;a5=t5;a6=t6;a7=t7;a8=t8; a9=t9; + + fixed=true; + } + + + + final public void clean() { + v0=0;v1=0;v2=0;v3=0;v4=0;v5=0;v6=0;v7=0;v8=0; + shift=0;h=0; + } + + final public void cz3(){ + if (v0<0||v1<0||v2<0) { h=-1;return;} + + h= v0+v1*a1+v2*a2; + shift =a3; + } + + final public long c3(){ + if (v0<0||v1<0||v2<0) { h=-1;return h;} + + h= v0+v1*a1+v2*a2; + shift =a3; + return h; + } + + final public void cz4(){ + if (v0<0||v1<0||v2<0||v3<0) {h=-1;return;} + + h =v0+v1*a1+v2*a2+v3*a3; + shift =a4; + } + + final public long c4(){ + if (v0<0||v1<0||v2<0||v3<0) {h=-1;return h;} + + h =v0+v1*a1+v2*a2+v3*a3; + shift =a4; + return h; + } + + + final public void cz5(){ + + if (v0<0||v1<0||v2<0||v3<0||v4<0) {h=-1;return;} + + h =v0+v1*a1+v2*a2+v3*a3+v4*a4; + shift=a5; + + } + + final public long c5(){ + + if (v0<0||v1<0||v2<0||v3<0||v4<0) {h=-1;return h;} + + h =v0+v1*a1+v2*a2+v3*a3+v4*a4; + shift=a5; + return h; + } + + + final public void cz6(){ + + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0) {h=-1; return;} + + h =v0+v1*a1+v2*a2+v3*a3 +v4*a4+v5*a5; 
+ shift=a6; + } + + final public long c6(){ + + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0) {h=-1; return h;} + + h =v0+v1*a1+v2*a2+v3*a3 +v4*a4+v5*a5; + shift=a6; + return h; + } + + + final public long cs(int b, int v) { + if (h<0) {h=-1; return h;} + + h += v*shift; + shift *=b; + return h; + + } + + final public void csa(int b, int v, IFV f) { + if (h<0) {h=-1; return;} + + h += v*shift; + shift *=b; + f.add(_li.l2i(h)); + } + + final public long csa(int b, int v) { + if (h<0) {h=-1; return-1; } + + h += v*shift; + shift *=b; + return h; + } + + public final long getVal(){ + return h; + } + + public final void map(IFV f, long l){ + if (l>0) f.add(this._li.l2i(l)); + } + + /** + * @param f + */ + final public void add(IFV f) { + f.add(_li.l2i(h)); + } + + final public void cz7() { + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0||v6<0) {h=-1; return;} + + h =v0+v1*a1+v2*a2+v3*a3 +v4*a4+v5*a5+v6*a6; + shift=a7; + + } + + final public long c7() { + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0||v6<0) {h=-1; return h;} + + h =v0+v1*a1+v2*a2+v3*a3 +v4*a4+v5*a5+v6*a6; + shift=a7; + return h; + } + + /** + * + */ + final public void cz8() { + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0||v6<0||v7<0) {h=-1; return;} + + h =v0+v1*a1+v2*a2+v3*a3 +v4*a4+v5*a5+v6*a6+v7*a7; + shift=a8; + } + + + + /* (non-Javadoc) + * @see is2.data.DX#computeLabeValue(short, short) + */ + @Override + public int computeLabeValue(int label, int shift) { + return label*shift; + } + + + + + +} \ No newline at end of file diff --git a/dependencyParser/basic/mate-tools/src/is2/data/D7.java b/dependencyParser/basic/mate-tools/src/is2/data/D7.java new file mode 100644 index 0000000..f4675d8 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/D7.java @@ -0,0 +1,220 @@ +/** + * + */ +package is2.data; + + +/** + * @author Dr. Bernd Bohnet, 30.10.2010 + * + * This class computes the mapping of features to the weight vector. 
+ */ +final public class D7 extends DX { + + private long shift; + private long h; + private final Long2IntInterface _li; + + public D7(Long2IntInterface li) { + _li=li; + } + + boolean fixed =false; + + public void fix() { + + long t0= 1, t1=a0, t2=t1*a1, t3=t2*a2,t4=t3*a3, t5=t4*a4,t6=t5*a5, t7=t6*a6, t8=t7*a7, t9=t8*a8; + + a0=t0;a1=t1;a2=t2;a3=t3;a4=t4;a5=t5;a6=t6;a7=t7;a8=t8; a9=t9; + + } + + + + final public void clean() { + v0=0;v1=0;v2=0;v3=0;v4=0;v5=0;v6=0;v7=0;v8=0; + shift=0;h=0; + } + + final public void cz3(){ + if (v2<0) { h=-1;return;} + + h= v0+v1*a1+v2*a2; + shift =a3; + } + + final public long c3(){ + if (v2<0) { h=-1;return h;} + + h= v0+v1*a1+v2*a2; + shift =a3; + return h; + } + + final public long d3(){ + if (v2<0)return -1; + return v0+v2*a2; + } + + final public void cz4(){ + // if (v0<0||v1<0||v2<0||v3<0) {h=-1;return;} + if (v2<0||v3<0) {h=-1;return;} + + h =v0+v1*a1+v2*a2+v3*a3; + shift =a4; + } + + final public long c4(){ + if (v2<0||v3<0) {h=-1;return h;} + + h =v0+v1*a1+v2*a2+v3*a3; + shift =a4; + return h; + } + + + final public long d4(){ + if (v2<0||v3<0) return -1; + return v0+v2*a2+v3*a3; + } + + + final public void cz5(){ + + if (v2<0||v3<0||v4<0) {h=-1;return;} + + h =v0+v1*a1+v2*a2+v3*a3+v4*a4; + shift=a5; + + } + + final public long c5(){ + + if (v2<0||v3<0||v4<0) {h=-1;return h;} + + h =v0+v1*a1+v2*a2+v3*a3+v4*a4; + shift=a5; + return h; + } + + final public long d5(){ + if (v2<0||v3<0||v4<0) return -1; + return v0+v2*a2+v3*a3+v4*a4; + } + + + final public void cz6(){ + + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0) {h=-1; return;} + + h =v0+v1*a1+v2*a2+v3*a3 +v4*a4+v5*a5; + shift=a6; + } + + final public long c6(){ + + if (v2<0||v3<0||v4<0||v5<0) {h=-1; return h;} + + h =v0+v1*a1+v2*a2+v3*a3 +v4*a4+v5*a5; + shift=a6; + return h; + } + + final public long d6(){ + if (v2<0||v3<0||v4<0||v5<0) return -1; + return v0+v2*a2+v3*a3 +v4*a4+v5*a5; + } + + + final public long cs(int b, int v) { + if (h<0) {h=-1; return h;} + + h += v*shift; 
+ shift *=b; + return h; + + } + + final public void csa(int b, int v, IFV f) { + if (h<0) {h=-1; return;} + + h += v*shift; + shift *=b; + f.add(_li.l2i(h)); + } + + final public long csa(int b, int v) { + if (h<0) {h=-1; return-1; } + + h += v*shift; + shift *=b; + return h; + } + + public final long getVal(){ + return h; + } + + public final void map(IFV f, long l){ + if (l>0) f.add(this._li.l2i(l)); + } + + /** + * @param f + */ + final public void add(IFV f) { + f.add(_li.l2i(h)); + } + + final public void cz7() { + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0||v6<0) {h=-1; return;} + + h =v0+v1*a1+v2*a2+v3*a3 +v4*a4+v5*a5+v6*a6; + shift=a7; + + } + + + final public long c7() { + if (v2<0||v3<0||v4<0||v5<0||v6<0) {h=-1; return h;} + + h =v0+v1*a1+v2*a2+v3*a3 +v4*a4+v5*a5+v6*a6; + shift=a7; + return h; + } + + final public long d7() { + if (v2<0||v3<0||v4<0||v5<0||v6<0) return -1; + return v0+v2*a2+v3*a3 +v4*a4+v5*a5+v6*a6; + } + + /** + * + */ + final public void cz8() { + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0||v6<0||v7<0) {h=-1; return;} + + h =v0+v1*a1+v2*a2+v3*a3 +v4*a4+v5*a5+v6*a6+v7*a7; + shift=a8; + } + + final public long d8() { + if (v2<0||v3<0||v4<0||v5<0||v6<0||v7<0) {return-1;} + return v0+v2*a2+v3*a3 +v4*a4+v5*a5+v6*a6+v7*a7; + } + + + + /* (non-Javadoc) + * @see is2.data.DX#computeLabeValue(short, short) + */ + @Override + public int computeLabeValue(int label, int shift) { + return label*shift; + } + + + + + +} \ No newline at end of file diff --git a/dependencyParser/basic/mate-tools/src/is2/data/DPSTree.java b/dependencyParser/basic/mate-tools/src/is2/data/DPSTree.java new file mode 100644 index 0000000..554f756 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/DPSTree.java @@ -0,0 +1,115 @@ +/** + * + */ +package is2.data; + +import is2.util.DB; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Stack; + +/** + * @author Dr. Bernd Bohnet, 17.01.2011 + * + * Dynamic phrase structure tree. 
/**
 * @author Dr. Bernd Bohnet, 17.01.2011
 *
 * Dynamic phrase structure tree: a growable parallel-array representation
 * where heads[i] is the parent of node i and labels[i] its phrase label.
 */
public class DPSTree {

    private int size = 0;

    public int[] heads;
    public int[] labels;

    public DPSTree() {
        this(30);
    }

    public DPSTree(int initialCapacity) {
        heads = new int[initialCapacity];
        labels = new int[initialCapacity];
    }

    /**
     * Increases the capacity of this <tt>Graph</tt> instance, if necessary,
     * to ensure that it can hold at least the number of nodes specified by
     * the minimum capacity argument.
     *
     * @param minCapacity the desired minimum capacity.
     */
    private void ensureCapacity(int minCapacity) {

        if (minCapacity <= heads.length) return;

        // PERF FIX: grow geometrically. The original grew to minCapacity + 1,
        // which copied both arrays on every single create() call (O(n^2)
        // total tree construction). Capacity is internal, behavior unchanged.
        int newCapacity = heads.length * 2;
        if (newCapacity < minCapacity + 1) newCapacity = minCapacity + 1;

        int oldIndex[] = heads;
        heads = new int[newCapacity];
        System.arraycopy(oldIndex, 0, heads, 0, oldIndex.length);

        oldIndex = labels;
        labels = new int[newCapacity];
        System.arraycopy(oldIndex, 0, labels, 0, oldIndex.length);
    }

    /** @return the number of nodes currently in the tree */
    final public int size() {
        return size;
    }

    final public boolean isEmpty() {
        return size == 0;
    }

    /** Resets the tree; the backing arrays are kept for reuse. */
    final public void clear() {
        size = 0;
    }

    /** Reserves the first terminals+1 slots for the terminal nodes. */
    final public void createTerminals(int terminals) {
        ensureCapacity(terminals + 1);
        size = terminals + 1;
    }

    /**
     * Appends a new phrase node.
     *
     * @param phrase the phrase label of the new node
     * @return the id of the new node
     */
    final public int create(int phrase) {
        ensureCapacity(size + 1);
        labels[size] = phrase;
        size++;
        return size - 1;
    }

    /**
     * Creates a phrase node with a fixed id; appends when nodeId is negative.
     *
     * NOTE(review): when nodeId == size the node is written at index size but
     * size is NOT advanced, so a following create(phrase) overwrites it —
     * confirm this is intended by the callers before changing.
     *
     * @return the id of the node actually used
     */
    public int create(int phrase, int nodeId) {
        if (nodeId < 0) return this.create(phrase);
        ensureCapacity(nodeId + 1);
        labels[nodeId] = phrase;
        if (size < nodeId) size = nodeId + 1;
        return nodeId;
    }

    /** Makes node j the head (parent) of node i. */
    public void createEdge(int i, int j) {
        heads[i] = j;
    }

    /** @return a deep copy of the first size nodes */
    @Override
    public DPSTree clone() {
        DPSTree ps = new DPSTree(this.size + 1);

        for (int k = 0; k < size; k++) {
            ps.heads[k] = heads[k];
            ps.labels[k] = labels[k];
        }
        ps.size = size;
        return ps;
    }
}
a/dependencyParser/basic/mate-tools/src/is2/data/DX.java b/dependencyParser/basic/mate-tools/src/is2/data/DX.java new file mode 100644 index 0000000..c357b58 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/DX.java @@ -0,0 +1,58 @@ +/** + * + */ +package is2.data; + +import is2.data.IFV; + +/** + * @author Dr. Bernd Bohnet, 30.08.2011 + * + * + */ +public abstract class DX { + + public long a0,a1,a2,a3,a4,a5,a6,a7,a8,a9; + public long v0,v1,v2,v3,v4,v5,v6,v7,v8,v9; + + public abstract void cz3(); + + public abstract void cz4(); + + public abstract void cz5(); + + public abstract void cz6(); + + public abstract void cz7(); + + public abstract void cz8(); + + public abstract void clean(); + + public abstract long cs(int b, int v); + + public abstract long csa(int b, int v); + + public abstract void csa(int b, int v, IFV f); + + /** + * @return + */ + public abstract long getVal(); + + /** + * @param f + * @param l + */ + public abstract void map(IFV f, long l); + + /** + * @param label + * @param s_type + * @return + */ + public abstract int computeLabeValue(int label,int s_type) ; + + public abstract void fix(); + +} \ No newline at end of file diff --git a/dependencyParser/basic/mate-tools/src/is2/data/DataF.java b/dependencyParser/basic/mate-tools/src/is2/data/DataF.java new file mode 100755 index 0000000..f127fbd --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/DataF.java @@ -0,0 +1,39 @@ +package is2.data; + + + +final public class DataF { + + final public short typesLen; + final public int len; + + // first order features + final public float[][] pl; + + // remove !!!! 
+// final public float[][] highestLab; + + //final public FV[][][] label; + final public float[][][][] lab; + + + public FV fv; + + final public float[][][][][] sib; + + final public float[][][][][] gra; + + + public DataF(int length, short types) { + typesLen=types; + len =length; + + pl = new float[length][length]; + lab = new float[length][length][types][2]; + // highestLab = new float[length][length]; + + sib = new float[length][length][length][2][]; + gra = new float[length][length][length][2][]; + + } +} diff --git a/dependencyParser/basic/mate-tools/src/is2/data/DataFES.java b/dependencyParser/basic/mate-tools/src/is2/data/DataFES.java new file mode 100644 index 0000000..9772858 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/DataFES.java @@ -0,0 +1,38 @@ +package is2.data; + + + +final public class DataFES { + + final public short typesLen; + final public int len; + + // first order features + final public float[][] pl; + + // remove !!!! +// final public float[][] highestLab; + + //final public FV[][][] label; + final public float[][][] lab; + + + public FV fv; + + final public float[][][][] sib; + + final public float[][][][] gra; + + + public DataFES(int length, short types) { + typesLen=types; + len =length; + + pl = new float[length][length]; + lab = new float[length][length][types]; + + sib = new float[length][length][length][]; + gra = new float[length][length][length][]; + + } +} diff --git a/dependencyParser/basic/mate-tools/src/is2/data/DataT.java b/dependencyParser/basic/mate-tools/src/is2/data/DataT.java new file mode 100644 index 0000000..47691f8 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/DataT.java @@ -0,0 +1,25 @@ +package is2.data; + + + +final public class DataT { + + final public short typesLen; + final public int len; + + + //final public FV[][][] label; + // a b lab op + final public float[][][][] lab; + + + + public DataT(int length, short types) { + typesLen=types; + len =length; + + lab = new 
float[length][length][types][4]; + + + } +} diff --git a/dependencyParser/basic/mate-tools/src/is2/data/Edges.java b/dependencyParser/basic/mate-tools/src/is2/data/Edges.java new file mode 100644 index 0000000..f8b2ef9 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/Edges.java @@ -0,0 +1,224 @@ +/** + * + */ +package is2.data; + + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Map.Entry; + +/** + * @author Dr. Bernd Bohnet, 13.05.2009; + * + * + */ +public final class Edges { + + + private static short[][][][] edges; + private static HashMap<Short,Integer> labelCount = new HashMap<Short,Integer>(); + + private static HashMap<String,Integer> slabelCount = new HashMap<String,Integer>(); + + + static short[] def = new short[1]; + + private Edges () {} + + /** + * @param length + */ + public static void init(int length) { + edges = new short[length][length][2][]; + } + + + public static void findDefault(){ + + int best =0; + + + + for(Entry<Short,Integer> e : labelCount.entrySet()) { + + + if (best<e.getValue()) { + best = e.getValue(); + def[0]=e.getKey(); + } + } + + + // labelCount=null; + // String[] types = new String[mf.getFeatureCounter().get(PipeGen.REL)]; + // for (Entry<String, Integer> e : MFO.getFeatureSet().get(PipeGen.REL).entrySet()) types[e.getValue()] = e.getKey(); + + is2.util.DB.println("set default label to "+def[0]+" " ); + + // System.out.println("found default "+def[0]); + + } + + + final static public void put(int pos1, int pos2, boolean dir, short label) { + putD(pos1, pos2,dir, label); + // putD(pos2, pos1,!dir, label); + } + + + final static public void putD(int pos1, int pos2, boolean dir, short label) { + + Integer lc = labelCount.get(label); + if (lc==null) labelCount.put(label, 1); + else labelCount.put(label, lc+1); + + String key = pos1+"-"+pos2+dir+label; + Integer lcs = slabelCount.get(key); + 
if (lcs==null) slabelCount.put(key, 1); + else slabelCount.put(key, lcs+1); + + + if (edges[pos1][pos2][dir?0:1]==null) { + edges[pos1][pos2][dir?0:1]=new short[1]; + edges[pos1][pos2][dir?0:1][0]=label; + +// edgesh[pos1][pos2][dir?0:1] = new TIntHashSet(2); +// edgesh[pos1][pos2][dir?0:1].add(label); + } else { + short labels[] = edges[pos1][pos2][dir?0:1]; + for(short l : labels) { + //contains label already? + if(l==label) return; + } + + short[] nlabels = new short[labels.length+1]; + System.arraycopy(labels, 0, nlabels, 0, labels.length); + nlabels[labels.length]=label; + edges[pos1][pos2][dir?0:1]=nlabels; + + // edgesh[pos1][pos2][dir?0:1].add(label); + } + } + + final static public short[] get(int pos1, int pos2, boolean dir) { + + if (pos1<0 || pos2<0 || edges[pos1][pos2][dir?0:1]==null) return def; + return edges[pos1][pos2][dir?0:1]; + } + + + /** + * @param dis + */ + static public void write(DataOutputStream d) throws IOException { + + int len = edges.length; + d.writeShort(len); + + for(int p1 =0;p1<len;p1++) { + for(int p2 =0;p2<len;p2++) { + if (edges[p1][p2][0]==null) d.writeShort(0); + else { + d.writeShort(edges[p1][p2][0].length); + for(int l =0;l<edges[p1][p2][0].length;l++) { + d.writeShort(edges[p1][p2][0][l]); + } + + } + + if (edges[p1][p2][1]==null) d.writeShort(0); + else { + d.writeShort(edges[p1][p2][1].length); + for(int l =0;l<edges[p1][p2][1].length;l++) { + d.writeShort(edges[p1][p2][1][l]); + } + } + } + } + + d.writeShort(def[0]); + + } + + + /** + * @param dis + */ + public static void read(DataInputStream d) throws IOException { + int len = d.readShort(); + + edges = new short[len][len][2][]; + for(int p1 =0;p1<len;p1++) { + for(int p2 =0;p2<len;p2++) { + int ll = d.readShort(); + if (ll==0) { + edges[p1][p2][0]=null; + } else { + edges[p1][p2][0] = new short[ll]; + for(int l =0;l<ll;l++) { + edges[p1][p2][0][l]=d.readShort(); + } + } + + ll = d.readShort(); + if (ll==0) { + edges[p1][p2][1]=null; + } else { + edges[p1][p2][1] 
// ==== src/is2/data/F2S.java ====
package is2.data;

/**
 * Score accumulator over a double-valued weight vector.
 * {@link #add(int)} folds the weight of one feature index into {@link #score};
 * non-positive indices encode "feature absent" and are skipped.
 */
final public class F2S extends IFV {

    // Weight vector; held by reference, never copied.
    private double[] parameters;

    public F2S() {}

    /** Accumulated score; reset with {@link #clear()}. */
    public double score;

    /**
     * @param parameters2 weight vector to score against (shared, not copied)
     */
    public F2S(double[] parameters2) {
        parameters = parameters2;
    }

    /** Adds the weight of feature {@code i}; ignores {@code i <= 0}. */
    @Override
    public void add(int i) {
        if (i > 0) {
            score += parameters[i];
        }
    }

    /** Swaps in a new weight vector (held by reference). */
    public void setParameters(double[] p) {
        parameters = p;
    }

    @Override
    public void clear() {
        score = 0;
    }

    @Override
    public double getScore() {
        return score;
    }

    /** The clone shares this scorer's weight array. */
    @Override
    public IFV clone() {
        return new F2S(parameters);
    }
}

// ==== src/is2/data/F2SD.java ====
package is2.data;

/**
 * Immutable-parameter variant of {@link F2S}: the double weight vector is
 * fixed at construction time.
 */
final public class F2SD extends IFV {

    final private double[] parameters;

    /** Accumulated score; reset with {@link #clear()}. */
    public double score = 0;

    public F2SD(double[] p) {
        parameters = p;
    }

    /** Adds the weight of feature {@code i}; ignores {@code i <= 0}. */
    @Override
    public void add(int i) {
        if (i > 0) {
            score += parameters[i];
        }
    }

    @Override
    public void clear() {
        score = 0;
    }

    @Override
    public double getScore() {
        return score;
    }

    /** The clone shares this scorer's weight array. */
    @Override
    public IFV clone() {
        return new F2SD(parameters);
    }
}

// ==== src/is2/data/F2SF.java ====
package is2.data;

/**
 * Float-valued score accumulator; mirrors {@link F2SD} with a float[]
 * weight vector plus bulk-add, relative-add and subtract helpers.
 */
final public class F2SF extends IFV {

    final private float[] parameters;

    /** Accumulated score; reset with {@link #clear()}. */
    public float score = 0;

    public F2SF(float[] p) {
        parameters = p;
    }

    /** Adds the weight of feature {@code i}; ignores {@code i <= 0}. */
    @Override
    final public void add(int i) {
        if (i > 0) {
            score += parameters[i];
        }
    }

    /** Adds the weights of every positive index in {@code i}. */
    final public void add(int[] i) {
        for (final int feature : i) {
            if (feature > 0) {
                score += parameters[feature];
            }
        }
    }

    /**
     * Subtracts from the score the weight of {@code i} looked up in {@code px}
     * through the long-to-int index mapping {@code li}.
     */
    final public void sub(float[] px, int i, Long2IntInterface li) {
        if (i > 0) {
            score -= px[li.l2i(i)];
        }
    }

    @Override
    public void clear() {
        score = 0;
    }

    @Override
    public double getScore() {
        return score;
    }

    /** Score without widening to double. */
    public float getScoreF() {
        return score;
    }

    /** The clone shares this scorer's weight array. */
    @Override
    public IFV clone() {
        return new F2SF(this.parameters);
    }

    /** Adds {@code parameters[i] * f}; ignores {@code i <= 0}. */
    public void addRel(int i, float f) {
        if (i > 0) {
            score += parameters[i] * f;
        }
    }

    /** Length of the underlying weight vector. */
    public int length() {
        return this.parameters.length;
    }
}
// ==== src/is2/data/F2SP.java ====
package is2.data;

/**
 * Float-weight score accumulator with a double-precision running score;
 * otherwise mirrors F2SF.
 */
final public class F2SP extends IFV {

    final private float[] parameters;

    /** Accumulated score; reset with {@link #clear()}. */
    public double score = 0;

    public F2SP(float[] p) {
        parameters = p;
    }

    /** Adds the weight of feature {@code i}; ignores {@code i <= 0}. */
    @Override
    final public void add(int i) {
        if (i > 0) score += parameters[i];
    }

    /** Adds the weights of every positive index in {@code i}. */
    final public void add(int[] i) {
        for (int k = 0; k < i.length; k++) {
            if (i[k] > 0) score += parameters[i[k]];
        }
    }

    /**
     * Subtracts from the score the weight of {@code i} looked up in {@code px}
     * through the long-to-int index mapping {@code li}.
     */
    final public void sub(float[] px, int i, Long2IntInterface li) {
        if (i > 0) {
            score -= px[li.l2i(i)];
        }
    }

    @Override
    public void clear() {
        score = 0;
    }

    @Override
    public double getScore() {
        return score;
    }

    /** Same as {@link #getScore()}; kept for API parity with F2SF. */
    public double getScoreF() {
        return score;
    }

    /** The clone shares this scorer's weight array. */
    @Override
    public IFV clone() {
        return new F2SP(this.parameters);
    }

    /** Adds {@code parameters[i] * f}; ignores {@code i <= 0}. */
    public void addRel(int i, float f) {
        if (i > 0) score += parameters[i] * f;
    }

    /** Length of the underlying weight vector. */
    public int length() {
        return this.parameters.length;
    }
}

// ==== src/is2/data/F2ST.java ====
package is2.data;

/**
 * Integer score accumulator over a short-valued weight vector.
 */
final public class F2ST extends IFV {

    final private short[] parameters;

    /** Accumulated score; reset with {@link #clear()}. */
    public int score = 0;

    public F2ST(short[] p) {
        parameters = p;
    }

    /** Adds the weight of feature {@code i}; ignores {@code i <= 0}. */
    @Override
    final public void add(int i) {
        if (i > 0) score += parameters[i];
    }

    @Override
    public void clear() {
        score = 0;
    }

    @Override
    public double getScore() {
        return score;
    }

    /** Score widened to float. */
    public float getScoreF() {
        return score;
    }

    /** The clone shares this scorer's weight array. */
    @Override
    public IFV clone() {
        return new F2ST(this.parameters);
    }
}

// ==== src/is2/data/FV.java ====
package is2.data;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

/**
 * Sparse binary feature vector: a growable array of feature indices plus an
 * optional pair of sub-vectors (subfv1, subfv2) evaluated recursively, with
 * subfv2 optionally negated (used to represent difference vectors, see
 * {@link #getDistVector(FV)}).
 *
 * FIX(review): the float[] update overloads previously dereferenced subfv2 in
 * their else-branch even when subfv2 was null (the null check was fused with
 * the negate test: {@code if (null != subfv2 && negate) ... else subfv2.update(...)}),
 * causing a NullPointerException whenever subfv1 was set but subfv2 was not.
 * The guard now matches the correct double[]/short[] overloads.
 */
public final class FV extends IFV {

    private FV subfv1;
    private FV subfv2;
    private boolean negateSecondSubFV = false;

    // number of used entries in m_index
    private int size;

    // feature indices
    private int m_index[];

    public FV() {
        this(10);
    }

    public FV(int initialCapacity) {
        m_index = new int[initialCapacity];
    }

    /** Composite vector: this = fv1 (+) fv2. */
    public FV(FV fv1, FV fv2) {
        subfv1 = fv1;
        subfv2 = fv2;
    }

    /** Composite vector; when negSecond is true, this = fv1 (-) fv2. */
    public FV(FV fv1, FV fv2, boolean negSecond) {
        this(0);
        subfv1 = fv1;
        subfv2 = fv2;
        negateSecondSubFV = negSecond;
    }

    /**
     * Reads exactly {@code capacity} feature indices from the stream.
     *
     * @throws IOException on stream failure
     */
    public FV(DataInputStream dos, int capacity) throws IOException {
        this(capacity);
        size = m_index.length;
        for (int i = 0; i < size; i++) m_index[i] = dos.readInt();
    }

    /**
     * Reads a length-prefixed feature vector (count, then that many indices),
     * the format written by {@link #writeKeys(DataOutputStream)}.
     *
     * @throws IOException on stream failure
     */
    public FV(DataInputStream dos) throws IOException {
        this(dos.readInt());
        size = m_index.length;
        for (int i = 0; i < size; i++) m_index[i] = dos.readInt();
    }

    /** Grows m_index (1.5x + 1, at least minCapacity), preserving content. */
    private void ensureCapacity(int minCapacity) {
        if (minCapacity > m_index.length) {
            int oldIndex[] = m_index;
            int newCapacity = (m_index.length * 3) / 2 + 1;
            if (newCapacity < minCapacity) newCapacity = minCapacity;
            m_index = new int[newCapacity];
            System.arraycopy(oldIndex, 0, m_index, 0, oldIndex.length);
        }
    }

    /** Number of features stored directly in this vector (sub-vectors excluded). */
    final public int size() {
        return size;
    }

    final public boolean isEmpty() {
        return size == 0;
    }

    @Override
    final public void clear() {
        size = 0;
    }

    /**
     * Appends feature {@code i}; returns its slot.
     * NOTE(review): the value {@code v} is ignored — FV stores indices only.
     */
    final public int createFeature(int i, double v) {
        ensureCapacity(size + 1);
        m_index[size] = i;
        size++;
        return size - 1;
    }

    /** Appends feature {@code i} unconditionally; returns its slot. */
    final public int createFeature(int i) {
        ensureCapacity(size + 1);
        m_index[size] = i;
        size++;
        return size - 1;
    }

    final public int getIndex(int i) {
        return m_index[i];
    }

    public void setIndex(int p, int i) {
        m_index[p] = i;
    }

    /** Shrinks the backing array to the used size. */
    public void trimToSize() {
        if (size < m_index.length) {
            int oldIndex[] = m_index;
            m_index = new int[size];
            System.arraycopy(oldIndex, 0, m_index, 0, size);
        }
    }

    /** Appends feature {@code i}; negative indices are ignored. */
    @Override
    final public void add(int i) {
        if (i >= 0) {
            ensureCapacity(size + 1);
            m_index[size] = i;
            size++;
        }
    }

    /** Appends every non-negative index in {@code i}. */
    final public void add(int[] i) {
        for (int k = 0; k < i.length; k++) add(i[k]);
    }

    /** Appends feature {@code i} (value ignored, see createFeature). */
    final public void put(int i, double f) {
        if (i >= 0) createFeature(i, f);
    }

    /** Returns the lazy difference vector this - fl2. */
    public FV getDistVector(FV fl2) {
        return new FV(this, fl2, true);
    }

    /** Score against double weights, recursing into sub-vectors. */
    public double getScore(double[] parameters, boolean negate) {
        double score = 0.0;

        if (null != subfv1) {
            score += subfv1.getScore(parameters, negate);

            if (null != subfv2) {
                if (negate) score += subfv2.getScore(parameters, !negateSecondSubFV);
                else score += subfv2.getScore(parameters, negateSecondSubFV);
            }
        }

        if (negate) for (int i = 0; i < size; i++) score -= parameters[m_index[i]];
        else for (int i = 0; i < size; i++) score += parameters[m_index[i]];

        return score;
    }

    /** Score against float weights, recursing into sub-vectors. */
    final public float getScore(float[] parameters, boolean negate) {
        float score = 0.0F;

        if (null != subfv1) {
            score += subfv1.getScore(parameters, negate);

            if (null != subfv2) {
                if (negate) score += subfv2.getScore(parameters, !negateSecondSubFV);
                else score += subfv2.getScore(parameters, negateSecondSubFV);
            }
        }

        if (negate) for (int i = 0; i < size; i++) score -= parameters[m_index[i]];
        else for (int i = 0; i < size; i++) score += parameters[m_index[i]];

        return score;
    }

    /** Score against short weights, recursing into sub-vectors. */
    final public int getScore(short[] parameters, boolean negate) {
        int score = 0;

        if (null != subfv1) {
            score += subfv1.getScore(parameters, negate);

            if (null != subfv2) {
                if (negate) score += subfv2.getScore(parameters, !negateSecondSubFV);
                else score += subfv2.getScore(parameters, negateSecondSubFV);
            }
        }

        if (negate) for (int i = 0; i < size; i++) score -= parameters[m_index[i]];
        else for (int i = 0; i < size; i++) score += parameters[m_index[i]];

        return score;
    }

    /** Perceptron-style update with no negation (see 5-arg overload). */
    public void update(double[] parameters, double[] total, double alpha_k, double upd) {
        update(parameters, total, alpha_k, upd, false);
    }

    /**
     * Adds (or, when negate, subtracts) alpha_k to each touched weight and
     * upd*alpha_k to its averaged-perceptron total, recursing into sub-vectors.
     */
    public final void update(double[] parameters, double[] total, double alpha_k, double upd, boolean negate) {

        if (null != subfv1) {
            subfv1.update(parameters, total, alpha_k, upd, negate);

            if (null != subfv2) {
                if (negate) subfv2.update(parameters, total, alpha_k, upd, !negateSecondSubFV);
                else subfv2.update(parameters, total, alpha_k, upd, negateSecondSubFV);
            }
        }

        if (negate) {
            for (int i = 0; i < size; i++) {
                parameters[m_index[i]] -= alpha_k;
                total[m_index[i]] -= upd * alpha_k;
            }
        } else {
            for (int i = 0; i < size; i++) {
                parameters[m_index[i]] += alpha_k;
                total[m_index[i]] += upd * alpha_k;
            }
        }
    }

    /** Short-weight variant of the update (alpha_k truncated by the += on short). */
    public final void update(short[] parameters, short[] total, double alpha_k, double upd, boolean negate) {

        if (null != subfv1) {
            subfv1.update(parameters, total, alpha_k, upd, negate);

            if (null != subfv2) {
                if (negate) subfv2.update(parameters, total, alpha_k, upd, !negateSecondSubFV);
                else subfv2.update(parameters, total, alpha_k, upd, negateSecondSubFV);
            }
        }

        if (negate) {
            for (int i = 0; i < size; i++) {
                parameters[m_index[i]] -= alpha_k;
                total[m_index[i]] -= upd * alpha_k;
            }
        } else {
            for (int i = 0; i < size; i++) {
                parameters[m_index[i]] += alpha_k;
                total[m_index[i]] += upd * alpha_k;
            }
        }
    }

    /**
     * Float-weight update.
     * FIX: subfv2 is now null-checked before the recursive call (previously the
     * else-branch called subfv2.update(...) even when subfv2 was null → NPE).
     */
    public final void update(float[] parameters, float[] total, double alpha_k, double upd, boolean negate) {

        if (null != subfv1) {
            subfv1.update(parameters, total, alpha_k, upd, negate);

            if (null != subfv2) {
                if (negate) subfv2.update(parameters, total, alpha_k, upd, !negateSecondSubFV);
                else subfv2.update(parameters, total, alpha_k, upd, negateSecondSubFV);
            }
        }

        if (negate) {
            for (int i = 0; i < size; i++) {
                parameters[getIndex(i)] -= alpha_k;
                total[getIndex(i)] -= upd * alpha_k;
            }
        } else {
            for (int i = 0; i < size; i++) {
                parameters[getIndex(i)] += alpha_k;
                total[getIndex(i)] += upd * alpha_k;
            }
        }
    }

    /**
     * Float-weight update that additionally maintains a compressed totals
     * array addressed through {@code li}.
     * FIX: same null-guard repair as the 5-arg float overload.
     */
    public final void update(float[] parameters, float[] total, double alpha_k,
            double upd, boolean negate, float[] totalp, Long2IntInterface li) {

        if (null != subfv1) {
            subfv1.update(parameters, total, alpha_k, upd, negate, totalp, li);

            if (null != subfv2) {
                if (negate) subfv2.update(parameters, total, alpha_k, upd, !negateSecondSubFV, totalp, li);
                else subfv2.update(parameters, total, alpha_k, upd, negateSecondSubFV, totalp, li);
            }
        }

        if (negate) {
            for (int i = 0; i < size; i++) {
                parameters[getIndex(i)] -= alpha_k;
                total[getIndex(i)] -= upd * alpha_k;
                totalp[li.l2i(getIndex(i))] -= upd * alpha_k;
            }
        } else {
            for (int i = 0; i < size; i++) {
                parameters[getIndex(i)] += alpha_k;
                total[getIndex(i)] += upd * alpha_k;
                totalp[li.l2i(getIndex(i))] += upd * alpha_k;
            }
        }
    }

    // Scratch maps shared by dotProduct/twoNorm.
    // NOTE(review): static scratch state makes these two methods non-thread-safe.
    private static IntIntHash hm1;
    private static IntIntHash hm2;

    /** Dot product of the two index-count multisets. Not thread-safe. */
    public int dotProduct(FV fl2) {

        if (hm1 == null) hm1 = new IntIntHash(size(), 0.4F);
        else hm1.clear();

        addFeaturesToMap(hm1);

        if (hm2 == null) hm2 = new IntIntHash(fl2.size, 0.4F);
        else hm2.clear();

        fl2.addFeaturesToMap(hm2);

        int[] keys = hm1.keys();

        int result = 0;
        for (int i = 0; i < keys.length; i++) result += hm1.get(keys[i]) * hm2.get(keys[i]);

        return result;
    }

    /** sqrt of the dot product with fl2. Not thread-safe (shared scratch maps). */
    public double twoNorm(FV fl2) {

        if (hm1 == null) hm1 = new IntIntHash(size(), 0.4F);
        else hm1.clear();

        addFeaturesToMap(hm1);

        if (hm2 == null) hm2 = new IntIntHash(fl2.size, 0.4F);
        else hm2.clear();

        fl2.addFeaturesToMap(hm2);

        int[] keys = hm1.keys();

        int result = 0;
        for (int i = 0; i < keys.length; i++) result += hm1.get(keys[i]) * hm2.get(keys[i]);

        return Math.sqrt((double) result);
    }

    /** Accumulates index counts (+1 each) into map, recursing into sub-vectors. */
    public void addFeaturesToMap(IntIntHash map) {

        if (null != subfv1) {
            subfv1.addFeaturesToMap(map);

            if (null != subfv2) {
                subfv2.addFeaturesToMap(map, negateSecondSubFV);
            }
        }

        for (int i = 0; i < size; i++) if (!map.adjustValue(getIndex(i), 1)) map.put(getIndex(i), 1);
    }

    /** Signed variant of addFeaturesToMap: -1 per index when negate. */
    private void addFeaturesToMap(IntIntHash map, boolean negate) {

        if (null != subfv1) {
            subfv1.addFeaturesToMap(map, negate);

            if (null != subfv2) {
                if (negate) subfv2.addFeaturesToMap(map, !negateSecondSubFV);
                else subfv2.addFeaturesToMap(map, negateSecondSubFV);
            }
        }

        if (negate) {
            for (int i = 0; i < size; i++) if (!map.adjustValue(getIndex(i), -1)) map.put(getIndex(i), -1);
        } else {
            for (int i = 0; i < size; i++) if (!map.adjustValue(getIndex(i), 1)) map.put(getIndex(i), 1);
        }
    }

    @Override
    public final String toString() {
        StringBuilder sb = new StringBuilder();
        toString(sb);
        return sb.toString();
    }

    private final void toString(StringBuilder sb) {
        if (null != subfv1) {
            subfv1.toString(sb);

            if (null != subfv2)
                subfv2.toString(sb);
        }
        for (int i = 0; i < size; i++)
            sb.append(getIndex(i)).append(' ');
    }

    /** Writes size, then the indices — the format FV(DataInputStream) reads. */
    public void writeKeys(DataOutputStream dos) throws IOException {
        dos.writeInt(size);
        for (int i = 0; i < size; i++) {
            dos.writeInt(m_index[i]);
        }
    }

    /** Appends a length-prefixed run of indices read from the stream. */
    public void readKeys(DataInputStream dos) throws IOException {
        int keys = dos.readInt();
        for (int i = 0; i < keys; i++) createFeature(dos.readInt(), 1.0);
    }

    /** Concatenation that tolerates nulls (null operand → the other operand). */
    final public static FV cat(FV f1, FV f2) {
        if (f1 == null) return f2;
        if (f2 == null) return f1;
        return new FV(f1, f2);
    }

    final public static FV cat(FV f1, FV f2, FV f3) {
        return FV.cat(f1, FV.cat(f2, f3));
    }

    final public static FV cat(FV f1, FV f2, FV f3, FV f4) {
        return FV.cat(f1, FV.cat(f2, FV.cat(f3, f4)));
    }

    /** Reads a vector written as (capacity, indices); capacity 0 → null. */
    final public static FV read(DataInputStream dis) throws IOException {
        int cap = dis.readInt();
        if (cap == 0) return null;
        return new FV(dis, cap);
    }

    /** Not supported for FV — always 0 (scores require a weight vector). */
    @Override
    public double getScore() {
        return 0;
    }

    /** Deep copy of the directly-stored indices (sub-vectors not copied). */
    @Override
    public IFV clone() {
        FV f = new FV(this.size);
        for (int i = 0; i < this.size; i++) {
            f.m_index[i] = m_index[i];
        }
        f.size = this.size;
        return f;
    }
}

// ==== src/is2/data/FVR.java ====
package is2.data;

import gnu.trove.TIntDoubleHashMap;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

/**
 * Real-valued sparse feature vector: parallel arrays of indices (m_index) and
 * float values (m_value), plus optional recursive sub-vectors as in FV.
 *
 * FIX(review): (1) update(float[],...) no longer dereferences a null subfv2
 * (same guard bug as FV). (2) trimToSize() now trims m_value together with
 * m_index, keeping the parallel arrays the same length.
 */
public final class FVR extends IFV {

    private FVR subfv1;
    private FVR subfv2;
    private boolean negateSecondSubFV = false;

    // number of used entries in m_index/m_value
    private int size;

    // feature indices and their values (parallel arrays)
    private int m_index[];
    private float m_value[];

    public FVR() {
        this(10);
    }

    public FVR(int initialCapacity) {
        m_index = new int[initialCapacity];
        m_value = new float[initialCapacity];
    }

    /** Composite vector; when negSecond is true, this = fv1 (-) fv2. */
    public FVR(FVR fv1, FVR fv2, boolean negSecond) {
        this(0);
        subfv1 = fv1;
        subfv2 = fv2;
        negateSecondSubFV = negSecond;
    }

    /**
     * Reads exactly {@code capacity} indices.
     * NOTE(review): values are NOT read — they stay 0; this matches
     * writeKeys(), which serializes indices only. Confirm intended.
     */
    public FVR(DataInputStream dos, int capacity) throws IOException {
        this(capacity);
        size = m_index.length;
        for (int i = 0; i < size; i++) m_index[i] = dos.readInt();
    }

    /** Reads a length-prefixed run of indices (values stay 0, see above). */
    public FVR(DataInputStream dos) throws IOException {
        this(dos.readInt());
        size = m_index.length;
        for (int i = 0; i < size; i++) m_index[i] = dos.readInt();
    }

    /** Grows both parallel arrays (1.5x + 1, at least minCapacity). */
    private void ensureCapacity(int minCapacity) {
        if (minCapacity > m_index.length) {
            int oldIndex[] = m_index;
            float oldValue[] = m_value;

            int newCapacity = (m_index.length * 3) / 2 + 1;
            if (newCapacity < minCapacity) newCapacity = minCapacity;

            m_index = new int[newCapacity];
            m_value = new float[newCapacity];

            System.arraycopy(oldIndex, 0, m_index, 0, oldIndex.length);
            System.arraycopy(oldValue, 0, m_value, 0, oldValue.length);
        }
    }

    final public int size() {
        return size;
    }

    final public boolean isEmpty() {
        return size == 0;
    }

    @Override
    final public void clear() {
        size = 0;
    }

    /** Appends feature {@code i} with value {@code v}; returns its slot. */
    final public int createFeature(int i, float v) {
        ensureCapacity(size + 1);
        m_index[size] = i;
        m_value[size] = v;
        size++;
        return size - 1;
    }

    final public int getIndex(int i) {
        return m_index[i];
    }

    public void setIndex(int p, int i) {
        m_index[p] = i;
    }

    /**
     * Shrinks the backing arrays to the used size.
     * FIX: m_value is trimmed alongside m_index; previously only m_index was
     * trimmed, desynchronizing the parallel arrays.
     */
    public void trimToSize() {
        if (size < m_index.length) {
            int oldIndex[] = m_index;
            float oldValue[] = m_value;

            m_index = new int[size];
            m_value = new float[size];

            System.arraycopy(oldIndex, 0, m_index, 0, size);
            System.arraycopy(oldValue, 0, m_value, 0, size);
        }
    }

    /** Appends feature {@code i} with value 1.0f; negative indices ignored. */
    @Override
    final public void add(int i) {
        if (i >= 0) {
            ensureCapacity(size + 1);
            m_index[size] = i;
            m_value[size] = 1.0f;
            size++;
        }
    }

    /** Appends feature {@code i} with value {@code f}; negative indices ignored. */
    final public void add(int i, float f) {
        if (i >= 0) createFeature(i, f);
    }

    /** Returns the lazy difference vector this - fl2. */
    public FVR getDistVector(FVR fl2) {
        return new FVR(this, fl2, true);
    }

    /**
     * Score against double weights.
     * NOTE(review): unlike the float/short overloads this ignores m_value —
     * confirm intended before relying on it.
     */
    public double getScore(double[] parameters, boolean negate) {
        double score = 0.0;

        if (null != subfv1) {
            score += subfv1.getScore(parameters, negate);

            if (null != subfv2) {
                if (negate) score += subfv2.getScore(parameters, !negateSecondSubFV);
                else score += subfv2.getScore(parameters, negateSecondSubFV);
            }
        }

        if (negate) for (int i = 0; i < size; i++) score -= parameters[m_index[i]];
        else for (int i = 0; i < size; i++) score += parameters[m_index[i]];

        return score;
    }

    /** Value-weighted score against float weights. */
    final public float getScore(float[] parameters, boolean negate) {
        float score = 0.0F;

        if (null != subfv1) {
            score += subfv1.getScore(parameters, negate);

            if (null != subfv2) {
                if (negate) score += subfv2.getScore(parameters, !negateSecondSubFV);
                else score += subfv2.getScore(parameters, negateSecondSubFV);
            }
        }

        if (negate) for (int i = 0; i < size; i++) score -= parameters[m_index[i]] * m_value[i];
        else for (int i = 0; i < size; i++) score += parameters[m_index[i]] * m_value[i];

        return score;
    }

    /** Value-weighted score against short weights (int truncation on the sum). */
    final public int getScore(short[] parameters, boolean negate) {
        int score = 0;

        if (null != subfv1) {
            score += subfv1.getScore(parameters, negate);

            if (null != subfv2) {
                if (negate) score += subfv2.getScore(parameters, !negateSecondSubFV);
                else score += subfv2.getScore(parameters, negateSecondSubFV);
            }
        }

        if (negate) for (int i = 0; i < size; i++) score -= parameters[m_index[i]] * m_value[i];
        else for (int i = 0; i < size; i++) score += parameters[m_index[i]] * m_value[i];

        return score;
    }

    /**
     * Value-weighted perceptron update.
     * FIX: subfv2 is null-checked before the recursive call (previously the
     * else-branch called subfv2.update(...) even when subfv2 was null → NPE).
     */
    public final void update(float[] parameters, float[] total, double alpha_k, double upd, boolean negate) {

        if (null != subfv1) {
            subfv1.update(parameters, total, alpha_k, upd, negate);

            if (null != subfv2) {
                if (negate) subfv2.update(parameters, total, alpha_k, upd, !negateSecondSubFV);
                else subfv2.update(parameters, total, alpha_k, upd, negateSecondSubFV);
            }
        }

        if (negate) {
            for (int i = 0; i < size; i++) {
                parameters[getIndex(i)] -= alpha_k * m_value[i];
                total[getIndex(i)] -= upd * alpha_k * m_value[i];
            }
        } else {
            for (int i = 0; i < size; i++) {
                parameters[getIndex(i)] += alpha_k * m_value[i];
                total[getIndex(i)] += upd * alpha_k * m_value[i];
            }
        }
    }

    // Scratch maps shared by dotProduct.
    // NOTE(review): static scratch state makes dotProduct non-thread-safe.
    private static TIntDoubleHashMap hd1;
    private static TIntDoubleHashMap hd2;

    /** Value-weighted dot product (truncated to int). Not thread-safe. */
    public int dotProduct(FVR fl2) {

        if (hd1 == null) hd1 = new TIntDoubleHashMap(size(), 0.4F);
        else hd1.clear();

        addFeaturesToMap(hd1);

        if (hd2 == null) hd2 = new TIntDoubleHashMap(fl2.size, 0.4F);
        else hd2.clear();

        fl2.addFeaturesToMap(hd2);

        int[] keys = hd1.keys();

        int result = 0;
        for (int i = 0; i < keys.length; i++) result += hd1.get(keys[i]) * hd2.get(keys[i]);

        return result;
    }

    /** Accumulates index→value sums into map, recursing into sub-vectors. */
    private void addFeaturesToMap(TIntDoubleHashMap map) {

        if (null != subfv1) {
            subfv1.addFeaturesToMap(map);

            if (null != subfv2) {
                subfv2.addFeaturesToMap(map, negateSecondSubFV);
            }
        }

        for (int i = 0; i < size; i++)
            if (!map.adjustValue(getIndex(i), m_value[i])) map.put(getIndex(i), m_value[i]);
    }

    /** Signed count variant over an IntIntHash (±1 per index). */
    private void addFeaturesToMap(IntIntHash map, boolean negate) {

        if (null != subfv1) {
            subfv1.addFeaturesToMap(map, negate);

            if (null != subfv2) {
                if (negate) subfv2.addFeaturesToMap(map, !negateSecondSubFV);
                else subfv2.addFeaturesToMap(map, negateSecondSubFV);
            }
        }

        if (negate) {
            for (int i = 0; i < size; i++) if (!map.adjustValue(getIndex(i), -1)) map.put(getIndex(i), -1);
        } else {
            for (int i = 0; i < size; i++) if (!map.adjustValue(getIndex(i), 1)) map.put(getIndex(i), 1);
        }
    }

    /** Signed value variant over a TIntDoubleHashMap (±m_value[i] per index). */
    private void addFeaturesToMap(TIntDoubleHashMap map, boolean negate) {

        if (null != subfv1) {
            subfv1.addFeaturesToMap(map, negate);

            if (null != subfv2) {
                if (negate) subfv2.addFeaturesToMap(map, !negateSecondSubFV);
                else subfv2.addFeaturesToMap(map, negateSecondSubFV);
            }
        }

        if (negate) {
            for (int i = 0; i < size; i++)
                if (!map.adjustValue(getIndex(i), -m_value[i])) map.put(getIndex(i), -m_value[i]);
        } else {
            for (int i = 0; i < size; i++)
                if (!map.adjustValue(getIndex(i), m_value[i])) map.put(getIndex(i), m_value[i]);
        }
    }

    @Override
    public final String toString() {
        StringBuilder sb = new StringBuilder();
        toString(sb);
        return sb.toString();
    }

    private final void toString(StringBuilder sb) {
        if (null != subfv1) {
            subfv1.toString(sb);

            if (null != subfv2)
                subfv2.toString(sb);
        }
        for (int i = 0; i < size; i++)
            sb.append(getIndex(i)).append('=').append(m_value[i]).append(' ');
    }

    /** Writes size, then the indices only (values are not serialized). */
    public void writeKeys(DataOutputStream dos) throws IOException {
        dos.writeInt(size);
        for (int i = 0; i < size; i++) {
            dos.writeInt(m_index[i]);
        }
    }

    /** Reads a vector written as (capacity, indices); capacity 0 → null. */
    final public static FVR read(DataInputStream dis) throws IOException {
        int cap = dis.readInt();
        if (cap == 0) return null;
        return new FVR(dis, cap);
    }

    /** Not supported for FVR — logs and returns 0. */
    @Override
    public double getScore() {
        System.out.println("not implemented");
        new Exception().printStackTrace();
        return 0;
    }

    /** Deep copy of the directly-stored entries (sub-vectors not copied). */
    @Override
    public IFV clone() {
        FVR f = new FVR(this.size);
        for (int i = 0; i < this.size; i++) {
            f.m_index[i] = m_index[i];
            f.m_value[i] = m_value[i];
        }
        f.size = this.size;
        return f;
    }
}

// ==== src/is2/data/IEncoder.java ====
package is2.data;

/**
 * @author Bernd Bohnet, 20.09.2009
 *
 * Maps a (category, value) string pair to its integer code.
 */
public interface IEncoder {

    /**
     * @param a category name (e.g. word, pos, label)
     * @param v value to look up
     * @return integer code, or -1 when unknown
     */
    public int getValue(String a, String v);
}

// ==== src/is2/data/IEncoderPlus.java ====
package is2.data;

import java.util.HashMap;

/**
 * @author Bernd Bohnet, 20.09.2009
 *
 * Encoder that can also register new values and expose its counts.
 */
public interface IEncoderPlus extends IEncoder {

    /** Marker value for "no entry". */
    final public static String NONE = "<None>";

    /**
     * Registers {@code substring} under category {@code spath}.
     *
     * @return the assigned integer code
     */
    public int register(String spath, String substring);

    /**
     * @return per-feature occurrence counts
     */
    public HashMap<String, Integer> getFeatureCounter();
}
// ==== src/is2/data/IFV.java ====
package is2.data;

/**
 * Base class of the feature-vector score accumulators (F2S, F2SF, FV, ...).
 */
public abstract class IFV {

    /** Accumulates the weight of feature index {@code i}. */
    public abstract void add(int i);

    /** Current accumulated score. */
    public abstract double getScore();

    /** Resets the accumulator. */
    public abstract void clear();

    @Override
    public abstract IFV clone();

    /**
     * Adds every positive feature code in {@code gvs}, offset by {@code l} and
     * mapped through {@code li}. Iteration stops at the first
     * Integer.MIN_VALUE entry, which acts as an end-of-list sentinel.
     */
    public void add(long[] gvs, Long2IntInterface li, int l) {
        for (final long gv : gvs) {
            if (gv == Integer.MIN_VALUE) {
                break;
            }
            if (gv > 0) {
                add(li.l2i(gv + l));
            }
        }
    }
}

// ==== src/is2/data/Instances.java ====
package is2.data;

import java.util.BitSet;

import is2.io.CONLLReader09;
import is2.util.DB;

/**
 * Column-oriented storage for a corpus of dependency-annotated sentences.
 * All string values are mapped to integer codes through {@link #m_encoder};
 * the first array index is always the sentence, the second the token.
 */
public class Instances {

    /** Encoder used to map strings to integer codes. */
    public IEncoder m_encoder;

    // number of sentences stored so far
    protected int size = 0;

    // allocated number of sentences
    protected int capacity;

    /** Word forms per sentence/token. */
    public int[][] forms;

    /** Predicted and gold lemmas. */
    public int[][] plemmas;
    public int[][] glemmas;

    /** Gold and predicted head positions. */
    public short[][] heads;
    public short[][] pheads;

    /** Gold and predicted dependency labels. */
    public short[][] labels;
    public short[][] plabels;

    /** Gold and predicted part-of-speech tags. */
    public short[][] gpos;
    public short[][] pposs;

    /** Morphological feature lists per token. */
    public short[][][] feats;

    /** Predicate lemma codes per sentence. */
    public int[][] predicat;

    /** Predicate sense codes per sentence. */
    public short[][] predicateId;

    /** Token position of each predicate. */
    public short[][] semposition;

    /** Argument label codes per sentence/predicate. */
    public short[][][] arg;

    /** Token position of each argument. */
    public short[][][] argposition;

    /** Per-sentence bit set of predicate positions. */
    public BitSet[] pfill;

    /** Gold and predicted whole-feature-string codes. */
    public short[][] gfeats;
    public short[][] pfeats;

    public Instances() {}

    // global statistics over setForm() lookups
    public static int m_unkown = 0;
    public static int m_count = 0;

    public static boolean m_report;
    public static boolean m_found = false;

    /** Stores the word form of token p in sentence i; tracks unknown words. */
    final public void setForm(int i, int p, String x) {
        forms[i][p] = m_encoder.getValue(PipeGen.WORD, x);
        if (forms[i][p] == -1) {
            if (m_report) System.out.println("unkwrd " + x);
            m_unkown++;
            m_found = true;
        }
        m_count++;
    }

    /** Stores the gold dependency label of token p in sentence i. */
    final public void setRel(int i, int p, String x) {
        labels[i][p] = (short) m_encoder.getValue(PipeGen.REL, x);
    }

    /** Stores the gold head p of token c in sentence i. */
    final public void setHead(int i, int c, int p) {
        heads[i][c] = (short) p;
    }

    /** Number of sentences stored. */
    final public int size() {
        return size;
    }

    public void setSize(int n) {
        size = n;
    }

    public void init(int ic, IEncoder mf) {
        init(ic, mf, -1);
    }

    /**
     * Allocates the per-sentence arrays for up to {@code ic} sentences.
     * The {@code version} parameter is currently unused.
     */
    public void init(int ic, IEncoder mf, int version) {
        capacity = ic;
        m_encoder = mf;

        forms = new int[capacity][];
        plemmas = new int[capacity][];
        glemmas = new int[capacity][];
        pposs = new short[capacity][];

        gpos = new short[capacity][];
        labels = new short[capacity][];
        heads = new short[capacity][];
        plabels = new short[capacity][];
        pheads = new short[capacity][];
        feats = new short[capacity][][];
        gfeats = new short[capacity][];
        pfeats = new short[capacity][];

        predicat = new int[ic][];
        predicateId = new short[ic][];
        semposition = new short[ic][];
        arg = new short[ic][][];
        argposition = new short[ic][][];

        pfill = new BitSet[ic];
    }

    /** Number of tokens in sentence i. */
    public int length(int i) {
        return forms[i].length;
    }

    /**
     * Allocates the token arrays for the next sentence (CoNLL-09 layout).
     *
     * @return the index of the newly created sentence
     */
    public int createInstance09(int length) {
        forms[size] = new int[length];
        plemmas[size] = new int[length];
        glemmas[size] = new int[length];

        pposs[size] = new short[length];
        gpos[size] = new short[length];

        labels[size] = new short[length];
        heads[size] = new short[length];

        this.pfill[size] = new BitSet(length);

        feats[size] = new short[length][];
        gfeats[size] = new short[length];
        pfeats[size] = new short[length];
        plabels[size] = new short[length];
        pheads[size] = new short[length];

        size++;
        return size - 1;
    }

    /** Stores the predicted POS tag of token p in sentence i. */
    public final void setPPoss(int i, int p, String x) {
        pposs[i][p] = (short) m_encoder.getValue(PipeGen.POS, x);
    }

    /** Stores the gold POS tag of token p in sentence i. */
    public final void setGPos(int i, int p, String x) {
        gpos[i][p] = (short) m_encoder.getValue(PipeGen.POS, x);
    }

    /** Stores the predicted lemma of token p in sentence i. */
    public void setLemma(int i, int p, String x) {
        plemmas[i][p] = m_encoder.getValue(PipeGen.WORD, x);
    }

    /** Stores the gold lemma of token p in sentence i. */
    public void setGLemma(int i, int p, String x) {
        glemmas[i][p] = m_encoder.getValue(PipeGen.WORD, x);
    }

    /** Stores the split morphological features of token p; null clears them. */
    public void setFeats(int i, int p, String[] fts) {
        if (fts == null) {
            feats[i][p] = null;
            return;
        }
        feats[i][p] = new short[fts.length];
        for (int k = 0; k < fts.length; k++) {
            feats[i][p][k] = (short) m_encoder.getValue(PipeGen.FEAT, fts[k]);
        }
    }

    /** Stores the gold whole-feature string of token p; null is a no-op. */
    public void setFeature(int i, int p, String feature) {
        if (feature == null) return;
        this.gfeats[i][p] = (short) m_encoder.getValue(PipeGen.FFEATS, feature);
    }

    /** Stores the predicted whole-feature string of token p; null is a no-op. */
    public void setPFeature(int i, int p, String feature) {
        if (feature == null) return;
        this.pfeats[i][p] = (short) m_encoder.getValue(PipeGen.FFEATS, feature);
    }

    /** Code of a word form, or -1 when unknown. */
    public int getWValue(String v) {
        return m_encoder.getValue(PipeGen.WORD, v);
    }

    /** Stores the predicted dependency label of token p in sentence i. */
    public final void setPRel(int i, int p, String x) {
        plabels[i][p] = (short) m_encoder.getValue(PipeGen.REL, x);
    }

    /** Stores the predicted head p of token c in sentence i. */
    public final void setPHead(int i, int c, int p) {
        pheads[i][c] = (short) p;
    }

    /**
     * Builds the semantic-role representation for sentence {@code inst} from
     * the raw sentence {@code it}. Predicates of the form "lemma.sense" are
     * split on the first dot; predicates without arguments are tolerated.
     *
     * @return true when a structural error was detected (currently the error
     *         cases are tolerated and only skipped, so this is always false)
     */
    public boolean createSem(int inst, SentenceData09 it) {

        boolean error = false;

        if (it.sem == null) return error;

        predicat[inst] = new int[it.sem.length];
        semposition[inst] = new short[it.sem.length];
        predicateId[inst] = new short[it.sem.length];

        arg[inst] = new short[it.sem.length][];
        argposition[inst] = new short[it.sem.length][];

        for (int i = 0; i < it.sem.length; i++) {

            String pred;
            short predSense = 0;
            final int dot = it.sem[i].indexOf('.');
            if (dot > 0) {
                // "lemma.sense" — encode both halves separately
                pred = it.sem[i].substring(0, dot);
                predSense = (short) m_encoder.getValue(PipeGen.SENSE,
                        it.sem[i].substring(dot + 1, it.sem[i].length()));
            } else {
                pred = it.sem[i];
                predSense = (short) m_encoder.getValue(PipeGen.SENSE, "");
            }

            predicat[inst][i] = m_encoder.getValue(PipeGen.PRED, pred);
            predicateId[inst][i] = predSense;
            semposition[inst][i] = (short) it.semposition[i];

            // no argument structure at all (can happen when no argument has a value)
            if (it.arg == null) continue;

            // trailing predicate(s) without an argument list
            if (it.arg.length <= i) continue;

            // a predicate without arguments happens from time to time
            if (it.arg[i] == null) continue;

            final int argCount = it.arg[i].length;
            arg[inst][i] = new short[argCount];
            argposition[inst][i] = new short[argCount];

            // encode each argument label and remember its token position
            for (int a = 0; a < argCount; a++) {
                arg[inst][i][a] = (short) m_encoder.getValue(PipeGen.ARG, it.arg[i][a]);
                argposition[inst][i][a] = (short) it.argposition[i][a];
            }
        }

        return error;
    }

    /** Number of predicates marked in sentence n. */
    public int predCount(int n) {
        return pfill[n].cardinality();
    }

    /**
     * Debug dump of sentence {@code pscnt}: one tab-separated line per token
     * with gold and predicted codes.
     */
    public String print(int pscnt) {
        StringBuilder s = new StringBuilder();
        for (int i = 0; i < this.length(pscnt); i++) {
            s.append(i + "\t" + forms[pscnt][i] + "\t" + this.glemmas[pscnt][i] + "\t" + this.plemmas[pscnt][i]
                    + "\t" + this.gpos[pscnt][i] + "\t"
                    + this.pposs[pscnt][i] + "\t" + this.gfeats[pscnt][i] + "\t"
                    + (this.feats[pscnt][i] != null && this.feats[pscnt][i].length > 0 ? this.feats[pscnt][i][0] : null)
                    + "\t l " + (labels[pscnt] != null && labels[pscnt].length > i ? labels[pscnt][i] : null) + "\t"
                    + "\t" + heads[pscnt][i] + "\t"
                    + (plabels[pscnt] != null && plabels[pscnt].length > i ? plabels[pscnt][i] : null)
                    + "\t" + this.predicat[pscnt][i] + "\n");
        }
        return s.toString();
    }

    /** Reduced debug dump of sentence {@code pscnt} (no gold/sem columns). */
    public String print1(int pscnt) {
        StringBuilder s = new StringBuilder();
        for (int i = 0; i < this.length(pscnt); i++) {
            s.append(i + "\t" + forms[pscnt][i] + "\t" + "\t" + this.plemmas[pscnt][i] + "\t"
                    + this.pposs[pscnt][i]
                    + "\t l " + (labels[pscnt] != null && labels[pscnt].length > i ? labels[pscnt][i] : null) + "\t"
                    + "\t" + heads[pscnt][i] + "\t"
                    + (plabels[pscnt] != null && plabels[pscnt].length > i ? plabels[pscnt][i] : null)
                    + "\n");
        }
        return s.toString();
    }
}
b/dependencyParser/basic/mate-tools/src/is2/data/InstancesTagger.java @@ -0,0 +1,77 @@ +/** + * + */ +package is2.data; + +import is2.data.IEncoder; +import is2.data.Instances; +import is2.data.SentenceData09; + + +/** + * @author Dr. Bernd Bohnet, 06.11.2010 + * + * + */ +public class InstancesTagger extends Instances { + + public short[][][] chars; + public int[][] formlc; + + public void init(int ic, IEncoder mf) { + super.init(ic, mf,9); + chars = new short[capacity][][]; + formlc = new int[capacity][]; + // System.out.println("create chars "+capacity ); + } + + public void fillChars(SentenceData09 instance, int i, int cend) { + chars[i] = new short[instance.length()][13]; + formlc[i] = new int[instance.length()]; + + + for(int k=0;k<instance.length();k++) { + chars[i][k][0]= (short) ( instance.forms[k].length()>0?m_encoder.getValue(PipeGen.CHAR, String.valueOf(instance.forms[k].charAt(0))):cend); + chars[i][k][1]= (short) ( instance.forms[k].length()>1?m_encoder.getValue(PipeGen.CHAR, String.valueOf(instance.forms[k].charAt(1))):cend);//m_encoder.getValue(PipeGen.CHAR, END); + chars[i][k][2]= (short) ( instance.forms[k].length()>2?m_encoder.getValue(PipeGen.CHAR, String.valueOf(instance.forms[k].charAt(2))):cend); + chars[i][k][3]= (short) ( instance.forms[k].length()>3?m_encoder.getValue(PipeGen.CHAR, String.valueOf(instance.forms[k].charAt(3))):cend); + chars[i][k][4]= (short) ( instance.forms[k].length()>4?m_encoder.getValue(PipeGen.CHAR, String.valueOf(instance.forms[k].charAt(4))):cend); + chars[i][k][5]= (short) ( instance.forms[k].length()>5?m_encoder.getValue(PipeGen.CHAR, String.valueOf(instance.forms[k].charAt(5))):cend); + + chars[i][k][6]= (short) ( instance.forms[k].length()>0?m_encoder.getValue(PipeGen.CHAR,String.valueOf(instance.forms[k].charAt(instance.forms[k].length()-1))):cend); + chars[i][k][7]= (short) ( 
instance.forms[k].length()>1?m_encoder.getValue(PipeGen.CHAR,String.valueOf(instance.forms[k].charAt(instance.forms[k].length()-2))):cend);//m_encoder.getValue(PipeGen.CHAR, END); + chars[i][k][8]= (short) ( instance.forms[k].length()>2?m_encoder.getValue(PipeGen.CHAR,String.valueOf(instance.forms[k].charAt(instance.forms[k].length()-3))):cend); + chars[i][k][9]= (short) ( instance.forms[k].length()>3?m_encoder.getValue(PipeGen.CHAR,String.valueOf(instance.forms[k].charAt(instance.forms[k].length()-4))):cend); + chars[i][k][10]= (short) ( instance.forms[k].length()>4?m_encoder.getValue(PipeGen.CHAR,String.valueOf(instance.forms[k].charAt(instance.forms[k].length()-5))):cend); + chars[i][k][11] = (short)instance.forms[k].length(); + chars[i][k][12] = (short) ( instance.forms[k].length()>0?m_encoder.getValue(PipeGen.CHAR, String.valueOf(instance.forms[k].charAt(0))):cend); + formlc[i][k] =m_encoder.getValue(PipeGen.WORD, instance.forms[k].toLowerCase()); + } + } + + public void fillChars(SentenceData09 instance, int i, String[] what,int cend) { + chars[i] = new short[instance.length()][13]; + formlc[i] = new int[instance.length()]; + + + for(int k=0;k<instance.length();k++) { + chars[i][k][0]= (short) m_encoder.getValue(PipeGen.CHAR, String.valueOf(what[k].charAt(0))); + chars[i][k][1]= (short) ( what[k].length()>1?m_encoder.getValue(PipeGen.CHAR, String.valueOf(what[k].charAt(1))):cend);//m_encoder.getValue(PipeGen.CHAR, END); + chars[i][k][2]= (short) ( what[k].length()>2?m_encoder.getValue(PipeGen.CHAR, String.valueOf(what[k].charAt(2))):cend); + chars[i][k][3]= (short) ( what[k].length()>3?m_encoder.getValue(PipeGen.CHAR, String.valueOf(what[k].charAt(3))):cend); + chars[i][k][4]= (short) ( what[k].length()>4?m_encoder.getValue(PipeGen.CHAR, String.valueOf(what[k].charAt(4))):cend); + chars[i][k][5]= (short) ( what[k].length()>5?m_encoder.getValue(PipeGen.CHAR, String.valueOf(what[k].charAt(5))):cend); + + chars[i][k][6]= (short) ( 
m_encoder.getValue(PipeGen.CHAR,String.valueOf(what[k].charAt(what[k].length()-1)))); + chars[i][k][7]= (short) ( what[k].length()>1?m_encoder.getValue(PipeGen.CHAR,String.valueOf(what[k].charAt(what[k].length()-2))):cend);//m_encoder.getValue(PipeGen.CHAR, END); + chars[i][k][8]= (short) ( what[k].length()>2?m_encoder.getValue(PipeGen.CHAR,String.valueOf(what[k].charAt(what[k].length()-3))):cend); + chars[i][k][9]= (short) ( what[k].length()>3?m_encoder.getValue(PipeGen.CHAR,String.valueOf(what[k].charAt(what[k].length()-4))):cend); + chars[i][k][10]= (short) ( what[k].length()>4?m_encoder.getValue(PipeGen.CHAR,String.valueOf(what[k].charAt(what[k].length()-5))):cend); + chars[i][k][11] = (short)what[k].length(); + formlc[i][k] =m_encoder.getValue(PipeGen.WORD, what[k].toLowerCase()); + } + } + + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/data/IntIntHash.java b/dependencyParser/basic/mate-tools/src/is2/data/IntIntHash.java new file mode 100644 index 0000000..4aec043 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/IntIntHash.java @@ -0,0 +1,270 @@ +package is2.data; + + + +import java.util.Arrays; + +final public class IntIntHash { + + + protected int _size; + protected int _free; + protected float _loadFactor; + public int _maxSize; + protected int _autoCompactRemovesRemaining; + protected float _autoCompactionFactor; + public int _set[]; + private int _values[]; + + + public IntIntHash() { + this(102877, 0.5F); + } + + + public IntIntHash(int initialCapacity, float loadFactor) { + _loadFactor = loadFactor; + _autoCompactionFactor = loadFactor; + setUp((int)Math.ceil(initialCapacity / loadFactor)); + } + + + public int size() { return _size;} + + public void ensureCapacity(int desiredCapacity) { + + if(desiredCapacity > _maxSize - size()) { + rehash(PrimeFinder.nextPrime((int)Math.ceil((desiredCapacity + size()) / _loadFactor) + 1)); + computeMaxSize(capacity()); + } + } + + public void compact() { + 
rehash(PrimeFinder.nextPrime((int)Math.ceil(size() / _loadFactor) + 1)); + computeMaxSize(capacity()); + if(_autoCompactionFactor != 0.0F) computeNextAutoCompactionAmount(size()); + } + + public void setAutoCompactionFactor(float factor) { + if(factor < 0.0F) { + throw new IllegalArgumentException((new StringBuilder()).append("Factor must be >= 0: ").append(factor).toString()); + } else + { + _autoCompactionFactor = factor; + return; + } + } + + public float getAutoCompactionFactor() { return _autoCompactionFactor; } + + + private void computeMaxSize(int capacity) + { + _maxSize = Math.min(capacity - 1, (int)Math.floor(capacity * _loadFactor)); + _free = capacity - _size; + } + + private void computeNextAutoCompactionAmount(int size) + { + if(_autoCompactionFactor != 0.0F) + _autoCompactRemovesRemaining = Math.round(size * _autoCompactionFactor); + } + + protected final void postInsertHook(boolean usedFreeSlot) + { + if(usedFreeSlot) _free--; + if(++_size > _maxSize || _free == 0) { + int newCapacity = _size <= _maxSize ? 
capacity() : PrimeFinder.nextPrime(capacity() << 1); + rehash(newCapacity); + computeMaxSize(capacity()); + } + } + + protected int calculateGrownCapacity() { return capacity() << 1; } + + protected int capacity() { return _values.length; } + + public boolean contains(int val) { return index(val) >= 0;} + + private int index(int v) { + + int length = _set.length; + int index = Math.abs((computeHashCode(v) /*& 2147483647*/ ) % length); + + while(true) { + // first + long l =_set[index]; + if (l == 0) { + // good++; + return -1; + } + // second + if (l == v) { + return index; + } + if(--index < 0) index += length; + } + //return -1; + } + + protected int insertionIndex(long val) + { + int length = _set.length; + int index = Math.abs((computeHashCode(val) /*& 2147483647*/ ) % length); + while(true) { + if(_set[index] == 0) return index; + if(_set[index] == val) return -index - 1; + if(--index < 0) index += length; + + } + } + + public int computeHashCode(long value) + { + return (int)(( value ^ (value&0xffffffff00000000L) >>> 32 ) *31);//0x811c9dc5 ^ // 29 + } + + + + + + + protected int setUp(int initialCapacity) + { + int capacity = PrimeFinder.nextPrime(initialCapacity); + computeMaxSize(capacity); + computeNextAutoCompactionAmount(initialCapacity); + _set = new int[capacity]; + _values = new int[capacity]; + return capacity; + } + + public void put(int key, int value) + { + int index = insertionIndex(key); + doPut(key, value, index); + } + private void doPut(int key, int value, int index) + { + boolean isNewMapping = true; + if(index < 0) + { + index = -index - 1; + isNewMapping = false; + } + _set[index] = key; + _values[index] = value; + if(isNewMapping) postInsertHook(true); + + } + + protected void rehash(int newCapacity) + { + int oldCapacity = _set.length; + int oldKeys[] = _set; + int oldVals[] = _values; + _set = new int[newCapacity]; + _values = new int[newCapacity]; + int i = oldCapacity; + + while(true){ + if(i-- <= 0) break; + if(oldVals[i] != 0) { + 
int o = oldKeys[i]; + int index = insertionIndex(o); + _set[index] = o; + _values[index] = oldVals[i]; + } + } + } + + int index =0; + + + public int get(int key) + { + int index = index(key); + return index >= 0 ? _values[index] : 0; + } + + + public void clear() + { + _size = 0; + _free = capacity(); + Arrays.fill(_set, 0, _set.length, 0); + // Arrays.fill(_values, 0, _values.length, 0); + } + + public int remove(int key) + { + int prev = 0; + int index = index(key); + if(index >= 0) + { + prev = _values[index]; + _values[index] = 0; + _set[index] = 0; + _size--; + if(_autoCompactionFactor != 0.0F) { + _autoCompactRemovesRemaining--; + if( _autoCompactRemovesRemaining <= 0) compact(); + } + } + return prev; + } + + + public int[] getValues() + { + int vals[] = new int[size()]; + int v[] = _values; + int i = v.length; + int j = 0; + do + { + if(i-- <= 0) break; + if(v[i] != 0) vals[j++] = v[i]; + } while(true); + return vals; + } + + public int[] keys() + { + int keys[] = new int[size()]; + int k[] = _set; + // byte states[] = _states; + int i = k.length; + int j = 0; + do + { + if(i-- <= 0) + break; + if(k[i] != 0) + keys[j++] = k[i]; + } while(true); + return keys; + } + + + /** + * @param index2 + * @param i + * @return + */ + public boolean adjustValue(int key, int i) { + int index = index(key); + if (index >= 0){ + _values[index] +=i; + return true; + } + return false; + } + + + + + + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/data/Long2Int.java b/dependencyParser/basic/mate-tools/src/is2/data/Long2Int.java new file mode 100755 index 0000000..e505cb0 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/Long2Int.java @@ -0,0 +1,113 @@ +package is2.data; + + + +/** + * @author Bernd Bohnet, 01.09.2009 + * + * Maps for the Hash Kernel the long values to the int values. 
+ */ +final public class Long2Int implements Long2IntInterface { + + + public Long2Int() { + size=115911564; + } + + + public Long2Int(int s) { + size=s; + } + + public static void main(String args[]) { + + long l =123456; + long l2 =1010119; + System.out.println("l \t"+l+"\t"+printBits(l)); + + long x =100000000; + System.out.println("1m\t"+l2+"\t"+printBits(x)+"\t"+x); + + System.out.println("l2\t"+l2+"\t"+printBits(l)); + + System.out.println("l2*l\t"+l2+"\t"+printBits(l*l2)+" \t "+l*l2); + + System.out.println("l2*l*l2\t"+l2+"\t"+printBits(l*l2*l2)+" \t "+l*l2*l2); + + System.out.println("l2*l*l2\t"+l2+"\t"+printBits(l*l2*l2*l2)+" \t "+l*l2*l2*l2); + + + System.out.println("l2*l*l2\t"+l2+"\t"+printBits((l*l2)%0xfffff)+" \t "+l*l2*l2*l2+"\t "+0xfffff); + System.out.println("l2*l*l2\t"+l2+"\t"+printBits((l*l2)&0xfffffff)+" \t "+l*l2*l2*l2); + } + + + /** Integer counter for long2int */ + final private int size; //0x03ffffff //0x07ffffff + + + /* (non-Javadoc) + * @see is2.sp09k9992.Long2IntIterface#size() + */ + public int size() {return size;} + + /* (non-Javadoc) + * @see is2.sp09k9992.Long2IntIterface#start() + * has no meaning for this implementation + */ + final public void start() {} + + + /* (non-Javadoc) + * @see is2.sp09k9992.Long2IntIterface#l2i(long) + */ + final public int l2i(long l) { + if (l<0) return -1; + + // this works well LAS 88.138 + // int r= (int)(( l ^ (l&0xffffffff00000000L) >>> 29 ));//0x811c9dc5 ^ // 29 + // return Math.abs(r % size); + // this works a bit better and good with 0x03ffffff + // + /* + long r= l;//26 + l = (l>>12)&0xfffffffffffff000L; + r ^= l;//38 + l = (l>>11)&0xffffffffffffc000L; + r ^= l;//49 + l = (l>>9)& 0xffffffffffff0000L; //53 + r ^= l;//58 + l = (l>>7)&0xfffffffffffc0000L; //62 + r ^=l;//65 + int x = (int)r; + x = x % size; + // return x >= 0 ? 
x : -x ;// Math.abs(r % size); + + */ + // 26 0x03ffffff + // together with 0x07ffffff 27 88.372 + long r= l;// 27 + l = (l>>13)&0xffffffffffffe000L; + r ^= l; // 40 + l = (l>>11)&0xffffffffffff0000L; + r ^= l; // 51 + l = (l>>9)& 0xfffffffffffc0000L; //53 + r ^= l; // 60 + l = (l>>7)& 0xfffffffffff00000L; //62 + r ^=l; //67 + int x = ((int)r) % size; + + return x >= 0 ? x : -x ; + } + + static public StringBuffer printBits(long out) { + StringBuffer s = new StringBuffer(); + + for(int k=0;k<65;k++) { + s.append((out & 1)==1?"1":"0"); + out >>=1; + } + s.reverse(); + return s; + } +} diff --git a/dependencyParser/basic/mate-tools/src/is2/data/Long2IntExact.java b/dependencyParser/basic/mate-tools/src/is2/data/Long2IntExact.java new file mode 100644 index 0000000..62f6375 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/Long2IntExact.java @@ -0,0 +1,60 @@ +package is2.data; + + + +/** + * @author Bernd Bohnet, 01.09.2009 + * + * Maps for the Hash Kernel the long values to the int values. 
+ */ +final public class Long2IntExact implements Long2IntInterface { + + static gnu.trove.TLongIntHashMap mapt = new gnu.trove.TLongIntHashMap(); + + static int cnt=0; + + + + public Long2IntExact() { + size=115911564; + } + + + public Long2IntExact(int s) { + size=s; + } + + + /** Integer counter for long2int */ + final private int size; //0x03ffffff //0x07ffffff + + + /* (non-Javadoc) + * @see is2.sp09k9992.Long2IntIterface#size() + */ + public int size() {return size;} + + /* (non-Javadoc) + * @see is2.sp09k9992.Long2IntIterface#start() + * has no meaning for this implementation + */ + final public void start() {} + + + /* (non-Javadoc) + * @see is2.sp09k9992.Long2IntIterface#l2i(long) + */ + final public int l2i(long l) { + if (l<0) return -1; + + int i = mapt.get(l); + if (i!=0) return i; + + if (i==0 && cnt<size-1) { + cnt++; + mapt.put(l, cnt); + return cnt; + } + return -1; + } +} diff --git a/dependencyParser/basic/mate-tools/src/is2/data/Long2IntInterface.java b/dependencyParser/basic/mate-tools/src/is2/data/Long2IntInterface.java new file mode 100755 index 0000000..a6cba63 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/Long2IntInterface.java @@ -0,0 +1,15 @@ +package is2.data; + +public interface Long2IntInterface { + + public abstract int size(); + + + /** + * Maps a long to a integer value. This is very useful to save memory for sparse data long values + * @param l + * @return the integer + */ + public abstract int l2i(long l); + +} \ No newline at end of file diff --git a/dependencyParser/basic/mate-tools/src/is2/data/Long2IntQuick.java b/dependencyParser/basic/mate-tools/src/is2/data/Long2IntQuick.java new file mode 100644 index 0000000..adbe57d --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/Long2IntQuick.java @@ -0,0 +1,47 @@ +package is2.data; + + + +/** + * @author Bernd Bohnet, 01.09.2009 + * + * Maps for the Hash Kernel the long values to the int values. 
+ */ +final public class Long2IntQuick implements Long2IntInterface { + + + /** Integer counter for long2int */ + final private int size; + + public Long2IntQuick() { + size=0x07ffffff; + } + + + public Long2IntQuick(int s) { + size=s; + } + + + /* (non-Javadoc) + * @see is2.sp09k9992.Long2IntIterface#size() + */ + public int size() {return size;} + + /* (non-Javadoc) + * @see is2.sp09k9992.Long2IntIterface#start() + * has no meaning for this implementation + */ + + + /* (non-Javadoc) + * @see is2.sp09k9992.Long2IntIterface#l2i(long) + */ + final public int l2i(long r) { + long l = (r>>16)&0xfffffffffffff000L; + r ^= l; + r ^= l = (l>>12)&0xffffffffffff0000L; + r ^= l = (l>>8)& 0xfffffffffffc0000L; + return (int)(r % size); + } +} diff --git a/dependencyParser/basic/mate-tools/src/is2/data/MFB.java b/dependencyParser/basic/mate-tools/src/is2/data/MFB.java new file mode 100755 index 0000000..04c36ae --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/MFB.java @@ -0,0 +1,256 @@ +package is2.data; + + +import is2.util.DB; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map.Entry; + +/** + * Map Features, do not map long to integer + * + * @author Bernd Bohnet, 20.09.2009 + */ + +final public class MFB implements IEncoderPlus { + + /** The features and its values */ + static private final HashMap<String,HashMap<String,Integer>> m_featureSets = new HashMap<String,HashMap<String,Integer>>(); + + /** The feature class and the number of values */ + static private final HashMap<String,Integer> m_featureCounters = new HashMap<String,Integer>(); + + /** The number of bits needed to encode a feature */ + static final HashMap<String,Integer> m_featureBits = new HashMap<String,Integer>(); + + /** Integer counter for long2int */ + static private int count=0; + + /** Stop growing */ + public boolean stop=false; + + final public static String NONE="<None>"; + + + + + + + + 
public MFB () {} + + + public int size() {return count;} + + + + /** + * Register an attribute class, if it not exists and add a possible value + * @param type + * @param type2 + */ + final public int register(String a, String v) { + + synchronized(m_featureCounters) { + + HashMap<String,Integer> fs = getFeatureSet().get(a); + if (fs==null) { + fs = new HashMap<String,Integer>(); + getFeatureSet().put(a, fs); + fs.put(NONE, 0); + getFeatureCounter().put(a, 1); + } + + Integer i = fs.get(v); + if (i==null) { + Integer c = getFeatureCounter().get(a); + fs.put(v, c); + c++; + getFeatureCounter().put(a,c); + return c-1; + } else return i; + } + } + + /** + * Calculates the number of bits needed to encode a feature + */ + public void calculateBits() { + + int total=0; + for(Entry<String,Integer> e : getFeatureCounter().entrySet() ){ + int bits =(int)Math.ceil((Math.log(e.getValue()+1)/Math.log(2))); + m_featureBits.put(e.getKey(), bits); + total+=bits; + // System.out.println(" "+e.getKey()+" bits "+bits+" number "+(e.getValue()+1)); + } + +// System.out.println("total number of needed bits "+total); + } + + + + public String toString() { + + StringBuffer content = new StringBuffer(); + for(Entry<String,Integer> e : getFeatureCounter().entrySet() ){ + content.append(e.getKey()+" "+e.getValue()); + content.append(':'); + // HashMap<String,Integer> vs = getFeatureSet().get(e.getKey()); + content.append(getFeatureBits(e.getKey())); + + /*if (vs.size()<120) + for(Entry<String,Integer> e2 : vs.entrySet()) { + content.append(e2.getKey()+" ("+e2.getValue()+") "); + }*/ + content.append('\n'); + + } + return content.toString(); + } + + + + static final public short getFeatureBits(String a) { + if(m_featureBits.get(a)==null) return 0; + return (short)m_featureBits.get(a).intValue(); + } + + + + /** + * Get the integer place holder of the string value v of the type a + * + * @param t the type + * @param v the value + * @return the integer place holder of v + */ + final public int 
getValue(String t, String v) { + + if (m_featureSets.get(t)==null) return -1; + Integer vi = m_featureSets.get(t).get(v); + if (vi==null) return -1; //stop && + return vi.intValue(); + } + + /** + * Static version of getValue + * @see getValue + */ + static final public int getValueS(String a, String v) { + + if (m_featureSets.get(a)==null) return -1; + Integer vi = m_featureSets.get(a).get(v); + if (vi==null) return -1; //stop && + return vi.intValue(); + } + + public int hasValue(String a, String v) { + + Integer vi = m_featureSets.get(a).get(v); + if (vi==null) return -1; + return vi.intValue(); + } + + + public static String printBits(int k) { + StringBuffer s = new StringBuffer(); + for(int i =0;i<31;i++) { + s.append((k&0x00000001)==1?'1':'0'); + k=k>>1; + + } + s.reverse(); + return s.toString(); + } + + + + + + + + /** + * Maps a long to a integer value. This is very useful to save memory for sparse data long values + * @param l + * @return the integer + */ + static public int misses = 0; + static public int good = 0; + + + + + /** + * Write the data + * @param dos + * @throws IOException + */ + static public void writeData(DataOutputStream dos) throws IOException { + dos.writeInt(getFeatureSet().size()); + // DB.println("write"+getFeatureSet().size()); + for(Entry<String, HashMap<String,Integer>> e : getFeatureSet().entrySet()) { + dos.writeUTF(e.getKey()); + dos.writeInt(e.getValue().size()); + + for(Entry<String,Integer> e2 : e.getValue().entrySet()) { + + if(e2.getKey()==null) DB.println("key "+e2.getKey()+" value "+e2.getValue()+" e -key "+e.getKey()); + dos.writeUTF(e2.getKey()); + dos.writeInt(e2.getValue()); + + } + + } + } + public void read(DataInputStream din) throws IOException { + + int size = din.readInt(); + for(int i=0; i<size;i++) { + String k = din.readUTF(); + int size2 = din.readInt(); + + HashMap<String,Integer> h = new HashMap<String,Integer>(); + getFeatureSet().put(k,h); + for(int j = 0;j<size2;j++) { + h.put(din.readUTF(), 
din.readInt()); + } + getFeatureCounter().put(k, size2); + } + + count =size; + // stop(); + calculateBits(); + } + + + /** + * Clear the data + */ + static public void clearData() { + getFeatureSet().clear(); + m_featureBits.clear(); + getFeatureSet().clear(); + } + + public HashMap<String,Integer> getFeatureCounter() { + return m_featureCounters; + } + + static public HashMap<String,HashMap<String,Integer>> getFeatureSet() { + return m_featureSets; + } + + static public String[] reverse(HashMap<String,Integer> v){ + String[] set = new String[v.size()]; + for(Entry<String,Integer> e : v.entrySet()) { + set[e.getValue()]=e.getKey(); + } + return set; + } + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/data/MFC.java b/dependencyParser/basic/mate-tools/src/is2/data/MFC.java new file mode 100644 index 0000000..bb1f27a --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/MFC.java @@ -0,0 +1,246 @@ +package is2.data; + + +import is2.util.DB; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map.Entry; + +/** + * Map Features, do not map long to integer + * + * @author Bernd Bohnet, 17.09.2011 + */ + +final public class MFC implements IEncoderPlus { + + /** The features and its values */ + private final HashMap<String,HashMap<String,Integer>> m_featureSets = new HashMap<String,HashMap<String,Integer>>(); + + /** The feature class and the number of values */ + private final HashMap<String,Integer> m_featureCounters = new HashMap<String,Integer>(); + + /** The number of bits needed to encode a feature */ + final HashMap<String,Integer> m_featureBits = new HashMap<String,Integer>(); + + /** Integer counter for long2int */ + private int count=0; + + + public MFC () {} + + + public int size() {return count;} + + + + /** + * Register an attribute class, if it not exists and add a possible value + * @param type + * @param type2 + */ + final public int 
register(String a, String v) { + + synchronized(m_featureCounters) { + + HashMap<String,Integer> fs = getFeatureSet().get(a); + if (fs==null) { + fs = new HashMap<String,Integer>(); + getFeatureSet().put(a, fs); + fs.put(NONE, 0); + getFeatureCounter().put(a, 1); + } + + Integer i = fs.get(v); + if (i==null) { + Integer c = getFeatureCounter().get(a); + fs.put(v, c); + c++; + getFeatureCounter().put(a,c); + return c-1; + } else return i; + } + } + + /** + * Calculates the number of bits needed to encode a feature + */ + public void calculateBits() { + + int total=0; + for(Entry<String,Integer> e : getFeatureCounter().entrySet() ){ + int bits =(int)Math.ceil((Math.log(e.getValue()+1)/Math.log(2))); + m_featureBits.put(e.getKey(), bits); + total+=bits; + // System.out.println(" "+e.getKey()+" bits "+bits+" number "+(e.getValue()+1)); + } + +// System.out.println("total number of needed bits "+total); + } + + + + public String toString() { + + StringBuffer content = new StringBuffer(); + for(Entry<String,Integer> e : getFeatureCounter().entrySet() ){ + content.append(e.getKey()+" "+e.getValue()); + content.append(':'); + // HashMap<String,Integer> vs = getFeatureSet().get(e.getKey()); + content.append(getFeatureBits(e.getKey())); + + /*if (vs.size()<120) + for(Entry<String,Integer> e2 : vs.entrySet()) { + content.append(e2.getKey()+" ("+e2.getValue()+") "); + }*/ + content.append('\n'); + + } + return content.toString(); + } + + + + final public short getFeatureBits(String a) { + if(m_featureBits.get(a)==null) return 0; + return (short)m_featureBits.get(a).intValue(); + } + + + + /** + * Get the integer place holder of the string value v of the type a + * + * @param t the type + * @param v the value + * @return the integer place holder of v + */ + final public int getValue(String t, String v) { + + if (m_featureSets.get(t)==null) return -1; + Integer vi = m_featureSets.get(t).get(v); + if (vi==null) return -1; //stop && + return vi.intValue(); + } + + /** + * Static 
version of getValue + * @see getValue + */ + final public int getValueS(String a, String v) { + + if (m_featureSets.get(a)==null) return -1; + Integer vi = m_featureSets.get(a).get(v); + if (vi==null) return -1; //stop && + return vi.intValue(); + } + + public int hasValue(String a, String v) { + + Integer vi = m_featureSets.get(a).get(v); + if (vi==null) return -1; + return vi.intValue(); + } + + + public static String printBits(int k) { + StringBuffer s = new StringBuffer(); + for(int i =0;i<31;i++) { + s.append((k&0x00000001)==1?'1':'0'); + k=k>>1; + + } + s.reverse(); + return s.toString(); + } + + + + + + + + /** + * Maps a long to a integer value. This is very useful to save memory for sparse data long values + * @param l + * @return the integer + */ + static public int misses = 0; + static public int good = 0; + + + + + /** + * Write the data + * @param dos + * @throws IOException + */ + public void writeData(DataOutputStream dos) throws IOException { + dos.writeInt(getFeatureSet().size()); + // DB.println("write"+getFeatureSet().size()); + for(Entry<String, HashMap<String,Integer>> e : getFeatureSet().entrySet()) { + dos.writeUTF(e.getKey()); + dos.writeInt(e.getValue().size()); + + for(Entry<String,Integer> e2 : e.getValue().entrySet()) { + + if(e2.getKey()==null) DB.println("key "+e2.getKey()+" value "+e2.getValue()+" e -key "+e.getKey()); + dos.writeUTF(e2.getKey()); + dos.writeInt(e2.getValue()); + + } + + } + } + public void read(DataInputStream din) throws IOException { + + int size = din.readInt(); + for(int i=0; i<size;i++) { + String k = din.readUTF(); + int size2 = din.readInt(); + + HashMap<String,Integer> h = new HashMap<String,Integer>(); + getFeatureSet().put(k,h); + for(int j = 0;j<size2;j++) { + h.put(din.readUTF(), din.readInt()); + } + getFeatureCounter().put(k, size2); + } + + count =size; + // stop(); + calculateBits(); + } + + + /** + * Clear the data + */ + public void clearData() { + getFeatureSet().clear(); + m_featureBits.clear(); + 
getFeatureSet().clear(); + } + + public HashMap<String,Integer> getFeatureCounter() { + return m_featureCounters; + } + + public HashMap<String,HashMap<String,Integer>> getFeatureSet() { + return m_featureSets; + } + + public String[] reverse(HashMap<String,Integer> v){ + String[] set = new String[v.size()]; + for(Entry<String,Integer> e : v.entrySet()) { + set[e.getValue()]=e.getKey(); + } + return set; + } + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/data/MFO.java b/dependencyParser/basic/mate-tools/src/is2/data/MFO.java new file mode 100755 index 0000000..ff4d43e --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/MFO.java @@ -0,0 +1,386 @@ +package is2.data; + + +import is2.util.DB; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map.Entry; + +/** + * Map Features, do not map long to integer + * + * @author Bernd Bohnet, 20.09.2009 + */ + +final public class MFO implements IEncoderPlus { + + /** The features and its values */ + static private final HashMap<String,HashMap<String,Integer>> m_featureSets = new HashMap<String,HashMap<String,Integer>>(); + + /** The feature class and the number of values */ + static private final HashMap<String,Integer> m_featureCounters = new HashMap<String,Integer>(); + + /** The number of bits needed to encode a feature */ + static final HashMap<String,Integer> m_featureBits = new HashMap<String,Integer>(); + + final public static String NONE="<None>"; + + final public static class Data4 { + public int shift; + public short a0,a1,a2,a3,a4,a5,a6,a7,a8,a9; + public int v0,v1,v2,v3,v4,v5,v6,v7,v8,v9; + + final public long calcs(int b, long v, long l) { + if (l<0) return l; + l |= v<<shift; + shift +=b; + return l; + } + + } + + public MFO () {} + + + + + + /** + * Register an attribute class, if it not exists and add a possible value + * @param type + * @param type2 + */ + final public int register(String a, 
String v) { + + HashMap<String,Integer> fs = getFeatureSet().get(a); + if (fs==null) { + fs = new HashMap<String,Integer>(); + getFeatureSet().put(a, fs); + fs.put(NONE, 0); + getFeatureCounter().put(a, 1); + } + Integer c = getFeatureCounter().get(a); + + Integer i = fs.get(v); + if (i==null) { + fs.put(v, c); + c++; + getFeatureCounter().put(a,c); + return c-1; + } else return i; + } + + /** + * Calculates the number of bits needed to encode a feature + */ + public void calculateBits() { + + int total=0; + for(Entry<String,Integer> e : getFeatureCounter().entrySet() ){ + int bits =(int)Math.ceil((Math.log(e.getValue()+1)/Math.log(2))); + m_featureBits.put(e.getKey(), bits); + total+=bits; + // System.out.println(" "+e.getKey()+" bits "+bits+" number "+(e.getValue()+1)); + } + + // System.out.println("total number of needed bits "+total); + } + + + + @Override + public String toString() { + + StringBuffer content = new StringBuffer(); + for(Entry<String,Integer> e : getFeatureCounter().entrySet() ){ + content.append(e.getKey()+" "+e.getValue()); + content.append(':'); + // HashMap<String,Integer> vs = getFeatureSet().get(e.getKey()); + content.append(getFeatureBits(e.getKey())); + + /*if (vs.size()<120) + for(Entry<String,Integer> e2 : vs.entrySet()) { + content.append(e2.getKey()+" ("+e2.getValue()+") "); + }*/ + content.append('\n'); + + } + return content.toString(); + } + + + static final public long calcs(Data4 d,int b, long v, long l) { + if (l<0) return l; + l |= v<<d.shift; + d.shift +=b; + return l; + } + + + static final public short getFeatureBits(String a) { + return (short)m_featureBits.get(a).intValue(); + } + + + + /** + * Get the integer place holder of the string value v of the type a + * + * @param t the type + * @param v the value + * @return the integer place holder of v + */ + final public int getValue(String t, String v) { + + if (m_featureSets.get(t)==null) return -1; + Integer vi = m_featureSets.get(t).get(v); + if (vi==null) return -1; 
//stop && + return vi.intValue(); + } + + /** + * Static version of getValue + * @see getValue + */ + static final public int getValueS(String a, String v) { + + if (m_featureSets.get(a)==null) return -1; + Integer vi = m_featureSets.get(a).get(v); + if (vi==null) return -1; //stop && + return vi.intValue(); + } + + public int hasValue(String a, String v) { + + Integer vi = m_featureSets.get(a).get(v); + if (vi==null) return -1; + return vi.intValue(); + } + + + + + final public long calc2(Data4 d) { + + if (d.v0<0||d.v1<0) return -1; + // if (d.v1<0||d.v2<0) return -1; + + long l = d.v0; + short shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + // l |= (long)d.v2<<shift; + d.shift=shift; + + //d.shift=; + return l; + } + + + + final public long calc3(Data4 d) { + + if (d.v0<0||d.v1<0||d.v2<0) return -1; + // if (d.v1<0||d.v2<0) return -1; + + long l = d.v0; + short shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + l |= (long)d.v2<<shift; + d.shift=shift + d.a2; + + //d.shift=; + return l; + } + + + final public long calc4(Data4 d) { + if (d.v0<0||d.v1<0||d.v2<0||d.v3<0) return -1; + + long l = d.v0; + int shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + l |= (long)d.v2<<shift; + shift +=d.a2; + l |= (long)d.v3<<shift; + d.shift= shift +d.a3; + + return l; + } + + + + final public long calc5(Data4 d) { + + if (d.v0<0||d.v1<0||d.v2<0||d.v3<0||d.v4<0) return -1; + + long l = d.v0; + int shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + l |= (long)d.v2<<shift; + shift +=d.a2; + l |= (long)d.v3<<shift; + shift +=d.a3; + l |= (long)d.v4<<shift; + d.shift =shift+d.a4; + + return l; + } + + + static final public long calc6(Data4 d) { + + if (d.v0<0||d.v1<0||d.v2<0||d.v3<0||d.v4<0||d.v5<0) return -1; + + long l = d.v0; + int shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + l |= (long)d.v2<<shift; + shift +=d.a2; + l |= (long)d.v3<<shift; + shift +=d.a3; + l |= (long)d.v4<<shift; + shift +=d.a4; + l |= (long)d.v5<<shift; + d.shift 
=shift+d.a5; + + return l; + } + + final public long calc7(Data4 d) { + + if (d.v0<0||d.v1<0||d.v2<0||d.v3<0||d.v4<0||d.v5<0||d.v6<0) return -1; + + long l = d.v0; + int shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + l |= (long)d.v2<<shift; + shift +=d.a2; + l |= (long)d.v3<<shift; + shift +=d.a3; + l |= (long)d.v4<<shift; + shift +=d.a4; + l |= (long)d.v5<<shift; + shift +=d.a5; + l |= (long)d.v6<<shift; + d.shift =shift+d.a6; + + return l; + } + + + final public long calc8(Data4 d) { + + if (d.v0<0||d.v1<0||d.v2<0||d.v3<0||d.v4<0||d.v5<0||d.v6<0||d.v7<0) return -1; + + long l = d.v0; + int shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + l |= (long)d.v2<<shift; + shift +=d.a2; + l |= (long)d.v3<<shift; + shift +=d.a3; + l |= (long)d.v4<<shift; + shift +=d.a4; + l |= (long)d.v5<<shift; + shift +=d.a5; + l |= (long)d.v6<<shift; + shift +=d.a6; + l |= (long)d.v7<<shift; + d.shift =shift+d.a7; + + return l; + } + + + + + + + + /** + * Maps a long to a integer value. This is very useful to save memory for sparse data long values + * @param node + * @return the integer + */ + static public int misses = 0; + static public int good = 0; + + + + + /** + * Write the data + * @param dos + * @throws IOException + */ + public void writeData(DataOutputStream dos) throws IOException { + dos.writeInt(getFeatureSet().size()); + for(Entry<String, HashMap<String,Integer>> e : getFeatureSet().entrySet()) { + dos.writeUTF(e.getKey()); + dos.writeInt(e.getValue().size()); + + for(Entry<String,Integer> e2 : e.getValue().entrySet()) { + + if(e2.getKey()==null) DB.println("key "+e2.getKey()+" value "+e2.getValue()+" e -key "+e.getKey()); + dos.writeUTF(e2.getKey()); + dos.writeInt(e2.getValue()); + + } + + } + } + public void read(DataInputStream din) throws IOException { + + int size = din.readInt(); + for(int i=0; i<size;i++) { + String k = din.readUTF(); + int size2 = din.readInt(); + + HashMap<String,Integer> h = new HashMap<String,Integer>(); + 
getFeatureSet().put(k,h); + for(int j = 0;j<size2;j++) { + h.put(din.readUTF(), din.readInt()); + } + getFeatureCounter().put(k, size2); + } + + calculateBits(); + } + + + /** + * Clear the data + */ + static public void clearData() { + getFeatureSet().clear(); + m_featureBits.clear(); + getFeatureSet().clear(); + } + + public HashMap<String,Integer> getFeatureCounter() { + return m_featureCounters; + } + + static public HashMap<String,HashMap<String,Integer>> getFeatureSet() { + return m_featureSets; + } + + static public String[] reverse(HashMap<String,Integer> v){ + String[] set = new String[v.size()]; + for(Entry<String,Integer> e : v.entrySet()) { + set[e.getValue()]=e.getKey(); + } + return set; + } + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/data/Open.java b/dependencyParser/basic/mate-tools/src/is2/data/Open.java new file mode 100755 index 0000000..ba75fe3 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/Open.java @@ -0,0 +1,37 @@ +package is2.data; + + + +final public class Open { + + public float p; + short s, e, label; + byte dir; + + Closed left; + Closed right; + + public Open(short s, short t, short dir, short label,Closed left, Closed right, float p) { + this.s = s; + this.e = t; + this.label = label; + this.dir = (byte)dir; + this.left =left; + this.right=right; + this.p=p; + } + + + void create(Parse parse) { + if (dir == 0) { + parse.heads[s] = e; + if (label != -1) parse.labels[s] = label; + } else { + parse.heads[e] = s; + if (label != -1) parse.labels[e] = label; + } + if (left != null) left.create(parse); + if (right != null) right.create(parse); + } + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/data/PSTree.java b/dependencyParser/basic/mate-tools/src/is2/data/PSTree.java new file mode 100644 index 0000000..30c1364 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/PSTree.java @@ -0,0 +1,711 @@ +/** + * + */ +package is2.data; + +import is2.util.DB; + +import java.util.ArrayList; 
+import java.util.Collections; +import java.util.Stack; + +/** + * @author Dr. Bernd Bohnet, 17.01.2011 + * + * + */ +public class PSTree { + + int wordCount =0; + public String entries[]; + public String lemmas[]; + public int head[]; + public String pos[]; + public int[] ok; + public int non; + public int terminalCount; + public String[] morph; + + public int[] forms; + public int[] phrases; + public int[][] psfeats; + public int[] ppos; + + + /** + * @param d + */ + public PSTree(SentenceData09 d) { + create(d.length()-1,d.length()*20); + for(int i=1;i<d.length();i++) { + entries[i-1]=d.forms[i]; + pos[i-1]=d.ppos[i]; + } + } + + + /** + * Create an undefined phrase tree + */ + public PSTree() { } + + + /** + * @param terminals + * @param nonTerminals + */ + public void create(int terminals, int nonTerminals) { + entries = new String[terminals+nonTerminals]; + pos = new String[terminals+nonTerminals]; + head = new int[terminals+nonTerminals]; + lemmas = new String[terminals+nonTerminals]; + morph = new String[terminals+nonTerminals]; + non=terminals; + wordCount=terminals; + + for(int i=terminals+1;i<head.length;i++) head[i]=-1; + } + + + public String toString() { + + StringBuffer s = new StringBuffer(); + + for(int i=0;i<entries.length;i++) { + if (head[i]==-1&&entries[i]==null) break; + + s.append(i+"\t"+pos[i]+"\t"+entries[i]+"\t"+head[i]+(ok==null?"":("\t"+(ok[i]==1)))+" \n"); + + } + // DB.println("entries "+entries.length); + return s.toString(); + } + + + /** + * @return + */ + public boolean containsNull() { + for(int k=0;k<wordCount-1;k++) { + if (entries[k]==null) return true; + } + return false; + } + + + public int equals(SentenceData09 s) { + + int j=1; // starts with root + for(int i=0;i<terminalCount-1;i++){ + + // if (s.forms[j].equals("erschrekkend")) s.forms[j]="erschreckend"; + + if (s.forms.length<j) { + DB.println(""+s+" "+this.toString()); + return i; + + } + + if(!entries[i].equals(s.forms[j])) { + // System.out.println("ps "+entries[i]+" 
!= ds "+s.forms[j]); + // Rolls-Royce + if(entries[i].startsWith(s.forms[j]) && s.forms.length>i+2 && s.forms[j+1].equals("-")) { + j+=2; + if( entries[i].contains(s.forms[j-1]) && s.forms.length>i+3 && s.forms[j+1].equals("-")) { + j+=2; // && + // System.out.println("s.forms[j] "+s.forms[j]+" s.forms[j-1] "+s.forms[j-1]+" "+entries[i]); + if( entries[i].contains(s.forms[j-1]) && s.forms.length>i+3 && s.forms[j+1].equals("-")) { + j+=2; // && + // System.out.println("s.forms[j] "+s.forms[j]+" s.forms[j-1] "+s.forms[j-1]+" "+entries[i]); + } + } + //Interstate\/Johnson + } else if(entries[i].startsWith(s.forms[j]) && s.forms.length>i+2 && s.forms[j+1].equals("/")) { + j+=2; + if( entries[i].contains(s.forms[j-1]) && s.forms.length>i+3 && s.forms[j+1].equals("/")) { + j+=2; // && + // System.out.println("s.forms[j] "+s.forms[j]+" s.forms[j-1] "+s.forms[j-1]+" "+entries[i]); + } + + // U.S.-Japan -> U . S . - Japan + } else if(entries[i].startsWith(s.forms[j]) && s.forms.length>i+2 && s.forms[j+1].equals(".")) { + j+=2; + if( entries[i].contains(s.forms[j-1]) && s.forms.length>i+3 && s.forms[j+1].equals(".")) { + j+=2; // && + // System.out.println("s.forms[j] "+s.forms[j]+" s.forms[j-1] "+s.forms[j-1]+" "+entries[i]); + } + } else if(entries[i].startsWith(s.forms[j]) && s.forms.length>i+1 && s.forms[j+1].equals("'S")) { + j+=1; + + } else { + + // chech those !!! 
+ // System.out.print("entry "+entries[i]+" form "+s.forms[j]+" "); + return j; + } + + } + j++; + + + } + + // without root + return s.length(); + //return j; + + } + + + /** + * @param dn + * @return + */ + public int getPS(int dn) { + + return this.head[dn-1]; + } + + + /** + * @param dn + * @param n + * @param commonHead the common head in the phrase structure + * @return + */ + public String getChain(int dn, int n, int commonHead) { + + int pdn =dn-1,pdh=n-1; + // int phraseHead =head[pdh]; + + // System.out.println("phrase head "+phraseHead+" common head "+commonHead); + + int[] ch = new int[20]; + int head =this.head[pdn]; + int i=0; + ch[i++]=head; + while(commonHead!=head && head!=0) { + + head = this.head[head]; + ch[i++]=head; + } + StringBuffer chain= new StringBuffer(); + + for(int k=0;k<i;k++) { + chain.append(entries[ch[k]]).append(" "); + } + return chain.toString(); + } + + + /** + * @param dn + * @param n + * @return + */ + public int getCommonHead(int d, int dh) { + int pdh = this.getPS(dh), pd = this.getPS(d); + + + ArrayList<Integer> path2root = getPath2Root(pdh); + + //System.out.println("path 2 root "+path2root+" pdh "+pdh); + + for(int n : path2root) { + int candidateHead=pd; + while(candidateHead!=0&& candidateHead!=-1) { + if (n==candidateHead) return n; + candidateHead =this.head[candidateHead]; + } + } + return -1; + } + + + /** + * @param pdh + */ + private ArrayList<Integer> getPath2Root(int pdh) { + ArrayList<Integer> path = new ArrayList<Integer>(); + + + // restrict the number in case its a cycle which should never be + for(int k=0;k<100;k++) { + if(pdh==-1) break; + path.add(pdh); + pdh = this.head[pdh]; + if(pdh==0) break; + } + return path; + } + + + /** + * Get operations to create root + * see operation in method getOperation + * @param pr + */ + public String getOperationRoot(int pr) { + + StringBuffer o = new StringBuffer(); + int h = pr; + int[] path = new int[10]; + // System.out.println(" start node "+pr); + int k=0; + 
for(;k<10;k++) { + h = head[h]; + if (h==-1){ + break; + } + path[k]=h; + if (h==0){ + break; + } + + } + k-=2; + + boolean first=true; + for(;k>=0;k--) { + + // create phrase + if (first) { + o.append("c:").append(entries[path[k]]); + first =false; + } + + // insert and create phrase + else {o.append(":ci:").append(entries[path[k]]);} + } + + + // insert dependent node + //if (o.length()>0) + o.append(":in:d"); + //else o.append("in:d"); // insert root into nothing + return o.toString(); + } + + + /** + * Create operation to include dependency edges in phrase structure + * Operations: c - create ; i - insert ; in - insert (dependent) node ; up:X go the (phrase) X up + * ci create and insert ... + * + * @param dn + * @param n + * @param commonHead + * @return + */ + public String getOperation(int dn, int n, int commonHead) { + + StringBuffer o= new StringBuffer(); + + // from n move up to common head, if needed + int ph =n-1, pd = dn-1; + + int[] path = new int[20]; + int i=0; + + int h =ph; + + boolean nth=false; + for(int k=0;k<10;k++) { + h = head[h]; + path[k]=h; + if (nth) o.append(':'); + o.append("up:"+entries[h]); + nth=true; + if (h==commonHead) break; + } + + // from common head to the node + int k=0; + h=pd; + for(;k<10;k++) { + h = head[h]; + path[k]=h; + if (h==commonHead){ + break; + } + + } + k-=1; + + // boolean first=true; + for(;k>=0;k--) { + + // create phrase + if (!nth) { + o.append("ci:").append(entries[path[k]]); + nth =true; + } + + // insert and create phrase + else {o.append(":ci:").append(entries[path[k]]);} + } + + + // insert dependent node + o.append(":in:d"); + + + + return o.toString(); + } + + + /** + * @param ph node in the phrase structure corresponding to the head in the dependency structure + * @param pt node in the prhase structure corresponding to the dependent in the ds. 
+ * @param check + * @return rules was applicable + */ + public boolean exec(String r, int ph, int pt, boolean check) { + + String o[] = r.split(":"); + + int last =-1, headP = -1; + + // create root node + + // System.out.println("operation "+r+" "+ph+" "+pt); + boolean done =true; + for(int i=0;i<o.length;i++) { + + if (o[i].equals("c")) { + if (check) return true; + + if(ph<0) { + last=non++; + } + + entries[non]=o[++i]; // create + head[pt]=non; + head[non]=last; // insert into root + last=non++; + } else if (o[i].equals("ci")) { + if (check) return true; + entries[non]= o[++i]; // create + head[non] = last; // insert + last =non; + non++; + } else if (o[i].equals("in")&&o[i+1].equals("d")) { + if (check) return true; + head[pt] = last; // insert + i++; // move forward because of 'd' + } else if (o[i].equals("up")) { + + if (ph==-1) { + // System.out.println("ph is -1 please check this "+ph+" there is a bug "); + return false; + } + + if (headP==-1) headP=head[ph]; + else headP=head[headP]; + + try { + if (headP==-1 || entries[headP]==null ||!entries[headP].equals(o[i+1])) return false; + + } catch(Exception e) { + e.printStackTrace(); + System.out.println(""+entries[headP]+" o[i+1] "+o[i+1]+" "+headP+" "+this.terminalCount); + // System.out.println(""+ this.toString()); + System.exit(0); + } + + i++; + last =headP; + } else { + done = false; + } + + } + + + return done; + } + + /** + * More tolerant mapping + * + * @param ph node in the phrase structure corresponding to the head in the dependency structure + * @param pt node in the prhase structure corresponding to the dependent in the ds. 
+ * @param check + * @return rules was applicable + */ + public boolean execT(String r, int ph, int pt, boolean check) { + + String o[] = r.split(":"); + + int last =-1, headP = -1; + + int up=0; + + boolean done =true; + for(int i=0;i<o.length;i++) { + + if (o[i].equals("c")) { + if (check) return true; + + + // create root node + if(ph<0) { + last=non++; + } + + entries[non]= o[++i]; // create + head[pt]=non; + head[non]=last; // insert into root + last=non++; + } else if (o[i].equals("ci")) { + + if (check) return true; + entries[non]= o[++i]; // create + head[non] = last; // insert + last =non; + non++; + } else if (o[i].equals("in")&&o[i+1].equals("d")) { + if (check) return true; + + // DB.println("hallo"); + + if (last !=-1) + head[pt] = last; // insert + + + // i am not sure if this does much good? + + // if (last ==-1) + + // done=true; + + + + i++; // move forward because of 'd' + + } else if (o[i].equals("up")) { + up++; + if (ph==-1) { + return false; + } + + if (headP==-1) headP=head[ph]; + else headP=head[headP]; + + try { + + // tolerant mapping + if (headP==-1 || entries[headP]==null || + ((!entries[headP].equals(o[i+1]) ) && up>1 )) return false; //>1 +// && entries[headP].charAt(0)!=o[i+1].charAt(0) + } catch(Exception e) { + e.printStackTrace(); + System.out.println(""+entries[headP]+" o[i+1] "+o[i+1]+" "+headP+" "+this.terminalCount); + } + + i++; + last =headP; + } else { + done = false; + } + + } + + + return done; + } + + + public final static boolean INSERT_NEWLINE =true; + + /** + * Convert to bracket format + * @param newLine + * @return + */ + public String toPennBracket(boolean newLine) { + + + StringBuffer b = new StringBuffer(); + ArrayList<Integer> current=null;// = new ArrayList<Integer>(); + int open =0; + for(int i=0; i<terminalCount ;i++) { + ArrayList<Integer> path = getPathToRoot(i); + + ArrayList<Integer> diff = getDiffPath(path, current); + + boolean spaces=false; + + ArrayList<Integer> common = this.getDiffCommon(path, 
current); + + if(current!=null && (current.size()>common.size())) { + + // close brackets + for(int bc =0;bc<current.size()-common.size();bc++) { + b.append(")"); + open--; + } + if(diff.size()==0 && newLine) b.append("\n"); + spaces=true; + } + + if(i!=0 && diff.size()>0 && newLine) b.append("\n").append(createSpaces(open)); + + for(int k=diff.size()-1;k>=0;k--) { + open++; + b.append("("+(entries[path.get(k)]==null?" ":entries[path.get(k)])); + if (k!=0 &&path.size()-1!=k && newLine) + b.append("\n").append(createSpaces(open)); + spaces=false; + } + if(spaces) b.append(createSpaces(open)); + else b.append(" "); + + String term=entries[i]; + if(term.equals("(")) term="-LRB-"; + if(term.equals(")")) term="-RRB-"; + if(term.equals("{")) term="-LCB-"; + if(term.equals("}")) term="-RCB-"; + + String ps=pos[i]; + if(ps.equals("(")) ps="-LRB-"; + if(ps.equals("$(")) ps="-LRB-"; + + if(ps.equals(")")) ps="-RRB-"; + if(ps.equals("{")) ps="-LCB-"; + if(ps.equals("}")) ps="-RCB-"; + + + b.append("(").append(ps).append(" ").append(term).append(')'); + current = path; + // break; + } + for(;open>0;open--) { + b.append(")"); + } + // b.append("\n"); + + return b.toString(); + } + static int cnt=0; + + /** + * @param path + * @param current + * @return + */ + private ArrayList<Integer> getDiffPath(ArrayList<Integer> path, ArrayList<Integer> current) { + if (current==null) return path; + + ArrayList<Integer> common = new ArrayList<Integer>(); + + int pindex = path.size()-1; + int cindex = current.size()-1; + + while(cindex>=0 && pindex>=0) { + + if(path.get(pindex)==current.get(cindex)) { + cindex--; + pindex--; + } else break; + } + + for(int k=0;k<=pindex;k++) { + common.add(path.get(k)); + } + + return common; + } + + private ArrayList<Integer> getDiffCommon(ArrayList<Integer> path, ArrayList<Integer> current) { + if (current==null) return path; + + ArrayList<Integer> common = new ArrayList<Integer>(); + + int pindex = path.size()-1; + int cindex = current.size()-1; + + 
while(cindex>=0 && pindex>=0) { + + if(path.get(pindex)==current.get(cindex)) { + common.add(path.get(pindex)); + cindex--; + pindex--; + } else break; + } + + Collections.reverse(common); + // System.out.println("common "+pindex+" "+common); + + return common; + } + /** + * @param i + * @return + */ + private StringBuffer createSpaces(int i) { + StringBuffer s = new StringBuffer(); + for (int k=0;k<i;k++) s.append(" "); + return s; + } + + + /** + * @param i + * @return + */ + private ArrayList<Integer> getPathToRoot(int i) { + + ArrayList<Integer> path = new ArrayList<Integer> (); + + int h=i; + while(true) { + h=this.head[h]; + if (h<this.terminalCount || path.contains(h)) break; + path.add(h); + } + + // Collections.reverse(list) + + + return path; + } + + + public String conll09() { + + StringBuilder s = new StringBuilder(); + for(int i=0;i<this.terminalCount;i++) { + if (head[i]==-1&&entries[i]==null) break; + + s.append((i+1)).append('\t').append(entries[i]).append("\t_\t_\t").append(pos[i]).append("\t_\t_\t_\t_\t_\t_\t_\t_\n"); + + + } + + + return s.toString(); + } + + /** + * @param phead + * @return + */ + public int[] getChilds(int head) { + + int count=0; + for(int i =0;i<this.entries.length;i++) { + if (this.head[i]==head) count++; + } + + int[] clds = new int[count]; + count=0; + for(int i =0;i<this.entries.length;i++) { + if (this.head[i]==head) clds[count++]=i; + } + + return clds; + } + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/data/Parameter.java b/dependencyParser/basic/mate-tools/src/is2/data/Parameter.java new file mode 100644 index 0000000..35a9911 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/Parameter.java @@ -0,0 +1,13 @@ +/** + * + */ +package is2.data; + +/** + * @author Dr. 
Bernd Bohnet, 23.12.2010 + * + * + */ +public class Parameter { + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/data/ParametersFloat.java b/dependencyParser/basic/mate-tools/src/is2/data/ParametersFloat.java new file mode 100755 index 0000000..653487e --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/ParametersFloat.java @@ -0,0 +1,183 @@ +package is2.data; + +import is2.util.DB; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; + + + +final public class ParametersFloat { + + public float[] parameters; + public float[] total; + + public ParametersFloat(int size) { + + parameters = new float[size]; + total = new float[size]; + for(int i = 0; i < parameters.length; i++) { + parameters[i] = 0F; + total[i] = 0F; + } + } + + + /** + * @param parameters2 + */ + public ParametersFloat(float[] p) { + parameters =p; + } + + public void average(double avVal) { + for(int j = 0; j < total.length; j++) { + parameters[j] = total[j]/((float)avVal); + } + total =null; + } + + public ParametersFloat average2(double avVal) { + float[] px = new float[this.parameters.length]; + for(int j = 0; j < total.length; j++) { + px[j] = total[j]/((float)avVal); + } + ParametersFloat pf = new ParametersFloat(px); + return pf; + } + + public void update(FV pred, FV act, float upd, float err) { + + + float lam_dist = act.getScore(parameters,false)- pred.getScore(parameters,false); + float loss =(float)err - lam_dist; + + FV dist = act.getDistVector(pred); + + float alpha; + float A = dist.dotProduct(dist); + if (A<=0.0000000000000001) alpha=0.0f; + else alpha= loss/A; + + // alpha = Math.min(alpha, 0.00578125F); + + dist.update(parameters, total, alpha, upd,false); + + } + + public void update(FV pred, FV act, float upd, float err, float C) { + + + float lam_dist = act.getScore(parameters,false)- pred.getScore(parameters,false); + float loss =(float)err - lam_dist; + + FV dist = act.getDistVector(pred); + + float alpha; + 
float A = dist.dotProduct(dist); + if (A<=0.0000000000000001) alpha=0.0f; + else alpha= loss/A; + + alpha = Math.min(alpha, C); + + dist.update(parameters, total, alpha, upd,false); + + } + + + + public double update(FV a, double b) { + + double A = a.dotProduct(a); + if (A<=0.0000000000000000001) return 0.0; + return b/A; + } + + + public double getScore(FV fv) { + if (fv ==null) return 0.0F; + return fv.getScore(parameters,false); + + } + + + final public void write(DataOutputStream dos) throws IOException{ + + dos.writeInt(parameters.length); + for(float d : parameters) dos.writeFloat(d); + + } + + public void read(DataInputStream dis) throws IOException{ + + parameters = new float[dis.readInt()]; + int notZero=0; + for(int i=0;i<parameters.length;i++) { + parameters[i]=dis.readFloat(); + if (parameters[i]!=0.0F) notZero++; + } + + DB.println("read parameters "+parameters.length+" not zero "+notZero); + + } + + public int countNZ() { + + int notZero=0; + for(int i=0;i<parameters.length;i++) { + if (parameters[i]!=0.0F) notZero++; + } + return notZero; + + } + + public F2SF getFV() { + return new F2SF(parameters); + } + + + public int size() { + return parameters.length; + } + + public void update(FVR act, FVR pred, Instances isd, int instc, Parse dx, double upd, double e, float lam_dist) { + + e++; + + + float b = (float)e-lam_dist; + + FVR dist = act.getDistVector(pred); + + dist.update(parameters, total, hildreth(dist,b), upd,false); + } + + + public void update(FVR pred, FVR act, float upd, float e) { + + e++; + float lam_dist = act.getScore(parameters,false)- pred.getScore(parameters,false); + + float b = (float)e-lam_dist; + + FVR dist = act.getDistVector(pred); + + dist.update(parameters, total, hildreth(dist,b), upd,false); + } + + protected double hildreth(FVR a, double b) { + + double A = a.dotProduct(a); + if (A<=0.0000000000000000001) return 0.0; + return b/A; + } + + public float getScore(FVR fv) { //xx + if (fv ==null) return 0.0F; + return 
fv.getScore(parameters,false); + + } + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/data/Parse.java b/dependencyParser/basic/mate-tools/src/is2/data/Parse.java new file mode 100755 index 0000000..21a83d9 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/Parse.java @@ -0,0 +1,157 @@ +package is2.data; + + +import java.util.BitSet; + + +public class Parse implements Comparable<Parse> { + + public short[] heads; + public short[] labels; + public double f1; + + + public Parse() {} + + public Parse(int i) { + heads = new short[i]; + labels = new short[i]; + + } + /** + * @param heads2 + * @param types2 + * @param p_new + */ + public Parse(short[] heads2, short[] types2, float p_new) { + this.heads = new short[heads2.length]; + this.labels = new short[types2.length]; + // this.heads=heads2; + // this.labels=types2; + System.arraycopy(heads2, 0, heads, 0, heads.length); + System.arraycopy(types2, 0, labels, 0, labels.length); + f1=p_new; + + } + + /** + * @param heads2 + * @param types2 + * @param p_new + */ + public Parse(String parse, float p_new) { + + // this(parse.length()/2); + + signature2parse(parse); + + f1=p_new; + + } + + public void signature2parse(String parse) { + int p=0; + heads = new short[parse.length()/2]; + labels = new short[heads.length]; + // DB.println("pl "+parse.length()); + for(int k=0;k<heads.length;k++) { + heads[k]= (short)parse.charAt(p++); + labels[k] = (short)parse.charAt(p++); + } + } + + + @Override + public Parse clone() { + Parse p = new Parse(); + p.heads = new short[heads.length]; + p.labels = new short[labels.length]; + + System.arraycopy(heads, 0, p.heads, 0, heads.length); + System.arraycopy(labels, 0, p.labels, 0, labels.length); + + p.f1=f1; + + return p; + } + + /** + * Check if it is a tree + * @return + */ + public boolean checkTree() { + + BitSet set = new BitSet(heads.length); + set.set(0); + return checkTree(set, 0); + + } + + /** + * @param set + * @return + */ + private boolean 
checkTree(BitSet set, int h) { + //System.out.print(" h "+h); + + for(int i=0;i<heads.length;i++) { + if (heads[i]==h) { + // System.out.print(" "+i); + if (!set.get(i)) checkTree(set, i); + set.set(i); + + } + } + + for(int i=0;i<heads.length;i++) { + if (!set.get(i)) return false; + } + return true; + } + + public String toString() { + StringBuilder b = new StringBuilder(); + for(int k=0;k<this.heads.length;k++) { + b.append(k).append(" ").append(heads[k]+" ").append(this.labels[k]).append("\n"); + } + return b.toString(); + } + + + + /* (non-Javadoc) + * @see java.lang.Comparable#compareTo(java.lang.Object) + */ + @Override + public int compareTo(Parse o) { + + if (f1==o.f1) return this.signature().compareTo(o.signature()); + return f1<o.f1?1:f1==o.f1?0:-1; + } + + /** + * @return the signature of a parse + */ + public String signature() { + StringBuilder b = new StringBuilder(heads.length*2); + for(int k=0;k<heads.length;k++) { + b.append((char)heads[k]).append((char)labels[k]); + } + return b.toString(); + } + + + /** + * @return the signature of a parse + */ + public StringBuilder signatureSB() { + StringBuilder b = new StringBuilder(heads.length*2); + for(int k=0;k<heads.length;k++) { + b.append((char)heads[k]).append((char)labels[k]); + } + return b; + } + + +} + \ No newline at end of file diff --git a/dependencyParser/basic/mate-tools/src/is2/data/ParseNBest.java b/dependencyParser/basic/mate-tools/src/is2/data/ParseNBest.java new file mode 100644 index 0000000..b66306c --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/ParseNBest.java @@ -0,0 +1,107 @@ +package is2.data; + + + + +final public class ParseNBest extends Parse { + + + private String signature=null; + + //public float[] scores; + + public ParseNBest() {} + + public ParseNBest(short[] heads2, short[] types2, float p_new) { + super(heads2, types2, p_new); + } + + public ParseNBest(int i) { + super(i); + } + + /** + * @param sig + * @param readFloat + */ + public 
ParseNBest(String sig, float score) { + super(sig,score); + } + + /** + * create a total order to provide replicable deterministic results + * @param o + * @return + */ + public int compareTo(ParseNBest o) { + if (f1<o.f1) return 1; + if (f1==o.f1) { + if (signature==null) signature=signature(); + if (o.signature==null) o.signature=o.signature(); + return o.signature.compareTo(signature); + + } + return -1; + } + + /** + * @return the signature of a parse + */ + public String signature() { + if (signature!=null) return signature; + signature= super.signature(); + return signature; + } + + /** + * @return the signature of a parse + */ + public String signature(short[] heads, short[] labels) { + StringBuilder b = new StringBuilder(heads.length*2); + for(int k=0;k<heads.length;k++) { + b.append((char)heads[k]). + append((char)labels[k]); + } + signature = b.toString(); + return signature; + } + + /** + * @param heads + * @param types + * @param oldP + * @param ch + * @param s + */ + public String signature(short[] heads, short[] types, short p, short ch,short l) { + StringBuilder b = new StringBuilder(heads.length*2); + for(int k=0;k<heads.length;k++) { + + + b.append(k==ch?(char)p: + (char)heads[k]). 
+ append(k==ch?(char)l:(char)types[k]); + } + signature = b.toString(); + return signature; + + } + + @Override + public Parse clone() { + ParseNBest p = new ParseNBest(); + p.heads = new short[heads.length]; + p.labels = new short[labels.length]; + + System.arraycopy(heads, 0, p.heads, 0, heads.length); + System.arraycopy(labels, 0, p.labels, 0, labels.length); + + p.f1=f1; + + return p; + } + + +} + + \ No newline at end of file diff --git a/dependencyParser/basic/mate-tools/src/is2/data/PipeGen.java b/dependencyParser/basic/mate-tools/src/is2/data/PipeGen.java new file mode 100755 index 0000000..b63fb90 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/PipeGen.java @@ -0,0 +1,83 @@ +package is2.data; + + +public class PipeGen { + + public static final String SENSE = "SENSE",POS = "POS",DIST = "DIST",WORD = "WORD",PRED = "PRED",ARG = "ARG", + FEAT = "F", REL = "REL",TYPE = "TYPE" ,CHAR = "C",FFEATS="FF", DIR="DIR",LA = "LA",RA = "RA"; + + public static final String GPOS = "GPOS", MID = "MID",END = "END",STR = "STR",FM="FM", NOFEAT = "NOFEAT"; + + public static final String _0 = "0",_4 = "4", _3 = "3", _2 = "2",_1 = "1", _5 = "5",_10 = "10"; + + static public int outValue(int num1, int del) { + String out = ""+num1; + StringBuffer delS=new StringBuffer(); + for(int k =0;k< del;k++) delS.append('\b'); + del=out.length(); + System.out.print(delS+out); + return del; + } + + static public int outValue(int num1, int del, long last) { + String out = ""+num1+" ("+(System.currentTimeMillis()-last)/(num1+1)+" ms/instance)"; + StringBuffer delS=new StringBuffer(); + for(int k =0;k< del;k++) delS.append('\b'); + del=out.length(); + System.out.print(delS+out); + return del; + } + + static public int outValueErr(int num1, float err, float f1, int del, long last) { + + String out = ""+num1+" ("+(System.currentTimeMillis()-last)/(num1+1)+" ms/instance "+(err/num1)+" err/instance f1="+ + f1 +") "; + StringBuffer delS=new StringBuffer(); + for(int k =0;k< del;k++) 
delS.append('\b'); + del=out.length(); + System.out.print(delS+out); + return del; + } + + + static public int outValueErr(int num1, float err, float f1, int del, long last, double upd) { + String out = ""+num1+" ("+(System.currentTimeMillis()-last)/(num1+1)+" ms/instance "+(err/num1)+" err/instance f1="+ + f1 +") upd "+upd; + StringBuffer delS=new StringBuffer(); + for(int k =0;k< del;k++) delS.append('\b'); + del=out.length(); + System.out.print(delS+out); + return del; + } + + static public int outValueErr(int num1, float err, float f1, int del, long last, double upd, String info) { + String out = ""+num1+" ("+(System.currentTimeMillis()-last)/(num1+1)+" ms/instance "+(err/(float)num1)+" err/instance f1="+ + f1 +") upd "+upd+" "+info; + StringBuffer delS=new StringBuffer(); + for(int k =0;k< del;k++) delS.append('\b'); + del=out.length(); + System.out.print(delS+out); + return del; + } + + + /** + * @param cnt + * @param l + * @return + */ + public static String getSecondsPerInstnace(int cnt, long l) { + return " "+((float)l/(cnt*1000f))+" seconds/sentnece "; + } + + /** + * @param l + * @return + */ + public static String getUsedTime(long l) { + return "Used time " + (((float)l)/1000f)+" seconds "; + } + + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/data/PrimeFinder.java b/dependencyParser/basic/mate-tools/src/is2/data/PrimeFinder.java new file mode 100644 index 0000000..38c614b --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/PrimeFinder.java @@ -0,0 +1,66 @@ +/** + * + */ +package is2.data; + +import java.util.Arrays; + +/** + * @author Dr. 
/**
 * Lookup table of precomputed primes used to size hash-based containers.
 *
 * <p>{@link #nextPrime(int)} maps a desired capacity to the smallest prime in
 * the table that is greater than or equal to it. The table spans the full
 * positive {@code int} range and ends at {@link Integer#MAX_VALUE} (which is
 * prime), so a result always exists.
 */
public class PrimeFinder {

    public PrimeFinder() {
    }

    /**
     * Returns the smallest prime in the table that is {@code >= desiredCapacity}.
     *
     * @param desiredCapacity the minimum capacity required
     * @return a prime capacity {@code >= desiredCapacity}
     */
    public static final int nextPrime(int desiredCapacity) {
        int pos = java.util.Arrays.binarySearch(primeCapacities, desiredCapacity);
        // On a miss binarySearch returns (-(insertionPoint) - 1); the insertion
        // point is exactly the index of the next larger prime in the table.
        return pos >= 0 ? primeCapacities[pos] : primeCapacities[-pos - 1];
    }

    /** The largest prime representable as an int (Integer.MAX_VALUE). */
    public static final int largestPrime = 2147483647;

    // The table is written in its original generation order (several chains of
    // roughly-doubling primes); the static initializer sorts it once so that
    // binary search works.
    private static final int primeCapacities[] = {
        2147483647, 5, 11, 23, 47, 97, 197, 397, 797, 1597,
        3203, 6421, 12853, 25717, 51437, 102877, 205759, 411527, 823117, 1646237,
        3292489, 6584983, 13169977, 26339969, 52679969, 105359939, 210719881, 421439783, 842879579, 1685759167,
        433, 877, 1759, 3527, 7057, 14143, 28289, 56591, 113189, 226379,
        452759, 905551, 1811107, 3622219, 7244441, 14488931, 28977863, 57955739, 115911563, 231823147,
        463646329, 927292699, 1854585413, 953, 1907, 3821, 7643, 15287, 30577, 61169,
        122347, 244703, 489407, 978821, 1957651, 3915341, 7830701, 15661423, 31322867, 62645741,
        125291483, 250582987, 501165979, 1002331963, 2004663929, 1039, 2081, 4177, 8363, 16729,
        33461, 66923, 133853, 267713, 535481, 1070981, 2141977, 4283963, 8567929, 17135863,
        34271747, 68543509, 137087021, 274174111, 548348231, 1096696463, 31, 67, 137, 277,
        557, 1117, 2237, 4481, 8963, 17929, 35863, 71741, 143483, 286973,
        573953, 1147921, 2295859, 4591721, 9183457, 18366923, 36733847, 73467739, 146935499, 293871013,
        587742049, 1175484103, 599, 1201, 2411, 4831, 9677, 19373, 38747, 77509,
        155027, 310081, 620171, 1240361, 2480729, 4961459, 9922933, 19845871, 39691759, 79383533,
        158767069, 317534141, 635068283, 1270136683, 311, 631, 1277, 2557, 5119, 10243,
        20507, 41017, 82037, 164089, 328213, 656429, 1312867, 2625761, 5251529, 10503061,
        21006137, 42012281, 84024581, 168049163, 336098327, 672196673, 1344393353, 3, 7, 17,
        37, 79, 163, 331, 673, 1361, 2729, 5471, 10949, 21911,
        43853, 87719, 175447, 350899, 701819, 1403641, 2807303, 5614657, 11229331, 22458671,
        44917381, 89834777, 179669557, 359339171, 718678369, 1437356741, 43, 89, 179, 359,
        719, 1439, 2879, 5779, 11579, 23159, 46327, 92657, 185323, 370661,
        741337, 1482707, 2965421, 5930887, 11861791, 23723597, 47447201, 94894427, 189788857, 379577741,
        759155483, 1518310967, 379, 761, 1523, 3049, 6101, 12203, 24407, 48817,
        97649, 195311, 390647, 781301, 1562611, 3125257, 6250537, 12501169, 25002389, 50004791,
        100009607, 200019221, 400038451, 800076929, 1600153859
    };

    static {
        java.util.Arrays.sort(primeCapacities);
    }

}
	 *
	 * @param kbit The bits to be mapped
	 * @param lbit The left shift of the bits
	 * @param hsize The size of the feature space (not included in the original algorithm)
	 * @param numberFunctions The number of the hash functions
	 */
	public RandomIndex(int kbit, int lbit, int hsize, int numberFunctions) {

		this.kbit =kbit;
		this.lbit =lbit;

		if (hsize<=0) this.hsize = 67000001; // default value
		else this.hsize = hsize;

		// number of bits needed to address hsize distinct values
		bits = (int) Math.ceil(Math.log(this.hsize)/Math.log(2));

		// how many 'bits'-wide shifts are needed to fold a 64-bit value down
		moves = (int) Math.ceil(64f/(float)bits);

		DB.println("moves "+moves+" bits "+bits+" hsize "+hsize);

		// each hash function is parameterized by one (prime) multiplier
		hashFunctionModifiers = new long[numberFunctions];

		for (int f = 0;f<numberFunctions;f++) hashFunctionModifiers[f] = prims[f];
	}

	/**
	 * Computes one hash value per configured hash function for the key x.
	 * Each value lies in [0, hsize).
	 *
	 * @param x the key to hash
	 * @return one bucket index per hash function
	 */
	public int[] hash(long x)
	{
		int[] hvals = new int[hashFunctionModifiers.length];

		for(int k=0;k<hashFunctionModifiers.length;k++) {

			// the original function: value = ((x+1) * hashFunctionModifiers[k] & m ) >> n;

			// the first part of the original function (x+1 avoids f(0)=0)
			long value = (x+1) * hashFunctionModifiers[k];

			// do the above >> n with a maximal size of the available hash values
			// Shift all bits until they have been each xor-ed (^) in the range of the hash
			// in order to have all information potentially represented there.
			for(int j=1;j<= moves;j++) value = value ^ (value >> (bits*j));

			// Map the value to the range of the available space; should be the same as (value & m).
			hvals[k] = Math.abs((int)value % hsize);
		}
		return hvals;
	}

	/**
	 * Variant of {@link #hash(long)} that hashes the bit-reversed key and maps
	 * it via a (kbit-lbit) right shift instead of xor-folding.
	 * NOTE(review): unlike hash(), the results are NOT reduced modulo hsize —
	 * confirm callers expect values outside [0, hsize).
	 *
	 * @param x the key to hash
	 * @return one value per hash function
	 */
	public int[] hashU(long x)
	{
		int[] hvals = new int[hashFunctionModifiers.length];

		long y = Long.reverse(x);
		for(int k=0;k<hashFunctionModifiers.length;k++) {

			// the original function: value = ((x+1) * hashFunctionModifiers[k] & m ) >> n;

			// the first part of the original function
			long value1 = (((y+1) * hashFunctionModifiers[k]) /* % 2 pow 64 */ ) >> (kbit-lbit);

			// I get probably only the first part lets get the second part too
			// long value2 = (((y+1>>20) * hashFunctionModifiers[k]) /* % 2 pow 64 */ ) >> (kbit-lbit);

			// the modulo (%) 2 pow 64 is done since the long number can not be larger than 2 pow 64.
			// System.out.println("value "+value+" shift "+(lbit-kbit));
			hvals[k] = Math.abs((int)value1);
		}
		return hvals;
	}

	/*
	 Reference implementation (Lisp) this class was ported from:

	(defun generate-hash-fn (&key (k-bit 32)
	                              (l-bit 8)
	                              verbosep constants (count 4))

	  (labels ((random-constant ()
	             (let ((a (+ (random (- (expt 2 k-bit) 1)) 1)))
	               (logior a 1)))) ;; inclusive OR ensures odd number.
	    (let ((pdiff (- (- k-bit l-bit)));; neg. sign to do a rightshift, see ash()
	          (sub1 (- (expt 2 k-bit) 1))
	          (constants (copy-list constants)))
	      (unless constants
	        (loop ;; a = odd number a where 0 < a < u.
	            until (= count (length constants))
	            do (pushnew (random-constant) constants)))
	      (when verbosep
	        (format t "~&generate-hash-fn(): using random constants: ~a~%"
	                constants))
	      (values
	       #'(lambda (x)
	           (loop
	               for a in constants
	               ;;; always add 1 to x to avoid f(0)=0.
	               collect (ash (logand (* (+ 1 x) a) sub1) pdiff)))
	       constants))))

	 */

	/* (non-Javadoc)
	 * @see is2.data.Long2IntInterface#l2i(long)
	 */
	@Override
	public int l2i(long l) {
		// NOTE(review): unimplemented stub — always returns 0 regardless of the
		// key. Callers relying on Long2IntInterface.l2i will collapse all keys
		// to bucket 0; confirm this class is only used via hash()/hashU().
		return 0;
	}

	/* (non-Javadoc)
	 * @see is2.data.Long2IntInterface#size()
	 */
	@Override
	public int size() {
		return hsize;
	}

}

diff --git a/dependencyParser/basic/mate-tools/src/is2/data/SentenceData09.java b/dependencyParser/basic/mate-tools/src/is2/data/SentenceData09.java new file mode 100755 index 0000000..46cabc0 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/SentenceData09.java @@ -0,0 +1,530 @@
package is2.data;


import is2.io.CONLLReader09;
import is2.io.CONLLWriter09;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.StringWriter;
import java.util.ArrayList;

/**
 * One sentence in CoNLL-2009 representation: parallel arrays indexed by token
 * position (index 0 is usually the artificial root). Fields prefixed with
 * 'p' hold predicted values, the unprefixed/g-prefixed fields gold values.
 */
public class SentenceData09 {


	public String[] id;
	public String[] forms;

	public String[] lemmas;   // gold lemmas
	public String[] plemmas;  // predicted lemmas

	public int[] heads;       // gold heads
	public int[] pheads;      // predicted heads

	public String[] labels;   // gold dependency relations
	public String[] plabels;  // predicted dependency relations

	public String[] gpos; // gold pos
	public String[] ppos; // predicted pos

	public String feats[][]; // split morphological features per token
//	public String[] split_lemma;

	public String[] sem;        // predicate senses
	public int[] semposition;   // token position of each predicate

	// predicate number, argument number -> argument string
	public String[][] arg;
	public int[][] argposition;

	public String[] fillp; // FILLPRED column (Y/N)

	public String[] ofeats; // original (gold) feature strings
	public String[] pfeats; // predicted feature strings

	public SentenceData09() {}

	/** Gold-only constructor: forms, POS tags, labels and heads. */
	public SentenceData09(String[] forms, String[] postags, String[] labs, int[] heads) {
		this.forms = forms;
		gpos = postags;
		labels = labs;
		this.heads = heads;
	}

	/** As above plus predicted lemmas. */
	public SentenceData09(String[] forms, String[] lemmas, String[] postags, String[] labs, int[] heads) {
		this.forms = forms;
		gpos = postags;
		//ppos = postags;

		labels = labs;
		this.heads = heads;
		this.plemmas =
		lemmas;
	}

	/** Constructor with separate gold and predicted POS. */
	public SentenceData09(String[] forms, String[] lemmas, String[] gpos, String[] ppos, String[] labs, int[] heads) {
		this.forms = forms;
		this.gpos = gpos;
		this.ppos = ppos;

		labels = labs;
		this.heads = heads;
		this.plemmas = lemmas;


	}

	/** Constructor with FILLPRED column. */
	public SentenceData09(String[] forms, String[] lemmas, String[] gpos, String[] ppos, String[] labs, int[] heads, String[] fillpred) {
		this.forms = forms;
		this.gpos = gpos;
		this.ppos = ppos;

		labels = labs;
		this.heads = heads;
		this.plemmas = lemmas;

		fillp =fillpred;
	}

	/** Constructor with both predicted (lemmas) and gold (olemmas) lemmas. */
	public SentenceData09(String[] forms, String[] lemmas, String[] olemmas,String[] gpos, String[] ppos, String[] labs, int[] heads, String[] fillpred) {
		this.forms = forms;
		this.gpos = gpos;
		this.ppos = ppos;

		labels = labs;
		this.heads = heads;
		this.plemmas = lemmas;
		this.lemmas =olemmas;
		fillp =fillpred;
	}

	/**
	 * Full constructor including original and predicted feature strings.
	 * NOTE(review): the parameter order here is (olemmas, lemmas) — the reverse
	 * of the constructor above; pheads/plabels are aliased to heads/labs.
	 */
	public SentenceData09(String[] forms, String[] olemmas, String[] lemmas,String[] gpos,
			String[] ppos, String[] labs, int[] heads, String[] fillpred, String[] of, String[] pf) {
		this.forms = forms;
		this.gpos = gpos;
		this.ppos = ppos;

		labels = labs;
		this.heads = heads;
		this.pheads =heads;
		this.plabels=labs;
		this.plemmas = lemmas;
		this.lemmas =olemmas;

		this.ofeats =of;
		this.pfeats =pf;
		fillp =fillpred;
	}




	/**
	 * Create an instance without root of the input instance
	 * (copies every column shifted left by one, dropping token 0).
	 * @param i the instance to copy (with root at position 0)
	 */
	public SentenceData09(SentenceData09 i) {

		int length = i.length()-1;

		forms = new String[length];
		gpos = new String[length];
		ppos = new String[length];
		plemmas = new String[length];
		plabels = new String[length];
		lemmas = new String[length];
		heads = new int[length];
		pheads = new int[length];
		ofeats = new String[length];
		pfeats = new String[length];
		labels = new String[length];
		fillp = new String[length];
		id = new String[length];


		for(int j = 0; j < length; j++) {
			forms[j] = i.forms[j+1];
			ppos[j] = i.ppos[j+1];
			gpos[j] = i.gpos[j+1];

			labels[j] = i.labels[j+1];
			heads[j] = i.heads[j+1];

			// optional columns are only copied when present on the source
			if (i.pheads!=null) pheads[j] = i.pheads[j+1];
			if (i.plabels!=null) plabels[j] = i.plabels[j+1];


			if (i.lemmas!=null) lemmas[j] = i.lemmas[j+1];

			plemmas[j] = i.plemmas[j+1];


			if (i.ofeats!=null) ofeats[j] = i.ofeats[j+1];
			if (i.pfeats!=null) pfeats[j] = i.pfeats[j+1];

			if (i.fillp!=null) fillp[j] = i.fillp[j+1];
			if (i.id!=null) id[j] = i.id[j+1];
		}


	}

	public void setPPos(String[] pos) {
		ppos=pos;
	}

	// NOTE(review): despite the name this sets the PREDICTED lemmas (plemmas).
	public void setLemmas(String[] lemmas) {
		this.plemmas=lemmas;
	}

	/** Stores the raw predicted feature strings and their '|'-split form. */
	public void setFeats(String[] fts) {
		feats = new String[fts.length][];
		for(int i=0;i<fts.length;i++) {
			feats[i] = fts[i].split("\\|");
		}
		pfeats =fts;
	}

	public int length () {
		return forms.length;
	}

	@Override
	public String toString () {
		// prepare the output: render the sentence in CoNLL-09 format
		StringWriter sw = new StringWriter();
		CONLLWriter09 snt2str = new is2.io.CONLLWriter09(sw);
		try{
			snt2str.write(this, CONLLWriter09.NO_ROOT);
			snt2str.finishWriting();
			return sw.toString();
		}catch(Exception e) {
			e.printStackTrace();
		}

		// backup
		// NOTE(review): the fallback builds sb but still returns sw.toString() —
		// sb is never used; looks unintentional.
		StringBuffer sb = new StringBuffer();
		for(int k=0;k<forms.length;k++) sb.append(k+1).append('\t').append(forms[k]).append('\t').append(heads[k]).append('\t').append(labels[k]).append('\n');
		return sw.toString();
	}


	/** Serializes the per-token columns in a fixed binary layout. */
	final public void write (DataOutputStream out) throws IOException {

		out.writeInt(forms.length);
		for(int k=0;k<forms.length;k++) {
			out.writeUTF(forms[k]);
			out.writeUTF(ppos[k]);
			out.writeUTF(gpos[k]);
			out.writeInt(heads[k]);
			out.writeUTF(labels[k]);
			out.writeUTF(lemmas[k]);
			out.writeUTF(plemmas[k]);
			out.writeUTF(ofeats[k]); // needed for mtag
			out.writeUTF(fillp[k]);
		}

		// out.writeUTF(actParseTree);

	}

	/** Deserializes the columns written by {@link #write(DataOutputStream)}. */
	final public void read (DataInputStream dis) throws IOException {

		int l = dis.readInt();

		forms = new String[l];
		lemmas = new String[l];
		plemmas = new String[l];
		ppos = new String[l];
		gpos = new String[l];
		labels = new String[l];
		heads = new int[l];
		fillp = new String[l];
		ofeats=new String[l];

		for(int k=0;k<l;k++) {
			forms[k] = dis.readUTF();
			ppos[k]=dis.readUTF();
			gpos[k]=dis.readUTF();
			heads[k]=dis.readInt();
			labels[k]=dis.readUTF();
			lemmas[k]=dis.readUTF();
			plemmas[k]=dis.readUTF();
			ofeats[k]=dis.readUTF();
			fillp[k]=dis.readUTF();

		}
	}


	// Java-serialization hook.
	// NOTE(review): restores only 5 fields and does not match write()/read()
	// above; confirm whether Java serialization of this class is actually used.
	private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
		forms = (String[])in.readObject();
		plemmas = (String[])in.readObject();
		ppos = (String[])in.readObject();
		heads = (int[])in.readObject();
		labels = (String[])in.readObject();
	}

	/** Appends predicate sense s at token position i (grows the arrays by one). */
	public void addPredicate(int i, String s) {

		int predId;
		if (sem == null) {
			predId=0;
			sem = new String[1];
			semposition = new int[1];
		}
		else {
			predId=sem.length;
			String p[] = new String[sem.length+1];
			System.arraycopy(sem, 0, p, 0, sem.length);
			int id[] = new int[sem.length+1];
			System.arraycopy(semposition, 0, id, 0, semposition.length);
			sem =p;
			semposition=id;
		}
		sem[predId]=s;
		semposition[predId]=i;
	}


	/**
	 * Add an argument
	 * @param i the instance (the child)
	 * @param predId the id of the predicate (the head)
	 * @param a the label of the argument
	 */
	public void addArgument(int i, int predId, String a) {

		if (a ==null || a.equals("_")) return;

		// ensure the space for the argument in the data structure
		if (arg == null) {
			arg = new String[predId+1][];
			argposition = new int[predId+1][];
		} else if (arg.length<=predId) {
			String p[][] = new String[predId+1][];
			System.arraycopy(arg, 0, p, 0, arg.length);
			arg =p;

			int id[][] = new int[predId+1][];
			System.arraycopy(argposition, 0, id, 0, argposition.length);
			argposition = id;
		}



		// grow the per-predicate argument arrays by one and append
		int aId;
		if (arg[predId]==null) {
			aId=0;
			arg[predId] = new String[1];
			argposition[predId] = new int[1];
		} else {
			aId =arg[predId].length;
			String args[] = new String[arg[predId].length+1];
			System.arraycopy(arg[predId], 0, args, 0, arg[predId].length);
			arg[predId]=args;

			int argsId[] = new int[argposition[predId].length+1];
			System.arraycopy(argposition[predId], 0, argsId, 0, argposition[predId].length);
			argposition[predId]=argsId;
		}

		arg[predId][aId]=a;
		argposition[predId][aId]=i;

	}

	public int[] getParents() {
		return heads;
	}

	public String[] getLabels() {
		return labels;
	}

	/** Debug dump of predicates and their arguments, one predicate per line. */
	public String printSem() {

		if (sem==null) return "";
		StringBuilder s = new StringBuilder();

		for(int k=0;k<sem.length;k++) {
			s.append(sem[k]).append("\n");

			if (arg==null) {
				s.append("arg == null");
			}else
			if (arg.length<=k) {
				s.append("args.length <=k arg.length:"+arg.length+" k:"+k);
			} else if (arg[k]!=null) {
				for(int a=0;a< arg[k].length;a++) {
					s.append(" ").append(arg[k][a]);
				}
			} else {
				s.append("args == null ");
			}
			s.append('\n');
		}
		return s.toString();
	}


	/**
	 * Initialize an instance so that a tagger, parser, etc. could be applied
	 * @param forms the word forms of the sentence
	 */
	public void init(String[] forms) {
		this.forms = forms;
		heads = new int[forms.length];
		gpos = new String[forms.length];
		ppos = new String[forms.length];
		plemmas = new String[forms.length];
		feats = new String[forms.length][0];
		labels = new String[forms.length];
	}

	/**
	 * Copies the semantic layer from another instance, shifting all token
	 * positions left by one (removing the root offset).
	 * NOTE(review): mutates the SOURCE instance's position arrays in place
	 * (this.semposition aliases instance.semposition) — confirm intended.
	 * @param instance the instance to copy the semantic layer from
	 */
	public void createSemantic(SentenceData09 instance) {

		this.sem = instance.sem;
		this.semposition = instance.semposition;

		if (instance.semposition!=null)
			for (int k= 0;k< instance.semposition.length;k++) {
				this.semposition[k]=instance.semposition[k]-1;
			}

		this.arg = instance.arg;


		this.argposition = instance.argposition;

		if (this.argposition!=null)
			for (int p= 0;p< instance.argposition.length;p++) {
				if (this.argposition[p]!=null)
					for(int a=0;a<instance.argposition[p].length;a++)
						this.argposition[p][a]=instance.argposition[p][a]-1;
			}


	}

	/**
	 * @return the sentence forms (excluding the root token) joined by spaces
	 */
	public String oneLine() {


		StringBuffer o = new StringBuffer();
		for(int i=1;i<this.length();i++) {

			if (i!=1)o.append(" ");
			o.append(this.forms[i]);
		}
		return o.toString();
	}

	/**
	 * Get the children of this instance
	 * @param head the head token position
	 * @return children of the head
	 */
	public ArrayList<Integer> getChildren(int head) {

		ArrayList<Integer> children = new ArrayList<Integer>();
		for(int i=0;i<length();i++) {
			if (heads[i]==head) children.add(i);
		}
		return children;
	}

	/**
	 * Rebuilds this instance from i, guaranteeing an artificial root token at
	 * position 0 (adds one if i does not already start with the root form).
	 */
	public void createWithRoot(SentenceData09 i) {

		int length = i.length();
		int offset = 0;
		if (! i.forms[0].equals(CONLLReader09.ROOT)) {
			length++;
			offset = -1;
		}



		forms = new String[length];
		gpos = new String[length];
		ppos = new String[length];
		plemmas = new String[length];
		plabels = new String[length];
		lemmas = new String[length];
		heads = new int[length];
		pheads = new int[length];
		ofeats = new String[length];
		pfeats = new String[length];
		labels = new String[length];
		fillp = new String[length];
		id = new String[length];
		feats = new String[forms.length][];

		for(int j = 1; j < length; j++) {
			forms[j] = i.forms[j+offset];
			ppos[j] = i.ppos[j+offset];
			gpos[j] = i.gpos[j+offset];

			labels[j] = i.labels[j+offset];
			heads[j] = i.heads[j+offset];



			if (i.pheads!=null) pheads[j] = i.pheads[j+offset];
			if (i.plabels!=null) plabels[j] = i.plabels[j+offset];


			if (i.lemmas!=null) lemmas[j] = i.lemmas[j+offset];

			plemmas[j] = i.plemmas[j+offset];


			// if (i.ofeats!=null) ofeats[j] = i.ofeats[j+offset];

			// NOTE(review): unlike the guarded copies above, this dereferences
			// i.ofeats unconditionally — NPE if the source has no ofeats.
			ofeats[j]= i.ofeats[j+offset].equals(CONLLWriter09.DASH)? "_" : i.ofeats[j+offset];

			// if (i.pfeats!=null) pfeats[j] = i.pfeats[j+offset];

			if (i.pfeats!=null && i.pfeats[j+offset]!=null) {
				if (i.pfeats[j+offset].equals(CONLLWriter09.DASH)) feats[j]=null;
				else {
					feats[j] =i.pfeats[j+offset].split(CONLLReader09.PIPE);

					// if (info[7].equals(CONLLWriter09.DASH)) it.feats[i]=null;
					// else {
					// it.feats[i] =info[7].split(PIPE);
					pfeats[j] = i.pfeats[j+offset];
					// }
				}
			}

			if (i.fillp!=null) fillp[j] = i.fillp[j+offset];
			if (i.id!=null) id[j] = i.id[j+offset];
		}



		// fill position 0 with the artificial root values
		forms[0] = CONLLReader09.ROOT;
		plemmas[0] = CONLLReader09.ROOT_LEMMA;
		fillp[0] = "N";
		lemmas[0] = CONLLReader09.ROOT_LEMMA;

		gpos[0] = CONLLReader09.ROOT_POS;
		ppos[0] = CONLLReader09.ROOT_POS;
		labels[0] = CONLLReader09.NO_TYPE;
		heads[0] = -1;
		plabels[0] = CONLLReader09.NO_TYPE;
		pheads[0] = -1;
		ofeats[0] = CONLLReader09.NO_TYPE;
		id[0] ="0";
	}


}

diff --git a/dependencyParser/basic/mate-tools/src/is2/data/Thesaurus.java b/dependencyParser/basic/mate-tools/src/is2/data/Thesaurus.java new file mode 100644 index 0000000..2d3677a --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/data/Thesaurus.java @@ -0,0 +1,194 @@
/**
 *
 */
package is2.data;

import is2.util.DB;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;

/**
 * @author Dr.
 Bernd Bohnet, 28.10.2010
 *
 * Maps each registered word id to the ids of its thesaurus neighbours, read
 * from a tab-separated cluster file of (word, neighbour) pairs sorted by word.
 */
final public class Thesaurus {

	public static final String LPATH = "LP";
	public static final String SPATH = "SP";

	// [word][p] p = [0:long-path | 1:short-path]
	final private int[][] word2path;

	public Thesaurus() {
		word2path =new int[0][];
	}

	/**
	 * Builds the thesaurus from a cluster file. The file is read twice: the
	 * first pass registers all words with the encoder, the second pass fills
	 * word2path with the neighbour ids of each word.
	 *
	 * @param clusterFile tab-separated (word, neighbour) pairs, grouped by word
	 * @param mf encoder used to map strings to integer ids
	 * @param ls prefix length limit (currently unused; see commented register call)
	 */
	public Thesaurus(String clusterFile, IEncoderPlus mf, int ls) {

		final String REGEX = "\t";

		// register words
		try {
			BufferedReader inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(clusterFile),"UTF-8"),32768);

			int cnt=0;
			String line;
			while ((line =inputReader.readLine())!=null) {

				cnt++;
				try {
					String[] split = line.split(REGEX);
					// mf.register(LPATH, split[0].length()<ls?split[0]:split[0].substring(0,ls));
					mf.register(PipeGen.WORD, split[0]);
					mf.register(PipeGen.WORD, split[1]);
				} catch(Exception e) {
					System.out.println("Error in cluster line "+cnt+" error: "+e.getMessage());
				}
			}
			System.out.println("read number of thesaury entries "+cnt);
			inputReader.close();

		} catch (Exception e) {
			e.printStackTrace();
		}

		word2path = new int[mf.getFeatureCounter().get(PipeGen.WORD)][];


		// insert words: second pass collects, per word, the ids of all its
		// neighbours; a group is flushed whenever the word id changes.
		try {
			String line;
			BufferedReader inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(clusterFile),"UTF-8"),32768);

			int startWd =-1;
			ArrayList<Integer> wrds = new ArrayList<Integer>();
			while ((line =inputReader.readLine())!=null) {

				String[] split = line.split(REGEX);
				int wd = mf.getValue(PipeGen.WORD, split[0]);
				// DB.println("wd "+wd+" "+startWd);
				if (startWd == wd) {
					int thesaurusWrd = mf.getValue(PipeGen.WORD, split[1]);
					if (thesaurusWrd!=wd) wrds.add(thesaurusWrd);
				} else if (startWd!=-1) {
					// word changed: store the collected group, then start the new one
					int[] ths = new int[wrds.size()];
					for(int k=0;k<ths.length;k++) ths[k]=wrds.get(k);
					word2path[startWd] = ths;
					// DB.println(""+wrds+" size "+ths.length);
					wrds.clear();
					int thesaurusWrd = mf.getValue(PipeGen.WORD, split[1]);
					if (thesaurusWrd!=wd) wrds.add(thesaurusWrd);
				}
				startWd=wd;
			}

			if (wrds.size()!=0) {
				// put rest of the words
				int[] ths = new int[wrds.size()];
				for(int k=0;k<ths.length;k++) ths[k]=wrds.get(k);
				word2path[startWd] = ths;
				// DB.println(""+wrds+" size "+ths.length);
				wrds.clear();




			}

			inputReader.close();
			int fill=0;
			for(int l = 0; l<word2path.length; l++ ){
				if (word2path[l]!=null) fill++;
			}
			/*
			for(int l = 0; l<word2path.length; l++ ){
				if (word2path[l][1]!=0) fillL++;
				if (word2path[l][1]<-1) System.out.println("lower "+word2path[l][1]);
			}
			*/
			System.out.println("filled "+fill+" of "+word2path.length);

		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Read the cluster
	 * @param dis stream positioned at data produced by {@link #write(DataOutputStream)}
	 * @throws IOException
	 */
	public Thesaurus(DataInputStream dis) throws IOException {

		word2path = new int[dis.readInt()][];
		for(int i =0;i<word2path.length;i++) {
			int len = dis.readInt();
			if (len>0) {
				word2path[i] = new int[len];
				for(int j =0;j<len;j++) {
					word2path[i][j] = dis.readInt();

				}
			}

			// NOTE(review): this readShort() is unconditional — it NPEs when
			// len==0 (word2path[i] is null) and it overwrites element 0 that was
			// just read above; write() emits no matching short. Looks like a bug
			// — confirm against the serialized file format before relying on it.
			word2path[i][0]=dis.readShort();
		}
		DB.println("Read cluster with "+word2path.length+" words ");
	}

	/**
	 * Write the cluster
	 * @param dos target stream; format: count, then per word (len, len ints)
	 * @throws IOException
	 */
	public void write(DataOutputStream dos) throws IOException {

		dos.writeInt(word2path.length);
		for(int[] i : word2path) {
			dos.writeInt(i==null?0:i.length);

			if (i!=null) {
				for(int j=0;j<i.length;j++) {

					dos.writeInt(i[j]);

				}

			}
		}

	}

	/**
	 * @param form the id of a word form
	 * @return the short path to the word form in the cluster

	final public int getSP(int form) {
		if (word2path.length<form) return -1;
		return word2path[form][0];
	}
	 */
	/**
	 * get the long path to a word form in the cluster
	 * NOTE(review): the bounds check uses length&lt;form, which still allows
	 * form==length and would throw AIOOBE — probably should be length&lt;=form.
	 * @param form the id of a word form
	 * @return the long path to the word, or -1 if absent
	 */
	final public int get(int form, int k) {
		if (word2path.length<form || word2path[form]==null) return -1;
		return word2path[form][k];
	}


}

diff --git a/dependencyParser/basic/mate-tools/src/is2/io/CONLLReader04.java b/dependencyParser/basic/mate-tools/src/is2/io/CONLLReader04.java new file mode 100644 index 0000000..4ca5254 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/io/CONLLReader04.java @@ -0,0 +1,272 @@


package is2.io;

import is2.data.Instances;
import is2.data.SentenceData09;
import is2.util.DB;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;



/**
 * This class reads files in the CONLL-08 and CONLL-09 format.
 * NOTE(review): despite the class comment, getNext() reads a 4-column layout
 * (form, pos, head, deprel) — presumably the CoNLL-2004 format; confirm.
 *
 * @author Bernd Bohnet
 */
public class CONLLReader04 {

	private static final String US = "_";
	private static final String REGEX = "\t";
	public static final String STRING = "*";
	public static final String PIPE = "\\|";
	public static final String NO_TYPE = "<no-type>";
	public static final String ROOT_POS = "<root-POS>";
	public static final String ROOT_LEMMA = "<root-LEMMA>";
	public static final String ROOT = "<root>";
	public static final String EMPTY_FEAT = "<ef>";

	private static final String NUMBER = "[0-9]+|[0-9]+\\.[0-9]+|[0-9]+[0-9,]+";
	private static final String NUM = "<num>";

	private BufferedReader inputReader;

	public static final int TASK08=8;
	public static final int TASK09=9;

	public static boolean normalizeOn =true;


	private int lineNumber = 0;

	public CONLLReader04(){}

	public CONLLReader04(String file){
		lineNumber=0;
		try {
			inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(file),"UTF-8"),32768); //,"UTF-8"
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	// the task parameter is accepted for interface parity but ignored here
	public CONLLReader04(String file, int task){
		this(file);
	}



	public void startReading(String file ){
		lineNumber=0;
		try {
			inputReader = new BufferedReader(new InputStreamReader(new
 FileInputStream(file),"UTF-8"),32768);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**i.forms[heads[l]-1]+" "+rel+" "+
	 * Read an instance
	 * @return an instance, or null at end of input
	 * @throws Exception
	 */
	public SentenceData09 getNext() throws Exception {

		try {

			ArrayList<String[]> lineList = new ArrayList<String[]>();

			String line = inputReader.readLine();

			// NOTE(review): skips lines of length < 2, not just empty lines —
			// CONLLReader06 uses length()==0 here; confirm the stricter skip is
			// intentional for this format.
			while(line !=null && line.length()<2) {
				line = inputReader.readLine();
				lineNumber++;
				System.out.println("skip empty line at line "+lineNumber);
			}

			// collect all token lines of the sentence until a blank/comment line
			while (line != null && line.length()!=0 && !line.startsWith(STRING) &&!line.startsWith(REGEX)) {
				lineList.add(line.split(REGEX));
				line = inputReader.readLine();
				lineNumber++;
			}



			int length = lineList.size();

			if(length == 0) {
				inputReader.close();
				return null;
			}

			SentenceData09 it = new SentenceData09();

			// column content
			// 1 id
			// 2 form
			// 3 lemma
			// 4 cpos-tag
			// 5 pos-tog
			// 6 feats
			// 7 head
			// 8 deprel
			// NOTE(review): the loop below actually reads only 4 columns
			// (info[0]=form, info[1]=pos, info[2]=head, info[3]=deprel).


			it.forms = new String[length+1];

			it.plemmas = new String[length+1];
			it.gpos = new String[length+1];
			it.labels = new String[length+1];
			it.heads = new int[length+1];
			it.pheads = new int[length+1];
			it.plabels = new String[length+1];

			it.ppos = new String[length+1];
			it.lemmas = new String[length+1];
			it.fillp = new String[length+1];
			it.feats = new String[length+1][];
			it.ofeats = new String[length+1];
			it.pfeats = new String[length+1];


			// position 0 holds the artificial root token
			it.forms[0] = ROOT;
			it.plemmas[0] = ROOT_LEMMA;
			it.fillp[0] = "N";
			it.lemmas[0] = ROOT_LEMMA;

			it.gpos[0] = ROOT_POS;
			it.ppos[0] = ROOT_POS;
			it.labels[0] = NO_TYPE;
			it.heads[0] = -1;
			it.plabels[0] = NO_TYPE;
			it.pheads[0] = -1;
			it.ofeats[0] = NO_TYPE;

			// root is 0 therefore start with 1

			for(int i = 1; i <= length; i++) {

				String[] info = lineList.get(i-1);

				it.forms[i] = info[0]; //normalize(

				it.lemmas[i] = "_";
				it.plemmas[i] ="_";

				// 3 cpos

				it.gpos[i] = info[1];
				it.ppos[i] = info[1];

				it.ofeats[i]="_";



				it.feats[i]=null;
				// it.feats[i] =info[5].split(PIPE);
				it.pfeats[i] = "_";


				if (info[2].equals(US)) it.heads[i]=-1;
				else it.heads[i] = Integer.parseInt(info[2]);// head



				it.labels[i] = info[3];


			}
			return it;

		} catch(Exception e) {
			System.out.println("\n!!! Error in input file at line : "+lineNumber+" "+e.toString());
			e.printStackTrace();
			throw new Exception();
			// return null;
		}

	}

	/**
	 * Read an instance and store it in a compressed format
	 * @param is target instance store (may be null: then only parsed and returned)
	 * @return the parsed instance, or null at end of input
	 * @throws IOException
	 */
	final public SentenceData09 getNext(Instances is) throws Exception {

		SentenceData09 it = getNext();

		if (is !=null) insert(is,it);

		return it;

	}




	/**
	 * Copies a parsed sentence into the compressed Instances store, applying
	 * normalization and falling back from predicted to gold columns where the
	 * predicted value is missing ("_").
	 */
	final public boolean insert(Instances is, SentenceData09 it) throws IOException {

		try {

			if(it == null) {
				inputReader.close();
				return false;
			}

			int i= is.createInstance09(it.length());

			for(int p = 0; p < it.length(); p++) {

				is.setForm(i, p, normalize(it.forms[p]));
				is.setGPos(i, p, it.gpos[p]);

				if (it.ppos[p]==null||it.ppos[p].equals(US)) {
					is.setPPoss(i, p, it.gpos[p]);
				} else is.setPPoss(i, p, it.ppos[p]);


				if (it.plemmas[p]==null ||it.plemmas[p].equals(US)) {
					is.setLemma(i, p, normalize(it.forms[p]));
				} else is.setLemma(i, p, normalize(it.plemmas[p]));


				is.setFeats(i,p,it.feats[p]);


				is.setFeature(i,p,it.ofeats[p]);


				is.setRel(i,p,it.labels[p]);
				if (it.plabels!=null) is.setPRel(i,p,it.plabels[p]);
				is.setHead(i,p,it.heads[p]);
				if (it.pheads!=null) is.setPHead(i,p,it.pheads[p]);

				// FILLPRED column: "Y" marks a predicate token
				if (it.fillp!=null && it.fillp[p]!=null && it.fillp[p].startsWith("Y")) is.pfill[i].set(p);
				else is.pfill[i].clear(p);
			}

			if (is.createSem(i,it)) {
				DB.println("count "+i+" len "+it.length());
				DB.println(it.printSem());
			}
		} catch(Exception e ){
			DB.println("head "+it);
			e.printStackTrace();
		}
		return true;

	}

	/** Replaces number-like tokens with the NUM placeholder when enabled. */
	public static String normalize (String s) {
		if (!normalizeOn) return s;
		if(s.matches(NUMBER)) return NUM;
		return s;
	}

}

diff --git a/dependencyParser/basic/mate-tools/src/is2/io/CONLLReader06.java b/dependencyParser/basic/mate-tools/src/is2/io/CONLLReader06.java new file mode 100755 index 0000000..351fa04 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/io/CONLLReader06.java @@ -0,0 +1,275 @@


package is2.io;

import is2.data.Instances;
import is2.data.SentenceData09;
import is2.util.DB;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;



/**
 * This class reads files in the CONLL-08 and CONLL-09 format.
 * NOTE(review): getNext() reads the 8-column CoNLL-X (2006) layout
 * (id, form, lemma, cpos, pos, feats, head, deprel) — confirm.
 *
 * @author Bernd Bohnet
 */
public class CONLLReader06 {

	private static final String US = "_";
	private static final String REGEX = "\t";
	public static final String STRING = "*";
	public static final String PIPE = "\\|";
	public static final String NO_TYPE = "<no-type>";
	public static final String ROOT_POS = "<root-POS>";
	public static final String ROOT_LEMMA = "<root-LEMMA>";
	public static final String ROOT = "<root>";
	public static final String EMPTY_FEAT = "<ef>";

	private static final String NUMBER = "[0-9]+|[0-9]+\\.[0-9]+|[0-9]+[0-9,]+";
	private static final String NUM = "<num>";

	private BufferedReader inputReader;

	public static final int TASK08=8;
	public static final int TASK09=9;

	public static boolean normalizeOn =true;


	private int lineNumber = 0;

	public CONLLReader06(){}

	public CONLLReader06(String file){
		lineNumber=0;
		try {
			inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(file),"UTF-8"),32768); //,"UTF-8"
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	// the task parameter is accepted for interface parity but ignored here
	public CONLLReader06(String file, int task){
		this(file);
	}



	public void startReading(String file ){
		lineNumber=0;
		try {
			inputReader = new BufferedReader(new InputStreamReader(new
 FileInputStream(file),"UTF-8"),32768);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**i.forms[heads[l]-1]+" "+rel+" "+
	 * Read an instance
	 * @return an instance, or null at end of input
	 * @throws Exception
	 */
	public SentenceData09 getNext() throws Exception {

		try {

			ArrayList<String[]> lineList = new ArrayList<String[]>();

			String line = inputReader.readLine();

			// skip leading blank lines
			while(line !=null && line.length()==0) {
				line = inputReader.readLine();
				lineNumber++;
				System.out.println("skip empty line at line "+lineNumber);
			}

			// collect all token lines of the sentence until a blank/comment line
			while (line != null && line.length()!=0 && !line.startsWith(STRING) &&!line.startsWith(REGEX)) {
				lineList.add(line.split(REGEX));
				line = inputReader.readLine();
				lineNumber++;
			}



			int length = lineList.size();

			if(length == 0) {
				inputReader.close();
				return null;
			}

			SentenceData09 it = new SentenceData09();

			// column content
			// 1 id
			// 2 form
			// 3 lemma
			// 4 cpos-tag
			// 5 pos-tog
			// 6 feats
			// 7 head
			// 8 deprel


			it.forms = new String[length+1];

			it.plemmas = new String[length+1];
			it.gpos = new String[length+1];
			it.labels = new String[length+1];
			it.heads = new int[length+1];
			it.pheads = new int[length+1];
			it.plabels = new String[length+1];

			it.ppos = new String[length+1];
			it.lemmas = new String[length+1];
			it.fillp = new String[length+1];
			it.feats = new String[length+1][];
			it.ofeats = new String[length+1];
			it.pfeats = new String[length+1];


			// position 0 holds the artificial root token
			it.forms[0] = ROOT;
			it.plemmas[0] = ROOT_LEMMA;
			it.fillp[0] = "N";
			it.lemmas[0] = ROOT_LEMMA;

			it.gpos[0] = ROOT_POS;
			it.ppos[0] = ROOT_POS;
			it.labels[0] = NO_TYPE;
			it.heads[0] = -1;
			it.plabels[0] = NO_TYPE;
			it.pheads[0] = -1;
			it.ofeats[0] = NO_TYPE;

			// root is 0 therefore start with 1

			for(int i = 1; i <= length; i++) {

				String[] info = lineList.get(i-1);

				it.forms[i] = info[1]; //normalize(

				it.lemmas[i] = info[2];
				it.plemmas[i] =info[2];

				// 3 cpos

				it.gpos[i] = info[3];
				it.ppos[i] = info[4];

				it.ofeats[i]=info[5].equals(CONLLWriter09.DASH)? "": info[5];



				if (info[5].equals(CONLLWriter09.DASH)) it.feats[i]=null;
				else {
					it.feats[i] =info[5].split(PIPE);
					it.pfeats[i] = info[5];
				}

				if (info[6].equals(US)) it.heads[i]=-1;
				else it.heads[i] = Integer.parseInt(info[6]);// head


//				it.phead[i]=info[9].equals(US) ? it.phead[i]=-1: Integer.parseInt(info[9]);// head

				it.labels[i] = info[7];
//				it.pedge[i] = info[11];


			}
			return it;

		} catch(Exception e) {
			System.out.println("\n!!! Error in input file at line : "+lineNumber+" "+e.toString());
			e.printStackTrace();
			throw new Exception();
			// return null;
		}

	}

	/**
	 * Read an instance and store it in a compressed format
	 * @param is target instance store (may be null: then only parsed and returned)
	 * @return the parsed instance, or null at end of input
	 * @throws IOException
	 */
	final public SentenceData09 getNext(Instances is) throws Exception {

		SentenceData09 it = getNext();

		if (is !=null) insert(is,it);

		return it;

	}




	/**
	 * Copies a parsed sentence into the compressed Instances store, applying
	 * normalization and falling back from predicted to gold columns where the
	 * predicted value is missing ("_").
	 */
	final public boolean insert(Instances is, SentenceData09 it) throws IOException {

		try {

			if(it == null) {
				inputReader.close();
				return false;
			}

			int i= is.createInstance09(it.length());

			for(int p = 0; p < it.length(); p++) {

				is.setForm(i, p, normalize(it.forms[p]));
				is.setGPos(i, p, it.gpos[p]);

				if (it.ppos[p]==null||it.ppos[p].equals(US)) {
					is.setPPoss(i, p, it.gpos[p]);
				} else is.setPPoss(i, p, it.ppos[p]);


				if (it.plemmas[p]==null ||it.plemmas[p].equals(US)) {
					is.setLemma(i, p, normalize(it.forms[p]));
				} else is.setLemma(i, p, normalize(it.plemmas[p]));


				is.setFeats(i,p,it.feats[p]);


				is.setFeature(i,p,it.ofeats[p]);


				is.setRel(i,p,it.labels[p]);
				if (it.plabels!=null) is.setPRel(i,p,it.plabels[p]);
				is.setHead(i,p,it.heads[p]);
				if (it.pheads!=null) is.setPHead(i,p,it.pheads[p]);

				// FILLPRED column: "Y" marks a predicate token
				if (it.fillp!=null && it.fillp[p]!=null && it.fillp[p].startsWith("Y")) is.pfill[i].set(p);
				else is.pfill[i].clear(p);
			}

			if (is.createSem(i,it)) {
				DB.println("count "+i+" len "+it.length());
				DB.println(it.printSem());
			}
		} catch(Exception e ){
			DB.println("head "+it);
			e.printStackTrace();
		}
		return true;

	}

	/** Replaces number-like tokens with the NUM placeholder when enabled. */
	public static String normalize (String s) {
		if (!normalizeOn) return s;
		if(s.matches(NUMBER)) return NUM;
		return s;
	}

}

diff --git a/dependencyParser/basic/mate-tools/src/is2/io/CONLLReader08.java b/dependencyParser/basic/mate-tools/src/is2/io/CONLLReader08.java new file mode 100644 index 0000000..a6194a3 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/io/CONLLReader08.java @@ -0,0 +1,413 @@


package is2.io;

import is2.data.Instances;
import is2.data.SentenceData09;
import is2.util.DB;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;



/**
 * This class reads files in the CONLL-09 format.
 *
 * @author Bernd Bohnet
 */
public class CONLLReader08 extends IOGenerals {


	private BufferedReader inputReader;

	public static final boolean NORMALIZE = true;

	public static final boolean NO_NORMALIZE = false;

	public boolean normalizeOn =true;



	private int format = 0;

	private int lineNumber = 0;


	public CONLLReader08(boolean normalize){

		normalizeOn=normalize;
	}

	public CONLLReader08(String file){
		lineNumber=0;
		try {
			inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(file),"UTF-8"),32768);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	public CONLLReader08(String file, boolean normalize){
		this(file);
		normalizeOn=normalize;
	}

	/**
	 * Sets the input format:
	 *
	 * CONLL09 is standard,
	 * ONE_LINE
	 *
	 * @param format the format (see the constants starting with F_).
+ */ + public void setInputFormat(int format) { + this.format=format; + } + + + + /** + * + */ + public CONLLReader08() {} + + /** + * @param testfile + * @param formatTask + */ + public CONLLReader08(String testfile, int formatTask) { + this(testfile); + } + + public void startReading(String file ){ + lineNumber=0; + try { + inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(file),"UTF-8"),32768); + } catch (Exception e) { + e.printStackTrace(); + } + } + + public SentenceData09 getNext() { + + if (F_ONE_LINE == format) return getNextOneLine(); + else return getNextCoNLL09(); + } + + /** + * @return + */ + private SentenceData09 getNextOneLine() { + + String line=null; + int i=0; + try { + + + line = inputReader.readLine(); + lineNumber++; + + if (line==null ) { + inputReader.close(); + return null; + } + + String[] tokens = line.split(" "); + int length = tokens.length; + if (line.isEmpty()) length=0; + + SentenceData09 it = new SentenceData09(); + + it.forms = new String[length+1]; + + it.plemmas = new String[length+1]; + // it.ppos = new String[length+1]; + it.gpos = new String[length+1]; + it.labels = new String[length+1]; + it.heads = new int[length+1]; + it.pheads = new int[length+1]; + it.plabels = new String[length+1]; + + it.ppos = new String[length+1]; + it.lemmas = new String[length+1]; + it.fillp = new String[length+1]; + it.feats = new String[length+1][]; + it.ofeats = new String[length+1]; + it.pfeats = new String[length+1]; + it.id = new String[length+1]; + + it.forms[0] = ROOT; + it.plemmas[0] = ROOT_LEMMA; + it.fillp[0] = "N"; + it.lemmas[0] = ROOT_LEMMA; + + it.gpos[0] = ROOT_POS; + it.ppos[0] = ROOT_POS; + it.labels[0] = NO_TYPE; + it.heads[0] = -1; + it.plabels[0] = NO_TYPE; + it.pheads[0] = -1; + it.ofeats[0] = NO_TYPE; + it.id[0] ="0"; + + // root is 0 therefore start with 1 + + for(i = 1; i <= length; i++) { + + it.id[i] = ""+i; + + it.forms[i] = this.normalizeOn?normalize(tokens[i-1]):tokens[i-1]; //normalize( + + + 
} + + return it; + + } catch(Exception e) { + System.out.println("\n!!! Error in input file sentence before line: "+lineNumber+" (in sentence line "+i+" ) "+e.toString()); + e.printStackTrace(); + System.exit(0); + + + + + //throw new Exception(); + return null; + } + + + + } + + /**i.forms[heads[l]-1]+" "+rel+" "+ + * Read a instance + * @return a instance + * @throws Exception + */ + + public SentenceData09 getNextCoNLL09() { + + String line=null; + int i=0; + try { + + ArrayList<String[]> lineList = new ArrayList<String[]>(); + + line = inputReader.readLine(); + lineNumber++; + + while(line !=null && line.length()==0) { + line = inputReader.readLine(); + lineNumber++; + System.out.println("skip empty line at line "+lineNumber); + } + + while (line != null && line.length()!=0 && !line.startsWith(STRING) &&!line.startsWith(REGEX)) { + lineList.add(line.split(REGEX)); + line = inputReader.readLine(); + lineNumber++; + } + + + + int length = lineList.size(); + + if(length == 0) { + inputReader.close(); + return null; + } + + SentenceData09 it = new SentenceData09(); + + it.forms = new String[length+1]; + + it.plemmas = new String[length+1]; + // it.ppos = new String[length+1]; + it.gpos = new String[length+1]; + it.labels = new String[length+1]; + it.heads = new int[length+1]; + it.pheads = new int[length+1]; + it.plabels = new String[length+1]; + + it.ppos = new String[length+1]; + it.lemmas = new String[length+1]; + it.fillp = new String[length+1]; + it.feats = new String[length+1][]; + it.ofeats = new String[length+1]; + it.pfeats = new String[length+1]; + it.id = new String[length+1]; + + it.forms[0] = ROOT; + it.plemmas[0] = ROOT_LEMMA; + it.fillp[0] = "N"; + it.lemmas[0] = ROOT_LEMMA; + + it.gpos[0] = ROOT_POS; + it.ppos[0] = ROOT_POS; + it.labels[0] = NO_TYPE; + it.heads[0] = -1; + it.plabels[0] = NO_TYPE; + it.pheads[0] = -1; + it.ofeats[0] = NO_TYPE; + it.id[0] ="0"; + + // root is 0 therefore start with 1 + + for(i = 1; i <= length; i++) { + + + + String[] 
info = lineList.get(i-1); + + it.id[i] = info[0]; + it.forms[i] = info[5]; //normalize( + if (info.length<3) continue; + + //it.lemmas[i] = info[2]; + it.plemmas[i] =info[6]; + it.gpos[i] = info[3]; + + if (info.length<5) continue; + it.ppos[i] = info[7];//.split("\\|")[0]; + + // feat 6 + // pfeat 7 + + // this causes trouble in the perl eval09 scirpt + //it.ofeats[i]=info[6].equals(CONLLWriter09.DASH)? "" : info[6]; + + // now we try underscore + it.ofeats[i]="_"; + + + // it.feats[i] ="_"; + it.pfeats[i] = "_"; + + + + + if (info[8].equals(US)) it.heads[i]=-1; + else it.heads[i] = Integer.parseInt(info[8]);// head + + it.pheads[i]=-1;// head + + it.labels[i] = info[9]; + it.plabels[i] = "_"; + + it.fillp[i]=info[10]; + + if (info.length>11) { + if (!info[10].equals(US)) it.addPredicate(i,info[10]); + for(int k=11;k<info.length;k++) it.addArgument(i,k-11,info[k]); + } + + + + + } + return it; + + } catch(Exception e) { + System.out.println("\n!!! Error in input file sentence before line: "+lineNumber+" (in sentence line "+i+" ) "+e.toString()); + e.printStackTrace(); + System.exit(0); + + + + + //throw new Exception(); + return null; + } + + } + + /** + * Read a instance an store it in a compressed format + * @param is + * @return + * @throws IOException + */ + final public SentenceData09 getNext(Instances is) { + + SentenceData09 it = getNext(); + + if (is !=null) insert(is,it); + + return it; + + } + + + + + final public boolean insert(Instances is, SentenceData09 it) { + + try { + + if(it == null) { + inputReader.close(); + return false; + } + + int i= is.createInstance09(it.length()); + + for(int p = 0; p < it.length(); p++) { + + is.setForm(i, p, normalize(it.forms[p])); + is.setGPos(i, p, it.gpos[p]); + + // System.out.println(""+is.gpos[i][p]); + + if (it.ppos[p]==null||it.ppos[p].equals(US)) { + + is.setPPoss(i, p, it.gpos[p]); + } else is.setPPoss(i, p, it.ppos[p]); + + + if (it.plemmas[p]==null ||it.plemmas[p].equals(US)) { + is.setLemma(i, p, 
normalize(it.forms[p])); + } else is.setLemma(i, p, normalize(it.plemmas[p])); + + if (it.lemmas!=null) + if (it.lemmas[p]==null ) { // ||it.org_lemmas[p].equals(US) that harms a lot the lemmatizer + is.setGLemma(i, p, it.plemmas[p]); + } else is.setGLemma(i, p, it.lemmas[p]); + + + if (it.feats!=null && it.feats[p]!=null) is.setFeats(i,p,it.feats[p]); + + if (it.ofeats!=null) is.setFeature(i,p,it.ofeats[p]); + + + is.setRel(i,p,it.labels[p]); + if (it.plabels!=null) is.setPRel(i,p,it.plabels[p]); + + is.setHead(i,p,it.heads[p]); + if (it.pheads!=null) is.setPHead(i,p,it.pheads[p]); + + if (it.fillp!=null && it.fillp[p]!=null && it.fillp[p].startsWith("Y")) is.pfill[i].set(p); + else is.pfill[i].clear(p); + } + + if (is.createSem(i,it)) { + DB.println("count "+i+" len "+it.length()); + DB.println(it.printSem()); + } + } catch(Exception e ){ + DB.println("head "+it); + e.printStackTrace(); + } + return true; + + } + public String normalize (String s) { + if (!normalizeOn) return s; + if(s.matches(NUMBER)) return NUM; + return s; + } + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/io/CONLLReader09.java b/dependencyParser/basic/mate-tools/src/is2/io/CONLLReader09.java new file mode 100755 index 0000000..c020579 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/io/CONLLReader09.java @@ -0,0 +1,411 @@ + + +package is2.io; + +import is2.data.Instances; +import is2.data.SentenceData09; +import is2.util.DB; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.ArrayList; + + + +/** + * This class reads files in the CONLL-09 format. 
+ * + * @author Bernd Bohnet + */ +public class CONLLReader09 extends IOGenerals { + + + private BufferedReader inputReader; + + public static final boolean NORMALIZE = true; + + public static final boolean NO_NORMALIZE = false; + + public boolean normalizeOn =true; + + static public String joint =""; + + private int format = 0; + + private int lineNumber = 0; + + + public CONLLReader09(boolean normalize){ + + normalizeOn=normalize; + } + + public CONLLReader09(String file){ + lineNumber=0; + try { + inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(file),"UTF-8"),32768); + } catch (Exception e) { + e.printStackTrace(); + } + } + + public CONLLReader09(String file, boolean normalize){ + this(file); + normalizeOn=normalize; + } + + /** + * Sets the input format: + * + * CONLL09 is standard, + * ONE_LINE + * + * @param format the fomrat (see the constants starting with F_). + */ + public void setInputFormat(int format) { + this.format=format; + } + + + + /** + * + */ + public CONLLReader09() {} + + /** + * @param testfile + * @param formatTask + */ + public CONLLReader09(String testfile, int formatTask) { + this(testfile); + } + + public void startReading(String file ){ + lineNumber=0; + try { + inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(file),"UTF-8"),32768); + } catch (Exception e) { + e.printStackTrace(); + } + } + + public SentenceData09 getNext() { + + if (F_ONE_LINE == format) return getNextOneLine(); + else return getNextCoNLL09(); + } + + /** + * @return + */ + private SentenceData09 getNextOneLine() { + + String line=null; + int i=0; + try { + + + line = inputReader.readLine(); + lineNumber++; + + if (line==null ) { + inputReader.close(); + return null; + } + + String[] tokens = line.split(" "); + int length = tokens.length; + if (line.isEmpty()) length=0; + + SentenceData09 it = new SentenceData09(); + + it.forms = new String[length+1]; + + it.plemmas = new String[length+1]; + // it.ppos = new 
String[length+1]; + it.gpos = new String[length+1]; + it.labels = new String[length+1]; + it.heads = new int[length+1]; + it.pheads = new int[length+1]; + it.plabels = new String[length+1]; + + it.ppos = new String[length+1]; + it.lemmas = new String[length+1]; + it.fillp = new String[length+1]; + it.feats = new String[length+1][]; + it.ofeats = new String[length+1]; + it.pfeats = new String[length+1]; + it.id = new String[length+1]; + + it.forms[0] = ROOT; + it.plemmas[0] = ROOT_LEMMA; + it.fillp[0] = "N"; + it.lemmas[0] = ROOT_LEMMA; + + it.gpos[0] = ROOT_POS; + it.ppos[0] = ROOT_POS; + it.labels[0] = NO_TYPE; + it.heads[0] = -1; + it.plabels[0] = NO_TYPE; + it.pheads[0] = -1; + it.ofeats[0] = NO_TYPE; + it.id[0] ="0"; + + // root is 0 therefore start with 1 + + for(i = 1; i <= length; i++) { + + it.id[i] = ""+i; + + it.forms[i] = this.normalizeOn?normalize(tokens[i-1]):tokens[i-1]; //normalize( + + + } + + return it; + + } catch(Exception e) { + System.out.println("\n!!! Error in input file sentence before line: "+lineNumber+" (in sentence line "+i+" ) "+e.toString()); + e.printStackTrace(); + + + + + + //throw new Exception(); + return null; + } + + + + } + + /**i.forms[heads[l]-1]+" "+rel+" "+ + * Read a instance + * @return a instance + * @throws Exception + */ + + public SentenceData09 getNextCoNLL09() { + + String line=null; + int i=0; + try { + + ArrayList<String[]> lineList = new ArrayList<String[]>(); + + line = inputReader.readLine(); + lineNumber++; + + while(line !=null && line.length()==0) { + line = inputReader.readLine(); + lineNumber++; + System.out.println("skip empty line at line "+lineNumber); + } + + while (line != null && line.length()!=0 && !line.startsWith(STRING) &&!line.startsWith(REGEX)) { + lineList.add(line.split(REGEX)); + line = inputReader.readLine(); + lineNumber++; + } + + + + int length = lineList.size(); + + if(length == 0) { + inputReader.close(); + return null; + } + + SentenceData09 it = new SentenceData09(); + + it.forms = 
new String[length+1]; + + it.plemmas = new String[length+1]; + // it.ppos = new String[length+1]; + it.gpos = new String[length+1]; + it.labels = new String[length+1]; + it.heads = new int[length+1]; + it.pheads = new int[length+1]; + it.plabels = new String[length+1]; + + it.ppos = new String[length+1]; + it.lemmas = new String[length+1]; + it.fillp = new String[length+1]; + it.feats = new String[length+1][]; + it.ofeats = new String[length+1]; + it.pfeats = new String[length+1]; + it.id = new String[length+1]; + + it.forms[0] = ROOT; + it.plemmas[0] = ROOT_LEMMA; + it.fillp[0] = "N"; + it.lemmas[0] = ROOT_LEMMA; + + it.gpos[0] = ROOT_POS; + it.ppos[0] = ROOT_POS; + it.labels[0] = NO_TYPE; + it.heads[0] = -1; + it.plabels[0] = NO_TYPE; + it.pheads[0] = -1; + it.ofeats[0] = NO_TYPE; + it.id[0] ="0"; + + // root is 0 therefore start with 1 + + for(i = 1; i <= length; i++) { + + + + String[] info = lineList.get(i-1); + + it.id[i] = info[0]; + it.forms[i] = info[1]; //normalize( + if (info.length<3) continue; + + it.lemmas[i] = info[2]; + it.plemmas[i] =info[3]; + it.gpos[i] = info[4]; + + if (info.length<5) continue; + it.ppos[i] = info[5];//.split("\\|")[0]; + // feat 6 + + // now we try underscore + it.ofeats[i]=info[6].equals(CONLLWriter09.DASH)? "_" : info[6]; + + if (info[7].equals(CONLLWriter09.DASH)) it.feats[i]=null; + else { + it.feats[i] =info[7].split(PIPE); + it.pfeats[i] = info[7]; + } + + + + if (info[8].equals(US))it.heads[i]=-1; + else it.heads[i] = Integer.parseInt(info[8]);// head + + it.pheads[i]=info[9].equals(US) ? it.pheads[i]=-1: Integer.parseInt(info[9]);// head + + it.labels[i] = info[10]; + it.plabels[i] = info[11]; + it.fillp[i]=info[12]; + + if (info.length>13) { + if (!info[13].equals(US)) it.addPredicate(i,info[13]); + for(int k=14;k<info.length;k++) it.addArgument(i,k-14,info[k]); + + } + + + + + } + return it; + + } catch(Exception e) { + System.out.println("\n!!! 
Error in input file sentence before line: "+lineNumber+" (in sentence line "+i+" ) "+e.toString()); + e.printStackTrace(); + System.exit(0); + + + + + //throw new Exception(); + return null; + } + + } + + /** + * Read a instance an store it in a compressed format + * @param is + * @return + * @throws IOException + */ + final public SentenceData09 getNext(Instances is) { + + SentenceData09 it = getNext(); + + if (is !=null) insert(is,it); + + return it; + + } + + + + + final public boolean insert(Instances is, SentenceData09 it) { + + try { + + if(it == null) { + inputReader.close(); + return false; + } + + int i= is.createInstance09(it.length()); + + for(int p = 0; p < it.length(); p++) { + + is.setForm(i, p, normalize(it.forms[p])); + // is.setFormOrg(i, p, it.forms[p]); + is.setGPos(i, p, it.gpos[p]); + + // System.out.println(""+is.gpos[i][p]); + + if (it.ppos[p]==null||it.ppos[p].equals(US)) { + + is.setPPoss(i, p, it.gpos[p]); + } else is.setPPoss(i, p, it.ppos[p]); + + + if (it.plemmas[p]==null ||it.plemmas[p].equals(US)) { + is.setLemma(i, p, normalize(it.forms[p])); + } else is.setLemma(i, p, normalize(it.plemmas[p])); + + if (it.lemmas!=null) + if (it.lemmas[p]==null ) { // ||it.org_lemmas[p].equals(US) that harms a lot the lemmatizer + is.setGLemma(i, p, it.plemmas[p]); + } else is.setGLemma(i, p, it.lemmas[p]); + + + if (it.feats!=null && it.feats[p]!=null) is.setFeats(i,p,it.feats[p]); + + if (it.ofeats!=null) is.setFeature(i,p,it.ofeats[p]); + if (it.pfeats!=null) is.setPFeature(i,p,it.pfeats[p]); + + + is.setRel(i,p,it.labels[p]); + if (it.plabels!=null) is.setPRel(i,p,it.plabels[p]); + + is.setHead(i,p,it.heads[p]); + if (it.pheads!=null) is.setPHead(i,p,it.pheads[p]); + + if (it.fillp!=null && it.fillp[p]!=null && it.fillp[p].startsWith("Y")) is.pfill[i].set(p); + else is.pfill[i].clear(p); + } + + if (is.createSem(i,it)) { + DB.println("count "+i+" len "+it.length()); + DB.println(it.printSem()); + } + } catch(Exception e ){ + DB.println("head 
"+it); + e.printStackTrace(); + } + return true; + + } + public String normalize (String s) { + if (!normalizeOn) return s; + if(s.matches(NUMBER)) return NUM; + return s; + } + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/io/CONLLWriter06.java b/dependencyParser/basic/mate-tools/src/is2/io/CONLLWriter06.java new file mode 100755 index 0000000..26762bc --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/io/CONLLWriter06.java @@ -0,0 +1,193 @@ +package is2.io; + +import is2.data.SentenceData09; +import is2.util.DB; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.util.StringTokenizer; + + +public class CONLLWriter06 { + + public static final String DASH = "_"; + + protected BufferedWriter writer; + + public CONLLWriter06 () { } + + + + public static void main(String args[]) throws IOException { + + + if (args.length==2) { + File f = new File(args[0]); + File f2 = new File(args[1]); + // BufferedReader bf = new BufferedReader(new FileInputStream(new File(args[0]),"UTF-8"),32768); + BufferedReader ir = new BufferedReader(new InputStreamReader(new FileInputStream(f),"ISO-8859"),32768); + BufferedWriter br = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f2),"UTF-8"));; + boolean found =false; + boolean tab =false; + while(true) { + String l = ir.readLine(); + if (l==null) break; + String x =l.trim(); + if (x.endsWith("\t")) tab=true; + br.write(x); + br.newLine(); + if (!l.equals(x)) found =true; + + } + ir.close(); + br.flush(); + br.close(); + + if (found) DB.println("found diff. found tab? 
"+tab); + } else if (args.length==3) { + File f1 = new File(args[1]); + File f2 = new File(args[2]); + + BufferedReader ir1 = new BufferedReader(new InputStreamReader(new FileInputStream(f1),"ISO-8859"),32768); + BufferedReader ir2 = new BufferedReader(new InputStreamReader(new FileInputStream(f2),"UTF-8"),32768); + + int line =0, alltabs1=0,alltabs2=0; + while(true) { + String l1 = ir1.readLine(); + String l2 = ir2.readLine(); + + if (l1==null && l2!=null) DB.println("files do not end at the same line "); + if (l1!=null && l2==null) DB.println("files do not end at the same line "); + if (l1==null ) break; + StringTokenizer t1 = new StringTokenizer(l1,"\t"); + StringTokenizer t2 = new StringTokenizer(l2,"\t"); + int tabs1=0; + while(t1.hasMoreTokens()) { + + t1.nextElement(); + tabs1++; + alltabs1++; + } + + int tabs2=0; + while(t2.hasMoreTokens()) { + + t2.nextElement(); + tabs2++; + alltabs2++; + } + line ++; + if (tabs1!=tabs2) { + DB.println("number of tabs different in line "+line+" file1-tabs "+tabs1+" file2-tabs "+tabs2); + System.exit(0); + } + + + } + DB.println("checked lines "+line+" with tabs in file 1 "+alltabs1+" in file2 "+alltabs2); + + } else { + File f = new File(args[0]); + String[] dir =f.list(); + for(String fx :dir) { + BufferedReader ir = new BufferedReader(new InputStreamReader(new FileInputStream(args[0]+File.separatorChar+fx),"UTF-8"),32768); + System.out.println("check file "+fx); + while(true) { + String l = ir.readLine(); + if (l==null) break; + if (l.endsWith("\t")) { + DB.println("found tab in file "+fx); + break; + } + } + ir.close(); + } + } + + } + + +// public int version = CONLLReader09.TASK08; + + public CONLLWriter06 (String file) { + + try { + writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file),"UTF-8")); + } catch (Exception e) { + e.printStackTrace(); + } + } + + public CONLLWriter06(String outfile, int formatTask) { + this(outfile); + // version = formatTask; + } + + public void 
write(SentenceData09 inst) throws IOException { + + for (int i=0; i<inst.length(); i++) { + + + writer.write(Integer.toString(i+1)); writer.write('\t'); // id + writer.write(inst.forms[i]); writer.write('\t'); // form + + if (inst.lemmas!=null && inst.lemmas[i]!=null) { + writer.write(inst.lemmas[i]); + } + else writer.write(DASH); // lemma + writer.write('\t'); + +// writer.write(DASH); // cpos +// writer.write('\t'); + + + writer.write(inst.gpos[i]); // cpos has to be included + writer.write('\t'); + + writer.write(inst.gpos[i]); // gpos + writer.write('\t'); + + + if (inst.ofeats[i].isEmpty()||inst.ofeats[i].equals(" ")) writer.write(DASH); + else writer.write(inst.ofeats[i]); + writer.write('\t'); + + + //writer.write(DASH); writer.write('\t'); // pfeat + + writer.write(Integer.toString(inst.heads[i])); writer.write('\t'); // head + + if (inst.labels[i]!=null) writer.write(inst.labels[i]); // rel + else writer.write(DASH); + writer.write('\t'); + + writer.write(DASH); + writer.write('\t'); + + writer.write(DASH); + writer.write('\t'); + + + writer.newLine(); + } + writer.newLine(); + + } + + + + public void finishWriting () throws IOException { + writer.flush(); + writer.close(); + } + + + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/io/CONLLWriter09.java b/dependencyParser/basic/mate-tools/src/is2/io/CONLLWriter09.java new file mode 100755 index 0000000..e7a92a5 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/io/CONLLWriter09.java @@ -0,0 +1,307 @@ +package is2.io; + +import is2.data.SentenceData09; +import is2.util.DB; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.util.StringTokenizer; + + +public class CONLLWriter09 extends IOGenerals { + + + int format =0; + + public static final String 
DASH = "_"; + + public static final boolean NO_ROOT = true, ROOT = false; + + protected BufferedWriter writer; + + public CONLLWriter09 () { } + + public static void main(String args[]) throws IOException { + + + if (args.length==2) { + File f = new File(args[0]); + File f2 = new File(args[1]); + BufferedReader ir = new BufferedReader(new InputStreamReader(new FileInputStream(f),"UTF-8"),32768); + BufferedWriter br = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f2),"UTF8"));; + boolean found =false; + boolean tab =false; + while(true) { + String l = ir.readLine(); + if (l==null) break; + String x =l.trim(); + if (x.endsWith("\t")) tab=true; + br.write(x); + br.newLine(); + if (!l.equals(x)) found =true; + + } + ir.close(); + br.flush(); + br.close(); + + if (found) DB.println("found diff. found tab? "+tab); + } else if (args.length==3) { + File f1 = new File(args[1]); + File f2 = new File(args[2]); + + BufferedReader ir1 = new BufferedReader(new InputStreamReader(new FileInputStream(f1),"UTF-8"),32768); + BufferedReader ir2 = new BufferedReader(new InputStreamReader(new FileInputStream(f2),"UTF-8"),32768); + + int line =0, alltabs1=0,alltabs2=0; + while(true) { + String l1 = ir1.readLine(); + String l2 = ir2.readLine(); + + if (l1==null && l2!=null) DB.println("files do not end at the same line "); + if (l1!=null && l2==null) DB.println("files do not end at the same line "); + if (l1==null ) break; + StringTokenizer t1 = new StringTokenizer(l1,"\t"); + StringTokenizer t2 = new StringTokenizer(l2,"\t"); + int tabs1=0; + while(t1.hasMoreTokens()) { + + t1.nextElement(); + tabs1++; + alltabs1++; + } + + int tabs2=0; + while(t2.hasMoreTokens()) { + + t2.nextElement(); + tabs2++; + alltabs2++; + } + line ++; + if (tabs1!=tabs2) { + DB.println("number of tabs different in line "+line+" file1-tabs "+tabs1+" file2-tabs "+tabs2); + System.exit(0); + } + + + } + DB.println("checked lines "+line+" with tabs in file 1 "+alltabs1+" in file2 "+alltabs2); + + } 
else { + File f = new File(args[0]); + String[] dir =f.list(); + for(String fx :dir) { + BufferedReader ir = new BufferedReader(new InputStreamReader(new FileInputStream(args[0]+File.separatorChar+fx),"UTF-8"),32768); + System.out.println("check file "+fx); + while(true) { + String l = ir.readLine(); + if (l==null) break; + if (l.endsWith("\t")) { + DB.println("found tab in file "+fx); + break; + } + } + ir.close(); + } + } + + } + + + public CONLLWriter09 (String file) { + + try { + writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file),"UTF8")); + } catch (Exception e) { + e.printStackTrace(); + } + } + + public CONLLWriter09 (Writer writer) { + this.writer = new BufferedWriter(writer); + } + + + + public CONLLWriter09(String outfile, int formatTask) { + this(outfile); + } + + public void write(SentenceData09 inst) throws IOException { + write(inst, NO_ROOT); + } + + /** + * + * @param inst + * @param root true: remove root node + * @throws IOException + */ + public void write(SentenceData09 inst, boolean root) throws IOException { + + int i, mod; + if(root&&(inst.forms[0].startsWith("<root")||(inst.lemmas[0]!=null&&inst.lemmas[0].startsWith("<root")))){ + i=1; mod=0; + } else { + i=0; mod=1; + } + //=()?1:0; + + if (format == this.F_ONE_LINE) { + boolean first =true; + for (; i<inst.length(); i++) { + if (first ){ + first=false; + } else writer.write(" "); + writer.write(inst.plemmas[i]); + } + writer.newLine(); + + return ; + } + + + for (; i<inst.length(); i++) { + + if (inst.id==null|| inst.id[i]==null) {writer.write(Integer.toString(i+mod)); writer.write('\t');} // id + else { writer.write(inst.id[i]); writer.write('\t');} + + writer.write(inst.forms[i]); writer.write('\t'); // form + + if (inst.lemmas!=null && inst.lemmas[i]!=null) { + writer.write(inst.lemmas[i]); + } + else writer.write(DASH); // lemma + writer.write('\t'); + + if (inst.plemmas!=null && inst.plemmas[i]!=null) writer.write(inst.plemmas[i]); + else writer.write(DASH); 
// plemma + writer.write('\t'); + + if (inst.gpos[i]!=null) writer.write(inst.gpos[i]); // gpos + else writer.write(DASH); + writer.write('\t'); + + if (inst.ppos!=null && inst.ppos[i]!=null) writer.write(inst.ppos[i]); + else writer.write(DASH); // ppos + writer.write('\t'); + + if (inst.ofeats!=null&& inst.ofeats[i]!=null) writer.write(inst.ofeats[i]); + else writer.write(DASH); + writer.write('\t'); + + //writer.write(DASH); writer.write('\t'); // feat + if (inst.pfeats!=null&&inst.pfeats[i]!=null) { + //System.out.println(""+inst.pfeats[i]); + writer.write(inst.pfeats[i]); + } + else writer.write(DASH); + writer.write('\t'); + + + writer.write(Integer.toString(inst.heads[i])); writer.write('\t'); // head + + if (inst.pheads!=null ) writer.write(Integer.toString(inst.pheads[i])); + else writer.write(DASH); + writer.write('\t'); // phead + + if (inst.labels[i]!=null) writer.write(inst.labels[i]); // rel + else writer.write(DASH); + writer.write('\t'); + + if (inst.plabels!=null &&inst.plabels[i]!=null) writer.write(inst.plabels[i]); // rel + else writer.write(DASH); + writer.write('\t'); + + if (inst.fillp!=null && inst.fillp[i]!=null) writer.write(inst.fillp[i]); // fill p + else { + writer.write(DASH); + } + + +// writer.write('\t'); + + + if (inst.sem==null) { + writer.write('\t'); + writer.write(DASH); + + } else { + + + + boolean foundPred =false; + // print the predicate + for (int p =0;p< inst.sem.length;p++) { + if (inst.semposition[p]==i) { + foundPred=true; + // System.out.println("write pred "+inst.sem[p] ); + writer.write('\t'); writer.write(inst.sem[p]); + + // if (inst.sem[p].startsWith(".")) DB.println("error "+inst.sem[p]); + } + } + + if (!foundPred ) { + writer.write('\t'); + writer.write(DASH); +// writer.write('\t'); +// writer.write(DASH); + } + + // print the arguments + for (int p =0;p< inst.sem.length;p++) { + + boolean found =false; + if (inst.arg!=null &&inst.arg.length>p&&inst.arg[p]!=null) + for(int a = 0; a<inst.arg[p].length;a++) { + 
+				if (i==inst.argposition[p][a]) {
+					writer.write('\t'); writer.write(inst.arg[p][a]);
+					found = true;
+					break;
+				}
+
+			}
+			if (!found) {
+				// token i is not an argument of predicate p -> write the placeholder column
+				writer.write('\t');
+				writer.write(DASH);
+			}
+
+
+			}
+
+
+
+
+		}
+		writer.newLine();
+		}
+		writer.newLine();
+	}
+
+	/**
+	 * Flushes any buffered output and closes the underlying writer.
+	 */
+	public void finishWriting () throws IOException {
+		writer.flush();
+		writer.close();
+	}
+
+	/**
+	 * Sets the output format such as CoNLL or one line for the lemmata of the sentence (see F_xxxx constants).
+	 * @param formatTask one of the F_xxxx format constants (e.g. F_CONLL09, F_ONE_LINE)
+	 */
+	public void setOutputFormat(int formatTask) {
+		format =formatTask;
+	}
+
+
+
+
+}
diff --git a/dependencyParser/basic/mate-tools/src/is2/io/IOGenerals.java b/dependencyParser/basic/mate-tools/src/is2/io/IOGenerals.java
new file mode 100644
index 0000000..456a17f
--- /dev/null
+++ b/dependencyParser/basic/mate-tools/src/is2/io/IOGenerals.java
@@ -0,0 +1,33 @@
+/**
+ *
+ */
+package is2.io;
+
+/**
+ * @author Dr. Bernd Bohnet, 18.08.2011
+ *
+ * Shared constants for the is2.io readers and writers.
+ */
+public class IOGenerals {
+
+	// some constants
+	public static final String US = "_";                 // CoNLL placeholder for an empty column
+	public static final String REGEX = "\t";             // column separator (tab)
+	public static final String STRING = "*";
+	public static final String PIPE = "\\|";             // regex-escaped '|' for splitting feature lists
+	public static final String NO_TYPE = "<no-type>";
+	public static final String ROOT_POS = "<root-POS>";  // POS of the artificial root token
+	public static final String ROOT_LEMMA = "<root-LEMMA>";
+	public static final String ROOT = "<root>";          // form of the artificial root token
+	public static final String EMPTY_FEAT = "<ef>";
+
+
+	// the different readers
+	public static final int F_CONLL09 = 0;   // standard CoNLL 2009 tabular format
+	public static final int F_ONE_LINE = 1;  // one sentence per line
+
+	// normalization of the input
+	public static final String NUMBER = "[0-9]+|[0-9]+\\.[0-9]+|[0-9]+[0-9,]+";  // matches numeric tokens
+	public static final String NUM = "<num>";  // replacement token for normalized numbers
+
+}
diff --git a/dependencyParser/basic/mate-tools/src/is2/io/PSReader.java b/dependencyParser/basic/mate-tools/src/is2/io/PSReader.java
new file mode 100644
index 0000000..3598b3d
--- /dev/null
+++ b/dependencyParser/basic/mate-tools/src/is2/io/PSReader.java
@@ -0,0 +1,23 @@
+/**
+ *
+ */
+package 
is2.io;
+
+import is2.data.PSTree;
+
+/**
+ * @author Dr. Bernd Bohnet, 07.02.2011
+ *
+ * Common interface of readers that produce phrase-structure trees,
+ * e.g. the Tiger treebank reader.
+ */
+public interface PSReader {
+
+	/** Returns the next phrase-structure tree, or null when the input is exhausted. */
+	public PSTree getNext();
+
+	/**
+	 * Opens the input and prepares the reader for iteration with getNext().
+	 * @param ps the name of the file to read from
+	 * @param filter optional sentence-id range {start, end}; null reads everything
+	 *               (TigerReader matches filter[0]/filter[1] against #BOS/#EOS ids)
+	 */
+	public void startReading(String ps, String[] filter);
+
+}
diff --git a/dependencyParser/basic/mate-tools/src/is2/io/TigerReader.java b/dependencyParser/basic/mate-tools/src/is2/io/TigerReader.java
new file mode 100644
index 0000000..2a98b72
--- /dev/null
+++ b/dependencyParser/basic/mate-tools/src/is2/io/TigerReader.java
@@ -0,0 +1,403 @@
+/**
+ *
+ */
+package is2.io;
+
+import is2.data.PSTree;
+import is2.util.DB;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Stack;
+import java.util.StringTokenizer;
+
+/**
+ * @author Dr. Bernd Bohnet, 17.01.2011
+ *
+ * Reads sentences from a treebank in Tiger export format (#BOS/#EOS delimited, tab-separated) and returns them as phrase-structure trees.
+ */ +public class TigerReader implements PSReader { + + BufferedReader inputReader; + ArrayList<File> psFiles = new ArrayList<File>(); + ArrayList<PSTree> psCache = new ArrayList<PSTree>(); + + String filter[] = null; + int startFilter =-1; + int endFilter =-1; + + public TigerReader() {} + + public TigerReader(String file ) { + + try { + inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(file),"ISO-8859-1"),32768); + } catch (Exception e) { + e.printStackTrace(); + } + } + + /** + * @param ps + */ + @Override + public void startReading(String file, String[] filter) { + + + try { + this.filter =filter; + startFilter =filter==null?-1:1; + endFilter =filter==null?-1:1; + + inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(file),"ISO-8859-1"),32768); + } catch (Exception e) { + e.printStackTrace(); + } + + } + + public static class Line { + String form; + String lemma; + String morph; + String pos; + int parent; + String edge; + + + } + + static int stop=0; + + /** + * @return + */ + public PSTree getNext() { + + PSTree ps = null; + String l =null; + ArrayList<Line> lines = new ArrayList<Line>(); + try { + int state=1, terminals=0, nonterminals=0; + while((l = inputReader.readLine())!=null) { + + if (startFilter==1 && l.startsWith("#BOS "+filter[0]) ) { + System.out.println("found start "+l); + startFilter=2; + } + if (endFilter==1 && l.startsWith("#EOS "+filter[1]) ){ + System.out.println("found end "+l); + + endFilter=2; + } + + + if (startFilter==1||endFilter==2) continue; + + if (l.startsWith("#BOS")) { + + state=2; + continue; + } + if (l.startsWith("#500")) state=3; + if (l.startsWith("#EOS")) state=4; + if (state<2) continue; + + if ( state==4) { + + ps = new PSTree(); + ps.create(terminals, nonterminals); + // System.out.println("terminals "+terminals); + //build ps tree + + int cnt=0; + // ps.entries[0] =CONLLReader09.ROOT; + // ps.head[0]=-1; + int root=-1; + for(Line line : lines) { + + /* if 
(cnt==terminals) { + // insert root + root =cnt; + cnt++; + } + */ + ps.entries[cnt] = line.form; + if (cnt<terminals) ps.pos[cnt] = line.pos; + else ps.entries[cnt] =line.pos; + ps.lemmas[cnt] = line.lemma; + ps.head[cnt] = line.parent==0?lines.size()-1:line.parent>=500?line.parent-500+terminals:line.parent; + // ps.head[cnt] = line.parent==0?lines.size()-1:line.parent>=500?line.parent-500+terminals:line.parent; + ps.morph[cnt]=line.morph; + cnt++; + + } + + if (root==-1) root= terminals; + ps.head[cnt-1]=0; // root + ps.terminalCount=terminals; + lines.clear(); + state=1; + + /* + for(int k=0;k<ps.head.length;k++) { + if (ps.head[k]<terminals && k!=root) { + ps.head[k]=root; + // DB.println("error "+k+" "+ps.head[k]); + } + } + */ + // System.out.println(""+ps.toString()); + // if (stop++ == 4)System.exit(0); + return ps; + } + + + + StringTokenizer t = new StringTokenizer(l,"\t"); + int tc=0; + Line line = new Line(); + lines.add(line); + while(t.hasMoreTokens()) { + String token = t.nextToken(); + if (token.equals("\t"))continue; + if (tc==0) { + if (token.startsWith("#5")||token.startsWith("#6") ) { + nonterminals++; + + } + else { + terminals++; + + //change it back to the wrong format since the conll stuff was derived from this. 
+ // if (token.equals("durchblicken")) token="durchblikken"; + line.form = token; + } + + } else if (tc==1) { + line.lemma=token; + } else if (tc==2) { + line.pos=token; + } else if (tc==3) { + line.morph=token; + } else if (tc==4) { + line.edge=token; + } else if (tc==5) { + line.parent=Integer.parseInt(token); + } + + + if (token.length()>0)tc++; + } + + // read till #EOS + + + } + } catch(Exception e) { + e.printStackTrace(); + } + return ps; + + } + + /** + * @param tree + */ + private void removeTraces(ArrayList<Object> tree) { + + Stack<ArrayList<Object>> s = new Stack<ArrayList<Object>>(); + + s.push(tree); + ArrayList<Object> list =null; + while (!s.isEmpty()) { + + ArrayList<Object> last =list; + list = s.pop(); + for(int k=0;k<list.size();k++) { + Object o = list.get(k); + if(o instanceof String) { + String t = (String)o; + if ((t.endsWith("-1")||t.endsWith("-2")||t.endsWith("-3")||t.endsWith("-4")) && list.size()>(k+1)) { + t = t.substring(0, t.length()-2); + list.set(k, t); + } + + if (t.startsWith("-NONE-")) { + + // remove the bigger surrounding phrase, e.g. (NP (-NONE- *)) + if (last.size()==2 && last.get(0) instanceof String && last.contains(list)) { + ArrayList<Object> rest = remove(tree, last); + if (rest!=null && rest.size()==1){ + rest = remove(tree, rest); + } + } + // remove the phrase only, e.g. 
(NP (AP nice small) (-NONE- *)) + else { + // there might a phrase with two empty elements (VP (-NONE- *) (-NONE- ...)) +// System.out.println("last "+last+" list "+list ); + ArrayList<Object> rest = remove(tree, list); + removeTraces(rest); + if (rest.size()==1) { + rest = remove(tree, rest); + if (rest!=null && rest.size()==1){ + System.out.println("rest "+rest); + System.exit(0); + } + } + } + continue; + } + } + if (o instanceof ArrayList) { + s.push((ArrayList<Object>)o); + } + } + } + } + + + + + /** + * Remove from tree p + * @param tree phrase structure tree + * @param p elment to remove + */ + private ArrayList<Object> remove(ArrayList<Object> tree, Object p) { + Stack<ArrayList<Object>> s = new Stack<ArrayList<Object>>(); + + s.push(tree); + + while (!s.isEmpty()) { + + ArrayList<Object> list = s.pop(); + for(int k=0;k<list.size();k++) { + Object o = list.get(k); + if (o == p) { + list.remove(p); + return list ; + } + if (o instanceof ArrayList) { + s.push((ArrayList<Object>)o); + } + } + } + return null; + } + + /** + * Count the terminals + * @param current + * @return + */ + private int countTerminals(ArrayList<Object> current) { + + int count =0; + boolean found =false, all =true ; + for(Object o : current) { + if (o instanceof String) found =true; + else { + all =false; + if (o instanceof ArrayList) count +=countTerminals((ArrayList<Object>)o); + } + } + + if (found && all) { + // System.out.println(""+current); + count++; + } + + return count; + } + + /** + * Count the terminals + * @param current + * @return + */ + private int insert(PSTree ps, ArrayList<Object> current, Integer terminal, Integer xxx, int head) { + + boolean found =false, all =true; + String term =null; + String pos =null; + for(Object o : current) { + if (o instanceof String) { + if (found) term =(String)o; + if (!found) pos =(String)o; + found =true; + } else { + all =false; + // if (o instanceof ArrayList) count +=countTerminals((ArrayList<Object>)o); + } + } + + if (found && 
all) { + + if(term.equals("-LRB-")) term="("; + if(term.equals("-RRB-")) term=")"; + if(term.equals("-LCB-")) term="{"; + if(term.equals("-RCB-")) term="}"; + if(term.contains("1\\/2-year")) term=term.replace("\\/", "/"); + if(term.contains("1\\/2-foot-tall")) term=term.replace("\\/", "/"); + + + ps.entries[ps.terminalCount] =term; + ps.pos[ps.terminalCount]=pos; + ps.head[ps.terminalCount]=head; + // System.out.println("terminal "+term+" "+ps.terminal+" head "+head); + ps.terminalCount ++; + } else if (found && ! all) { + if(pos.startsWith("NP-SBJ")) pos="NP-SBJ"; + if(pos.startsWith("WHNP")) pos="WHNP"; + + ps.entries[ps.non] =pos; + ps.head[ps.non]=head; + // System.out.println("non terminal "+pos+" "+ps.non+" head "+ head); + int non =ps.non ++; + + for (Object o : current) { + if (o instanceof ArrayList) { + insert(ps,(ArrayList<Object>)o,terminal,ps.non, non); + } + } + } + if(!all && !found)for (Object o : current) { + if (o instanceof ArrayList) { + insert(ps,(ArrayList<Object>)o,terminal,0, ps.non-1); + } + } + return terminal; + } + + + /** + * Count the terminals + * @param current + * @return + */ + private int countNonTerminals(ArrayList<Object> current) { + + int count =0; + boolean found =false, all =true ; + for(Object o : current) { + if (o instanceof String) found =true; + else { + all =false; + if (o instanceof ArrayList) count +=countNonTerminals((ArrayList<Object>)o); + } + } + + if (found && !all) count++; + + return count; + } + + + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/lemmatizer/Evaluator.java b/dependencyParser/basic/mate-tools/src/is2/lemmatizer/Evaluator.java new file mode 100755 index 0000000..b333c62 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/lemmatizer/Evaluator.java @@ -0,0 +1,105 @@ +package is2.lemmatizer; + +import is2.data.SentenceData09; +import is2.io.CONLLReader09; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.Hashtable; 
+import java.util.Map.Entry; + + +public class Evaluator { + + public static void evaluate (String act_file, String pred_file, String format) throws Exception { + + CONLLReader09 goldReader = new CONLLReader09(act_file, CONLLReader09.NO_NORMALIZE); + CONLLReader09 predictedReader = new CONLLReader09(pred_file,CONLLReader09.NO_NORMALIZE); + // predictedReader.startReading(pred_file); + + + Hashtable<String,Integer> errors = new Hashtable<String,Integer>(); + + + int total = 0, corr = 0, corrL = 0, corrT=0; + int numsent = 0, corrsent = 0, corrsentL = 0; + SentenceData09 goldInstance = goldReader.getNext(); + SentenceData09 predInstance = predictedReader.getNext(); + + while(goldInstance != null) { + + int instanceLength = goldInstance.length(); + + if (instanceLength != predInstance.length()) + System.out.println("Lengths do not match on sentence "+numsent); + + + String gold[] = goldInstance.lemmas; + String pred[] = predInstance.plemmas; + + + boolean whole = true; + boolean wholeL = true; + + // NOTE: the first item is the root info added during nextInstance(), so we skip it. 
+ + for (int i = 1; i < instanceLength; i++) { + if (gold[i].toLowerCase().equals(pred[i].toLowerCase())) corrT++; + + if (gold[i].equals(pred[i])) corrL++; + else { + + // System.out.println("error gold:"+goldPos[i]+" pred:"+predPos[i]+" "+goldInstance.forms[i]+" snt "+numsent+" i:"+i); + String key = "gold: '"+gold[i]+"' pred: '"+pred[i]+"'"; + Integer cnt = errors.get(key); + if (cnt==null) { + errors.put(key,1); + } else { + errors.put(key,cnt+1); + } + } + + } + total += instanceLength - 1; // Subtract one to not score fake root token + + if(whole) corrsent++; + if(wholeL) corrsentL++; + numsent++; + + goldInstance = goldReader.getNext(); + predInstance = predictedReader.getNext(); + } + ArrayList<Entry<String, Integer>> opsl = new ArrayList<Entry<String, Integer>>(); + for(Entry<String, Integer> e : errors.entrySet()) { + opsl.add(e); + } + + Collections.sort(opsl, new Comparator<Entry<String, Integer>>(){ + + @Override + public int compare(Entry<String, Integer> o1, + Entry<String, Integer> o2) { + + return o1.getValue()==o2.getValue()?0:o1.getValue()>o2.getValue()?1:-1; + } + + + }); + + for(Entry<String, Integer> e : opsl) { + // System.out.println(e.getKey()+" "+e.getValue()); + } + + System.out.println("Tokens: " + total+" Correct: " + corrT+" "+(float)corrT/total+" correct uppercase "+(float)corrL/total); + } + + public static void main (String[] args) throws Exception { + String format = "CONLL"; + if (args.length > 2) + format = args[2]; + + evaluate(args[0], args[1], format); + } + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/lemmatizer/Lemmatizer.java b/dependencyParser/basic/mate-tools/src/is2/lemmatizer/Lemmatizer.java new file mode 100755 index 0000000..33756dd --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/lemmatizer/Lemmatizer.java @@ -0,0 +1,535 @@ +package is2.lemmatizer; + + +import is2.data.Cluster; +import is2.data.F2SF; +import is2.data.FV; +import is2.data.Instances; +import is2.data.InstancesTagger; +import 
is2.data.Long2Int; +import is2.data.ParametersFloat; +import is2.data.PipeGen; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; +import is2.io.CONLLWriter09; +import is2.tools.IPipe; +import is2.tools.Tool; +import is2.tools.Train; +import is2.util.DB; +import is2.util.OptionsSuper; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Map.Entry; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; +import java.util.zip.ZipOutputStream; + + + +public class Lemmatizer implements Tool, Train { + + public Pipe pipe; + public ParametersFloat params; + private Long2Int li; + + private boolean doUppercase=false; + + private long[] vs= new long[40]; + + + + /** + * Creates a lemmatizer due to the model stored in modelFileName + * @param modelFileName the path and file name to a lemmatizer model + */ + public Lemmatizer(String modelFileName) { + + // tell the lemmatizer the location of the model + try { + Options m_options = new Options(new String[] {"-model", modelFileName}); + li = new Long2Int(m_options.hsize); + + // initialize the lemmatizer + readModel(m_options); + + } catch (IOException e) { + e.printStackTrace(); + } + } + + + + + + public Lemmatizer(boolean doUppercase) {this.doUppercase=doUppercase; } + + + + public static void main (String[] args) throws FileNotFoundException, Exception + { + + Options options = new Options(args); + Lemmatizer lemmatizer = new Lemmatizer(options.upper); + + long start = System.currentTimeMillis(); + + + if (options.train) { + + + lemmatizer.li = new Long2Int(options.hsize); + lemmatizer.pipe = new Pipe (options,lemmatizer.li); + + 
InstancesTagger is = lemmatizer.pipe.createInstances(options.trainfile); + + DB.println("Features: " + lemmatizer.pipe.mf.size()+" Operations "+lemmatizer.pipe.mf.getFeatureCounter().get(Pipe.OPERATION)); + + ParametersFloat params = new ParametersFloat(lemmatizer.li.size()); + + lemmatizer.train(options,lemmatizer.pipe,params,is); + + lemmatizer.writeModel(options, lemmatizer.pipe, params); + } + + if (options.test) { + + lemmatizer.readModel(options); + + lemmatizer.out(options,lemmatizer.pipe, lemmatizer.params); + } + + System.out.println(); + + if (options.eval) { + System.out.println("\nEVALUATION PERFORMANCE:"); + Evaluator.evaluate(options.goldfile, options.outfile,options.format); + } + long end = System.currentTimeMillis(); + System.out.println("used time "+((float)((end-start)/100)/10)); + } + + /* (non-Javadoc) + * @see is2.tools.Train#writeModel(is2.util.OptionsSuper, is2.tools.IPipe, is2.data.ParametersFloat) + */ + @Override + public void writeModel(OptionsSuper options, IPipe pipe, + ParametersFloat params) { + try { + // store the model + ZipOutputStream zos = new ZipOutputStream(new BufferedOutputStream(new FileOutputStream(options.modelName))); + zos.putNextEntry(new ZipEntry("data")); + DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(zos)); + + this.pipe.mf.writeData(dos); + + dos.flush(); + params.write(dos); + + pipe.write(dos); + + dos.writeBoolean(this.doUppercase); + + dos.flush(); + dos.close(); + } catch(Exception e) { + e.printStackTrace(); + } + } + + + public void readModel(OptionsSuper options) { + + try { + + // load the model + ZipInputStream zis = new ZipInputStream(new BufferedInputStream(new FileInputStream(options.modelName))); + zis.getNextEntry(); + DataInputStream dis = new DataInputStream(new BufferedInputStream(zis)); + + MFO mf = new MFO(); + mf.read(dis); + params = new ParametersFloat(0); + params.read(dis); + li =new Long2Int(params.size()); + pipe = new Pipe(options, li); + pipe.mf =mf; + + 
pipe.initFeatures(); + pipe.initValues(); + + pipe.readMap(dis); + + for(Entry<String,Integer> e : mf.getFeatureSet().get(Pipe.OPERATION).entrySet()) { + this.pipe.types[e.getValue()] = e.getKey(); + // System.out.println("set pos "+e.getKey()); + } + + + pipe.cl = new Cluster(dis); + + if (dis.available()>0) this.doUppercase = dis.readBoolean(); + + + dis.close(); + DB.println("Loading data finished. "); + + DB.println("number of params "+params.parameters.length); + DB.println("number of classes "+pipe.types.length); + + } catch (Exception e ) { + e.printStackTrace(); + } + + } + + + + /** + * Do the training + * @param instanceLengths + * @param options + * @param pipe + * @param params + * @param li + * @throws IOException + * @throws InterruptedException + * @throws ClassNotFoundException + */ + public void train(OptionsSuper options, IPipe p, ParametersFloat params, Instances ist) { + + InstancesTagger is = (InstancesTagger)ist; + + int i = 0,del=0; + FV g = new FV(), f = new FV(); + + int LC = this.pipe.types.length+1, UC = LC+1; + + String wds[] = MFO.reverse(pipe.mf.getFeatureSet().get(Pipe.WORD)); + + F2SF fs = params.getFV(); + double upd=0; + + for(i = 0; i < options.numIters; i++) { + + System.out.print("Iteration "+i+": "); + + long start = System.currentTimeMillis(); + int numInstances = is.size(); + int correct =0,count=0; + + long last= System.currentTimeMillis(); + int wrongOp=0,correctOp=0, correctUC=0, wrongUC=0; + + HashMap<String,Integer> map = new HashMap<String,Integer>(); + + for(int n = 0; n < numInstances; n++) { + + if((n+1) % 500 == 0) del= Pipe.outValueErr(n+1, (float)(count-correct),(float)correct/(float)count,del,last,upd); + + upd = (double)(options.numIters*numInstances - (numInstances*i+(n+1))+ 1); + + for(int k = 0; k < is.length(n); k++) { + + double best = -1000; + String bestOp=""; + + + + count++; + pipe.addCoreFeatures(is, n, k, 0,wds[is.forms[n][k]], vs); + + String lemma = pipe.opse.get(wds[is.forms[n][k]].toLowerCase()); 
+ + + // predict + if (lemma==null) + for(int t = 0; t < pipe.types.length; t++) { + + fs.clear(); + for(int l=vs.length-1;l>=0;l--) if (vs[l]>0) fs.add(li.l2i(vs[l]+(t*Pipe.s_type))); + + float score = (float) fs.getScore(); + if (score >best) { + bestOp = pipe.types[t]; + best =score; + } + } + + if (doUppercase) { + fs.clear(); + for(int l=vs.length-1;l>=0;l--) if (vs[l]>0) fs.add(li.l2i(vs[l]+(LC*Pipe.s_type))); + + int correctOP =-1, selectedOP =-1; + if (wds[is.glemmas[n][k]].length()>0 && + Character.isUpperCase(wds[is.glemmas[n][k]].charAt(0)) && + fs.score > 0) { + + correctOP = UC; + selectedOP =LC; + } else if (wds[is.glemmas[n][k]].length()>0 + &&Character.isLowerCase(wds[is.glemmas[n][k]].charAt(0)) && + fs.score <= 0) { + + + correctOP = LC; + selectedOP =UC; + } + + if (correctOP!=-1 && wds[is.glemmas[n][k]].length()>0) { + + wrongUC++; + f.clear(); + for(int l=vs.length-1;l>=0;l--) if (vs[l]>0) f.add(li.l2i(vs[l]+(selectedOP*Pipe.s_type))); + + g.clear(); + for(int l=vs.length-1;l>=0;l--) if (vs[l]>0) g.add(li.l2i(vs[l]+(correctOP*Pipe.s_type))); + + double lam_dist = params.getScore(g) - params.getScore(f);//f + double loss = 1 - lam_dist; + + FV dist = g.getDistVector(f); + dist.update(params.parameters, params.total, params.update(dist,loss), upd,false); + + } else { + correctUC++; + } + } + if (lemma!=null) { + correct++; + correctOp++; + continue; + } + + + String op = Pipe.getOperation(is,n, k,wds); + if (op.equals(bestOp) ) { + correct++; + correctOp++; + continue; + } + wrongOp++; + + f.clear(); + int bop =pipe.mf.getValue(Pipe.OPERATION, bestOp); + for(int r=vs.length-1;r>=0;r--) if (vs[r]>0)f.add(li.l2i(vs[r]+(bop*Pipe.s_type))); + + g.clear(); + int gop =pipe.mf.getValue(Pipe.OPERATION, op); + for(int r=vs.length-1;r>=0;r--) if (vs[r]>0)g.add(li.l2i(vs[r]+(gop*Pipe.s_type))); + double lam_dist = params.getScore(g) - params.getScore(f);//f + + double loss = 1 - lam_dist; + + FV dist = g.getDistVector(f); + + dist.update(params.parameters, 
params.total, params.update(dist,loss), upd,false); //0.05 + + } + + } + ArrayList<Entry<String, Integer>> opsl = new ArrayList<Entry<String, Integer>>(); + for(Entry<String, Integer> e : map.entrySet()) { + if(e.getValue()>1) { + opsl.add(e); + } + } + + Collections.sort(opsl, new Comparator<Entry<String, Integer>>(){ + @Override + public int compare(Entry<String, Integer> o1, + Entry<String, Integer> o2) { + + return o1.getValue()==o2.getValue()?0:o1.getValue()>o2.getValue()?1:-1; + } + }); + + if (opsl.size()>0) System.out.println(); + for(Entry<String, Integer> e : opsl) { + System.out.println(e.getKey()+" "+e.getValue()); + } + map.clear(); + + del= Pipe.outValueErr(numInstances, (float)(count-correct), (float)correct/(float)count,del,last,upd, + "time "+(System.currentTimeMillis()-start)+ + " corr/wrong "+correctOp+" "+wrongOp+" uppercase corr/wrong "+correctUC+" "+wrongUC); + del=0; + System.out.println(); + } + + params.average(i*is.size()); + + } + + + /** + * Do the work + * @param options + * @param pipe + * @param params + * @throws IOException + */ + public void out (OptionsSuper options, IPipe pipe, ParametersFloat params) { + + long start = System.currentTimeMillis(); + + CONLLReader09 depReader = new CONLLReader09(options.testfile, CONLLReader09.NO_NORMALIZE); + depReader.setInputFormat(options.formatTask); + CONLLWriter09 depWriter = new CONLLWriter09(options.outfile); + depWriter.setOutputFormat(options.formatTask); + + System.out.print("Processing Sentence: "); + + int cnt = 0; + int del=0; + + try { + + while(true) { + + InstancesTagger is = new InstancesTagger(); + + is.init(1, new MFO()); + SentenceData09 instance = depReader.getNext(is);//pipe.nextInstance(null, depReader); + + if (instance==null) break; + is.fillChars(instance, 0, Pipe._CEND); + cnt++; + SentenceData09 i09 =lemmatize(is, instance, this.li); + + if(options.normalize) for(int k=0;k<i09.length();k++) { + boolean save = depReader.normalizeOn; + depReader.normalizeOn =true; + 
i09.plemmas[k] = depReader.normalize(i09.plemmas[k]); + depReader.normalizeOn = save; + } + + if (options.overwritegold) i09.lemmas = i09.plemmas; + + + + depWriter.write(i09); + + if (cnt%100 ==0) del=Pipe.outValue(cnt, del); + + } + depWriter.finishWriting(); + del=Pipe.outValue(cnt, del); + long end = System.currentTimeMillis(); + + System.out.println(PipeGen.getSecondsPerInstnace(cnt,(end-start))); + System.out.println(PipeGen.getUsedTime(end-start)); + } catch(Exception e) { + e.printStackTrace(); + } + } + + + private SentenceData09 lemmatize(InstancesTagger is, SentenceData09 instance, Long2Int li) { + + int LC = pipe.types.length+1; + + is.feats[0] = new short[instance.length()][11]; + + is.fillChars(instance, 0, Pipe._CEND); + + int length = instance.length(); + + F2SF fs = new F2SF(params.parameters); + + + for(int w1 = 0; w1 < length; w1++) { + instance.plemmas[w1]="_"; + pipe.addCoreFeatures(is, 0, w1, 0,instance.forms[w1], vs); + + String f =null; + if (is.forms[0][w1]!=-1) { + f = pipe.opse.get(instance.forms[w1].toLowerCase()); + if (f!=null) { + instance.plemmas[w1]=f; + } + } + double best = -1000.0; + int bestOp=0; + + for(int t = 0; t < pipe.types.length; t++) { + + fs.clear(); + for(int l=vs.length-1;l>=0;l--) if (vs[l]>0) fs.add(li.l2i(vs[l]+(t*Pipe.s_type))); + + if (fs.score >=best) { + best =fs.score; + bestOp=t; + } + } + //instance.ppos[w1]=""+bestOp; + if (f==null) instance.plemmas[w1] = StringEdit.change((doUppercase?instance.forms[w1]:instance.forms[w1].toLowerCase()),pipe.types[bestOp]); + + // check for empty string + if(instance.plemmas[w1].length()==0) instance.plemmas[w1] = "_"; + + if(doUppercase){ + fs.clear(); + for(int l=vs.length-1;l>=0;l--) if (vs[l]>0) fs.add(li.l2i(vs[l]+(LC*Pipe.s_type))); + + + try { + + if (fs.score<=0 && instance.plemmas[w1].length()>1) { + instance.plemmas[w1] = Character.toUpperCase(instance.plemmas[w1].charAt(0))+instance.plemmas[w1].substring(1); + } else if (fs.score<=0 && 
instance.plemmas[w1].length()>0) { + instance.plemmas[w1] = String.valueOf(Character.toUpperCase(instance.plemmas[w1].charAt(0))); + } else if (fs.score>0) { + instance.plemmas[w1] = instance.plemmas[w1].toLowerCase(); + } + + } catch(Exception e){ + e.printStackTrace(); + // System.out.println("error "+pipe.types[bestOp]+" "+instance.forms[w1]); + } + } + } + + + SentenceData09 i09 = new SentenceData09(instance); + i09.createSemantic(instance); + return i09; + } + + + /* (non-Javadoc) + * @see is2.tools.Tool#apply(is2.data.SentenceData09) + */ + @Override + public SentenceData09 apply(SentenceData09 snt) { + InstancesTagger is = new InstancesTagger(); + + // be robust + if (snt.length()== 0) return snt; + + SentenceData09 it = new SentenceData09(); + it.createWithRoot(snt); + + + is.init(1, new MFO()); + is.createInstance09(it.length()); + is.fillChars(it, 0, Pipe._CEND); + + for(int j = 0; j < it.length(); j++) is.setForm(0, j, it.forms[j]); + + return lemmatize(is, it,li); + } + + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/lemmatizer/MFO.java b/dependencyParser/basic/mate-tools/src/is2/lemmatizer/MFO.java new file mode 100755 index 0000000..249ca42 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/lemmatizer/MFO.java @@ -0,0 +1,257 @@ +package is2.lemmatizer; + + +import is2.data.IEncoder; +import is2.data.IEncoderPlus; +import is2.data.IFV; +import is2.data.Long2IntInterface; +import is2.util.DB; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map.Entry; + +/** + * Map Features, do not map long to integer + * + * @author Bernd Bohnet, 20.09.2009 + */ + +final public class MFO implements IEncoderPlus { + + /** The features and its values */ + static private final HashMap<String,HashMap<String,Integer>> m_featureSets = new HashMap<String,HashMap<String,Integer>>(); + + /** The feature class and the number of values */ + static private final 
HashMap<String,Integer> m_featureCounters = new HashMap<String,Integer>();
+
+	/** The number of bits needed to encode a feature */
+	static final HashMap<String,Integer> m_featureBits = new HashMap<String,Integer>();
+
+	// NOTE(review): the maps and this counter are static, so every MFO instance
+	// shares one global feature space -- confirm this is intended.
+	/** Integer counter for long2int */
+	static private int count=0;
+
+	/** Stop growing */
+	public boolean stop=false;
+
+	/** Reserved value registered first in every feature class (always id 0). */
+	final public static String NONE="<None>";
+
+
+
+
+
+
+
+	public MFO () {}
+
+
+	public int size() {return count;}
+
+
+
+	/**
+	 * Register an attribute class a, if it does not exist yet, and add the value v to it.
+	 * @param a the attribute class (feature type) name
+	 * @param v the value to register within class a
+	 * @return the integer id assigned to v within class a
+	 */
+	final public int register(String a, String v) {
+
+		HashMap<String,Integer> fs = getFeatureSet().get(a);
+		if (fs==null) {
+			// first value of this class: reserve id 0 for NONE
+			fs = new HashMap<String,Integer>();
+			getFeatureSet().put(a, fs);
+			fs.put(NONE, 0);
+			getFeatureCounter().put(a, 1);
+		}
+		Integer c = getFeatureCounter().get(a);
+
+		Integer i = fs.get(v);
+		if (i==null) {
+			// unseen value: assign the next free id and advance the class counter
+			fs.put(v, c);
+			c++;
+			getFeatureCounter().put(a,c);
+			return c-1;
+		} else return i;
+	}
+
+	/**
+	 * Calculates the number of bits needed to encode a feature:
+	 * ceil(log2(valueCount + 1)) per feature class, cached in m_featureBits.
+	 */
+	public void calculateBits() {
+
+		int total=0;
+		for(Entry<String,Integer> e : getFeatureCounter().entrySet() ){
+			int bits =(int)Math.ceil((Math.log(e.getValue()+1)/Math.log(2)));
+			m_featureBits.put(e.getKey(), bits);
+			total+=bits;
+			//	System.out.println(" "+e.getKey()+" bits "+bits+" number "+(e.getValue()+1));
+		}
+
+//		System.out.println("total number of needed bits "+total);
+	}
+
+
+	/** Lists every feature class with its value count and bit width, one class per line. */
+	public String toString() {
+
+		StringBuffer content = new StringBuffer();
+		for(Entry<String,Integer> e : getFeatureCounter().entrySet() ){
+			content.append(e.getKey()+" "+e.getValue());
+			content.append(':');
+			// HashMap<String,Integer> vs = getFeatureSet().get(e.getKey());
+			content.append(getFeatureBits(e.getKey()));
+
+			/*if (vs.size()<120)
+			for(Entry<String,Integer> e2 : vs.entrySet()) {
+				content.append(e2.getKey()+" ("+e2.getValue()+") ");
+			}*/
+			content.append('\n');
+
+		}
+		return content.toString();
+	}
+
+
+
+	/** @return the bit width of class a, or 0 if the class is unknown */
+	static final public short getFeatureBits(String a) {
+		if(m_featureBits.get(a)==null) return 0;
+		return (short)m_featureBits.get(a).intValue();
+	}
+
+
+
+	/**
+	 * Get the integer place holder of the string value v of the type a
+	 *
+	 * @param t the type
+	 * @param v the value
+	 * @return the integer place holder of v, or -1 if the type or value is unknown
+	 */
+	final public int getValue(String t, String v) {
+
+		if (m_featureSets.get(t)==null) return -1;
+		Integer vi = m_featureSets.get(t).get(v);
+		if (vi==null) return -1; //stop &&
+		return vi.intValue();
+	}
+
+	/**
+	 * Static version of getValue
+	 * @see getValue
+	 */
+	static final public int getValueS(String a, String v) {
+
+		if (m_featureSets.get(a)==null) return -1;
+		Integer vi = m_featureSets.get(a).get(v);
+		if (vi==null) return -1; //stop &&
+		return vi.intValue();
+	}
+
+	// NOTE(review): unlike getValue, this throws a NullPointerException when the
+	// class a itself is unknown -- confirm callers guarantee that a exists.
+	public int hasValue(String a, String v) {
+
+		Integer vi = m_featureSets.get(a).get(v);
+		if (vi==null) return -1;
+		return vi.intValue();
+	}
+
+
+	// NOTE(review): renders only the low 31 bits of k; the sign bit is never
+	// emitted -- confirm this is intended.
+	public static String printBits(int k) {
+		StringBuffer s = new StringBuffer();
+		for(int i =0;i<31;i++) {
+			s.append((k&0x00000001)==1?'1':'0');
+			k=k>>1;
+
+		}
+		s.reverse();
+		return s.toString();
+	}
+
+
+
+
+
+
+
+	/**
+	 * Maps a long to an integer value.
This is very useful to save memory for sparse data long values + * @param l + * @return the integer + */ + static public int misses = 0; + static public int good = 0; + + + + + /** + * Write the data + * @param dos + * @throws IOException + */ + static public void writeData(DataOutputStream dos) throws IOException { + dos.writeInt(getFeatureSet().size()); + // DB.println("write"+getFeatureSet().size()); + for(Entry<String, HashMap<String,Integer>> e : getFeatureSet().entrySet()) { + dos.writeUTF(e.getKey()); + dos.writeInt(e.getValue().size()); + + for(Entry<String,Integer> e2 : e.getValue().entrySet()) { + + if(e2.getKey()==null) DB.println("key "+e2.getKey()+" value "+e2.getValue()+" e -key "+e.getKey()); + dos.writeUTF(e2.getKey()); + dos.writeInt(e2.getValue()); + + } + + } + } + public void read(DataInputStream din) throws IOException { + + int size = din.readInt(); + for(int i=0; i<size;i++) { + String k = din.readUTF(); + int size2 = din.readInt(); + + HashMap<String,Integer> h = new HashMap<String,Integer>(); + getFeatureSet().put(k,h); + for(int j = 0;j<size2;j++) { + h.put(din.readUTF(), din.readInt()); + } + getFeatureCounter().put(k, size2); + } + + count =size; + // stop(); + calculateBits(); + } + + + /** + * Clear the data + */ + static public void clearData() { + getFeatureSet().clear(); + m_featureBits.clear(); + getFeatureSet().clear(); + } + + public HashMap<String,Integer> getFeatureCounter() { + return m_featureCounters; + } + + static public HashMap<String,HashMap<String,Integer>> getFeatureSet() { + return m_featureSets; + } + + static public String[] reverse(HashMap<String,Integer> v){ + String[] set = new String[v.size()]; + for(Entry<String,Integer> e : v.entrySet()) { + set[e.getValue()]=e.getKey(); + } + return set; + } + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/lemmatizer/Options.java b/dependencyParser/basic/mate-tools/src/is2/lemmatizer/Options.java new file mode 100755 index 0000000..a4b9e69 --- /dev/null +++ 
b/dependencyParser/basic/mate-tools/src/is2/lemmatizer/Options.java @@ -0,0 +1,67 @@ +package is2.lemmatizer; + +import java.io.File; +import java.io.IOException; + +import is2.util.OptionsSuper; + + +public final class Options extends OptionsSuper { + + + public Options (String[] args) throws IOException { + + + + for(int i = 0; i < args.length; i++) { + + if (args[i].equals("--help")) explain(); + + if (args[i].equals("-normalize")) { + normalize=Boolean.parseBoolean(args[++i]); + } else if (args[i].equals("-features")) { + features= args[i+1]; i++; + } else if (args[i].equals("-hsize")) { + hsize= Integer.parseInt(args[i+1]); i++; + } else if (args[i].equals("-len")) { + maxLen= Integer.parseInt(args[i+1]); i++; + } else if (args[i].equals("-tmp")) { + tmp = args[i+1]; i++; + } else if (args[i].equals("-uc")) { + upper=true; + System.out.println("set uppercase "+upper); + + } else super.addOption(args, i); + + } + + if (trainfile!=null) { + + + if (tmp!=null) trainforest = File.createTempFile("train", ".tmp", new File(tmp)); + else trainforest = File.createTempFile("train", ".tmp"); //,new File("F:\\") + trainforest.deleteOnExit(); + } + + + + + } + + private void explain() { + System.out.println("Usage: "); + System.out.println("java -class mate.jar is2.lemmatizer.Lemmatizer [Options]"); + System.out.println(); + System.out.println("Options:"); + System.out.println(""); + System.out.println(" -train <file> the corpus a model is trained on; default "+this.trainfile); + System.out.println(" -test <file> the input corpus for testing; default "+this.testfile); + System.out.println(" -out <file> the output corpus (result) of a test run; default "+this.outfile); + System.out.println(" -model <file> the parsing model for traing the model is stored in the files"); + System.out.println(" and for parsing the model is load from this file; default "+this.modelName); + System.out.println(" -i <number> the number of training iterations; good numbers are 10 for smaller 
corpora and 6 for bigger; default "+this.numIters); + System.out.println(" -count <number> the n first sentences of the corpus are take for the training default "+this.count); + + System.exit(0); + } +} diff --git a/dependencyParser/basic/mate-tools/src/is2/lemmatizer/Pipe.java b/dependencyParser/basic/mate-tools/src/is2/lemmatizer/Pipe.java new file mode 100755 index 0000000..37647ee --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/lemmatizer/Pipe.java @@ -0,0 +1,585 @@ +package is2.lemmatizer; + +import is2.data.Cluster; +import is2.data.D4; +import is2.data.Instances; +import is2.data.InstancesTagger; +import is2.data.PipeGen; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; +import is2.tools.IPipe; +import is2.util.DB; +import is2.util.OptionsSuper; +import is2.data.Long2Int; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map.Entry; + + + + +final public class Pipe extends PipeGen implements IPipe { + + + private static final int _MIN_WORDS_MAPPED_BY_SCRIPT = 1; + private static final int _MIN_OCCURENT_FOR_SCRIPT_USE = 4; + + private static final String _F0 = "F0"; + private static final String _F1 = "F1",_F2 = "F2",_F3 = "F3",_F4 = "F4",_F5 = "F5",_F6= "F6",_F7= "F7",_F8= "F8",_F9="F9",_F10 = "F10"; + private static final String _F11="F11",_F12="F12",_F13= "F13",_F14="F14",_F15="F15",_F16="F16",_F17="F17",_F18="F18",_F19="F19",_F20="F20"; + private static final String _F21="F21",_F22="F22",_F23= "F23",_F24="F24",_F25="F25",_F26="F26",_F27="F27",_F28="F28",_F29="F29",_F30="F30"; + private static final String _F31="F31",_F32="F32",_F33= "F33",_F34="F34",_F35="F35",_F36="F36",_F37="F37",_F38="F38",_F39="F39",_F40="F40"; + private static final String _F41="F41"; + + private static int 
_f0,_f1,_f2,_f3,_f4,_f5,_f6,_f7,_f8,_f9,_f10,_f11,_f12,_f13,_f14,_f15,_f16,_f17,_f18,_f19,_f20; + private static int _f21,_f22,_f23,_f24,_f25,_f26,_f27,_f28,_f29,_f30,_f31,_f32,_f33,_f34,_f35,_f36,_f37,_f38,_f39,_f41; + public static int _CEND,_swrd,_ewrd; + + public static final String MID = "MID", END = "END",STR = "STR",OPERATION = "OP"; + + private CONLLReader09 depReader; + + + public HashMap<String,String> opse = new HashMap<String, String> (); + + public String[] types; + + + public MFO mf =new MFO(); + private D4 z, x; + + + Cluster cl; + OptionsSuper options; + Long2Int li; + + public Pipe (OptionsSuper options2, Long2Int l) { + + options=options2; + li=l; + } + + + public InstancesTagger createInstances(String file) { + + InstancesTagger is = new InstancesTagger(); + + depReader = new CONLLReader09(CONLLReader09.NO_NORMALIZE); + + depReader.startReading(file); + mf.register(REL,"<root-type>"); + mf.register(POS,"<root-POS>"); + + + System.out.print("Registering feature parts "); + HashMap<String,Integer> ops = new HashMap<String, Integer> (); + HashMap<String,HashSet<String>> op2form = new HashMap<String, HashSet<String>> (); + int ic=0; + int del=0; + HashSet<String> rm = new HashSet<String> (); + + while(true) { + SentenceData09 instance1 = depReader.getNext(); + if (instance1== null) break; + ic++; + if (ic % 100 ==0) {del = outValue(ic, del);} + + + String[] labs1 = instance1.labels; + for(int i1 = 0; i1 < labs1.length; i1++) { + //typeAlphabet.lookupIndex(labs1[i1]); + mf.register(REL, labs1[i1]); + } + + String[] w = instance1.forms; + for(int i1 = 0; i1 < w.length; i1++) { + // saw the first time? 
+ if (mf.getValue(WORD, w[i1].toLowerCase())==-1) + opse.put(instance1.forms[i1].toLowerCase(), instance1.lemmas[i1]); + + mf.register(WORD, w[i1].toLowerCase()); + } + for(int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]); + + w = instance1.lemmas; + for(int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]); + for(int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1].toLowerCase()); + + w = instance1.plemmas; + for(int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]); + for(int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1].toLowerCase()); + + + for(int i1 = 0; i1 < w.length; i1++) registerChars(CHAR, w[i1]); + + w = instance1.ppos; + for(int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); + + w = instance1.gpos; + for(int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); + + + for(int i1 = 1; i1 < w.length; i1++) { + String op = getOperation(instance1, i1); + if (ops.get(op)==null) ops.put(op, 1); + else { + ops.put(op, (ops.get(op)+1)); + if (ops.get(op)>4) rm.add(instance1.forms[i1].toLowerCase()); + } + + + HashSet<String> forms = op2form.get(op); + if (forms==null) { + forms = new HashSet<String>(); + op2form.put(op, forms); + } + forms.add(instance1.forms[i1].toLowerCase()); + + } + + } + + int countFreqSingleMappings =0; + + int sc=0; + ArrayList<Entry<String, Integer>> opsl = new ArrayList<Entry<String, Integer>>(); + for(Entry<String, Integer> e : ops.entrySet()) { + + // do not use scripts for infrequent cases or frequent single mappings (der -> die) + if(e.getValue()>_MIN_OCCURENT_FOR_SCRIPT_USE && op2form.get(e.getKey()).size()>_MIN_WORDS_MAPPED_BY_SCRIPT) { + mf.register(OPERATION, e.getKey()); + sc++; + opsl.add(e); + } else { + // do not remove the infrequent cases + rm.removeAll(op2form.get(e.getKey())); + + if (op2form.get(e.getKey()).size()<=1) countFreqSingleMappings+=op2form.get(e.getKey()).size(); + } + } + for(String k : rm) { + opse.remove(k); + } + + Collections.sort(opsl, new Comparator<Entry<String, 
Integer>>(){ + + @Override + public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) { + + return o1.getValue()==o2.getValue()?0:o1.getValue()>o2.getValue()?1:-1; + } + }); + + + + for(Entry<String, Integer> e : opsl) { + // System.out.println(e.getKey()+" "+e.getValue()); + } + + + if (options.clusterFile==null)cl = new Cluster(); + else cl= new Cluster(options.clusterFile, mf,6); + + + System.out.println("\nfound scripts "+ops.size()+" used scripts "+sc); + System.out.println("found mappings of single words "+countFreqSingleMappings); + System.out.println("use word maps instead of scripts "+this.opse.size()); + // System.out.println(" "+opse); + System.out.println(""+mf.toString()); + + initFeatures(); + + mf.calculateBits(); + initValues(); + + depReader.startReading(options.trainfile); + + int i = 0; + long start1 = System.currentTimeMillis(); + + System.out.print("Creating Features: "); + is.init(ic, mf) ; + del=0; + while(true) { + try { + if (i % 100 ==0) {del = outValue(i, del);} + SentenceData09 instance1 = depReader.getNext(is); + if (instance1== null) break; + + is.fillChars(instance1, i, _CEND); + + if (i>options.count) break; + + i++; + } catch(Exception e) { + DB.println("error in sentnence "+i); + e.printStackTrace(); + } + } + long end1 = System.currentTimeMillis(); + System.gc(); + long mem2 = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory(); + System.out.print(" time "+(end1-start1)+" mem "+(mem2/1024)+" kb"); + + types = new String[mf.getFeatureCounter().get(OPERATION)]; + + for(Entry<String,Integer> e : mf.getFeatureSet().get(OPERATION).entrySet()) { + types[e.getValue()] = e.getKey(); + // System.out.println("set pos "+e.getKey()); + } + + System.out.println("Num Features: " + mf.size()); + + + + return is; + + } + + + /** + * @param is + * @param n + * @param k + * @param wds + * @return + */ + public static String getOperation(Instances is, int n, int k, String[] wds) { + + + String form = 
wds[is.forms[n][k]]; + String olemma = wds[is.glemmas[n][k]]; + + String s = new StringBuffer(form.toLowerCase()).reverse().toString(); + String t = new StringBuffer(olemma.toLowerCase()).reverse().toString(); + + + + return getOperation2(s, t); + } + + + + public static String getOperation(SentenceData09 instance1, int i1) { + String s = new StringBuffer(instance1.forms[i1].toLowerCase()).reverse().toString(); + String t = new StringBuffer(instance1.lemmas[i1].toLowerCase()).reverse().toString(); + + + + return getOperation2(s, t); + } + + public static String getOperation(String si, String ti) { + String s = new StringBuffer(si.toLowerCase()).reverse().toString(); + String t = new StringBuffer(ti.toLowerCase()).reverse().toString(); + + + + return getOperation2(s, t); + } + + + private static String getOperation2(String s, String t) { + StringBuffer po = new StringBuffer(); + String op; + if (!s.equals(t)) { + + + int[][] d =StringEdit.LD(s, t); + StringEdit.searchPath(s,t,d, po, false); + op = po.toString(); + + } else op ="0"; // do nothing + return op; + } + + + + private void registerChars(String type, String word) { + for(int i=0;i<word.length();i++) mf.register(type, Character.toString(word.charAt(i))); + } + + + + public void initValues() { + + z = new D4(li); + + x = new D4(li); + x.a0=s_type; + + s_pos = mf.getFeatureCounter().get(POS).intValue();//mf.getFeatureBits(POS); + s_word = mf.getFeatureCounter().get(WORD); + s_type = mf.getFeatureCounter().get(TYPE).intValue();//mf.getFeatureBits(TYPE); + s_char = mf.getFeatureCounter().get(CHAR).intValue();//mf.getFeatureBits(CHAR); + s_oper = mf.getFeatureCounter().get(OPERATION).intValue();//mf.getFeatureBits(OPERATION); + + types = new String[mf.getFeatureCounter().get(Pipe.OPERATION)]; + for(Entry<String,Integer> e : mf.getFeatureSet().get(Pipe.OPERATION).entrySet()) types[e.getValue()] = e.getKey(); + + //wds = new String[mf.getFeatureCounter().get(Pipe.WORD)]; + //for(Entry<String,Integer> e : 
mf.getFeatureSet().get(Pipe.WORD).entrySet()) wds[e.getValue()] = e.getKey(); + + + z.a0 = s_type;z.a1 = s_oper; z.a2 = s_char; z.a3 = s_char; z.a4 = s_char;z.a5 = s_char;z.a6 = s_char;z.a7 = s_char; + x.a0 = s_type; x.a1 = s_oper;x.a2 = s_word; x.a3 = s_word; x.a4 = s_word;x.a5 = s_char;x.a6 = s_char;x.a7 = s_char; + + } + + public static int s_pos,s_word,s_type,s_dir,s_dist, s_char, s_oper; + + + + /** + * Initialize the features. + * @param maxFeatures + */ + public void initFeatures() { + + + + for(int k=0;k<50;k++) { + mf.register(TYPE, "F"+k); + } + + _f0 = mf.register(TYPE, _F0); + _f1 = mf.register(TYPE, _F1); + _f2 = mf.register(TYPE, _F2); + _f3 = mf.register(TYPE, _F3); + _f4 = mf.register(TYPE, _F4); + _f5 = mf.register(TYPE, _F5); + _f6 = mf.register(TYPE, _F6); + _f7 = mf.register(TYPE, _F7); + _f8 = mf.register(TYPE, _F8); + _f9 = mf.register(TYPE, _F9); + _f10 = mf.register(TYPE, _F10); + _f11 = mf.register(TYPE, _F11); + _f12 = mf.register(TYPE, _F12); + _f13 = mf.register(TYPE, _F13); + _f14 = mf.register(TYPE, _F14); + _f15 = mf.register(TYPE, _F15); + _f16 = mf.register(TYPE, _F16); + _f17 = mf.register(TYPE, _F17); + _f18 = mf.register(TYPE, _F18); + _f19 = mf.register(TYPE, _F19); + _f20 = mf.register(TYPE, _F20); + _f21 = mf.register(TYPE, _F21); + _f22 = mf.register(TYPE, _F22); + _f23 = mf.register(TYPE, _F23); + _f24 = mf.register(TYPE, _F24); + _f25 = mf.register(TYPE, _F25); + _f26 = mf.register(TYPE, _F26); + _f27 = mf.register(TYPE, _F27); + _f28 = mf.register(TYPE, _F28); + _f29 = mf.register(TYPE, _F29); + _f30 = mf.register(TYPE, _F30); + + _f31 = mf.register(TYPE, _F31); + _f32 = mf.register(TYPE, _F32); + _f33 = mf.register(TYPE, _F33); + _f34 = mf.register(TYPE, _F34); + + _f35 = mf.register(TYPE, _F35); + _f36 = mf.register(TYPE, _F36); + _f37 = mf.register(TYPE, _F37); + _f38 = mf.register(TYPE, _F38); + + + mf.register(POS, MID); + mf.register(POS, STR); + mf.register(POS, END); + mf.register(TYPE, CHAR); + + _swrd = 
mf.register(WORD, STR); + _ewrd = mf.register(WORD, END); + + + _CEND = mf.register(CHAR, END); + + + } + + + + final public void addCoreFeatures(InstancesTagger is, int ic, int i, int oper, String form, long[] f) { + + for(int l=f.length-1;l>=0;l--) f[l]=0; + + int formi =is.forms[ic][i]; + int wl =is.chars[ic][i][11];//.forms[i].length(); + + int position = 1+(i<3?i:3); + + int c0= is.chars[ic][i][0], c1=is.chars[ic][i][1], c2=is.chars[ic][i][2], c3=is.chars[ic][i][3], c4=is.chars[ic][i][4],c5=is.chars[ic][i][5]; + int e0 =is.chars[ic][i][6], e1 =is.chars[ic][i][7],e2 =is.chars[ic][i][8],e3 =is.chars[ic][i][9],e4 =is.chars[ic][i][10]; + + int len = is.length(ic); + + + + x.v1=oper; x.v0 = _f0; x.v2 = formi; x.cz3(); f[0]=x.getVal(); f[1]=x.csa(3, position); + x.v0 = _f1; x.v2 = formi; x.v3 =i+1>=len?x.v3=_ewrd:is.forms[ic][i+1];x.cz4(); f[2]=x.getVal(); + + // contains upper case include again!!! + + short upper =0; + short number = 1; + for(int k1=0;k1<wl;k1++){ + char c =form.charAt(k1); + if (Character.isUpperCase(c)) { + if (k1==0) upper=1; + else { + // first char + another + if (upper==1)upper=3; + // another uppercase in the word + else if (upper==0) upper=2; + } + } + + if (Character.isDigit(c) && k1==0) number =2 ; + else if (Character.isDigit(c) && number==1) number = 3 ; + + } + + // contains a number + z.v0= _f21; z.v2=number; z.cz3();f[3]=z.getVal(); + + z.v0 = _f4; z.v1 = oper; z.v2=c0; z.cz3();f[4]=z.getVal(); + z.v0 = _f5; z.v2 = e0;z.cz3();f[5]=z.getVal(); + + z.v2=c0; z.v3=c1; z.v4=c2; z.v5=c3; z.v6=c4; + z.v0=_f6; z.cz4(); f[6]=z.getVal(); + z.v0=_f7; z.cz5(); f[7]=z.getVal(); + z.v0=_f8; z.cz6(); f[8]=z.getVal(); + z.v0=_f9; z.cz7(); f[9]=z.getVal(); + + int c=10; + z.v2=e0; z.v3=e1; z.v4=e2; z.v5=e3; z.v6=e4; + z.v0 =_f10; z.cz4();f[c++]=z.getVal(); f[c++]= z.csa(3, upper); + z.v0 =_f11; z.cz5();f[c++]=z.getVal(); f[c++]= z.csa(3, upper); + z.v0 =_f12; z.cz6();f[c++]=z.getVal(); f[c++]= z.csa(3, upper); + z.v0 =_f13; 
z.cz7();f[c++]=z.getVal(); f[c++]= z.csa(3, upper); + + if (len>i+1) { + + z.v0 = _f14; z.v2 = is.chars[ic][i+1][0]; + z.cz3();f[c++]=z.getVal(); + + z.v0 = _f15; z.v2 = is.chars[ic][i+1][5];z.cz3();f[c++]=z.getVal(); + + if (is.chars[ic][i+1][11]>1 ) { + z.v0 = _f16; z.v2 = is.chars[ic][i+1][0]; + z.v3 = is.chars[ic][i+1][2];z.cz4();f[c++]=z.getVal(); + + z.v0 = _f17; z.v2 = is.chars[ic][i+1][1]; + z.v3 = is.chars[ic][i+1][6]; + z.cz4();f[c++]=z.getVal();//fv.add(li.l2i(mf.calc4(b))); + } + + + x.v0 = _f18; + x.v2 = is.forms[ic][i+1]; + x.cz3();f[c++]=x.getVal(); + + if (len>i+2) { + x.v0 = _f32; + x.v2 = is.forms[ic][i+2]; x.v3 = is.forms[ic][i+1]; x.cz4();f[c++]=x.getVal(); + x.cz3();f[c++]=x.getVal();//fv.add(li.l2i(mf.calc3(b))); + + } + + if (len>i+3) { + x.v0 = _f33; x.v2 = is.forms[ic][i+3]; x.v3 = is.forms[ic][i+2];x.cz4();f[c++]=x.getVal();//fv.add(li.l2i(mf.calc4(b))); + x.cz3();f[27]=x.getVal();//fv.add(li.l2i(mf.calc3(b))); + } + } + + // length + + z.v0= _f19; z.v1=oper; z.v2=wl;z.cz3();f[c++]=z.getVal();//fv.add(li.l2i(mf.calc3(dl1))); + + if (i<1) return ; + + x.v0 = _f27; x.v1=oper; + x.v2 = is.forms[ic][i-1];x.cz3();f[c++]=x.getVal();//fv.add(li.l2i(mf.calc3(b))); + + + if (i<2) return ; + + //added this before it was 99.46 + x.v0 = _f28; x.v2 = is.forms[ic][i-2];x.cz3();f[c++]=x.getVal();//fv.add(li.l2i(mf.calc3(b))); + + // result 99.484 + if (i<3) return ; + + x.v0 = _f31; x.v1=oper; x.v2 = is.forms[ic][i-3]; x.v3 = is.forms[ic][i-2]; x.cz4();f[c++]=x.getVal();//fv.add(li.l2i(mf.calc4(b))); + + } + + + +// public String[] wds; + + /** + * Write the lemma that are not mapped by operations + * @param dos + */ + private void writeMap(DataOutputStream dos) { + + try { + dos.writeInt(opse.size()); + for(Entry<String, String> e : opse.entrySet()) { + dos.writeUTF(e.getKey()); + dos.writeUTF(e.getValue()); + } + } catch (IOException e1) { + e1.printStackTrace(); + } + } + + + + /** + * Read the form-lemma mapping not read by operations + * @param dis 
+ */ + public void readMap(DataInputStream dis) { + try { + int size = dis.readInt(); + for(int i =0; i<size;i++) { + opse.put(dis.readUTF(), dis.readUTF()); + } + } catch (IOException e1) { + e1.printStackTrace(); + } + } + + + + + + + /* (non-Javadoc) + * @see is2.tools.IPipe#write(java.io.DataOutputStream) + */ + @Override + public void write(DataOutputStream dos) { + this.writeMap(dos); + try { + cl.write(dos); + } catch (IOException e) { + e.printStackTrace(); + } + + } + + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/lemmatizer/StringEdit.java b/dependencyParser/basic/mate-tools/src/is2/lemmatizer/StringEdit.java new file mode 100755 index 0000000..8a4080e --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/lemmatizer/StringEdit.java @@ -0,0 +1,318 @@ +package is2.lemmatizer; + +import is2.util.DB; + +import java.util.ArrayList; + +public class StringEdit { + + + public static void main(String args[]) { + + + + String s = new StringBuffer(args[0]).reverse().toString(); + String t = new StringBuffer(args[1]).reverse().toString(); + + int d[][] = LD(s, t); + + + + StringBuffer opersations = new StringBuffer(); + searchPath(s,t,d, opersations, false); + System.out.println("resuylt "+" "+opersations); + + } + + + + + + //**************************** + // Get minimum of three values + //**************************** + + static private int Minimum (int a, int b, int c) { + int mi; + + mi = a; + if (b < mi) mi = b; + if (c < mi) mi = c; + + return mi; + + } + + //***************************** + // Compute Levenshtein distance + //***************************** + + static public int[][] LD (String s, String t) { + + int n = s.length (); + int m = t.length ();; // length of t + // char s_i; // ith character of s + // char t_j; // jth character of t + int cost; // cost + + // Step 1 + + + int[][] d = new int[n+1][m+1]; + + if (n == 0) return d; + if (m == 0) return d; + + // Step 2 + + for (int i = 0; i <= n; i++) d[i][0] = i; + for (int j = 0; j 
<= m; j++) d[0][j] = j; + + + // Step 3 + + for (int i = 1; i <= n; i++) { + + int s_i = s.charAt (i - 1); + + // Step 4 + + for (int j = 1; j <= m; j++) { + +// t_j = t.charAt (j - 1); + + // Step 5 + + if (s_i == t.charAt (j - 1)) cost = 0; + else cost = 1; + + + // Step 6 + + d[i][j] = Minimum (d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1] + cost); + + } + + } + + // Step 7 + + + + return d; + + } + + + + + + static String searchPath(String s, String t, int[][] d, StringBuffer operations, boolean debug) { + + StringBuffer result = new StringBuffer(s); + + int n = d.length; + int m = d[0].length; + + int x=n-1; + int y=m-1; + boolean changed =false; + while(true) { + if (debug && changed )System.out.println("result "+new StringBuffer(result) .reverse()); + + if (d[x][y]==0)break; + if (y>0&&x>0&& d[x-1][y-1]<d[x][y]) { + if (debug) System.out.println("min d[x-1][y-1] "+d[x-1][y-1]+" d[x][y] "+d[x][y]+" rep "+s.charAt(x-1)+" with "+t.charAt(y-1)+" at "+(x-1)); + + operations.append('R').append(Character.toString((char)((int)x-1))).append(s.charAt(x-1)).append(t.charAt(y-1)); + if (debug) result.setCharAt(x-1, t.charAt(y-1)); + y--; + x--; + changed =true; + continue; + } + if (y>0&& d[x][y-1]<d[x][y]) { + if (debug) System.out.println("min d[x][y-1] "+d[x][y-1]+" d[x][y] "+d[x][y]+" ins "+t.charAt(y-1)+" at "+(x)); + operations.append('I').append(Character.toString((char)((int)x))).append(t.charAt(y-1)); + if (debug)result.insert(x, t.charAt(y-1)); + y--; + changed =true; + continue; + } + if (x>0&& d[x-1][y]<d[x][y]) { + if (debug)System.out.println("min d[x-1][y] "+d[x-1][y]+" d[x][y] "+d[x][y]+" del "+s.charAt(x-1)+" at "+(x-1)); + operations.append('D').append(Character.toString((char)((int)x-1))).append(s.charAt(x-1)); + if (debug)result.deleteCharAt(x-1); + x--; + changed =true; + continue; + } + changed =false; + if (x>0&& y>0 && d[x-1][y-1]==d[x][y]) { + x--; y--; + continue ; + } + if (x>0&& d[x-1][y]==d[x][y]) { + x--; + continue; + } + if (y>0 && 
d[x][y-1]==d[x][y]) { + y--; + continue; + } + + } + if (debug) return result.reverse().toString(); + else return null; + } + + public static String change(String s, String operations) { + + StringBuffer result = new StringBuffer(s).reverse(); + + int pc =0; + while(true) { + if (operations.length()<=pc) break; + char nextOperation = operations.charAt(pc); + pc++; + if (nextOperation == 'R') { + //pc++; + int xm1 = (char)operations.charAt(pc); + pc++; + char replace = operations.charAt(pc); + pc++; + char with = operations.charAt(pc); + //operations.append('R').append((char)x-1).append(s.charAt(x-1)).append(t.charAt(y-1)); + // System.out.println(""+result+" xm1 "+xm1+" op "+operations); + + + if (result.length()<=xm1) return s; + + if (result.charAt(xm1)==replace) result.setCharAt(xm1, with); + //if (debug) result.setCharAt(x-1, t.charAt(y-1)); + pc++; + + }else if (nextOperation == 'I') { + // if (debug) System.out.println("min d[x][y-1] "+d[x][y-1]+" d[x][y] "+d[x][y]+" ins "+t.charAt(y-1)+" at "+(x)); + //operations.append('I').append((char)x).append(t.charAt(y-1)); + + //if (debug)result.insert(x, t.charAt(y-1)); + //y--; + //changed =true; + //pc++; + int x = operations.charAt(pc); + pc++; + char in = operations.charAt(pc); + + if (result.length()<x) return s; + + result.insert(x, in); + pc++; + } else if (nextOperation == 'D' ) { + //pc++; + int xm1 = operations.charAt(pc); + + + if (result.length()<=xm1) return s; + + result.deleteCharAt(xm1); + pc++; + // delete with + pc++; + // operations.append('D').append((char)x-1).append(s.charAt(x-1)); + // if (debug)result.deleteCharAt(x-1); + } + + } + return result.reverse().toString(); + //else return null; + } + + + + + + + + + + + + + + + + + + /** + * @param opers + * @param postion + * @return + */ + public static String get(ArrayList<String> opers, int position) { + for(String s : opers) { + int p = (int)s.charAt(1); + if (p==position) { + return s; + } + } + return "0"; + } + + + + + + /** + * @param form 
+ * @param string + * @param c + * @return + */ + public static String changeSimple(String form, String operation, int c) { + + if (operation.equals("0")) return form; + + if (operation.charAt(0)=='I') { + StringBuffer f = new StringBuffer(form); + if (f.length()<=c) { + // DB.println("fail insert "); + return form; + } + f.insert(c+1, operation.charAt(1)); + return f.toString(); + } + if (operation.charAt(0)=='R') { + StringBuffer f = new StringBuffer(form); + // if (f.length()<=c) f.append(' '); + if (f.length()<=c) { + // DB.println("fail replace "); + return form; + } + f.setCharAt(c, operation.charAt(2)); + return f.toString(); + } + + if (operation.charAt(0)=='D') { + StringBuffer f = new StringBuffer(form); + f.delete(c, c+1);//.append(' '); + return f.toString(); + } + return form; + } + + + + + + /** + * @param string + * @return + */ + public static String simple(String o) { + StringBuffer s = new StringBuffer(o); + s.delete(1, 2); + return s.toString(); + } + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/mtag/Convert.java b/dependencyParser/basic/mate-tools/src/is2/mtag/Convert.java new file mode 100755 index 0000000..e262269 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/mtag/Convert.java @@ -0,0 +1,98 @@ +/** + * + */ +package is2.mtag; + + + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.util.ArrayList; + +/** + * @author Dr. 
Bernd Bohnet, 20.01.2010 + * + * + */ +public class Convert { + + public static void main (String[] args) throws IOException { + + Options options = new Options(args); + + split(options.trainfile); + + } + + /** + * @param trainfile + * @throws IOException + */ + private static void split(String trainfile) throws IOException { + + String dir = "split"; + boolean success = (new File("split")).mkdir(); + if (success) System.out.println("Directory: " + dir + " created"); + + + ArrayList<String> corpus = new ArrayList<String>(); + + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(trainfile),"UTF-8"),32768); + String l =null; + int sentences = 0; + try { + while( (l = reader.readLine())!=null) { + + corpus.add(l); + if (l.length()<8) sentences++; + + } + } catch (IOException e) { + e.printStackTrace(); + } + System.out.println("Corpus has "+sentences+" sentences."); + + int partSize = sentences / 20; + System.out.println("Prepare corpus for cross annotations with 20 parts with part size "+partSize+" number of lines "+corpus.size()); + + + + for(int k=0;k<20;k++) { + BufferedWriter br = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("split/p-"+k),"UTF-8")); + BufferedWriter rest = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("split/r-"+k),"UTF-8")); + int skip=k*partSize; + + int countSentences=0; + int countSentencesWrote=0; + System.out.println("skip from "+skip+" to "+(skip+partSize-1)); + for(String x : corpus) { + if (countSentences>=skip && (countSentences<(skip+partSize)||k==19)){ + rest.write(x); + rest.newLine(); + if (x.length()<8) countSentencesWrote++; + } else { + br.write(x); + br.newLine(); + } + + if (x.length()<8) countSentences++; + } + System.out.println("wrote for this part "+countSentencesWrote); + br.flush(); + br.close(); + rest.flush(); + rest.close(); + + } + + + } + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/mtag/Evaluator.java 
b/dependencyParser/basic/mate-tools/src/is2/mtag/Evaluator.java new file mode 100755 index 0000000..09d1455 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/mtag/Evaluator.java @@ -0,0 +1,148 @@ +package is2.mtag; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.Hashtable; +import java.util.Map.Entry; + +import is2.data.SentenceData09; +import is2.io.CONLLReader09; + + +public class Evaluator { + + public static void evaluate (String act_file, String pred_file, String format) throws Exception { + + CONLLReader09 goldReader = new CONLLReader09(act_file);//DependencyReader.createDependencyReader(); + // boolean labeled = goldReader.startReading(act_file); + + CONLLReader09 predictedReader = new CONLLReader09(); + predictedReader.startReading(pred_file); + +// if (labeled != predLabeled) +// System.out.println("Gold file and predicted file appear to differ on whether or not they are labeled. Expect problems!!!"); + + + int total = 0, totalP=0,corr = 0, corrL = 0, corrT=0,totalX=0; + int totalD=0, corrD=0,err=0; + int numsent = 0, corrsent = 0, corrsentL = 0; + SentenceData09 goldInstance = goldReader.getNext(); + SentenceData09 predInstance = predictedReader.getNext(); + + Hashtable<String,Integer> errors = new Hashtable<String,Integer>(); + Hashtable<String,StringBuffer> words = new Hashtable<String,StringBuffer>(); + + + while(goldInstance != null) { + + int instanceLength = goldInstance.length(); + + if (instanceLength != predInstance.length()) + System.out.println("Lengths do not match on sentence "+numsent); + + + String gold[] = goldInstance.ofeats; + String pred[] = predInstance.pfeats; + + boolean whole = true; + boolean wholeL = true; + + // NOTE: the first item is the root info added during nextInstance(), so we skip it. 
+ + for (int i = 1; i < instanceLength; i++) { + if (gold[i].equals(pred[i])||(gold[i].equals("_")&&pred[i]==null)) corrT++; + else { + // System.out.println("gold:"+goldFeats[i]+" pred:"+predFeats[i]+" "+goldInstance.forms[i]+" snt "+numsent+" i:"+i); + //for (int k = 1; k < instanceLength; k++) { + + // System.out.print(goldInstance.forms[k]+":"+goldInstance.gpos[k]); + // if (k==i) System.out.print(":"+predInstance.gpos[k]); + // System.out.print(" "); + + // } + //System.out.println(); + String key = "gold: '"+gold[i]+"' pred: '"+pred[i]+"'"; + Integer cnt = errors.get(key); + StringBuffer errWrd = words.get(key); + if (cnt==null) { + errors.put(key,1); + words.put(key, new StringBuffer().append(goldInstance.forms[i])); + } + else { + errors.put(key,cnt+1); + errWrd.append(" "+goldInstance.forms[i]); + } + err++; + + } + String[] gf = gold[i].split("|"); + int eq=0; + + if (pred[i]!=null) { + String[] pf = pred[i].split("|"); + totalP +=pf.length; + + if (pf.length>gf.length) totalX +=pf.length; + else totalX+=gf.length; + + for(String g : gf) { + for(String p : pf) { + if (g.equals(p)) {eq++;break;} + } + } + } else totalX+=gf.length; + totalD +=gf.length; + corrD +=eq; + } + total += instanceLength - 1; // Subtract one to not score fake root token + + if(whole) corrsent++; + if(wholeL) corrsentL++; + numsent++; + + goldInstance = goldReader.getNext(); + predInstance = predictedReader.getNext(); + } + + ArrayList<Entry<String, Integer>> opsl = new ArrayList<Entry<String, Integer>>(); + for(Entry<String, Integer> e : errors.entrySet()) { + opsl.add(e); + } + + Collections.sort(opsl, new Comparator<Entry<String, Integer>>(){ + + @Override + public int compare(Entry<String, Integer> o1, + Entry<String, Integer> o2) { + + return o1.getValue()==o2.getValue()?0:o1.getValue()>o2.getValue()?-1:1; + } + + + }); + + + int cnt=0; + System.out.println("10 top most errors:"); + for(Entry<String, Integer> e : opsl) { + cnt++; + // System.out.println(e.getKey()+" 
"+e.getValue()+" context: "+words.get(e.getKey())); + } + + + System.out.println("Tokens: " + total+" Correct: " + corrT+" "+(float)corrT/total+" R "+((float)corrD/totalD)+" tP "+totalP+" tG "+totalD+" P "+(float)corrD/totalP); + System.out.println("err: " + err+" total "+total+" corr "+corrT); +// System.out.println("Unlabeled Complete Correct: " + ((double)corrsent/numsent)); + + } + + public static void main (String[] args) throws Exception { + String format = "CONLL"; + if (args.length > 2) + format = args[2]; + + evaluate(args[0], args[1], format); + } + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/mtag/ExtractorM.java b/dependencyParser/basic/mate-tools/src/is2/mtag/ExtractorM.java new file mode 100644 index 0000000..864b977 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/mtag/ExtractorM.java @@ -0,0 +1,514 @@ +package is2.mtag; + + +import is2.data.Cluster; +import is2.data.F2SF; +import is2.data.Instances; +import is2.data.InstancesTagger; +import is2.data.Long2Int; +import is2.data.Long2IntInterface; +import is2.data.ParametersFloat; +import is2.data.PipeGen; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; +import is2.tools.IPipe; +import is2.util.OptionsSuper; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map.Entry; + + +final public class ExtractorM extends PipeGen implements IPipe { + + public static int _CEND; + + + private static final String STWRD = "STWRD",STPOS = "STPOS",END = "END",STR = "STR"; + + public String[] types; + + Cluster cl; + + final public MFO mf =new MFO(); + public Long2IntInterface li; + + + + final MFO.Data4 d1 = new MFO.Data4(),d2 = new MFO.Data4(),d3 = new MFO.Data4(),dw = new MFO.Data4(); + final MFO.Data4 dwp = new MFO.Data4(),dp = new MFO.Data4(); + + + private OptionsSuper options; + private int _ewrd; + static private int _mid, _strp,_endp; + + public 
ExtractorM (Options options, Long2Int long2Int) throws IOException { + this.options = options; + + li =long2Int; + } + + public ExtractorM (OptionsSuper options) { + this.options = options; + } + + + public HashMap<Integer,Integer> form2morph = new HashMap<Integer, Integer> (); + + + public Instances createInstances(String file) { + + CONLLReader09 depReader = new CONLLReader09(CONLLReader09.NO_NORMALIZE); + + depReader.startReading(file); + mf.register(POS,"<root-POS>"); + + mf.register(FFEATS, CONLLReader09.NO_TYPE); + mf.register(FFEATS, ""); + + InstancesTagger is = new InstancesTagger(); + + System.out.println("Registering feature parts "); + + HashMap<String,HashSet<String>> op2form = new HashMap<String, HashSet<String>> (); + HashMap<String,Integer> freq = new HashMap<String, Integer> (); + + + int ic=0; + while(true) { + SentenceData09 instance1 = depReader.getNext(); + if (instance1== null) break; + ic++; + + + String[] w = instance1.forms; + for(int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]); + for(int i1 = 0; i1 < w.length; i1++) registerChars(CHAR, w[i1]); + + for(int i1 = 0; i1 < w.length; i1++) { + mf.register(WORD, w[i1].toLowerCase()); + Integer f = freq.get(w[i1].toLowerCase()); + + if (f==null) freq.put(w[i1].toLowerCase(), 1); + else freq.put(w[i1].toLowerCase(), f+1); + + HashSet<String> forms = op2form.get(w[i1].toLowerCase()); + if (forms==null) { + forms = new HashSet<String>(); + op2form.put(w[i1].toLowerCase(), forms); + } + forms.add(instance1.ofeats[i1]==null?"_":instance1.ofeats[i1]); + } + for(int i1 = 0; i1 < w.length; i1++) registerChars(CHAR, w[i1].toLowerCase()); + + w = instance1.plemmas; + for(int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]); + for(int i1 = 0; i1 < w.length; i1++) registerChars(CHAR, w[i1]); + + w = instance1.ppos; + for(int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); + + w = instance1.gpos; + for(int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); + + w = instance1.ofeats; + 
for(int i1 = 0; i1 < w.length; i1++) if (w[i1]!=null) mf.register(FEAT, w[i1]); + for(int i1 = 0; i1 < w.length; i1++) if (w[i1]!=null) mf.register(FFEATS, w[i1]); + + // w = instance1.pfeats; + //for(int i1 = 0; i1 < w.length; i1++) if (w[i1]!=null) mf.register(FEAT, w[i1]); + } + + + for(Entry<String,HashSet<String>> e : op2form.entrySet()) { + if (e.getValue().size()==1 &&freq.get(e.getKey())>10) { + // System.out.println("found map "+e.getKey()+" "+e.getValue()+" "+freq.get(e.getKey())); + form2morph.put(mf.getValue(ExtractorM.WORD, e.getKey()), mf.getValue(FFEATS, (String)e.getValue().toArray()[0])); + } + } + + initFeatures(); + + mf.calculateBits(); + initValues(); + + System.out.println(""+mf.toString()); + + depReader.startReading(file); + + int num1 = 0; + long start1 = System.currentTimeMillis(); + + System.out.print("Creating Features: "); + is.init(ic, mf) ; + int del=0; + + while(true) { + if (num1 % 100 ==0) {del = outValue(num1, del);} + SentenceData09 instance1 = depReader.getNext(is); + if (instance1== null) break; + + if (num1>options.count) break; + + num1++; + } + long end1 = System.currentTimeMillis(); + System.gc(); + long mem2 = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory(); + System.out.print(" time "+(end1-start1)+" mem "+(mem2/1024)+" kb"); + + types = new String[mf.getFeatureCounter().get(FFEATS)]; + + for(Entry<String,Integer> e : mf.getFeatureSet().get(FFEATS).entrySet()) { + types[e.getValue()] = e.getKey(); + } + + + if (options.clusterFile==null)cl = new Cluster(); + else cl= new Cluster(options.clusterFile, mf,6); + + + System.out.println("Num Features: " + types.length); + + + + depReader.startReading(file); + + + + int num11=0; + + while(true) { + + SentenceData09 instance = depReader.getNext(); + if (instance==null) break; + + is.fillChars(instance, num11, _CEND); + + + if (num11>options.count) break; + + num11++; + } + + return is;//.toNativeArray(); + + } + + private void registerChars(String type, 
String word) { + for(int i=0;i<word.length();i++) mf.register(type, Character.toString(word.charAt(i))); + } + + + + public void initValues() { + s_feat = mf.getFeatureBits(FFEATS); + s_word = mf.getFeatureBits(WORD); + s_type = mf.getFeatureBits(TYPE); + s_char = mf.getFeatureBits(CHAR); + s_pos =mf.getFeatureBits(POS); + // dl1.a[0] = s_type; dl1.a[1] = s_pos; + // for (int k = 2; k < 7; k++) dl1.a[k] = s_pos; + + d1.a0 = s_type; d1.a1 = s_feat; d1.a2= s_word; + d2.a0 = s_type; d2.a1 = s_feat; d2.a2= s_feat; d2.a3= s_feat; d2.a4= s_feat; d2.a5= s_feat; d2.a6= s_feat; + d3.a0 = s_type; d3.a1 = s_feat; d3.a2= s_char; d3.a3= s_char; d3.a4= s_char; d3.a5= s_char; d3.a6= s_char; d3.a7= s_char; + dp.a0 = s_type; dp.a1 = s_feat; dp.a2= s_pos; dp.a3= s_pos; dp.a4= s_feat;// dp.a5= s_char; dp.a6= s_char; dp.a7= s_char; + dw.a0 = s_type; dw.a1 = s_feat;dw.a2= s_word; dw.a3= s_word; dw.a4= s_word; dw.a5= s_word; dw.a6= s_word; dw.a7= s_word; + dwp.a0 = s_type; dwp.a1 = s_feat;dwp.a2= s_word ; dwp.a3= s_feat; dwp.a4= s_word; + + } + + public static short s_feat,s_word,s_type,s_dir,s_dist,s_char,s_pos; + + + + /** + * Initialize the features types. 
+ */ + public void initFeatures() { + + for(int t=0;t<62;t++) { + mf.register(TYPE,"F"+t); + } + + +// _mid = mf.register(POS, MID); + _strp = mf.register(POS, STR); + _endp= mf.register(POS, END); + + mf.register(WORD, STR); + _ewrd = mf.register(WORD, END); + + + _CEND = mf.register(CHAR, END); + + + + + // optional features + mf.register(WORD,STWRD); + mf.register(POS,STPOS); + + + } + + + final public void addCF(InstancesTagger is, int ic, String fs,int i, short pfeat[],short ppos[], int[] forms, int[] lemmas, long[] vs) { + + int c0= is.chars[ic][i][0], c1=is.chars[ic][i][1], c2=is.chars[ic][i][2], c3=is.chars[ic][i][3], c4=is.chars[ic][i][4],c5=is.chars[ic][i][5]; + int e0 =is.chars[ic][i][6], e1 =is.chars[ic][i][7],e2 =is.chars[ic][i][8],e3 =is.chars[ic][i][9],e4 =is.chars[ic][i][10]; + + int f=1,n=0; + short upper =0, number = 1; + for(int k1=0;k1<fs.length();k1++){ + char c = fs.charAt(k1); + if (Character.isUpperCase(c)) { + if (k1==0) upper=1; + else { + // first char + another + if (upper==1) upper=3; + // another uppercase in the word + else if (upper==0) upper=2; + } + } + + if (Character.isDigit(c) && k1==0) number =2 ; + else if (Character.isDigit(c) && number==1) number = 3; + } + + int form = forms[i]; + + int len = forms.length; + long l; + d1.v0 = f++; d1.v2=form; l=mf.calc3(d1); vs[n++]=mf.calc3(d1); + + d1.v0 = f++; d1.v2=is.formlc[ic][i]; vs[n++]=mf.calc3(d1); + + d3.v2=c0; d3.v3=c1; d3.v4=c2; d3.v5=c3; d3.v6=c4; + d3.v0=f++; vs[n++]=mf.calc3(d3); + d3.v0=f++; vs[n++]=mf.calc4(d3); + d3.v0=f++; vs[n++]=mf.calc5(d3); + d3.v0=f++; vs[n++]=mf.calc6(d3); + d3.v0=f++; vs[n++]=mf.calc7(d3); + + if (form!=-1) { + d3.v2=c2; d3.v3=c3; d3.v4=c4; d3.v5=c5; d3.v6=cl.getLP(form); + d3.v0=f; vs[n++]=mf.calc6(d3); d3.v0=f+1; vs[n++]=mf.calc7(d3); + } + f+=2; + + if (form>0) { + d3.v0=f; d3.v5=cl.getLP(form); vs[n++]=mf.calc6(d3); + d3.v0=f+1; d3.v4=cl.getLP(form); vs[n++]=mf.calc5(d3); + d3.v0=f+2; d3.v3=cl.getLP(form); vs[n++]=mf.calc4(d3); + } + f+=3; + + 
d3.v2=e0; d3.v3=e1; d3.v4=e2; d3.v5=e3; d3.v6=e4; + d3.v0 =f++; vs[n++]=mf.calc3(d3); + d3.v0 =f++; vs[n++]=l=mf.calc4(d3); vs[n++]=d3.calcs(3, upper, l); + d3.v0 =f++; vs[n++]=l=mf.calc5(d3); vs[n++]=d3.calcs(3, upper, l); + d3.v0 =f++; vs[n++]=l=mf.calc6(d3); vs[n++]=d3.calcs(3, upper, l); + d3.v0 =f++; vs[n++]=l=mf.calc7(d3); vs[n++]=d3.calcs(3, upper, l); + + if (form>0) { + d3.v0=f; d3.v5=cl.getLP(form); vs[n++]=mf.calc6(d3); + d3.v0=f+1; d3.v4=cl.getLP(form); vs[n++]=mf.calc5(d3); + d3.v0=f+2; d3.v3=cl.getLP(form); vs[n++]=mf.calc4(d3); + } + f+=3; + + + dw.v0=f++; dw.v2=i+1<len?forms[i+1]:_ewrd;dw.v3= forms[i];vs[n++]=mf.calc4(dw); + + if (len>i+1) { + + dw.v0=f; dw.v2= forms[i+1]; vs[n++]=mf.calc3(dw); + d3.v0=f+1; d3.v2 =is.chars[ic][i+1][0];vs[n++]=mf.calc3(d3); + d3.v0=f+2; d3.v2 =is.chars[ic][i+1][6];vs[n++]=mf.calc3(d3); + + d3.v2=e0; d3.v3=e1; + + d3.v0 =f+3; d3.v4 =is.chars[ic][i+1][0];vs[n++]=mf.calc5(d3); + d3.v0 =f+4; d3.v4 =is.chars[ic][i+1][6];vs[n++]=mf.calc5(d3); + + + + if (is.chars[ic][i+1][11]>1 ) { // instance.forms[i+1].length() + + d3.v0=f+5; d3.v2=is.chars[ic][i+1][0]; d3.v3=is.chars[ic][i+1][1]; vs[n++]=mf.calc4(d3); + d3.v0=f+6; d3.v2=is.chars[ic][i+1][6]; d3.v3=is.chars[ic][i+1][7]; vs[n++]=mf.calc4(d3); + + d3.v2=e0; d3.v3=e1; + + d3.v0=f+7; d3.v4 = is.chars[ic][i+1][0]; d3.v5 =is.chars[ic][i+1][1]; vs[n++]=mf.calc6(d3); + d3.v0=f+8; d3.v4 = is.chars[ic][i+1][6]; d3.v5=is.chars[ic][i+1][7]; vs[n++]=mf.calc6(d3); + + if (forms[i+1]>0) { + d3.v0=f+9; d3.v2=is.chars[ic][i+1][0]; d3.v3=is.chars[ic][i+1][1]; d3.v4 =cl.getLP(forms[i+1]); vs[n++]=mf.calc5(d3); + d3.v0=f+10; d3.v2=is.chars[ic][i+1][6]; d3.v3=is.chars[ic][i+1][7]; d3.v4 =cl.getLP(forms[i+1]); vs[n++]=mf.calc5(d3); + } + } + + if (forms[i+1]>0) { + dw.v0=f+11; dw.v2= cl.getLP(forms[i+1]); dw.v3= forms[i];vs[n++]=mf.calc4(dw); + } + + if (len>i+2) { + dw.v0=f+12; dw.v2= forms[i+2]; dw.v3 = forms[i+1];vs[n++]=mf.calc4(dw);vs[n++]=mf.calc3(dw); +// d2.v0=f+13; d2.v2=pfeat[i+1]; 
d2.v3= pfeat[i+2]; vs[n++]=mf.calc4(d2); + // dp.v0= f+14; dp.v2=ppos[i+1]; dp.v3=ppos[i+2]; vs[n++]=mf.calc4(dp); + + } + + if (len>i+3) { + dw.v0=f+14; dw.v2= forms[i+3]; dw.v3 = forms[i+2]; vs[n++]=mf.calc4(dw); vs[n++]=mf.calc3(dw); + + } + } + f+=16; + + // length + d2.v0=f++; d2.v2=is.chars[ic][i][11];vs[n++]=mf.calc3(d2); + + + // contains a number + d2.v0=f++; d2.v2=number; vs[n++]=mf.calc3(d2); + d1.v0=f++; d1.v2=lemmas[i]; vs[n++]=mf.calc3(d1); + + if (i!=0 &&len>i+1) { + dw.v0=f; dw.v2=lemmas[i-1];dw.v3=lemmas[i+1];vs[n++]=mf.calc4(dw); + d2.v0=f+1; d2.v2=pfeat[i-1]; d2.v3=pfeat[i+1];vs[n++]=mf.calc4(d2); + } + f+=2; + + d2.v0= f++; d2.v2=i>=1? pfeat[i-1]:_strp; vs[n++]=mf.calc3(d2); + dp.v0= f++; dp.v2=ppos[i]; vs[n++]=mf.calc3(dp); + + if (i>0) { + dw.v0 = f++; dw.v2 =i>=1? forms[i-1]:_strp; vs[n++]=mf.calc3(dw); + dw.v0 = f++; dw.v2 = i>=1? lemmas[i-1]:_strp; vs[n++]=mf.calc3(dw); + + if (len>i+1) { +// d2.v0=f; d2.v2= pfeat[i-1];d2.v3= pfeat[i+1]; vs[n++]=mf.calc4(d2); + // dp.v0= f+1; dp.v2=ppos[i-1]; dp.v3=ppos[i+1]; vs[n++]=mf.calc4(dp); + + } + f++; + dp.v0= f++; dp.v2=ppos[i]; dp.v3=ppos[i-1]; vs[n++]=mf.calc4(dp); + + if (i>1) { + d2.v0=f++; d2.v2=i<2?_strp: pfeat[i-2]; vs[n++]=mf.calc3(d2); + d2.v0=f++; d2.v2= pfeat[i-1]; d2.v3= pfeat[i-2]; vs[n++]=mf.calc4(d2); + + dw.v0=f++; dw.v2= forms[i-2]; vs[n++]=mf.calc3(dw); + dwp.v0=f++; dwp.v2 = forms[i-1]; dwp.v3 = pfeat[i-2];vs[n++]=mf.calc4(dwp); + dwp.v0=f++; dwp.v2 = forms[i-2]; dwp.v3 = pfeat[i-1];vs[n++]=mf.calc4(dwp); + + if (i>2) { + d2.v0=f++; d2.v2=pfeat[i-3]; vs[n++]=mf.calc3(d2); + d2.v0=f++; d2.v2=pfeat[i-2]; d2.v3= pfeat[i-3]; vs[n++]=mf.calc4(d2); + dw.v0=f++; dw.v2 = forms[i-3]; dw.v3 = forms[i-2]; vs[n++]=mf.calc4(dw); + // dp.v0= f++; dp.v2=ppos[i-3]; dp.v3=ppos[i-2]; vs[n++]=mf.calc4(dp); + } + } + } + vs[n] = Integer.MIN_VALUE; + } + + + + + + + + public int fillFeatureVectorsOne(ParametersFloat params, int w1, String form, Instances is, int n, short[] features, long[] vs) { + 
double best = -1; + int bestType=-1; + + F2SF f = new F2SF(params.parameters); + //is.gfeats[n] + addCF((InstancesTagger)is, n, form, w1, features,is.pposs[n], is.forms[n], is.plemmas[n], vs); + + for(int t = 0; t < types.length; t++) { + + f.clear(); + int p = t<<ExtractorM.s_type; + for(int k=0;k<vs.length;k++) { + if (vs[k]==Integer.MIN_VALUE) break; + if (vs[k]>=0) f.add(li.l2i(vs[k]+p)); + } + if (f.score >best) { + bestType=t; + best =f.score; + } + + } + return bestType; + + } + + + + //static ArrayList<T> todo = new ArrayList<T>(); + static SentenceData09 instance; + + + public static int _FC =200; + + + /** + * Write the lemma that are not mapped by operations + * @param dos + */ + public void writeMap(DataOutputStream dos) { + + try { + dos.writeInt(this.form2morph.size()); + for(Entry<Integer, Integer> e : form2morph.entrySet()) { + dos.writeInt(e.getKey()); + dos.writeInt(e.getValue()); + } + } catch (IOException e1) { + e1.printStackTrace(); + } + } + + + + /** + * Read the form-lemma mapping not read by operations + * @param dis + */ + public void readMap(DataInputStream dis) { + try { + int size = dis.readInt(); + for(int i =0; i<size;i++) { + form2morph.put(dis.readInt(), dis.readInt()); + } + } catch (IOException e1) { + e1.printStackTrace(); + } + } + + + /* (non-Javadoc) + * @see is2.tools.IPipe#write(java.io.DataOutputStream) + */ + @Override + public void write(DataOutputStream dos) { + try { + cl.write(dos); + writeMap(dos); + } catch (IOException e) { + e.printStackTrace(); + } + } + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/mtag/MFO.java b/dependencyParser/basic/mate-tools/src/is2/mtag/MFO.java new file mode 100755 index 0000000..d91991e --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/mtag/MFO.java @@ -0,0 +1,540 @@ +package is2.mtag; + + +import is2.data.IEncoderPlus; +import is2.util.DB; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import 
java.util.HashMap; +import java.util.Map.Entry; + +/** + * Map Features, do not map long to integer + * + * @author Bernd Bohnet, 20.09.2009 + */ + +final public class MFO implements IEncoderPlus { + + /** The features and its values */ + static private final HashMap<String,HashMap<String,Integer>> m_featureSets = new HashMap<String,HashMap<String,Integer>>(); + + /** The feature class and the number of values */ + static private final HashMap<String,Integer> m_featureCounters = new HashMap<String,Integer>(); + + /** The number of bits needed to encode a feature */ + static final HashMap<String,Integer> m_featureBits = new HashMap<String,Integer>(); + + /** Integer counter for long2int */ + //private int count=0; + + /** Stop growing */ + public boolean stop=false; + + final public static String NONE="<None>"; + + public static class Data { + public final String[] a = new String[8]; + public final String[] v = new String[8]; + final short[] s = new short[9]; + public void clear(int i) { + v[i]=null; + } + } + + + + final public static class Data4 { + public int shift; + public short a0,a1,a2,a3,a4,a5,a6,a7,a8,a9; + public int v0,v1,v2,v3,v4,v5,v6,v7,v8,v9; + + final public long calcs(int b, long v, long l) { + if (l<0) return l; + l |= v<<shift; + shift +=b; + return l; + } + + + final public long calc2() { + + if (v0<0||v1<0) return -1; + + long l = v0; + shift =a0; + l |= (long)v1<<shift; + shift +=a1; + + return l; + } + + + + final public long calc3() { + + if (v0<0||v1<0||v2<0) return -1; + // if (v1<0||v2<0) return -1; + + long l = v0; + shift =a0; + l |= (long)v1<<shift; + shift +=a1; + l |= (long)v2<<shift; + shift=(short) (shift + a2); + + //shift=; + return l; + } + + + final public long calc4() { + if (v0<0||v1<0||v2<0||v3<0) return -1; + + long l = v0; + shift =a0; + l |= (long)v1<<shift; + shift +=a1; + l |= (long)v2<<shift; + shift +=a2; + l |= (long)v3<<shift; + shift= shift +a3; + + return l; + } + + + + final public long calc5() { + + if 
(v0<0||v1<0||v2<0||v3<0||v4<0) return -1; + + long l = v0; + shift =a0; + l |= (long)v1<<shift; + shift +=a1; + l |= (long)v2<<shift; + shift +=a2; + l |= (long)v3<<shift; + shift +=a3; + l |= (long)v4<<shift; + shift =shift+a4; + + return l; + } + + + final public long calc6() { + + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0) return -1; + + long l = v0; + shift =a0; + l |= (long)v1<<shift; + shift +=a1; + l |= (long)v2<<shift; + shift +=a2; + l |= (long)v3<<shift; + shift +=a3; + l |= (long)v4<<shift; + shift +=a4; + l |= (long)v5<<shift; + shift =shift+a5; + + return l; + } + + final public long calc7() { + + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0||v6<0) return -1; + + long l = v0; + shift =a0; + l |= (long)v1<<shift; + shift +=a1; + l |= (long)v2<<shift; + shift +=a2; + l |= (long)v3<<shift; + shift +=a3; + l |= (long)v4<<shift; + shift +=a4; + l |= (long)v5<<shift; + shift +=a5; + l |= (long)v6<<shift; + shift =shift+a6; + + return l; + } + + + final public long calc8() { + + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0||v6<0||v7<0) return -1; + + long l = v0; + shift =a0; + l |= (long)v1<<shift; + shift +=a1; + l |= (long)v2<<shift; + shift +=a2; + l |= (long)v3<<shift; + shift +=a3; + l |= (long)v4<<shift; + shift +=a4; + l |= (long)v5<<shift; + shift +=a5; + l |= (long)v6<<shift; + shift +=a6; + l |= (long)v7<<shift; + shift =shift+a7; + + return l; + } + + } + + public MFO () {} + + + // public int size() {return count;} + + + + + /** + * Register an attribute class, if it not exists and add a possible value + * @param type + * @param type2 + */ + final public int register(String a, String v) { + + HashMap<String,Integer> fs = getFeatureSet().get(a); + if (fs==null) { + fs = new HashMap<String,Integer>(); + getFeatureSet().put(a, fs); + fs.put(NONE, 0); + getFeatureCounter().put(a, 1); + } + Integer c = getFeatureCounter().get(a); + + Integer i = fs.get(v); + if (i==null) { + fs.put(v, c); + c++; + getFeatureCounter().put(a,c); + return c-1; + } else return i; + } + 
+ /** + * Calculates the number of bits needed to encode a feature + */ + public void calculateBits() { + + int total=0; + for(Entry<String,Integer> e : getFeatureCounter().entrySet() ){ + int bits =(int)Math.ceil((Math.log(e.getValue()+1)/Math.log(2))); + m_featureBits.put(e.getKey(), bits); + total+=bits; + // System.out.println(" "+e.getKey()+" bits "+bits+" number "+(e.getValue()+1)); + } + + // System.out.println("total number of needed bits "+total); + } + + + + @Override + public String toString() { + + StringBuffer content = new StringBuffer(); + for(Entry<String,Integer> e : getFeatureCounter().entrySet() ){ + content.append(e.getKey()+" "+e.getValue()); + content.append(':'); + // HashMap<String,Integer> vs = getFeatureSet().get(e.getKey()); + content.append(getFeatureBits(e.getKey())); + + /*if (vs.size()<120) + for(Entry<String,Integer> e2 : vs.entrySet()) { + content.append(e2.getKey()+" ("+e2.getValue()+") "); + }*/ + content.append('\n'); + + } + return content.toString(); + } + + + static final public long calcs(Data4 d,int b, long v, long l) { + if (l<0) return l; + l |= v<<d.shift; + d.shift +=b; + return l; + } + + + static final public short getFeatureBits(String a) { + return (short)m_featureBits.get(a).intValue(); + } + + + + /** + * Get the integer place holder of the string value v of the type a + * + * @param t the type + * @param v the value + * @return the integer place holder of v + */ + final public int getValue(String t, String v) { + + if (m_featureSets.get(t)==null) return -1; + Integer vi = m_featureSets.get(t).get(v); + if (vi==null) return -1; //stop && + return vi.intValue(); + } + + /** + * Static version of getValue + * @see getValue + */ + static final public int getValueS(String a, String v) { + + if (m_featureSets.get(a)==null) return -1; + Integer vi = m_featureSets.get(a).get(v); + if (vi==null) return -1; //stop && + return vi.intValue(); + } + + public int hasValue(String a, String v) { + + Integer vi = 
m_featureSets.get(a).get(v); + if (vi==null) return -1; + return vi.intValue(); + } + + + + + final public long calc2(Data4 d) { + + if (d.v0<0||d.v1<0) return -1; + // if (d.v1<0||d.v2<0) return -1; + + long l = d.v0; + short shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + // l |= (long)d.v2<<shift; + d.shift=shift; + + //d.shift=; + return l; + } + + + + final public long calc3(Data4 d) { + + if (d.v0<0||d.v1<0||d.v2<0) return -1; + // if (d.v1<0||d.v2<0) return -1; + + long l = d.v0; + short shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + l |= (long)d.v2<<shift; + d.shift=shift + d.a2; + + //d.shift=; + return l; + } + + + final public long calc4(Data4 d) { + if (d.v0<0||d.v1<0||d.v2<0||d.v3<0) return -1; + + long l = d.v0; + int shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + l |= (long)d.v2<<shift; + shift +=d.a2; + l |= (long)d.v3<<shift; + d.shift= shift +d.a3; + + return l; + } + + + + final public long calc5(Data4 d) { + + if (d.v0<0||d.v1<0||d.v2<0||d.v3<0||d.v4<0) return -1; + + long l = d.v0; + int shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + l |= (long)d.v2<<shift; + shift +=d.a2; + l |= (long)d.v3<<shift; + shift +=d.a3; + l |= (long)d.v4<<shift; + d.shift =shift+d.a4; + + return l; + } + + + final public long calc6(Data4 d) { + + if (d.v0<0||d.v1<0||d.v2<0||d.v3<0||d.v4<0||d.v5<0) return -1; + + long l = d.v0; + int shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + l |= (long)d.v2<<shift; + shift +=d.a2; + l |= (long)d.v3<<shift; + shift +=d.a3; + l |= (long)d.v4<<shift; + shift +=d.a4; + l |= (long)d.v5<<shift; + d.shift =shift+d.a5; + + return l; + } + + final public long calc7(Data4 d) { + + if (d.v0<0||d.v1<0||d.v2<0||d.v3<0||d.v4<0||d.v5<0||d.v6<0) return -1; + + long l = d.v0; + int shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + l |= (long)d.v2<<shift; + shift +=d.a2; + l |= (long)d.v3<<shift; + shift +=d.a3; + l |= (long)d.v4<<shift; + shift +=d.a4; + l |= (long)d.v5<<shift; + shift 
+=d.a5; + l |= (long)d.v6<<shift; + d.shift =shift+d.a6; + + return l; + } + + + final public long calc8(Data4 d) { + + if (d.v0<0||d.v1<0||d.v2<0||d.v3<0||d.v4<0||d.v5<0||d.v6<0||d.v7<0) return -1; + + long l = d.v0; + int shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + l |= (long)d.v2<<shift; + shift +=d.a2; + l |= (long)d.v3<<shift; + shift +=d.a3; + l |= (long)d.v4<<shift; + shift +=d.a4; + l |= (long)d.v5<<shift; + shift +=d.a5; + l |= (long)d.v6<<shift; + shift +=d.a6; + l |= (long)d.v7<<shift; + d.shift =shift+d.a7; + + return l; + } + + + + + + + + /** + * Maps a long to a integer value. This is very useful to save memory for sparse data long values + * @param node + * @return the integer + */ + static public int misses = 0; + static public int good = 0; + + + + + /** + * Write the data + * @param dos + * @throws IOException + */ + static public void writeData(DataOutputStream dos) throws IOException { + + dos.writeInt(getFeatureSet().size()); + for(Entry<String, HashMap<String,Integer>> e : getFeatureSet().entrySet()) { + dos.writeUTF(e.getKey()); + dos.writeInt(e.getValue().size()); + + for(Entry<String,Integer> e2 : e.getValue().entrySet()) { + + if(e2.getKey()==null) DB.println("key "+e2.getKey()+" value "+e2.getValue()+" e -key "+e.getKey()); + dos.writeUTF(e2.getKey()); + dos.writeInt(e2.getValue()); + } + } + } + public void read(DataInputStream din) throws IOException { + + int size = din.readInt(); + for(int i=0; i<size;i++) { + String k = din.readUTF(); + int size2 = din.readInt(); + + HashMap<String,Integer> h = new HashMap<String,Integer>(); + getFeatureSet().put(k,h); + for(int j = 0;j<size2;j++) { + h.put(din.readUTF(), din.readInt()); + } + getFeatureCounter().put(k, size2); + } + + calculateBits(); + } + + + /** + * Clear the data + */ + static public void clearData() { + getFeatureSet().clear(); + m_featureBits.clear(); + getFeatureSet().clear(); + } + + public HashMap<String,Integer> getFeatureCounter() { + return 
m_featureCounters; + } + + static public HashMap<String,HashMap<String,Integer>> getFeatureSet() { + return m_featureSets; + } + + static public String[] reverse(HashMap<String,Integer> v){ + String[] set = new String[v.size()]; + for(Entry<String,Integer> e : v.entrySet()) { + set[e.getValue()]=e.getKey(); + } + return set; + } + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/mtag/Options.java b/dependencyParser/basic/mate-tools/src/is2/mtag/Options.java new file mode 100755 index 0000000..6b9d806 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/mtag/Options.java @@ -0,0 +1,45 @@ +package is2.mtag; + +import is2.util.OptionsSuper; + +public final class Options extends OptionsSuper { + + + public Options (String[] args) { + + for(int i = 0; i < args.length; i++) { + + if (args[i].equals("--help")) explain(); + + if (args[i].equals("-nonormalize")) { + normalize=false; + } else if (args[i].equals("-features")) { + features= args[i+1]; i++; + } else if (args[i].equals("-hsize")) { + hsize= Integer.parseInt(args[i+1]); i++; + } else if (args[i].equals("-len")) { + maxLen= Integer.parseInt(args[i+1]); i++; + } else super.addOption(args, i); + } + } + + private void explain() { + System.out.println("Usage: "); + System.out.println("java -cp anna.jar is2.mtag.Tagger [Options]"); + System.out.println(); + System.out.println("Example: "); + System.out.println(" java -cp mate.jar is2.mtag.Tagger -model eps3.model -train corpora/conll08st/train/train.closed -test corpora/conll08st/devel/devel.closed -out b3.test -eval corpora/conll08st/devel/devel.closed -count 2000 -i 6"); + System.out.println(""); + System.out.println("Options:"); + System.out.println(""); + System.out.println(" -train <file> the corpus a model is trained on; default "+this.trainfile); + System.out.println(" -test <file> the input corpus for testing; default "+this.testfile); + System.out.println(" -out <file> the output corpus (result) of a test run; default "+this.outfile); + 
System.out.println(" -model <file> the parsing model for traing the model is stored in the files"); + System.out.println(" and for parsing the model is load from this file; default "+this.modelName); + System.out.println(" -i <number> the number of training iterations; good numbers are 10 for smaller corpora and 6 for bigger; default "+this.numIters); + System.out.println(" -count <number> the n first sentences of the corpus are take for the training default "+this.count); + + System.exit(0); + } +} diff --git a/dependencyParser/basic/mate-tools/src/is2/mtag/Pipe.java b/dependencyParser/basic/mate-tools/src/is2/mtag/Pipe.java new file mode 100755 index 0000000..b25b953 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/mtag/Pipe.java @@ -0,0 +1,508 @@ +package is2.mtag; + + +import is2.data.Cluster; +import is2.data.F2SF; +import is2.data.Instances; +import is2.data.InstancesTagger; +import is2.data.Long2Int; +import is2.data.Long2IntInterface; +import is2.data.ParametersFloat; +import is2.data.PipeGen; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; +import is2.tools.IPipe; +import is2.util.OptionsSuper; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map.Entry; + + +final public class Pipe extends PipeGen implements IPipe { + + public static int _CEND; + + + private static final String STWRD = "STWRD",STPOS = "STPOS",END = "END",STR = "STR"; + + public String[] types; + + Cluster cl; + + final public MFO mf =new MFO(); + public Long2IntInterface li; + + + + final MFO.Data4 d1 = new MFO.Data4(),d2 = new MFO.Data4(),d3 = new MFO.Data4(),dw = new MFO.Data4(); + final MFO.Data4 dwp = new MFO.Data4(),dp = new MFO.Data4(); + + + private OptionsSuper options; + private int _ewrd; + static private int _mid, _strp,_endp; + + public Pipe (Options options, Long2Int long2Int) throws IOException { + this.options = options; + + li 
=long2Int; + } + + public Pipe (OptionsSuper options) { + this.options = options; + } + + + public HashMap<Integer,Integer> form2morph = new HashMap<Integer, Integer> (); + + + public Instances createInstances(String file) { + + CONLLReader09 depReader = new CONLLReader09(CONLLReader09.NO_NORMALIZE); + + depReader.startReading(file); + mf.register(POS,"<root-POS>"); + + mf.register(FEAT, CONLLReader09.NO_TYPE); + mf.register(FEAT, ""); + + InstancesTagger is = new InstancesTagger(); + + System.out.println("Registering feature parts "); + + HashMap<String,HashSet<String>> op2form = new HashMap<String, HashSet<String>> (); + HashMap<String,Integer> freq = new HashMap<String, Integer> (); + + + int ic=0; + while(true) { + SentenceData09 instance1 = depReader.getNext(); + if (instance1== null) break; + ic++; + + + String[] w = instance1.forms; + for(int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]); + for(int i1 = 0; i1 < w.length; i1++) registerChars(CHAR, w[i1]); + for(int i1 = 0; i1 < w.length; i1++) { + mf.register(WORD, w[i1].toLowerCase()); + Integer f = freq.get(w[i1].toLowerCase()); + if (f==null) freq.put(w[i1].toLowerCase(), 1); + else freq.put(w[i1].toLowerCase(), f+1); + + HashSet<String> forms = op2form.get(w[i1].toLowerCase()); + if (forms==null) { + forms = new HashSet<String>(); + op2form.put(w[i1].toLowerCase(), forms); + } + forms.add(instance1.ofeats[i1]==null?"_":instance1.ofeats[i1]); + } + for(int i1 = 0; i1 < w.length; i1++) registerChars(CHAR, w[i1].toLowerCase()); + + w = instance1.plemmas; + for(int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]); + for(int i1 = 0; i1 < w.length; i1++) registerChars(CHAR, w[i1]); + + w = instance1.ppos; + for(int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); + + w = instance1.gpos; + for(int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); + + w = instance1.ofeats; + for(int i1 = 0; i1 < w.length; i1++) if (w[i1]!=null) mf.register(FEAT, w[i1]); + + // w = instance1.pfeats; + 
//for(int i1 = 0; i1 < w.length; i1++) if (w[i1]!=null) mf.register(FEAT, w[i1]); + } + + + for(Entry<String,HashSet<String>> e : op2form.entrySet()) { + if (e.getValue().size()==1 &&freq.get(e.getKey())>10) { + // System.out.println("found map "+e.getKey()+" "+e.getValue()+" "+freq.get(e.getKey())); + form2morph.put(mf.getValue(Pipe.WORD, e.getKey()), mf.getValue(FEAT, (String)e.getValue().toArray()[0])); + } + } + + initFeatures(); + + mf.calculateBits(); + initValues(); + + System.out.println(""+mf.toString()); + + depReader.startReading(file); + + int num1 = 0; + long start1 = System.currentTimeMillis(); + + System.out.print("Creating Features: "); + is.init(ic, mf) ; + int del=0; + + while(true) { + if (num1 % 100 ==0) {del = outValue(num1, del);} + SentenceData09 instance1 = depReader.getNext(is); + if (instance1== null) break; + + if (num1>options.count) break; + + num1++; + } + long end1 = System.currentTimeMillis(); + System.gc(); + long mem2 = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory(); + System.out.print(" time "+(end1-start1)+" mem "+(mem2/1024)+" kb"); + + types = new String[mf.getFeatureCounter().get(FEAT)]; + + for(Entry<String,Integer> e : mf.getFeatureSet().get(FEAT).entrySet()) { + types[e.getValue()] = e.getKey(); + } + + + if (options.clusterFile==null)cl = new Cluster(); + else cl= new Cluster(options.clusterFile, mf,6); + + + System.out.println("Num Features: " + types.length); + + + + depReader.startReading(file); + + + + int num11=0; + + while(true) { + + SentenceData09 instance = depReader.getNext(); + if (instance==null) break; + + is.fillChars(instance, num11, _CEND); + + + if (num11>options.count) break; + + num11++; + } + + return is;//.toNativeArray(); + + } + + private void registerChars(String type, String word) { + for(int i=0;i<word.length();i++) mf.register(type, Character.toString(word.charAt(i))); + } + + + + public void initValues() { + s_feat = mf.getFeatureBits(FEAT); + s_word = 
	/**
	 * Registers the fixed feature vocabulary with the feature mapping {@code mf}:
	 * the feature-template identifiers "F0".."F61" (used as the first component
	 * of every feature built in addCF), the sentence-boundary markers for POS,
	 * WORD and CHAR, and two optional start-of-sentence features. The resulting
	 * indices are stored in the static fields read by addCF.
	 */
	public void initFeatures() {

		// feature template identifiers F0..F61
		for(int t=0;t<62;t++) {
			mf.register(TYPE,"F"+t);
		}


//		_mid = mf.register(POS, MID);
		// POS markers for positions before the start / after the end of the sentence
		_strp = mf.register(POS, STR);
		_endp= mf.register(POS, END);

		// WORD markers for sentence boundaries
		mf.register(WORD, STR);
		_ewrd = mf.register(WORD, END);


		// CHAR marker used to pad character features at the sentence end
		_CEND = mf.register(CHAR, END);




		// optional features
		mf.register(WORD,STWRD);
		mf.register(POS,STPOS);


	}
another uppercase in the word + else if (upper==0) upper=2; + } + } + + if (Character.isDigit(c) && k1==0) number =2 ; + else if (Character.isDigit(c) && number==1) number = 3; + } + + int form = forms[i]; + + int len = forms.length; + long l; + d1.v0 = f++; d1.v2=form; l=mf.calc3(d1); vs[n++]=mf.calc3(d1); + + d1.v0 = f++; d1.v2=is.formlc[ic][i]; vs[n++]=mf.calc3(d1); + + d3.v2=c0; d3.v3=c1; d3.v4=c2; d3.v5=c3; d3.v6=c4; + d3.v0=f++; vs[n++]=mf.calc3(d3); + d3.v0=f++; vs[n++]=mf.calc4(d3); + d3.v0=f++; vs[n++]=mf.calc5(d3); + d3.v0=f++; vs[n++]=mf.calc6(d3); + d3.v0=f++; vs[n++]=mf.calc7(d3); + + if (form!=-1) { + d3.v2=c2; d3.v3=c3; d3.v4=c4; d3.v5=c5; d3.v6=cl.getLP(form); + d3.v0=f; vs[n++]=mf.calc6(d3); d3.v0=f+1; vs[n++]=mf.calc7(d3); + } + f+=2; + + if (form>0) { + d3.v0=f; d3.v5=cl.getLP(form); vs[n++]=mf.calc6(d3); + d3.v0=f+1; d3.v4=cl.getLP(form); vs[n++]=mf.calc5(d3); + d3.v0=f+2; d3.v3=cl.getLP(form); vs[n++]=mf.calc4(d3); + } + f+=3; + + d3.v2=e0; d3.v3=e1; d3.v4=e2; d3.v5=e3; d3.v6=e4; + d3.v0 =f++; vs[n++]=mf.calc3(d3); + d3.v0 =f++; vs[n++]=l=mf.calc4(d3); vs[n++]=d3.calcs(3, upper, l); + d3.v0 =f++; vs[n++]=l=mf.calc5(d3); vs[n++]=d3.calcs(3, upper, l); + d3.v0 =f++; vs[n++]=l=mf.calc6(d3); vs[n++]=d3.calcs(3, upper, l); + d3.v0 =f++; vs[n++]=l=mf.calc7(d3); vs[n++]=d3.calcs(3, upper, l); + + if (form>0) { + d3.v0=f; d3.v5=cl.getLP(form); vs[n++]=mf.calc6(d3); + d3.v0=f+1; d3.v4=cl.getLP(form); vs[n++]=mf.calc5(d3); + d3.v0=f+2; d3.v3=cl.getLP(form); vs[n++]=mf.calc4(d3); + } + f+=3; + + + dw.v0=f++; dw.v2=i+1<len?forms[i+1]:_ewrd;dw.v3= forms[i];vs[n++]=mf.calc4(dw); + + if (len>i+1) { + + dw.v0=f; dw.v2= forms[i+1]; vs[n++]=mf.calc3(dw); + d3.v0=f+1; d3.v2 =is.chars[ic][i+1][0];vs[n++]=mf.calc3(d3); + d3.v0=f+2; d3.v2 =is.chars[ic][i+1][6];vs[n++]=mf.calc3(d3); + + d3.v2=e0; d3.v3=e1; + + d3.v0 =f+3; d3.v4 =is.chars[ic][i+1][0];vs[n++]=mf.calc5(d3); + d3.v0 =f+4; d3.v4 =is.chars[ic][i+1][6];vs[n++]=mf.calc5(d3); + + + + if 
(is.chars[ic][i+1][11]>1 ) { // instance.forms[i+1].length() + + d3.v0=f+5; d3.v2=is.chars[ic][i+1][0]; d3.v3=is.chars[ic][i+1][1]; vs[n++]=mf.calc4(d3); + d3.v0=f+6; d3.v2=is.chars[ic][i+1][6]; d3.v3=is.chars[ic][i+1][7]; vs[n++]=mf.calc4(d3); + + d3.v2=e0; d3.v3=e1; + + d3.v0=f+7; d3.v4 = is.chars[ic][i+1][0]; d3.v5 =is.chars[ic][i+1][1]; vs[n++]=mf.calc6(d3); + d3.v0=f+8; d3.v4 = is.chars[ic][i+1][6]; d3.v5=is.chars[ic][i+1][7]; vs[n++]=mf.calc6(d3); + + if (forms[i+1]>0) { + d3.v0=f+9; d3.v2=is.chars[ic][i+1][0]; d3.v3=is.chars[ic][i+1][1]; d3.v4 =cl.getLP(forms[i+1]); vs[n++]=mf.calc5(d3); + d3.v0=f+10; d3.v2=is.chars[ic][i+1][6]; d3.v3=is.chars[ic][i+1][7]; d3.v4 =cl.getLP(forms[i+1]); vs[n++]=mf.calc5(d3); + } + } + + if (forms[i+1]>0) { + dw.v0=f+11; dw.v2= cl.getLP(forms[i+1]); dw.v3= forms[i];vs[n++]=mf.calc4(dw); + } + + if (len>i+2) { + dw.v0=f+12; dw.v2= forms[i+2]; dw.v3 = forms[i+1];vs[n++]=mf.calc4(dw);vs[n++]=mf.calc3(dw); +// d2.v0=f+13; d2.v2=pfeat[i+1]; d2.v3= pfeat[i+2]; vs[n++]=mf.calc4(d2); + // dp.v0= f+14; dp.v2=ppos[i+1]; dp.v3=ppos[i+2]; vs[n++]=mf.calc4(dp); + + } + + if (len>i+3) { + dw.v0=f+14; dw.v2= forms[i+3]; dw.v3 = forms[i+2]; vs[n++]=mf.calc4(dw); vs[n++]=mf.calc3(dw); + + } + } + f+=16; + + // length + d2.v0=f++; d2.v2=is.chars[ic][i][11];vs[n++]=mf.calc3(d2); + + + // contains a number + d2.v0=f++; d2.v2=number; vs[n++]=mf.calc3(d2); + d1.v0=f++; d1.v2=lemmas[i]; vs[n++]=mf.calc3(d1); + + if (i!=0 &&len>i+1) { + dw.v0=f; dw.v2=lemmas[i-1];dw.v3=lemmas[i+1];vs[n++]=mf.calc4(dw); + d2.v0=f+1; d2.v2=pfeat[i-1]; d2.v3=pfeat[i+1];vs[n++]=mf.calc4(d2); + } + f+=2; + + d2.v0= f++; d2.v2=i>=1? pfeat[i-1]:_strp; vs[n++]=mf.calc3(d2); + dp.v0= f++; dp.v2=ppos[i]; vs[n++]=mf.calc3(dp); + + if (i>0) { + dw.v0 = f++; dw.v2 =i>=1? forms[i-1]:_strp; vs[n++]=mf.calc3(dw); + dw.v0 = f++; dw.v2 = i>=1? 
lemmas[i-1]:_strp; vs[n++]=mf.calc3(dw); + + if (len>i+1) { +// d2.v0=f; d2.v2= pfeat[i-1];d2.v3= pfeat[i+1]; vs[n++]=mf.calc4(d2); + // dp.v0= f+1; dp.v2=ppos[i-1]; dp.v3=ppos[i+1]; vs[n++]=mf.calc4(dp); + + } + f++; + dp.v0= f++; dp.v2=ppos[i]; dp.v3=ppos[i-1]; vs[n++]=mf.calc4(dp); + + if (i>1) { + d2.v0=f++; d2.v2=i<2?_strp: pfeat[i-2]; vs[n++]=mf.calc3(d2); + d2.v0=f++; d2.v2= pfeat[i-1]; d2.v3= pfeat[i-2]; vs[n++]=mf.calc4(d2); + + dw.v0=f++; dw.v2= forms[i-2]; vs[n++]=mf.calc3(dw); + dwp.v0=f++; dwp.v2 = forms[i-1]; dwp.v3 = pfeat[i-2];vs[n++]=mf.calc4(dwp); + dwp.v0=f++; dwp.v2 = forms[i-2]; dwp.v3 = pfeat[i-1];vs[n++]=mf.calc4(dwp); + + if (i>2) { + d2.v0=f++; d2.v2=pfeat[i-3]; vs[n++]=mf.calc3(d2); + d2.v0=f++; d2.v2=pfeat[i-2]; d2.v3= pfeat[i-3]; vs[n++]=mf.calc4(d2); + dw.v0=f++; dw.v2 = forms[i-3]; dw.v3 = forms[i-2]; vs[n++]=mf.calc4(dw); + // dp.v0= f++; dp.v2=ppos[i-3]; dp.v3=ppos[i-2]; vs[n++]=mf.calc4(dp); + } + } + } + vs[n] = Integer.MIN_VALUE; + } + + + + + + + + public int fillFeatureVectorsOne(ParametersFloat params, int w1, String form, Instances is, int n, int[] features, long[] vs) { + double best = -1; + int bestType=-1; + + F2SF f = new F2SF(params.parameters); + //is.gfeats[n] + addCF((InstancesTagger)is, n, form, w1, features,is.pposs[n], is.forms[n], is.plemmas[n], vs); + + for(int t = 0; t < types.length; t++) { + + f.clear(); + int p = t<<Pipe.s_type; + for(int k=vs.length-1;k>=0;k--) if (vs[k]>=0) f.add(li.l2i(vs[k]+p)); + if (f.score >best) { + bestType=t; + best =f.score; + } + + } + return bestType; + + } + + + + //static ArrayList<T> todo = new ArrayList<T>(); + static SentenceData09 instance; + + + public static int _FC =200; + + + /** + * Write the lemma that are not mapped by operations + * @param dos + */ + public void writeMap(DataOutputStream dos) { + + try { + dos.writeInt(this.form2morph.size()); + for(Entry<Integer, Integer> e : form2morph.entrySet()) { + dos.writeInt(e.getKey()); + dos.writeInt(e.getValue()); + } + } 
catch (IOException e1) { + e1.printStackTrace(); + } + } + + + + /** + * Read the form-lemma mapping not read by operations + * @param dis + */ + public void readMap(DataInputStream dis) { + try { + int size = dis.readInt(); + for(int i =0; i<size;i++) { + form2morph.put(dis.readInt(), dis.readInt()); + } + } catch (IOException e1) { + e1.printStackTrace(); + } + } + + + /* (non-Javadoc) + * @see is2.tools.IPipe#write(java.io.DataOutputStream) + */ + @Override + public void write(DataOutputStream dos) { + try { + cl.write(dos); + writeMap(dos); + } catch (IOException e) { + e.printStackTrace(); + } + } + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/mtag/Tagger.java b/dependencyParser/basic/mate-tools/src/is2/mtag/Tagger.java new file mode 100644 index 0000000..da31a5b --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/mtag/Tagger.java @@ -0,0 +1,371 @@ +package is2.mtag; + + +import is2.data.Cluster; +import is2.data.FV; +import is2.data.Instances; +import is2.data.InstancesTagger; +import is2.data.Long2Int; +import is2.data.ParametersFloat; +import is2.data.PipeGen; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; +import is2.io.CONLLWriter09; +import is2.tools.IPipe; +import is2.tools.Train; +import is2.tools.Tool; +import is2.util.DB; +import is2.util.OptionsSuper; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.Map.Entry; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; +import java.util.zip.ZipOutputStream; + + +public class Tagger implements Tool, Train { + + ExtractorM pipe; + ParametersFloat params; + + + /** + * Initialize + * @param options + */ + public Tagger (Options options) { + + // load the model + try { + readModel(options); + } catch 
(Exception e) { + e.printStackTrace(); + } + + } + + /** + * @param string + * @throws IOException + */ + public Tagger(String modelFileName) { + this(new Options(new String[] {"-model",modelFileName})); + } + + public Tagger() { } + + public static void main (String[] args) throws FileNotFoundException, Exception + { + + Options options = new Options(args); + + Tagger tagger = new Tagger(); + + if (options.train) { + + Long2Int li = new Long2Int(options.hsize); + tagger.pipe = new ExtractorM (options,li); + InstancesTagger is = (InstancesTagger)tagger.pipe.createInstances(options.trainfile); + ParametersFloat params = new ParametersFloat(li.size()); + + tagger.train(options, tagger.pipe,params,is); + tagger.writeModel(options, tagger.pipe, params); + } + + if (options.test) { + + tagger.readModel(options); + tagger.out(options,tagger.pipe, tagger.params); + } + + if (options.eval) { + + System.out.println("\nEvaluate:"); + Evaluator.evaluate(options.goldfile, options.outfile,options.format); + } + } + + /* (non-Javadoc) + * @see is2.mtag2.Learn#writeModel(is2.mtag2.Options, is2.mtag2.Pipe, is2.data.ParametersFloat) + */ + public void writeModel(OptionsSuper options, IPipe pipe,ParametersFloat params) { + + try { + ZipOutputStream zos = new ZipOutputStream(new BufferedOutputStream(new FileOutputStream(options.modelName))); + zos.putNextEntry(new ZipEntry("data")); + DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(zos)); + + MFO.writeData(dos); + + MFO.clearData(); + + DB.println("number of parameters "+params.parameters.length); + dos.flush(); + params.write(dos); + pipe.write(dos); + dos.flush(); + dos.close(); + } catch (Exception e){ + e.printStackTrace(); + } + } + + /* (non-Javadoc) + * @see is2.mtag2.Learn#readModel(is2.mtag2.Options) + */ + public void readModel(OptionsSuper options) { + + try { + pipe = new ExtractorM(options); + params = new ParametersFloat(0); + + // load the model + ZipInputStream zis = new ZipInputStream(new 
	/**
	 * Trains the tagger with an online update over the training instances.
	 *
	 * Words that have a deterministic entry in form2morph are skipped (they are
	 * always counted as correct); for every other word the best type is
	 * predicted and, when it differs from the gold type, the weights are
	 * updated towards the gold feature vector.
	 *
	 * NOTE(review): {@code upd} counts down from numIters*numInstances+1 and is
	 * passed to params.update - presumably the weighting used for parameter
	 * averaging; confirm against ParametersFloat.update.
	 *
	 * @param options provides the number of iterations
	 * @param pipe the feature pipe (unused here; this.pipe is used directly)
	 * @param params the weight vector to train, averaged at the end
	 * @param is the training instances created by the pipe
	 */
	public void train(OptionsSuper options, IPipe pipe, ParametersFloat params, Instances is) {

		int i = 0;
		int del=0;

		// index -> word-form string, to hand the raw form to the extractor
		String[] wds = this.pipe.mf.reverse(this.pipe.mf.getFeatureSet().get(ExtractorM.WORD));
		int numInstances = is.size();

		float upd = (options.numIters*numInstances + 1);


		for(i = 0; i < options.numIters; i++) {

			long start = System.currentTimeMillis();


			long last= System.currentTimeMillis();

			FV pred = new FV(), gold = new FV();
			int correct =0,count=0;

			for(int n = 0; n < numInstances; n++) {

				upd--;

				// progress output every 500 sentences
				if((n+1) % 500 == 0) del= PipeGen.outValueErr(n+1, (count-correct),(float)correct/(float)count,del,last,upd);

				int length = is.length(n);

				// NOTE(review): feats is written below but never read afterwards
				int feats[] = new int[length];
				// feature scratch buffer, reused for all words of the sentence
				long[] vs = new long[ExtractorM._FC];


				for(int w1 = 0; w1 < length; w1++) {


					count++;

					// deterministic form -> morphology mapping: always correct, no update
					if (this.pipe.form2morph.get(is.forms[n][w1])!=null){
						correct++;
						continue;
					}

					int bestType = this.pipe.fillFeatureVectorsOne(params, w1, wds[is.forms[n][w1]],is, n, is.gfeats[n],vs);
					feats[w1]=bestType;


					if (bestType == is.gfeats[n][w1] ) {
						correct++;
						continue;
					}

					// wrong prediction: build predicted and gold feature vectors
					// (vs is terminated by an Integer.MIN_VALUE sentinel)
					pred.clear();
					int p = bestType << ExtractorM.s_type;
					// System.out.println("test type "+bestType+" ex type "+ExtractorM.s_type);
					for(int k=0;k<vs.length;k++) {
						if (vs[k]==Integer.MIN_VALUE) break;
						if (vs[k]>=0) pred.add(this.pipe.li.l2i(vs[k]+p));
					}

					gold.clear();
					p = is.gfeats[n][w1] << ExtractorM.s_type;
					for(int k=0;k<vs.length;k++) {
						if (vs[k]==Integer.MIN_VALUE) break;
						if (vs[k]>=0) gold.add(this.pipe.li.l2i(vs[k]+p));
					}
					params.update(pred,gold, (float)upd, 1.0f);
				}

			}

			long end = System.currentTimeMillis();
			String info = "time "+(end-start);
			del= PipeGen.outValueErr(numInstances, (count-correct),(float)correct/(float)count,del,last,0,info);

			System.out.println();
		}

		// average the parameters over all updates
		params.average(i*is.size());

	}
	/**
	 * Predicts the morphologic features (pfeats) of one sentence.
	 *
	 * Words with a deterministic entry in form2morph are looked up directly;
	 * all other words are classified with the trained weights.
	 *
	 * The prediction loop runs twice: fillFeatureVectorsOne receives the feats
	 * array as context features of neighbouring words, so the second pass
	 * re-predicts every word with the first pass' predictions already filled
	 * in. NOTE(review): the two loops are textually identical - presumably an
	 * intentional two-pass scheme, not an accidental duplication; confirm.
	 *
	 * @param instance the sentence to tag; its pfeats array is overwritten
	 * @param pipe the feature extractor
	 * @param params the trained weight vector
	 * @param is single-sentence instance storage (sentence index 0)
	 * @return the same instance with pfeats filled in
	 */
	private SentenceData09 exec(SentenceData09 instance, ExtractorM pipe, ParametersFloat params, InstancesTagger is) {

		int length = instance.ppos.length;

		short[] feats = new short[instance.gpos.length];

		// feature scratch buffer, reused for all words (sentinel-terminated)
		long vs[] = new long[ExtractorM._FC];

		String[] forms = instance.forms;

		instance.pfeats = new String[instance.gpos.length];


		// first pass: predict with only already-processed words as context
		for(int j = 0; j < length; j++) {
			if (pipe.form2morph.get(is.forms[0][j])!=null) {
				feats[j] = (short)pipe.form2morph.get(is.forms[0][j]).intValue();
				instance.pfeats[j] = this.pipe.types[feats[j]];
			} else {

				int bestType = pipe.fillFeatureVectorsOne(params,j, forms[j], is, 0,feats,vs);
				feats[j] = (short)bestType;
				instance.pfeats[j]= this.pipe.types[bestType];
			}
		}
		// second pass: re-predict with the first pass' feats as context
		for(int j = 0; j < length; j++) {
			if (pipe.form2morph.get(is.forms[0][j])!=null) {
				feats[j] =(short)pipe.form2morph.get(is.forms[0][j]).intValue();
				instance.pfeats[j] = this.pipe.types[feats[j]];
			} else {

				int bestType = pipe.fillFeatureVectorsOne(params,j, forms[j], is, 0,feats,vs);
				feats[j] = (short)bestType;
				instance.pfeats[j]= this.pipe.types[bestType];
			}
		}
		return instance;
	}
/**
 * A closed span in the parser's dynamic-programming chart.
 *
 * An item stores only its score, the edge direction and back-pointers to the
 * sub-items it was built from; the span boundaries are implied by the chart
 * cell it is stored in (see Decoder's C[start][end][dir][split]).
 */
final public class Closed {

	public float p;   // score of the best derivation of this span
//	short b,e,m;
	byte dir;         // edge direction of the span (0 or 1)

	Closed d;         // closed sub-item this item was built from (may be null)
	Open u;           // open sub-item this item was built from (may be null)

	/**
	 * @param s span start - unused, the boundaries live in the chart indices
	 * @param t span end - unused
	 * @param m split point - unused
	 * @param dir edge direction (stored as a byte)
	 * @param u open sub-item (back-pointer)
	 * @param d closed sub-item (back-pointer)
	 * @param score score of this derivation
	 */
	public Closed(short s, short t, int m, int dir,Open u, Closed d, float score) {
	//	this.b = s;
	//	this.e = t;
	//	this.m = (short)m;
		this.dir = (byte)dir;
		this.u=u;
		this.d =d;
		p=score;
	}


	/**
	 * Recursively follows the back-pointers to reconstruct the tree.
	 * NOTE(review): the actual edges are presumably recorded by Open.create -
	 * this item only delegates to its sub-items; confirm in Open.
	 */
	public void create(Parse parse) {
		if (u != null) u.create(parse);
		if (d != null) d.create(parse);
	}
}
Bernd Bohnet, 30.10.2010 + * + * + */ +final public class D5 extends DX { + + + + + public long shift; + private long h; + + + + + /* (non-Javadoc) + * @see is2.parser52L.DX#cz2() + */ + final public void cz2() { + + if (v0<0||v1<0) { + shift=0; + h=-1; + return ; + } + + h = v0 | v1<<(shift=a0); + shift +=a1; + + } + + + + /* (non-Javadoc) + * @see is2.parser52L.DX#cz3() + */ + final public void cz3() { + + if (v0<0||v1<0||v2<0) { + shift=0; + h=-1; + return ; + + } + + h = v0 | v1<<(shift=a0) | v2<<(shift +=a1); + shift= shift + a2; + + } + + + /* (non-Javadoc) + * @see is2.parser52L.DX#cz4() + */ + final public void cz4() { + if (v0<0||v1<0||v2<0||v3<0) { + shift=0; + h=-1; + return ; + } + + h = v0 | v1<<(shift=a0) | v2<<(shift +=a1) | v3<<(shift +=a2); + shift= shift +a3; + + } + + + + /* (non-Javadoc) + * @see is2.parser52L.DX#cz5() + */ + final public void cz5() { + + if (v0<0||v1<0||v2<0||v3<0||v4<0) { + shift=0; + h=-1; + return ; + } + + h = v0 | v1<<(shift=a0) | v2<<(shift +=a1) | v3<<(shift +=a2) | v4<<(shift +=a3); + shift =shift+a4; + + + } + + + /* (non-Javadoc) + * @see is2.parser52L.DX#cz6() + */ + final public void cz6() { + + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0) { + shift=0; + h=-1; + return ; + } + + h = v0 | v1<<(shift=a0) | v2<<(shift +=a1) | v3<<(shift +=a2) | v4<<(shift +=a3) | v5<<(shift +=a4); + shift =shift+a5; + + } + + /* (non-Javadoc) + * @see is2.parser52L.DX#cz7() + */ + final public void cz7() { + + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0||v6<0) { + shift=0; + h=-1; + return ; + } + + h = v0 | v1<<(shift=a0) | v2<<(shift +=a1) | v3<<(shift +=a2) | v4<<(shift +=a3) | v5<<(shift +=a4) | v6<<(shift +=a5); + shift =shift+a6; + + } + + + /* (non-Javadoc) + * @see is2.parser52L.DX#cz8() + */ + final public void cz8() { + + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0||v6<0||v7<0) { + h=-1; + shift=0; + return ; + } + + h = v0 | v1<<(shift=a0) | v2<<(shift +=a1) | v3<<(shift +=a2) | v4<<(shift +=a3) | v5<<(shift +=a4) | v6<<(shift +=a5) | 
	/**
	 * Appends a further component to the feature hash: ORs v into h at the
	 * current bit offset and advances shift by the component width b.
	 * Does nothing (returns -1) when the hash was already invalidated.
	 *
	 * NOTE(review): Java masks a long shift count to its low 6 bits, so once
	 * shift reaches 64 the value silently wraps into the low bits before this
	 * guard (shift > 64) fires; the check only reports the overflow, it does
	 * not prevent the corrupted hash from being returned - confirm whether the
	 * feature widths guarantee shift stays below 64.
	 *
	 * @param b the bit width of the component
	 * @param v the component value
	 * @return the updated hash, or -1 if the feature is invalid
	 */
	final public long cs(int b, int v) {
		if (h<0) {
			h=-1; shift=0;
			return -1;
		}

		h |= (long)v<<shift;
		shift +=b;
		if (shift>64) {
			System.out.println("shift too large "+shift);
			new Exception().printStackTrace();
		}

		return h;

	}
	/**
	 * Builds the best-scoring dependency tree for one sentence from the edge
	 * scores, filling a span-based chart (spans of increasing width k) with
	 * the worker threads, then reading the best tree off the chart.
	 *
	 * @param pos part-of-speech tags of the sentence (index 0 is the root)
	 * @param x the precomputed edge/label scores
	 * @param projective if false, the tree is post-processed by rearrange()
	 *        to introduce non-projective edges
	 * @param training forwarded to rearrange()
	 * @return the best parse tree found
	 * @throws InterruptedException if a decoder thread is interrupted
	 */
	public static Parse decode(short[] pos, DataFES x, boolean projective, boolean training) throws InterruptedException {

		long ts = System.nanoTime();

		// the pool may have been shut down by a previous caller
		if (executerService.isShutdown()) executerService = java.util.concurrent.Executors.newCachedThreadPool();
		final int n = pos.length;

		// chart of open and closed spans: [start][end][direction][split]
		final Open O[][][][] = new Open[n][n][2][];
		final Closed C[][][][] = new Closed[n][n][2][];

		ArrayList<ParallelDecoder> pe = new ArrayList<ParallelDecoder>();

		for(int i=0;i<Parser.THREADS ;i++) pe.add(new ParallelDecoder(pos, x, O, C, n));

		// fill the chart span-width by span-width; all spans of width k are
		// independent and processed in parallel
		for (short k = 1; k < n; k++) {

			// provide the threads the data
			for (short s = 0; s < n; s++) {
				short t = (short) (s + k);
				if (t >= n) break;

				ParallelDecoder.add(s,t);
			}

			executerService.invokeAll(pe);
		}

		// pick the best closed span covering the whole sentence (rooted at 0)
		float bestSpanScore = (-1.0F / 0.0F);
		Closed bestSpan = null;
		for (int m = 1; m < n; m++)
			if (C[0][n - 1][1][m].p > bestSpanScore) {
				bestSpanScore = C[0][n - 1][1][m].p;
				bestSpan = C[0][n - 1][1][m];
			}

		// build the dependency tree from the chart
		Parse out= new Parse(pos.length);

		bestSpan.create(out);

		// the artificial root has no head and no label
		out.heads[0]=-1;
		out.labels[0]=0;

		timeDecotder += (System.nanoTime()-ts);

		ts = System.nanoTime();

		// optionally rearrange edges to allow non-projective structures
		if (!projective) rearrange(pos, out.heads, out.labels,x,training);

		timeRearrange += (System.nanoTime()-ts);

		return out;
	}
	/**
	 * Registers an edge label observed between two part-of-speech tags and
	 * updates the label statistics.
	 *
	 * labelCount counts the global frequency of each label (used by
	 * findDefault()); slabelCount counts per POS-pair frequencies, keyed by
	 * the string pos1+"-"+pos2+label.
	 *
	 * NOTE(review): the key has no separator between pos2 and the label, so
	 * e.g. (1, 2, 13) and (1, 21, 3) both map to "1-213". The comparator
	 * Edges.C builds lookup keys with the same format, so a fix must change
	 * both places consistently.
	 *
	 * NOTE(review): when the label is already present for (pos1,pos2) the
	 * method returns early - but both counters above have already been
	 * incremented, so duplicate observations still affect the statistics;
	 * confirm this is intended.
	 */
	final static public void putD(int pos1, int pos2, short label) {

		// global label frequency
		Integer lc = labelCount.get(label);
		if (lc==null) labelCount.put(label, 1);
		else labelCount.put(label, lc+1);

		// per POS-pair label frequency
		String key = pos1+"-"+pos2+label;
		Integer lcs = slabelCount.get(key);
		if (lcs==null) slabelCount.put(key, 1);
		else slabelCount.put(key, lcs+1);

		if (edges[pos1][pos2]==null) {
			edges[pos1][pos2]=new short[1];
			edges[pos1][pos2][0]=label;

//			edgesh[pos1][pos2][dir?0:1] = new TIntHashSet(2);
//			edgesh[pos1][pos2][dir?0:1].add(label);
		} else {
			short labels[] = edges[pos1][pos2];
			for(short l : labels) {
				//contains label already?
				if(l==label) return;
			}

			// grow the label array by one and append the new label
			short[] nlabels = new short[labels.length+1];
			System.arraycopy(labels, 0, nlabels, 0, labels.length);
			nlabels[labels.length]=label;
			edges[pos1][pos2]=nlabels;

			// edgesh[pos1][pos2][dir?0:1].add(label);
		}
	}
java.util.Comparator#compare(java.lang.Object, java.lang.Object) + */ + @Override + public int compare(Short l1, Short l2) { + + // int c1 = labelCount.get(l1); + // int c2 = labelCount.get(l2); + // if (true) return c1==c2?0:c1>c2?-1:1; + + int x1 = slabelCount.get(_key+l1.shortValue()); + int x2 = slabelCount.get(_key+l2.shortValue()); + // System.out.println(x1+" "+x2); + + + return x1==x2?0:x1>x2?-1:1; + + + + } + + + + + + } + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/parser/Evaluator.java b/dependencyParser/basic/mate-tools/src/is2/parser/Evaluator.java new file mode 100755 index 0000000..c764cc6 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/parser/Evaluator.java @@ -0,0 +1,94 @@ +package is2.parser; + +import is2.data.SentenceData09; +import is2.io.CONLLReader09; + + +public class Evaluator { + + + + public static final String PUNCT ="!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; + + public static class Results { + + public int total; + public int corr; + public float las; + public float ula; + + } + + public static Results evaluate (String act_file, String pred_file) throws Exception { + + CONLLReader09 goldReader = new CONLLReader09(act_file, -1); + CONLLReader09 predictedReader = new CONLLReader09(pred_file, -1); + + int total = 0, corr = 0, corrL = 0; + int numsent = 0, corrsent = 0, corrsentL = 0; + SentenceData09 goldInstance = goldReader.getNext(); + SentenceData09 predInstance = predictedReader.getNext(); + + while(goldInstance != null) { + + int instanceLength = goldInstance.length(); + + if (instanceLength != predInstance.length()) + System.out.println("Lengths do not match on sentence "+numsent); + + int[] goldHeads = goldInstance.heads; + String[] goldLabels = goldInstance.labels; + int[] predHeads = predInstance.heads; + String[] predLabels = predInstance.labels; + + boolean whole = true; + boolean wholeL = true; + + // NOTE: the first item is the root info added during nextInstance(), so we skip it. 
+ + int punc=0; + for (int i = 1; i < instanceLength; i++) { + if (predHeads[i] == goldHeads[i]) { + corr++; + + if (goldLabels[i].equals(predLabels[i])) corrL++; + else { + // System.out.println(numsent+" error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + wholeL = false; + } + } + else { + // System.out.println(numsent+"error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + whole = false; wholeL = false; + } + } + total += ((instanceLength - 1) - punc); // Subtract one to not score fake root token + + if(whole) corrsent++; + if(wholeL) corrsentL++; + numsent++; + + goldInstance = goldReader.getNext(); + predInstance = predictedReader.getNext(); + } + + Results r = new Results(); + + r.total = total; + r.corr = corr; + r.las =(float)Math.round(((double)corrL/total)*100000)/1000; + r.ula =(float)Math.round(((double)corr /total)*100000)/1000; + System.out.print("Total: " + total+" \tCorrect: " + corr+" "); + System.out.println("LAS: " + (double)Math.round(((double)corrL/total)*100000)/1000+" \tTotal: " + (double)Math.round(((double)corrsentL/numsent)*100000)/1000+ + " \tULA: " + (double)Math.round(((double)corr /total)*100000)/1000+" \tTotal: " + (double)Math.round(((double)corrsent /numsent)*100000)/1000); + + return r; + } + + + public static float round (double v){ + + return Math.round(v*10000F)/10000F; + } + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/parser/Extractor.java b/dependencyParser/basic/mate-tools/src/is2/parser/Extractor.java new file mode 100755 index 0000000..35c90f2 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/parser/Extractor.java @@ -0,0 +1,973 @@ +package is2.parser; + + + +import java.util.concurrent.atomic.AtomicInteger; + +import is2.data.Cluster; +import is2.data.D4; +import is2.data.D6; +import is2.data.DX; + +import is2.data.DataFES; +import is2.data.F2SF; +import is2.data.FV; +import is2.data.IFV; +import is2.data.Instances; +import 
is2.data.Long2IntInterface; + + +import is2.util.DB; +import is2.util.OptionsSuper; + + + + +final public class Extractor { + + static final int _SIB = 85; + public static int s_rel,s_word,s_type,s_dir,s_dist,s_feat,s_child,s_spath,s_lpath,s_pos,s_rel1; + public final DX d0 ,dl1,dl2, dwr,dr,dwwp,dw,dwp,dlf,d3lp, d2lp,d2pw,d2pp ; + + public final Long2IntInterface li; + + public boolean s_stack=false; + + public Extractor(Long2IntInterface li, boolean stack, int what) { + + s_stack=stack; + + this.li=li; + + if (what == OptionsSuper.MULTIPLICATIVE) { + d0 = new D6(li);dl1 = new D6(li);dl2 = new D6(li);dwr = new D6(li);dr = new D6(li);dwwp = new D6(li); + dw = new D6(li);dwp = new D6(li);dlf = new D6(li);d3lp = new D6(li); d2lp = new D6(li); d2pw = new D6(li); d2pp = new D6(li); + } else { + d0 = new D5(li);dl1 = new D5(li);dl2 = new D5(li);dwr = new D5(li);dr = new D5(li);dwwp = new D5(li); + dw = new D5(li);dwp = new D5(li);dlf = new D5(li);d3lp = new D5(li); d2lp = new D5(li); d2pw = new D5(li); d2pp = new D5(li); + } + + } + + public static void initStat(int what ) { + MFO mf = new MFO(); + if (what == OptionsSuper.MULTIPLICATIVE) { + + DB.println("mult (d4) "); + + s_rel = mf.getFeatureCounter().get(REL).intValue()*16; + s_rel1 =mf.getFeatureCounter().get(REL).intValue()+1; + s_pos = mf.getFeatureCounter().get(POS).intValue(); + s_word = mf.getFeatureCounter().get(WORD).intValue(); + s_type = mf.getFeatureCounter().get(TYPE).intValue(); + s_dir = (int)(mf.getFeatureCounter().get(DIR)); + la = (mf.getValue(DIR, LA)); + ra = (mf.getValue(DIR, RA)); + s_dist = (int)(mf.getFeatureCounter().get(DIST));//mf.getFeatureBits(DIST); + s_feat = (int)(mf.getFeatureCounter().get(Pipe.FEAT));//mf.getFeatureBits(Pipe.FEAT); + s_spath = (mf.getFeatureCounter().get(Cluster.SPATH)==null?0:mf.getFeatureCounter().get(Cluster.SPATH));//mf.getFeatureBits(Cluster.SPATH); + s_lpath = 
(mf.getFeatureCounter().get(Cluster.LPATH)==null?0:mf.getFeatureCounter().get(Cluster.LPATH));//mf.getFeatureBits(Cluster.LPATH); + + } else { + + s_rel = mf.getFeatureBits(REL); + s_pos = mf.getFeatureBits(POS); + s_word = mf.getFeatureBits(WORD); + s_type = mf.getFeatureBits(TYPE); + s_dir = mf.getFeatureBits(DIR); + la = mf.getValue(DIR, LA); + ra = mf.getValue(DIR, RA); + s_dist = mf.getFeatureBits(DIST); + s_feat = mf.getFeatureBits(Pipe.FEAT); + s_spath = mf.getFeatureBits(Cluster.SPATH); + s_lpath = mf.getFeatureBits(Cluster.LPATH); + + DB.println("shift init (d5) "); + } + + + + } + + public void init(){ + + + + d0.a0 = s_type;d0.a1 = s_pos;d0.a2 = s_pos;d0.a3 = s_pos;d0.a4 = s_pos;d0.a5 = s_pos;d0.a6 = s_pos;d0.a7 = s_pos; d0.fix(); + dl1.a0 = s_type;dl1.a1 = s_rel; dl1.a2 = s_pos;dl1.a3 = s_pos; dl1.a4 = s_pos; dl1.a5 = s_pos; dl1.a6 = s_pos; dl1.a7 = s_pos; dl1.fix(); + dl2.a0 = s_type;dl2.a1 = s_rel;dl2.a2 = s_word;dl2.a3 = s_pos;dl2.a4 = s_pos;dl2.a5 = s_pos;dl2.a6 = s_pos;dl2.a7 = s_pos; dl2.fix(); + dwp.a0 = s_type; dwp.a1 = s_rel; dwp.a2 = s_word; dwp.a3 = s_pos; dwp.a4 = s_pos; dwp.a5 = s_word;dwp.fix(); + dwwp.a0 = s_type; dwwp.a1 = s_rel; dwwp.a2 = s_word; dwwp.a3 = s_word; dwwp.a4 = s_pos; dwwp.a5 = s_word; dwwp.fix(); + dlf.a0 = s_type;dlf.a1 = s_rel; dlf.a2 = s_pos;dlf.a3 = s_pos; dlf.a4 = s_feat; dlf.a5 = s_feat; dlf.a6 = s_pos; dlf.a7 = s_pos; dlf.fix(); + d3lp.a0 = s_type; d3lp.a1 = s_rel; d3lp.a2 = s_lpath; d3lp.a3 = s_lpath; d3lp.a4 = s_lpath; d3lp.a5 = s_word; d3lp.a6 = s_spath; d3lp.a7 = s_spath;d3lp.fix(); + d2lp.a0 = s_type; d2lp.a1 = s_rel; d2lp.a2 = s_lpath; d2lp.a3 = s_lpath; d2lp.a4 = s_word; d2lp.a5 = s_word; d2lp.fix(); //d3lp.a6 = s_spath; d3lp.a7 = s_spath; + d2pw.a0 = s_type; d2pw.a1 = s_rel; d2pw.a2 = s_lpath; d2pw.a3 = s_lpath; d2pw.a4 = s_word; d2pw.a5 = s_word;d2pw.fix(); //d3lp.a6 = s_spath; d3lp.a7 = s_spath; + d2pp.a0 = s_type; d2pp.a1 = s_rel; d2pp.a2 = s_lpath; d2pp.a3 = s_lpath; d2pp.a4 = s_pos; d2pp.a5 = s_pos; 
d2pp.fix(); //d3lp.a6 = s_spath; d3lp.a7 = s_spath; + } + + + + + public int basic(short[] pposs, int p, int d, IFV f) + { + + d0.clean(); dl1.clean(); dl2.clean(); dwp.clean(); dwwp.clean(); dlf.clean(); d3lp.clean(); + + d3lp.clean(); d2lp.clean();d2pw.clean(); d2pp.clean(); + + int n=1; + int dir= (p < d)? ra:la; + d0.v0= n++; d0.v1=pposs[p]; d0.v2=pposs[d]; //d0.stop=4; + int end= (p >= d ? p : d); + int start = (p >= d ? d : p) + 1; + + for(int i = start ; i <end ; i++) { + d0.v3=pposs[i]; + d0.cz4(); + d0.csa(s_dir,dir,f); + } + return n; + } + + + public int firstm(Instances is, int i, int prnt, int dpnt, int label, Cluster cluster, long[] f) + { + + + for(int k=0;k<f.length;k++) f[k]=0; + + short[] pposs = is.pposs[i]; + int[] form =is.forms[i]; + short[][] feats = is.feats[i]; + + + int pF = form[prnt],dF = form[dpnt]; + int pL = is.plemmas[i][prnt],dL = is.plemmas[i][dpnt]; + int pP = pposs[prnt],dP = pposs[dpnt]; + + int prntLS = pF==-1?-1:cluster.getLP(pF), chldLS = dF==-1?-1:cluster.getLP(dF); + + // final int dir= (prnt < dpnt)? ra:la; + + if (pF>maxForm) pF=-1; + if (pL>maxForm) pL=-1; + + if (dF>maxForm) dF=-1; + if (dL>maxForm) dL=-1; + + + int n=3,c=0; + + dl2.v1=label; + dl2.v0= n++; dl2.v2=pF; dl2.v3=dP; dl2.cz4(); f[c++]=dl2.getVal(); + dl2.v0= n++; dl2.cz3(); f[c++]=dl2.getVal(); + dl2.v0= n++; dl2.v2=dF; dl2.v3=pP; dl2.cz4(); f[c++]=dl2.getVal(); + dl2.v0= n++; dl2.cz3(); f[c++]=dl2.getVal(); + + + dwwp.v1=label; + dwwp.v0= n++; dwwp.v2=pF; dwwp.v3=dF; dwwp.cz4(); f[c++]=dwwp.getVal(); + + dl1.v1=label; + dl1.v0= n++; dl1.v2=dP; dl1.cz3(); f[c++]=dl1.getVal(); + dl1.v0= n++; dl1.v2=pP; dl1.cz3(); f[c++]=dl1.getVal(); + dl1.v0= n++; dl1.v3=dP; dl1.cz4(); f[c++]=dl1.getVal(); + + int pPm1 = prnt > 0 ? pposs[prnt - 1] : s_str, dPm1 = dpnt > 0 ? pposs[dpnt - 1] : s_str; + int pPp1 = prnt < pposs.length - 1 ? pposs[prnt + 1]:s_end, dPp1 = dpnt < pposs.length - 1 ? pposs[dpnt + 1]:s_end; + + int pPm2 = prnt > 1 ? 
pposs[prnt - 2] : s_str, dPm2 = dpnt > 1 ? pposs[dpnt - 2] : s_str; + int pPp2 = prnt < pposs.length - 2 ? pposs[prnt + 2]:s_end, dPp2 = dpnt < pposs.length - 2 ? pposs[dpnt + 2]:s_end; + + int pFm1 = prnt > 0 ? form[prnt - 1] : s_stwrd, dFm1 = dpnt > 0 ? form[dpnt - 1] : s_stwrd; + int pFp1 = prnt < form.length - 1 ? form[prnt + 1]:s_stwrd, dFp1 = dpnt < form.length - 1 ? form[dpnt + 1]:s_stwrd; + + + if (prnt-1 == dpnt) pPm1 =-1; + if (prnt == dpnt-1) dPm1 =-1; + + if (prnt+1 == dpnt) pPp1 =-1; + if (prnt == dpnt+1) dPp1 =-1; + + if (prnt-2 == dpnt) pPm2 =-1; + if (prnt == dpnt-2) dPm2 =-1; + + if (prnt+2 == dpnt) pPp2 =-1; + if (prnt == dpnt+2) dPp2 =-1; + + + dl1.v0= n++;dl1.v2=pP; dl1.v3=pPp1; dl1.v4=dP;dl1.v5=dPp1; dl1.v6= (prnt+1==dpnt?4:prnt==dpnt+1?5:6) ; dl1.cz7(); f[c++]=dl1.getVal(); + dl1.v0= n++; dl1.v5=dPm1; dl1.v6= (prnt+1==dpnt?4:prnt==dpnt-1?5:6) ; dl1.cz7(); f[c++]=dl1.getVal(); + dl1.v0= n++; dl1.v3=pPm1; dl1.v6= (prnt-1==dpnt?4:prnt==dpnt-1?5:6) ; dl1.cz7(); f[c++]=dl1.getVal(); + dl1.v0= n++; dl1.v5=dPp1; dl1.v6= (prnt-1==dpnt?4:prnt==dpnt+1?5:6) ; dl1.cz7(); f[c++]=dl1.getVal(); + + + dl1.v0= n++; dl1.v3=pPm1; dl1.v5= (prnt-1==dpnt?4:5) ; dl1.cz6(); f[c++]=dl1.getVal(); + dl1.v0= n++; dl1.v3=dPm1; dl1.v5= (prnt==dpnt-1?4:5) ; dl1.cz6(); f[c++]=dl1.getVal(); + dl1.v0= n++; dl1.v3=dPp1; dl1.v5= (prnt==dpnt+1?4:5) ; dl1.cz6(); f[c++]=dl1.getVal(); + dl1.v0= n++; dl1.v3=pPp1; dl1.v5= (prnt+1==dpnt?4:5) ; dl1.cz6(); f[c++]=dl1.getVal(); + + dl1.v0= n++;dl1.v2=pP; dl1.v3=pPp2; dl1.v4=dP;dl1.v5=dPp2; dl1.v6= (prnt+2==dpnt?4:prnt==dpnt+2?5:6) ; dl1.cz7(); f[c++]=dl1.getVal(); + dl1.v0= n++; dl1.v5=dPm2; dl1.v6= (prnt+2==dpnt?4:prnt==dpnt-2?5:6) ; dl1.cz7(); f[c++]=dl1.getVal(); + dl1.v0= n++; dl1.v3=pPm2; dl1.v6= (prnt-2==dpnt?4:prnt==dpnt-2?5:6) ; dl1.cz7(); f[c++]=dl1.getVal(); + dl1.v0= n++; dl1.v5=dPp2; dl1.v6= (prnt-2==dpnt?4:prnt==dpnt+2?5:6) ; dl1.cz7(); f[c++]=dl1.getVal(); + + + // remove this again + dl1.v0= n++; dl1.v3=pPm2; dl1.v5= 
(prnt-2==dpnt?4:5); dl1.cz6(); f[c++]=dl1.getVal(); + dl1.v0= n++; dl1.v3=dPm2; dl1.v5= (prnt==dpnt-2?4:5); dl1.cz6(); f[c++]=dl1.getVal(); + dl1.v0= n++; dl1.v3=dPp2; dl1.v5= (prnt==dpnt+2?4:5); dl1.cz6(); f[c++]=dl1.getVal(); + dl1.v0= n++; dl1.v3=pPp2; dl1.v5= (prnt+2==dpnt?4:5); dl1.cz6(); f[c++]=dl1.getVal(); + + + + dl2.v0= n++; dl2.v3=dFm1; dl2.v3=pPp1;dl2.v4=pP; dl2.v5= (prnt+1==dpnt?4:prnt==dpnt-1?5:6) ; dl2.cz6(); f[c++]=dl2.getVal(); + dl2.v0= n++; dl2.v3=dFp1; dl2.v3=pPm1; dl2.v5= (prnt-1==dpnt?4:prnt==dpnt+1?5:6) ; dl2.cz6(); f[c++]=dl2.getVal(); + dl2.v0= n++; dl2.v3=pFm1; dl2.v3=dPp1;dl2.v4=dP; dl2.v5= (prnt-1==dpnt?4:prnt==dpnt+1?5:6) ; dl2.cz6(); f[c++]=dl2.getVal(); + dl2.v0= n++; dl2.v3=pFp1; dl2.v3=dPm1; dl2.v5= (prnt+1==dpnt?4:prnt==dpnt-1?5:6) ; dl2.cz6(); f[c++]=dl2.getVal(); + + + // maybe without dir + dl2.v0= n++; dl2.v3=dFm1; dl2.v3=dPm2;dl2.v4=pP; dl2.v5= (prnt==dpnt-1?4:prnt==dpnt-2?5:6) ; dl2.cz6(); f[c++]=dl2.getVal(); + dl2.v0= n++; dl2.v3=dFp1; dl2.v3=dPp2; dl2.v5= (prnt==dpnt+1?4:prnt==dpnt+2?5:6) ; dl2.cz6(); f[c++]=dl2.getVal(); + dl2.v0= n++; dl2.v3=pFm1; dl2.v3=pPm2;dl2.v4=dP; dl2.v5= (prnt-1==dpnt?4:prnt-2==dpnt?5:6) ; dl2.cz6(); f[c++]=dl2.getVal(); + dl2.v0= n++; dl2.v3=pFp1; dl2.v3=pPp2; dl2.v5= (prnt+1==dpnt?4:prnt+2==dpnt?5:6) ; dl2.cz6(); f[c++]=dl2.getVal(); + + + dwwp.v0= n++; dwwp.v2=pF; dwwp.v3=dF; dwwp.v4=dP; dwwp.cz5(); f[c++]=dwwp.getVal(); + dwwp.v0= n++; dwwp.v2=pF; dwwp.v3=dF; dwwp.v4=pP; dwwp.cz5(); f[c++]=dwwp.getVal(); +// dwwp.v0= n++; dwwp.v2=dF; dwwp.v3=pF; dwwp.v4=pP; dwwp.v4=dP; dwwp.cz6(); f[c++]=dwwp.getVal(); + + + // until here + + + // lemmas + + dl2.v1=label; + dl2.v0= n++; dl2.v2=pL; dl2.v3=dP; dl2.cz4(); f[c++]=dl2.getVal(); + dl2.v0= n++; dl2.cz3(); f[c++]=dl2.getVal(); + dl2.v0= n++; dl2.v2=dL; dl2.v3=pP; dl2.cz4(); f[c++]=dl2.getVal(); + dl2.v0= n++; dl2.cz3(); f[c++]=dl2.getVal(); + + + dwwp.v1=label; + dwwp.v0= n++; dwwp.v2=pL; dwwp.v3=dL; dwwp.cz4(); f[c++]=dwwp.getVal(); + + dwp.v1= 
label; + dwp.v0=n++;dwp.v2=dL; dwp.v3=pP;dwp.v4=dP;dwp.v5=pL; //dwp.cz6(); f[c++]=dwp.getVal(); + + dwp.v0=n++;dwp.v2=pL; dwp.v3=pP;dwp.v4=dP; dwp.v0=n++;dwp.cz5(); f[c++]=dwp.getVal(); + + + + dwp.v0=n++;dwp.v2=pL; dwp.cz5(); f[c++]=dwp.getVal(); + dwwp.v0= n++; dwwp.v2=pL; dwwp.v3=dL; dwwp.v4=dP; dwwp.cz5(); f[c++]=dwwp.getVal(); + dwwp.v0= n++; dwwp.v4=pP; dwwp.cz5(); f[c++]=dwwp.getVal(); + + + // cluster + if (cluster.size()>10) { + d2pw.v1=label; + d2pw.v0=n++; d2pw.v2=prntLS; d2pw.v3=chldLS; d2pw.cz4(); f[c++]=d2pw.getVal(); + d2pw.v0=n++; d2pw.v4=pF; d2pw.cz5(); f[c++]=d2pw.getVal(); + d2pw.v0=n++; d2pw.v4=dF; d2pw.cz5(); f[c++]=d2pw.getVal(); + // d2pw.v0=n++; d2pw.v5=pF; d2pw.cz6(); f[c++]=d2pw.getVal(); + + + d2pp.v1=label; + d2pp.v0=n++; d2pp.v2=prntLS; d2pp.v3=chldLS; d2pp.cz4(); f[c++]=d2pp.getVal(); + d2pp.v0=n++; d2pp.v4=pP; d2pp.cz5(); f[c++]=d2pp.getVal(); + d2pp.v0=n++; d2pp.v4=dP; d2pp.cz5(); f[c++]=d2pp.getVal(); + d2pp.v0=n++; d2pp.v5=pP; d2pp.cz6(); f[c++]=d2pp.getVal(); + } + + if (s_stack) { + + short[] prel = is.plabels[i]; + short[] phead = is.pheads[i]; + + //take those in for stacking + dl2.v1=label; + dl2.v0= n++;dl2.v2=prel[dpnt];dl2.v3=pP;dl2.v4=dP; dl2.v5=prnt==phead[dpnt]?1:2; dl2.cz6(); f[c++]=dl2.getVal(); + dl2.v0= n++;dl2.v2=pP;dl2.v3=dP; dl2.v4=prnt==phead[dpnt]?1:2; dl2.cz5(); f[c++]=dl2.getVal(); + } + + + + if (feats==null) return c; + + short[] featsP =feats[prnt], featsD =feats[dpnt]; + dlf.v0= n++; dlf.v1=label; dlf.v2=pP; dlf.v3=dP; + c =extractFeat(f, c, featsP, featsD); + + + return c; + } + + + public int second(Instances is , int i,int p, int d, int x, int label, Cluster cluster, long[] f) + { + + //for(int k=0;k<f.length;k++) f[k]=0; + + dl1.clean(); dwp.clean();dlf.clean(); dwwp.clean(); + + short[] pos= is.pposs[i]; + int[] forms=is.forms[i],lemmas=is.plemmas[i]; + + + int pP = pos[p], dP = pos[d]; + int pF = forms[p],dF = forms[d]; + int pL = lemmas[p], cL = lemmas[d]; + + int sP = x!=-1 ? 
pos[x] : s_str, sF = x!=-1 ? forms[x] : s_stwrd, sL = x!=-1 ? lemmas[x] : s_stwrd; + + int n=_SIB; + if (pF>maxForm) pF=-1; + if (pL>maxForm) pL=-1; + + if (dF>maxForm) dF=-1; + if (cL>maxForm) cL=-1; + + if (sF>maxForm) sF=-1; + if (sL>maxForm) sL=-1; + + int c =0; + + dl1.v1=label;dwwp.v1=label;dwp.v1=label; + + dl1.v0= n++;dl1.v2=pP; dl1.v3=dP;dl1.v4=sP; dl1.cz5(); f[c++]=dl1.getVal() ; // f[c++]=dl1.csa(s_dist,dist); + dl1.v0= n++; dl1.v3=sP; dl1.cz4(); f[c++]=dl1.getVal(); //f[c++]=dl1.csa(s_dist,dist); + dl1.v0= n++; dl1.v2=dP; dl1.cz4(); f[c++]=dl1.getVal(); //f[c++]=dl1.csa(s_dist,dist); + + // sibling only could be tried + + dwwp.v0= n++; dwwp.v2=pF; dwwp.v3=sF; dwwp.cz4(); f[c++]=dwwp.getVal(); //f[c++]=dwwp.csa(s_dist,dist); + dwwp.v0= n++; dwwp.v2=dF; dwwp.cz4(); f[c++]=dwwp.getVal(); //f[c++]=dwwp.csa(s_dist,dist); + + // 154 + dwp.v0= n++; dwp.v2=sF; dwp.v3=pP; dwp.cz4(); f[c++]=dwp.getVal(); //f[c++]=dwp.csa(s_dist,dist); + dwp.v0= n++; /*dwp.v1=label; */dwp.v3=dP; dwp.cz4(); f[c++]=dwp.getVal(); //f[c++]=dwp.csa(s_dist,dist); + dwp.v0= n++; /*dwp.v1=label;*/ dwp.v2=pF; dwp.v3=sP; dwp.cz4(); f[c++]=dwp.getVal(); //f[c++]=dwp.csa(s_dist,dist); + dwp.v0= n++; /*dwp.v1=label;*/ dwp.v2=dF; dwp.cz4(); f[c++]=dwp.getVal();// f[c++]=dwp.csa(s_dist,dist); + + // 158 + //lemmas + + dwwp.v0= n++; dwwp.v2=pL; dwwp.v3=sL; dwwp.cz4(); f[c++]=dwwp.getVal(); + dwwp.v0= n++; dwwp.v2=cL; dwwp.cz4(); f[c++]=dwwp.getVal(); //f[c++]=dwwp.csa(s_dist,dist); + dwp.v0= n++; dwp.v2=sL; dwp.v3=pP; dwp.cz4(); f[c++]=dwp.getVal();// f[c++]=dwp.csa(s_dist,dist); + dwp.v0= n++; dwp.v3=dP; dwp.cz4(); f[c++]=dwp.getVal(); // f[c++]=dwp.csa(s_dist,dist); + + // 162 + dwp.v0= n++; dwp.v2=pL; dwp.v3=sP; dwp.cz4(); f[c++]=dwp.getVal(); //f[c++]=dwp.csa(s_dist,dist); + dwp.v0= n++; dwp.v2=cL; dwp.cz4(); f[c++]=dwp.getVal();// f[c++]=dwp.csa(s_dist,dist); + + // clusters + if (cluster.size()>10) { + + } + + int pPm1 = p!=0 ? pos[p-1] : s_str; + int chldPm1 = d-1>=0 ? 
pos[d-1] : s_str; + int prntPp1 = p!=pos.length-1 ? pos[p+1] : s_end; + int chldPp1 = d!=pos.length-1 ? pos[d+1] : s_end; + + // sibling part of speech minus and plus 1 + int sPm1 = x>0 ? pos[x-1]:s_str; + int sPp1 = x<pos.length-1 ? pos[x + 1]:s_end; + + if (x+1==x|| x+1==p || x+1==d) sPp1=-1; + if (p+1==x|| p+1==p || p+1==d) prntPp1=-1; + if (d+1==x|| d+1==p || d+1==d) chldPp1=-1; + + if (x-1==x|| x-1==p || x-1==d) sPm1=-1; + if (d-1==x|| d-1==p || d-1==d) chldPm1=-1; + if (p-1==x|| p-1==p || p-1==d) pPm1=-1; + + + dl1.v0=n++; dl1.v2=sP; dl1.v3=sPp1;dl1.v4=pP; dl1.cz5(); f[c++]=dl1.getVal(); + // 165 + dl1.v0=n++; dl1.v2=sP; dl1.v3=sPm1;dl1.v4=pP; dl1.v5= (x-1==p?3:x-1==d?4:5); dl1.cz6(); f[c++]= dl1.getVal(); //dl1.getVal();// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sP; dl1.v3=pP;dl1.v4=prntPp1;dl1.v5= (x==p+1?3:4); dl1.cz6(); f[c++]=dl1.getVal();// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sP; dl1.v3=pP;dl1.v4=pPm1;dl1.v5= (x==p-1?3:4); dl1.cz6(); f[c++]=dl1.getVal();// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sPp1;dl1.v4=pPm1;dl1.v5=pP;dl1.v6= (x==p-1?3:x==p+1?4:5); dl1.cz7(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sPm1; dl1.v3=sP;dl1.v4=pPm1;dl1.v5=pP;dl1.v6= (x==p-1?3:x-1==p?4:5); dl1.cz7(); f[c++]=dl1.getVal();// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sPp1;dl1.v4=pP;dl1.v5=prntPp1;dl1.v6= (x+1==p?3:x==p+1?4:5); dl1.cz7(); f[c++]=dl1.getVal();// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sPm1; dl1.v3=sP; dl1.v4=pP;dl1.v5=prntPp1;dl1.v6= (x==p-1?3:x==p+1?4:5); dl1.cz7(); f[c++]=dl1.getVal();// f.add(li.l2i(l)); + + dl1.v0=n++; dl1.v2=sP; dl1.v3=sPp1;dl1.v4=dP; dl1.v5= (x+1==d?3:x+1==p?4:5); dl1.cz6(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sPm1;dl1.v4=dP; dl1.v5= (x-1==d?3:x-1==p?4:5); dl1.cz6(); f[c++]=dl1.getVal(); + + dl1.v0=n++; dl1.v2=sP; dl1.v3=dP;dl1.v4=chldPp1;dl1.v5= (x==d+1?3:d+1==p?4:5); dl1.cz6(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=dP;dl1.v4=chldPm1; dl1.v5= (x==d-1?3:d-1==p?4:5); dl1.cz6(); 
f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sPp1;dl1.v4=chldPm1;dl1.v5=dP; dl1.v6= (x==d-1?3:x+1==d?4:5); dl1.cz7(); f[c++]=dl1.getVal();// f.add(li.l2i(l)); + dl1.v0=n++; dl1.v2=sPm1; dl1.v3=sP;dl1.v4=chldPm1;dl1.v5=dP; dl1.v6= (x-1==d?3:d-1==x?4:5); dl1.cz7(); f[c++]=dl1.getVal(); + dl1.v0= n++;dl1.v2=sP; dl1.v3=sPp1;dl1.v4=dP;dl1.v5=chldPp1;dl1.v6= (x==d+1?3:x+1==d?4:5); dl1.cz7();f[c++]=dl1.getVal();// f.add(li.l2i(l)); + dl1.v0= n++; dl1.v2=sPm1; dl1.v3=sP;dl1.v4=dP;dl1.v5=chldPp1;dl1.v6= (x-1==d?3:d+1==x?4:5);dl1.cz7(); f[c++]=dl1.getVal(); + + // c=61; + /* + if (cluster.size()>10) { + AtomicInteger N = new AtomicInteger(n); + c = addClusterFeatures(d, p, x, pos, forms, cluster, N, c, f,label); + n = N.get(); + } + */ + // take those in for stacking + + if (s_stack) { + short[] prel = is.plabels[i],phead=is.pheads[i]; + + int g = p==phead[d]?1:2 ; + if (x>=0) g += p==phead[x]?4:8; + + int gr = x==-1?s_relend:prel[x]; + + + dl2.v1 = label; + dl2.v0= n++;dl2.v2=prel[d];dl2.v3=g;dl2.v4=sP;dl2.v5=dP;dl2.cz6();f[c++]=dl2.getVal(); + dl2.v0= n++;dl2.v2=prel[d];dl2.v3=g;dl2.v4=sP;dl2.v5=pP;dl2.cz6();f[c++]=dl2.getVal(); + dl2.v0= n++;dl2.v2=prel[d];dl2.v3=g;dl2.v4=sP;dl2.v5=pP;dl2.v6=dP;dl2.cz7();f[c++]=dl2.getVal(); + + dl2.v0= n++;dl2.v2=gr;dl2.v3=g;dl2.v4=sP;dl2.v5=dP;dl2.cz6();f[c++]=dl2.getVal(); + dl2.v0= n++;dl2.v2=gr;dl2.v3=g;dl2.v4=sP;dl2.v5=pP;dl2.cz6();f[c++]=dl2.getVal(); + dl2.v0= n++;dl2.v2=gr;dl2.v3=g;dl2.v4=sP;dl2.v5=pP;dl2.v6=dP;dl2.cz7();f[c++]=dl2.getVal(); + } + + short[][] feats=is.feats[i]; + + if (feats==null) return c; + + + short[] featsP =feats[d]; + short[] featsSbl =x!=-1?feats[x]:null; + dlf.v1=label; + dlf.v0= n++; dlf.v2=sP; dlf.v3=dP; + c = extractFeat(f, c ,featsP, featsSbl); + + featsP =feats[p]; + + + dlf.v0= n++; dlf.v1=label; dlf.v2=sP; dlf.v3=pP; + c = extractFeat(f, c ,featsP, featsSbl); + + + return c; + } + + + /** + * Separated this method to speed up parsing + * @param d + * @param p + * @param x + * @param pos + 
* @param forms + * @param cluster + * @param N + * @param c + * @param f + * @return + */ + int addClusterFeatures(Instances is, int i, int d, int p, int x, Cluster cluster, int c, long[] f, int label) { + + // int n= N.get(); + + short[] pos= is.pposs[i]; + int[] forms=is.forms[i]; + + int n=190; + int pP = pos[p], dP = pos[d]; + int sP = x!=-1 ? pos[x] : s_str; + + + int pLSp1 = p != pos.length - 1 ? forms[p + 1]==-1?-1:cluster.getLP(forms[p + 1]): _cend; + int cLSp1 = d != pos.length - 1 ? forms[d + 1] ==-1?-1:cluster.getLP(forms[d + 1]):_cend; + int sLSp1 = x < pos.length -1 ? forms[x + 1] ==-1?-1:cluster.getLP(forms[x + 1]) : _cend; + + int pLSm1 = p!=0 ? forms[p - 1]==-1?-1:cluster.getLP(forms[p - 1]): _cstr; + int cLSm1 = d-1>=0 ? forms[d - 1] ==-1?-1:cluster.getLP(forms[d - 1]):_cstr; + int sLSm1 = x>0 ? forms[x - 1] ==-1?-1:cluster.getLP(forms[x - 1]):_cstr; + + //int c=61; + int pF = forms[p],dF = forms[d], sF = x!=-1 ? forms[x] : s_stwrd; + int prntLS = pF==-1?-1:cluster.getLP(pF), chldLS = dF==-1?-1:cluster.getLP(dF); + + int sblLS = (x != -1)&&(sF!=-1) ? 
cluster.getLP(sF) : s_stwrd; + + + d2lp.v1=label; + d2lp.v0= n++; d2lp.v2=prntLS; d2lp.v3=sblLS; d2lp.cz4(); f[c++]=d2lp.getVal(); + d2lp.v0= n++; d2lp.v2=chldLS; d2lp.v3=sblLS; d2lp.cz4(); f[c++]=d2lp.getVal();// f[c++]=d2lp.csa(s_dist,dist); + + d3lp.v1= label; + d3lp.v0= n++; d3lp.v2=prntLS; d3lp.v3=chldLS; d3lp.v4=sblLS;d3lp.cz5(); f[c++]=d3lp.getVal(); + + d2lp.v0= n++; d2lp.v2=prntLS; d2lp.v3=chldLS; d2lp.v4=sF; d2lp.cz5(); f[c++]=d2lp.getVal(); //f[c++]=d2lp.csa(s_dist,dist); + d2lp.v0= n++; d2lp.v2=prntLS; d2lp.v3=sblLS; d2lp.v4=dF; d2lp.cz5(); f[c++]=d2lp.getVal(); //f[c++]=d2lp.csa(s_dist,dist); + d2lp.v0= n++; d2lp.v2=chldLS; d2lp.v3=sblLS; d2lp.v4=pF; d2lp.cz5(); f[c++]=d2lp.getVal(); //f[c++]=d2lp.csa(s_dist,dist); + + d2pp.v1=label; + d2pp.v0= n++; d2pp.v2=prntLS; d2pp.v3=chldLS; d2pp.v4=sP; d2pp.cz5(); f[c++]=d2pp.getVal(); //f[c++]=d2pp.csa(s_dist,dist); + d2pp.v0= n++; d2pp.v2=prntLS; d2pp.v3=sblLS; d2pp.v4=dP; d2pp.cz5(); f[c++]=d2pp.getVal(); //f[c++]=d2pp.csa(s_dist,dist); + d2pp.v0= n++; d2pp.v2=chldLS; d2pp.v3=sblLS; d2pp.v4=pP; d2pp.cz5(); f[c++]=d2pp.getVal(); //f[c++]=d2pp.csa(s_dist,dist); + + + if (x+1==x|| x+1==p || x+1==d) sLSp1=-1; + if (p+1==x|| p+1==p || p+1==d) pLSp1=-1; + if (d+1==x|| d+1==p || d+1==d) cLSp1=-1; + + if (x-1==x|| x-1==p || x-1==d) sLSm1=-1; + if (d-1==x|| d-1==p || d-1==d) cLSm1=-1; + if (p-1==x|| p-1==p || p-1==d) pLSm1=-1; + + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=pP; dl1.cz5(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSm1;dl1.v4=pP; dl1.cz5(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=pP;dl1.v4=pLSp1;dl1.cz5(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=pP;dl1.v4=pLSm1; dl1.cz5(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=pLSm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP;dl1.v4=pLSm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=pP;dl1.v5=pLSp1; dl1.cz6(); 
f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP; dl1.v4=pP;dl1.v5=pLSp1; dl1.cz6(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=dP; dl1.cz5(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSm1;dl1.v4=dP; dl1.cz5(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=dP;dl1.v4=cLSp1; dl1.cz5(); f[c++]=dl1.getVal(); + dl1.v0=n++;dl1.v2=sP; dl1.v3=dP;dl1.v4=cLSm1; dl1.cz5(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSm1;dl1.v4=cLSm1;dl1.v5=dP; dl1.cz6(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP;dl1.v4=cLSm1;dl1.v5=dP;dl1.cz6(); f[c++]=dl1.getVal(); + dl1.v0=n++;dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=dP;dl1.v5=cLSp1;dl1.cz6();f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP;dl1.v4=dP;dl1.v5=cLSp1; dl1.cz6(); f[c++]=dl1.getVal(); + + + + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=pP; dl1.cz5(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSm1;dl1.v4=pP; dl1.cz5(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=pP;dl1.v4=pLSp1;dl1.cz5(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=pP;dl1.v4=pLSm1; dl1.cz5(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=pLSm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP;dl1.v4=pLSm1;dl1.v5=pP; dl1.cz6(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=pP;dl1.v5=pLSp1; dl1.cz6(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sLSm1; dl1.v3=sP; dl1.v4=pP;dl1.v5=pLSp1; dl1.cz6(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=dP; dl1.cz5(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSm1;dl1.v4=dP; dl1.cz5(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=dP;dl1.v4=cLSp1; dl1.cz5(); f[c++]=dl1.getVal(); + dl1.v0=n++;dl1.v2=sP; dl1.v3=dP;dl1.v4=cLSm1; dl1.cz5(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sP; dl1.v3=sLSm1;dl1.v4=cLSm1;dl1.v5=dP; dl1.cz6(); f[c++]=dl1.getVal(); + dl1.v0=n++; dl1.v2=sLSm1; 
dl1.v3=sP;dl1.v4=cLSm1;dl1.v5=dP;dl1.cz6(); f[c++]=dl1.getVal(); + dl1.v0= n++;dl1.v2=sP; dl1.v3=sLSp1;dl1.v4=dP;dl1.v5=cLSp1;dl1.cz6();f[c++]=dl1.getVal(); + dl1.v0= n++; dl1.v2=sLSm1; dl1.v3=sP;dl1.v4=dP;dl1.v5=cLSp1; dl1.cz6(); f[c++]=dl1.getVal(); + + + return c; + } + + private int extractFeat(long[] f, int cnt, short[] featsP, short[] featsD) { + if (featsP!=null && featsD!=null) { + for(short i1=0;i1<featsP.length;i1++) { + for(short i2=0;i2<featsD.length;i2++) { + dlf.v4=featsP[i1]; dlf.v5=featsD[i2]; + dlf.cz6(); f[cnt++]=dlf.getVal(); + } + } + } else if (featsP==null && featsD!=null) { + + for(short i2=0;i2<featsD.length;i2++) { + dlf.v4=nofeat; dlf.v5=featsD[i2]; + dlf.cz6(); f[cnt++]=dlf.getVal(); + + } + } else if (featsP!=null && featsD==null) { + + for(short i1=0;i1<featsP.length;i1++) { + dlf.v4=featsP[i1]; dlf.v5=nofeat; + dlf.cz6(); f[cnt++]=dlf.getVal(); + + } + } + return cnt; + } + + + + public FV encodeCat(Instances is, int ic, short pposs[], int forms[], int[] lemmas, short[] heads, short[] types, short feats[][], Cluster cluster, FV f) { + + + long[] svs = new long[250]; + + for (int i = 1; i < heads.length; i++) { + + + basic(pposs, heads[i], i, f); + + int w1 = heads[i]<i?heads[i]:i; + int w2 = heads[i]<i?i:heads[i]; + + int dir =heads[i]<i?0:s_rel1; + int label = types[i] + dir; + + int c = firstm(is, ic, w1, w2, label, cluster,svs); + for(int k=0;k<c;k++) dl1.map(f,svs[k]); + + int ch,cmi,cmo; + if (heads[i] < i) { + ch = rightmostRight(heads, heads[i], i); + cmi = leftmostLeft(heads, i, heads[i]); + cmo = rightmostRight(heads, i, heads.length); + + } else { + ch = leftmostLeft(heads, heads[i], i); + cmi = rightmostRight(heads, i, heads[i]); + cmo = leftmostLeft(heads, i, 0); + } + + int lx =types[i] + s_rel1*((heads[i]<i?0:1) +8); + c =second(is,ic,w1, w2,ch, lx, cluster, svs); + for(int k=0;k<c;k++) dl1.map(f,svs[k]); + c = addClusterFeatures(is,ic, w1, w2, ch, cluster, c, svs,lx); + for(int k=0;k<c;k++) dl1.map(f,svs[k]); + + lx 
=types[i]+s_rel1*((heads[i]<i?0:1) + ((cmi < i)?0:2) ); + c =second(is, ic,w1,w2,cmi, lx, cluster, svs); + for(int k=0;k<c;k++) dl1.map(f,svs[k]); + + c = addClusterFeatures(is,ic, w1, w2, cmi, cluster, c, svs,lx); + for(int k=0;k<c;k++) dl1.map(f,svs[k]); + + lx =types[i]+s_rel1*((heads[i]<i?0:1) + ((cmo < i)?0:2) ); + c =second(is, ic, w1,w2,cmo, lx, cluster, svs); + for(int k=0;k<c;k++) dl1.map(f,svs[k]); + + c = addClusterFeatures(is,ic, w1, w2, cmo, cluster, c, svs,lx); + for(int k=0;k<c;k++) dl1.map(f,svs[k]); + } + + return f; + } + + public void compare(Instances is, int ic, short pos[], short[] heads, short[] types, Cluster cluster, F2SF f, DataFES x) { + + + long[] svs = new long[250]; + + float fx =0.0F; + + + for (int i = 1; i < heads.length; i++) { + + f.clear(); + basic(pos, heads[i], i, f); + + if (x.pl[heads[i]][i] != f.getScore()) { + DB.println("basic diff "+x.pl[heads[i]][i] +" fg "+f.getScore()); + } + + int w1 = heads[i]<i?heads[i]:i; + int w2 = heads[i]<i?i:heads[i]; + + int dir =heads[i]<i?0:s_rel1; + int label = types[i] + dir; + f.clear(); + int c = firstm(is, ic, w1, w2, label, cluster,svs); + for(int k=0;k<c;k++) dl1.map(f,svs[k]); + + if (x.lab[heads[i]][i][types[i]] != f.getScore()) { + DB.println("first diff "+x.lab[heads[i]][i][types[i]] +" fg "+f.getScore()); + } + + short[] labels = Edges.get(pos[heads[i]], pos[i]); + int lid=-1; + for(int k=0;k<labels.length;k++) if (types[i]== labels[k]) {lid= k;break;} + + + + int ch,cmi,cmo; + if (heads[i] < i) { + ch = rightmostRight(heads, heads[i], i); + cmi = leftmostLeft(heads, i, heads[i]); + cmo = rightmostRight(heads, i, heads.length); + + } else { + ch = leftmostLeft(heads, heads[i], i); + cmi = rightmostRight(heads, i, heads[i]); + cmo = leftmostLeft(heads, i, 0); + } + + f.clear(); + + + int lx =types[i] + s_rel1*((heads[i]<i?0:1) +8); + c =second(is,ic,w1, w2,ch, lx, cluster, svs); + for(int k=0;k<c;k++) dl1.map(f,svs[k]); + + if (x.sib[heads[i]][i][ch==-1?heads[i]:ch][lid] != 
f.getScore()) { + DB.println("sib diff "+x.sib[heads[i]][i][ch==-1?i:ch][lid] +" fg "+f.getScore()); + } + + f.clear(); + + + lx =types[i]+s_rel1*((heads[i]<i?0:1) + ((cmi < i)?0:2) ); + c =second(is, ic,w1,w2,cmi, lx, cluster, svs); + for(int k=0;k<c;k++) dl1.map(f,svs[k]); + + if (x.gra[heads[i]][i][cmi==-1?i:cmi][lid] != f.getScore() ) { + DB.println("gcm diff "+x.gra[heads[i]][i][cmi==-1?i:cmi][lid] +" fg "+f.getScore()+" cmi "+cmi+" i "+i+ + " head "+heads[i]+" w1 "+w1+" w2 "+w2+" label "+lx+" "+((heads[i]<i?0:1) + ((cmi < i)?0:2) ) ); + + System.out.println("w1 "+w1+" w2 "+w2+" cmi "+cmi+" label "+label+" "); + + for (long k : svs) System.out.print(k+" "); + System.out.println(); + + } + f.clear(); + lx =types[i]+s_rel1*((heads[i]<i?0:1) + ((cmo < i)?0:2) ); + c =second(is, ic, w1,w2,cmo, lx, cluster, svs); + for(int k=0;k<c;k++) dl1.map(f,svs[k]); + + if (x.gra[heads[i]][i][cmo==-1?i:cmo][lid] != f.getScore() ) { + DB.println("gcm diff "+x.gra[heads[i]][i][cmo==-1?i:cmo][lid] +" fg "+f.getScore()+" cmo "+cmo+" i "+i+ + " head "+heads[i]+" w1 "+w1+" w2 "+w2+" label "+lx+" "+((heads[i]<i?0:1) + ((cmi < i)?0:2) ) ); + + System.out.println("w1 "+w1+" w2 "+w2+" cmi "+cmi+" label "+label+" "); + + for (long k : svs) System.out.print(k+" "); + System.out.println(); + + } + } + + + } + + + public short[] searchLabel(Instances is, int ic, short pposs[], int forms[], int[] lemmas, short[] heads, short[] types, short feats[][], Cluster cluster, IFV f) { + + + long[] svs = new long[250]; + + short[] newLabels = new short[types.length]; + + for (int i = 1; i < heads.length; i++) { + + + // int n =basic(pposs, forms, heads[i], i, cluster, f); + + int ch,cmi,cmo; + if (heads[i] < i) { + ch = rightmostRight(heads, heads[i], i); + cmi = leftmostLeft(heads, i, heads[i]); + cmo = rightmostRight(heads, i, heads.length); + + } else { + ch = leftmostLeft(heads, heads[i], i); + cmi = rightmostRight(heads, i, heads[i]); + cmo = leftmostLeft(heads, i, 0); + } + + + short labels[] = 
Edges.get(pposs[is.heads[ic][i]],pposs[i]); + + float best = -1000; + short bestL = 0; + for(int j=0; j< labels.length;j++) { + + f.clear(); + firstm(is, ic, heads[i], i, labels[j], cluster,svs); + for(int k=0;k<svs.length;k++) dl1.map(f,svs[k]); + + second(is,ic,heads[i], i,ch, labels[j], cluster, svs); + for(int k=0;k<svs.length;k++) dl1.map(f,svs[k]); + + second(is, ic,heads[i],i,cmi, labels[j], cluster, svs); + for(int k=0;k<svs.length;k++) dl1.map(f,svs[k]); + + second(is, ic, heads[i],i,cmo, labels[j], cluster, svs); + for(int k=0;k<svs.length;k++) dl1.map(f,svs[k]); + + if (best < f.getScore()) { + best= (float)f.getScore(); + bestL= labels[j]; + newLabels[i]=bestL; + } + + + } + } + return newLabels; + + //return f; + } + + + public static float encode3(short[] pos, short heads[] , short[] types, DataFES d2) { + + + float v = 0F; + for (int i = 1; i < heads.length; i++) { + + // int dir= (heads[i] < i)? 0:1; + + v += d2.pl[heads[i]][i]; + v += d2.lab[heads[i]][i][types[i]]; + + // boolean left = i<heads[i]; + short[] labels = Edges.get(pos[heads[i]], pos[i]); + int lid=-1; + for(int k=0;k<labels.length;k++) if (types[i]== labels[k]) {lid= k;break;} + + int ch,cmi,cmo; + if (heads[i] < i) { + ch = rightmostRight(heads, heads[i], i); + cmi = leftmostLeft(heads, i, heads[i]); + cmo = rightmostRight(heads, i, heads.length); + + if (ch==-1) ch=heads[i]; + if (cmi==-1) cmi=heads[i]; + if (cmo==-1) cmo=heads[i]; + + } else { + ch = leftmostLeft(heads, heads[i], i); + cmi = rightmostRight(heads, i, heads[i]); + cmo = leftmostLeft(heads, i, 0); + + if (ch==-1) ch=i; + if (cmi==-1) cmi=i; + if (cmo==-1) cmo=i; + } + v += d2.sib[heads[i]][i][ch][lid]; + v += d2.gra[heads[i]][i][cmi][lid]; + v += d2.gra[heads[i]][i][cmo][lid]; + } + return v; + } + + public static float encode3(short[] pos, short heads[] , short[] types, DataFES d2, float[] scores) { + + float v = 0F; + for (int i = 1; i < heads.length; i++) { + + + scores[i]= d2.pl[heads[i]][i]; + scores[i] += 
d2.lab[heads[i]][i][types[i]]; + + short[] labels = Edges.get(pos[heads[i]], pos[i]); + int lid=-1; + for(int k=0;k<labels.length;k++) if (types[i]== labels[k]) {lid= k;break;} + + int ch,cmi,cmo; + if (heads[i] < i) { + ch = rightmostRight(heads, heads[i], i); + cmi = leftmostLeft(heads, i, heads[i]); + cmo = rightmostRight(heads, i, heads.length); + + if (ch==-1) ch=heads[i]; + if (cmi==-1) cmi=heads[i]; + if (cmo==-1) cmo=heads[i]; + + } else { + ch = leftmostLeft(heads, heads[i], i); + cmi = rightmostRight(heads, i, heads[i]); + cmo = leftmostLeft(heads, i, 0); + + if (ch==-1) ch=i; + if (cmi==-1) cmi=i; + if (cmo==-1) cmo=i; + } + scores[i] += d2.sib[heads[i]][i][ch][lid]; + scores[i] += d2.gra[heads[i]][i][cmi][lid]; + scores[i] += d2.gra[heads[i]][i][cmo][lid]; + } + return v; + } + + + public static int rightmostRight(short[] heads, int head, int max) { + int rightmost = -1; + for (int i = head + 1; i < max; i++) if (heads[i] == head) rightmost = i; + + return rightmost; + } + + public static int leftmostLeft(short[] heads, int head, int min) { + int leftmost = -1; + for (int i = head - 1; i > min; i--) if (heads[i] == head) leftmost = i; + return leftmost; + } + + public static final String REL = "REL",END = "END",STR = "STR",LA = "LA",RA = "RA"; + + private static int ra,la; + private static int s_str; + private static int s_end, _cend,_cstr, s_stwrd,s_relend; + + protected static final String TYPE = "TYPE",DIR = "D"; + public static final String POS = "POS"; + protected static final String DIST = "DIST",MID = "MID"; + + private static final String _0 = "0",_4 = "4",_3 = "3", _2 = "2",_1 = "1",_5 = "5",_10 = "10"; + + private static int di0, d4,d3,d2,d1,d5,d10; + + + private static final String WORD = "WORD",STWRD = "STWRD", STPOS = "STPOS"; + + + + private static int nofeat; + + + public static int maxForm; + + + /** + * Initialize the features. 
+ * @param maxFeatures + */ + static public void initFeatures() { + + + MFO mf = new MFO(); + mf.register(POS, MID); + s_str = mf.register(POS, STR); + s_end = mf.register(POS, END); + + s_relend = mf.register(REL, END); + + _cstr= mf.register(Cluster.SPATH,STR); + _cend=mf.register(Cluster.SPATH,END); + + + mf.register(TYPE, POS); + + s_stwrd=mf.register(WORD,STWRD); + mf.register(POS,STPOS); + + la = mf.register(DIR, LA); + ra = mf.register(DIR, RA); + + // mf.register(TYPE, CHAR); + + mf.register(TYPE, Pipe.FEAT); + nofeat=mf.register(Pipe.FEAT, "NOFEAT"); + + for(int k=0;k<150;k++) mf.register(TYPE, "F"+k); + + + di0=mf.register(DIST, _0); + d1=mf.register(DIST, _1); + d2=mf.register(DIST, _2); + d3=mf.register(DIST, _3); + d4=mf.register(DIST, _4); + d5=mf.register(DIST, _5); + // d5l=mf.register(DIST, _5l); + d10=mf.register(DIST, _10); + + + } + + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/parser/MFO.java b/dependencyParser/basic/mate-tools/src/is2/parser/MFO.java new file mode 100755 index 0000000..519ea06 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/parser/MFO.java @@ -0,0 +1,257 @@ +package is2.parser; + + +import is2.data.IEncoder; +import is2.data.IEncoderPlus; +import is2.data.IFV; +import is2.data.Long2IntInterface; +import is2.util.DB; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map.Entry; + +/** + * Map Features, do not map long to integer + * + * @author Bernd Bohnet, 20.09.2009 + */ + +final public class MFO implements IEncoderPlus { + + /** The features and its values */ + static private final HashMap<String,HashMap<String,Integer>> m_featureSets = new HashMap<String,HashMap<String,Integer>>(); + + /** The feature class and the number of values */ + static private final HashMap<String,Integer> m_featureCounters = new HashMap<String,Integer>(); + + /** The number of bits needed to encode a feature */ + static final 
HashMap<String,Integer> m_featureBits = new HashMap<String,Integer>(); + + /** Integer counter for long2int */ + static private int count=0; + + /** Stop growing */ + public boolean stop=false; + + final public static String NONE="<None>"; + + + + + + + + public MFO () {} + + + public int size() {return count;} + + + + /** + * Register an attribute class, if it not exists and add a possible value + * @param type + * @param type2 + */ + final public int register(String a, String v) { + + HashMap<String,Integer> fs = getFeatureSet().get(a); + if (fs==null) { + fs = new HashMap<String,Integer>(); + getFeatureSet().put(a, fs); + fs.put(NONE, 0); + getFeatureCounter().put(a, 1); + } + Integer c = getFeatureCounter().get(a); + + Integer i = fs.get(v); + if (i==null) { + fs.put(v, c); + c++; + getFeatureCounter().put(a,c); + return c-1; + } else return i; + } + + /** + * Calculates the number of bits needed to encode a feature + */ + public void calculateBits() { + + int total=0; + for(Entry<String,Integer> e : getFeatureCounter().entrySet() ){ + int bits =(int)Math.ceil((Math.log(e.getValue()+1)/Math.log(2))); + m_featureBits.put(e.getKey(), bits); + total+=bits; + // System.out.println(" "+e.getKey()+" bits "+bits+" number "+(e.getValue()+1)); + } + +// System.out.println("total number of needed bits "+total); + } + + + + public String toString() { + + StringBuffer content = new StringBuffer(); + for(Entry<String,Integer> e : getFeatureCounter().entrySet() ){ + content.append(e.getKey()+" "+e.getValue()); + content.append(':'); + // HashMap<String,Integer> vs = getFeatureSet().get(e.getKey()); + content.append(getFeatureBits(e.getKey())); + + /*if (vs.size()<120) + for(Entry<String,Integer> e2 : vs.entrySet()) { + content.append(e2.getKey()+" ("+e2.getValue()+") "); + }*/ + content.append('\n'); + + } + return content.toString(); + } + + + + static final public short getFeatureBits(String a) { + if(m_featureBits.get(a)==null) return 0; + return 
(short)m_featureBits.get(a).intValue(); + } + + + + /** + * Get the integer place holder of the string value v of the type a + * + * @param t the type + * @param v the value + * @return the integer place holder of v + */ + final public int getValue(String t, String v) { + + if (m_featureSets.get(t)==null) return -1; + Integer vi = m_featureSets.get(t).get(v); + if (vi==null) return -1; //stop && + return vi.intValue(); + } + + /** + * Static version of getValue + * @see getValue + */ + static final public int getValueS(String a, String v) { + + if (m_featureSets.get(a)==null) return -1; + Integer vi = m_featureSets.get(a).get(v); + if (vi==null) return -1; //stop && + return vi.intValue(); + } + + public int hasValue(String a, String v) { + + Integer vi = m_featureSets.get(a).get(v); + if (vi==null) return -1; + return vi.intValue(); + } + + + public static String printBits(int k) { + StringBuffer s = new StringBuffer(); + for(int i =0;i<31;i++) { + s.append((k&0x00000001)==1?'1':'0'); + k=k>>1; + + } + s.reverse(); + return s.toString(); + } + + + + + + + + /** + * Maps a long to a integer value. 
This is very useful to save memory for sparse data long values + * @param l + * @return the integer + */ + static public int misses = 0; + static public int good = 0; + + + + + /** + * Write the data + * @param dos + * @throws IOException + */ + static public void writeData(DataOutputStream dos) throws IOException { + dos.writeInt(getFeatureSet().size()); + // DB.println("write"+getFeatureSet().size()); + for(Entry<String, HashMap<String,Integer>> e : getFeatureSet().entrySet()) { + dos.writeUTF(e.getKey()); + dos.writeInt(e.getValue().size()); + + for(Entry<String,Integer> e2 : e.getValue().entrySet()) { + + if(e2.getKey()==null) DB.println("key "+e2.getKey()+" value "+e2.getValue()+" e -key "+e.getKey()); + dos.writeUTF(e2.getKey()); + dos.writeInt(e2.getValue()); + + } + + } + } + public void read(DataInputStream din) throws IOException { + + int size = din.readInt(); + for(int i=0; i<size;i++) { + String k = din.readUTF(); + int size2 = din.readInt(); + + HashMap<String,Integer> h = new HashMap<String,Integer>(); + getFeatureSet().put(k,h); + for(int j = 0;j<size2;j++) { + h.put(din.readUTF(), din.readInt()); + } + getFeatureCounter().put(k, size2); + } + + count =size; + // stop(); + calculateBits(); + } + + + /** + * Clear the data + */ + static public void clearData() { + getFeatureSet().clear(); + m_featureBits.clear(); + getFeatureSet().clear(); + } + + public HashMap<String,Integer> getFeatureCounter() { + return m_featureCounters; + } + + static public HashMap<String,HashMap<String,Integer>> getFeatureSet() { + return m_featureSets; + } + + static public String[] reverse(HashMap<String,Integer> v){ + String[] set = new String[v.size()]; + for(Entry<String,Integer> e : v.entrySet()) { + set[e.getValue()]=e.getKey(); + } + return set; + } + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/parser/Open.java b/dependencyParser/basic/mate-tools/src/is2/parser/Open.java new file mode 100755 index 0000000..35f14a7 --- /dev/null +++ 
b/dependencyParser/basic/mate-tools/src/is2/parser/Open.java @@ -0,0 +1,38 @@ +package is2.parser; + +import is2.data.Parse; + + +final public class Open { + + public float p; + short s, e, label; + byte dir; + + Closed left; + Closed right; + + public Open(short s, short t, short dir, short label,Closed left, Closed right, float p) { + this.s = s; + this.e = t; + this.label = label; + this.dir = (byte)dir; + this.left =left; + this.right=right; + this.p=p; + } + + + void create(Parse parse) { + if (dir == 0) { + parse.heads[s] = e; + if (label != -1) parse.labels[s] = label; + } else { + parse.heads[e] = s; + if (label != -1) parse.labels[e] = label; + } + if (left != null) left.create(parse); + if (right != null) right.create(parse); + } + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/parser/Options.java b/dependencyParser/basic/mate-tools/src/is2/parser/Options.java new file mode 100755 index 0000000..3c8b551 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/parser/Options.java @@ -0,0 +1,63 @@ +package is2.parser; + +import is2.util.OptionsSuper; + + +public final class Options extends OptionsSuper { + + + public Options (String[] args) { + + + + for(int i = 0; i < args.length; i++) { + + if (args[i].equals("--help")) explain(); + + if (args[i].equals("-decode")) { + decodeProjective = args[i+1].equals("proj"); i++; + } else if (args[i].equals("-decodeTH")) { + decodeTH = Double.parseDouble(args[i+1]); i++; + } else if (args[i].equals("-nonormalize")) { + normalize=false; + } else if (args[i].equals("-features")) { + features= args[i+1]; i++; + } else if (args[i].equals("-hsize")) { + hsize= Integer.parseInt(args[i+1]); i++; + } else if (args[i].equals("-len")) { + maxLen= Integer.parseInt(args[i+1]); i++; + } else if (args[i].equals("-cores")) { + cores= Integer.parseInt(args[i+1]); i++; + } else if (args[i].equals("-no2nd")) { + no2nd= true; + } else if (args[i].equals("-few2nd")) { + few2nd= true; + } else super.addOption(args, i); + 
+ } + + + + } + + private void explain() { + System.out.println("Usage: "); + System.out.println("java -class mate.jar is2.parser.Parser [Options]"); + System.out.println(); + System.out.println("Example: "); + System.out.println(" java -class mate.jar is2.parser.Parser -model eps3.model -train corpora/conll08st/train/train.closed -test corpora/conll08st/devel/devel.closed -out b3.test -eval corpora/conll08st/devel/devel.closed -count 2000 -i 6"); + System.out.println(""); + System.out.println("Options:"); + System.out.println(""); + System.out.println(" -train <file> the corpus a model is trained on; default "+this.trainfile); + System.out.println(" -test <file> the input corpus for testing; default "+this.testfile); + System.out.println(" -out <file> the output corpus (result) of a test run; default "+this.outfile); + System.out.println(" -model <file> the parsing model for traing the model is stored in the files"); + System.out.println(" and for parsing the model is load from this file; default "+this.modelName); + System.out.println(" -i <number> the number of training iterations; good numbers are 10 for smaller corpora and 6 for bigger; default "+this.numIters); + System.out.println(" -count <number> the n first sentences of the corpus are take for the training default "+this.count); + System.out.println(" -format <number> conll format of the year 8 or 9; default "+this.formatTask); + + System.exit(0); + } +} diff --git a/dependencyParser/basic/mate-tools/src/is2/parser/ParallelDecoder.java b/dependencyParser/basic/mate-tools/src/is2/parser/ParallelDecoder.java new file mode 100755 index 0000000..dd18f5f --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/parser/ParallelDecoder.java @@ -0,0 +1,170 @@ +package is2.parser; + + +import is2.data.DataFES; + + +import java.util.ArrayList; +import java.util.concurrent.Callable; + +/** + * @author Bernd Bohnet, 30.08.2009 + * + * This class implements a parallel feature extractor. 
+ */ +final public class ParallelDecoder implements Callable<Object> +{ + // some constants + private static final float INIT_BEST = (-1.0F / 0.0F); + private static final boolean[] DIR ={false,true}; + + // the data space of the weights for a dependency tree + final private DataFES x; + + private short[] pos; + + private Open O[][][][]; + private Closed C[][][][] ; + + private int length; + + boolean done=false; + public boolean waiting =false; + + /** + * Initialize the parallel decoder. + * + * @param pos part-of-speech + * @param d data + * @param edges part-of-speech edge mapping + * @param o open spans + * @param c closed spans + * @param length number of words + */ + public ParallelDecoder(short[] pos, DataFES d, Open o[][][][], Closed c[][][][], int length) { + + this.pos =pos; + this.x =d; + + this.O=o; + this.C=c; + this.length=length; + } + + + private static class DSet { short w1,w2;} + + @Override + public Object call() { + + + try { + + while (true){ + + DSet set = get(); +// if (done && set==null) break; + + if (set ==null) return null; + + short s=set.w1, t=set.w2; + + for(short dir =0;dir<2;dir++) { + + short[] labs = (dir==1) ? Edges.get(pos[s],pos[t]):Edges.get(pos[t],pos[s]); + + O[s][t][dir] = new Open[labs.length]; + + for (int l = 0; l <labs.length; l++) { + + + double tRP = INIT_BEST; + + Closed tL = null, tR = null; + + for (int r = s; r < t; r++) { + + if (s == 0 && r != 0) continue; + + double tLPr = INIT_BEST,tRPr = INIT_BEST; + Closed tLCld = null, tRCld = null; + + if (r == s) tLPr = dir==1 ? x.sib[s][t][s][l] : + x.gra[t][s][s][l]; + else + for (int i = s + 1; i <= r; i++) + if (((dir==1 ? x.sib[s][t][i][l] : x.gra[t][s][i][l]) + C[s][r][1][i].p) > tLPr) { + tLPr = ((dir==1 ? x.sib[s][t][i][l] : x.gra[t][s][i][l]) + C[s][r][1][i].p);tLCld = C[s][r][1][i];} + + if (r == t-1) tRPr = dir==1 ? x.gra[s][t][s][l] : x.sib[t][s][s][l]; + else + for (int i = r + 1; i < t; i++) + if (((dir == 1 ? 
x.gra[s][t][i][l] : + x.sib[t][s][i][l]) + + C[r+1][t][0][i].p) > tRPr) { + tRPr = ((dir==1?x.gra[s][t][i][l]:x.sib[t][s][i][l]) + C[r+1][t][0][i].p); tRCld=C[r + 1][t][0][i]; + } + + if (tLPr + tRPr > tRP) {tRP = tLPr + tRPr; tL = tLCld;tR = tRCld;} + } + O[s][t][dir][l] = new Open(s, t, dir, labs[l],tL, tR, + (float) ( tRP+((dir==1)?x.pl[s][t]: x.pl[t][s]) + ((dir==1)? x.lab[s][t][labs[l]]:x.lab[t][s][labs[l]]))); + } + } + C[s][t][1] = new Closed[length]; C[s][t][0] = new Closed[length]; + + for (int m = s ; m <= t; m++) { + for(boolean d : DIR) { + if ((d && m!=s)||!d && (m!=t && s!=0)) { + + // create closed structure + + double top = INIT_BEST; + + Open tU = null; Closed tL = null; + int numLabels =O[(d ? s : m)][(d ? m : t)][d?1:0].length; + + //for (int l = numLabels-1; l >=0; l--) { + for (int l = 0; l < numLabels; l++) { + + Open hi = O[(d ? s : m)][(d ? m : t)][d?1:0][l]; + for (int amb = m + (d?1:-1); amb != (d?t:s) + (d?1:-1); amb += (d?1:-1)) { + + if ((hi.p + C[d?m:s][d?t:m][d?1:0][amb].p +x.gra[d?s:t][m][amb][l]) > top) { + top = (hi.p + C[d?m:s][d?t:m][d?1:0][amb].p +x.gra[d?s:t][m][amb][l]); tU = hi; tL=C[d?m:s][d?t:m][d?1:0][amb];} + + } + + if ((m == (d ? t : s)) && (hi.p + x.gra[d?s:t][d?t:s][m][l]) > top) { + top = (hi.p + x.gra[d ? 
s : t][d?t:s][m][l]); tU = hi; tL = null;} + } + C[s][t][d?1:0][m] = new Closed(s, t, m, d?1:0,tU,tL,(float) top); + + + } + } + } + } + } catch (Exception e ) { + e.printStackTrace(); + System.exit(0); + } + return null; + } + + public static ArrayList<DSet> sets = new ArrayList<DSet>(); + + static synchronized private DSet get() { + synchronized (sets) { + if (sets.size()==0) return null; + return sets.remove(sets.size()-1); + } + } + + public static void add(short w1, short w2){ + DSet ds =new DSet(); + ds.w1=w1; + ds.w2=w2; + sets.add(ds); + } +} diff --git a/dependencyParser/basic/mate-tools/src/is2/parser/ParallelExtract.java b/dependencyParser/basic/mate-tools/src/is2/parser/ParallelExtract.java new file mode 100755 index 0000000..4313bfd --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/parser/ParallelExtract.java @@ -0,0 +1,246 @@ +package is2.parser; + +import is2.data.Cluster; +import is2.data.DataFES; +import is2.data.F2SF; +import is2.data.FV; +import is2.data.Instances; +import is2.data.Long2IntInterface; +import is2.util.DB; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.concurrent.Callable; + + + + + +/** + * @author Bernd Bohnet, 30.08.2009 + * + * This class implements a parallel feature extractor. 
+ */ +final public class ParallelExtract implements Callable<Object> +{ + // the data space of the weights for a dependency tree + final DataFES d; + + // the data extractor does the actual work + final Extractor extractor; + + private Instances is; + private int i; + + private F2SF para; + + private Cluster cluster; + + + public ParallelExtract(Extractor e, Instances is, int i, DataFES d, F2SF para,Cluster cluster) { + + this.is =is; + extractor=e; + this.d =d; + this.i=i; + this.para=para; + this.cluster = cluster; + } + + + public static class DSet { + int w1,w2; + } + + public Object call() { + + try { + + F2SF f= para; + + + short[] pos=is.pposs[i]; + int length = pos.length; + + long[] gvs = new long[50]; + long[] svs = new long[220]; + + while (true) { + + DSet set = get(); + if (set ==null) break; + + int w1=set.w1; + int w2=set.w2; + + + f.clear(); + extractor.basic(pos, w1, w2, f); + d.pl[w1][w2]=f.getScoreF(); + + + f.clear(); + + extractor.basic(pos, w2, w1, f); + d.pl[w2][w1]=f.getScoreF(); + + short[] labels = Edges.get(pos[w1], pos[w2]); + float[] lab = d.lab[w1][w2]; + + final Long2IntInterface li = extractor.li; + + int c = extractor.firstm(is, i, w1, w2, 0, cluster, svs); + + for (int l = 0; l <lab.length ; l++) lab[l]=-100 ; + + for (int l = 0; l <labels.length ; l++) { + short label = labels[l]; + + f.clear(); + int lv = extractor.d0.computeLabeValue(label,Extractor.s_type); + for(int k=0;k<c;k++)if (svs[k]>0) f.add(li.l2i(svs[k]+lv)); + + + lab[label]=f.getScoreF(); + } + + labels = Edges.get(pos[w2], pos[w1]); + lab = d.lab[w2][w1]; + + for (int l = 0; l <lab.length ; l++) lab[l]=-100 ; + + + for (int l = 0; l <labels.length ; l++) { + int label = labels[l]; + + f.clear(); + int lv = extractor.d0.computeLabeValue(label + Extractor.s_rel1 ,Extractor.s_type); + for(int k=0;k<c;k++)if (svs[k]>0) f.add(li.l2i(svs[k]+lv)); + + lab[label]=f.getScoreF(); + } + + int s = w1<w2 ? w1 : w2; + int e = w1<w2 ? 
w2 : w1; + + + for(int m=0;m<length;m++) { + + int g = (m==s||e==m) ? -1 : m; + + int cn =extractor.second(is, i, w1,w2,g, 0, cluster, svs); + int cc = extractor.addClusterFeatures(is,i, w1, w2, g, cluster, 0, gvs,0); + //for(int k=0;k<c;k++) dl1.map(f,svs[k]); + + + if(m>=w1) { + labels = Edges.get(pos[w1], pos[w2]); + float[] lab2 = new float[labels.length]; + for (int l = 0; l <labels.length ; l++) { + + short label = labels[l]; + + int lx =label+Extractor.s_rel1*( g < w2?0:2 ); + + f.clear(); + int lv = extractor.d0.computeLabeValue(lx,Extractor.s_type); + for(int k=0;k<cn;k++)if (svs[k]>0) f.add(li.l2i(svs[k]+lv)); + for(int k=0;k<cc;k++)if (gvs[k]>0) f.add(li.l2i(gvs[k]+lv)); + + lab2[l] = f.getScoreF(); + } + d.gra[w1][w2][m] =lab2; + } + + + if (m<=w2) { + labels = Edges.get(pos[w2], pos[w1]); + float lab2[]; + d.gra[w2][w1][m] = lab2 = new float[labels.length]; + for (int l = 0; l <labels.length ; l++) { + + int label = labels[l] ; + int lx =label+Extractor.s_rel1*(1 + (g < w1?0:2) ); + + f.clear(); + int lv = extractor.d0.computeLabeValue(lx,Extractor.s_type); + for(int k=0;k<cn;k++)if (svs[k]>0) f.add(li.l2i(svs[k]+lv)); + for(int k=0;k<cc;k++)if (gvs[k]>0) f.add(li.l2i(gvs[k]+lv)); + + lab2[l] = f.getScoreF(); + + } + } + + + g = (m==s||e==m) ? 
-1 : m; + + // int cn = extractor.second(is,i,w1,w2,g,0, cluster, svs,Extractor._SIB); + if (m >=w1 && m<=w2) { + labels = Edges.get(pos[w1], pos[w2]); + float lab2[]= new float[labels.length]; + d.sib[w1][w2][m] = lab2; + + for (int l = 0; l <labels.length ; l++) { + + short label = labels[l]; + + int lx =label+Extractor.s_rel1*( 8); + f.clear(); + int lv = extractor.d0.computeLabeValue(lx,Extractor.s_type); + for(int k=0;k<cn;k++) if (svs[k]>0) f.add(li.l2i(svs[k]+lv)); + for(int k=0;k<cc;k++) if (gvs[k]>0) f.add(li.l2i(gvs[k]+lv)); + + + lab2[l] = (float)f.score;//f.getScoreF(); + } + } + if (m >=w1 && m <=w2) { + labels = Edges.get(pos[w2], pos[w1]); + float[] lab2 = new float[labels.length]; + d.sib[w2][w1][m]=lab2; + for (int l = 0; l <labels.length ; l++) { + + int label = labels[l] ; + + int lx =label+Extractor.s_rel1*(9); + + f.clear(); + int lv = extractor.d0.computeLabeValue(lx,Extractor.s_type); + for(int k=0;k<cn;k++) if (svs[k]>0) f.add(li.l2i(svs[k]+lv)); + for(int k=0;k<cc;k++) if (gvs[k]>0) f.add(li.l2i(gvs[k]+lv)); + + lab2[l] = f.score;//f.getScoreF(); + } + } + } + } + + } catch(Exception e ) { + e.printStackTrace(); + } + return null; + } + + + static ArrayList<DSet> sets = new ArrayList<DSet>(); + + private DSet get() { + + synchronized (sets) { + if (sets.size()==0) return null; + return sets.remove(sets.size()-1); + } + } + static public void add(int w1, int w2){ + DSet ds =new DSet(); + ds.w1=w1; + ds.w2=w2; + sets.add(ds); + } + + + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/parser/ParallelRearrange.java b/dependencyParser/basic/mate-tools/src/is2/parser/ParallelRearrange.java new file mode 100755 index 0000000..dfd995a --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/parser/ParallelRearrange.java @@ -0,0 +1,127 @@ +package is2.parser; + +import is2.data.DataFES; + +import java.util.ArrayList; +import java.util.concurrent.Callable; + +/** + * @author Dr. 
Bernd Bohnet, 30.08.2009
+ *
+ * This class implements a parallel edge rearrangement for non-projective
+ * parsing; the linear rearrangement method was first suggested by
+ * Ryan McDonald et al. (2005).
+ */
+final public class ParallelRearrange implements Callable<Object> {
+
+    // new parent child combination to explore; p, ch, pa are the inputs
+    // (base tree score, child, candidate parent); max/wh/nPar/nType are
+    // filled in by call() with the best improvement found for this pair
+    final static class PA {
+        final float p;
+        final short ch, pa;
+        public float max;
+        public short wh;
+        public short nPar;
+        public short nType;
+        public PA(float p2, short ch2, short pa2) { p=p2; ch=ch2;pa=pa2;}
+    }
+
+    // list of parent child combinations; 'parents' is the shared work queue,
+    // 'order' keeps the same items in insertion order (not read in this
+    // class -- presumably consumed by the caller; confirm)
+    static ArrayList<PA> parents = new ArrayList<PA>();
+    static ArrayList<PA> order = new ArrayList<PA>();
+    // best new parent child combination, found so far
+    // NOTE(review): shadowed by a local 'max' in call() and never written by
+    // this class -- results are reported via wh/nPar/nType and the PA items.
+    public float max;
+
+    // some data from the dependency tree
+    //private EdgesC edges;
+    private short[] pos;
+    private DataFES x;
+    private boolean[][] isChild ;
+    // private working copies of the tree (see constructor)
+    public short[] heads,types;
+
+    // child, new parent, new label
+    public short wh,nPar,nType;
+
+    /**
+     * Initialize the parallel rearrange thread.  Head and type arrays are
+     * defensively copied so each worker can mutate its own tree while
+     * rescoring and restore it afterwards.
+     *
+     * @param isChild2 is a child
+     * @param pos the part-of-speech
+     * @param x the data
+     * @param s the heads
+     * @param ts the types
+     */
+    public ParallelRearrange(boolean[][] isChild2,short[] pos, DataFES x, short[] s, short[] ts) {
+
+        heads =new short[s.length];
+        System.arraycopy(s, 0, heads, 0, s.length);
+
+        types =new short[ts.length];
+        System.arraycopy(ts, 0, types, 0, ts.length);
+
+        isChild=isChild2;
+        //edges = edgesC;
+        this.pos =pos;
+        this.x=x;
+    }
+
+
+    /**
+     * Worker loop: pops (child, candidate-parent) pairs off the queue,
+     * temporarily reattaches the child on this worker's private copy of the
+     * tree, rescores the whole tree with Extractor.encode3 for every label
+     * licensed by the POS pair, and records the best score gain relative to
+     * the pair's base score px.p in the PA item and in wh/nPar/nType.  The
+     * tree copy is restored after each candidate.
+     */
+    @Override
+    public Object call() {
+
+        // check the list of new possible parents and children for a better combination
+        while(true) {
+            PA px = getPA();
+            if (px==null) break;
+
+            float max=0;  // NOTE(review): shadows the instance field 'max'
+            short pa =px.pa, ch =px.ch;
+
+            // skip self-attachment, the current head, and cycle-producing links
+            if(ch == pa || pa == heads[ch] || isChild[ch][pa]) continue;
+
+            short oldP = heads[ch], oldT = types[ch];
+
+            heads[ch]=pa;
+
+            short[] labels = Edges.get(pos[pa], pos[ch]);
+
+            for(int l=0;l<labels.length;l++) {
+
+                types[ch]=labels[l];
+
+                float p_new = Extractor.encode3(pos, heads, types, x);
+
+                // keep only changes that improve on the base score px.p
+                if(max < p_new-px.p ) {
+                    max = p_new-px.p; wh = ch; nPar = pa; nType = labels[l] ;
+                    px.max=max;
+                    px.wh=ch;
+                    px.nPar = pa;
+                    px.nType =labels[l];
+                }
+            }
+            heads[ch]= oldP; types[ch]=oldT;  // restore the tree
+        }
+        return null;
+    }
+
+    /**
+     * Add a child-parent combination which is later explored for rearrangement.
+     *
+     * NOTE(review): not synchronized -- assumes all items are queued before
+     * the worker threads start; getPA() below does synchronize.
+     *
+     * @param p2 score of the current tree
+     * @param ch2 child
+     * @param pa candidate parent
+     */
+    static public void add(float p2, short ch2, short pa) {
+        PA px = new PA(p2,ch2,pa);
+        parents.add(px);
+        order.add(px);
+    }
+
+    // take the next work item, or null when the queue is empty
+    static private PA getPA() {
+        synchronized (parents) {
+            if (parents.size()==0) return null;
+            return parents.remove(parents.size()-1);
+        }
+    }
+
+
+}
diff --git a/dependencyParser/basic/mate-tools/src/is2/parser/Parameters.java b/dependencyParser/basic/mate-tools/src/is2/parser/Parameters.java
new file mode 100755
index 0000000..cb13a69
--- /dev/null
+++ b/dependencyParser/basic/mate-tools/src/is2/parser/Parameters.java
@@ -0,0 +1,38 @@
+/**
+ *
+ */
+package is2.parser;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+import is2.data.FV;
+import is2.data.IFV;
+import is2.data.Instances;
+import is2.data.Parse;
+
+/**
+ * Abstract weight-vector model of the parser: online update, parameter
+ * averaging and (de)serialization.
+ *
+ * @author Bernd Bohnet, 31.08.2009
+ */
+public abstract class Parameters {
+
+    /** Replace the working weights by their average over avVal updates. */
+    public abstract void average(double avVal);
+
+    /** Online update towards the gold features act, away from the prediction pred. */
+    public abstract void update(FV act, FV pred, Instances isd, int instc, Parse d, double upd, double e);
+
+    /** Serialize the weights. */
+    public abstract void write(DataOutputStream dos) throws IOException;
+
+    /** Deserialize the weights. */
+    public abstract void read(DataInputStream dis ) throws IOException;
+
+    /** @return the number of weights */
+    public abstract int size();
+
+    /**
+     * @return a feature-vector scorer backed by these weights
+     */
+    public abstract IFV getFV() ;
+
+}
diff --git a/dependencyParser/basic/mate-tools/src/is2/parser/ParametersFloat.java b/dependencyParser/basic/mate-tools/src/is2/parser/ParametersFloat.java
new file mode 100755
index 0000000..c2cbe93
--- /dev/null
+++
b/dependencyParser/basic/mate-tools/src/is2/parser/ParametersFloat.java
@@ -0,0 +1,137 @@
+package is2.parser;
+
+import is2.data.F2SF;
+import is2.data.FV;
+import is2.data.Instances;
+import is2.data.Parse;
+import is2.util.DB;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+
+
+/**
+ * Float-valued weight vector.  'parameters' holds the working weights and
+ * 'total' accumulates them for parameter averaging (averaged perceptron /
+ * MIRA style training).
+ */
+final public class ParametersFloat extends Parameters {
+
+    public float[] parameters;
+    public float[] total;
+
+    public ParametersFloat(int size) {
+        parameters = new float[size];
+        total = new float[size];
+        for(int i = 0; i < parameters.length; i++) {
+            parameters[i] = 0F;
+            total[i] = 0F;
+        }
+    }
+
+    /**
+     * Wrap an existing weight array.  'total' stays null here, so instances
+     * created this way are for scoring only (average()/update() would fail).
+     *
+     * @param p the weights to wrap
+     */
+    public ParametersFloat(float[] p) {
+        parameters =p;
+    }
+
+
+    /** Destructive averaging: overwrites 'parameters' and drops 'total'. */
+    @Override
+    public void average(double avVal) {
+        for(int j = 0; j < total.length; j++) {
+            parameters[j] = total[j]/((float)avVal);
+        }
+        total =null;
+    }
+
+    /** Non-destructive variant of average(): returns the averaged weights as a new instance. */
+    public ParametersFloat average2(double avVal) {
+        float[] px = new float[this.parameters.length];
+        for(int j = 0; j < total.length; j++) {
+            px[j] = total[j]/((float)avVal);
+        }
+        ParametersFloat pf = new ParametersFloat(px);
+        return pf;
+    }
+
+    /**
+     * MIRA-style single-constraint update: move the weights along the
+     * difference vector (act - pred) with step size hildreth(dist, b), where
+     * b combines the margin/loss term e (incremented by one here) and the
+     * current score difference of the two feature vectors.
+     */
+    @Override
+    public void update(FV act, FV pred, Instances isd, int instc, Parse d, double upd, double e) {
+
+        e++;
+
+        float lam_dist = getScore(act) - getScore(pred);
+
+        float b = (float)e-lam_dist;
+
+        FV dist = act.getDistVector(pred);
+
+        dist.update(parameters, total, hildreth(dist,b), upd,false);
+    }
+
+    /**
+     * Closed-form solution of the single-constraint Hildreth/MIRA problem:
+     * step = b / ||a||^2, guarded against a (near-)zero squared norm.
+     */
+    protected double hildreth(FV a, double b) {
+
+        double A = a.dotProduct(a);
+        if (A<=0.0000000000000000001) return 0.0;
+        return b/A;
+    }
+
+
+    /** Score a feature vector with the current weights; a null vector scores 0. */
+    public float getScore(FV fv) {
+        if (fv ==null) return 0.0F;
+        return fv.getScore(parameters,false);
+
+    }
+
+    /** Write the weight count followed by the raw float weights. */
+    @Override
+    final public void write(DataOutputStream dos) throws IOException{
+
+        dos.writeInt(parameters.length);
+        for(float d : parameters) dos.writeFloat(d);
+
+    }
+
+    /** Read weights in the format produced by write(); logs the non-zero count. */
+    @Override
+    public void read(DataInputStream dis ) throws IOException{
+
+        parameters = new float[dis.readInt()];
+        int notZero=0;
+        for(int i=0;i<parameters.length;i++) {
+            parameters[i]=dis.readFloat();
+            if (parameters[i]!=0.0F) notZero++;
+        }
+
+
+        DB.println("read parameters "+parameters.length+" not zero "+notZero);
+
+    }
+
+    /** @return the number of non-zero weights (diagnostic) */
+    public int countNZ() {
+
+        int notZero=0;
+        for(int i=0;i<parameters.length;i++) {
+            if (parameters[i]!=0.0F) notZero++;
+        }
+        return notZero;
+
+        // DB.println("read parameters "+parameters.length+" not zero "+notZero);
+
+    }
+
+
+    /* (non-Javadoc)
+     * @see is2.sp09k99995.Parameters#getFV()
+     */
+    @Override
+    public F2SF getFV() {
+        return new F2SF(parameters);
+    }
+
+
+    /* (non-Javadoc)
+     * @see is2.sp09k99999.Parameters#size()
+     */
+    @Override
+    public int size() {
+        return parameters.length;
+    }
+
+
+
+}
diff --git a/dependencyParser/basic/mate-tools/src/is2/parser/Parser.java b/dependencyParser/basic/mate-tools/src/is2/parser/Parser.java
new file mode 100755
index 0000000..d6ba6d6
--- /dev/null
+++ b/dependencyParser/basic/mate-tools/src/is2/parser/Parser.java
@@ -0,0 +1,664 @@
+package is2.parser;
+
+
+import is2.data.Cluster;
+import is2.data.DataF;
+import is2.data.DataFES;
+import is2.data.F2SF;
+import is2.data.FV;
+import is2.data.Instances;
+import is2.data.Long2Int;
+import is2.data.Long2IntInterface;
+import is2.data.Parse;
+import is2.data.PipeGen;
+import is2.data.SentenceData09;
+import is2.io.CONLLReader09;
+import is2.io.CONLLWriter09;
+import is2.tools.Retrainable;
+import is2.tools.Tool;
+import is2.util.DB;
+import is2.util.OptionsSuper;
+import is2.util.ParserEvaluator;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Map.Entry;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+import java.util.zip.ZipOutputStream; + + + +public class Parser implements Tool, Retrainable { + + // output evaluation info + private static final boolean MAX_INFO = true; + + public static int THREADS =4; + + public Long2IntInterface l2i; + public ParametersFloat params; + public Pipe pipe; + public OptionsSuper options; + + + // keep some of the parsing information for later evaluation + public Instances is; + DataFES d2; + public Parse d= null; + + /** + * Initialize the parser + * @param options + */ + public Parser (OptionsSuper options) { + + this.options=options; + pipe = new Pipe(options); + + params = new ParametersFloat(0); + + // load the model + try { + readModel(options, pipe, params); + } catch (Exception e) { + e.printStackTrace(); + } + + } + + + /** + * @param modelFileName The file name of the parsing model + */ + public Parser(String modelFileName) { + this(new Options(new String[]{"-model",modelFileName})); + } + + + /** + * + */ + public Parser() { + // TODO Auto-generated constructor stub + } + + + public static void main (String[] args) throws Exception + { + + + + + long start = System.currentTimeMillis(); + OptionsSuper options = new Options(args); + + + Runtime runtime = Runtime.getRuntime(); + THREADS = runtime.availableProcessors(); + if (options.cores<THREADS&&options.cores>0) THREADS =options.cores; + DB.println("Found " + runtime.availableProcessors()+" cores use "+THREADS); + + + + if (options.train) { + + Parser p =new Parser(); + p.options=options; + + p.l2i = new Long2Int(options.hsize); + + p.pipe = new Pipe (options); + Instances is = new Instances(); + + Extractor.initFeatures(); + p.pipe.extractor = new Extractor[THREADS]; + DB.println("hsize "+options.hsize); + + DB.println("Use "+(options.featureCreation==OptionsSuper.MULTIPLICATIVE?"multiplication":"shift")+"-based feature creation function"); + for (int t=0;t<THREADS;t++) p.pipe.extractor[t]=new Extractor(p.l2i, options.stack, options.featureCreation); + + 
DB.println("Stacking "+options.stack); + + p.pipe.createInstances(options.trainfile,is); + + p.params = new ParametersFloat(p.l2i.size()); + + p.train(options, p.pipe,p.params,is,p.pipe.cl); + + p.writeModell(options, p.params, null,p.pipe.cl); + + } + + if (options.test) { + + // Parser p = new Parser(); + Parser p = new Parser(options); + + // p. pipe = new Pipe(options); + // p. params = new ParametersFloat(0); // total should be zero and the parameters are later read + + // load the model + + // p.readModel(options, p.pipe, p.params); + + DB.println("label only? "+options.label); + + p.out(options, p.pipe, p.params, !MAX_INFO, options.label); + } + + + + + System.out.println(); + + if (options.eval) { + System.out.println("\nEVALUATION PERFORMANCE:"); + ParserEvaluator.evaluate(options.goldfile, options.outfile); + } + + long end = System.currentTimeMillis(); + System.out.println("used time "+((float)((end-start)/100)/10)); + + Decoder.executerService.shutdown(); + Pipe.executerService.shutdown(); + System.out.println("end."); + + + } + + /** + * Read the models and mapping + * @param options + * @param pipe + * @param params + * @throws IOException + */ + public void readModel(OptionsSuper options, Pipe pipe, Parameters params) throws IOException { + + + DB.println("Reading data started"); + + // prepare zipped reader + ZipInputStream zis = new ZipInputStream(new BufferedInputStream(new FileInputStream(options.modelName))); + zis.getNextEntry(); + DataInputStream dis = new DataInputStream(new BufferedInputStream(zis)); + + pipe.mf.read(dis); + + pipe.cl = new Cluster(dis); + + params.read(dis); + this.l2i = new Long2Int(params.size()); + DB.println("parsing -- li size "+l2i.size()); + + + pipe.extractor = new Extractor[THREADS]; + + boolean stack = dis.readBoolean(); + + options.featureCreation=dis.readInt(); + + for (int t=0;t<THREADS;t++) pipe.extractor[t]=new Extractor(l2i, stack,options.featureCreation); + DB.println("Stacking "+stack); + + 
Extractor.initFeatures(); + Extractor.initStat(options.featureCreation); + + + for (int t=0;t<THREADS;t++) pipe.extractor[t].init(); + + Edges.read(dis); + + options.decodeProjective = dis.readBoolean(); + + Extractor.maxForm = dis.readInt(); + + boolean foundInfo =false; + try { + String info =null; + int icnt = dis.readInt(); + for(int i=0;i<icnt;i++) { + info = dis.readUTF(); + System.out.println(info); + } + } catch (Exception e) { + if (!foundInfo) System.out.println("no info about training"); + } + + + dis.close(); + + DB.println("Reading data finnished"); + + Decoder.NON_PROJECTIVITY_THRESHOLD =(float)options.decodeTH; + + Extractor.initStat(options.featureCreation); + + } + + + + /** + * Do the training + * @param instanceLengths + * @param options + * @param pipe + * @param params + * @param is + * @param cluster + * @throws IOException + * @throws InterruptedException + * @throws ClassNotFoundException + */ + public void train(OptionsSuper options, Pipe pipe, ParametersFloat params, Instances is, Cluster cluster) + throws IOException, InterruptedException, ClassNotFoundException { + + + DB.println("\nTraining Information "); + DB.println("-------------------- "); + + + Decoder.NON_PROJECTIVITY_THRESHOLD =(float)options.decodeTH; + + if (options.decodeProjective) System.out.println("Decoding: "+(options.decodeProjective?"projective":"non-projective")); + else System.out.println(""+Decoder.getInfo()); + int numInstances = is.size(); + + int maxLenInstances =0; + for(int i=0;i<numInstances;i++) if (maxLenInstances<is.length(i)) maxLenInstances=is.length(i); + + DataFES data = new DataFES(maxLenInstances, pipe.mf.getFeatureCounter().get(PipeGen.REL).shortValue()); + + int iter = 0; + int del=0; + float error =0; + float f1=0; + + FV pred = new FV(); + FV act = new FV(); + + double upd = (double)(numInstances*options.numIters)+1; + + for(; iter < options.numIters; iter++) { + + System.out.print("Iteration "+iter+": "); + + long start = 
System.currentTimeMillis(); + + long last= System.currentTimeMillis(); + error=0; + f1=0; + for(int n = 0; n < numInstances; n++) { + + upd--; + + if (is.labels[n].length>options.maxLen) continue; + + String info = " td "+((Decoder.timeDecotder)/1000000F)+" tr "+((Decoder.timeRearrange)/1000000F) + +" te "+((Pipe.timeExtract)/1000000F); + + if((n+1) %500 == 0) del= PipeGen.outValueErr(n+1, error,f1/n,del, last, upd,info); + + short pos[] = is.pposs[n]; + + data = pipe.fillVector((F2SF)params.getFV(), is, n, data, cluster); + + Parse d = Decoder.decode(pos, data, options.decodeProjective, Decoder.TRAINING); + + double e= pipe.errors(is, n ,d); + + if (d.f1>0)f1+=d.f1; + + if (e<=0) continue; + + pred.clear(); + pipe.extractor[0].encodeCat(is,n,pos,is.forms[n],is.plemmas[n],d.heads, d.labels, is.feats[n],pipe.cl, pred); + + error += e; + + params.getFV(); + + + act.clear(); + pipe.extractor[0].encodeCat(is,n,pos,is.forms[n],is.plemmas[n],is.heads[n], is.labels[n], is.feats[n],pipe.cl, act); + + params.update(act, pred, is, n, d, upd,e); + } + + String info = " td "+((Decoder.timeDecotder)/1000000F)+" tr "+((Decoder.timeRearrange)/1000000F) + +" te "+((Pipe.timeExtract)/1000000F)+" nz "+params.countNZ(); + PipeGen.outValueErr(numInstances, error,f1/numInstances,del,last, upd,info); + del=0; + long end = System.currentTimeMillis(); + System.out.println(" time:"+(end-start)); + + + ParametersFloat pf = params.average2((iter+1)*is.size()); + try { + + if (options.testfile!=null && options.goldfile!=null) { + out (options, pipe, pf, ! 
MAX_INFO,false); + ParserEvaluator.evaluate(options.goldfile, options.outfile); + // writeModell(options, pf, ""+(iter+1),pipe.cl); + } + + + } catch (Exception e) { + e.printStackTrace(); + } + + if (error==0) { + DB.println("stopped because learned all lessons"); + break; + } + + Decoder.timeDecotder=0;Decoder.timeRearrange=0; Pipe.timeExtract=0; + + + } + if (options.average)params.average(iter*is.size()); + } + + + /** + * Do the parsing job + * + * @param options + * @param pipe + * @param params + * @throws IOException + */ + private void out (OptionsSuper options, Pipe pipe, ParametersFloat params, boolean maxInfo, boolean labelOnly) + throws Exception { + + long start = System.currentTimeMillis(); + + CONLLReader09 depReader = new CONLLReader09(options.testfile, options.formatTask); + CONLLWriter09 depWriter = new CONLLWriter09(options.outfile, options.formatTask); + + int cnt = 0; + int del=0; + long last = System.currentTimeMillis(); + + if (maxInfo) System.out.println("\nParsing Information "); + if (maxInfo) System.out.println("------------------- "); + + if (maxInfo && !options.decodeProjective) System.out.println(""+Decoder.getInfo()); + + System.out.print("Processing Sentence: "); + + while(true) { + + // Instances is = new Instances(); + // is.init(1, new MFO(),options.formatTask); + + // SentenceData09 instance = pipe.nextInstance(is, depReader); + + SentenceData09 instance = depReader.getNext(); + if (instance==null) break; + cnt++; + + SentenceData09 i09 = this.parse(instance,params, labelOnly,options); + + depWriter.write(i09); + del=PipeGen.outValue(cnt, del,last); + + } + //pipe.close(); + depWriter.finishWriting(); + long end = System.currentTimeMillis(); + // DB.println("errors "+error); + if (maxInfo) System.out.println("Used time " + (end-start)); + if (maxInfo) System.out.println("forms count "+Instances.m_count+" unkown "+Instances.m_unkown); + + } + + /** + * Parse a single sentence + * + * @param instance + * @param params + * @param 
labelOnly + * @param options + * @return + */ + public SentenceData09 parse (SentenceData09 instance, ParametersFloat params, boolean labelOnly, OptionsSuper options) { + + String[] types = new String[pipe.mf.getFeatureCounter().get(PipeGen.REL)]; + for (Entry<String, Integer> e : MFO.getFeatureSet().get(PipeGen.REL).entrySet()) types[e.getValue()] = e.getKey(); + + is = new Instances(); + is.init(1, new MFO(),options.formatTask); + new CONLLReader09().insert(is, instance); + + // use for the training ppos + + SentenceData09 i09 = new SentenceData09(instance); + i09.createSemantic(instance); + + if (labelOnly) { + F2SF f2s =params.getFV(); + + // repair pheads + + is.pheads[0]= is.heads[0]; + + for(int l=0;l<is.pheads[0].length;l++) { + if (is.pheads[0][l]<0)is.pheads[0][l]=0; + } + + short[] labels = pipe.extractor[0].searchLabel(is, 0, is.pposs[0], is.forms[0], is.plemmas[0], is.pheads[0], is.plabels[0], is.feats[0], pipe.cl, f2s); + + for(int j = 0; j < instance.forms.length-1; j++) { + i09.plabels[j] = types[labels[j+1]]; + i09.pheads[j] = is.pheads[0][j+1]; + } + return i09; + } + + if (options.maxLength > instance.length() && options.minLength <= instance.length()) { + try { + // System.out.println("prs "+instance.forms[0]); + // System.out.println("prs "+instance.toString()); + d2 = pipe.fillVector(params.getFV(), is,0,null,pipe.cl);//cnt-1 + d =Decoder.decode(is.pposs[0],d2,options.decodeProjective, !Decoder.TRAINING); //cnt-1 + + }catch (Exception e) { + e.printStackTrace(); + } + + for(int j = 0; j < instance.forms.length-1; j++) { + i09.plabels[j] = types[d.labels[j+1]]; + i09.pheads[j] = d.heads[j+1]; + } + } + return i09; + + } + + is2.io.CONLLReader09 reader = new is2.io.CONLLReader09(true); + /* (non-Javadoc) + * @see is2.tools.Tool#apply(is2.data.SentenceData09) + */ + @Override + public SentenceData09 apply(SentenceData09 snt09) { + + SentenceData09 it = new SentenceData09(); + it.createWithRoot(snt09); + + SentenceData09 out=null; + try { + + + // 
for(int k=0;k<it.length();k++) { + // it.forms[k] = reader.normalize(it.forms[k]); + // it.plemmas[k] = reader.normalize(it.plemmas[k]); + // } + + out = parse(it,this.params,false,options); + + + } catch(Exception e) { + e.printStackTrace(); + } + + Decoder.executerService.shutdown(); + Pipe.executerService.shutdown(); + + return out; + } + + /** + * Get the edge scores of the last parse. + * @return the scores + */ + public float[] getInfo() { + + + float[] scores = new float[is.length(0)]; + Extractor.encode3(is.pposs[0], d.heads, d.labels, d2,scores); + + return scores; + } + + + /** + * Write the parsing model + * + * @param options + * @param params + * @param extension + * @throws FileNotFoundException + * @throws IOException + */ + private void writeModell(OptionsSuper options, ParametersFloat params, String extension, Cluster cs) throws FileNotFoundException, IOException { + + String name = extension==null?options.modelName:options.modelName+extension; + // System.out.println("Writting model: "+name); + ZipOutputStream zos = new ZipOutputStream(new BufferedOutputStream(new FileOutputStream(name))); + zos.putNextEntry(new ZipEntry("data")); + DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(zos)); + + MFO.writeData(dos); + cs.write(dos); + + params.write(dos); + + dos.writeBoolean(options.stack); + dos.writeInt(options.featureCreation); + + + Edges.write(dos); + + dos.writeBoolean(options.decodeProjective); + + dos.writeInt(Extractor.maxForm); + + dos.writeInt(5); // Info count + dos.writeUTF("Used parser "+Parser.class.toString()); + dos.writeUTF("Creation date "+(new SimpleDateFormat("yyyy.MM.dd HH:mm:ss")).format(new Date())); + dos.writeUTF("Training data "+options.trainfile); + dos.writeUTF("Iterations "+options.numIters+" Used sentences "+options.count); + dos.writeUTF("Cluster "+options.clusterFile); + + dos.flush(); + dos.close(); + } + + + @Override + public boolean retrain(SentenceData09 sentence, float upd, int iterations) { + 
+ params.total = params.parameters; + + boolean done=false; + + for(int k=0;k<iterations;k++) { + try { + // create the data structure + DataFES data = new DataFES(sentence.length(), pipe.mf.getFeatureCounter().get(PipeGen.REL).shortValue()); + + + Instances is = new Instances(); + is.m_encoder =pipe.mf; + + + + is.init(1, pipe.mf,options.formatTask); + new CONLLReader09().insert(is, sentence); + + // String list[] = ((MFO)is.m_encoder).reverse(((MFO)is.m_encoder).getFeatureSet().get(Pipe.POS)); + // for(String s :list) { + // System.out.println(s+" "); + // } + + // for(int i=0;i<is.length(0);i++) { + + // System.out.printf("%d\t %d\t %d \n",i,is.forms[0][i],is.pposs[0][i] ); + // System.out.printf("%s\t form:%s pos:%s\n",i,sentence.forms[i],sentence.ppos[i]); + + // } + + SentenceData09 i09 = new SentenceData09(sentence); + i09.createSemantic(sentence); + + + + // create the weights + data = pipe.fillVector((F2SF)params.getFV(), is, 0, data, pipe.cl); + + short[] pos = is.pposs[0]; + + // parse the sentence + Parse d = Decoder.decode(pos, data, options.decodeProjective, Decoder.TRAINING); + + // training successful? 
+ double e= pipe.errors(is, 0 ,d); + // System.out.println("errors "+e); + if (e==0) { + + + done= true; + break; + } + + // update the weight vector + FV pred = new FV(); + pipe.extractor[0].encodeCat(is,0,pos,is.forms[0],is.plemmas[0],d.heads, d.labels, is.feats[0],pipe.cl, pred); + + params.getFV(); + + FV act = new FV(); + pipe.extractor[0].encodeCat(is,0,pos,is.forms[0],is.plemmas[0],is.heads[0], is.labels[0], is.feats[0],pipe.cl, act); + + params.update(act, pred, is, 0, d, upd,e); + + if (upd >0)upd--; + + } catch(Exception e) { + e.printStackTrace(); + } + } + Decoder.executerService.shutdown(); + Pipe.executerService.shutdown(); + + + return done; + } + + + @Override + public boolean retrain(SentenceData09 sentence, float upd, int iterations, boolean print) { + // TODO Auto-generated method stub + return retrain( sentence, upd, iterations); + } +} \ No newline at end of file diff --git a/dependencyParser/basic/mate-tools/src/is2/parser/Pipe.java b/dependencyParser/basic/mate-tools/src/is2/parser/Pipe.java new file mode 100755 index 0000000..13e9389 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/parser/Pipe.java @@ -0,0 +1,221 @@ +package is2.parser; + +import is2.data.Cluster; +import is2.data.DataF; +import is2.data.DataFES; +import is2.data.F2SF; +import is2.data.Instances; + +import is2.data.Parse; +import is2.data.PipeGen; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; + +import is2.util.OptionsSuper; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.concurrent.ExecutorService; + +final public class Pipe extends PipeGen { + + public Extractor[] extractor; + final public MFO mf = new MFO(); + + public Cluster cl; + + + private OptionsSuper options; + public static long timeExtract; + + public Pipe(OptionsSuper o) { + options = o; + } + + public void createInstances(String file, Instances is) + throws Exception { + + CONLLReader09 depReader = new CONLLReader09(file); + + 
mf.register(REL,"<root-type>"); + + // register at least one predicate since the parsing data might not contain predicates as in + // the Japaness corpus but the development sets contains some + + long sl=0; + + System.out.print("Registering feature parts of sentence: "); + int ic = 0; + int del = 0; + while (true) { + SentenceData09 instance = depReader.getNext(); + if (instance == null) break; + ic++; + + sl+=instance.labels.length; + + if (ic % 1000 == 0) { + del = outValue(ic, del); + } + + String[] labs1 = instance.labels; + for (int i1 = 0; i1 < labs1.length; i1++) mf.register(REL, labs1[i1]); + + String[] w = instance.forms; + for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, depReader.normalize(w[i1])); + + w = instance.plemmas; + for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, depReader.normalize(w[i1])); + + + w = instance.ppos; + for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); + + w = instance.gpos; + for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); + + if (instance.feats !=null) { + String fs[][] = instance.feats; + for (int i1 = 0; i1 < fs.length; i1++){ + w =fs[i1]; + if (w==null) continue; + for (int i2 = 0; i2 < w.length; i2++) mf.register(FEAT, w[i2]); + } + } + + if ((ic-1)>options.count) break; + } + del = outValue(ic, del); + + System.out.println(); + Extractor.initFeatures(); + + Extractor.maxForm = mf.getFeatureCounter().get(WORD); + + if (options.clusterFile==null)cl = new Cluster(); + else cl= new Cluster(options.clusterFile, mf,6); + + + + mf.calculateBits(); + Extractor.initStat(options.featureCreation); + + System.out.println(""+mf.toString()); + + for(Extractor e : extractor) e.init(); + + depReader.startReading(file); + + int num1 = 0; + + is.init(ic, new MFO()); + + Edges.init(mf.getFeatureCounter().get(POS)); + + + System.out.print("Creating edge filters and read corpus: "); + del = 0; + + while (true) { + if (num1 % 100 == 0) del = outValue(num1, del); + + SentenceData09 instance1 = 
depReader.getNext(is); + + if (instance1 == null) break; + + int last = is.size() - 1; + short[] pos =is.pposs[last]; + + for (int k = 0; k < is.length(last); k++) { + if (is.heads[last][k] < 0) continue; + Edges.put(pos[is.heads[last][k]],pos[k], is.labels[last][k]); +// Edges.put(pos[k],pos[is.heads[last][k]], is.labels[last][k]); + } + + if (!options.allFeatures && num1 > options.count) break; + + num1++; + + } + del = outValue(num1, del); + System.out.println(); + Edges.findDefault(); + } + + + /** + * Creates an instance for outputParses + * + * @param is + * @return + * @throws IOException + */ + protected final SentenceData09 nextInstance(Instances is, CONLLReader09 depReader) throws Exception { + + SentenceData09 instance = depReader.getNext(is); + if (instance == null || instance.forms == null) return null; + + return instance; + } + + public static ExecutorService executerService =java.util.concurrent.Executors.newFixedThreadPool(Parser.THREADS); + + + public DataFES fillVector(F2SF params, Instances is,int inst, DataFES d, Cluster cluster) throws InterruptedException { + + long ts = System.nanoTime(); + + if (executerService.isShutdown()) executerService =java.util.concurrent.Executors.newCachedThreadPool(); + + + final int length = is.length(inst); + if (d ==null || d.len<length)d = new DataFES(length,mf.getFeatureCounter().get(PipeGen.REL).shortValue()); + + ArrayList<ParallelExtract> pe = new ArrayList<ParallelExtract>(); + for(int i=0;i<Parser.THREADS;i++) pe.add(new ParallelExtract(extractor[i],is, inst, d, (F2SF)params.clone(), cluster)); + + for (int w1 = 0; w1 < length; w1++) { + for (int w2 =w1+1; w2 < length; w2++) { + + if (w1==w2) continue; + + ParallelExtract.add(w1, w2); + + + } + } +// for(int i=0;i<efp.length;i++) efp[i].start(); +// for(int i=0;i<efp.length;i++) efp[i].join(); + executerService.invokeAll( pe); + + timeExtract += (System.nanoTime()-ts); + + + + + + + return d; + } + + public double errors( Instances is, int ic, Parse p) { 
+ short[] act = is.heads[ic]; + double correct = 0; + + // do not count root + for(int i = 1; i < act.length; i++) { + + // if (is.ppos[ic] ==null ) System.out.println("mf null"+is.ppos[ic][i]); + if (p.heads[i]==act[i] ){ + correct+=0.5; + if (p.labels[i]==is.labels[ic][i] ) correct+=0.5; + } + } + + double x = ((double)act.length- 1 - correct ); + + p.f1 = (double)correct / (double)(act.length-1); + + return x; + } +} diff --git a/dependencyParser/mate-tools/classes/is2/parser/package.html b/dependencyParser/basic/mate-tools/src/is2/parser/package.html index a4f40a2..a4f40a2 100755 --- a/dependencyParser/mate-tools/classes/is2/parser/package.html +++ b/dependencyParser/basic/mate-tools/src/is2/parser/package.html diff --git a/dependencyParser/basic/mate-tools/src/is2/parserR2/Decoder.java b/dependencyParser/basic/mate-tools/src/is2/parserR2/Decoder.java new file mode 100755 index 0000000..1f0424e --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/parserR2/Decoder.java @@ -0,0 +1,377 @@ +package is2.parserR2; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.concurrent.ExecutorService; + +import decoder.ParallelDecoder; +import decoder.ParallelRearrangeNBest; +import decoder.ParallelRearrangeNBest2; +import extractors.Extractor; + + +import is2.data.Closed; +import is2.data.DataF; +import is2.data.Instances; +import is2.data.Open; +import is2.data.Parse; +import is2.data.ParseNBest; +import is2.util.DB; + + +/** + * @author Bernd Bohnet, 01.09.2009 + * + * This methods do the actual work and they build the dependency trees. 
+ */ +final public class Decoder { + + public static final boolean TRAINING = true; + public static long timeDecotder; + public static long timeRearrange; + + public static final boolean LAS = true; + + /** + * Threshold for rearrange edges non-projective + */ + public static float NON_PROJECTIVITY_THRESHOLD = 0.3F; + + public static ExecutorService executerService =java.util.concurrent.Executors.newFixedThreadPool(Parser.THREADS); + + + // do not initialize + private Decoder() {}; + + + /** + * Build a dependency tree based on the data + * @param pos part-of-speech tags + * @param x the data + * @param projective projective or non-projective + * @param edges the edges + * @return a parse tree + * @throws InterruptedException + */ + public static List<ParseNBest> decode(short[] pos, DataF x, boolean projective, Extractor extractor) throws InterruptedException { + + long ts = System.nanoTime(); + + if (executerService.isShutdown()) executerService = java.util.concurrent.Executors.newCachedThreadPool(); + final int n = pos.length; + + final Open O[][][][] = new Open[n][n][2][]; + final Closed C[][][][] = new Closed[n][n][2][]; + + ArrayList<ParallelDecoder> pe = new ArrayList<ParallelDecoder>(); + + for(int i=0;i<Parser.THREADS ;i++) pe.add(new ParallelDecoder(pos, x, O, C, n)); + + for (short k = 1; k < n; k++) { + + // provide the threads the data + for (short s = 0; s < n; s++) { + short t = (short) (s + k); + if (t >= n) break; + + ParallelDecoder.add(s,t); + } + + executerService.invokeAll(pe); + } + + double bestSpanScore = (-1.0F / 0.0F); + Closed bestSpan = null; + for (int m = 1; m < n; m++) + if (C[0][n - 1][1][m].p > bestSpanScore) { + bestSpanScore = C[0][n - 1][1][m].p; + bestSpan = C[0][n - 1][1][m]; + } + + // build the dependency tree from the chart + ParseNBest out= new ParseNBest(pos.length); + + bestSpan.create(out); + + out.heads[0]=-1; + out.labels[0]=0; + bestProj=out; + + timeDecotder += (System.nanoTime()-ts); + // DB.println(""+out); + + ts = 
System.nanoTime(); + List<ParseNBest> parses; + + if (!projective) { + + // if (training) + // rearrange(pos, out.heads, out.types,x,training); + //else { + // DB.println("bestSpan score "+(float)bestSpan.p+" comp score "+Extractor.encode3(pos, out.heads, out.types, x)); + // System.out.println(); + // Parse best = new Parse(out.heads,out.types,Extractor.encode3(pos, out.heads, out.types, x)); + parses = rearrangeNBest(pos, out.heads, out.labels,x,extractor); + // DB.println("1best "+parses.get(0).f1); + // DB.println(""+parses.get(0).toString()); + + + // for(ParseNBest p :parses) if (p.heads==null) p.signature2parse(p.signature()); + + /// if (parses.get(0).f1>(best.f1+NON_PROJECTIVITY_THRESHOLD)) out = parses.get(0); + // else out =best; + + // } + } else { + parses = new ArrayList<ParseNBest>(); + parses.add(out); + } + timeRearrange += (System.nanoTime()-ts); + + return parses; + } + + static Parse bestProj = null; + + + + + /** + * This is the parallel non-projective edge re-arranger + * + * @param pos part-of-speech tags + * @param heads parent child relation + * @param labs edge labels + * @param x the data + * @param edges the existing edges defined by part-of-speech tags + * @throws InterruptedException + */ + public static List<ParseNBest> rearrangeNBestP(short[] pos, short[] heads, short[] labs, DataF x, Extractor extractor) throws InterruptedException { + + ArrayList<ParallelRearrangeNBest2> pe = new ArrayList<ParallelRearrangeNBest2>(); + + int round =0; + ArrayList<ParseNBest> parses = new ArrayList<ParseNBest>(); + ParseNBest px =new ParseNBest(); + px.signature(heads,labs); + //Object extractor; + px.f1=extractor.encode3(pos, heads, labs, x); + parses.add(px); + + float lastNBest = Float.NEGATIVE_INFINITY; + + HashSet<Parse> done = new HashSet<Parse>(); + gnu.trove.THashSet<CharSequence> contained = new gnu.trove.THashSet<CharSequence>(); + + while(true) { + + pe.clear(); + + // used the first three parses + int ic=0, considered=0; + while(true) { 
+ + if (parses.size()<=ic || considered>11) break; + + ParseNBest parse = parses.get(ic); + + ic++; + // parse already extended + if (done.contains(parse)) continue; + considered++; + + parse.signature2parse(parse.signature()); + + done.add(parse); + + + boolean[][] isChild = new boolean[heads.length][heads.length]; + + for(int i = 1, l1=1; i < heads.length; i++,l1=i) + while((l1= heads[l1]) != -1) isChild[l1][i] = true; + + + // check the list of new possible parents and children for a better combination + for(short ch = 1; ch < heads.length; ch++) { + for(short pa = 0; pa < heads.length; pa++) { + if(ch == pa || pa == heads[ch] || isChild[ch][pa]) continue; + ParallelRearrangeNBest2.add(parse.clone(), ch, pa); + } + } + + } + + for(int t =0;t<Parser.THREADS;t++) pe.add(new ParallelRearrangeNBest2( pos,x,lastNBest,extractor, NON_PROJECTIVITY_THRESHOLD) ); + + + executerService.invokeAll(pe); + + // avoid to add parses several times + for(ParallelRearrangeNBest2 rp : pe) { + for(int k=rp.parses.size()-1;k>=0;k--) { + if (lastNBest>rp.parses.get(k).f1) continue; + CharSequence sig = rp.parses.get(k).signature(); + if (!contained.contains(sig)) { + parses.add(rp.parses.get(k)); + contained.add(sig); + } + } + } + + Collections.sort(parses); + + if (round >=2) break; + round ++; + + // do not use to much memory + if (parses.size()>Parser.NBest) { + // if (parses.get(Parser.NBest).f1>lastNBest) lastNBest = (float)parses.get(Parser.NBest).f1; + parses.subList(Parser.NBest, parses.size()-1).clear(); + } + } + return parses; + } + + + /** + * This is the parallel non-projective edge re-arranger + * + * @param pos part-of-speech tags + * @param heads parent child relation + * @param labs edge labels + * @param x the data + * @param edges the existing edges defined by part-of-speech tags + * @throws InterruptedException + */ + public static List<ParseNBest> rearrangeNBest(short[] pos, short[] heads, short[] labs, DataF x, Extractor extractor) throws InterruptedException { + 
+ ArrayList<ParallelRearrangeNBest> pe = new ArrayList<ParallelRearrangeNBest>(); + + int round =0; + ArrayList<ParseNBest> parses = new ArrayList<ParseNBest>(); + ParseNBest px =new ParseNBest(); + px.signature(heads,labs); + //Object extractor; + px.f1=extractor.encode3(pos, heads, labs, x); + parses.add(px); + + float lastNBest = Float.NEGATIVE_INFINITY; + + HashSet<Parse> done = new HashSet<Parse>(); + gnu.trove.THashSet<CharSequence> contained = new gnu.trove.THashSet<CharSequence>(); + while(true) { + + pe.clear(); + + // used the first three parses + int i=0; + while(true) { + + if (parses.size()<=i||pe.size()>12) break; + + ParseNBest parse = parses.get(i); + + i++; + + // parse already extended + if (done.contains(parse)) continue; + +// DB.println("err "+parse.heads); + + parse.signature2parse(parse.signature()); + + done.add(parse); + pe.add(new ParallelRearrangeNBest( pos,x,parse,lastNBest,extractor, (float)parse.f1,NON_PROJECTIVITY_THRESHOLD) ); + } + + executerService.invokeAll(pe); + + // avoid to add parses several times + for(ParallelRearrangeNBest rp : pe) { + for(int k=rp.parses.size()-1;k>=0;k--) { + if (lastNBest>rp.parses.get(k).f1) continue; + CharSequence sig = rp.parses.get(k).signature(); + if (!contained.contains(sig)) { + parses.add(rp.parses.get(k)); + contained.add(sig); + } + } + } + + Collections.sort(parses); + + if (round >=2) break; + round ++; + + // do not use to much memory + if (parses.size()>Parser.NBest) { + if (parses.get(Parser.NBest).f1>lastNBest) lastNBest = (float)parses.get(Parser.NBest).f1; + parses.subList(Parser.NBest, parses.size()-1).clear(); + } + } + return parses; + } + + public static String getInfo() { + + return "Decoder non-projectivity threshold: "+NON_PROJECTIVITY_THRESHOLD; + } + + + /** + * @param parses + * @param is + * @param i + * @return + */ + public static int getGoldRank(List<ParseNBest> parses, Instances is, int i, boolean las) { + + for(int p=0;p<parses.size();p++) { + + if 
(parses.get(p).heads==null)parses.get(p).signature2parse(parses.get(p).signature()); + + boolean eq =true; + for(int w =1;w<is.length(0);w++) { + if (is.heads[i][w]!=parses.get(p).heads[w] || (is.labels[i][w]!=parses.get(p).labels[w]&& las )) { + eq=false; + break; + } + } + if (eq) return p; + } + return -1; + } + + public static int getSmallestError(List<ParseNBest> parses, Instances is, int i, boolean las) { + + int smallest=-1; + for(int p=0;p<parses.size();p++) { + + int err=0; + for(int w =1;w<is.length(0);w++) { + if (is.heads[i][w]!=parses.get(p).heads[w] || (is.labels[i][w]!=parses.get(p).labels[w] && las )) { + err++; + } + } + if (smallest==-1||smallest>err) smallest=err; + if (smallest==0) return 0; + } + return smallest; + } + + public static int getError(ParseNBest parse, Instances is, int i, boolean las) { + + + int err=0; + for(int w =1;w<is.length(i);w++) { + if (is.heads[i][w]!=parse.heads[w] || (is.labels[i][w]!=parse.labels[w] && las )) { + err++; + } + } + return err; + } + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/parserR2/Options.java b/dependencyParser/basic/mate-tools/src/is2/parserR2/Options.java new file mode 100755 index 0000000..b5ec0f9 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/parserR2/Options.java @@ -0,0 +1,77 @@ +package is2.parserR2; + +import is2.util.OptionsSuper; + + +public final class Options extends OptionsSuper { + + + int start=0, end=0; + String prefix_model ="m"; + String prefix_test ="t"; + + public Options (String[] args) { + + for(int i = 0; i < args.length; i++) { + + if (args[i].equals("--help")) explain(); + + if (args[i].equals("-decode")) { + decodeProjective = args[i+1].equals("proj"); i++; + } else if (args[i].equals("-decodeTH")) { + decodeTH = Double.parseDouble(args[i+1]); i++; + } else if (args[i].equals("-nonormalize")) { + normalize=false; + } else if (args[i].equals("-features")) { + features= args[i+1]; i++; + } else if (args[i].equals("-hsize")) { + hsize= 
Integer.parseInt(args[i+1]); i++; + } else if (args[i].equals("-len")) { + maxLen= Integer.parseInt(args[i+1]); i++; + } else if (args[i].equals("-cores")) { + cores= Integer.parseInt(args[i+1]); i++; + } else if (args[i].equals("-best")) { + best= Integer.parseInt(args[i+1]); i++; + } else if (args[i].equals("-start")) { + start= Integer.parseInt(args[i+1]); i++; + } else if (args[i].equals("-end")) { + end= Integer.parseInt(args[i+1]); i++; + } else if (args[i].equals("-prefix-model")) { + prefix_model= args[i+1]; i++; + } else if (args[i].equals("-prefix-test")) { + prefix_test= args[i+1]; i++; + } else if (args[i].equals("-mapping")) { + this.useMapping= args[i+1]; i++; + } else if (args[i].equals("-no2nd")) { + no2nd= true; + } else if (args[i].equals("-few2nd")) { + few2nd= true; + } else super.addOption(args, i); + + } + + + + } + + private void explain() { + System.out.println("Usage: "); + System.out.println("java -class mate.jar is2.parser.Parser [Options]"); + System.out.println(); + System.out.println("Example: "); + System.out.println(" java -class mate.jar is2.parser.Parser -model eps3.model -train corpora/conll08st/train/train.closed -test corpora/conll08st/devel/devel.closed -out b3.test -eval corpora/conll08st/devel/devel.closed -count 2000 -i 6"); + System.out.println(""); + System.out.println("Options:"); + System.out.println(""); + System.out.println(" -train <file> the corpus a model is trained on; default "+this.trainfile); + System.out.println(" -test <file> the input corpus for testing; default "+this.testfile); + System.out.println(" -out <file> the output corpus (result) of a test run; default "+this.outfile); + System.out.println(" -model <file> the parsing model for traing the model is stored in the files"); + System.out.println(" and for parsing the model is load from this file; default "+this.modelName); + System.out.println(" -i <number> the number of training iterations; good numbers are 10 for smaller corpora and 6 for bigger; 
default "+this.numIters); + System.out.println(" -count <number> the n first sentences of the corpus are take for the training default "+this.count); + System.out.println(" -format <number> conll format of the year 8 or 9; default "+this.formatTask); + + System.exit(0); + } +} diff --git a/dependencyParser/basic/mate-tools/src/is2/parserR2/Parameters.java b/dependencyParser/basic/mate-tools/src/is2/parserR2/Parameters.java new file mode 100755 index 0000000..0917ea8 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/parserR2/Parameters.java @@ -0,0 +1,38 @@ +/** + * + */ +package is2.parserR2; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; + +import is2.data.FV; +import is2.data.IFV; +import is2.data.Instances; +import is2.data.Parse; + +/** + * @author Bernd Bohnet, 31.08.2009 + * + * + */ +public abstract class Parameters { + + + public abstract void average(double avVal); + + public abstract void update(FV act, FV pred, Instances isd, int instc, Parse d, double upd, double e); + + public abstract void write(DataOutputStream dos) throws IOException; + + public abstract void read(DataInputStream dis ) throws IOException; + + public abstract int size(); + + /** + * @return + */ + public abstract IFV getFV() ; + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/parserR2/ParametersFloat.java b/dependencyParser/basic/mate-tools/src/is2/parserR2/ParametersFloat.java new file mode 100755 index 0000000..44e6d76 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/parserR2/ParametersFloat.java @@ -0,0 +1,181 @@ +package is2.parserR2; + +import is2.data.F2SF; +import is2.data.FV; +import is2.data.FVR; +import is2.data.Instances; +import is2.data.Parse; +import is2.util.DB; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; + + + +final public class ParametersFloat extends Parameters { + + public float[] parameters; + private float[] total; + + 
public ParametersFloat(int size) { + parameters = new float[size]; + total = new float[size]; + for(int i = 0; i < parameters.length; i++) { + parameters[i] = 0F; + total[i] = 0F; + } + } + + + /** + * @param parameters2 + */ + public ParametersFloat(float[] p) { + parameters =p; + } + + + @Override + public void average(double avVal) { + for(int j = 0; j < total.length; j++) { + parameters[j] = total[j]/((float)avVal); + } + total =null; + } + + public ParametersFloat average2(double avVal) { + float[] px = new float[this.parameters.length]; + for(int j = 0; j < total.length; j++) { + px[j] = total[j]/((float)avVal); + } + ParametersFloat pf = new ParametersFloat(px); + return pf; + } + + + public void update(FV act, FV pred, Instances isd, int instc, Parse dx, double upd, double e, + float d, float f) { + + e++; + + float lam_dist = d - f; + + float b = (float)e-lam_dist; + + FV dist = act.getDistVector(pred); + + dist.update(parameters, total, hildreth(dist,b), upd,false); + } + + public void update(FV act, FV pred, Instances isd, int instc, Parse dx, double upd, double e) { + + e++; + + float lam_dist = getScore(act) - getScore(pred); + + float b = (float)e-lam_dist; + + FV dist = act.getDistVector(pred); + + dist.update(parameters, total, hildreth(dist,b), upd,false); + } + + public void update(FVR act, FVR pred, Instances isd, int instc, Parse dx, double upd, double e, float lam_dist) { + + e++; + + + float b = (float)e-lam_dist; + + FVR dist = act.getDistVector(pred); + + dist.update(parameters, total, hildreth(dist,b), upd,false); + } + + + protected double hildreth(FV a, double b) { + + double A = a.dotProduct(a); + if (A<=0.0000000000000000001) return 0.0; + return b/A; + } + + + protected double hildreth(FVR a, double b) { + + double A = a.dotProduct(a); + if (A<=0.0000000000000000001) return 0.0; + return b/A; + } + + + public float getScore(FV fv) { + if (fv ==null) return 0.0F; + return fv.getScore(parameters,false); + + } + + public float 
getScore(FVR fv) { //xx + if (fv ==null) return 0.0F; + return fv.getScore(parameters,false); + + } + + + @Override + final public void write(DataOutputStream dos) throws IOException{ + + dos.writeInt(parameters.length); + for(float d : parameters) dos.writeFloat(d); + + } + + @Override + public void read(DataInputStream dis ) throws IOException{ + + parameters = new float[dis.readInt()]; + int notZero=0; + for(int i=0;i<parameters.length;i++) { + parameters[i]=dis.readFloat(); + if (parameters[i]!=0.0F) notZero++; + } + + + DB.println("read parameters "+parameters.length+" not zero "+notZero); + + } + + public int countNZ() { + + int notZero=0; + for(int i=0;i<parameters.length;i++) { + if (parameters[i]!=0.0F) notZero++; + } + return notZero; + + // DB.println("read parameters "+parameters.length+" not zero "+notZero); + + } + + + /* (non-Javadoc) + * @see is2.sp09k99995.Parameters#getFV() + */ + @Override + public F2SF getFV() { + return new F2SF(parameters); + } + + + /* (non-Javadoc) + * @see is2.sp09k99999.Parameters#size() + */ + @Override + public int size() { + return parameters.length; + } + + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/parserR2/Parser.java b/dependencyParser/basic/mate-tools/src/is2/parserR2/Parser.java new file mode 100755 index 0000000..b0cfe9e --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/parserR2/Parser.java @@ -0,0 +1,690 @@ +package is2.parserR2; + + +import is2.data.Cluster; +import is2.data.DataF; +import is2.data.Edges; +import is2.data.F2SF; +import is2.data.FV; +import is2.data.Instances; +import is2.data.Long2Int; +import is2.data.Long2IntInterface; +import is2.data.MFB; +import is2.data.Parse; +import is2.data.ParseNBest; +import is2.data.PipeGen; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; +import is2.io.CONLLWriter09; + +import is2.tools.Tool; +import is2.util.DB; +import is2.util.OptionsSuper; +import is2.util.ParserEvaluator; + +import java.io.BufferedInputStream; 
+import java.io.BufferedOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map.Entry; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; +import java.util.zip.ZipOutputStream; + +//import extractors.ExtractorClusterStackedR2; +import extractors.Extractor; +import extractors.ExtractorFactory; + + + +public class Parser implements Tool { + + // output evaluation info + private static final boolean MAX_INFO = true; + + public static int THREADS =4; + + Long2IntInterface l2i; + ParametersFloat params; + Pipe pipe; + OptionsSuper options; + + HashMap<Integer,Integer> rank = new HashMap<Integer,Integer>(); + int amongxbest=0, amongxbest_ula=0, nbest=0,bestProj=0, smallestErrorSum=0, countAllNodes=0; + static int NBest =1000; + + ExtractorFactory extractorFactory = new ExtractorFactory(ExtractorFactory.StackedClusteredR2); + + + /** + * Initialize the parser + * @param options + */ + public Parser (OptionsSuper options) { + + this.options=options; + pipe = new Pipe(options); + + params = new ParametersFloat(0); + + // load the model + try { + readModel(options, pipe, params); + } catch (Exception e) { + e.printStackTrace(); + } + + } + + + /** + * @param modelFileName The file name of the parsing model + */ + public Parser(String modelFileName) { + this(new Options(new String[]{"-model",modelFileName})); + } + + + /** + * + */ + public Parser() { + // TODO Auto-generated constructor stub + } + + + public static void main (String[] args) throws Exception + { + + long start = System.currentTimeMillis(); + OptionsSuper options = new Options(args); + + NBest = options.best; + + DB.println("n-best"+NBest); + + Runtime runtime = 
Runtime.getRuntime(); + THREADS = runtime.availableProcessors(); + if (options.cores<THREADS&&options.cores>0) THREADS =options.cores; + + DB.println("Found " + runtime.availableProcessors()+" cores use "+THREADS); + + if (options.train) { + + Parser p =new Parser(); + p.options=options; + + p.l2i = new Long2Int(options.hsize); + + p.pipe = new Pipe (options); + Instances is = new Instances(); + + p.pipe.extractor = new Extractor[THREADS]; + + for (int t=0;t<THREADS;t++) p.pipe.extractor[t]=p.extractorFactory.getExtractor( p.l2i); + + p.params = new ParametersFloat(p.l2i.size()); + + if (options.useMapping!=null) { + String model = options.modelName; + + options.modelName = options.useMapping; + DB.println("Using mapping of model "+options.modelName); + ZipInputStream zis = new ZipInputStream(new BufferedInputStream(new FileInputStream(options.modelName))); + zis.getNextEntry(); + DataInputStream dis = new DataInputStream(new BufferedInputStream(zis)); + p.pipe.mf.read(dis); + + DB.println("read\n"+p.pipe.mf.toString()); + + ParametersFloat params = new ParametersFloat(0); + params.read(dis); + + Edges.read(dis); + + dis.close(); + DB.println("end read model"); + options.modelName = model; + } + + + p.pipe.createInstances(options.trainfile,is); + + + p.train(options, p.pipe,p.params,is,p.pipe.cl); + + p.writeModell(options, p.params, null,p.pipe.cl); + + } + + if (options.test) { + + Parser p = new Parser(); + p.options=options; + + p. pipe = new Pipe(options); + p. 
params = new ParametersFloat(0); // total should be zero and the parameters are later read + + // load the model + + p.readModel(options, p.pipe, p.params); + + DB.println("test on "+options.testfile); + + System.out.println(""+p.pipe.mf.toString()); + + + p.outputParses(options, p.pipe, p.params, !MAX_INFO); + + } + + System.out.println(); + + if (options.eval) { + System.out.println("\nEVALUATION PERFORMANCE:"); + ParserEvaluator.evaluate(options.goldfile, options.outfile); + } + + long end = System.currentTimeMillis(); + System.out.println("used time "+((float)((end-start)/100)/10)); + + Decoder.executerService.shutdown(); + Pipe.executerService.shutdown(); + System.out.println("end."); + + + } + + /** + * Read the models and mapping + * @param options + * @param pipe + * @param params + * @throws IOException + */ + public void readModel(OptionsSuper options, Pipe pipe, Parameters params) throws IOException { + + + DB.println("Reading data started"); + + // prepare zipped reader + ZipInputStream zis = new ZipInputStream(new BufferedInputStream(new FileInputStream(options.modelName))); + zis.getNextEntry(); + DataInputStream dis = new DataInputStream(new BufferedInputStream(zis)); + + pipe.mf.read(dis); + + pipe.cl = new Cluster(dis); + + params.read(dis); + this.l2i = new Long2Int(params.size()); + DB.println("parsing -- li size "+l2i.size()); + + pipe.extractor = new Extractor[THREADS]; + + for (int t=0;t<THREADS;t++) pipe.extractor[t]=this.extractorFactory.getExtractor(l2i); + + Edges.read(dis); + + options.decodeProjective = dis.readBoolean(); + + int maxForm = dis.readInt(); + + for (int t=0;t<THREADS;t++) { + pipe.extractor[t].setMaxForm(maxForm); + pipe.extractor[t].initStat(); + pipe.extractor[t].init(); + } + + boolean foundInfo =false; + try { + String info =null; + int icnt = dis.readInt(); + for(int i=0;i<icnt;i++) { + info = dis.readUTF(); + System.out.println(info); + } + } catch (Exception e) { + if (!foundInfo) System.out.println("no info about 
training"); + } + + + dis.close(); + + DB.println("Reading data finnished"); + + Decoder.NON_PROJECTIVITY_THRESHOLD =(float)options.decodeTH; + for (int t=0;t<THREADS;t++) { + pipe.extractor[t].initStat(); + pipe.extractor[t].init(); + } + + } + + + + /** + * Do the training + * @param instanceLengths + * @param options + * @param pipe + * @param params + * @param is + * @param cluster + * @throws IOException + * @throws InterruptedException + * @throws ClassNotFoundException + */ + public void train(OptionsSuper options, Pipe pipe, ParametersFloat params, Instances is, Cluster cluster) + throws IOException, InterruptedException, ClassNotFoundException { + + + DB.println("\nTraining Information "); + DB.println("-------------------- "); + + + Decoder.NON_PROJECTIVITY_THRESHOLD =(float)options.decodeTH; + + if (options.decodeProjective) System.out.println("Decoding: "+(options.decodeProjective?"projective":"non-projective")); + else System.out.println(""+Decoder.getInfo()); + int numInstances = is.size(); + + int maxLenInstances =0; + for(int i=0;i<numInstances;i++) if (maxLenInstances<is.length(i)) maxLenInstances=is.length(i); + + DataF data = new DataF(maxLenInstances, pipe.mf.getFeatureCounter().get(PipeGen.REL).shortValue()); + + int iter = 0; + int del=0; + float error =0; + float f1=0; + + FV pred = new FV(); + FV act = new FV(); + + double upd = (double)(numInstances*options.numIters)+1; + + for(; iter < options.numIters; iter++) { + + System.out.print("Iteration "+iter+": "); + + long start = System.currentTimeMillis(); + + long last= System.currentTimeMillis(); + error=0; + f1=0; + for(int n = 0; n < numInstances; n++) { + + upd--; + + if (is.labels[n].length>options.maxLen) continue; + + String info = " td "+((Decoder.timeDecotder)/1000000F)+" tr "+((Decoder.timeRearrange)/1000000F) + +" te "+((Pipe.timeExtract)/1000000F); + + if((n+1) %500 == 0) del= PipeGen.outValueErr(n+1,Math.round(error*1000)/1000,f1/n,del, last, upd,info); + + short pos[] = 
is.pposs[n]; + + data = pipe.fillVector((F2SF)params.getFV(), is, n, data, cluster, THREADS, l2i); + + List<ParseNBest> parses = Decoder.decode(pos, data, options.decodeProjective,pipe.extractor[0]); + Parse d = parses.get(0); + double e= pipe.errors(is, n ,d); + + if (d.f1>0)f1+=(d.labels.length-1 -e) /(d.labels.length-1); + + if (e<=0) continue; + + // get predicted feature vector + pred.clear(); + pipe.extractor[0].encodeCat(is,n,pos,is.forms[n],is.plemmas[n],d.heads, d.labels, is.feats[n],pipe.cl, pred); + + error += e; + + act.clear(); + pipe.extractor[0].encodeCat(is,n,pos,is.forms[n],is.plemmas[n],is.heads[n], is.labels[n], is.feats[n],pipe.cl, act); + + params.update(act, pred, is, n, d, upd,e); + } + + String info = " td "+((Decoder.timeDecotder)/1000000F)+" tr "+((Decoder.timeRearrange)/1000000F) + +" te "+((Pipe.timeExtract)/1000000F)+" nz "+params.countNZ(); + PipeGen.outValueErr(numInstances,Math.round(error*1000)/1000,f1/numInstances,del,last, upd,info); + del=0; + long end = System.currentTimeMillis(); + System.out.println(" time:"+(end-start)); + + + ParametersFloat pf = params.average2((iter+1)*is.size()); + try { + + if (options.testfile!=null) { + outputParses (options, pipe, pf, ! 
MAX_INFO); + ParserEvaluator.evaluate(options.goldfile, options.outfile); + // writeModell(options, pf, ""+(iter+1),pipe.cl); + } + + + } catch (Exception e) { + e.printStackTrace(); + } + + + Decoder.timeDecotder=0;Decoder.timeRearrange=0; Pipe.timeExtract=0; + + + } + params.average(iter*is.size()); + } + + + /** + * Do the parsing + * @param options + * @param pipe + * @param params + * @throws IOException + */ + private void outputParses (OptionsSuper options, Pipe pipe, ParametersFloat params, boolean maxInfo) throws Exception { + + long start = System.currentTimeMillis(); + + CONLLReader09 depReader = new CONLLReader09(options.testfile, options.formatTask); + CONLLWriter09 depWriter = new CONLLWriter09(options.outfile, options.formatTask); + +// ExtractorClusterStacked.initFeatures(); + + int cnt = 0; + int del=0; + long last = System.currentTimeMillis(); + + if (maxInfo) System.out.println("\nParsing Information "); + if (maxInfo) System.out.println("------------------- "); + + if (maxInfo && !options.decodeProjective) System.out.println(""+Decoder.getInfo()); + + // if (!maxInfo) System.out.println(); + + String[] types = new String[pipe.mf.getFeatureCounter().get(PipeGen.REL)]; + for (Entry<String, Integer> e : pipe.mf.getFeatureSet().get(PipeGen.REL).entrySet()) types[e.getValue()] = e.getKey(); + + + System.out.print("Processing Sentence: "); + + while(true) { + + Instances is = new Instances(); + is.init(1, new MFB(),options.formatTask); + + SentenceData09 instance = pipe.nextInstance(is, depReader); + if (instance==null) break; + cnt++; + + SentenceData09 i09 = this.parse(instance,params); + + // } + depWriter.write(i09); + del=PipeGen.outValue(cnt, del,last); + // DB.println("xbest "+amongxbest+" cnt "+cnt+" "+((float)((float)amongxbest/cnt))+" nbest "+((float)nbest/cnt)+ + // " 1best "+((float)(rank.get(0)==null?0:rank.get(0))/cnt)+" best-proj "+((float)bestProj/cnt)); + + } + + //pipe.close(); + + depWriter.finishWriting(); + long end = 
System.currentTimeMillis(); + DB.println("rank\n"+rank+"\n"); + DB.println("x-best-las "+amongxbest+" x-best-ula "+amongxbest_ula+" cnt "+cnt+" x-best-las " + +((float)((float)amongxbest/cnt))+ + " x-best-ula "+((float)((float)amongxbest_ula/cnt))+ + " nbest "+((float)nbest/cnt)+ + " 1best "+((float)(rank.get(0)==null?0:rank.get(0))/cnt)+ + " best-proj "+((float)bestProj/cnt)+ + " Sum LAS "+((float)this.smallestErrorSum/countAllNodes)); + + // DB.println("errors "+error); + + rank.clear(); + amongxbest=0;amongxbest_ula=0; + cnt=0; + nbest=0; + bestProj=0; + if (maxInfo) System.out.println("Used time " + (end-start)); + if (maxInfo) System.out.println("forms count "+Instances.m_count+" unkown "+Instances.m_unkown); + + } + + + /** + * Do the parsing + * @param options + * @param pipe + * @param params + * @throws IOException + */ + private void getNBest(OptionsSuper options, Pipe pipe, ParametersFloat params, boolean maxInfo) throws Exception { + + + CONLLReader09 depReader = new CONLLReader09(options.testfile, options.formatTask); + + // ExtractorClusterStacked.initFeatures(); + + int cnt = 0; + + String[] types = new String[pipe.mf.getFeatureCounter().get(PipeGen.REL)]; + for (Entry<String, Integer> e : pipe.mf.getFeatureSet().get(PipeGen.REL).entrySet()) types[e.getValue()] = e.getKey(); + +// System.out.print("Processing Sentence: "); + + while(true) { + + Instances is = new Instances(); + is.init(1, new MFB(),options.formatTask); + + SentenceData09 instance = pipe.nextInstance(is, depReader); + if (instance==null) break; + cnt++; + + this.parseNBest(instance); + } + + //pipe.close(); +// depWriter.finishWriting(); +// long end = System.currentTimeMillis(); +// DB.println("rank\n"+rank+"\n"); +// DB.println("x-best-las "+amongxbest+" x-best-ula "+amongxbest_ula+" cnt "+cnt+" x-best-las " +// +((float)((float)amongxbest/cnt))+ +// " x-best-ula "+((float)((float)amongxbest_ula/cnt))+ +// " nbest "+((float)nbest/cnt)+ +// " 1best 
"+((float)(rank.get(0)==null?0:rank.get(0))/cnt)+ +// " best-proj "+((float)bestProj/cnt)); + // DB.println("errors "+error); + + + } + + + public SentenceData09 parse (SentenceData09 instance, ParametersFloat params) { + + String[] types = new String[pipe.mf.getFeatureCounter().get(PipeGen.REL)]; + for (Entry<String, Integer> e : MFB.getFeatureSet().get(PipeGen.REL).entrySet()) types[e.getValue()] = e.getKey(); + + Instances is = new Instances(); + is.init(1, new MFB(),options.formatTask); + new CONLLReader09().insert(is, instance); + + String[] forms = instance.forms; + + // use for the training ppos + DataF d2; + try { + d2 = pipe.fillVector(params.getFV(), is,0,null,pipe.cl,THREADS,l2i);//cnt-1 + } catch (Exception e ) { + e.printStackTrace(); + return null; + } + short[] pos = is.pposs[0]; + + List<ParseNBest> parses=null; + Parse d= null; + try { + parses =Decoder.decode(pos,d2,options.decodeProjective,pipe.extractor[0]); //cnt-1 + d = parses.get(0); + }catch (Exception e) { + e.printStackTrace(); + } + + if (parses.size()>NBest) parses = parses.subList(0,NBest); + + int g_las = Decoder.getGoldRank(parses, is,0,Decoder.LAS); + int g_ula = Decoder.getGoldRank(parses, is,0,!Decoder.LAS); + + int smallest = Decoder.getSmallestError(parses, is,0,!Decoder.LAS); + smallestErrorSum+=is.length(0)-smallest; + countAllNodes+=is.length(0); + + if (g_las>=0) amongxbest++; + if (g_ula>=0) amongxbest_ula++; + + nbest+=parses.size(); + + Integer r = rank.get(g_las); + if (r==null) rank.put(g_las, 1); + else rank.put(g_las, r+1); + + float err = (float)this.pipe.errors(is,0, d); + + float errBestProj = (float)this.pipe.errors(is,0, Decoder.bestProj); + + if (errBestProj==0) bestProj++; + + SentenceData09 i09 = new SentenceData09(instance); + + i09.createSemantic(instance); + + for(int j = 0; j < forms.length-1; j++) { + i09.plabels[j] = types[d.labels[j+1]]; + i09.pheads[j] = d.heads[j+1]; + } + return i09; + + } + + public List<ParseNBest> parseNBest (SentenceData09 
instance) { + + Instances is = new Instances(); + is.init(1, new MFB(),options.formatTask); + new CONLLReader09().insert(is, instance); + + // use for the training ppos + DataF d2; + try { + d2 = pipe.fillVector(params.getFV(), is,0,null,pipe.cl,THREADS, l2i);//cnt-1 + } catch (Exception e ) { + e.printStackTrace(); + return null; + } + short[] pos = is.pposs[0]; + + List<ParseNBest> parses=null; + try { + parses =Decoder.decode(pos,d2,options.decodeProjective,pipe.extractor[0]); //cnt-1 + }catch (Exception e) { + e.printStackTrace(); + } + + + if (parses.size()>NBest) parses = parses.subList(0,NBest); + + return parses; + + } + + + + /* (non-Javadoc) + * @see is2.tools.Tool#apply(is2.data.SentenceData09) + */ + + @Override + public SentenceData09 apply(SentenceData09 snt09) { + + SentenceData09 it = new SentenceData09(); + it.createWithRoot(snt09); + + SentenceData09 out=null; + try { + + + // for(int k=0;k<it.length();k++) { + // it.forms[k] = reader.normalize(it.forms[k]); + // it.plemmas[k] = reader.normalize(it.plemmas[k]); + // } + + out = parse(it,this.params); + + + } catch(Exception e) { + e.printStackTrace(); + } + + Decoder.executerService.shutdown(); + Pipe.executerService.shutdown(); + + return out; + } + + /** + * Write the parsing model + * + * @param options + * @param params + * @param extension + * @throws FileNotFoundException + * @throws IOException + */ + private void writeModell(OptionsSuper options, ParametersFloat params, String extension, Cluster cs) throws FileNotFoundException, IOException { + + String name = extension==null?options.modelName:options.modelName+extension; +// System.out.println("Writting model: "+name); + ZipOutputStream zos = new ZipOutputStream(new BufferedOutputStream(new FileOutputStream(name))); + zos.putNextEntry(new ZipEntry("data")); + DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(zos)); + + MFB.writeData(dos); + cs.write(dos); + + params.write(dos); + + Edges.write(dos); + + 
dos.writeBoolean(options.decodeProjective); + + dos.writeInt(pipe.extractor[0].getMaxForm()); + + dos.writeInt(5); // Info count + dos.writeUTF("Used parser "+Parser.class.toString()); + dos.writeUTF("Creation date "+(new SimpleDateFormat("yyyy.MM.dd HH:mm:ss")).format(new Date())); + dos.writeUTF("Training data "+options.trainfile); + dos.writeUTF("Iterations "+options.numIters+" Used sentences "+options.count); + dos.writeUTF("Cluster "+options.clusterFile); + + dos.flush(); + dos.close(); + } + + + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/parserR2/Pipe.java b/dependencyParser/basic/mate-tools/src/is2/parserR2/Pipe.java new file mode 100755 index 0000000..ad545ec --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/parserR2/Pipe.java @@ -0,0 +1,257 @@ +package is2.parserR2; + +import is2.data.Cluster; +import is2.data.DataF; +import is2.data.Edges; +import is2.data.F2SF; +import is2.data.Instances; +import is2.data.Long2IntInterface; +import is2.data.MFB; + +import is2.data.Parse; +import is2.data.PipeGen; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; + +import is2.util.DB; +import is2.util.OptionsSuper; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.concurrent.ExecutorService; + +import extractors.Extractor; +import extractors.ExtractorClusterStacked; +import extractors.ExtractorClusterStackedR2; +import extractors.ParallelExtract; + +final public class Pipe extends PipeGen { + + public Extractor[] extractor; + final public MFB mf = new MFB(); + + Cluster cl; + + + private OptionsSuper options; + public static long timeExtract; + + public Pipe(OptionsSuper o) { + options = o; + } + + public void createInstances(String file, Instances is) + // throws Exception + + { + + + CONLLReader09 depReader = new CONLLReader09(file); + + mf.register(REL,"<root-type>"); + + // register at least one predicate since the parsing data might not contain predicates as in + // the Japaness corpus but the 
development sets contains some + + long sl=0; + + System.out.print("Registering feature parts of sentence: "); + int ic = 0; + int del = 0; + while (true) { + SentenceData09 instance = depReader.getNext(); + if (instance == null) break; + ic++; + + sl+=instance.labels.length; + + if (ic % 1000 == 0) { + del = outValue(ic, del); + } + + String[] labs1 = instance.labels; + for (int i1 = 0; i1 < labs1.length; i1++) mf.register(REL, labs1[i1]); + + String[] w = instance.forms; + for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, depReader.normalize(w[i1])); + + w = instance.plemmas; + for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, depReader.normalize(w[i1])); + + + w = instance.ppos; + for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); + + w = instance.gpos; + for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); + + if (instance.feats !=null) { + String fs[][] = instance.feats; + for (int i1 = 0; i1 < fs.length; i1++){ + w =fs[i1]; + if (w==null) continue; + for (int i2 = 0; i2 < w.length; i2++) mf.register(FEAT, w[i2]); + } + } + + if ((ic-1)>options.count) break; + } + del = outValue(ic, del); + + + for(Extractor e : extractor) { + e.setMaxForm(mf.getFeatureCounter().get(WORD)); + } + + if (options.clusterFile==null)cl = new Cluster(); + else cl= new Cluster(options.clusterFile, mf,6); + + mf.calculateBits(); + + System.out.println(""+mf.toString()); + + for(Extractor e : extractor) { + e.initStat(); + e.init(); + } + + depReader.startReading(file); + + int num1 = 0; + + + Edges.init(mf.getFeatureCounter().get(POS)); + + + System.out.print("Creating edge filters and read corpus: "); + del = 0; + + is.init(ic, new MFB()); + + while (true) { + if (num1 % 100 == 0) del = outValue(num1, del); + + SentenceData09 instance1 = depReader.getNext(is); + + if (instance1 == null) break; + + int last = is.size() - 1; + short[] pos =is.pposs[last]; + + for (int k = 0; k < is.length(last); k++) { + if (is.heads[last][k] < 0) continue; + 
Edges.put(pos[is.heads[last][k]],pos[k], k < is.heads[last][k],is.labels[last][k]); + } + + if (!options.allFeatures && num1 > options.count) break; + + num1++; + + } + del = outValue(num1, del); + System.out.println(); + Edges.findDefault(); + } + + + public void getInstances(String file, Instances is){ + CONLLReader09 depReader = new CONLLReader09(file); + + int ic =options.count+2; + + is.init(ic, new MFB()); + + int num1 =0,del=0; + while (true) { + if (num1 % 100 == 0) del = outValue(num1, del); + + SentenceData09 instance1 = depReader.getNext(is); + + if (instance1 == null) break; + + if (!options.allFeatures && num1 > options.count) break; + + num1++; + + } + del = outValue(num1, del); + System.out.println(); + + } + + + /** + * Creates an instance for outputParses + * + * @param is + * @return + * @throws IOException + */ + protected final SentenceData09 nextInstance(Instances is, CONLLReader09 depReader) throws Exception { + + SentenceData09 instance = depReader.getNext(is); + if (instance == null || instance.forms == null) return null; + + return instance; + } + + public static ExecutorService executerService =java.util.concurrent.Executors.newFixedThreadPool(Parser.THREADS); + + + public DataF fillVector(F2SF params, Instances is,int inst, DataF d, Cluster cluster, int threads, Long2IntInterface li) throws InterruptedException { + + long ts = System.nanoTime(); + + if (executerService.isShutdown()) executerService =java.util.concurrent.Executors.newCachedThreadPool(); + + + final int length = is.length(inst); + if (d ==null || d.len<length)d = new DataF(length,mf.getFeatureCounter().get(PipeGen.REL).shortValue()); + + ArrayList<ParallelExtract> pe = new ArrayList<ParallelExtract>(); + + + for(int i=0;i<threads;i++) { + +// DB.println(""+((ExtractorClusterStackedR2)extractor[i]).s_dist); + pe.add(new ParallelExtract( extractor[i],is, inst, d, (F2SF)params.clone(), cluster, li)); + } + + for (int w1 = 0; w1 < length; w1++) { + for (int w2 = 0; w2 < length; 
w2++) { + if (w1==w2) continue; + ParallelExtract.add(w1, w2); + } + } + executerService.invokeAll( pe); + + timeExtract += (System.nanoTime()-ts); + + + return d; + } + + /** + * the loss function + */ + public double errors( Instances is, int ic, Parse p) { + + if (p.heads==null) p.signature2parse(p.signature()); + short[] act = is.heads[ic]; + double correct = 0; + + // do not count root + for(int i = 1; i < act.length; i++) { + if (p.heads[i]==act[i] ){ + correct+=0.5; + if (p.labels[i]==is.labels[ic][i] ) correct+=0.5; + } + } + + double x = ((double)act.length- 1 - correct ); + + //p.f1 = (double)correct / (double)(act.length-1); + + return x; + } +} diff --git a/dependencyParser/basic/mate-tools/src/is2/parserR2/PipeReranker.java b/dependencyParser/basic/mate-tools/src/is2/parserR2/PipeReranker.java new file mode 100644 index 0000000..87286ac --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/parserR2/PipeReranker.java @@ -0,0 +1,138 @@ +package is2.parserR2; + +import is2.data.Cluster; +import is2.data.DataF; +import is2.data.Edges; +import is2.data.F2SF; +import is2.data.Instances; +import is2.data.MFB; +import is2.data.ParseNBest; + +import is2.data.Parse; +import is2.data.PipeGen; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; + +import is2.util.OptionsSuper; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.concurrent.ExecutorService; + +import extractors.ExtractorClusterStacked; +import extractors.ExtractorReranker; +import extractors.ParallelExtract; + +final public class PipeReranker extends PipeGen { + + public ExtractorReranker extractor; + final public MFB mf = new MFB(); + + Cluster cl; + + + private OptionsSuper options; + public static long timeExtract; + + public PipeReranker(OptionsSuper o) { + options = o; + } + + public void createInstances(String file, Instances is) + // throws Exception + + { + + + CONLLReader09 depReader = new CONLLReader09(file); + + mf.register(REL,"<root-type>"); 
+ + // register at least one predicate since the parsing data might not contain predicates as in + // the Japaness corpus but the development sets contains some + + long sl=0; + + System.out.print("Registering feature parts of sentence: "); + int ic = 0; + int del = 0; + while (true) { + SentenceData09 instance = depReader.getNext(); + if (instance == null) break; + ic++; + + sl+=instance.labels.length; + + if (ic % 1000 == 0) { + del = outValue(ic, del); + } + + String[] labs1 = instance.labels; + for (int i1 = 0; i1 < labs1.length; i1++) mf.register(REL, labs1[i1]); + + String[] w = instance.forms; + for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, depReader.normalize(w[i1])); + + w = instance.plemmas; + for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, depReader.normalize(w[i1])); + + + w = instance.ppos; + for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); + + w = instance.gpos; + for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]); + + if (instance.feats !=null) { + String fs[][] = instance.feats; + for (int i1 = 0; i1 < fs.length; i1++){ + w =fs[i1]; + if (w==null) continue; + for (int i2 = 0; i2 < w.length; i2++) mf.register(FEAT, w[i2]); + } + } + + if ((ic-1)>options.count) break; + } + del = outValue(ic, del); + + System.out.println(); + ExtractorReranker.initFeatures(); + + ExtractorReranker.maxForm = mf.getFeatureCounter().get(WORD); + + if (options.clusterFile==null)cl = new Cluster(); + else cl= new Cluster(options.clusterFile, mf,6); + + mf.calculateBits(); + extractor.initStat(); + + System.out.println(""+mf.toString()); + + extractor.init(); + depReader.startReading(file); + + int num1 = 0; + + is.init(ic, new MFB()); + + Edges.init(mf.getFeatureCounter().get(POS)); + + del = 0; + + + del = outValue(num1, del); + System.out.println(); + } + + + + public static ExecutorService executerService =java.util.concurrent.Executors.newFixedThreadPool(Parser.THREADS); + + + + + + + +} diff --git 
a/dependencyParser/basic/mate-tools/src/is2/parserR2/Reranker.java b/dependencyParser/basic/mate-tools/src/is2/parserR2/Reranker.java new file mode 100644 index 0000000..a531c3c --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/parserR2/Reranker.java @@ -0,0 +1,1059 @@ +package is2.parserR2; + + +import is2.data.Cluster; +import is2.data.DataF; +import is2.data.Edges; +import is2.data.F2SF; +import is2.data.FVR; +import is2.data.Instances; +import is2.data.Long2Int; +import is2.data.Long2IntInterface; +import is2.data.MFB; +import is2.data.Parse; +import is2.data.ParseNBest; +import is2.data.PipeGen; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; +import is2.io.CONLLWriter09; +import is2.tools.Tool; +import is2.util.DB; +import is2.util.OptionsSuper; +import is2.util.ParserEvaluator; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map.Entry; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; +import java.util.zip.ZipOutputStream; + +import extractors.Extractor; +import extractors.ExtractorClusterStacked; +import extractors.ExtractorReranker; + + + +public class Reranker implements Tool { + + + public static int THREADS =4; + + Long2IntInterface l2i; + + // the parser models + ParametersFloat paramsParsers[]; + + // the re-ranker model + ParametersFloat parametersReranker; + PipeReranker pipeReranker; + Pipe pipe; + Options options; + + HashMap<Integer,Integer> rank = new HashMap<Integer,Integer>(); + int amongxbest=0, amongxbest_ula=0, nbest=0,bestProj=0, smallestErrorSum=0, countAllNodes=0; + static int NBest 
=1000; + + + + + /** + * Initialize the parser + * @param options + */ + public Reranker (Options options) { + + this.options=options; + + } + + + /** + * @param modelFileName The file name of the parsing model + */ + public Reranker(String modelFileName) { + this(new Options(new String[]{"-model",modelFileName})); + } + + + + public Reranker() { + // TODO Auto-generated constructor stub + } + + + public static void main (String[] args) throws Exception + { + + long start = System.currentTimeMillis(); + Options options = new Options(args); + + NBest = options.best; + + DB.println("n-best "+NBest); + + Runtime runtime = Runtime.getRuntime(); + THREADS = runtime.availableProcessors(); + + if (options.cores<THREADS&&options.cores>0) THREADS =options.cores; + + DB.println("Found " + runtime.availableProcessors()+" cores use "+THREADS); + + + if (options.train) { + + Reranker p =new Reranker(); + p.options=options; + + + p.l2i = new Long2Int(options.hsize); + p.pipeReranker = new PipeReranker(options); + p.pipeReranker.extractor = new ExtractorReranker(p.l2i); + + + // initialize the parser + p.pipe = new Pipe(options); + + // read parsing models + p.paramsParsers = new ParametersFloat[options.end+1]; + for(int m=0;m<=options.end;m++) { + String name = options.prefix_model+m; + p.paramsParsers[m] = new ParametersFloat(0); + p.readModel(name, p.pipe, p.paramsParsers[m]); + } + + // set up the reranker + p.parametersReranker = new ParametersFloat(p.l2i.size()); + + Instances[] iss = new Instances[options.end+1]; + + for(int m=0;m<=options.end;m++) { + String name = options.prefix_test+m; + iss[m] = new Instances(); + DB.println("create instances of part "+name); + p.pipe.getInstances(name, iss[m]); + } + + + ExtractorReranker.initFeatures(); + p.pipeReranker.extractor.init(); + + p.pipeReranker.extractor.initStat(); + + p.train(options,iss); + + p.writeModell(options, p.parametersReranker, null,p.pipe.cl); + } + + if (options.test) { + + Reranker p = new Reranker(); + 
p.options=options; + + // set up the reranker + p.l2i = new Long2Int(options.hsize); + p.pipeReranker = new PipeReranker(options); + p.pipeReranker.extractor = new ExtractorReranker(p.l2i); + p.parametersReranker = new ParametersFloat(p.l2i.size()); + + + // initialize the parser + p.pipe = new Pipe(options); + + // read parsing models + p.paramsParsers = new ParametersFloat[options.end+1]; + + String nbestName ="n-best+"+options.testfile.substring(options.testfile.length()-12,options.testfile.length()-1); + File fnbest = new File(nbestName); + int read = fnbest.exists()?2:1; + + if (read != 2) + for(int m=0;m<=options.end;m++) { + String name = options.prefix_model+m; + p.paramsParsers[m] = new ParametersFloat(0); + p.readModel(name, p.pipe, p.paramsParsers[m]); + } + + p.readModel(options.modelName, p.pipeReranker, p.parametersReranker); + + + ExtractorReranker.initFeatures(); + p.pipeReranker.extractor.initStat(); + p.pipeReranker.extractor.init(); + + p.rerankedParses(options, p.pipe, p.parametersReranker, false, nbestName); + + } + + System.out.println(); + + if (options.eval) { + System.out.println("\nEVALUATION PERFORMANCE:"); + ParserEvaluator.evaluate(options.goldfile, options.outfile); + } + + long end = System.currentTimeMillis(); + System.out.println("used time "+((float)((end-start)/100)/10)); + + Decoder.executerService.shutdown(); + Pipe.executerService.shutdown(); + System.out.println("end."); + + + } + + /** + * Read the models and mapping + * @param options + * @param pipe + * @param prm + * @throws IOException + */ + public void readModel(String modelName, Pipe pipe, Parameters prm) throws IOException { + + + DB.println("Reading data started: "+modelName); + + // prepare zipped reader + ZipInputStream zis = new ZipInputStream(new BufferedInputStream(new FileInputStream(modelName))); + zis.getNextEntry(); + DataInputStream dis = new DataInputStream(new BufferedInputStream(zis)); + + pipe.mf.read(dis); + + pipe.cl = new Cluster(dis); + + 
prm.read(dis); + + Long2Int l2i = new Long2Int(prm.size()); + DB.println("li size "+l2i.size()); + + pipe.extractor = new ExtractorClusterStacked[THREADS]; + + for (int t=0;t<THREADS;t++) pipe.extractor[t]=new ExtractorClusterStacked(l2i); + + ExtractorClusterStacked.initFeatures(); + + + for (int t=0;t<THREADS;t++) { + pipe.extractor[t].initStat(); + pipe.extractor[t].init(); + } + + Edges.read(dis); + + options.decodeProjective = dis.readBoolean(); + + ExtractorClusterStacked.maxForm = dis.readInt(); + + boolean foundInfo =false; + try { + String info =null; + int icnt = dis.readInt(); + for(int i=0;i<icnt;i++) { + info = dis.readUTF(); + System.out.println(info); + } + } catch (Exception e) { + if (!foundInfo) System.out.println("no info about training"); + } + + + dis.close(); + + DB.println("Reading data finnished"); + + Decoder.NON_PROJECTIVITY_THRESHOLD =(float)options.decodeTH; + + // ExtractorClusterStacked.initStat(); + + } + + /** + * Read the models and mapping + * @param options + * @param pipe + * @param params + * @throws IOException + */ + public void readModel(String modelName, PipeReranker pipe, Parameters params) throws IOException { + + DB.println("Reading data started: "+modelName); + + // prepare zipped reader + ZipInputStream zis = new ZipInputStream(new BufferedInputStream(new FileInputStream(modelName))); + zis.getNextEntry(); + DataInputStream dis = new DataInputStream(new BufferedInputStream(zis)); + + pipe.mf.read(dis); + + // DB.println("reranker model "+pipe.mf.toString()); + + pipe.cl = new Cluster(dis); + + params.read(dis); + this.l2i = new Long2Int(params.size()); + DB.println("li size "+l2i.size()); + + pipe.extractor = new ExtractorReranker(l2i); + + ExtractorReranker.initFeatures(); + ExtractorReranker.initStat(); + + pipe.extractor.init(); + + Edges.read(dis); + + options.decodeProjective = dis.readBoolean(); + + ExtractorClusterStacked.maxForm = dis.readInt(); + + boolean foundInfo =false; + try { + String info =null; + int 
icnt = dis.readInt(); + for(int i=0;i<icnt;i++) { + info = dis.readUTF(); + System.out.println(info); + } + } catch (Exception e) { + if (!foundInfo) System.out.println("no info about training"); + } + + + dis.close(); + + DB.println("Reading data finnished"); + + Decoder.NON_PROJECTIVITY_THRESHOLD =(float)options.decodeTH; + + //ExtractorClusterStacked.initStat(); + + } + + + /** + * Do the training + * @param instanceLengths + * @param options + * @param pipe + * @param parametersReranker + * @param is + * @param cluster + * @throws IOException + * @throws InterruptedException + * @throws ClassNotFoundException + */ + public void train(OptionsSuper options, Instances[] iss) + throws IOException, InterruptedException, ClassNotFoundException { + + + + int read = 0; // 0 compute -- 1 compute and write -- 2 read parses + + + DB.println("Training Information "); + DB.println("-------------------- "); + + ExtractorReranker.initStat(); + pipeReranker.extractor.init(); + + for(Extractor e : this.pipe.extractor) { + e.init(); + } + + int numInstances =0; + int maxLenInstances =0; + // int maxLenSentence=1; + for(Instances is : iss) { + numInstances += is.size(); + for(int i=0;i<is.size();i++) if (maxLenInstances<is.length(i)) maxLenInstances=is.length(i); + } + + + DataF data = new DataF(maxLenInstances, pipe.mf.getFeatureCounter().get(PipeGen.REL).shortValue()); + + int iter = 0; + int del=0; + float error =0; + float f1=0; + + + + + double upd = (double)(options.count*options.numIters)+options.numIters*10; + + //float[][] = new float[this.NBest][3]; + FVR act = new FVR(); + + FVR pred = new FVR(); + + FVR f = new FVR(); + long[] vs = new long[ExtractorReranker._FC*maxLenInstances]; + + + for(; iter < options.numIters; iter++) { + + + + + System.out.print("Iteration "+iter+": "); + error=0; + f1=0; + + float las =0, cnt=0,averageScore=0; + + + float firstBestTotalError=0,totalError=0; + + long start = System.currentTimeMillis(); + + long last= System.currentTimeMillis(); 
+ + long rerankTime = 0; + + + String nbest ="n-best"; + File fnbest = new File(nbest); + read = fnbest.exists()?2:1; + + DataInputStream dis =null; + DataOutputStream dos = null; + + if (read==1) { + + DB.println("computing and writting nbest list to file: "+nbest); + + ZipOutputStream zos = new ZipOutputStream(new BufferedOutputStream(new FileOutputStream(nbest))); + zos.putNextEntry(new ZipEntry("data")); + dos = new DataOutputStream(new BufferedOutputStream(zos)); + } + + + // start reading again + if (read ==2) { + + // DB.println("reading nbest list from file: "+nbest); + + ZipInputStream zis = new ZipInputStream(new BufferedInputStream(new FileInputStream(nbest))); + zis.getNextEntry(); + dis = new DataInputStream(new BufferedInputStream(zis)); + } + + HashMap<Integer,Integer> remapped = new HashMap<Integer,Integer>(); + + int i=0,round=0,instance=0,length=0,count=0, changes=0; + for(Instances is : iss) { + + F2SF fparser =this.paramsParsers[instance].getFV(); + round++; + + + // go over the sentences in the instance + for(int n = 0; n < is.size(); n++) { + count+=1; + length +=is.length(n); + upd--; + + if (is.labels[n].length>options.maxLen) continue; + + List<ParseNBest> parses=null; + + short pos[] = is.pposs[n]; + + // read or write nbest list + if (read==0|| read==1 && iter==0) { + data = pipe.fillVector(fparser, is, n, data, pipe.cl,THREADS,l2i); + parses = Decoder.decode(pos, data, options.decodeProjective,pipe.extractor[0]); + + if (parses.size()>NBest) parses = parses.subList(0, NBest); + + + if (read==1) { + // write the forest + dos.writeInt(parses.size()); + for(int k=0;k<parses.size();k++) { + dos.writeUTF(parses.get(k).signature()); + dos.writeFloat((float)parses.get(k).f1); + } + } + } else if (read==2) { + parses = new ArrayList<ParseNBest>(); + int parseCount = dis.readInt(); + for(int k=0;k<parseCount;k++) { + ParseNBest p = new ParseNBest(dis.readUTF(),dis.readFloat()); + if (parses.size()<NBest) parses.add(p); + } + } + + + int best =0; 
float bestScore=-100; + int goldBest =0; double goldError=Float.MAX_VALUE; + + long startReranking = System.currentTimeMillis(); + + // score the n-best parses + for(int k=0;k<parses.size();k++) { + + ParseNBest p= parses.get(k); + + pipeReranker.extractor.extractFeatures(is,n,p,parses.indexOf(p),vs,pipe.cl); + + int rank=1*ExtractorReranker.s_type; + + f.clear(); + + for(int j=0;j<vs.length;j++) { + if (vs[j]==Integer.MIN_VALUE) break; + if (vs[j]>0) f.add(pipeReranker.extractor.li.l2i(vs[j]+rank)); + } + + f.add(pipeReranker.extractor.li.l2i(1+rank),(float)p.f1); + float score = (float)(parametersReranker.getScore(f)); + if (score>bestScore) { //rankScore[k][2]> + bestScore =score; + best=k; + + } + } + + // get the best parse in the n-best list + for(int k=0;k<parses.size();k++) { + + if (parses.get(k).heads.length!=is.length(n)) { + DB.println("error "+n+" "+parses.get(k).heads.length+" "+is.length(n)); + continue; + } + double errg = pipe.errors(is, n, parses.get(k)); + if (goldError > errg) { + goldError = errg; + goldBest=k; + } + } + + ParseNBest firstBest = parses.get(0); + ParseNBest predParse = parses.get(best); + ParseNBest goldBestParse = parses.get(goldBest); + + double e= pipe.errors(is, n ,predParse); + + Integer ctb = remapped.get(best); + if (ctb==null) remapped.put(best, 1); + else remapped.put(best, ctb+1); + + String info = " 1best-error "+((length-firstBestTotalError)/length)+ + " reranked "+((length-totalError)/length)+ + " chd "+changes+" "+" ps las "+(las/cnt)+" avs "+((float)averageScore/(float)count)+" "; + + + + if((n+1) %500 == 0) del= PipeGen.outValueErr(count,Math.round(error*1000)/1000,f1/count,del, last, upd,info); + + firstBestTotalError+=Decoder.getError(firstBest, is, n, Decoder.LAS); + + totalError+=Decoder.getError(predParse, is, n, Decoder.LAS); + + + rerankTime +=System.currentTimeMillis()-startReranking; + + if (best!=0){ + changes++; + } + + las +=is.length(n)-Decoder.getError(goldBestParse, is, n, Decoder.LAS); + cnt 
+=is.length(n); + + averageScore+=predParse.f1; + + + if (options.count<count) break; + + + if (Decoder.getError(goldBestParse, is, n, Decoder.LAS)>= + Decoder.getError(predParse, is, n, Decoder.LAS) ) continue; + + + // get predicted feature vector + pipeReranker.extractor.extractFeatures(is,n,predParse,parses.indexOf(predParse),vs,pipe.cl); + + pred.clear(); + int rank=1*ExtractorReranker.s_type; + + for(int j=0;j<vs.length;j++) { + if (vs[j]==Integer.MIN_VALUE) break; + if (vs[j]>0) pred.add(pipeReranker.extractor.li.l2i(vs[j]+rank)); + } + pred.add(pipeReranker.extractor.li.l2i(1+rank),(float)predParse.f1); + error += 1; + + pipeReranker.extractor.extractFeatures(is,n,goldBestParse,parses.indexOf(goldBestParse),vs,pipe.cl); + + + act.clear(); + rank=1*ExtractorReranker.s_type; + for(int j=0;j<vs.length;j++) { + if (vs[j]==Integer.MIN_VALUE) break; + if (vs[j]>0) act.add(pipeReranker.extractor.li.l2i(vs[j]+rank)); + } + + act.add(pipeReranker.extractor.li.l2i(1+rank),(float)goldBestParse.f1); + float lam_dist =(float)( parametersReranker.getScore(act) - + (parametersReranker.getScore(pred))); + + + + parametersReranker.update(act, pred, is, n, null, upd, e,lam_dist); + + } + instance++; + + } + + String info = " td "+((Decoder.timeDecotder)/1000000F)+" tr "+((Decoder.timeRearrange)/1000000F) + +" te "+((Pipe.timeExtract)/1000000F)+" nz "+parametersReranker.countNZ()+ + " 1best-error "+((length-firstBestTotalError)/length)+ + " reranked-best "+((length-totalError)/length)+ + " rds "+round+" "+ + " rerank-t "+(rerankTime/count)+ + " chd "+changes+" "+" ps las "+(las/cnt)+" avs "+((float)averageScore/(float)count)+" "; + + + // DB.println("remapped "+remapped); + + PipeGen.outValueErr(count,Math.round(error*1000)/1000,f1/count,del,last, upd,info); + del=0; + long end = System.currentTimeMillis(); + System.out.println(" time:"+(end-start)); + i++; + // ParametersFloat pf = params.average2((iter+1)*is.size()); + + + + Decoder.timeDecotder=0;Decoder.timeRearrange=0; 
Pipe.timeExtract=0; + + if (dos!=null)dos.close(); + if (dis!=null)dis.close(); + + } + DB.println("sb "+parametersReranker.parameters[this.pipeReranker.extractor.li.l2i(4090378920L+1*ExtractorReranker.s_type)]);//4090378266 + parametersReranker.average(iter*numInstances); + + } + + + /** + * Do the parsing + * @param options + * @param pipe + * @param params + * @throws IOException + */ + private void rerankedParses (OptionsSuper options, Pipe pipe, ParametersFloat params, boolean maxInfo, String nbestName) throws Exception { + + long start = System.currentTimeMillis(); + + ExtractorClusterStacked.initFeatures(); + + DataInputStream dis =null; + DataOutputStream dos = null; + + float olas=0, olcnt =0; + + File fnbest = new File(nbestName); + int read = fnbest.exists()?2:1; + if (read==1) { + + DB.println("computing and writting nbest list to file: "+nbestName); + + ZipOutputStream zos = new ZipOutputStream(new BufferedOutputStream(new FileOutputStream(nbestName))); + zos.putNextEntry(new ZipEntry("data")); + dos = new DataOutputStream(new BufferedOutputStream(zos)); + } + + + + if (read ==2) { + + // DB.println("reading nbest list from file: "+nbestName); + + ZipInputStream zis = new ZipInputStream(new BufferedInputStream(new FileInputStream(nbestName))); + zis.getNextEntry(); + dis = new DataInputStream(new BufferedInputStream(zis)); + } + + for(int m =0;m< this.paramsParsers.length;m++) { + + + CONLLReader09 depReader = new CONLLReader09(options.testfile, options.formatTask); + CONLLWriter09 depWriter = new CONLLWriter09(options.outfile, options.formatTask); + + float las=0,lcnt =0, averageScore =0; + int cnt = 0; + int del=0; + + + long last = System.currentTimeMillis(); + + + String[] types = new String[pipe.mf.getFeatureCounter().get(PipeGen.REL)]; + for (Entry<String, Integer> e : pipe.mf.getFeatureSet().get(PipeGen.REL).entrySet()) types[e.getValue()] = e.getKey(); + + System.out.print("Processing Sentence: "); + + + FVR f = new FVR(); + + while(true) { + + 
Instances is = new Instances(); + is.init(1, new MFB(),options.formatTask); + + SentenceData09 instance = pipe.nextInstance(is, depReader); + if (instance==null) break; + cnt++; + + String[] forms = instance.forms; + + List<ParseNBest> parses =null; + + // read or write nbest list + if (read==0|| read==1) { + parses = this.parseNBest(instance, this.paramsParsers[m]); + // data = pipe.fillVector(fparser, is, n, data, pipe.cl,THREADS); + // parses = Decoder.decode(pos, data, options.decodeProjective); + + if (parses.size()>NBest) parses = parses.subList(0, NBest); + + + if (read==1) { + // write the forest + dos.writeInt(parses.size()); + for(int k=0;k<parses.size();k++) { + dos.writeUTF(parses.get(k).signature()); + dos.writeFloat((float)parses.get(k).f1); + } + } + } else if (read==2) { + parses = new ArrayList<ParseNBest>(); + int parseCount = dis.readInt(); + for(int k=0;k<parseCount;k++) { + ParseNBest p = new ParseNBest(dis.readUTF(),dis.readFloat()); + if (parses.size()<NBest) parses.add(p); + } + } + + nbest +=parses.size(); + + + //List<ParseNBest> parses = this.parseNBest(instance, this.paramsParsers[m]); + + long vs[] = new long[ExtractorReranker._FC*is.length(0)]; + + float bestScore=0; + int best=0; + + + for(int k=0;k<parses.size();k++) { + + ParseNBest p= parses.get(k); + + pipeReranker.extractor.extractFeatures(is,0,p,k,vs,pipeReranker.cl); + + int rank=1*ExtractorReranker.s_type; + f.clear(); + for(int j=0;j<vs.length;j++) { + if (vs[j]==Integer.MIN_VALUE) break; + if (vs[j]>0) f.add(pipeReranker.extractor.li.l2i(vs[j]+rank)); + } + f.add(pipeReranker.extractor.li.l2i(1+rank),(float)p.f1); + + float score = (float)(parametersReranker.getScore(f)); + if (score>bestScore) { //rankScore[k][2]> + bestScore =score; + best=k; + + } + } + // change to best + ParseNBest d = parses.get(best); + + las +=(is.length(0)-1)-Decoder.getError(d, is, 0, Decoder.LAS); + lcnt +=is.length(0)-1; + + averageScore+=d.f1; + + SentenceData09 i09 = new 
SentenceData09(instance); + + i09.createSemantic(instance); + + for(int j = 0; j < forms.length-1; j++) { + i09.plabels[j] = types[d.labels[j+1]]; + i09.pheads[j] = d.heads[j+1]; + } + + + depWriter.write(i09); + String info =""+((float)(averageScore/(float)cnt))+" "; + + if (cnt%10 ==0) + del=PipeGen.outValueErr(cnt, lcnt-las, las/lcnt, del, last, 0, info);//outValue(cnt, del,last, info); + + } + + //pipe.close(); + + depWriter.finishWriting(); + long end = System.currentTimeMillis(); + DB.println("rank\n"+rank+"\n"); + DB.println("x-best-las "+amongxbest+" x-best-ula "+amongxbest_ula+" cnt "+cnt+" x-best-las " + +((float)((float)amongxbest/cnt))+ + " x-best-ula "+((float)((float)amongxbest_ula/cnt))+ + " nbest "+((float)nbest/cnt)+ + " 1best "+((float)(rank.get(0)==null?0:rank.get(0))/cnt)+ + " best-proj "+((float)bestProj/cnt)+ + " Sum LAS "+((float)this.smallestErrorSum/countAllNodes)+" "+ + ""+(las/lcnt)); + + // DB.println("errors "+error); + olas+=las; + olcnt+=lcnt; + rank.clear(); + amongxbest=0;amongxbest_ula=0; + cnt=0; + nbest=0; + bestProj=0; + if (maxInfo) System.out.println("Used time " + (end-start)); + if (maxInfo) System.out.println("forms count "+Instances.m_count+" unkown "+Instances.m_unkown); + } + + if (dos !=null) { + dos.flush(); + dos.close(); + } + if (dis!=null)dis.close(); + + DB.println("\n overall las "+(olas/olcnt)); + } + + + /** + * Do the parsing + * @param options + * @param pipe + * @param params + * @throws IOException + */ + private void getNBest(OptionsSuper options, Pipe pipe, ParametersFloat params, boolean maxInfo) throws Exception { + + + CONLLReader09 depReader = new CONLLReader09(options.testfile, options.formatTask); + + ExtractorClusterStacked.initFeatures(); + + int cnt = 0; + + String[] types = new String[pipe.mf.getFeatureCounter().get(PipeGen.REL)]; + for (Entry<String, Integer> e : pipe.mf.getFeatureSet().get(PipeGen.REL).entrySet()) types[e.getValue()] = e.getKey(); + + // System.out.print("Processing Sentence: 
"); + + while(true) { + + Instances is = new Instances(); + is.init(1, new MFB(),options.formatTask); + + SentenceData09 instance = pipe.nextInstance(is, depReader); + if (instance==null) break; + cnt++; + + this.parseNBest(instance, this.paramsParsers[0]); + } + + //pipe.close(); + // depWriter.finishWriting(); + // long end = System.currentTimeMillis(); + // DB.println("rank\n"+rank+"\n"); + // DB.println("x-best-las "+amongxbest+" x-best-ula "+amongxbest_ula+" cnt "+cnt+" x-best-las " + // +((float)((float)amongxbest/cnt))+ + // " x-best-ula "+((float)((float)amongxbest_ula/cnt))+ + // " nbest "+((float)nbest/cnt)+ + // " 1best "+((float)(rank.get(0)==null?0:rank.get(0))/cnt)+ + // " best-proj "+((float)bestProj/cnt)); + // DB.println("errors "+error); + + + } + + + public SentenceData09 parse (SentenceData09 instance, ParametersFloat params) { + + String[] types = new String[pipe.mf.getFeatureCounter().get(PipeGen.REL)]; + for (Entry<String, Integer> e : MFB.getFeatureSet().get(PipeGen.REL).entrySet()) types[e.getValue()] = e.getKey(); + + Instances is = new Instances(); + is.init(1, new MFB(),options.formatTask); + new CONLLReader09().insert(is, instance); + + String[] forms = instance.forms; + + // use for the training ppos + DataF d2; + try { + d2 = pipe.fillVector(params.getFV(), is,0,null,pipe.cl, THREADS,l2i);//cnt-1 + } catch (Exception e ) { + e.printStackTrace(); + return null; + } + short[] pos = is.pposs[0]; + + List<ParseNBest> parses=null; + Parse d= null; + try { + parses =Decoder.decode(pos,d2,options.decodeProjective,pipe.extractor[0]); //cnt-1 + d = parses.get(0); + }catch (Exception e) { + e.printStackTrace(); + } + + if (parses.size()>NBest) parses = parses.subList(0,NBest); + + int g_las = Decoder.getGoldRank(parses, is,0,Decoder.LAS); + int g_ula = Decoder.getGoldRank(parses, is,0,!Decoder.LAS); + + int smallest = Decoder.getSmallestError(parses, is,0,!Decoder.LAS); + smallestErrorSum+=is.length(0)-smallest; + countAllNodes+=is.length(0); + 
+ if (g_las>=0) amongxbest++; + if (g_ula>=0) amongxbest_ula++; + + nbest+=parses.size(); + + Integer r = rank.get(g_las); + if (r==null) rank.put(g_las, 1); + else rank.put(g_las, r+1); + + float err = (float)this.pipe.errors(is,0, d); + + float errBestProj = (float)this.pipe.errors(is,0, Decoder.bestProj); + + if (errBestProj==0) bestProj++; + + SentenceData09 i09 = new SentenceData09(instance); + + i09.createSemantic(instance); + + for(int j = 0; j < forms.length-1; j++) { + i09.plabels[j] = types[d.labels[j+1]]; + i09.pheads[j] = d.heads[j+1]; + } + return i09; + + } + + public List<ParseNBest> parseNBest (SentenceData09 instance, ParametersFloat params) { + + Instances is = new Instances(); + is.init(1, new MFB(),options.formatTask); + new CONLLReader09().insert(is, instance); + + + + // use for the training ppos + DataF d2; + try { + d2 = pipe.fillVector(params.getFV(), is,0,null,pipe.cl,THREADS,l2i);//cnt-1 + } catch (Exception e ) { + e.printStackTrace(); + return null; + } + short[] pos = is.pposs[0]; + + List<ParseNBest> parses=null; + try { + parses =Decoder.decode(pos,d2,options.decodeProjective,pipe.extractor[0]); //cnt-1 + }catch (Exception e) { + e.printStackTrace(); + } + + if (parses.size()>NBest) parses = parses.subList(0,NBest); + + return parses; + + } + + + + /* (non-Javadoc) + * @see is2.tools.Tool#apply(is2.data.SentenceData09) + */ + @Override + public SentenceData09 apply(SentenceData09 snt09) { + + try { + parse(snt09,this.parametersReranker); + } catch(Exception e) { + e.printStackTrace(); + } + + Decoder.executerService.shutdown(); + Pipe.executerService.shutdown(); + + return snt09; + } + + /** + * Write the parsing model + * + * @param options + * @param params + * @param extension + * @throws FileNotFoundException + * @throws IOException + */ + private void writeModell(OptionsSuper options, ParametersFloat params, String extension, Cluster cs) throws FileNotFoundException, IOException { + + String name = 
extension==null?options.modelName:options.modelName+extension; + DB.println("Writting model: "+name); + ZipOutputStream zos = new ZipOutputStream(new BufferedOutputStream(new FileOutputStream(name))); + zos.putNextEntry(new ZipEntry("data")); + DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(zos)); + + MFB.writeData(dos); + cs.write(dos); + + params.write(dos); + + Edges.write(dos); + + dos.writeBoolean(options.decodeProjective); + + dos.writeInt(ExtractorClusterStacked.maxForm); + + dos.writeInt(5); // Info count + dos.writeUTF("Used parser "+Reranker.class.toString()); + dos.writeUTF("Creation date "+(new SimpleDateFormat("yyyy.MM.dd HH:mm:ss")).format(new Date())); + dos.writeUTF("Training data "+options.trainfile); + dos.writeUTF("Iterations "+options.numIters+" Used sentences "+options.count); + dos.writeUTF("Cluster "+options.clusterFile); + + dos.flush(); + dos.close(); + } + + + + +} diff --git a/dependencyParser/mate-tools/classes/is2/parserR2/package.html b/dependencyParser/basic/mate-tools/src/is2/parserR2/package.html index 6b06482..6b06482 100755 --- a/dependencyParser/mate-tools/classes/is2/parserR2/package.html +++ b/dependencyParser/basic/mate-tools/src/is2/parserR2/package.html diff --git a/dependencyParser/basic/mate-tools/src/is2/tag/ExtractorT2.java b/dependencyParser/basic/mate-tools/src/is2/tag/ExtractorT2.java new file mode 100644 index 0000000..a37dbbe --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/tag/ExtractorT2.java @@ -0,0 +1,523 @@ +package is2.tag; + + +import is2.data.Cluster; +import is2.data.F2SF; +import is2.data.Instances; +import is2.data.InstancesTagger; +import is2.data.Long2IntInterface; +import is2.data.ParametersFloat; +import is2.data.PipeGen; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; +import is2.tools.IPipe; +import is2.util.OptionsSuper; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.ArrayList; 
+import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map.Entry; + + + +final public class ExtractorT2 extends PipeGen implements IPipe { + + final static int _MAX=71; + + private static final String STWRD = "STWRD", STPOS = "STPOS"; + + private static short s_pos,s_word,s_char; + protected static short s_type; + private static int _strp,_ewrd; + static int _CEND; + + public String[] types; + + final public MFO mf; + + final MFO.Data4 d1 = new MFO.Data4(),d2 = new MFO.Data4(),d3 = new MFO.Data4(), + dw = new MFO.Data4(), dwp = new MFO.Data4(); + + Cluster cl; + + private OptionsSuper options; + + public ExtractorT2 (OptionsSuper options, MFO mf) throws IOException { + this.mf =mf; + this.options = options; + } + public HashMap<Integer, int[]> _pps = new HashMap<Integer, int[]>(); + + private Lexicon lx; + + public int corpusWrds = 0; + + + + + /* (non-Javadoc) + * @see is2.tag5.IPipe#createInstances(java.lang.String, java.io.File, is2.data.InstancesTagger) + */ + public Instances createInstances(String file) { + return createInstances(file, -1, -1); + } + + + public Instances createInstances(String file, int skipStart, int skipEnd) { + + InstancesTagger is = new InstancesTagger(); + + CONLLReader09 depReader = new CONLLReader09(CONLLReader09.NO_NORMALIZE); + + depReader.startReading(file); + mf.register(POS,"<root-POS>"); + mf.register(WORD,"<root>"); + + System.out.println("Registering feature parts "); + + HashMap<Integer, HashSet<Integer>> pps = new HashMap<Integer, HashSet<Integer>>(); + + int ic=0; + while(true) { + + SentenceData09 instance1 = depReader.getNext(); + + if (instance1== null) break; + ic++; + + String[] w = instance1.forms; + for(int i1 = 0; i1 < w.length; i1++) mf.register(WORD, w[i1]); + for(int i1 = 0; i1 < w.length; i1++) registerChars(CHAR, w[i1]); + for(int i1 = 0; i1 < w.length; i1++) registerChars(CHAR, w[i1].toLowerCase()); + + + w = instance1.plemmas; + for(int i1 = 0; i1 < w.length; 
i1++) mf.register(WORD, w[i1]); + for(int i1 = 0; i1 < w.length; i1++) registerChars(CHAR, w[i1]); + + w = instance1.gpos; + for(int i1 = 0; i1 < w.length; i1++) { + mf.register(POS, w[i1]); + } + for(int i1 = 0; i1 < w.length; i1++) { + HashSet<Integer> ps = pps.get(mf.getValue(POS,w[i1])); + if (ps==null) { + ps= new HashSet<Integer>(); + pps.put(mf.getValue(POS,w[i1]), ps); + } + if (i1+1<w.length) ps.add(mf.getValue(POS,w[i1+1])); + } + + } + + for(Entry<Integer,HashSet<Integer>> e : pps.entrySet()) { + int[] ps = new int[e.getValue().size()]; + int j=0; + for(int k : e.getValue().toArray(new Integer[0])) { + ps[j++] =k; + } + _pps.put(e.getKey(), ps); + // System.out.println("put "+e.getKey()+" "+ps.length+" pps size "+_pps.size()); + } + + System.out.println("words in corpus "+(corpusWrds=mf.getFeatureCounter().get(ExtractorT2.WORD))); + if (options.clusterFile==null)cl = new Cluster(); + else cl= new Cluster(options.clusterFile, mf,6); + + if (options.lexicon==null)lx = new Lexicon(new byte[0][0]); + else lx= new Lexicon(options.lexicon,mf); + + initFeatures(); + + mf.calculateBits(); + initValues(); + + System.out.println(""+mf.toString()); + + depReader.startReading(file); + + int num1 = 0; + + int instanceCount=0; + + System.out.print("Creating Instances: "); + + is.init(ic, mf) ; + int del=0; + + while(true) { + if (num1 % 100 ==0) del = outValue(num1, del); + + if (num1>=skipStart && num1<skipEnd && skipStart>=0) { + SentenceData09 instance1 = depReader.getNext(); + if (instance1== null) break; + num1++; + continue; + } + + + SentenceData09 instance1 = depReader.getNext(is); + if (instance1== null) break; + + is.fillChars(instance1, instanceCount,_CEND); + for(int k=0;k<instance1.length();k++) { + if (instance1.ppos[k].contains("\\|")) + + is.pposs[num1][k] = (short)mf.getValue(FM, instance1.ppos[k].split("\\|")[1]); + } + + + if (num1>options.count) break; + + num1++; + instanceCount++; + } + outValue(num1, del); + System.out.println(); + + types= 
mf.reverse(mf.getFeatureSet().get(POS)); + return is; + } + + private void registerChars(String type, String word) { + for(int i=0;i<word.length();i++) mf.register(type, Character.toString(word.charAt(i))); + } + + + /* (non-Javadoc) + * @see is2.tag5.IPipe#initValues() + */ + public void initValues() { + s_pos = mf.getFeatureBits(POS); + s_word = mf.getFeatureBits(WORD); + s_type = mf.getFeatureBits(TYPE); + s_char = mf.getFeatureBits(CHAR); + + d1.a0 = s_type; d1.a1 = s_pos; d1.a2= s_word;d1.a3= s_word; + d2.a0 = s_type; d2.a1 = s_pos; d2.a2= s_pos; d2.a3= s_pos; d2.a4= s_pos; d2.a5= s_pos; d2.a6= s_pos; + d3.a0 = s_type; d3.a1 = s_pos; d3.a2= s_char; d3.a3= s_char; d3.a4= s_char; d3.a5= s_char; d3.a6= s_char; d3.a7= s_char; + dw.a0 = s_type; dw.a1 = s_pos;dw.a2= s_word; dw.a3= s_word; dw.a4= s_word; dw.a5= s_word; dw.a6= s_word; dw.a7= s_word; + dwp.a0 = s_type; dwp.a1 = s_pos;dwp.a2= s_word ; dwp.a3= s_pos; dwp.a4= s_word; + + } + + /* (non-Javadoc) + * @see is2.tag5.IPipe#initFeatures() + */ + public void initFeatures() { + // 62 + for(int t=0;t<67;t++) mf.register(TYPE, "F"+t); + + mf.register(POS, MID); + _strp = mf.register(POS, STR); + mf.register(POS, END); + + mf.register(WORD, STR); + _ewrd =mf.register(WORD, END); + + _CEND = mf.register(CHAR, END); + + mf.register(WORD,STWRD); + mf.register(POS,STPOS); + + + } + + final public void addFeatures(InstancesTagger is, int ic, String fs,int i, short pposs[], int[] forms, int[] lemmas, long[] vs) { + + int c0= is.chars[ic][i][0], c1=is.chars[ic][i][1], c2=is.chars[ic][i][2], c3=is.chars[ic][i][3], c4=is.chars[ic][i][4],c5=is.chars[ic][i][5]; + int e0 =is.chars[ic][i][6], e1 =is.chars[ic][i][7],e2 =is.chars[ic][i][8],e3 =is.chars[ic][i][9],e4 =is.chars[ic][i][10]; + + int f=1,n=0; + short upper =0, number = 1; + for(int k1=0;k1<fs.length();k1++){ + char c = fs.charAt(k1); + if (Character.isUpperCase(c)) { + if (k1==0) upper=1; + else { + // first char + another + if (upper==1) upper=3; + // another uppercase 
in the word + else if (upper==0) upper=2; + } + } + + // first + if (Character.isDigit(c) && k1==0) number =2 ; + else if (Character.isDigit(c) && number==1) number = 3; + // if(number==2 &&Character.isDigit(c)) number=4; + // if(number==4 && !Character.isDigit(c)) number=5; + } + + // if (i==0 && upper>0) upper+=4; + int form = forms[i], form2 = forms[i]<corpusWrds?forms[i]:-1; + + int len = forms.length; + long l; + d1.v0 = f++; d1.v2=form2; l=mf.calc3(d1); vs[n++]=mf.calc3(d1); + + d1.v0 = f++; d1.v2=is.formlc[ic][i]; vs[n++]=mf.calc3(d1); + + + d3.v2=c0; d3.v3=c1; d3.v4=c2; d3.v5=c3; d3.v6=c4; + d3.v0=f++; vs[n++]=mf.calc3(d3); + d3.v0=f++; vs[n++]=mf.calc4(d3); + d3.v0=f++; vs[n++]=mf.calc5(d3); + d3.v0=f++; vs[n++]=mf.calc6(d3); + d3.v0=f++; vs[n++]=mf.calc7(d3); + + if (form!=-1) { + d3.v2=c2; d3.v3=c3; d3.v4=c4; d3.v5=c5; d3.v6=cl.getLP(form); + d3.v0=f; vs[n++]=mf.calc6(d3); d3.v0=f+1; vs[n++]=mf.calc7(d3); + } + f+=2; + + if (form>0) { + d3.v0=f; d3.v5=cl.getLP(form); vs[n++]=mf.calc6(d3); + d3.v0=f+1; d3.v4=cl.getLP(form); vs[n++]=mf.calc5(d3); + d3.v0=f+2; d3.v3=cl.getLP(form); vs[n++]=mf.calc4(d3); + } + f+=5; + + d3.v2=e0; d3.v3=e1; d3.v4=e2; d3.v5=e3; d3.v6=e4; + d3.v0 =f++; vs[n++]=mf.calc3(d3); + d3.v0 =f++; vs[n++]=l=mf.calc4(d3); vs[n++]=d3.calcs(3, upper, l); + d3.v0 =f++; vs[n++]=l=mf.calc5(d3); vs[n++]=d3.calcs(3, upper, l); + d3.v0 =f++; vs[n++]=l=mf.calc6(d3); vs[n++]=d3.calcs(3, upper, l); + d3.v0 =f++; vs[n++]=l=mf.calc7(d3); vs[n++]=d3.calcs(3, upper, l); + + if (form>0) { + d3.v0=f; d3.v5=cl.getLP(form); vs[n++]=mf.calc6(d3); + d3.v0=f+1; d3.v4=cl.getLP(form); vs[n++]=mf.calc5(d3); + d3.v0=f+2; d3.v3=cl.getLP(form); vs[n++]=mf.calc4(d3); + + d3.v2=e0; d3.v3=e1; d3.v4=e2; + + d3.v0=f+3; d3.v2=lx.getTag(form); vs[n++]=mf.calc3(d3); + d3.v0=f+4; d3.v4=cl.getLP(form); vs[n++]=mf.calc5(d3); + d3.v0=f+5; d3.v3=cl.getLP(form); vs[n++]=mf.calc4(d3); + } + f+=6; + + // sign three-grams + d3.v0=f++;d3.v2=c1; d3.v3=c2; d3.v4=c3; 
vs[n++]=mf.calc5(d3); + d3.v0=f++;d3.v2=c2; d3.v3=c3; d3.v4=c4; vs[n++]=mf.calc5(d3); + d3.v0=f++;d3.v2=c3; d3.v3=c4; d3.v4=c5; vs[n++]=mf.calc5(d3); + + // sign quad-grams + d3.v0=f++;d3.v2=c1; d3.v3=c2; d3.v4=c3; d3.v5=c4; vs[n++]=mf.calc6(d3); + d3.v0=f++;d3.v2=c2; d3.v3=c3; d3.v4=c4; d3.v5=c5; vs[n++]=mf.calc6(d3); // changed to 6 + + if (i+1<len && forms[i+1]<this.corpusWrds) {dw.v0=f; dw.v2=forms[i+1];dw.v3= form2;vs[n++]=mf.calc4(dw);} + f++; + + if (len>i+1) { + + if (forms[i+1]<corpusWrds){dw.v0=f; dw.v2= forms[i+1]; vs[n++]=mf.calc3(dw);} + + d3.v0=f+1; d3.v2 =is.chars[ic][i+1][0];vs[n++]=mf.calc3(d3); + d3.v0=f+2; d3.v2 =is.chars[ic][i+1][6];vs[n++]=mf.calc3(d3); + + d3.v2=e0; d3.v3=e1; + + d3.v0 =f+3; d3.v4 =is.chars[ic][i+1][0];vs[n++]=mf.calc5(d3); + d3.v0 =f+4; d3.v4 =is.chars[ic][i+1][6];vs[n++]=mf.calc5(d3); + + if (is.chars[ic][i+1][11]>1 ) { // instance.forms[i+1].length() + + d3.v0=f+5; d3.v2=is.chars[ic][i+1][0]; d3.v3=is.chars[ic][i+1][1]; vs[n++]=mf.calc4(d3); + d3.v0=f+6; d3.v2=is.chars[ic][i+1][6]; d3.v3=is.chars[ic][i+1][7]; vs[n++]=mf.calc4(d3); + + d3.v2=e0; d3.v3=e1; + + d3.v0=f+7; d3.v4 = is.chars[ic][i+1][0]; d3.v5 =is.chars[ic][i+1][1]; vs[n++]=mf.calc6(d3); + d3.v0=f+8; d3.v4 = is.chars[ic][i+1][6]; d3.v5=is.chars[ic][i+1][7]; vs[n++]=mf.calc6(d3); + + if (forms[i+1]>0) { + d3.v0=f+9; d3.v2=is.chars[ic][i+1][0]; d3.v3=is.chars[ic][i+1][1]; d3.v4 =cl.getLP(forms[i+1]); vs[n++]=mf.calc5(d3); + d3.v0=f+10; d3.v2=is.chars[ic][i+1][6]; d3.v3=is.chars[ic][i+1][7]; d3.v4 =cl.getLP(forms[i+1]); vs[n++]=mf.calc5(d3); + } + } + + if (forms[i+1]>0) { + + + dw.v0=f+11; dw.v2= cl.getLP(forms[i+1]); dw.v3= form2;vs[n++]=mf.calc4(dw); + + // if (forms[i]>0){ + // dw.v0=f+12; dw.v2= cl.getLP(forms[i+1]); dw.v3=lx.getTag(form);vs[n++]=mf.calc4(dw); + // dw.v0=f+13; dw.v2= cl.getLP(forms[i]); dw.v3=lx.getTag(forms[i+1]);vs[n++]=mf.calc4(dw); + // } + } + + + if (len>i+2) { + if (forms[i+2]<corpusWrds && forms[i+1]<corpusWrds) { + dw.v0=f+12; dw.v2= 
forms[i+2]; dw.v3 = forms[i+1];vs[n++]=mf.calc4(dw);vs[n++]=mf.calc3(dw); + } + d2.v0=f+13; d2.v2=pposs[i+1]; d2.v3= pposs[i+2]; vs[n++]=mf.calc4(d2); + } + + if (len>i+3) { + if (forms[i+3]<this.corpusWrds && forms[i+2]<this.corpusWrds) { + dw.v0=f+14; dw.v2= forms[i+3]; dw.v3 = forms[i+2]; vs[n++]=mf.calc4(dw); vs[n++]=mf.calc3(dw); + } + } + } + f+=15; + + // length + d2.v0=f++; d2.v2=is.chars[ic][i][11];vs[n++]=mf.calc3(d2); + + + // contains a number + d2.v0=f++; d2.v2=number; vs[n++]=mf.calc3(d2); + if (lemmas[i]< corpusWrds) {d1.v0=f; d1.v2=lemmas[i]; vs[n++]=mf.calc3(d1); } + f++; + + if (i!=0 &&len>i+1) { + + if (lemmas[i-1]< corpusWrds&& lemmas[i+1]<corpusWrds) {dw.v0=f; dw.v2=lemmas[i-1];dw.v3=lemmas[i+1];vs[n++]=mf.calc4(dw);} + + d2.v0=f+1; d2.v2=pposs[i-1]; d2.v3=pposs[i+1];vs[n++]=mf.calc4(d2); + } + f+=2; + + d2.v0= f++; d2.v2=i>=1? pposs[i-1]:_strp; vs[n++]=mf.calc3(d2); + + if (i>0) { + + dw.v0 = f; dw.v2 =i>=1? forms[i-1]<corpusWrds?forms[i-1]:-1:_strp; vs[n++]=mf.calc3(dw); + f++; + + if (lemmas[i-1]<corpusWrds) {dw.v0 = f; dw.v2 = i>=1? 
lemmas[i-1]:_strp; vs[n++]=mf.calc3(dw);} + f++; + + //if (len>i+1) {d2.v0=f; d2.v2= pposs[i-1];d2.v3= pposs[i+1]; vs[n++]=mf.calc4(d2);} + //f++; + + if (i>1) { + + d2.v0=f++; d2.v2=i<2?_strp: pposs[i-2]; vs[n++]=mf.calc3(d2); + d2.v0=f++; d2.v2= pposs[i-1]; d2.v3= pposs[i-2]; vs[n++]=mf.calc4(d2); + + if (forms[i-2]<corpusWrds) {dw.v0=f;dw.v2= forms[i-2]; vs[n++]=mf.calc3(dw);} f++; + if (forms[i-1]<corpusWrds) {dwp.v0=f;dwp.v2 = forms[i-1]; dwp.v3 = pposs[i-2];vs[n++]=mf.calc4(dwp); } f++; + if (forms[i-2]<corpusWrds) {dwp.v0=f;dwp.v2 = forms[i-2]; dwp.v3 = pposs[i-1];vs[n++]=mf.calc4(dwp);} f++; + + if (i>2) { + d2.v0=f++; d2.v2=pposs[i-3]; vs[n++]=mf.calc3(d2); + d2.v0=f++; d2.v2=pposs[i-2]; d2.v3= pposs[i-3]; vs[n++]=mf.calc4(d2); + if(forms[i-3]<this.corpusWrds && forms[i-2]<this.corpusWrds) { + dw.v0=f; dw.v2 = forms[i-3]; dw.v3 = forms[i-2]; vs[n++]=mf.calc4(dw); + } + f++; + } + } + } + vs[n] = Integer.MIN_VALUE; + } + + + public int fillFeatureVectorsOne(String fs, ParametersFloat params, int w1, InstancesTagger is, int n, short[] pos,Long2IntInterface li, float[] score) { + + float best = -1000; + int bestType = -1; + + F2SF f = new F2SF(params.parameters); + + long vs[] = new long[_MAX]; + int lemmas[]; + if (options.noLemmas) lemmas = new int[is.length(n)]; + else lemmas = is.plemmas[n]; + addFeatures(is,n,fs,w1,pos,is.forms[n], lemmas, vs); + + //for(int t = 0; t < types.length; t++) { + + for(int t=0;t<types.length;t++) { + + int p = t<<s_type; + + f.clear(); + for(int k=0;vs[k]!=Integer.MIN_VALUE;k++) if(vs[k]>0) f.add(li.l2i(vs[k]+p)); + if (f.score > best) { + bestType=t; + score[w1]= best =f.score; + } + } + return bestType; + + } + + public ArrayList<POS> classify(String fs, ParametersFloat params, int w1, InstancesTagger is, int n, short[] pos, Long2IntInterface li) { + + + F2SF f = new F2SF(params.parameters); + + long vs[] = new long[_MAX]; + int lemmas[]; + if (options.noLemmas) lemmas = new int[is.length(n)]; + else lemmas = is.plemmas[n]; 
+ addFeatures(is,n,fs,w1,pos,is.forms[n], lemmas, vs); + + ArrayList<POS> best = new ArrayList<POS>(types.length); + + for(int t=0;t<types.length;t++) { + + int p = t<<s_type; + + f.clear(); + f.add(vs,li, p); + POS px = new POS(t, f.score); + best.add(px); + } + Collections.sort(best); + return best; + + } + + /* (non-Javadoc) + * @see is2.tag5.IPipe#write(java.io.DataOutputStream) + */ + @Override + public void write(DataOutputStream dos){ + try { + this.cl.write(dos); + this.lx.write(dos); + dos.writeInt(this.corpusWrds); + dos.writeInt(_pps.size()); + + for(Entry<Integer,int[]> e : _pps.entrySet()) { + dos.writeInt(e.getValue().length); + for(int k : e.getValue()) dos.writeInt(k); + dos.writeInt(e.getKey()); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + + + public void read(DataInputStream dis){ + try { + this.cl =new Cluster(dis); + this.lx =new Lexicon(dis); + this.corpusWrds = dis.readInt(); + + int pc = dis.readInt(); + for(int j=0;j<pc;j++) { + int ps[] = new int [dis.readInt()]; + for(int k=0;k<ps.length;k++) ps[k]=dis.readInt(); + _pps.put(dis.readInt(), ps); + } + // System.out.println("_pps "+ps.length); + + } catch (IOException e) { + e.printStackTrace(); + } + } +} diff --git a/dependencyParser/basic/mate-tools/src/is2/tag/Lexicon.java b/dependencyParser/basic/mate-tools/src/is2/tag/Lexicon.java new file mode 100644 index 0000000..8a85813 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/tag/Lexicon.java @@ -0,0 +1,140 @@ +/** + * + */ +package is2.tag; + +import is2.data.IEncoderPlus; +import is2.data.PipeGen; +import is2.util.DB; + +import java.io.BufferedReader; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; + +/** + * @author Dr. 
Bernd Bohnet, 07.01.2011 + * + * + */ +public class Lexicon { + + public static final String FR = "FR",TAG = "TAG"; + + final byte[][] word2tag; + public Lexicon(byte[][] w2t) { + + word2tag = w2t; + } + + public Lexicon(String clusterFile, IEncoderPlus mf) { + + final String REGEX = "\t"; + + // register words + try { + BufferedReader inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(clusterFile),"UTF-8"),32768); + + int cnt=0; + String line; + while ((line =inputReader.readLine())!=null) { + + try { + String[] split = line.split(REGEX); + // int f = Integer.parseInt(split[2]); +// if (f>2) { + cnt++; + mf.register(PipeGen.WORD, split[0]); + mf.register(TAG, split[1]); //tag + + if (split.length>1) mf.register(FR, split[1]); // frequency +// } + } catch(Exception e) { + System.out.println("Error in lexicon line "+cnt+" error: "+e.getMessage()); + } + } + System.out.println("read number of words from lexicon "+cnt); + inputReader.close(); + + } catch (Exception e) { + e.printStackTrace(); + } + + word2tag = new byte[mf.getFeatureCounter().get(PipeGen.WORD)][1]; + // insert words + try { + String line; + BufferedReader inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(clusterFile),"UTF-8"),32768); + + while ((line =inputReader.readLine())!=null) { + + String[] split = line.split(REGEX); + int w =mf.getValue(PipeGen.WORD, split[0]); + if (w<0) continue; + word2tag[w][0] = (byte)mf.getValue(TAG, split[1]); + // if (split.length>1) word2tag[w][1]= (byte)mf.getValue(FR, split[2]); // frequency + } + inputReader.close(); + int fill=0; + for(int l = 0; l<word2tag.length; l++ ){ + if (word2tag[l][0]!=0) fill++; + } + System.out.println("filled "+fill+" of "+word2tag.length); + + } catch (Exception e) { + e.printStackTrace(); + } + } + + /** + * Read the cluster + * @param dos + * @throws IOException + */ + public Lexicon(DataInputStream dis) throws IOException { + + word2tag = new byte[dis.readInt()][1]; + for(int i 
=0;i<word2tag.length;i++) { + word2tag[i][0]=dis.readByte(); +// word2tag[i][1]=dis.readByte(); + } + DB.println("Read lexicon with "+word2tag.length+" words "); + } + + /** + * Write the cluster + * @param dos + * @throws IOException + */ + public void write(DataOutputStream dos) throws IOException { + + dos.writeInt(word2tag.length); + for(byte[] i : word2tag) { + dos.writeByte(i[0]); +// dos.writeByte(i[1]); + } + + } + + /** + * @param form + * @return + */ + public int getTag(int form) { + if (word2tag.length<form || form<0) return -1; + return word2tag[form][0]; + } + + /** + * @param form + * @return + */ + public int getConf(int form) { + if (word2tag.length<form || form<0) return -1; + return word2tag[form][1]; + } + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/tag/MFO.java b/dependencyParser/basic/mate-tools/src/is2/tag/MFO.java new file mode 100644 index 0000000..df790e3 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/tag/MFO.java @@ -0,0 +1,537 @@ +package is2.tag; + + +import is2.data.IEncoderPlus; +import is2.util.DB; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map.Entry; + +/** + * Map Features, do not map long to integer + * + * @author Bernd Bohnet, 20.09.2009 + */ + +final public class MFO implements IEncoderPlus { + + /** The features and its values */ + private final HashMap<String,HashMap<String,Integer>> m_featureSets = new HashMap<String,HashMap<String,Integer>>(); + + /** The feature class and the number of values */ + private final HashMap<String,Integer> m_featureCounters = new HashMap<String,Integer>(); + + /** The number of bits needed to encode a feature */ + final HashMap<String,Integer> m_featureBits = new HashMap<String,Integer>(); + + /** Integer counter for long2int */ + //private int count=0; + + /** Stop growing */ + public boolean stop=false; + + final public static String NONE="<None>"; + + + + + + 
final public static class Data4 { + public int shift; + public short a0,a1,a2,a3,a4,a5,a6,a7,a8,a9; + public int v0,v1,v2,v3,v4,v5,v6,v7,v8,v9; + + final public long calcs(int b, long v, long l) { + if (l<0) return l; + l |= v<<shift; + shift +=b; + return l; + } + + + final public long calc2() { + + if (v0<0||v1<0) return -1; + + long l = v0; + shift =a0; + l |= (long)v1<<shift; + shift +=a1; + + return l; + } + + + + final public long calc3() { + + if (v0<0||v1<0||v2<0) return -1; + // if (v1<0||v2<0) return -1; + + long l = v0; + shift =a0; + l |= (long)v1<<shift; + shift +=a1; + l |= (long)v2<<shift; + shift=(short) (shift + a2); + + //shift=; + return l; + } + + + final public long calc4() { + if (v0<0||v1<0||v2<0||v3<0) return -1; + + long l = v0; + shift =a0; + l |= (long)v1<<shift; + shift +=a1; + l |= (long)v2<<shift; + shift +=a2; + l |= (long)v3<<shift; + shift= shift +a3; + + return l; + } + + + + final public long calc5() { + + if (v0<0||v1<0||v2<0||v3<0||v4<0) return -1; + + long l = v0; + shift =a0; + l |= (long)v1<<shift; + shift +=a1; + l |= (long)v2<<shift; + shift +=a2; + l |= (long)v3<<shift; + shift +=a3; + l |= (long)v4<<shift; + shift =shift+a4; + + return l; + } + + + final public long calc6() { + + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0) return -1; + + long l = v0; + shift =a0; + l |= (long)v1<<shift; + shift +=a1; + l |= (long)v2<<shift; + shift +=a2; + l |= (long)v3<<shift; + shift +=a3; + l |= (long)v4<<shift; + shift +=a4; + l |= (long)v5<<shift; + shift =shift+a5; + + return l; + } + + final public long calc7() { + + if (v0<0||v1<0||v2<0||v3<0||v4<0||v5<0||v6<0) return -1; + + long l = v0; + shift =a0; + l |= (long)v1<<shift; + shift +=a1; + l |= (long)v2<<shift; + shift +=a2; + l |= (long)v3<<shift; + shift +=a3; + l |= (long)v4<<shift; + shift +=a4; + l |= (long)v5<<shift; + shift +=a5; + l |= (long)v6<<shift; + shift =shift+a6; + + return l; + } + + + final public long calc8() { + + if 
(v0<0||v1<0||v2<0||v3<0||v4<0||v5<0||v6<0||v7<0) return -1; + + long l = v0; + shift =a0; + l |= (long)v1<<shift; + shift +=a1; + l |= (long)v2<<shift; + shift +=a2; + l |= (long)v3<<shift; + shift +=a3; + l |= (long)v4<<shift; + shift +=a4; + l |= (long)v5<<shift; + shift +=a5; + l |= (long)v6<<shift; + shift +=a6; + l |= (long)v7<<shift; + shift =shift+a7; + + return l; + } + + } + + public MFO () {} + + + // public int size() {return count;} + + + final public void stop() { + stop=true; + } + + final public void start() { + stop=false; + } + + + /** + * Register an attribute class, if it not exists and add a possible value + * @param type + * @param type2 + */ + final public int register(String a, String v) { + + HashMap<String,Integer> fs = getFeatureSet().get(a); + if (fs==null) { + fs = new HashMap<String,Integer>(); + getFeatureSet().put(a, fs); + fs.put(NONE, 0); + getFeatureCounter().put(a, 1); + } + Integer c = getFeatureCounter().get(a); + + Integer i = fs.get(v); + if (i==null) { + fs.put(v, c); + c++; + getFeatureCounter().put(a,c); + return c-1; + } else return i; + } + + /** + * Calculates the number of bits needed to encode a feature + */ + public void calculateBits() { + + int total=0; + for(Entry<String,Integer> e : getFeatureCounter().entrySet() ){ + int bits =(int)Math.ceil((Math.log(e.getValue()+1)/Math.log(2))); + m_featureBits.put(e.getKey(), bits); + total+=bits; + // System.out.println(" "+e.getKey()+" bits "+bits+" number "+(e.getValue()+1)); + } + + // System.out.println("total number of needed bits "+total); + } + + + + @Override + public String toString() { + + StringBuffer content = new StringBuffer(); + for(Entry<String,Integer> e : getFeatureCounter().entrySet() ){ + content.append(e.getKey()+" "+e.getValue()); + content.append(':'); + // HashMap<String,Integer> vs = getFeatureSet().get(e.getKey()); + content.append(getFeatureBits(e.getKey())); + + /*if (vs.size()<120) + for(Entry<String,Integer> e2 : vs.entrySet()) { + 
content.append(e2.getKey()+" ("+e2.getValue()+") "); + }*/ + content.append('\n'); + + } + return content.toString(); + } + + + static final public long calcs(Data4 d,int b, long v, long l) { + if (l<0) return l; + l |= v<<d.shift; + d.shift +=b; + return l; + } + + + final public short getFeatureBits(String a) { + return (short)m_featureBits.get(a).intValue(); + } + + + + /** + * Get the integer place holder of the string value v of the type a + * + * @param t the type + * @param v the value + * @return the integer place holder of v + */ + final public int getValue(String t, String v) { + + if (m_featureSets.get(t)==null) return -1; + Integer vi = m_featureSets.get(t).get(v); + if (vi==null) return -1; //stop && + return vi.intValue(); + } + + /** + * Static version of getValue + * @see getValue + */ + final public int getValueS(String a, String v) { + + if (m_featureSets.get(a)==null) return -1; + Integer vi = m_featureSets.get(a).get(v); + if (vi==null) return -1; //stop && + return vi.intValue(); + } + + public int hasValue(String a, String v) { + + Integer vi = m_featureSets.get(a).get(v); + if (vi==null) return -1; + return vi.intValue(); + } + + + + + final public long calc2(Data4 d) { + + if (d.v0<0||d.v1<0) return -1; + // if (d.v1<0||d.v2<0) return -1; + + long l = d.v0; + short shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + // l |= (long)d.v2<<shift; + d.shift=shift; + + //d.shift=; + return l; + } + + + + final public long calc3(Data4 d) { + + if (d.v0<0||d.v1<0||d.v2<0) return -1; + + long l = d.v0; + short shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + l |= (long)d.v2<<shift; + d.shift=shift + d.a2; + return l; + } + + + final public long calc4(Data4 d) { + if (d.v0<0||d.v1<0||d.v2<0||d.v3<0) return -1; + + long l = d.v0; + int shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + l |= (long)d.v2<<shift; + shift +=d.a2; + l |= (long)d.v3<<shift; + d.shift= shift +d.a3; + + return l; + } + + + final public long calc5(Data4 d) { + 
+ if (d.v0<0||d.v1<0||d.v2<0||d.v3<0||d.v4<0) return -1; + + long l = d.v0; + int shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + l |= (long)d.v2<<shift; + shift +=d.a2; + l |= (long)d.v3<<shift; + shift +=d.a3; + l |= (long)d.v4<<shift; + d.shift =shift+d.a4; + + return l; + } + + + final public long calc6(Data4 d) { + + if (d.v0<0||d.v1<0||d.v2<0||d.v3<0||d.v4<0||d.v5<0) return -1; + + long l = d.v0; + int shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + l |= (long)d.v2<<shift; + shift +=d.a2; + l |= (long)d.v3<<shift; + shift +=d.a3; + l |= (long)d.v4<<shift; + shift +=d.a4; + l |= (long)d.v5<<shift; + d.shift =shift+d.a5; + + return l; + } + + final public long calc7(Data4 d) { + + if (d.v0<0||d.v1<0||d.v2<0||d.v3<0||d.v4<0||d.v5<0||d.v6<0) return -1; + + long l = d.v0; + int shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + l |= (long)d.v2<<shift; + shift +=d.a2; + l |= (long)d.v3<<shift; + shift +=d.a3; + l |= (long)d.v4<<shift; + shift +=d.a4; + l |= (long)d.v5<<shift; + shift +=d.a5; + l |= (long)d.v6<<shift; + d.shift =shift+d.a6; + + return l; + } + + + final public long calc8(Data4 d) { + + if (d.v0<0||d.v1<0||d.v2<0||d.v3<0||d.v4<0||d.v5<0||d.v6<0||d.v7<0) return -1; + + long l = d.v0; + int shift =d.a0; + l |= (long)d.v1<<shift; + shift +=d.a1; + l |= (long)d.v2<<shift; + shift +=d.a2; + l |= (long)d.v3<<shift; + shift +=d.a3; + l |= (long)d.v4<<shift; + shift +=d.a4; + l |= (long)d.v5<<shift; + shift +=d.a5; + l |= (long)d.v6<<shift; + shift +=d.a6; + l |= (long)d.v7<<shift; + d.shift =shift+d.a7; + + return l; + } + + + + + + + + /** + * Maps a long to a integer value. 
This is very useful to save memory for sparse data long values + * @param node + * @return the integer + */ + static public int misses = 0; + static public int good = 0; + + + + + /** + * Write the data + * @param dos + * @throws IOException + */ + public void writeData(DataOutputStream dos) throws IOException { + dos.writeInt(getFeatureSet().size()); + for(Entry<String, HashMap<String,Integer>> e : getFeatureSet().entrySet()) { + dos.writeUTF(e.getKey()); + dos.writeInt(e.getValue().size()); + + for(Entry<String,Integer> e2 : e.getValue().entrySet()) { + + if(e2.getKey()==null) DB.println("key "+e2.getKey()+" value "+e2.getValue()+" e -key "+e.getKey()); + dos.writeUTF(e2.getKey()); + dos.writeInt(e2.getValue()); + + } + + } + } + public void read(DataInputStream din) throws IOException { + + int size = din.readInt(); + for(int i=0; i<size;i++) { + String k = din.readUTF(); + int size2 = din.readInt(); + + HashMap<String,Integer> h = new HashMap<String,Integer>(); + getFeatureSet().put(k,h); + for(int j = 0;j<size2;j++) { + h.put(din.readUTF(), din.readInt()); + } + getFeatureCounter().put(k, size2); + } + + calculateBits(); + } + + + /** + * Clear the data + */ + public void clearData() { + getFeatureSet().clear(); + m_featureBits.clear(); + getFeatureSet().clear(); + } + + public HashMap<String,Integer> getFeatureCounter() { + return m_featureCounters; + } + + public HashMap<String,HashMap<String,Integer>> getFeatureSet() { + return m_featureSets; + } + + static public String[] reverse(HashMap<String,Integer> v){ + String[] set = new String[v.size()]; + for(Entry<String,Integer> e : v.entrySet()) { + set[e.getValue()]=e.getKey(); + } + return set; + } + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/tag/Options.java b/dependencyParser/basic/mate-tools/src/is2/tag/Options.java new file mode 100644 index 0000000..540f8ed --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/tag/Options.java @@ -0,0 +1,125 @@ +package is2.tag; + +import 
is2.util.OptionsSuper; + +import java.io.File; + + +public final class Options extends OptionsSuper { + + + public Options (String[] args) { + + for(int i = 0; i < args.length; i++) { + String[] pair = args[i].split(":"); + + if (pair[0].equals("--help")) explain(); + else if (pair[0].equals("-train")) { + train = true; + trainfile = args[i+1]; + } else if (pair[0].equals("-eval")) { + eval = true; + goldfile =args[i+1]; i++; + } else if (pair[0].equals("-test")) { + test = true; + testfile = args[i+1]; i++; + } else if (pair[0].equals("-i")) { + numIters = Integer.parseInt(args[i+1]); i++; + } + else if (pair[0].equals("-out")) { + outfile = args[i+1]; i++; + } + else if (pair[0].equals("-decode")) { + decodeProjective = args[i+1].equals("proj"); i++; + } + else if (pair[0].equals("-confidence")) { + + conf = true; + } + + else if (pair[0].equals("-count")) { + count = Integer.parseInt(args[i+1]); i++; + } else if (pair[0].equals("-model")) { + modelName = args[i+1]; i++; + } else if (pair[0].equals("-tmp")) { + tmp = args[i+1]; i++; + } else if (pair[0].equals("-format")) { + //format = args[i+1]; + formatTask = Integer.parseInt(args[i+1]); i++; + } else if (pair[0].equals("-allfeatures")) { + allFeatures=true; + } else if (pair[0].equals("-nonormalize")) { + normalize=false; + }else if (pair[0].equals("-nframes")) { + //format = args[i+1]; + nbframes= args[i+1]; i++; + + + } else if (pair[0].equals("-pframes")) { + //format = args[i+1]; + pbframes= args[i+1]; i++; + } else if (pair[0].equals("-nopred")) { + nopred =true; + } else if (pair[0].equals("-divide")) { + keep =true; + } else if (pair[0].equals("-lexicon")) { + lexicon= args[i+1]; i++; + + } else super.addOption(args, i); + + } + + + + + + try { + + if (trainfile!=null) { + + if (keep && tmp!=null) { + trainforest = new File(tmp); + if (!trainforest.exists()) keep=false; + + } else + if (tmp!=null) { + trainforest = File.createTempFile("train", ".tmp", new File(tmp)); + trainforest.deleteOnExit(); + } + 
else { + trainforest = File.createTempFile("train", ".tmp"); //,new File("F:\\") + trainforest.deleteOnExit(); + } + + + } + + + } catch (java.io.IOException e) { + System.out.println("Unable to create tmp files for feature forests!"); + System.out.println(e); + System.exit(0); + } + } + + private void explain() { + System.out.println("Usage: "); + System.out.println("java -class mate.jar is2.parser.Parser [Options]"); + System.out.println(); + System.out.println("Example: "); + System.out.println(" java -class mate.jar is2.parser.Parser -model eps3.model -train corpora/conll08st/train/train.closed -test corpora/conll08st/devel/devel.closed -out b3.test -eval corpora/conll08st/devel/devel.closed -count 2000 -i 6"); + System.out.println(""); + System.out.println("Options:"); + System.out.println(""); + System.out.println(" -train <file> the corpus a model is trained on; default "+this.trainfile); + System.out.println(" -test <file> the input corpus for testing; default "+this.testfile); + System.out.println(" -out <file> the output corpus (result) of a test run; default "+this.outfile); + System.out.println(" -model <file> the parsing model for traing the model is stored in the files"); + System.out.println(" and for parsing the model is load from this file; default "+this.modelName); + System.out.println(" -i <number> the number of training iterations; good numbers are 10 for smaller corpora and 6 for bigger; default "+this.numIters); + System.out.println(" -count <number> the n first sentences of the corpus are take for the training default "+this.count); + System.out.println(" -format <number> conll format of the year 8 or 9; default "+this.formatTask); + + System.exit(0); + } +} diff --git a/dependencyParser/basic/mate-tools/src/is2/tag/POS.java b/dependencyParser/basic/mate-tools/src/is2/tag/POS.java new file mode 100644 index 0000000..c8e039f --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/tag/POS.java @@ -0,0 +1,29 @@ +package is2.tag; + +public 
class POS implements Comparable<POS> { + + // pos tag + public int p; + + // score of the tag + public float s; + + // the position of the word in the sentence + public int w; + + public POS(int p, float s) { + this.p=p; + this.s=s; + } + + @Override + public int compareTo(POS o) { + + return s>o.s?-1:s==o.s?0:1; + } + + public String toString() { + return ""+p+":"+s; + } + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/tag/Tagger.java b/dependencyParser/basic/mate-tools/src/is2/tag/Tagger.java new file mode 100644 index 0000000..b0c2dec --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/tag/Tagger.java @@ -0,0 +1,500 @@ +package is2.tag; + + + +import is2.data.F2SF; +import is2.data.FV; +import is2.data.Instances; +import is2.data.InstancesTagger; +import is2.data.Long2Int; +import is2.data.Long2IntInterface; +import is2.data.ParametersFloat; +import is2.data.PipeGen; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; +import is2.io.CONLLWriter09; +import is2.tools.IPipe; +import is2.tools.Tool; +import is2.tools.Train; +import is2.util.DB; +import is2.util.Evaluator; +import is2.util.OptionsSuper; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Map.Entry; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; +import java.util.zip.ZipOutputStream; + + +public class Tagger implements Tool, Train { + + public ExtractorT2 pipe; + public ParametersFloat params; + public Long2IntInterface li; + public MFO mf; + private OptionsSuper _options; + + /** + * Initialize + * @param options + */ + public Tagger (Options options) { + + + // load the model + try { + readModel(options); + } catch (Exception e) { + e.printStackTrace(); + } + } + public 
Tagger() { } + + /** + * @param modelFileName the file name of the model + */ + public Tagger(String modelFileName) { + this(new Options(new String[]{"-model",modelFileName})); + } + + public static void main (String[] args) throws FileNotFoundException, Exception + { + + long start = System.currentTimeMillis(); + Options options = new Options(args); + + + Tagger tagger = new Tagger(); + + if (options.train) { + + // depReader.normalizeOn=false; + + tagger.li = new Long2Int(options.hsize); + tagger.pipe = new ExtractorT2 (options, tagger.mf= new MFO()); + + //tagger.pipe.li =tagger.li; + + InstancesTagger is = (InstancesTagger)tagger.pipe.createInstances(options.trainfile); + + tagger.params = new ParametersFloat(tagger.li.size()); + + tagger.train(options, tagger.pipe,tagger.params,is); + tagger.writeModel(options, tagger.pipe, tagger.params); + + } + + if (options.test) { + + tagger.readModel(options); + + tagger.out(options,tagger.pipe, tagger.params); + } + + System.out.println(); + + if (options.eval) { + System.out.println("\nEVALUATION PERFORMANCE:"); + Evaluator.evaluateTagger(options.goldfile, options.outfile,options.format); + } + long end = System.currentTimeMillis(); + System.out.println("used time "+((float)((end-start)/100)/10)); + } + + public void readModel(OptionsSuper options) { + + try{ + pipe = new ExtractorT2(options, mf =new MFO()); + _options=options; + // load the model + ZipInputStream zis = new ZipInputStream(new BufferedInputStream(new FileInputStream(options.modelName))); + zis.getNextEntry(); + DataInputStream dis = new DataInputStream(new BufferedInputStream(zis)); + + pipe.mf.read(dis); + pipe.initValues(); + pipe.initFeatures(); + + params = new ParametersFloat(0); + params.read(dis); + li = new Long2Int(params.parameters.length); + pipe.read(dis); + + dis.close(); + + pipe.types = new String[pipe.mf.getFeatureCounter().get(ExtractorT2.POS)]; + for(Entry<String,Integer> e : pipe.mf.getFeatureSet().get(ExtractorT2.POS).entrySet()) + 
pipe.types[e.getValue()] = e.getKey(); + + DB.println("Loading data finished. "); + } catch(Exception e) { + e.printStackTrace(); + } + } + + /** + * Do the training + * @param instanceLengths + * @param options + * @param pipe + * @param params + * @throws IOException + * @throws InterruptedException + * @throws ClassNotFoundException + */ + public void train(OptionsSuper options, IPipe pipe, ParametersFloat params, Instances is2) { + + InstancesTagger is = (InstancesTagger)is2; + String wds[] = mf.reverse(this.pipe.mf.getFeatureSet().get(ExtractorT2.WORD)); + + int pd[] = new int[this.pipe.types.length]; + for(int k=0;k<pd.length;k++) pd[k]=k; + + int del=0; + F2SF f = new F2SF(params.parameters); + long vs[] = new long[ExtractorT2._MAX]; + + int types =this.pipe.types.length; + + double upd = options.numIters*is.size() +1; + + for(int i = 0; i <options.numIters ; i++) { + + long start = System.currentTimeMillis(); + + int numInstances = is.size(); + + long last= System.currentTimeMillis(); + FV pred = new FV(),gold = new FV(); + + int correct =0,count=0; + System.out.print("Iteration "+i+": "); + + for(int n = 0; n < numInstances; n++) { + + if((n+1) % 500 == 0) del= PipeGen.outValueErr(n+1, (count-correct),(float)correct/(float)count,del,last,upd); + + int length = is.length(n); + + upd--; + + for(int w = 1; w < length; w++) { + + double best = -1000; + short bestType = -1; + + int[] lemmas; //= is.lemmas[n]; + if (options.noLemmas)lemmas = new int[is.length(n)]; + else lemmas = is.plemmas[n]; + + this.pipe.addFeatures(is,n,wds[is.forms[n][w]],w,is.gpos[n],is.forms[n], lemmas, vs); + + for(short t=0;t<types;t++) { + + // the hypotheses of a part of speech tag + long p = t<<ExtractorT2.s_type; + f.clear(); + + // add the features to the vector + for(int k1=0;vs[k1]!=Integer.MIN_VALUE;k1++) { + if (vs[k1]>0) f.add(this.li.l2i(vs[k1]|p)); + } + + if (f.score > best) { + bestType=t; + best =f.score; + } + } + + count++; + if (bestType == is.gpos[n][w] ) { + 
correct++; + continue; + } + + pred.clear(); + for (int k1=0;vs[k1]!=Integer.MIN_VALUE;k1++) if (vs[k1]>0) pred.add(this.li.l2i(vs[k1]| bestType<<ExtractorT2.s_type)); + + gold.clear(); + for (int k1=0;vs[k1]!=Integer.MIN_VALUE;k1++) if (vs[k1]>0) gold.add(this.li.l2i(vs[k1] | is.gpos[n][w]<<ExtractorT2.s_type)); + + params.update(pred,gold, (float)upd, 1.0F); + } + } + + long end = System.currentTimeMillis(); + String info = "time "+(end-start); + PipeGen.outValueErr(numInstances, (count-correct),(float)correct/(float)count,del,last,upd,info); + System.out.println(); + del=0; + } + + params.average(options.numIters*is.size()); + + } + + + /** + * Tag a sentence + * @param options + * @param pipe + * @param params + * @throws IOException + */ + public void out (OptionsSuper options, IPipe pipe, ParametersFloat params) { + + try { + + + long start = System.currentTimeMillis(); +// change this backe!!! +// CONLLReader09 depReader = new CONLLReader09(options.testfile, CONLLReader09.NO_NORMALIZE); + CONLLReader09 depReader = new CONLLReader09(options.testfile); + + CONLLWriter09 depWriter = new CONLLWriter09(options.outfile); + + System.out.print("Processing Sentence: "); + pipe.initValues(); + + int cnt = 0; + int del=0; + while(true) { + + InstancesTagger is = new InstancesTagger(); + is.init(1, mf); + SentenceData09 instance = depReader.getNext(is); + if (instance == null || instance.forms == null) break; + + + is.fillChars(instance, 0, ExtractorT2._CEND); + + cnt++; + + + tag(is, instance); + + SentenceData09 i09 = new SentenceData09(instance); + i09.createSemantic(instance); + depWriter.write(i09); + + if(cnt % 100 == 0) del=PipeGen.outValue(cnt, del); + + } + del=PipeGen.outValue(cnt, del); + depWriter.finishWriting(); + + float min=1000, max=-1000; + + // int r[] = new int[14]; + /* + for(Entry<Float, Integer> e : map.entrySet()) { + if(e.getKey()<min)min=e.getKey(); + if(e.getKey()>max)max=e.getKey(); + + if(e.getKey()<0.2) r[0]++; + else if(e.getKey()<0.5) 
r[1]+=e.getValue(); + else if(e.getKey()<0.7) r[2]+=e.getValue(); + else if(e.getKey()<0.8) r[3]+=e.getValue(); + else if(e.getKey()<0.9) r[4]+=e.getValue(); + else if(e.getKey()<1.0) r[5]+=e.getValue(); + else if(e.getKey()<1.2) r[6]+=e.getValue(); + else if(e.getKey()<1.3) r[7]+=e.getValue(); + else if(e.getKey()<1.4) r[8]+=e.getValue(); + else if(e.getKey()<1.5) r[9]+=e.getValue(); + else if(e.getKey()<1.9) r[10]+=e.getValue(); + else if(e.getKey()<2.2) r[11]+=e.getValue(); + else if(e.getKey()<2.5) r[12]+=e.getValue(); + else if(e.getKey()>=2.5) r[13]+=e.getValue(); + } + */ + // for(int k=0;k<r.length;k++) System.out.println(k+" "+r[k][0]+" "+((float)r[k][1]/(float)r[k][0])+" good "+r[k][1]); + // System.out.println("min "+min+" "+max); + + long end = System.currentTimeMillis(); + System.out.println(PipeGen.getSecondsPerInstnace(cnt,(end-start))); + System.out.println(PipeGen.getUsedTime(end-start)); + } catch(Exception e) { + e.printStackTrace(); + } + } + + + public SentenceData09 tag(SentenceData09 instance){ + InstancesTagger is = new InstancesTagger(); + is.init(1, pipe.mf); + new CONLLReader09().insert(is, instance); + is.fillChars(instance, 0, ExtractorT2._CEND); + tag(is, instance); + + return instance; + } + + + private void tag(InstancesTagger is, SentenceData09 instance) { + + int length = instance.ppos.length; + + short[] pos = new short[instance.gpos.length]; + + float sc[] =new float[instance.ppos.length]; + + instance.ppos[0]= is2.io.CONLLReader09.ROOT_POS; + pos[0]=(short)pipe.mf.getValue(ExtractorT2.POS, is2.io.CONLLReader09.ROOT_POS); + + for(int j = 1; j < length; j++) { + + short bestType = (short)pipe.fillFeatureVectorsOne( instance.forms[j],params, j, is,0,pos,this.li,sc); + pos[j] = bestType; + instance.ppos[j]= pipe.types[bestType]; + } + + for(int j = 1; j < length; j++) { + + short bestType = (short)pipe.fillFeatureVectorsOne(instance.forms[j],params, j, is,0,pos,this.li,sc); + instance.ppos[j]= pipe.types[bestType]; + 
pos[j]=bestType; + } + } + + /** + * Tag a single word and return a n-best list of Part-of-Speech tags. + * + * @param is set of sentences + * @param instanceIndex index to the sentence in question + * @param word word to be tagged + * @return n-best list of Part-of-Speech tags + */ + public ArrayList<POS> tag(InstancesTagger is,int instanceIndex, int word, String wordForm) { + + return pipe.classify( wordForm , params, word, is, instanceIndex, is.pposs[instanceIndex], li); + + } + + public ArrayList<String> tagStrings(InstancesTagger is,int instanceIndex, int word, String wordForm) { + + ArrayList<POS> plist = pipe.classify( wordForm , params, word, is, instanceIndex, is.pposs[instanceIndex], li); + String pos[] = mf.reverse(this.pipe.mf.getFeatureSet().get(ExtractorT2.POS)); + + ArrayList<String> postags =null; + for(POS p : plist) { + try { + postags.add(pos[p.p]); + }catch(Exception e) { + e.printStackTrace(); + } + } + return postags; + + + } + + + + /** + * Tag a sentence + * @param options + * @param pipe + * @param parametersReranker + * @throws IOException + */ + public String[] tag (String[] words, String[] lemmas) { + + String[] pposs = new String[words.length]; + + try { + pipe.initValues(); + + int length = words.length+1; + + + InstancesTagger is = new InstancesTagger(); + is.init(1, pipe.mf); + is.createInstance09(length); + + SentenceData09 instance = new SentenceData09(); + instance.forms = new String[length]; + instance.forms[0]=is2.io.CONLLReader09.ROOT; + + instance.plemmas = new String[length]; + instance.plemmas[0]=is2.io.CONLLReader09.ROOT_LEMMA; + + for(int j = 0; j < words.length; j++) { + instance.forms[j+1]=words[j]; + instance.plemmas[j+1]=lemmas[j]; + } + + for(int j = 0; j < length; j++) { + is.setForm(0, j, instance.forms[j]); + is.setLemma(0, j, instance.plemmas[j]); + } + + instance.ppos = new String[length]; + + is.fillChars(instance, 0, ExtractorT2._CEND); + + this.tag(is, instance); + + for(int j = 0; j < words.length; j++) { + 
pposs[j] = instance.ppos[j+1]; + } + } catch(Exception e) { + e.printStackTrace(); + } + + return pposs; + + + } + + /* (non-Javadoc) + * @see is2.tools.Tool#apply(is2.data.SentenceData09) + */ + @Override + public SentenceData09 apply(SentenceData09 snt) { + + + SentenceData09 it = new SentenceData09(); + it.createWithRoot(snt); + it = tag(it); + SentenceData09 i09 = new SentenceData09(it); + i09.createSemantic(it); + return i09; + } + + + + /* (non-Javadoc) + * @see is2.tools.Train#writeModel(is2.util.OptionsSuper, is2.mtag2.Pipe, is2.data.ParametersFloat) + */ + @Override + public void writeModel(OptionsSuper options, IPipe pipe, is2.data.ParametersFloat params) { + try{ + ZipOutputStream zos = new ZipOutputStream(new BufferedOutputStream(new FileOutputStream(options.modelName))); + zos.putNextEntry(new ZipEntry("data")); + DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(zos)); + + this.pipe.mf.writeData(dos); + + DB.println("number of parameters "+params.parameters.length); + dos.flush(); + + params.write(dos); + pipe.write(dos); + dos.flush(); + dos.close(); + + } catch(Exception e) { + e.printStackTrace(); + } + } + +} diff --git a/dependencyParser/mate-tools/classes/is2/tag/package.html b/dependencyParser/basic/mate-tools/src/is2/tag/package.html index 469fdf6..469fdf6 100644 --- a/dependencyParser/mate-tools/classes/is2/tag/package.html +++ b/dependencyParser/basic/mate-tools/src/is2/tag/package.html diff --git a/dependencyParser/basic/mate-tools/src/is2/tools/IPipe.java b/dependencyParser/basic/mate-tools/src/is2/tools/IPipe.java new file mode 100644 index 0000000..d976074 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/tools/IPipe.java @@ -0,0 +1,30 @@ +/** + * + */ +package is2.tools; + +import is2.data.Instances; +import is2.data.InstancesTagger; + +import java.io.DataOutputStream; +import java.io.File; + +/** + * @author Dr. 
package is2.tools;

import is2.data.SentenceData09;

/**
 * Provides Methods for the retraining
 * @author bohnetbd
 *
 */
public interface Retrainable {

	/**
	 * Retrains with a update factor (upd).
	 * The retraining stops when the model was successful adapted or it gave up
	 * after the maximal iterations.
	 *
	 * @param sentence the data container of the new example.
	 * @param upd the update factor, e.g. 0.01
	 * @param iterations maximal number of iterations that are tried to adapt the system.
	 * @return success = true -- else false
	 */
	public boolean retrain(SentenceData09 sentence, float upd, int iterations) ;

	/**
	 * Same as {@link #retrain(SentenceData09, float, int)}, but optionally
	 * emits progress information while adapting.
	 *
	 * @param sentence the data container of the new example.
	 * @param upd the update factor, e.g. 0.01
	 * @param iterations maximal number of iterations tried to adapt the system.
	 * @param print true to print progress information during retraining
	 * @return true on success, false otherwise
	 */
	boolean retrain(SentenceData09 sentence, float upd, int iterations, boolean print);

}
+ * + * @param i the input sentence + * @return The result of the performance without the root. + */ + SentenceData09 apply(SentenceData09 snt09); + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/tools/ToolIO.java b/dependencyParser/basic/mate-tools/src/is2/tools/ToolIO.java new file mode 100644 index 0000000..279a4ff --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/tools/ToolIO.java @@ -0,0 +1,17 @@ +/** + * + */ +package is2.tools; + +import is2.data.SentenceData09; + +/** + * @author Bernd Bohnet, 27.10.2010 + * + * Interface to all tools + */ +public interface ToolIO { + + void readModel(); + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/tools/Train.java b/dependencyParser/basic/mate-tools/src/is2/tools/Train.java new file mode 100644 index 0000000..234f937 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/tools/Train.java @@ -0,0 +1,25 @@ +/** + * + */ +package is2.tools; + +import is2.data.Instances; +import is2.data.ParametersFloat; +import is2.util.OptionsSuper; + +/** + * @author Dr. 
Bernd Bohnet, 24.12.2010 + * + * + */ +public interface Train { + + public abstract void writeModel(OptionsSuper options, IPipe pipe, ParametersFloat params); + + public abstract void readModel(OptionsSuper options); + + public abstract void train(OptionsSuper options, IPipe pipe, ParametersFloat params, Instances is); + + public abstract void out(OptionsSuper options, IPipe pipe, ParametersFloat params); + +} \ No newline at end of file diff --git a/dependencyParser/basic/mate-tools/src/is2/util/Convert.java b/dependencyParser/basic/mate-tools/src/is2/util/Convert.java new file mode 100644 index 0000000..1ed2389 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/util/Convert.java @@ -0,0 +1,455 @@ +/** + * + */ +package is2.util; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.Reader; +import java.nio.charset.Charset; +import java.util.ArrayList; + + +import is2.data.SentenceData09; +import is2.io.CONLLReader06; +import is2.io.CONLLReader08; +import is2.io.CONLLReader09; +import is2.io.CONLLWriter06; +import is2.io.CONLLWriter09; + +/** + * @author Dr. 
Bernd Bohnet, 01.03.2010 + * + * + */ +public class Convert { + + + + public static void main(String args[]) throws Exception { + + + if (args.length<2) { + + System.out.println("Usage"); + System.out.println(" java is2.util.Convert <in> <out> [-w06|-w0809|-yue] [-wordsonly]"); + + + } + + int todo =9; + boolean wordsOnly=false; + for(String a : args) { + if (a!=null && a.equals("-w06")) todo=6; + else if (a!=null && a.equals("-w0809")) todo=89; + else if (a!=null && a.equals("-yue")) todo=99; + else if (a!=null && a.equals("-utf8")) todo=8; + + if (a!=null && a.equals("-wordsonly")) wordsOnly=true; + + + } + + if (todo==9)convert(args[0],args[1]); + else if (todo==6) convert0906(args[0],args[1]); + else if (todo==8) convert8(args[0],args[1], args[2]); + else if (todo==89) convert0809(args[0],args[1]); + else if (todo==99) { + convertChnYue(args[0],args[1],wordsOnly); + } + + + } + + private static void convert8(String infile, String outfile, String format) { + try { + + System.out.println("availableCharsets: "+Charset.availableCharsets()); + + BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(infile), format)); + BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outfile), "UTF8")); + ; + int ch; + + int count =0, wcount=0;; + while ((ch = in.read()) > -1) { + count++; + + if (Character.isDefined(ch)) { + + out.write(ch); + wcount++; + } + } + in.close(); + out.close(); + System.out.println("read "+count+" chars and wrote "+wcount+" utf8 chars"); + } + catch (Exception e) { + e.printStackTrace(); + } + + } + + public static void convert(String source, String target) throws Exception { + + CONLLReader06 reader = new CONLLReader06(source); + CONLLWriter09 writer = new CONLLWriter09(target); + + int str =0; + while (true) { + SentenceData09 i = reader.getNext(); + str++; + if (i == null) break; + + + String[] formsNoRoot = new String[i.length()-1]; + String[] posNoRoot = new String[formsNoRoot.length]; + 
String[] lemmas = new String[formsNoRoot.length]; + + String[] org_lemmas = new String[formsNoRoot.length]; + + String[] of = new String[formsNoRoot.length]; + String[] pf = new String[formsNoRoot.length]; + + String[] pposs = new String[formsNoRoot.length]; + String[] labels = new String[formsNoRoot.length]; + String[] fillp = new String[formsNoRoot.length]; + + int[] heads = new int[formsNoRoot.length]; + + + + for(int j = 0; j < formsNoRoot.length; j++) { + formsNoRoot[j] = i.forms[j+1]; + if (formsNoRoot[j].length()==0 ||formsNoRoot[j].equals("")) { + System.out.println("error forms "+str); + // System.exit(0); + formsNoRoot[j]=" "; + } + posNoRoot[j] = i.gpos[j+1]; + if (posNoRoot[j].length()==0 ||posNoRoot[j].equals(" ")) { + System.out.println("error pos "+str); + // System.exit(0); + } + pposs[j] = i.ppos[j+1]; + if (pposs[j].length()==0 ||pposs[j].equals(" ")) { + System.out.println("error pos "+str); + //System.exit(0); + } + + labels[j] = i.labels[j+1]; + if (labels[j].length()==0 ||labels[j].equals(" ")) { + System.out.println("error lab "+str); + // System.exit(0); + } + heads[j] = i.heads[j+1]; + if(heads[j]> posNoRoot.length) { + System.out.println("head out of range "+heads[j]+" "+heads.length+" "+str); + heads[j]=posNoRoot.length; + } + + lemmas[j] = i.plemmas[j+1]; + if (lemmas[j].length()==0 ||lemmas[j].equals(" ")) { + System.out.println("error lab "+str); + // System.exit(0); + } + org_lemmas[j] = i.lemmas[j+1]; + if (org_lemmas[j].length()==0 ||org_lemmas[j].equals(" ")) { + System.out.println("error lab "+str); + // System.exit(0); + } + of[j] = i.ofeats[j+1]; + pf[j] = i.pfeats[j+1]; + if (str==6099) { + // System.out.println(formsNoRoot[j]+"\t"+posNoRoot[j]+"\t"+pposs[j]+"\t"+labels[j]+"\t"+heads[j]); + } + + // (instance.fillp!=null) fillp[j] = instance.fillp[j+1]; + } + + SentenceData09 i09 = new SentenceData09(formsNoRoot, formsNoRoot, formsNoRoot,pposs, pposs, labels, heads,fillp,of, pf); + + //public SentenceData09(String[] forms, 
String[] lemmas, String[] olemmas,String[] gpos, String[] ppos, String[] labs, int[] heads, String[] fillpred) { + //SentenceData09 + // SentenceData09 i2 = new SentenceData09(i.forms, i.lemmas,i.org_lemmas,); + + writer.write(i09); + + + } + writer.finishWriting(); + + + } + + + + public static void convertChnYue(String source, String target, boolean wordsOnly) throws Exception { + + + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(source),"UTF-8"),32768); + + CONLLWriter09 writer = new CONLLWriter09(target); + + int str =0; + while (true) { + + ArrayList<String[]> lines = new ArrayList<String[]>(); + + String line; + while((line = reader.readLine())!=null) { + + if (line.length()<2) break; + String split[] = line.split("\t"); + lines.add(split); + } + if (line ==null)break; + + str++; + + + String[] formsNoRoot = new String[lines.size()]; + String[] posNoRoot = new String[formsNoRoot.length]; + String[] lemmas = new String[formsNoRoot.length]; + + String[] org_lemmas = new String[formsNoRoot.length]; + + String[] of = new String[formsNoRoot.length]; + String[] pf = new String[formsNoRoot.length]; + + String[] pposs = new String[formsNoRoot.length]; + String[] labels = new String[formsNoRoot.length]; + String[] fillp = new String[formsNoRoot.length]; + + int[] heads = new int[formsNoRoot.length]; + + + + for(int j = 0; j < formsNoRoot.length; j++) { + formsNoRoot[j] = lines.get(j)[0]; + if (formsNoRoot[j].length()==0 ||formsNoRoot[j].equals("")) { + System.out.println("error forms "+str); + // System.exit(0); + formsNoRoot[j]="_"; + } + + posNoRoot[j] = lines.get(j)[1]; + if (posNoRoot[j].length()==0 ||posNoRoot[j].equals(" ")) { + System.out.println("error pos "+str); + // System.exit(0); + } + pposs[j] = "_"; + + labels[j] = lines.get(j)[3]; + if (labels[j].length()==0 ||labels[j].equals(" ")) { + System.out.println("error lab "+str); + labels[j] = "_"; + // System.exit(0); + } + heads[j] = 
Integer.parseInt(lines.get(j)[2])+1; + if(heads[j]> posNoRoot.length) { + System.out.println("head out of range "+heads[j]+" "+heads.length+" "+str); + heads[j]=posNoRoot.length; + } + + // 0 is root and not -1 + if (heads[j]==-1)heads[j]=0; + + lemmas[j] = "_"; + + org_lemmas[j] = "_"; + + of[j] = "_"; + pf[j] = "_"; + + if (wordsOnly) { + posNoRoot[j]="_"; + heads[j]=0; + labels[j] = "_"; + } + + // (instance.fillp!=null) fillp[j] = instance.fillp[j+1]; + } + + SentenceData09 i09 = new SentenceData09(formsNoRoot, lemmas, org_lemmas,posNoRoot, posNoRoot, labels, heads,fillp,of, pf); + + //public SentenceData09(String[] forms, String[] lemmas, String[] olemmas,String[] gpos, String[] ppos, String[] labs, int[] heads, String[] fillpred) { + //SentenceData09 + // SentenceData09 i2 = new SentenceData09(i.forms, i.lemmas,i.org_lemmas,); + + writer.write(i09); + + + } + writer.finishWriting(); + + + } + + + + /** + * Convert the 0 + * @param source + * @param target + * @throws Exception + */ + public static void convert0809(String source, String target) throws Exception { + + CONLLReader08 reader = new CONLLReader08(source); + CONLLWriter09 writer = new CONLLWriter09(target); + + int str =0; + while (true) { + SentenceData09 i = reader.getNext(); + str++; + if (i == null) break; + + + String[] formsNoRoot = new String[i.length()-1]; + String[] posNoRoot = new String[formsNoRoot.length]; + String[] lemmas = new String[formsNoRoot.length]; + + String[] org_lemmas = new String[formsNoRoot.length]; + + String[] of = new String[formsNoRoot.length]; + String[] pf = new String[formsNoRoot.length]; + + String[] pposs = new String[formsNoRoot.length]; + String[] labels = new String[formsNoRoot.length]; + String[] fillp = new String[formsNoRoot.length]; + + int[] heads = new int[formsNoRoot.length]; + + + + for(int j = 0; j < formsNoRoot.length; j++) { + formsNoRoot[j] = i.forms[j+1]; + if (formsNoRoot[j].length()==0 ||formsNoRoot[j].equals("")) { + System.out.println("error 
forms "+str); + // System.exit(0); + formsNoRoot[j]=" "; + } + posNoRoot[j] = i.gpos[j+1]; + if (posNoRoot[j].length()==0 ||posNoRoot[j].equals(" ")) { + System.out.println("error pos "+str); + // System.exit(0); + } + pposs[j] = i.ppos[j+1]; + if (pposs[j].length()==0 ||pposs[j].equals(" ")) { + System.out.println("error pos "+str); + //System.exit(0); + } + + labels[j] = i.labels[j+1]; + if (labels[j].length()==0 ||labels[j].equals(" ")) { + System.out.println("error lab "+str); + // System.exit(0); + } + heads[j] = i.heads[j+1]; + if(heads[j]> posNoRoot.length) { + System.out.println("head out of range "+heads[j]+" "+heads.length+" "+str); + heads[j]=posNoRoot.length; + } + + lemmas[j] = i.plemmas[j+1]; + if (lemmas[j].length()==0 ||lemmas[j].equals(" ")) { + System.out.println("error lab "+str); + // System.exit(0); + } + org_lemmas[j] = i.lemmas[j+1]; + // if (org_lemmas[j].length()==0 ||org_lemmas[j].equals(" ")) { + // System.out.println("error lab "+str); + // // System.exit(0); + // } +// of[j] = i.ofeats[j+1]; +// pf[j] = i.pfeats[j+1]; + if (str==6099) { + // System.out.println(formsNoRoot[j]+"\t"+posNoRoot[j]+"\t"+pposs[j]+"\t"+labels[j]+"\t"+heads[j]); + } + + // (instance.fillp!=null) fillp[j] = instance.fillp[j+1]; + } + + SentenceData09 i09 = new SentenceData09(formsNoRoot, org_lemmas, lemmas,pposs, pposs, labels, heads,fillp,of, pf); + + //public SentenceData09(String[] forms, String[] lemmas, String[] olemmas,String[] gpos, String[] ppos, String[] labs, int[] heads, String[] fillpred) { + //SentenceData09 + // SentenceData09 i2 = new SentenceData09(i.forms, i.lemmas,i.org_lemmas,); + + writer.write(i09); + + + } + writer.finishWriting(); + + + } + + + public static void convert0906(String source, String target) throws Exception { + + CONLLReader09 reader = new CONLLReader09(source); + CONLLWriter06 writer = new CONLLWriter06(target); + + + while (true) { + SentenceData09 i = reader.getNext(); + + if (i == null) break; + + + String[] formsNoRoot = 
new String[i.length()-1]; + String[] posNoRoot = new String[formsNoRoot.length]; + String[] lemmas = new String[formsNoRoot.length]; + + String[] org_lemmas = new String[formsNoRoot.length]; + + String[] of = new String[formsNoRoot.length]; + String[] pf = new String[formsNoRoot.length]; + + String[] pposs = new String[formsNoRoot.length]; + String[] labels = new String[formsNoRoot.length]; + String[] fillp = new String[formsNoRoot.length]; + + int[] heads = new int[formsNoRoot.length]; + + for(int j = 0; j < formsNoRoot.length; j++) { + formsNoRoot[j] = i.forms[j+1]; + posNoRoot[j] = i.gpos[j+1]; + pposs[j] = i.gpos[j+1]; + + labels[j] = i.labels[j+1]; + heads[j] = i.heads[j+1]; + lemmas[j] = i.plemmas[j+1]; + + org_lemmas[j] = i.lemmas[j+1]; + of[j] = i.ofeats[j+1]; + pf[j] = i.pfeats[j+1]; + + // (instance.fillp!=null) fillp[j] = instance.fillp[j+1]; + } + + SentenceData09 i09 = new SentenceData09(formsNoRoot, lemmas, org_lemmas,posNoRoot, pposs, labels, heads,fillp,of, pf); + + //public SentenceData09(String[] forms, String[] lemmas, String[] olemmas,String[] gpos, String[] ppos, String[] labs, int[] heads, String[] fillpred) { + //SentenceData09 + // SentenceData09 i2 = new SentenceData09(i.forms, i.lemmas,i.org_lemmas,); + + writer.write(i09); + + + } + writer.finishWriting(); + + + } + + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/util/Convert0409.java b/dependencyParser/basic/mate-tools/src/is2/util/Convert0409.java new file mode 100644 index 0000000..7fc1142 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/util/Convert0409.java @@ -0,0 +1,182 @@ +/** + * + */ +package is2.util; + +import is2.data.SentenceData09; +import is2.io.CONLLReader04; +import is2.io.CONLLReader06; +import is2.io.CONLLReader09; +import is2.io.CONLLWriter06; +import is2.io.CONLLWriter09; + +/** + * @author Dr. 
Bernd Bohnet, 01.03.2010 + * + * + */ +public class Convert0409 { + + + + public static void main(String args[]) throws Exception { + + convert(args[0],args[1]); + + + } + + public static void convert(String source, String target) throws Exception { + + CONLLReader04 reader = new CONLLReader04(source); + CONLLWriter09 writer = new CONLLWriter09(target); + + int str =0; + while (true) { + SentenceData09 i = reader.getNext(); + str++; + if (i == null) break; + + + String[] formsNoRoot = new String[i.length()-1]; + String[] posNoRoot = new String[formsNoRoot.length]; + String[] lemmas = new String[formsNoRoot.length]; + + String[] org_lemmas = new String[formsNoRoot.length]; + + String[] of = new String[formsNoRoot.length]; + String[] pf = new String[formsNoRoot.length]; + + String[] pposs = new String[formsNoRoot.length]; + String[] labels = new String[formsNoRoot.length]; + String[] fillp = new String[formsNoRoot.length]; + + int[] heads = new int[formsNoRoot.length]; + + + + for(int j = 0; j < formsNoRoot.length; j++) { + formsNoRoot[j] = i.forms[j+1]; + if (formsNoRoot[j].length()==0 ||formsNoRoot[j].equals("")) { + System.out.println("error forms "+str); + // System.exit(0); + formsNoRoot[j]=" "; + } + posNoRoot[j] = i.gpos[j+1]; + if (posNoRoot[j].length()==0 ||posNoRoot[j].equals(" ")) { + System.out.println("error pos "+str); + // System.exit(0); + } + pposs[j] = i.ppos[j+1]; + if (pposs[j].length()==0 ||pposs[j].equals(" ")) { + System.out.println("error pos "+str); + //System.exit(0); + } + + labels[j] = i.labels[j+1]; + if (labels[j].length()==0 ||labels[j].equals(" ")) { + System.out.println("error lab "+str); + // System.exit(0); + } + heads[j] = i.heads[j+1]; + if(heads[j]> posNoRoot.length) { + System.out.println("head out of range "+heads[j]+" "+heads.length+" "+str); + heads[j]=posNoRoot.length; + } + + lemmas[j] = i.plemmas[j+1]; + if (lemmas[j].length()==0 ||lemmas[j].equals(" ")) { + System.out.println("error lab "+str); + // System.exit(0); + } + 
org_lemmas[j] = i.lemmas[j+1]; + if (org_lemmas[j].length()==0 ||org_lemmas[j].equals(" ")) { + System.out.println("error lab "+str); + // System.exit(0); + } + of[j] = i.ofeats[j+1]; + pf[j] = i.pfeats[j+1]; + if (str==6099) { + // System.out.println(formsNoRoot[j]+"\t"+posNoRoot[j]+"\t"+pposs[j]+"\t"+labels[j]+"\t"+heads[j]); + } + + // (instance.fillp!=null) fillp[j] = instance.fillp[j+1]; + } + + SentenceData09 i09 = new SentenceData09(formsNoRoot, lemmas, org_lemmas,pposs, pposs, labels, heads,fillp,of, pf); + + //public SentenceData09(String[] forms, String[] lemmas, String[] olemmas,String[] gpos, String[] ppos, String[] labs, int[] heads, String[] fillpred) { + //SentenceData09 + // SentenceData09 i2 = new SentenceData09(i.forms, i.lemmas,i.org_lemmas,); + + writer.write(i09); + + + } + writer.finishWriting(); + + + } + + + public static void convert0906(String source, String target) throws Exception { + + CONLLReader09 reader = new CONLLReader09(source); + CONLLWriter06 writer = new CONLLWriter06(target); + + + while (true) { + SentenceData09 i = reader.getNext(); + + if (i == null) break; + + + String[] formsNoRoot = new String[i.length()-1]; + String[] posNoRoot = new String[formsNoRoot.length]; + String[] lemmas = new String[formsNoRoot.length]; + + String[] org_lemmas = new String[formsNoRoot.length]; + + String[] of = new String[formsNoRoot.length]; + String[] pf = new String[formsNoRoot.length]; + + String[] pposs = new String[formsNoRoot.length]; + String[] labels = new String[formsNoRoot.length]; + String[] fillp = new String[formsNoRoot.length]; + + int[] heads = new int[formsNoRoot.length]; + + for(int j = 0; j < formsNoRoot.length; j++) { + formsNoRoot[j] = i.forms[j+1]; + posNoRoot[j] = i.gpos[j+1]; + pposs[j] = i.ppos[j+1]; + + labels[j] = i.labels[j+1]; + heads[j] = i.heads[j+1]; + lemmas[j] = i.plemmas[j+1]; + + org_lemmas[j] = i.lemmas[j+1]; + of[j] = i.ofeats[j+1]; + pf[j] = i.pfeats[j+1]; + + // (instance.fillp!=null) fillp[j] = 
instance.fillp[j+1]; + } + + SentenceData09 i09 = new SentenceData09(formsNoRoot, lemmas, org_lemmas,posNoRoot, pposs, labels, heads,fillp,of, pf); + + //public SentenceData09(String[] forms, String[] lemmas, String[] olemmas,String[] gpos, String[] ppos, String[] labs, int[] heads, String[] fillpred) { + //SentenceData09 + // SentenceData09 i2 = new SentenceData09(i.forms, i.lemmas,i.org_lemmas,); + + writer.write(i09); + + + } + writer.finishWriting(); + + + } + + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/util/ConvertADJ.java b/dependencyParser/basic/mate-tools/src/is2/util/ConvertADJ.java new file mode 100644 index 0000000..b30aabb --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/util/ConvertADJ.java @@ -0,0 +1,129 @@ +/** + * + */ +package is2.util; + +import is2.data.SentenceData09; +import is2.io.CONLLReader04; +import is2.io.CONLLReader06; +import is2.io.CONLLReader09; +import is2.io.CONLLWriter06; +import is2.io.CONLLWriter09; + +/** + * @author Dr. 
Bernd Bohnet, 01.03.2010 + * + * + */ +public class ConvertADJ { + + + + public static void main(String args[]) throws Exception { + + convert(args[0],args[1]); + + + } + + public static void convert(String source, String target) throws Exception { + + CONLLReader09 reader = new CONLLReader09(source); +// CONLLWriter09 writer = new CONLLWriter09(target); + int adj=0,argadj=0; + int rb=0,argrb=0; + int str =0; + while (true) { + SentenceData09 i = reader.getNext(); + str++; + if (i == null) break; + + + for (int k =0;k<i.length();k++) { + + if (i.gpos[k].startsWith("JJ")) adj++; + if (i.gpos[k].startsWith("RB")) rb++; + + if (i.argposition!=null) { + for(int p=0;p<i.argposition.length;p++) { + if(i.argposition[p]!=null) + for(int a=0;a<i.argposition[p].length;a++) { + if(i.argposition[p][a]==k && i.gpos[k].startsWith("JJ")) argadj ++; + if(i.argposition[p][a]==k && i.gpos[k].startsWith("RB")) argrb ++; + } + + } + } + // (instance.fillp!=null) fillp[j] = instance.fillp[j+1]; + } + + + + } + System.out.println("adj "+adj+ " "+argadj); + System.out.println("rb "+rb+ " "+argrb); + + } + + + public static void convert0906(String source, String target) throws Exception { + + CONLLReader09 reader = new CONLLReader09(source); + CONLLWriter06 writer = new CONLLWriter06(target); + + + while (true) { + SentenceData09 i = reader.getNext(); + + if (i == null) break; + + + String[] formsNoRoot = new String[i.length()-1]; + String[] posNoRoot = new String[formsNoRoot.length]; + String[] lemmas = new String[formsNoRoot.length]; + + String[] org_lemmas = new String[formsNoRoot.length]; + + String[] of = new String[formsNoRoot.length]; + String[] pf = new String[formsNoRoot.length]; + + String[] pposs = new String[formsNoRoot.length]; + String[] labels = new String[formsNoRoot.length]; + String[] fillp = new String[formsNoRoot.length]; + + int[] heads = new int[formsNoRoot.length]; + + for(int j = 0; j < formsNoRoot.length; j++) { + formsNoRoot[j] = i.forms[j+1]; + posNoRoot[j] = 
i.gpos[j+1]; + pposs[j] = i.ppos[j+1]; + + labels[j] = i.labels[j+1]; + heads[j] = i.heads[j+1]; + lemmas[j] = i.plemmas[j+1]; + + org_lemmas[j] = i.lemmas[j+1]; + of[j] = i.ofeats[j+1]; + pf[j] = i.pfeats[j+1]; + + // (instance.fillp!=null) fillp[j] = instance.fillp[j+1]; + } + + SentenceData09 i09 = new SentenceData09(formsNoRoot, lemmas, org_lemmas,posNoRoot, pposs, labels, heads,fillp,of, pf); + + //public SentenceData09(String[] forms, String[] lemmas, String[] olemmas,String[] gpos, String[] ppos, String[] labs, int[] heads, String[] fillpred) { + //SentenceData09 + // SentenceData09 i2 = new SentenceData09(i.forms, i.lemmas,i.org_lemmas,); + + writer.write(i09); + + + } + writer.finishWriting(); + + + } + + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/util/ConvertLowerCase0909.java b/dependencyParser/basic/mate-tools/src/is2/util/ConvertLowerCase0909.java new file mode 100644 index 0000000..e8f19f3 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/util/ConvertLowerCase0909.java @@ -0,0 +1,89 @@ +/** + * + */ +package is2.util; + +import is2.data.SentenceData09; +import is2.io.CONLLReader06; +import is2.io.CONLLReader09; +import is2.io.CONLLWriter06; +import is2.io.CONLLWriter09; + +/** + * @author Dr. 
Bernd Bohnet, 01.03.2010 + * + * + */ +public class ConvertLowerCase0909 { + + + + public static void main(String args[]) throws Exception { + + + + CONLLReader09 reader = new CONLLReader09(args[0]); + CONLLWriter09 writer = new CONLLWriter09(args[1]); + + int str =0; + while (true) { + SentenceData09 i = reader.getNext(); + str++; + if (i == null) break; + + SentenceData09 i09 = new SentenceData09(i); + i09.createSemantic(i); + + for(int k=0;k<i09.length();k++) { + i09.lemmas[k]=i09.lemmas[k].toLowerCase(); + i09.plemmas[k]=i09.plemmas[k].toLowerCase(); + + } + + writer.write(i09); + + + } + writer.finishWriting(); + + + } + + public static void convert(String source, String target) throws Exception { + + CONLLReader09 reader = new CONLLReader09(source); + CONLLWriter09 writer = new CONLLWriter09(target); + + int str =0; + while (true) { + SentenceData09 i = reader.getNext(); + str++; + if (i == null) break; + + SentenceData09 i09 = new SentenceData09(i); + i09.createSemantic(i); + + for(int k=0;k<i09.length();k++) { + i09.lemmas[k]=i09.lemmas[k].toLowerCase(); + i09.plemmas[k]=i09.plemmas[k].toLowerCase(); + + } + + //public SentenceData09(String[] forms, String[] lemmas, String[] olemmas,String[] gpos, String[] ppos, String[] labs, int[] heads, String[] fillpred) { + //SentenceData09 + // SentenceData09 i2 = new SentenceData09(i.forms, i.lemmas,i.org_lemmas,); + + writer.write(i09); + + + } + writer.finishWriting(); + + + } + + + + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/util/ConvertTiger2CoNLL.java b/dependencyParser/basic/mate-tools/src/is2/util/ConvertTiger2CoNLL.java new file mode 100644 index 0000000..bb528f7 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/util/ConvertTiger2CoNLL.java @@ -0,0 +1,124 @@ +/** + * + */ +package is2.util; + + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import 
java.util.StringTokenizer; + +/** + * @author Dr. Bernd Bohnet, 17.01.2010 + * + * This class removes all information from a conll 2009 file except of columns 1 and 2 + * that contain the word id and the word form. + */ +public class ConvertTiger2CoNLL { + + public static void main (String[] args) throws IOException { + + + OptionsSuper options = new OptionsSuper(args,null); + + if (options.trainfile!= null){ + System.err.println("included sentences "+clean(options.trainfile, options.outfile, options.start, options.count)); + } + else System.err.println("Please proivde the file name -train <file-name>"); + + } + + /** + * @param trainfile + * @throws IOException + */ + private static int clean(String file, String outFile, int start, int numberOfSentences) throws IOException { + + System.err.println("writting to "+outFile); + System.err.println("start "+start+" to "+(start+numberOfSentences)); + int state=0; + + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file),"UTF-8"),32768); + BufferedWriter writer = new BufferedWriter(new java.io.OutputStreamWriter (new java.io.FileOutputStream (outFile),"UTF-8"),32768); + String l =null; + try { + + int id =1, snt=0,cnt=0; + + while( (l = reader.readLine())!=null) { + + + if (l.startsWith("#BOS")) { + state=1; //BOS + id=1; + snt++; + continue; + } + if (l.startsWith("#EOS") && state==1) { + state=2; //BOS + cnt++; + + writer.newLine(); + } + + if (start>snt || (start+numberOfSentences)<=snt) { + state=3; + } + + if (l.startsWith("#5")||l.startsWith("#6")||l.startsWith("#7")) continue; + if ((start+numberOfSentences)<=snt) break; + + if (state==3) continue; + + + if (state==1) { + + l = l.replace("\t\t", "\t"); + l = l.replace("\t\t", "\t"); + + StringTokenizer t = new StringTokenizer(l,"\t"); + int count=0; + + writer.write(""+id+"\t"); + + while (t.hasMoreTokens()) { + if (count==0) { + writer.write(t.nextToken()+"\t"); + } else if (count==1) { + writer.write(t.nextToken()+"\t_\t"); + 
} else if (count==2) { + writer.write(t.nextToken()+"\t_\t"); + } else if (count==3) { + writer.write(t.nextToken().replace(".", "|")+"\t_\t"); + } + else { + t.nextToken(); + } + count++; + } + writer.write("_\t_\t_\t_\t_\t_\t_\t_\t_"); + writer.newLine(); + } + id++; + } + writer.flush(); + writer.close(); + reader.close(); + + return cnt; + } catch (IOException e) { + e.printStackTrace(); + } + + + return -1; + + } + + + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/util/DB.java b/dependencyParser/basic/mate-tools/src/is2/util/DB.java new file mode 100755 index 0000000..8218ea5 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/util/DB.java @@ -0,0 +1,81 @@ +package is2.util; + +import java.util.Calendar; +import java.util.GregorianCalendar; + + +public class DB { + + + private static final String ARROW = " -> "; + private static final String LEER = " " ; + private static final String BIG = " " ; + + private static boolean debug = true; + + final static public void println (Object err) { + + if (!debug) return; + + StackTraceElement[] ste = new Exception().getStackTrace(); + + StringBuffer msg = new StringBuffer(); + msg.append((getDate().append(LEER).substring(0,10))); + msg.append(' '); + msg.append(ste[1].getClassName()+" "+ste[1].getLineNumber()); + msg.append(':'); + msg.append(ste[1].getMethodName()); + msg.append(ARROW); + + int l = 55-msg.length(); + if (l < 0) l =0; + msg.append(BIG.substring(0, l)); + + +// if ((m_depth >= 0) && (m_depth < (BIG.length()) )) { +// vDebugMessage.append(BIG.substring(0, m_depth*2)); +// } + + msg.append(err); + + System.err.println(msg); + + + } + + final static public void prints (Object err) { + + if (!debug) return; + System.err.println(err); + + } + + + final private static StringBuffer getDate() { +// if (Preferences.s_debug <= BDebug.FAIL) return s_sb; + + GregorianCalendar s_cal = new GregorianCalendar(); + StringBuffer sb = new StringBuffer(); +// 
sb.append(s_cal.get(Calendar.HOUR_OF_DAY)); +// sb.append('_'); + sb.append(s_cal.get(Calendar.MINUTE)); + sb.append('.'); + sb.append(s_cal.get(Calendar.SECOND)); + sb.append('.'); + sb.append(s_cal.get(Calendar.MILLISECOND)); + + return sb; + } + + public static void setDebug(boolean b) { + debug=b; + + } + + public static boolean getDebug() { + + return debug; + } + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/util/Edges.java b/dependencyParser/basic/mate-tools/src/is2/util/Edges.java new file mode 100644 index 0000000..af1a658 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/util/Edges.java @@ -0,0 +1,206 @@ +/** + * + */ +package is2.util; + +import is2.data.PipeGen; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Map.Entry; + +/** + * @author Dr. Bernd Bohnet, 13.05.2009; + * + * + */ +public final class Edges { + + + private static short[][][] edges; + private static HashMap<Short,Integer> labelCount = new HashMap<Short,Integer>(); + + private static HashMap<String,Integer> slabelCount = new HashMap<String,Integer>(); + + + static short[] def = new short[1]; + + private Edges () {} + + /** + * @param length + */ + public static void init(int length) { + edges = new short[length][length][]; + } + + + public static void findDefault(){ + + int best =0; + + + + for(Entry<Short,Integer> e : labelCount.entrySet()) { + + + if (best<e.getValue()) { + best = e.getValue(); + def[0]=e.getKey(); + } + } + + + // labelCount=null; + // String[] types = new String[mf.getFeatureCounter().get(PipeGen.REL)]; + // for (Entry<String, Integer> e : MFO.getFeatureSet().get(PipeGen.REL).entrySet()) types[e.getValue()] = e.getKey(); + + is2.util.DB.println("set default label to "+def[0]+" " ); + + // System.out.println("found default "+def[0]); + + } + + + final static public void put(int pos1, int pos2, short label) { + putD(pos1, 
pos2,label); + // putD(pos2, pos1,!dir, label); + } + + + final static public void putD(int pos1, int pos2, short label) { + + Integer lc = labelCount.get(label); + if (lc==null) labelCount.put(label, 1); + else labelCount.put(label, lc+1); + + String key = pos1+"-"+pos2+label; + Integer lcs = slabelCount.get(key); + if (lcs==null) slabelCount.put(key, 1); + else slabelCount.put(key, lcs+1); + + if (edges[pos1][pos2]==null) { + edges[pos1][pos2]=new short[1]; + edges[pos1][pos2][0]=label; + +// edgesh[pos1][pos2][dir?0:1] = new TIntHashSet(2); +// edgesh[pos1][pos2][dir?0:1].add(label); + } else { + short labels[] = edges[pos1][pos2]; + for(short l : labels) { + //contains label already? + if(l==label) return; + } + + short[] nlabels = new short[labels.length+1]; + System.arraycopy(labels, 0, nlabels, 0, labels.length); + nlabels[labels.length]=label; + edges[pos1][pos2]=nlabels; + + // edgesh[pos1][pos2][dir?0:1].add(label); + } + } + + final static public short[] get(int pos1, int pos2) { + + if (pos1<0 || pos2<0 || edges[pos1][pos2]==null) return def; + return edges[pos1][pos2]; + } + + + /** + * @param dis + */ + static public void write(DataOutputStream d) throws IOException { + + int len = edges.length; + d.writeShort(len); + + for(int p1 =0;p1<len;p1++) { + for(int p2 =0;p2<len;p2++) { + if (edges[p1][p2]==null) d.writeShort(0); + else { + d.writeShort(edges[p1][p2].length); + for(int l =0;l<edges[p1][p2].length;l++) { + d.writeShort(edges[p1][p2][l]); + } + + } + } + } + + d.writeShort(def[0]); + + } + + + /** + * @param dis + */ + public static void read(DataInputStream d) throws IOException { + int len = d.readShort(); + + edges = new short[len][len][]; + for(int p1 =0;p1<len;p1++) { + for(int p2 =0;p2<len;p2++) { + int ll = d.readShort(); + if (ll==0) { + edges[p1][p2]=null; + } else { + edges[p1][p2] = new short[ll]; + for(int l =0;l<ll;l++) { + edges[p1][p2][l]=d.readShort(); + } + } + } + } + + def[0]= d.readShort(); + + } + + public static class C 
implements Comparator<Short> { + + public C() { + super(); + } + + String _key; + + public C(String key) { + super(); + _key=key; + } + + /* (non-Javadoc) + * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) + */ + @Override + public int compare(Short l1, Short l2) { + + // int c1 = labelCount.get(l1); + // int c2 = labelCount.get(l2); + // if (true) return c1==c2?0:c1>c2?-1:1; + + int x1 = slabelCount.get(_key+l1.shortValue()); + int x2 = slabelCount.get(_key+l2.shortValue()); + // System.out.println(x1+" "+x2); + + + return x1==x2?0:x1>x2?-1:1; + + + + } + + + + + + } + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/util/Evaluator.java b/dependencyParser/basic/mate-tools/src/is2/util/Evaluator.java new file mode 100644 index 0000000..c527303 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/util/Evaluator.java @@ -0,0 +1,616 @@ +package is2.util; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Hashtable; +import java.util.Map.Entry; + +//import org.apache.commons.math.stat.inference.TestUtils; + + +import is2.data.Parse; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; + + +public class Evaluator { + + + public static void main(String[] args) { + + Options options = new Options(args); + + if (options.eval && options.significant1==null ) { + + Results r = evaluate(options.goldfile, options.outfile); + + } + /* + else if (options.significant1!=null && options.significant2!=null ) { + + System.out.println("compare1 "+options.significant1); + System.out.println("compare2 "+options.significant2); + System.out.println("gold "+options.goldfile); + + Results r1 = evaluate(options.goldfile, options.significant1,false); + + System.out.println("file 1 done "); + + Results r2 = evaluate(options.goldfile, options.significant2,false); + + double[] s1 = new double[r1.correctHead.size()]; + double[] s2 = 
new double[r1.correctHead.size()]; + + for(int k=0;k<r1.correctHead.size();k++) { + s1[k] = r1.correctHead.get(k); + s2[k] = r2.correctHead.get(k); + } + + try { + double p = TestUtils.pairedTTest(s1, s2); + System.out.print("significant to "+p); + } catch (Exception e) { + e.printStackTrace(); + } + +// significant(options.significant1, options.significant2) ; + + + } + */ + else if (options.significant1!=null) { + Results r = evaluate(options.goldfile, options.outfile,true); +// significant(options.significant1, options.significant2) ; + + } + + + } + + + /** + * + * @param act_file + * @param pred_file + * @param what top, pos, length, mor + */ + public static void evaluateTagger (String act_file, String pred_file, String what) { + + + CONLLReader09 goldReader = new CONLLReader09(act_file); + + CONLLReader09 predictedReader = new CONLLReader09(); + predictedReader.startReading(pred_file); + + Hashtable<String,Integer> errors = new Hashtable<String,Integer>(); + Hashtable<String,StringBuffer> words = new Hashtable<String,StringBuffer>(); + + int total = 0, numsent = 0, corrT=0; + SentenceData09 goldInstance = goldReader.getNext(); + SentenceData09 predInstance = predictedReader.getNext(); + + + HashMap<Integer,int[]> correctL = new HashMap<Integer,int[]>(); + HashMap<String,int[]> pos = new HashMap<String,int[]>(); + HashMap<String,int[]> mor = new HashMap<String,int[]>(); + + float correctM = 0, allM=0;; + + while(goldInstance != null) { + + int instanceLength = goldInstance.length(); + + if (instanceLength != predInstance.length()) + System.out.println("Lengths do not match on sentence "+numsent); + + String gold[] = goldInstance.gpos; + String pred[] = predInstance.ppos; + + String goldM[] = goldInstance.ofeats; + String predM[] = predInstance.pfeats; + + + // NOTE: the first item is the root info added during nextInstance(), so we skip it. 
+ + for (int i = 1; i < instanceLength; i++) { + + int[] cwr = correctL.get(i); + if (cwr ==null) { + cwr = new int[2]; + correctL.put(i, cwr); + } + cwr[1]++; + int[] correctPos = pos.get(gold[i]); + if (correctPos==null) { + correctPos = new int[2]; + pos.put(gold[i], correctPos); + } + correctPos[1]++; + + int[] correctMor = mor.get(goldM[i]); + if (correctMor==null) { + correctMor = new int[2]; + mor.put(goldM[i], correctMor); + } + + if ((goldM[i].equals("_")&&predM[i]==null) || goldM[i].equals(predM[i])) { + correctM++; + correctMor[0]++; + } + allM++; + correctMor[1]++; + + if (gold[i].equals(pred[i])) { + corrT++; + cwr[0]++; + correctPos[0]++; + } else { + String key = "gold: '"+gold[i]+"' pred: '"+pred[i]+"'"; + Integer cnt = errors.get(key); + StringBuffer errWrd = words.get(key); + if (cnt==null) { + errors.put(key,1); + words.put(key, new StringBuffer().append(goldInstance.forms[i])); + } + else { + errors.put(key,cnt+1); + errWrd.append(" "+goldInstance.forms[i]); + } + } + + + } + total += instanceLength - 1; // Subtract one to not score fake root token + + + numsent++; + + goldInstance = goldReader.getNext(); + predInstance = predictedReader.getNext(); + } + + + + + + // System.out.println("error gold:"+goldPos[i]+" pred:"+predPos[i]+" "+goldInstance.forms[i]+" snt "+numsent+" i:"+i); + ArrayList<Entry<String, Integer>> opsl = new ArrayList<Entry<String, Integer>>(); + for(Entry<String, Integer> e : errors.entrySet()) { + opsl.add(e); + } + + Collections.sort(opsl, new Comparator<Entry<String, Integer>>(){ + + @Override + public int compare(Entry<String, Integer> o1, + Entry<String, Integer> o2) { + + return o1.getValue()==o2.getValue()?0:o1.getValue()>o2.getValue()?-1:1; + } + + + }); + + + int cnt=0; + if (what.contains("top") ) { + System.out.println("top most errors:"); + for(Entry<String, Integer> e : opsl) { + cnt++; + if(e.getValue()>10) System.out.println(e.getKey()+" "+e.getValue()+" context: "+words.get(e.getKey())); + } + } + + if 
(what.contains("length")) { + for(int k=0;k<60;k++) { + int[] cwr = correctL.get(k); + if (cwr == null) continue; + System.out.print(k+":"+cwr[0]+":"+cwr[1]+":"+(((float)Math.round(10000*(float)((float)cwr[0])/(float)cwr[1]))/100)+" "); + } + System.out.println(); + } + + if (what.contains("pos")) { + for(Entry<String,int[]> e : pos.entrySet()) { + + System.out.print(e.getKey()+":"+e.getValue()[0]+":"+e.getValue()[1]+":"+ + (((float)Math.round(10000*((float)e.getValue()[0])/((float)e.getValue()[1])))/100)+" "); + + } + System.out.print(""); + } + System.out.println(); + if (what.contains("mor")) { + for(Entry<String,int[]> e : mor.entrySet()) { + + System.out.print(e.getKey()+":"+e.getValue()[0]+":"+e.getValue()[1]+":"+ + (((float)Math.round(10000*((float)e.getValue()[0])/((float)e.getValue()[1])))/100)+" "); + + } + System.out.print(""); + } + System.out.println("\nTokens: " + total+" Correct: " + corrT+" "+(float)corrT/total+" Correct M.:"+(int)correctM+ " morphology "+(correctM/total)); + } + + + + + public static int errors(SentenceData09 s, boolean uas) { + + int errors =0; + for (int k =1;k<s.length();k++) { + + if (s.heads[k] != s.pheads[k] && (uas || ! s.labels[k].equals(s.plabels[k]))) { + errors++; + } + } + return errors; + } + + public static int errors(SentenceData09 s1, SentenceData09 s2, HashMap<String,Integer> r1,HashMap<String,Integer> r2) { + + + + int errors =0; + for (int k =1;k<s1.length();k++) { + + if (s1.heads[k] != s1.pheads[k] || (! s1.labels[k].equals(s1.plabels[k]))) { + + if (s2.heads[k] != s2.pheads[k] || (! s2.labels[k].equals(s2.plabels[k]))) { + + // equal do nothing + + } else { + + Integer cnt = r1.get(s1.labels[k]); + if (cnt==null) cnt=0; + cnt++; + r1.put(s1.labels[k],cnt); + + + } + + } + + if (s2.heads[k] != s2.pheads[k] || (! s2.labels[k].equals(s2.plabels[k]))) { + + if (s1.heads[k] != s1.pheads[k] || (! 
s1.labels[k].equals(s1.plabels[k]))) { + + // equal do nothing + + } else { + + Integer cnt = r2.get(s2.labels[k]); + if (cnt==null) cnt=0; + cnt++; + r2.put(s2.labels[k],cnt); + + + } + + } + } + return errors; + } + + + public static final String PUNCT ="!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; + + public static class Results { + + public int total; + public int corr; + public float las; + public float ula; + public float lpas; + public float upla; + + ArrayList<Double> correctHead; + } + + public static Results evaluate (String act_file, String pred_file) { + return evaluate (act_file, pred_file,true); + } + public static Results evaluate (String act_file, String pred_file, boolean printEval) { + return evaluate ( act_file, pred_file, printEval, false); + } + + + public static Results evaluate (String act_file, String pred_file, boolean printEval, boolean sig) { + + CONLLReader09 goldReader = new CONLLReader09(act_file, -1); + CONLLReader09 predictedReader = new CONLLReader09(pred_file, -1); + + int total = 0, corr = 0, corrL = 0, Ptotal=0, Pcorr = 0, PcorrL = 0, BPtotal=0, BPcorr = 0, BPcorrL = 0, corrLableAndPos=0, corrHeadAndPos=0; + int corrLableAndPosP=0, corrHeadAndPosP=0,corrLableAndPosC=0; + int numsent = 0, corrsent = 0, corrsentL = 0, Pcorrsent = 0, PcorrsentL = 0,sameProj=0;; + int proj=0, nonproj=0, pproj=0, pnonproj=0, nonProjOk=0, nonProjWrong=0; + + int corrOne = 0; + + int correctChnWoPunc =0, correctLChnWoPunc=0,CPtotal=0; + SentenceData09 goldInstance = goldReader.getNext(); + + SentenceData09 predInstance = predictedReader.getNext(); + HashMap<String,Integer> label = new HashMap<String,Integer>(); + HashMap<String,Integer> labelCount = new HashMap<String,Integer>(); + HashMap<String,Integer> labelCorrect = new HashMap<String,Integer>(); + HashMap<String,Integer> falsePositive = new HashMap<String,Integer>(); + + // does the node have the correct head? 
+ ArrayList<Double> correctHead = new ArrayList<Double>(); + + while(goldInstance != null) { + + int instanceLength = goldInstance.length(); + + if (instanceLength != predInstance.length()) + System.out.println("Lengths do not match on sentence "+numsent); + + int[] goldHeads = goldInstance.heads; + String[] goldLabels = goldInstance.labels; + int[] predHeads = predInstance.pheads; + String[] predLabels = predInstance.plabels; + + boolean whole = true; + boolean wholeL = true; + + boolean Pwhole = true; + boolean PwholeL = true; + + + int tlasS=0, totalS=0,corrLabels=0, XLabels=0; + + // NOTE: the first item is the root info added during nextInstance(), so we skip it. + + + + int punc=0, bpunc=0,totalChnWoPunc=0; + for (int i = 1; i < instanceLength; i++) { + + + + Parse p = new Parse(predHeads.length); + for (int k=0;k<p.heads.length;k++) p.heads[k]=(short) predHeads[k]; + + Parse g = new Parse(predHeads.length); + for (int k=0;k<g.heads.length;k++) g.heads[k]=(short) goldHeads[k]; + + + + + + { + Integer count = labelCount.get(goldLabels[i]); + if (count==null)count = 0; + + count++; + + labelCount.put(goldLabels[i], count); + + if(goldLabels[i].equals(predLabels[i])) { + Integer correct = labelCorrect.get(goldLabels[i]); + if (correct ==null) correct =0; + correct ++; + labelCorrect.put(goldLabels[i], correct); + + } else { + Integer fp = falsePositive.get(predLabels[i]); + if (fp ==null) fp =0; + fp ++; + falsePositive.put(predLabels[i], fp); + } + + + } + + { + + } + + + if (goldLabels[i].startsWith("PMOD")) XLabels++; + + boolean tlas =false; + if (predHeads[i] == goldHeads[i]) { + corr++; + + if (goldInstance.gpos[i].equals(predInstance.ppos[i])) corrHeadAndPos ++; + if (goldLabels[i].equals(predLabels[i])) { + corrL++; + // if (predLabels[i].startsWith("PMOD")) + corrLabels++; + // else correctHead.add(0); + if (goldInstance.gpos[i].equals(predInstance.ppos[i])) { + tlasS++; + tlas=true; + corrLableAndPos ++; + } + } + else { + // correctHead.add(0); + // 
System.out.println(numsent+" error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + wholeL = false; + } + } + else { + + //correctHead.add(0); + + // System.out.println(numsent+"error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + whole = false; wholeL = false; + + Integer count = label.get(goldLabels[i]); + + if (count==null)count = 0; + count++; + label.put(goldLabels[i], count); + + + + int d = Math.abs(goldInstance.heads[i]-i); + } + + + if( ! ("!\"#$%&''()*+,-./:;<=>?@[\\]^_{|}~``".contains(goldInstance.forms[i]))) { + + if (predHeads[i] == goldHeads[i]) { + BPcorr++; + + if (goldLabels[i].equals(predLabels[i])) { + BPcorrL++; + } + else { + // System.out.println(numsent+" error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + // PwholeL = false; + } + } else { + // System.out.println(numsent+"error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + //Pwhole = false; wholeL = false; + } + + } else bpunc++; + + if( ! (",.:''``".contains(goldInstance.forms[i]))) { + + + if (predHeads[i] == goldHeads[i]) { + if (goldInstance.gpos[i].equals(predInstance.ppos[i])) corrHeadAndPosP ++; + Pcorr++; + + if (goldLabels[i].equals(predLabels[i])) { + PcorrL++; + if (goldInstance.gpos[i].equals(predInstance.ppos[i])) corrLableAndPosP ++; + + } + else { + // System.out.println(numsent+" error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + PwholeL = false; + } + } else { + // System.out.println(numsent+"error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + Pwhole = false; PwholeL = false; + } + + } else punc++; + + + if( ! 
(goldInstance.gpos[i].toLowerCase().startsWith("pu"))) { + if (predHeads[i] == goldHeads[i]) { + correctChnWoPunc++; + + if (goldLabels[i].equals(predLabels[i])) { + correctLChnWoPunc++; + if (goldInstance.gpos[i].equals(predInstance.ppos[i])) corrLableAndPosC ++; + } + else { + // System.out.println(numsent+" error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + // PwholeL = false; + } + } else { + // System.out.println(numsent+"error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + // Pwhole = false; PwholeL = false; + } + + } else totalChnWoPunc++; + + + if (sig) { + if(tlas) System.out.println("1\t"); + else System.out.println("0\t"); + } + + } + total += ((instanceLength - 1)); // Subtract one to not score fake root token + + Ptotal += ((instanceLength - 1) - punc); + BPtotal += ((instanceLength - 1) - bpunc); + CPtotal += ((instanceLength - 1) - totalChnWoPunc); + if(whole) corrsent++; + if(wholeL) corrsentL++; + if(Pwhole) Pcorrsent++; + if(PwholeL) PcorrsentL++; + numsent++; + + goldInstance = goldReader.getNext(); + predInstance = predictedReader.getNext(); + correctHead.add((double) ((double)corrLabels/(instanceLength - 1))); + // System.out.println(""+((double)corrLabels/(instanceLength - 1))); + } + + Results r = new Results(); + + r.correctHead =correctHead; + int mult=100000, diff=1000; + + r.total = total; + r.corr = corr; + r.las =(float)Math.round(((double)corrL/total)*mult)/diff; + r.ula =(float)Math.round(((double)corr /total)*mult)/diff; + r.lpas =(float)Math.round(((double)corrLableAndPos/total)*mult)/diff; + r.upla =(float)Math.round(((double)corrHeadAndPos /total)*mult)/diff; + float tlasp = (float)Math.round(((double)corrLableAndPosP/Ptotal)*mult)/diff; + float tlasc = (float)Math.round(((double)corrLableAndPosC/Ptotal)*mult)/diff; + + // System.out.print("Total: " + total+" \tCorrect: " + corr+" "); + System.out.print(" LAS/Total/UAS/Total: " + r.las+"/" + 
(double)Math.round(((double)corrsentL/numsent)*mult)/diff+ + "/" + r.ula+"/" + (double)Math.round(((double)corrsent /numsent)*mult)/diff+" LPAS/UPAS "+r.lpas+"/"+r.upla); + + System.out.println("; without . " + (double)Math.round(((double)PcorrL/Ptotal)*mult)/diff+"/" + + (double)Math.round(((double)PcorrsentL/numsent)*mult)/diff+ + "/" + (double)Math.round(((double)Pcorr /Ptotal)*mult)/diff+"/" + + (double)Math.round(((double)Pcorrsent /numsent)*mult)/diff+" TLAS "+tlasp+ + " V2 LAS/UAS "+(double)Math.round(((double)BPcorrL/BPtotal)*mult)/diff+ + "/"+(double)Math.round(((double)BPcorr/BPtotal)*mult)/diff+ + " CHN LAS/UAS "+(double)Math.round(((double)correctLChnWoPunc/CPtotal)*mult)/diff+ + "/"+(double)Math.round(((double)correctChnWoPunc/CPtotal)*mult)/diff+" TLAS "+tlasc); + + float precisionNonProj = ((float)nonProjOk)/((float)nonProjOk+nonProjWrong); + float recallNonProj = ((float)nonProjOk)/((float)(nonproj)); + System.out.println("proj "+proj+" nonp "+nonproj+"; predicted proj "+pproj+" non "+pnonproj+"; nonp correct "+ + nonProjOk+" nonp wrong "+nonProjWrong+ + " precision=(nonProjOk)/(non-projOk+nonProjWrong): "+precisionNonProj+ + " recall=nonProjOk/nonproj="+recallNonProj+" F="+(2*precisionNonProj*recallNonProj)/(precisionNonProj+recallNonProj)); + + if (!printEval) return r; + + + HashMap<String,Integer> totalX = new HashMap<String,Integer>(); + HashMap<String,Integer> totalY = new HashMap<String,Integer>(); + + String A=" "; // & + System.out.println("label\ttp\tcount\trecall\t\ttp\tfp+tp\tprecision\t F-Score "); + + for(Entry<String, Integer> e : labelCount.entrySet()) { + + int tp = labelCorrect.get(e.getKey())==null?0:labelCorrect.get(e.getKey()).intValue(); + Integer count = labelCount.get(e.getKey()); + int fp = falsePositive.get(e.getKey())==null?0:falsePositive.get(e.getKey()).intValue(); + System.out.println(e.getKey()+"\t"+tp+"\t"+count+"\t"+roundPercent((float)tp/count)+"\t\t"+tp+"\t"+(fp+tp)+ + 
"\t"+roundPercent((float)tp/(fp+tp))+"\t\t"+roundPercent((((float)tp/count))+(float)tp/(fp+tp))/2F); //+totalD + } + + + + + return r; + } + + + public static float round (double v){ + + return Math.round(v*10000F)/10000F; + } + + public static float roundPercent (double v){ + + return Math.round(v*10000F)/100F; + } + + + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/util/EvaluatorTagger.java b/dependencyParser/basic/mate-tools/src/is2/util/EvaluatorTagger.java new file mode 100644 index 0000000..c1ee7df --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/util/EvaluatorTagger.java @@ -0,0 +1,736 @@ +package is2.util; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Hashtable; +import java.util.Map.Entry; + + +import org.apache.commons.math.stat.inference.TestUtils; + +import is2.data.Parse; +import is2.data.SentenceData09; +import is2.io.CONLLReader09; + + +public class EvaluatorTagger { + + + public static int TAGGER = 1; + public static int what = 0; + + public static void main(String[] args) { + + Options options = new Options(args); + + what = options.tt; + + if (options.eval && options.significant1==null ) { + + Results r = evaluate(options.goldfile, options.outfile); + + } else if (options.significant1!=null && options.significant2!=null ) { + + System.out.println("compare1 "+options.significant1); + System.out.println("compare2 "+options.significant2); + System.out.println("gold "+options.goldfile); + + check( options.significant1, options.significant2, options.testfile); + + Results r1 = evaluate(options.goldfile, options.significant1,false); + + System.out.println("file 1 done "); + + Results r2 = evaluate(options.goldfile, options.significant2,false); + + double[] s1 = new double[r1.correctHead.size()]; + double[] s2 = new double[r1.correctHead.size()]; + + for(int k=0;k<r1.correctHead.size();k++) { + s1[k] = 
r1.correctHead.get(k); + s2[k] = r2.correctHead.get(k); + } + + try { + double p = TestUtils.pairedTTest(s1, s2); + System.out.print("significant to "+p); + } catch (Exception e) { + e.printStackTrace(); + } + +// significant(options.significant1, options.significant2) ; + + + } else if (options.significant1!=null) { + Results r = evaluate(options.goldfile, options.outfile,true); +// significant(options.significant1, options.significant2) ; + + } + + + } + + + private static void check(String s1, String s2, String pos) { + CONLLReader09 s1reader = new CONLLReader09(s1, -1); + SentenceData09 s1i = s1reader.getNext(); + CONLLReader09 s2reader = new CONLLReader09(s2, -1); + SentenceData09 s2i = s2reader.getNext(); + + + HashMap<String,HashMap<String,Integer> > labchanged = new HashMap<String,HashMap<String,Integer> > (); + + int snt =0; + + while(s1i != null) { + + snt ++; + int good =0,wrong=0; + + for(int w=1;w<s1i.length();w++) { + + // p(s1:head-pos wrong s2:head-pos good => dep-wrong => dep-good) + + if (s1i.gpos[s1i.heads[w]].equals(pos) && ! + ! s1i.ppos[s1i.heads[w]].equals(s1i.gpos[s1i.heads[w]]) && s2i.ppos[s2i.heads[w]].equals(s2i.gpos[s2i.heads[w]]) + ) { + + + HashMap<String,Integer> changed = labchanged.get(s2i.labels[w]); + if (changed ==null) { + changed= new HashMap<String,Integer>(); + labchanged.put(s2i.labels[w], changed); + } + if (! 
(s1i.plabels[w].equals(s1i.labels[w]) && s1i.pheads[w] == s1i.heads[w] )&& + (s2i.plabels[w].equals(s2i.labels[w]) && s2i.pheads[w] == s2i.heads[w] ) ) { + good ++; + Integer goodL = changed.get("G"); + if (goodL== null) goodL =0; + goodL+=1; + changed.put("G", goodL); + } + else { + wrong++; + Integer wrongL = changed.get("W"); + if (wrongL== null) wrongL =0; + wrongL+=1; + changed.put("W", wrongL); + } + + + + + + } + + } + + if (good!=0 || wrong!=0) + System.out.println(snt+" changed yes:"+good+" no:"+wrong); + s1i = s1reader.getNext(); + s2i = s2reader.getNext(); + } + System.out.println(""+labchanged); + + } + + + /** + * + * @param act_file + * @param pred_file + * @param what top, pos, length, mor + */ + public static void evaluateTagger (String act_file, String pred_file, String what) { + + + CONLLReader09 goldReader = new CONLLReader09(act_file); + + CONLLReader09 predictedReader = new CONLLReader09(); + predictedReader.startReading(pred_file); + + Hashtable<String,Integer> errors = new Hashtable<String,Integer>(); + Hashtable<String,StringBuffer> words = new Hashtable<String,StringBuffer>(); + + int total = 0, numsent = 0, corrT=0; + SentenceData09 goldInstance = goldReader.getNext(); + SentenceData09 predInstance = predictedReader.getNext(); + + + HashMap<Integer,int[]> correctL = new HashMap<Integer,int[]>(); + HashMap<String,int[]> pos = new HashMap<String,int[]>(); + HashMap<String,int[]> mor = new HashMap<String,int[]>(); + + float correctM = 0, allM=0;; + + while(goldInstance != null) { + + int instanceLength = goldInstance.length(); + + if (instanceLength != predInstance.length()) + System.out.println("Lengths do not match on sentence "+numsent); + + String gold[] = goldInstance.gpos; + String pred[] = predInstance.ppos; + + String goldM[] = goldInstance.ofeats; + String predM[] = predInstance.pfeats; + + + // NOTE: the first item is the root info added during nextInstance(), so we skip it. 
+ + for (int i = 1; i < instanceLength; i++) { + + int[] cwr = correctL.get(i); + if (cwr ==null) { + cwr = new int[2]; + correctL.put(i, cwr); + } + cwr[1]++; + int[] correctPos = pos.get(gold[i]); + if (correctPos==null) { + correctPos = new int[2]; + pos.put(gold[i], correctPos); + } + correctPos[1]++; + + int[] correctMor = mor.get(goldM[i]); + if (correctMor==null) { + correctMor = new int[2]; + mor.put(goldM[i], correctMor); + } + + if ((goldM[i].equals("_")&&predM[i]==null) || goldM[i].equals(predM[i])) { + correctM++; + correctMor[0]++; + } + allM++; + correctMor[1]++; + + if (gold[i].equals(pred[i])) { + corrT++; + cwr[0]++; + correctPos[0]++; + } else { + String key = "gold: '"+gold[i]+"' pred: '"+pred[i]+"'"; + Integer cnt = errors.get(key); + StringBuffer errWrd = words.get(key); + if (cnt==null) { + errors.put(key,1); + words.put(key, new StringBuffer().append(goldInstance.forms[i])); + } + else { + errors.put(key,cnt+1); + errWrd.append(" "+goldInstance.forms[i]); + } + } + + + } + total += instanceLength - 1; // Subtract one to not score fake root token + + + numsent++; + + goldInstance = goldReader.getNext(); + predInstance = predictedReader.getNext(); + } + + + + + + // System.out.println("error gold:"+goldPos[i]+" pred:"+predPos[i]+" "+goldInstance.forms[i]+" snt "+numsent+" i:"+i); + ArrayList<Entry<String, Integer>> opsl = new ArrayList<Entry<String, Integer>>(); + for(Entry<String, Integer> e : errors.entrySet()) { + opsl.add(e); + } + + Collections.sort(opsl, new Comparator<Entry<String, Integer>>(){ + + @Override + public int compare(Entry<String, Integer> o1, + Entry<String, Integer> o2) { + + return o1.getValue()==o2.getValue()?0:o1.getValue()>o2.getValue()?-1:1; + } + + + }); + + + int cnt=0; + if (what.contains("top") ) { + System.out.println("top most errors:"); + for(Entry<String, Integer> e : opsl) { + cnt++; + if(e.getValue()>10) System.out.println(e.getKey()+" "+e.getValue()+" context: "+words.get(e.getKey())); + } + } + + if 
(what.contains("length")) { + for(int k=0;k<60;k++) { + int[] cwr = correctL.get(k); + if (cwr == null) continue; + System.out.print(k+":"+cwr[0]+":"+cwr[1]+":"+(((float)Math.round(10000*(float)((float)cwr[0])/(float)cwr[1]))/100)+" "); + } + System.out.println(); + } + + if (what.contains("pos")) { + for(Entry<String,int[]> e : pos.entrySet()) { + + System.out.print(e.getKey()+":"+e.getValue()[0]+":"+e.getValue()[1]+":"+ + (((float)Math.round(10000*((float)e.getValue()[0])/((float)e.getValue()[1])))/100)+" "); + + } + System.out.print(""); + } + System.out.println(); + if (what.contains("mor")) { + for(Entry<String,int[]> e : mor.entrySet()) { + + System.out.print(e.getKey()+":"+e.getValue()[0]+":"+e.getValue()[1]+":"+ + (((float)Math.round(10000*((float)e.getValue()[0])/((float)e.getValue()[1])))/100)+" "); + + } + System.out.print(""); + } + System.out.println("\nTokens: " + total+" Correct: " + corrT+" "+(float)corrT/total+" Correct M.:"+(int)correctM+ " morphology "+(correctM/total)); + } + + + + + public static int errors(SentenceData09 s, boolean uas) { + + int errors =0; + for (int k =1;k<s.length();k++) { + + if (s.heads[k] != s.pheads[k] && (uas || ! s.labels[k].equals(s.plabels[k]))) { + errors++; + } + } + return errors; + } + + public static int errors(SentenceData09 s1, SentenceData09 s2, HashMap<String,Integer> r1,HashMap<String,Integer> r2) { + + + + int errors =0; + for (int k =1;k<s1.length();k++) { + + if (s1.heads[k] != s1.pheads[k] || (! s1.labels[k].equals(s1.plabels[k]))) { + + if (s2.heads[k] != s2.pheads[k] || (! s2.labels[k].equals(s2.plabels[k]))) { + + // equal do nothing + + } else { + + Integer cnt = r1.get(s1.labels[k]); + if (cnt==null) cnt=0; + cnt++; + r1.put(s1.labels[k],cnt); + + + } + + } + + if (s2.heads[k] != s2.pheads[k] || (! s2.labels[k].equals(s2.plabels[k]))) { + + if (s1.heads[k] != s1.pheads[k] || (! 
s1.labels[k].equals(s1.plabels[k]))) { + + // equal do nothing + + } else { + + Integer cnt = r2.get(s2.labels[k]); + if (cnt==null) cnt=0; + cnt++; + r2.put(s2.labels[k],cnt); + + + } + + } + } + return errors; + } + + + public static final String PUNCT ="!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; + + public static class Results { + + public int total; + public int corr; + public float las; + public float ula; + public float lpas; + public float upla; + + ArrayList<Double> correctHead; + } + + public static Results evaluate (String act_file, String pred_file) { + return evaluate (act_file, pred_file,true); + } + public static Results evaluate (String act_file, String pred_file, boolean printEval) { + return evaluate ( act_file, pred_file, printEval, false); + } + + + public static Results evaluate (String act_file, String pred_file, boolean printEval, boolean sig) { + + CONLLReader09 goldReader = new CONLLReader09(act_file, -1); + CONLLReader09 predictedReader = new CONLLReader09(pred_file, -1); + + int total = 0, corr = 0, corrL = 0, Ptotal=0, Pcorr = 0, PcorrL = 0, BPtotal=0, BPcorr = 0, BPcorrL = 0, corrLableAndPos=0, corrHeadAndPos=0; + int corrLableAndPosP=0, corrHeadAndPosP=0,corrLableAndPosC=0; + int numsent = 0, corrsent = 0, corrsentL = 0, Pcorrsent = 0, PcorrsentL = 0,sameProj=0;; + int proj=0, nonproj=0, pproj=0, pnonproj=0, nonProjOk=0, nonProjWrong=0; + + int corrOne = 0; + + int correctChnWoPunc =0, correctLChnWoPunc=0,CPtotal=0; + SentenceData09 goldInstance = goldReader.getNext(); + + SentenceData09 predInstance = predictedReader.getNext(); + HashMap<String,Integer> label = new HashMap<String,Integer>(); + HashMap<String,Integer> labelCount = new HashMap<String,Integer>(); + HashMap<String,Integer> labelCorrect = new HashMap<String,Integer>(); + HashMap<String,Integer> falsePositive = new HashMap<String,Integer>(); + HashMap<String,HashMap<String,Integer> > confusion = new HashMap<String,HashMap<String,Integer> >(); + + 
HashMap<String,HashMap<String,Integer> > posLabelAssign = new HashMap<String,HashMap<String,Integer> >(); + + // does the node have the correct head? + ArrayList<Double> correctHead = new ArrayList<Double>(); + + while(goldInstance != null) { + + int instanceLength = goldInstance.length(); + + if (instanceLength != predInstance.length()) + System.out.println("Lengths do not match on sentence "+numsent); + + int[] goldHeads = goldInstance.heads; + + String[] goldLabels,predLabels; + if (what == TAGGER) { + goldLabels= goldInstance.gpos; + predLabels= predInstance.ppos; + } + else { + goldLabels = goldInstance.labels ; + predLabels = predInstance.plabels ; + } + + + int[] predHeads = predInstance.pheads; + + + boolean whole = true; + boolean wholeL = true; + + boolean Pwhole = true; + boolean PwholeL = true; + + + int tlasS=0, totalS=0,corrLabels=0, XLabels=0; + + // NOTE: the first item is the root info added during nextInstance(), so we skip it. + + + + int punc=0, bpunc=0,totalChnWoPunc=0; + for (int i = 1; i < instanceLength; i++) { + + + + Parse p = new Parse(predHeads.length); + for (int k=0;k<p.heads.length;k++) p.heads[k]=(short) predHeads[k]; + + Parse g = new Parse(predHeads.length); + for (int k=0;k<g.heads.length;k++) g.heads[k]=(short) goldHeads[k]; + + + + HashMap<String,Integer> labelsNum =posLabelAssign.get(goldInstance.gpos[goldInstance.heads[i]]); + if (labelsNum== null) { + labelsNum = new HashMap<String,Integer>(); + posLabelAssign.put(goldInstance.gpos[goldInstance.heads[i]], labelsNum); + } + + Integer num = labelsNum.get(goldInstance.labels[i]); + if (num==null) num =0; + num++; + labelsNum.put(goldInstance.labels[i],num); + + + + Integer count = labelCount.get(goldLabels[i]); + if (count==null)count = 0; + + count++; + + labelCount.put(goldLabels[i], count); + + if(goldLabels[i].equals(predLabels[i]) && (what==TAGGER || predHeads[i] == goldHeads[i] )) { + Integer correct = labelCorrect.get(goldLabels[i]); + if (correct ==null) correct =0; + 
correct ++; + labelCorrect.put(goldLabels[i], correct); + + } else { + + Integer fp = falsePositive.get(predLabels[i]); + if (fp ==null) fp =0; + fp ++; + falsePositive.put(predLabels[i], fp); + + HashMap<String,Integer> conf = confusion.get(goldLabels[i]); + if (conf == null) confusion.put(goldLabels[i], conf = new HashMap<String,Integer>()); + + conf.put(predLabels[i], conf.get(predLabels[i])==null?1:conf.get(predLabels[i])+1); + + + } + + + + + + + + + + boolean tlas =false; + if (predHeads[i] == goldHeads[i]) { + corr++; + + if (goldInstance.gpos[i].equals(predInstance.ppos[i])) corrHeadAndPos ++; + if (goldLabels[i].equals(predLabels[i])) { + corrL++; + // if (predLabels[i].startsWith("PMOD")) + corrLabels++; + // else correctHead.add(0); + if (goldInstance.gpos[i].equals(predInstance.ppos[i])) { + tlasS++; + tlas=true; + corrLableAndPos ++; + } + } + else { + // correctHead.add(0); + // System.out.println(numsent+" error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + wholeL = false; + } + } + else { + + //correctHead.add(0); + + // System.out.println(numsent+"error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + whole = false; wholeL = false; + + count = label.get(goldLabels[i]); + + if (count==null)count = 0; + count++; + label.put(goldLabels[i], count); + + + + int d = Math.abs(goldInstance.heads[i]-i); + } + + + if( ! ("!\"#$%&''()*+,-./:;<=>?@[\\]^_{|}~``".contains(goldInstance.forms[i]))) { + + if (predHeads[i] == goldHeads[i]) { + BPcorr++; + + if (goldLabels[i].equals(predLabels[i])) { + BPcorrL++; + } + else { + // System.out.println(numsent+" error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + // PwholeL = false; + } + } else { + // System.out.println(numsent+"error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + //Pwhole = false; wholeL = false; + } + + } else bpunc++; + + if( ! 
(",.:''``".contains(goldInstance.forms[i]))) { + + + if (predHeads[i] == goldHeads[i]) { + if (goldInstance.gpos[i].equals(predInstance.ppos[i])) corrHeadAndPosP ++; + Pcorr++; + + if (goldLabels[i].equals(predLabels[i])) { + PcorrL++; + if (goldInstance.gpos[i].equals(predInstance.ppos[i])) corrLableAndPosP ++; + + } + else { + // System.out.println(numsent+" error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + PwholeL = false; + } + } else { + // System.out.println(numsent+"error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + Pwhole = false; PwholeL = false; + } + + } else punc++; + + + if( ! (goldInstance.gpos[i].toLowerCase().startsWith("pu"))) { + if (predHeads[i] == goldHeads[i]) { + correctChnWoPunc++; + + if (goldLabels[i].equals(predLabels[i])) { + correctLChnWoPunc++; + if (goldInstance.gpos[i].equals(predInstance.ppos[i])) corrLableAndPosC ++; + } + else { + // System.out.println(numsent+" error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + // PwholeL = false; + } + } else { + // System.out.println(numsent+"error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + // Pwhole = false; PwholeL = false; + } + + } else totalChnWoPunc++; + + + if (sig) { + if(tlas) System.out.println("1\t"); + else System.out.println("0\t"); + } + + } + total += ((instanceLength - 1)); // Subtract one to not score fake root token + + Ptotal += ((instanceLength - 1) - punc); + BPtotal += ((instanceLength - 1) - bpunc); + CPtotal += ((instanceLength - 1) - totalChnWoPunc); + if(whole) corrsent++; + if(wholeL) corrsentL++; + if(Pwhole) Pcorrsent++; + if(PwholeL) PcorrsentL++; + numsent++; + + goldInstance = goldReader.getNext(); + predInstance = predictedReader.getNext(); + correctHead.add((double) ((double)corrLabels/(instanceLength - 1))); + // System.out.println(""+((double)corrLabels/(instanceLength - 1))); + } + + Results r = new Results(); + + r.correctHead 
=correctHead; + int mult=100000, diff=1000; + + r.total = total; + r.corr = corr; + r.las =(float)Math.round(((double)corrL/total)*mult)/diff; + r.ula =(float)Math.round(((double)corr /total)*mult)/diff; + r.lpas =(float)Math.round(((double)corrLableAndPos/total)*mult)/diff; + r.upla =(float)Math.round(((double)corrHeadAndPos /total)*mult)/diff; + float tlasp = (float)Math.round(((double)corrLableAndPosP/Ptotal)*mult)/diff; + float tlasc = (float)Math.round(((double)corrLableAndPosC/Ptotal)*mult)/diff; + + // System.out.print("Total: " + total+" \tCorrect: " + corr+" "); + System.out.print(" LAS/Total/UAS/Total: " + r.las+"/" + (double)Math.round(((double)corrsentL/numsent)*mult)/diff+ + "/" + r.ula+"/" + (double)Math.round(((double)corrsent /numsent)*mult)/diff+" LPAS/UPAS "+r.lpas+"/"+r.upla); + + System.out.println("; without . " + (double)Math.round(((double)PcorrL/Ptotal)*mult)/diff+"/" + + (double)Math.round(((double)PcorrsentL/numsent)*mult)/diff+ + "/" + (double)Math.round(((double)Pcorr /Ptotal)*mult)/diff+"/" + + (double)Math.round(((double)Pcorrsent /numsent)*mult)/diff+" TLAS "+tlasp+ + " V2 LAS/UAS "+(double)Math.round(((double)BPcorrL/BPtotal)*mult)/diff+ + "/"+(double)Math.round(((double)BPcorr/BPtotal)*mult)/diff+ + " CHN LAS/UAS "+(double)Math.round(((double)correctLChnWoPunc/CPtotal)*mult)/diff+ + "/"+(double)Math.round(((double)correctChnWoPunc/CPtotal)*mult)/diff+" TLAS "+tlasc); + + float precisionNonProj = ((float)nonProjOk)/((float)nonProjOk+nonProjWrong); + float recallNonProj = ((float)nonProjOk)/((float)(nonproj)); + System.out.println("proj "+proj+" nonp "+nonproj+"; predicted proj "+pproj+" non "+pnonproj+"; nonp correct "+ + nonProjOk+" nonp wrong "+nonProjWrong+ + " precision=(nonProjOk)/(non-projOk+nonProjWrong): "+precisionNonProj+ + " recall=nonProjOk/nonproj="+recallNonProj+" F="+(2*precisionNonProj*recallNonProj)/(precisionNonProj+recallNonProj)); + + if (!printEval) return r; + + + HashMap<String,Integer> totalX = new 
HashMap<String,Integer>(); + HashMap<String,Integer> totalY = new HashMap<String,Integer>(); + + String A=" "; // & + System.out.println("label\ttp\tcount\trecall\t\ttp\tfp+tp\tprecision\t F-Score "); + + for(Entry<String, Integer> e : labelCount.entrySet()) { + + int tp = labelCorrect.get(e.getKey())==null?0:labelCorrect.get(e.getKey()).intValue(); + Integer count = labelCount.get(e.getKey()); + int fp = falsePositive.get(e.getKey())==null?0:falsePositive.get(e.getKey()).intValue(); + System.out.println(e.getKey()+"\t"+tp+"\t"+count+"\t"+roundPercent((float)tp/count)+"\t\t"+tp+"\t"+(fp+tp)+ + "\t"+roundPercent((float)tp/(fp+tp))+"\t\t"+roundPercent((((float)tp/count))+(float)tp/(fp+tp))/2F); //+totalD + } + + for(Entry<String, HashMap<String, Integer>> e : confusion.entrySet()) { + HashMap<String, Integer> values = e.getValue(); + ArrayList<Entry<String, Integer>> entries = new ArrayList<Entry<String, Integer>>(values.entrySet()); + Collections.sort(entries, new Comparator<Entry<String, Integer>>() { + + + + @Override + public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) { + + return o2.getValue().compareTo(o1.getValue()); + } + + + } + ); + + + System.out.println(e.getKey()+"\t"+entries); + + + } + System.out.println(""+posLabelAssign); + + + + return r; + } + + + public static float round (double v){ + + return Math.round(v*10000F)/10000F; + } + + public static float roundPercent (double v){ + + return Math.round(v*10000F)/100F; + } + + + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/util/ExtractParagraphs.java b/dependencyParser/basic/mate-tools/src/is2/util/ExtractParagraphs.java new file mode 100644 index 0000000..a9fabca --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/util/ExtractParagraphs.java @@ -0,0 +1,87 @@ +package is2.util; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import 
/**
 * Command-line utility: walks all files in a directory, extracts the text
 * between &lt;P&gt;/&lt;p&gt; and &lt;/P&gt;/&lt;/p&gt; markers, splits each
 * paragraph line at sentence ends (". ") and writes the pieces to a single
 * UTF-8 output file.
 *
 * Usage: ExtractParagraphs &lt;input-directory&gt; &lt;output-file&gt;
 */
public class ExtractParagraphs {

	/**
	 * @param args args[0] input directory (expected to end with a path
	 *        separator — NOTE(review): file names are concatenated with
	 *        args[0] without inserting one; confirm callers pass a trailing
	 *        slash), args[1] output file
	 * @throws IOException if a file cannot be read or written
	 */
	public static void main(String args[]) throws IOException {

		// BUG FIX: args[1] is used below, so two arguments are required;
		// the original only checked for one and could crash with an
		// ArrayIndexOutOfBoundsException.
		if (args.length < 2) {
			System.out.println("Please provide a file name.");
			System.exit(0);
		}

		File file = new File(args[0]);
		String[] dirs = file.list();
		// BUG FIX: list() returns null when args[0] is not a directory; the
		// original called file.isDirectory() but discarded the result and
		// dereferenced dirs unconditionally (NPE).
		if (dirs == null) {
			System.out.println("Not a directory: " + args[0]);
			System.exit(0);
		}

		BufferedWriter write = new BufferedWriter(
				new OutputStreamWriter(new FileOutputStream(args[1]), "UTF-8"), 32768);
		int cnt = 0;

		for (String fileName : dirs) {
			BufferedReader reader = new BufferedReader(
					new InputStreamReader(new FileInputStream(args[0] + fileName), "UTF-8"), 32768);

			// 0 = outside a paragraph, 1 = inside one, 2 = just closed
			int state = 0;

			String s;
			while ((s = reader.readLine()) != null) {

				if (s.startsWith("<P>") || s.startsWith("<p>")) {
					state = 1; // paragraph start
					continue;
				}

				if (s.startsWith("</P>") || s.startsWith("</p>")) {
					state = 2; // paragraph end
					write.newLine();
				}

				if (state == 1) {
					// emit the paragraph line, split at sentence boundaries
					String sp[] = s.split("\\. ");
					for (String p : sp) {
						write.write(p);
					}
					cnt++;
				}
			}

			reader.close();
		}
		write.flush();
		write.close();

		System.out.println("Extract " + cnt + " lines ");
	}
}
Bernd Bohnet, 01.06.2011 + * + * + */ +final public class IntStack { + + final public int[] stack; + public int position =-1; + + public IntStack(int size) { + if (size<=0) stack = new int[1]; + else stack = new int[size+1]; + } + + public IntStack(IntStack s) { + stack=s.stack; + position = s.position; + } + + + public int peek() { + return position==-1?-1:stack[position]; + } + + public void push(int i) { + // if (i ==2)new Exception().printStackTrace(); + stack[++position]=i; + } + + public int pop() { + return position==-1?-1:stack[position--]; + } + + public int size() { + return position+1; + } + + public boolean isEmpty() { + return position==-1?true:false; + } + + public int get(int p) { + return stack[p]; + } + + public void clear() { + position=-1; + } + + /** + * @param b + */ + public void addAll(IntStack b) { + + position=b.position; + if (position<0) return; + + for(int k=0; k<=position;k++) stack[k]=b.stack[k]; + + } + + public boolean contains(int s) {; + + for(int k=0; k<=position;k++) + if (stack[k]==s) return true; + + return false; + } + + public String toString() { + StringBuffer s = new StringBuffer(); + for(int k = position;k>=0;k--) { + s.append(k).append(":").append(this.stack[k]).append(" "); + } + return s.toString(); + } + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/util/Long2Int.java b/dependencyParser/basic/mate-tools/src/is2/util/Long2Int.java new file mode 100644 index 0000000..d461df8 --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/util/Long2Int.java @@ -0,0 +1,81 @@ +package is2.util; + +import is2.data.Long2IntInterface; + + +/** + * @author Bernd Bohnet, 01.09.2009 + * + * Maps for the Hash Kernel the long values to the int values. 
+ */ +final public class Long2Int implements Long2IntInterface { + + + public Long2Int() { + size=115911564; + } + + + public Long2Int(int s) { + size=s; + } + + + /** Integer counter for long2int */ + final private int size; //0x03ffffff //0x07ffffff + + + /* (non-Javadoc) + * @see is2.sp09k9992.Long2IntIterface#size() + */ + public int size() {return size;} + + /* (non-Javadoc) + * @see is2.sp09k9992.Long2IntIterface#start() + * has no meaning for this implementation + */ + final public void start() {} + + + /* (non-Javadoc) + * @see is2.sp09k9992.Long2IntIterface#l2i(long) + */ + final public int l2i(long l) { + if (l<0) return -1; + + // this works well LAS 88.138 + // int r= (int)(( l ^ (l&0xffffffff00000000L) >>> 29 ));//0x811c9dc5 ^ // 29 + // return Math.abs(r % size); + // this works a bit better and good with 0x03ffffff + // + /* + long r= l;//26 + l = (l>>12)&0xfffffffffffff000L; + r ^= l;//38 + l = (l>>11)&0xffffffffffffc000L; + r ^= l;//49 + l = (l>>9)& 0xffffffffffff0000L; //53 + r ^= l;//58 + l = (l>>7)&0xfffffffffffc0000L; //62 + r ^=l;//65 + int x = (int)r; + x = x % size; + // return x >= 0 ? x : -x ;// Math.abs(r % size); + + */ + // 26 0x03ffffff + // together with 0x07ffffff 27 88.372 + long r= l;// 27 + l = (l>>13)&0xffffffffffffe000L; + r ^= l; // 40 + l = (l>>11)&0xffffffffffff0000L; + r ^= l; // 51 + l = (l>>9)& 0xfffffffffffc0000L; //53 + r ^= l; // 60 + l = (l>>7)& 0xfffffffffff00000L; //62 + r ^=l; //67 + int x = ((int)r) % size; + + return x >= 0 ? 
package is2.util;

import is2.util.OptionsSuper;

import java.io.File;

/**
 * Concrete command-line options of the is2 parser.  Parses the argument
 * array, falling back to {@link OptionsSuper#addOption(String[], int)} for
 * flags shared with the other tools, and prepares the temporary
 * feature-forest file used during training.
 */
public final class Options extends OptionsSuper {

	/**
	 * Parses the command line and sets up the training temp file.
	 *
	 * @param args flag/value pairs, e.g. {"-train", "corpus.conll", ...}
	 */
	public Options(String[] args) {

		for (int i = 0; i < args.length; i++) {
			String[] pair = args[i].split(":");

			if (pair[0].equals("--help")) explain();
			else if (pair[0].equals("-train")) {
				train = true;
				trainfile = args[i + 1];
				i++; // BUG FIX: consume the file name like every other value flag
			} else if (pair[0].equals("-eval")) {
				eval = true;
				goldfile = args[i + 1]; i++;
			} else if (pair[0].equals("-test")) {
				test = true;
				testfile = args[i + 1]; i++;
			} else if (pair[0].equals("-i")) {
				numIters = Integer.parseInt(args[i + 1]); i++;
			} else if (pair[0].equals("-out")) {
				outfile = args[i + 1]; i++;
			} else if (pair[0].equals("-decode")) {
				decodeProjective = args[i + 1].equals("proj"); i++;
			} else if (pair[0].equals("-confidence")) {
				conf = true;
			} else if (pair[0].equals("-count")) {
				count = Integer.parseInt(args[i + 1]); i++;
			} else if (pair[0].equals("-model")) {
				modelName = args[i + 1]; i++;
			} else if (pair[0].equals("-device")) {
				device = args[i + 1]; i++;
			} else if (pair[0].equals("-tmp")) {
				tmp = args[i + 1]; i++;
			} else if (pair[0].equals("-format")) {
				formatTask = Integer.parseInt(args[i + 1]); i++;
			} else if (pair[0].equals("-allfeatures")) {
				allFeatures = true;
			} else if (pair[0].equals("-nonormalize")) {
				normalize = false;
			} else if (pair[0].equals("-nframes")) {
				nbframes = args[i + 1]; i++;
			} else if (pair[0].equals("-pframes")) {
				pbframes = args[i + 1]; i++;
			} else if (pair[0].equals("-nopred")) {
				nopred = true;
			} else if (pair[0].equals("-divide")) {
				keep = true;
			} else if (pair[0].equals("-lexicon")) {
				lexicon = args[i + 1]; i++;
			} else {
				// BUG FIX: addOption returns the index of the last token it
				// consumed; the original discarded that value, so the values
				// of inherited flags were re-parsed as if they were flags.
				i = super.addOption(args, i);
			}
		}

		try {
			if (trainfile != null) {

				if (keep && tmp != null) {
					// reuse an existing feature forest between runs
					trainforest = new File(tmp);
					if (!trainforest.exists()) keep = false;

				} else if (tmp != null) {
					trainforest = File.createTempFile("train", ".tmp", new File(tmp));
					trainforest.deleteOnExit();
				} else {
					trainforest = File.createTempFile("train", ".tmp");
					trainforest.deleteOnExit();
				}
			}
		} catch (java.io.IOException e) {
			System.out.println("Unable to create tmp files for feature forests!");
			System.out.println(e);
			System.exit(0);
		}
	}

	/** Prints usage information and terminates the VM. */
	private void explain() {
		System.out.println("Usage: ");
		System.out.println("java -class mate.jar is2.parser.Parser [Options]");
		System.out.println();
		System.out.println("Example: ");
		System.out.println(" java -class mate.jar is2.parser.Parser -model eps3.model -train corpora/conll08st/train/train.closed -test corpora/conll08st/devel/devel.closed -out b3.test -eval corpora/conll08st/devel/devel.closed -count 2000 -i 6");
		System.out.println("");
		System.out.println("Options:");
		System.out.println("");
		System.out.println(" -train <file> the corpus a model is trained on; default " + this.trainfile);
		System.out.println(" -test <file> the input corpus for testing; default " + this.testfile);
		System.out.println(" -out <file> the output corpus (result) of a test run; default " + this.outfile);
		System.out.println(" -model <file> the parsing model for traing the model is stored in the files");
		System.out.println(" and for parsing the model is load from this file; default " + this.modelName);
		System.out.println(" -i <number> the number of training iterations; good numbers are 10 for smaller corpora and 6 for bigger; default " + this.numIters);
		System.out.println(" -count <number> the n first sentences of the corpus are take for the training default " + this.count);
		System.out.println(" -format <number> conll format of the year 8 or 9; default " + this.formatTask);

		System.exit(0);
	}
}
/**
 * Base class holding all command-line switches shared by the is2 tools
 * (parser, tagger, ...).  Subclasses parse tool-specific flags first and
 * delegate everything else to {@link #addOption(String[], int)}.
 *
 * All fields are deliberately public: the rest of the code base accesses
 * them directly, so their names, types and defaults are part of the
 * interface and must not change.
 */
public class OptionsSuper {

	public String trainfile = null;
	public String testfile = null;
	public File trainforest = null;

	public String nbframes = null;
	public String pbframes = null;

	public boolean nopred = false;
	public boolean upper = false;

	public boolean train = false;
	public boolean eval = false;
	public boolean test = false;
	public boolean keep = false;
	public boolean flt = false;
	public boolean loadTaggerModels = false;

	public String modelName = "prs.mdl";
	public String modelTaggerName = null;

	public String useMapping = null;
	public String device = "C:";
	public String tmp = null;
	public boolean createForest = true;
	public boolean decodeProjective = false;
	public double decodeTH = 0.3d;
	public String format = "CONLL";
	public int formatTask = 9;
	public int numIters = 10;
	public int best = 1000;
	public String outfile = "dp.conll";
	public String charset = "UTF-8";
	public String phraseTrain = null;
	public String phraseTest = null;
	public String goldfile = null;
	public String gout = "sec23.gld";
	public String features = null;
	public String lexicon = null;
	public int hsize = 0x07ffffff;
	public int maxLen = 2000;
	public int maxForms = Integer.MAX_VALUE;
	public int beam = 4;
	public float prune = -100000000;

	public String third = "";
	public String second = "";
	public String first = "";

	public int cross = 10;

	public boolean useRelationalFeatures = false;
	public int count = 10000000;
	public int cores = Integer.MAX_VALUE;
	public int start = 0;
	public int minOccureForms = 0;
	public int tt = 30; // tagger averaging
	public boolean allFeatures = false;
	public boolean normalize = false;
	public boolean no2nd = false;
	public boolean noLemmas = false;
	public boolean few2nd = false;
	public boolean noLinear = false;
	public boolean noMorph = false;
	public String clusterFile;

	// output confidence values
	public boolean conf = false;
	public String phraseFormat = "penn"; // tiger | penn
	public boolean average = true;
	public boolean label = false;
	public boolean stack = false;
	public boolean oneRoot = false;

	public String significant1 = null, significant2 = null;

	// horizontal stacking
	public int minLength = 0, maxLength = Integer.MAX_VALUE;
	public boolean overwritegold = false;

	public static final int MULTIPLICATIVE = 1, SHIFT = 2;
	public int featureCreation = MULTIPLICATIVE;

	/**
	 * Parses the whole argument array.
	 *
	 * @param args flag/value pairs
	 * @param dummy unused; only distinguishes this constructor from the
	 *        subclass-driven construction path
	 */
	public OptionsSuper(String[] args, String dummy) {

		for (int i = 0; i < args.length; i++) {
			i = addOption(args, i);
		}
	}

	/** Creates an options object carrying only the defaults above. */
	public OptionsSuper() {}

	/**
	 * Handles one flag starting at position i.
	 *
	 * @param args the full argument array
	 * @param i index of the flag to examine
	 * @return the index of the last token consumed, so the caller's loop
	 *         continues after the flag's value (if any); unrecognized flags
	 *         are silently ignored and i is returned unchanged
	 */
	public int addOption(String args[], int i) {

		if (args[i].equals("-train")) {
			train = true;
			trainfile = args[i + 1];
			i++; // BUG FIX: consume the value token like every other branch
		} else if (args[i].equals("-eval")) {
			eval = true;
			goldfile = args[i + 1]; i++;
		} else if (args[i].equals("-gout")) {
			gout = args[i + 1]; i++;
		} else if (args[i].equals("-test")) {
			test = true;
			testfile = args[i + 1]; i++;
		} else if (args[i].equals("-sig1")) {
			significant1 = args[i + 1]; i++;
		} else if (args[i].equals("-sig2")) {
			significant2 = args[i + 1]; i++;
		} else if (args[i].equals("-i")) {
			numIters = Integer.parseInt(args[i + 1]); i++;
		} else if (args[i].equals("-out")) {
			outfile = args[i + 1]; i++;
		} else if (args[i].equals("-cluster")) {
			clusterFile = args[i + 1]; i++;
		} else if (args[i].equals("-count")) {
			count = Integer.parseInt(args[i + 1]); i++;
		} else if (args[i].equals("-model")) {
			modelName = args[i + 1]; i++;
		} else if (args[i].equals("-tmodel")) {
			this.modelTaggerName = args[i + 1]; i++;
		} else if (args[i].equals("-nonormalize")) {
			normalize = false;
		} else if (args[i].equals("-float")) {
			flt = true;
		} else if (args[i].equals("-hsize")) {
			hsize = Integer.parseInt(args[i + 1]); i++;
		} else if (args[i].equals("-charset")) {
			charset = args[++i];
		} else if (args[i].equals("-pstrain")) {
			this.phraseTrain = args[i + 1]; i++;
		} else if (args[i].equals("-pstest")) {
			this.phraseTest = args[i + 1]; i++;
		} else if (args[i].equals("-len")) {
			maxLen = Integer.parseInt(args[i + 1]); i++;
		} else if (args[i].equals("-cores")) {
			cores = Integer.parseInt(args[i + 1]); i++;
		} else if (args[i].equals("-start")) {
			start = Integer.parseInt(args[i + 1]); i++;
		} else if (args[i].equals("-max")) {
			maxLength = Integer.parseInt(args[i + 1]); i++;
		} else if (args[i].equals("-min")) {
			minLength = Integer.parseInt(args[i + 1]); i++;
		} else if (args[i].equals("-noLemmas")) {
			noLemmas = true;
		} else if (args[i].equals("-noavg")) {
			this.average = false;
		} else if (args[i].equals("-label")) {
			label = true;
		} else if (args[i].equals("-stack")) {
			stack = true;
		} else if (args[i].equals("-overwritegold")) {
			overwritegold = true;
		} else if (args[i].equals("-format")) {
			formatTask = Integer.parseInt(args[++i]);
		} else if (args[i].equals("-tt")) {
			tt = Integer.parseInt(args[++i]);
		} else if (args[i].equals("-min-occure-forms")) {
			minOccureForms = Integer.parseInt(args[++i]);
		} else if (args[i].equals("-loadTaggerModels")) {
			this.loadTaggerModels = true; // (stray double semicolon removed)
		} else if (args[i].equals("-feature_creation")) {
			this.featureCreation = args[++i].equals("shift") ? SHIFT : MULTIPLICATIVE;
		}

		return i;
	}

	/** Renders the most important settings for logging. */
	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder();
		sb.append("FLAGS [");
		sb.append("train-file: " + trainfile);
		sb.append(" | ");
		sb.append("test-file: " + testfile);
		sb.append(" | ");
		sb.append("gold-file: " + goldfile);
		sb.append(" | ");
		sb.append("output-file: " + outfile);
		sb.append(" | ");
		sb.append("model-name: " + modelName);
		sb.append(" | ");
		sb.append("train: " + train);
		sb.append(" | ");
		sb.append("test: " + test);
		sb.append(" | ");
		sb.append("eval: " + eval);
		sb.append(" | ");
		sb.append("training-iterations: " + numIters);
		sb.append(" | ");
		sb.append("decode-type: " + decodeProjective);
		sb.append(" | ");
		sb.append("create-forest: " + createForest);
		sb.append(" | ");
		sb.append("format: " + format);

		sb.append("]\n");
		return sb.toString();
	}
}
predLabels = predInstance.plabels; + + boolean whole = true; + boolean wholeL = true; + + // NOTE: the first item is the root info added during nextInstance(), so we skip it. + + int punc=0; + for (int i = 1; i < instanceLength; i++) { + if (predHeads[i] == goldHeads[i]) { + corr++; + + if (goldLabels[i].equals(predLabels[i])) corrL++; + else { + // System.out.println(numsent+" error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + wholeL = false; + } + } + else { + // System.out.println(numsent+"error gold "+goldLabels[i]+" "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + whole = false; wholeL = false; + } + } + total += ((instanceLength - 1) - punc); // Subtract one to not score fake root token + + if(whole) corrsent++; + if(wholeL) corrsentL++; + numsent++; + + goldInstance = goldReader.getNext(); + predInstance = predictedReader.getNext(); + } + + Results r = new Results(); + + r.total = total; + r.corr = corr; + r.las =(float)Math.round(((double)corrL/total)*100000)/1000; + r.ula =(float)Math.round(((double)corr /total)*100000)/1000; + System.out.print("Total: " + total+" \tCorrect: " + corr+" "); + System.out.println("LAS: " + (double)Math.round(((double)corrL/total)*100000)/1000+" \tTotal: " + (double)Math.round(((double)corrsentL/numsent)*100000)/1000+ + " \tULA: " + (double)Math.round(((double)corr /total)*100000)/1000+" \tTotal: " + (double)Math.round(((double)corrsent /numsent)*100000)/1000); + + return r; + } + + + public static float round (double v){ + + return Math.round(v*10000F)/10000F; + } + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/util/Split.java b/dependencyParser/basic/mate-tools/src/is2/util/Split.java new file mode 100755 index 0000000..48eadbe --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/util/Split.java @@ -0,0 +1,94 @@ +package is2.util; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.Reader; +import 
java.nio.channels.Channels; +import java.nio.channels.FileChannel; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.util.StringTokenizer; + +public class Split { + + /** + * Splits a tokenized sentences into one word per line format: + * + * Input + * > I am an text . + * > Sentence two ... + * + * Output: + * I _ _ _ ... + * am _ _ _ ... + * ... + * + * @param args + * @throws IOException + */ + public static void main(String args[]) throws IOException { + + if (args.length!=1) { + System.out.println("Please provide a file name."); + System.exit(0); + } + + String filename = args[0]; +// Charset charset = Charset.forName("UTF-8"); + + FileInputStream in = new FileInputStream(filename); + FileChannel channel = in.getChannel(); + CharsetDecoder decoder = Charset.defaultCharset().newDecoder();//charset.newDecoder(); + Reader infile = Channels.newReader(channel , decoder, 16*1024); + BufferedReader bInfile = new BufferedReader(infile); + +// DataOutputStream dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(options.modelName))); + + + String s; + while ((s = bInfile.readLine()) != null) { + + + // do the first tokens contain a colon? 
+ int colon =0; + for(int k=0;k<12;k++) { + if (s.length()<=k) break; + if (s.charAt(k) == ':') { + + colon++; + break; + } + if (s.charAt(k) == ' ') break; + } + + String prefix =colon>0?s.substring(0,s.indexOf(":"))+"_":""; + + if (colon>0) { + s = s.substring(s.indexOf(":")+1); + } + + StringTokenizer t = new StringTokenizer(s); + int i=1; + boolean found=false; + while(t.hasMoreTokens()) { + found =true; + String tk =t.nextToken(); + if (tk.contains("=")) continue; + System.out.print(prefix+i+"\t"); + System.out.print(tk); + System.out.println("\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_"); + i++; + } + if (found) System.out.println(); + + } + bInfile.close(); + + + + + } + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/util/Split2.java b/dependencyParser/basic/mate-tools/src/is2/util/Split2.java new file mode 100644 index 0000000..1690a3d --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/util/Split2.java @@ -0,0 +1,70 @@ +package is2.util; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.Reader; +import java.nio.channels.Channels; +import java.nio.channels.FileChannel; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.util.StringTokenizer; + +public class Split2 { + + /** + * Splits a tokenized sentences into one word per line format: + * + * Input + * > I am an text . + * > Sentence two ... + * + * Output: + * I _ _ _ ... + * am _ _ _ ... + * ... 
+ * + * @param args + * @throws IOException + */ + public static void main(String args[]) throws IOException { + + if (args.length<1) { + System.out.println("Please provide a file name."); + System.exit(0); + } + + + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(args[0]),"UTF-8"),32768); + BufferedWriter write = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[1]),"ISO-8859-1")); + + + String s; + int cnt=0; + while ((s = reader.readLine()) != null) { + StringTokenizer t = new StringTokenizer(s); + while(t.hasMoreTokens()) { + String tk =t.nextToken(); + for(int c : tk.toCharArray()) { + if (c<0 && c>=255) System.out.println("contain sign "+c+" "+cnt); + } + write.write(tk); + write.newLine(); + cnt++; + } + write.newLine(); + } + reader.close(); + write.flush(); + write.close(); + + + + } + + +} diff --git a/dependencyParser/basic/mate-tools/src/is2/util/Split3.java b/dependencyParser/basic/mate-tools/src/is2/util/Split3.java new file mode 100644 index 0000000..03d920c --- /dev/null +++ b/dependencyParser/basic/mate-tools/src/is2/util/Split3.java @@ -0,0 +1,67 @@ +package is2.util; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.Reader; +import java.nio.channels.Channels; +import java.nio.channels.FileChannel; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.util.StringTokenizer; + +public class Split3 { + + /** + * Splits a tokenized sentences into one word per line format: + * + * Input + * > I am an text . + * > Sentence two ... + * + * Output: + * I _ _ _ ... + * am _ _ _ ... + * ... 
+ * + * @param args + * @throws IOException + */ + public static void main(String args[]) throws IOException { + + if (args.length<1) { + System.out.println("Please provide a file name."); + System.exit(0); + } + + + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(args[0]),"UTF-8"),32768); + BufferedWriter write = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[1]),"UTF-8"),32768); + + + String s; + int cnt=0; + while ((s = reader.readLine()) != null) { + StringTokenizer t = new StringTokenizer(s); + while(t.hasMoreTokens()) { + String tk =t.nextToken(); + write.write(tk); + write.newLine(); + cnt++; + } + write.newLine(); + } + reader.close(); + write.flush(); + write.close(); + + + + } + + +} diff --git a/dependencyParser/examples/README.txt b/dependencyParser/experimental/examples/README.txt index c5e773a..c5e773a 100644 --- a/dependencyParser/examples/README.txt +++ b/dependencyParser/experimental/examples/README.txt diff --git a/dependencyParser/examples/d2 output.txt b/dependencyParser/experimental/examples/d2 output.txt index 4637aa8..4637aa8 100644 --- a/dependencyParser/examples/d2 output.txt +++ b/dependencyParser/experimental/examples/d2 output.txt diff --git a/dependencyParser/examples/edges output.txt b/dependencyParser/experimental/examples/edges output.txt index c1e80e8..c1e80e8 100644 --- a/dependencyParser/examples/edges output.txt +++ b/dependencyParser/experimental/examples/edges output.txt diff --git a/dependencyParser/examples/is output.txt b/dependencyParser/experimental/examples/is output.txt index 24e879b..24e879b 100644 --- a/dependencyParser/examples/is output.txt +++ b/dependencyParser/experimental/examples/is output.txt diff --git a/dependencyParser/examples/test.csv b/dependencyParser/experimental/examples/test.csv index 4d1eba0..4d1eba0 100644 --- a/dependencyParser/examples/test.csv +++ b/dependencyParser/experimental/examples/test.csv diff --git 
a/dependencyParser/examples/test.out b/dependencyParser/experimental/examples/test.out index 51d709f..51d709f 100644 --- a/dependencyParser/examples/test.out +++ b/dependencyParser/experimental/examples/test.out diff --git a/dependencyParser/mate-tools/.classpath b/dependencyParser/experimental/mate-tools/.classpath index 4d8fea6..4d8fea6 100644 --- a/dependencyParser/mate-tools/.classpath +++ b/dependencyParser/experimental/mate-tools/.classpath diff --git a/dependencyParser/experimental/mate-tools/.externalToolBuilders/New_Builder.launch b/dependencyParser/experimental/mate-tools/.externalToolBuilders/New_Builder.launch new file mode 100644 index 0000000..eca73f7 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/.externalToolBuilders/New_Builder.launch @@ -0,0 +1,8 @@ +<?xml version="1.0" encoding="UTF-8"?> +<launchConfiguration type="org.eclipse.ui.externaltools.ProgramBuilderLaunchConfigurationType"> +<booleanAttribute key="org.eclipse.debug.ui.ATTR_LAUNCH_IN_BACKGROUND" value="false"/> +<stringAttribute key="org.eclipse.ui.externaltools.ATTR_LOCATION" value="${workspace_loc:/mate-tools/.project}"/> +<stringAttribute key="org.eclipse.ui.externaltools.ATTR_RUN_BUILD_KINDS" value="full,incremental,"/> +<booleanAttribute key="org.eclipse.ui.externaltools.ATTR_TRIGGERS_CONFIGURED" value="true"/> +<stringAttribute key="org.eclipse.ui.externaltools.ATTR_WORKING_DIRECTORY" value="${workspace_loc:/mate-tools}"/> +</launchConfiguration> diff --git a/dependencyParser/experimental/mate-tools/.externalToolBuilders/ana.launch b/dependencyParser/experimental/mate-tools/.externalToolBuilders/ana.launch new file mode 100644 index 0000000..09df90d --- /dev/null +++ b/dependencyParser/experimental/mate-tools/.externalToolBuilders/ana.launch @@ -0,0 +1,20 @@ +<?xml version="1.0" encoding="UTF-8"?> +<launchConfiguration type="org.eclipse.ant.AntBuilderLaunchConfigurationType"> +<booleanAttribute key="org.eclipse.ant.ui.ATTR_TARGETS_UPDATED" value="true"/> 
+<booleanAttribute key="org.eclipse.ant.ui.DEFAULT_VM_INSTALL" value="false"/> +<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS"> +<listEntry value="/mate-tools/scripts/build.xml"/> +</listAttribute> +<listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES"> +<listEntry value="1"/> +</listAttribute> +<booleanAttribute key="org.eclipse.debug.ui.ATTR_LAUNCH_IN_BACKGROUND" value="false"/> +<stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.ant.ui.AntClasspathProvider"/> +<booleanAttribute key="org.eclipse.jdt.launching.DEFAULT_CLASSPATH" value="true"/> +<stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="mate-tools"/> +<booleanAttribute key="org.eclipse.ui.externaltools.ATTR_BUILDER_ENABLED" value="false"/> +<stringAttribute key="org.eclipse.ui.externaltools.ATTR_LOCATION" value="${workspace_loc:/mate-tools/scripts/build.xml}"/> +<stringAttribute key="org.eclipse.ui.externaltools.ATTR_RUN_BUILD_KINDS" value="full,incremental,"/> +<booleanAttribute key="org.eclipse.ui.externaltools.ATTR_TRIGGERS_CONFIGURED" value="true"/> +<stringAttribute key="org.eclipse.ui.externaltools.ATTR_WORKING_DIRECTORY" value="${workspace_loc:/mate-tools}"/> +</launchConfiguration> diff --git a/dependencyParser/experimental/mate-tools/.project b/dependencyParser/experimental/mate-tools/.project new file mode 100644 index 0000000..f813b9e --- /dev/null +++ b/dependencyParser/experimental/mate-tools/.project @@ -0,0 +1,17 @@ +<?xml version="1.0" encoding="UTF-8"?> +<projectDescription> + <name>mate-tools</name> + <comment></comment> + <projects> + </projects> + <buildSpec> + <buildCommand> + <name>org.eclipse.jdt.core.javabuilder</name> + <arguments> + </arguments> + </buildCommand> + </buildSpec> + <natures> + <nature>org.eclipse.jdt.core.javanature</nature> + </natures> +</projectDescription> diff --git a/dependencyParser/experimental/mate-tools/build.xml b/dependencyParser/experimental/mate-tools/build.xml new 
file mode 100644 index 0000000..c558279 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/build.xml @@ -0,0 +1,64 @@ +<project name="analyse" default="compile" basedir="."> + <description> + The base ant build file. + </description> + + <!-- set global properties for this build --> + <property name="src" location="src"/> + <property name="classes" location="classes"/> + <property name="dist" location="dist"/> + <property name="include" location="include"/> + + + <target name="init" description="Clears the /class directory"> + <!-- Create the time stamp --> + <tstamp/> + <mkdir dir="dist"/> + <mkdir dir="javadoc"/> + <mkdir dir="classes"/> + </target> + + <target name="compile" depends="init" description="Compile the source" > + <!-- Compile the java code from ${src} into ${build} executable="javac" --> + + <javac srcdir="${src}" + destdir="${classes}" + includeantruntime="false" + executable="javac.exe" + optimize="true" + debug="off" + classpath=""/> + + </target> + + <target name="build" description="Build the distribution .jar file" > + <!-- Create the temporary distribution directory --> + <delete includeEmptyDirs="true"><fileset dir="dist" includes="**/*" excludes="gtc*.jar"/></delete> + <mkdir dir="${dist}/temp-${DSTAMP}"/> + <copy todir="${dist}/temp-${DSTAMP}"><fileset dir="${classes}" /></copy> + <copy todir="${dist}/temp-${DSTAMP}"><fileset dir="${include}" /></copy> + <!-- copy everything from /include/others to dist + <copy todir="${dist}/temp-${DSTAMP}"><fileset dir="${include}/others" /></copy>--> + <!-- copy everything from /include/classes to dist + <copy todir="${dist}/temp-${DSTAMP}"><fileset dir="${include}/classes" /></copy>--> + <!-- pack everything into a .jar file --> + <jar jarfile="${dist}/anna-3.5.jar" + basedir="${dist}/temp-${DSTAMP}"/> + <delete dir="${dist}/temp-{DSTAMP}" /> + </target> + + <target name="javadoc" depends="init" description="Create the javadoc API documentation" > + <delete includeEmptyDirs="true"><fileset 
dir="javadoc" includes="**/*"/></delete> + <!-- TODO: you might add new packages to packagenames --> + <javadoc destdir="javadoc" access="package" source="1.4" + use="false" notree="false" nonavbar="false" noindex="true" + splitindex="false" author="true" version="true" + nodeprecatedlist="true" nodeprecated="false" + packagenames="gtc.*.*" + sourcepath="src" classpath="class"/> + </target> + + + <target name="all" depends="init,compile,javadoc" description="Make all" /> +</project> + diff --git a/dependencyParser/mate-tools/classes/decoder/ParallelDecoder$DSet.class b/dependencyParser/experimental/mate-tools/classes/decoder/ParallelDecoder$DSet.class index afd509c..afd509c 100644 --- a/dependencyParser/mate-tools/classes/decoder/ParallelDecoder$DSet.class +++ b/dependencyParser/experimental/mate-tools/classes/decoder/ParallelDecoder$DSet.class diff --git a/dependencyParser/mate-tools/classes/decoder/ParallelDecoder.class b/dependencyParser/experimental/mate-tools/classes/decoder/ParallelDecoder.class index 238fd06..238fd06 100644 --- a/dependencyParser/mate-tools/classes/decoder/ParallelDecoder.class +++ b/dependencyParser/experimental/mate-tools/classes/decoder/ParallelDecoder.class diff --git a/dependencyParser/mate-tools/classes/decoder/ParallelRearrangeNBest$PA.class b/dependencyParser/experimental/mate-tools/classes/decoder/ParallelRearrangeNBest$PA.class index 3edcd6b..3edcd6b 100644 --- a/dependencyParser/mate-tools/classes/decoder/ParallelRearrangeNBest$PA.class +++ b/dependencyParser/experimental/mate-tools/classes/decoder/ParallelRearrangeNBest$PA.class diff --git a/dependencyParser/mate-tools/classes/decoder/ParallelRearrangeNBest.class b/dependencyParser/experimental/mate-tools/classes/decoder/ParallelRearrangeNBest.class index c604932..c604932 100644 --- a/dependencyParser/mate-tools/classes/decoder/ParallelRearrangeNBest.class +++ b/dependencyParser/experimental/mate-tools/classes/decoder/ParallelRearrangeNBest.class diff --git 
a/dependencyParser/mate-tools/classes/decoder/ParallelRearrangeNBest2$PA.class b/dependencyParser/experimental/mate-tools/classes/decoder/ParallelRearrangeNBest2$PA.class index f1639e7..f1639e7 100644 --- a/dependencyParser/mate-tools/classes/decoder/ParallelRearrangeNBest2$PA.class +++ b/dependencyParser/experimental/mate-tools/classes/decoder/ParallelRearrangeNBest2$PA.class diff --git a/dependencyParser/mate-tools/classes/decoder/ParallelRearrangeNBest2.class b/dependencyParser/experimental/mate-tools/classes/decoder/ParallelRearrangeNBest2.class index 80e8921..80e8921 100644 --- a/dependencyParser/mate-tools/classes/decoder/ParallelRearrangeNBest2.class +++ b/dependencyParser/experimental/mate-tools/classes/decoder/ParallelRearrangeNBest2.class diff --git a/dependencyParser/mate-tools/classes/examples/DependencyParser.class b/dependencyParser/experimental/mate-tools/classes/examples/DependencyParser.class index 89ed2d4..89ed2d4 100644 --- a/dependencyParser/mate-tools/classes/examples/DependencyParser.class +++ b/dependencyParser/experimental/mate-tools/classes/examples/DependencyParser.class diff --git a/dependencyParser/mate-tools/classes/examples/FullPipelineSpanish.class b/dependencyParser/experimental/mate-tools/classes/examples/FullPipelineSpanish.class index 256caa9..256caa9 100644 --- a/dependencyParser/mate-tools/classes/examples/FullPipelineSpanish.class +++ b/dependencyParser/experimental/mate-tools/classes/examples/FullPipelineSpanish.class diff --git a/dependencyParser/mate-tools/classes/examples/FullPipelineTest.class b/dependencyParser/experimental/mate-tools/classes/examples/FullPipelineTest.class index d76a879..d76a879 100644 --- a/dependencyParser/mate-tools/classes/examples/FullPipelineTest.class +++ b/dependencyParser/experimental/mate-tools/classes/examples/FullPipelineTest.class diff --git a/dependencyParser/mate-tools/classes/examples/MorphTagger.class b/dependencyParser/experimental/mate-tools/classes/examples/MorphTagger.class index 
305225c..305225c 100644 --- a/dependencyParser/mate-tools/classes/examples/MorphTagger.class +++ b/dependencyParser/experimental/mate-tools/classes/examples/MorphTagger.class diff --git a/dependencyParser/mate-tools/classes/examples/ParseOnly.class b/dependencyParser/experimental/mate-tools/classes/examples/ParseOnly.class index 73a8f49..73a8f49 100644 --- a/dependencyParser/mate-tools/classes/examples/ParseOnly.class +++ b/dependencyParser/experimental/mate-tools/classes/examples/ParseOnly.class diff --git a/dependencyParser/mate-tools/classes/examples/Pipeline.class b/dependencyParser/experimental/mate-tools/classes/examples/Pipeline.class index b8631e1..b8631e1 100644 --- a/dependencyParser/mate-tools/classes/examples/Pipeline.class +++ b/dependencyParser/experimental/mate-tools/classes/examples/Pipeline.class diff --git a/dependencyParser/mate-tools/classes/extractors/Extractor.class b/dependencyParser/experimental/mate-tools/classes/extractors/Extractor.class index 47ad699..47ad699 100644 --- a/dependencyParser/mate-tools/classes/extractors/Extractor.class +++ b/dependencyParser/experimental/mate-tools/classes/extractors/Extractor.class diff --git a/dependencyParser/mate-tools/classes/extractors/ExtractorClusterStacked.class b/dependencyParser/experimental/mate-tools/classes/extractors/ExtractorClusterStacked.class index d8cc567..d8cc567 100644 --- a/dependencyParser/mate-tools/classes/extractors/ExtractorClusterStacked.class +++ b/dependencyParser/experimental/mate-tools/classes/extractors/ExtractorClusterStacked.class diff --git a/dependencyParser/mate-tools/classes/extractors/ExtractorClusterStackedR2.class b/dependencyParser/experimental/mate-tools/classes/extractors/ExtractorClusterStackedR2.class index 2c32133..2c32133 100644 --- a/dependencyParser/mate-tools/classes/extractors/ExtractorClusterStackedR2.class +++ b/dependencyParser/experimental/mate-tools/classes/extractors/ExtractorClusterStackedR2.class diff --git 
a/dependencyParser/mate-tools/classes/extractors/ExtractorFactory.class b/dependencyParser/experimental/mate-tools/classes/extractors/ExtractorFactory.class index 9f8fa84..9f8fa84 100644 --- a/dependencyParser/mate-tools/classes/extractors/ExtractorFactory.class +++ b/dependencyParser/experimental/mate-tools/classes/extractors/ExtractorFactory.class diff --git a/dependencyParser/mate-tools/classes/extractors/ExtractorReranker.class b/dependencyParser/experimental/mate-tools/classes/extractors/ExtractorReranker.class index 6173744..6173744 100644 --- a/dependencyParser/mate-tools/classes/extractors/ExtractorReranker.class +++ b/dependencyParser/experimental/mate-tools/classes/extractors/ExtractorReranker.class diff --git a/dependencyParser/mate-tools/classes/extractors/ParallelExtract$DSet.class b/dependencyParser/experimental/mate-tools/classes/extractors/ParallelExtract$DSet.class index f7ef6bf..f7ef6bf 100644 --- a/dependencyParser/mate-tools/classes/extractors/ParallelExtract$DSet.class +++ b/dependencyParser/experimental/mate-tools/classes/extractors/ParallelExtract$DSet.class diff --git a/dependencyParser/mate-tools/classes/extractors/ParallelExtract.class b/dependencyParser/experimental/mate-tools/classes/extractors/ParallelExtract.class index 1ef577d..1ef577d 100644 --- a/dependencyParser/mate-tools/classes/extractors/ParallelExtract.class +++ b/dependencyParser/experimental/mate-tools/classes/extractors/ParallelExtract.class diff --git a/dependencyParser/mate-tools/classes/is2/data/Closed.class b/dependencyParser/experimental/mate-tools/classes/is2/data/Closed.class index e7abce7..e7abce7 100644 --- a/dependencyParser/mate-tools/classes/is2/data/Closed.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/Closed.class diff --git a/dependencyParser/mate-tools/classes/is2/data/Cluster.class b/dependencyParser/experimental/mate-tools/classes/is2/data/Cluster.class index 330151f..330151f 100644 --- 
a/dependencyParser/mate-tools/classes/is2/data/Cluster.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/Cluster.class diff --git a/dependencyParser/mate-tools/classes/is2/data/D4.class b/dependencyParser/experimental/mate-tools/classes/is2/data/D4.class index 544eae0..544eae0 100644 --- a/dependencyParser/mate-tools/classes/is2/data/D4.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/D4.class diff --git a/dependencyParser/mate-tools/classes/is2/data/D6.class b/dependencyParser/experimental/mate-tools/classes/is2/data/D6.class index 274867a..274867a 100644 --- a/dependencyParser/mate-tools/classes/is2/data/D6.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/D6.class diff --git a/dependencyParser/mate-tools/classes/is2/data/D7.class b/dependencyParser/experimental/mate-tools/classes/is2/data/D7.class index 4cf1121..4cf1121 100644 --- a/dependencyParser/mate-tools/classes/is2/data/D7.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/D7.class diff --git a/dependencyParser/mate-tools/classes/is2/data/DPSTree.class b/dependencyParser/experimental/mate-tools/classes/is2/data/DPSTree.class index c3cf0a4..c3cf0a4 100644 --- a/dependencyParser/mate-tools/classes/is2/data/DPSTree.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/DPSTree.class diff --git a/dependencyParser/mate-tools/classes/is2/data/DX.class b/dependencyParser/experimental/mate-tools/classes/is2/data/DX.class index 773f0d7..773f0d7 100644 --- a/dependencyParser/mate-tools/classes/is2/data/DX.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/DX.class diff --git a/dependencyParser/mate-tools/classes/is2/data/DataF.class b/dependencyParser/experimental/mate-tools/classes/is2/data/DataF.class index eb52b0e..eb52b0e 100644 --- a/dependencyParser/mate-tools/classes/is2/data/DataF.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/DataF.class diff --git 
a/dependencyParser/mate-tools/classes/is2/data/DataFES.class b/dependencyParser/experimental/mate-tools/classes/is2/data/DataFES.class index c4c1f7d..c4c1f7d 100644 --- a/dependencyParser/mate-tools/classes/is2/data/DataFES.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/DataFES.class diff --git a/dependencyParser/mate-tools/classes/is2/data/DataT.class b/dependencyParser/experimental/mate-tools/classes/is2/data/DataT.class index 5ed10d3..5ed10d3 100644 --- a/dependencyParser/mate-tools/classes/is2/data/DataT.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/DataT.class diff --git a/dependencyParser/mate-tools/classes/is2/data/Edges$C.class b/dependencyParser/experimental/mate-tools/classes/is2/data/Edges$C.class index b57036e..b57036e 100644 --- a/dependencyParser/mate-tools/classes/is2/data/Edges$C.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/Edges$C.class diff --git a/dependencyParser/mate-tools/classes/is2/data/Edges.class b/dependencyParser/experimental/mate-tools/classes/is2/data/Edges.class index e293bbe..e293bbe 100644 --- a/dependencyParser/mate-tools/classes/is2/data/Edges.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/Edges.class diff --git a/dependencyParser/mate-tools/classes/is2/data/F2S.class b/dependencyParser/experimental/mate-tools/classes/is2/data/F2S.class index 1c9ed89..1c9ed89 100644 --- a/dependencyParser/mate-tools/classes/is2/data/F2S.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/F2S.class diff --git a/dependencyParser/mate-tools/classes/is2/data/F2SD.class b/dependencyParser/experimental/mate-tools/classes/is2/data/F2SD.class index a2b7a83..a2b7a83 100644 --- a/dependencyParser/mate-tools/classes/is2/data/F2SD.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/F2SD.class diff --git a/dependencyParser/mate-tools/classes/is2/data/F2SF.class b/dependencyParser/experimental/mate-tools/classes/is2/data/F2SF.class 
index f782eef..f782eef 100644 --- a/dependencyParser/mate-tools/classes/is2/data/F2SF.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/F2SF.class diff --git a/dependencyParser/mate-tools/classes/is2/data/F2SP.class b/dependencyParser/experimental/mate-tools/classes/is2/data/F2SP.class index 34b66e0..34b66e0 100644 --- a/dependencyParser/mate-tools/classes/is2/data/F2SP.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/F2SP.class diff --git a/dependencyParser/mate-tools/classes/is2/data/F2ST.class b/dependencyParser/experimental/mate-tools/classes/is2/data/F2ST.class index a88c6a6..a88c6a6 100644 --- a/dependencyParser/mate-tools/classes/is2/data/F2ST.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/F2ST.class diff --git a/dependencyParser/mate-tools/classes/is2/data/FV.class b/dependencyParser/experimental/mate-tools/classes/is2/data/FV.class index b4c89d2..b4c89d2 100644 --- a/dependencyParser/mate-tools/classes/is2/data/FV.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/FV.class diff --git a/dependencyParser/mate-tools/classes/is2/data/FVR.class b/dependencyParser/experimental/mate-tools/classes/is2/data/FVR.class index 9d75826..9d75826 100644 --- a/dependencyParser/mate-tools/classes/is2/data/FVR.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/FVR.class diff --git a/dependencyParser/mate-tools/classes/is2/data/IEncoder.class b/dependencyParser/experimental/mate-tools/classes/is2/data/IEncoder.class index 48b863c..48b863c 100644 --- a/dependencyParser/mate-tools/classes/is2/data/IEncoder.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/IEncoder.class diff --git a/dependencyParser/mate-tools/classes/is2/data/IEncoderPlus.class b/dependencyParser/experimental/mate-tools/classes/is2/data/IEncoderPlus.class index 4ea4883..4ea4883 100644 --- a/dependencyParser/mate-tools/classes/is2/data/IEncoderPlus.class +++ 
b/dependencyParser/experimental/mate-tools/classes/is2/data/IEncoderPlus.class diff --git a/dependencyParser/mate-tools/classes/is2/data/IFV.class b/dependencyParser/experimental/mate-tools/classes/is2/data/IFV.class index 652636d..652636d 100644 --- a/dependencyParser/mate-tools/classes/is2/data/IFV.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/IFV.class diff --git a/dependencyParser/mate-tools/classes/is2/data/Instances.class b/dependencyParser/experimental/mate-tools/classes/is2/data/Instances.class index 014416a..014416a 100644 --- a/dependencyParser/mate-tools/classes/is2/data/Instances.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/Instances.class diff --git a/dependencyParser/mate-tools/classes/is2/data/InstancesTagger.class b/dependencyParser/experimental/mate-tools/classes/is2/data/InstancesTagger.class index dc155f8..dc155f8 100644 --- a/dependencyParser/mate-tools/classes/is2/data/InstancesTagger.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/InstancesTagger.class diff --git a/dependencyParser/mate-tools/classes/is2/data/IntIntHash.class b/dependencyParser/experimental/mate-tools/classes/is2/data/IntIntHash.class index 246e0c9..246e0c9 100644 --- a/dependencyParser/mate-tools/classes/is2/data/IntIntHash.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/IntIntHash.class diff --git a/dependencyParser/mate-tools/classes/is2/data/Long2Int.class b/dependencyParser/experimental/mate-tools/classes/is2/data/Long2Int.class index c11b75c..c11b75c 100644 --- a/dependencyParser/mate-tools/classes/is2/data/Long2Int.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/Long2Int.class diff --git a/dependencyParser/mate-tools/classes/is2/data/Long2IntExact.class b/dependencyParser/experimental/mate-tools/classes/is2/data/Long2IntExact.class index 830c90c..830c90c 100644 --- a/dependencyParser/mate-tools/classes/is2/data/Long2IntExact.class +++ 
b/dependencyParser/experimental/mate-tools/classes/is2/data/Long2IntExact.class diff --git a/dependencyParser/mate-tools/classes/is2/data/Long2IntInterface.class b/dependencyParser/experimental/mate-tools/classes/is2/data/Long2IntInterface.class index af26b65..af26b65 100644 --- a/dependencyParser/mate-tools/classes/is2/data/Long2IntInterface.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/Long2IntInterface.class diff --git a/dependencyParser/mate-tools/classes/is2/data/Long2IntQuick.class b/dependencyParser/experimental/mate-tools/classes/is2/data/Long2IntQuick.class index 054e0e2..054e0e2 100644 --- a/dependencyParser/mate-tools/classes/is2/data/Long2IntQuick.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/Long2IntQuick.class diff --git a/dependencyParser/mate-tools/classes/is2/data/MFB.class b/dependencyParser/experimental/mate-tools/classes/is2/data/MFB.class index 58aa597..58aa597 100644 --- a/dependencyParser/mate-tools/classes/is2/data/MFB.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/MFB.class diff --git a/dependencyParser/mate-tools/classes/is2/data/MFC.class b/dependencyParser/experimental/mate-tools/classes/is2/data/MFC.class index fa3e098..fa3e098 100644 --- a/dependencyParser/mate-tools/classes/is2/data/MFC.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/MFC.class diff --git a/dependencyParser/mate-tools/classes/is2/data/MFO$Data4.class b/dependencyParser/experimental/mate-tools/classes/is2/data/MFO$Data4.class index dbc74bd..dbc74bd 100644 --- a/dependencyParser/mate-tools/classes/is2/data/MFO$Data4.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/MFO$Data4.class diff --git a/dependencyParser/mate-tools/classes/is2/data/MFO.class b/dependencyParser/experimental/mate-tools/classes/is2/data/MFO.class index 1125008..1125008 100644 --- a/dependencyParser/mate-tools/classes/is2/data/MFO.class +++ 
b/dependencyParser/experimental/mate-tools/classes/is2/data/MFO.class diff --git a/dependencyParser/mate-tools/classes/is2/data/Open.class b/dependencyParser/experimental/mate-tools/classes/is2/data/Open.class index a34e40f..a34e40f 100644 --- a/dependencyParser/mate-tools/classes/is2/data/Open.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/Open.class diff --git a/dependencyParser/mate-tools/classes/is2/data/PSTree.class b/dependencyParser/experimental/mate-tools/classes/is2/data/PSTree.class index 3b68f29..3b68f29 100644 --- a/dependencyParser/mate-tools/classes/is2/data/PSTree.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/PSTree.class diff --git a/dependencyParser/mate-tools/classes/is2/data/Parameter.class b/dependencyParser/experimental/mate-tools/classes/is2/data/Parameter.class index f9a70aa..f9a70aa 100644 --- a/dependencyParser/mate-tools/classes/is2/data/Parameter.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/Parameter.class diff --git a/dependencyParser/mate-tools/classes/is2/data/ParametersFloat.class b/dependencyParser/experimental/mate-tools/classes/is2/data/ParametersFloat.class index 53c3f0a..53c3f0a 100644 --- a/dependencyParser/mate-tools/classes/is2/data/ParametersFloat.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/ParametersFloat.class diff --git a/dependencyParser/mate-tools/classes/is2/data/Parse.class b/dependencyParser/experimental/mate-tools/classes/is2/data/Parse.class index bfec325..bfec325 100644 --- a/dependencyParser/mate-tools/classes/is2/data/Parse.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/Parse.class diff --git a/dependencyParser/mate-tools/classes/is2/data/ParseNBest.class b/dependencyParser/experimental/mate-tools/classes/is2/data/ParseNBest.class index 180e58c..180e58c 100644 --- a/dependencyParser/mate-tools/classes/is2/data/ParseNBest.class +++ 
b/dependencyParser/experimental/mate-tools/classes/is2/data/ParseNBest.class diff --git a/dependencyParser/mate-tools/classes/is2/data/PipeGen.class b/dependencyParser/experimental/mate-tools/classes/is2/data/PipeGen.class index d3400c4..d3400c4 100644 --- a/dependencyParser/mate-tools/classes/is2/data/PipeGen.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/PipeGen.class diff --git a/dependencyParser/mate-tools/classes/is2/data/PrimeFinder.class b/dependencyParser/experimental/mate-tools/classes/is2/data/PrimeFinder.class index ec51f13..ec51f13 100644 --- a/dependencyParser/mate-tools/classes/is2/data/PrimeFinder.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/PrimeFinder.class diff --git a/dependencyParser/mate-tools/classes/is2/data/RandomIndex.class b/dependencyParser/experimental/mate-tools/classes/is2/data/RandomIndex.class index 954f57c..954f57c 100644 --- a/dependencyParser/mate-tools/classes/is2/data/RandomIndex.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/RandomIndex.class diff --git a/dependencyParser/mate-tools/classes/is2/data/SentenceData09.class b/dependencyParser/experimental/mate-tools/classes/is2/data/SentenceData09.class index ee6d068..ee6d068 100644 --- a/dependencyParser/mate-tools/classes/is2/data/SentenceData09.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/SentenceData09.class diff --git a/dependencyParser/mate-tools/classes/is2/data/Thesaurus.class b/dependencyParser/experimental/mate-tools/classes/is2/data/Thesaurus.class index a7d20d4..a7d20d4 100644 --- a/dependencyParser/mate-tools/classes/is2/data/Thesaurus.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/data/Thesaurus.class diff --git a/dependencyParser/mate-tools/classes/is2/io/CONLLReader04.class b/dependencyParser/experimental/mate-tools/classes/is2/io/CONLLReader04.class index 3d0103f..3d0103f 100644 --- a/dependencyParser/mate-tools/classes/is2/io/CONLLReader04.class +++ 
b/dependencyParser/experimental/mate-tools/classes/is2/io/CONLLReader04.class diff --git a/dependencyParser/mate-tools/classes/is2/io/CONLLReader06.class b/dependencyParser/experimental/mate-tools/classes/is2/io/CONLLReader06.class index 06f8226..06f8226 100644 --- a/dependencyParser/mate-tools/classes/is2/io/CONLLReader06.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/io/CONLLReader06.class diff --git a/dependencyParser/mate-tools/classes/is2/io/CONLLReader08.class b/dependencyParser/experimental/mate-tools/classes/is2/io/CONLLReader08.class index b1c4919..b1c4919 100644 --- a/dependencyParser/mate-tools/classes/is2/io/CONLLReader08.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/io/CONLLReader08.class diff --git a/dependencyParser/mate-tools/classes/is2/io/CONLLReader09.class b/dependencyParser/experimental/mate-tools/classes/is2/io/CONLLReader09.class index 98cbf01..98cbf01 100644 --- a/dependencyParser/mate-tools/classes/is2/io/CONLLReader09.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/io/CONLLReader09.class diff --git a/dependencyParser/mate-tools/classes/is2/io/CONLLWriter06.class b/dependencyParser/experimental/mate-tools/classes/is2/io/CONLLWriter06.class index e1cdf31..e1cdf31 100644 --- a/dependencyParser/mate-tools/classes/is2/io/CONLLWriter06.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/io/CONLLWriter06.class diff --git a/dependencyParser/mate-tools/classes/is2/io/CONLLWriter09.class b/dependencyParser/experimental/mate-tools/classes/is2/io/CONLLWriter09.class index 2697f96..2697f96 100644 --- a/dependencyParser/mate-tools/classes/is2/io/CONLLWriter09.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/io/CONLLWriter09.class diff --git a/dependencyParser/mate-tools/classes/is2/io/IOGenerals.class b/dependencyParser/experimental/mate-tools/classes/is2/io/IOGenerals.class index 5346483..5346483 100644 --- a/dependencyParser/mate-tools/classes/is2/io/IOGenerals.class 
+++ b/dependencyParser/experimental/mate-tools/classes/is2/io/IOGenerals.class diff --git a/dependencyParser/mate-tools/classes/is2/io/PSReader.class b/dependencyParser/experimental/mate-tools/classes/is2/io/PSReader.class index d197ad7..d197ad7 100644 --- a/dependencyParser/mate-tools/classes/is2/io/PSReader.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/io/PSReader.class diff --git a/dependencyParser/mate-tools/classes/is2/io/TigerReader$Line.class b/dependencyParser/experimental/mate-tools/classes/is2/io/TigerReader$Line.class index 7c14ffa..7c14ffa 100644 --- a/dependencyParser/mate-tools/classes/is2/io/TigerReader$Line.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/io/TigerReader$Line.class diff --git a/dependencyParser/mate-tools/classes/is2/io/TigerReader.class b/dependencyParser/experimental/mate-tools/classes/is2/io/TigerReader.class index eb6aa37..eb6aa37 100644 --- a/dependencyParser/mate-tools/classes/is2/io/TigerReader.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/io/TigerReader.class diff --git a/dependencyParser/mate-tools/classes/is2/lemmatizer/Evaluator$1.class b/dependencyParser/experimental/mate-tools/classes/is2/lemmatizer/Evaluator$1.class index e015840..e015840 100644 --- a/dependencyParser/mate-tools/classes/is2/lemmatizer/Evaluator$1.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/lemmatizer/Evaluator$1.class diff --git a/dependencyParser/mate-tools/classes/is2/lemmatizer/Evaluator.class b/dependencyParser/experimental/mate-tools/classes/is2/lemmatizer/Evaluator.class index 53e79f4..53e79f4 100644 --- a/dependencyParser/mate-tools/classes/is2/lemmatizer/Evaluator.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/lemmatizer/Evaluator.class diff --git a/dependencyParser/mate-tools/classes/is2/lemmatizer/Lemmatizer$1.class b/dependencyParser/experimental/mate-tools/classes/is2/lemmatizer/Lemmatizer$1.class index fce30de..fce30de 100644 --- 
a/dependencyParser/mate-tools/classes/is2/lemmatizer/Lemmatizer$1.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/lemmatizer/Lemmatizer$1.class diff --git a/dependencyParser/mate-tools/classes/is2/lemmatizer/Lemmatizer.class b/dependencyParser/experimental/mate-tools/classes/is2/lemmatizer/Lemmatizer.class index b6d3745..b6d3745 100644 --- a/dependencyParser/mate-tools/classes/is2/lemmatizer/Lemmatizer.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/lemmatizer/Lemmatizer.class diff --git a/dependencyParser/mate-tools/classes/is2/lemmatizer/MFO.class b/dependencyParser/experimental/mate-tools/classes/is2/lemmatizer/MFO.class index 651e2ae..651e2ae 100644 --- a/dependencyParser/mate-tools/classes/is2/lemmatizer/MFO.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/lemmatizer/MFO.class diff --git a/dependencyParser/mate-tools/classes/is2/lemmatizer/Options.class b/dependencyParser/experimental/mate-tools/classes/is2/lemmatizer/Options.class index cbe3419..cbe3419 100644 --- a/dependencyParser/mate-tools/classes/is2/lemmatizer/Options.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/lemmatizer/Options.class diff --git a/dependencyParser/mate-tools/classes/is2/lemmatizer/Pipe$1.class b/dependencyParser/experimental/mate-tools/classes/is2/lemmatizer/Pipe$1.class index b445f21..b445f21 100644 --- a/dependencyParser/mate-tools/classes/is2/lemmatizer/Pipe$1.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/lemmatizer/Pipe$1.class diff --git a/dependencyParser/mate-tools/classes/is2/lemmatizer/Pipe.class b/dependencyParser/experimental/mate-tools/classes/is2/lemmatizer/Pipe.class index af22121..af22121 100644 --- a/dependencyParser/mate-tools/classes/is2/lemmatizer/Pipe.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/lemmatizer/Pipe.class diff --git a/dependencyParser/mate-tools/classes/is2/lemmatizer/StringEdit.class 
b/dependencyParser/experimental/mate-tools/classes/is2/lemmatizer/StringEdit.class index 22ddb78..22ddb78 100644 --- a/dependencyParser/mate-tools/classes/is2/lemmatizer/StringEdit.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/lemmatizer/StringEdit.class diff --git a/dependencyParser/mate-tools/classes/is2/mtag/Convert.class b/dependencyParser/experimental/mate-tools/classes/is2/mtag/Convert.class index 6814212..6814212 100644 --- a/dependencyParser/mate-tools/classes/is2/mtag/Convert.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/mtag/Convert.class diff --git a/dependencyParser/mate-tools/classes/is2/mtag/Evaluator$1.class b/dependencyParser/experimental/mate-tools/classes/is2/mtag/Evaluator$1.class index 9ad12ae..9ad12ae 100644 --- a/dependencyParser/mate-tools/classes/is2/mtag/Evaluator$1.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/mtag/Evaluator$1.class diff --git a/dependencyParser/mate-tools/classes/is2/mtag/Evaluator.class b/dependencyParser/experimental/mate-tools/classes/is2/mtag/Evaluator.class index 5f05944..5f05944 100644 --- a/dependencyParser/mate-tools/classes/is2/mtag/Evaluator.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/mtag/Evaluator.class diff --git a/dependencyParser/mate-tools/classes/is2/mtag/ExtractorM.class b/dependencyParser/experimental/mate-tools/classes/is2/mtag/ExtractorM.class index 1dfb207..1dfb207 100644 --- a/dependencyParser/mate-tools/classes/is2/mtag/ExtractorM.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/mtag/ExtractorM.class diff --git a/dependencyParser/mate-tools/classes/is2/mtag/MFO$Data.class b/dependencyParser/experimental/mate-tools/classes/is2/mtag/MFO$Data.class index fb46d59..fb46d59 100644 --- a/dependencyParser/mate-tools/classes/is2/mtag/MFO$Data.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/mtag/MFO$Data.class diff --git a/dependencyParser/mate-tools/classes/is2/mtag/MFO$Data4.class 
b/dependencyParser/experimental/mate-tools/classes/is2/mtag/MFO$Data4.class index a3346d7..a3346d7 100644 --- a/dependencyParser/mate-tools/classes/is2/mtag/MFO$Data4.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/mtag/MFO$Data4.class diff --git a/dependencyParser/mate-tools/classes/is2/mtag/MFO.class b/dependencyParser/experimental/mate-tools/classes/is2/mtag/MFO.class index 5014e54..5014e54 100644 --- a/dependencyParser/mate-tools/classes/is2/mtag/MFO.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/mtag/MFO.class diff --git a/dependencyParser/mate-tools/classes/is2/mtag/Options.class b/dependencyParser/experimental/mate-tools/classes/is2/mtag/Options.class index a1ffdcb..a1ffdcb 100644 --- a/dependencyParser/mate-tools/classes/is2/mtag/Options.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/mtag/Options.class diff --git a/dependencyParser/mate-tools/classes/is2/mtag/Pipe.class b/dependencyParser/experimental/mate-tools/classes/is2/mtag/Pipe.class index 4d3ae21..4d3ae21 100644 --- a/dependencyParser/mate-tools/classes/is2/mtag/Pipe.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/mtag/Pipe.class diff --git a/dependencyParser/mate-tools/classes/is2/mtag/Tagger.class b/dependencyParser/experimental/mate-tools/classes/is2/mtag/Tagger.class index 37af1ef..37af1ef 100644 --- a/dependencyParser/mate-tools/classes/is2/mtag/Tagger.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/mtag/Tagger.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/Closed.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/Closed.class index 7e6161a..7e6161a 100644 --- a/dependencyParser/mate-tools/classes/is2/parser/Closed.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/Closed.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/D5.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/D5.class index 11da4cc..11da4cc 100644 --- 
a/dependencyParser/mate-tools/classes/is2/parser/D5.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/D5.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/Decoder.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/Decoder.class index 35fbe20..35fbe20 100644 --- a/dependencyParser/mate-tools/classes/is2/parser/Decoder.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/Decoder.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/Edges$C.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/Edges$C.class index bcbfa2e..bcbfa2e 100644 --- a/dependencyParser/mate-tools/classes/is2/parser/Edges$C.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/Edges$C.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/Edges.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/Edges.class index 62f892e..62f892e 100644 --- a/dependencyParser/mate-tools/classes/is2/parser/Edges.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/Edges.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/Evaluator$Results.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/Evaluator$Results.class index 52913a2..52913a2 100644 --- a/dependencyParser/mate-tools/classes/is2/parser/Evaluator$Results.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/Evaluator$Results.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/Evaluator.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/Evaluator.class index cd8680f..cd8680f 100644 --- a/dependencyParser/mate-tools/classes/is2/parser/Evaluator.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/Evaluator.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/Extractor.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/Extractor.class index cb576f0..cb576f0 100644 --- 
a/dependencyParser/mate-tools/classes/is2/parser/Extractor.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/Extractor.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/MFO.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/MFO.class index e2f5d9f..e2f5d9f 100644 --- a/dependencyParser/mate-tools/classes/is2/parser/MFO.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/MFO.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/Open.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/Open.class index 8d4ef9b..8d4ef9b 100644 --- a/dependencyParser/mate-tools/classes/is2/parser/Open.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/Open.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/Options.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/Options.class index 806b3d0..806b3d0 100644 --- a/dependencyParser/mate-tools/classes/is2/parser/Options.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/Options.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/ParallelDecoder$DSet.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/ParallelDecoder$DSet.class index 534b87c..534b87c 100644 --- a/dependencyParser/mate-tools/classes/is2/parser/ParallelDecoder$DSet.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/ParallelDecoder$DSet.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/ParallelDecoder.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/ParallelDecoder.class index dd112d0..dd112d0 100644 --- a/dependencyParser/mate-tools/classes/is2/parser/ParallelDecoder.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/ParallelDecoder.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/ParallelExtract$DSet.class 
b/dependencyParser/experimental/mate-tools/classes/is2/parser/ParallelExtract$DSet.class index 823130d..823130d 100644 --- a/dependencyParser/mate-tools/classes/is2/parser/ParallelExtract$DSet.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/ParallelExtract$DSet.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/ParallelExtract.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/ParallelExtract.class index 7dfe35c..7dfe35c 100644 --- a/dependencyParser/mate-tools/classes/is2/parser/ParallelExtract.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/ParallelExtract.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/ParallelRearrange$PA.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/ParallelRearrange$PA.class index d1b8bfe..d1b8bfe 100644 --- a/dependencyParser/mate-tools/classes/is2/parser/ParallelRearrange$PA.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/ParallelRearrange$PA.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/ParallelRearrange.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/ParallelRearrange.class index fcfed43..fcfed43 100644 --- a/dependencyParser/mate-tools/classes/is2/parser/ParallelRearrange.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/ParallelRearrange.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/Parameters.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/Parameters.class index bc64375..bc64375 100644 --- a/dependencyParser/mate-tools/classes/is2/parser/Parameters.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/Parameters.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/ParametersFloat.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/ParametersFloat.class index 524da17..524da17 100644 --- 
a/dependencyParser/mate-tools/classes/is2/parser/ParametersFloat.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/ParametersFloat.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/Parser.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/Parser.class index 575c9da..575c9da 100644 --- a/dependencyParser/mate-tools/classes/is2/parser/Parser.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/Parser.class diff --git a/dependencyParser/mate-tools/classes/is2/parser/Pipe.class b/dependencyParser/experimental/mate-tools/classes/is2/parser/Pipe.class index ea909ad..ea909ad 100644 --- a/dependencyParser/mate-tools/classes/is2/parser/Pipe.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/Pipe.class diff --git a/dependencyParser/mate-tools/src/is2/parser/package.html b/dependencyParser/experimental/mate-tools/classes/is2/parser/package.html index a4f40a2..a4f40a2 100755 --- a/dependencyParser/mate-tools/src/is2/parser/package.html +++ b/dependencyParser/experimental/mate-tools/classes/is2/parser/package.html diff --git a/dependencyParser/mate-tools/classes/is2/parserR2/Decoder.class b/dependencyParser/experimental/mate-tools/classes/is2/parserR2/Decoder.class index fd23b35..fd23b35 100644 --- a/dependencyParser/mate-tools/classes/is2/parserR2/Decoder.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parserR2/Decoder.class diff --git a/dependencyParser/mate-tools/classes/is2/parserR2/Options.class b/dependencyParser/experimental/mate-tools/classes/is2/parserR2/Options.class index c36af6b..c36af6b 100644 --- a/dependencyParser/mate-tools/classes/is2/parserR2/Options.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parserR2/Options.class diff --git a/dependencyParser/mate-tools/classes/is2/parserR2/Parameters.class b/dependencyParser/experimental/mate-tools/classes/is2/parserR2/Parameters.class index 3ff4ebe..3ff4ebe 100644 --- 
a/dependencyParser/mate-tools/classes/is2/parserR2/Parameters.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parserR2/Parameters.class diff --git a/dependencyParser/mate-tools/classes/is2/parserR2/ParametersFloat.class b/dependencyParser/experimental/mate-tools/classes/is2/parserR2/ParametersFloat.class index 3837dc0..3837dc0 100644 --- a/dependencyParser/mate-tools/classes/is2/parserR2/ParametersFloat.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parserR2/ParametersFloat.class diff --git a/dependencyParser/mate-tools/classes/is2/parserR2/Parser.class b/dependencyParser/experimental/mate-tools/classes/is2/parserR2/Parser.class index 2b1b742..2b1b742 100644 --- a/dependencyParser/mate-tools/classes/is2/parserR2/Parser.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parserR2/Parser.class diff --git a/dependencyParser/mate-tools/classes/is2/parserR2/Pipe.class b/dependencyParser/experimental/mate-tools/classes/is2/parserR2/Pipe.class index 7067c95..7067c95 100644 --- a/dependencyParser/mate-tools/classes/is2/parserR2/Pipe.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parserR2/Pipe.class diff --git a/dependencyParser/mate-tools/classes/is2/parserR2/PipeReranker.class b/dependencyParser/experimental/mate-tools/classes/is2/parserR2/PipeReranker.class index fc6cb91..fc6cb91 100644 --- a/dependencyParser/mate-tools/classes/is2/parserR2/PipeReranker.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parserR2/PipeReranker.class diff --git a/dependencyParser/mate-tools/classes/is2/parserR2/Reranker.class b/dependencyParser/experimental/mate-tools/classes/is2/parserR2/Reranker.class index 5177b92..5177b92 100644 --- a/dependencyParser/mate-tools/classes/is2/parserR2/Reranker.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/parserR2/Reranker.class diff --git a/dependencyParser/mate-tools/src/is2/parserR2/package.html 
b/dependencyParser/experimental/mate-tools/classes/is2/parserR2/package.html index 6b06482..6b06482 100755 --- a/dependencyParser/mate-tools/src/is2/parserR2/package.html +++ b/dependencyParser/experimental/mate-tools/classes/is2/parserR2/package.html diff --git a/dependencyParser/mate-tools/classes/is2/tag/ExtractorT2.class b/dependencyParser/experimental/mate-tools/classes/is2/tag/ExtractorT2.class index 44dba9e..44dba9e 100644 --- a/dependencyParser/mate-tools/classes/is2/tag/ExtractorT2.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/tag/ExtractorT2.class diff --git a/dependencyParser/mate-tools/classes/is2/tag/Lexicon.class b/dependencyParser/experimental/mate-tools/classes/is2/tag/Lexicon.class index 1ea1e85..1ea1e85 100644 --- a/dependencyParser/mate-tools/classes/is2/tag/Lexicon.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/tag/Lexicon.class diff --git a/dependencyParser/mate-tools/classes/is2/tag/MFO$Data4.class b/dependencyParser/experimental/mate-tools/classes/is2/tag/MFO$Data4.class index 3627f10..3627f10 100644 --- a/dependencyParser/mate-tools/classes/is2/tag/MFO$Data4.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/tag/MFO$Data4.class diff --git a/dependencyParser/mate-tools/classes/is2/tag/MFO.class b/dependencyParser/experimental/mate-tools/classes/is2/tag/MFO.class index 10ea428..10ea428 100644 --- a/dependencyParser/mate-tools/classes/is2/tag/MFO.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/tag/MFO.class diff --git a/dependencyParser/mate-tools/classes/is2/tag/Options.class b/dependencyParser/experimental/mate-tools/classes/is2/tag/Options.class index e6ec2c0..e6ec2c0 100644 --- a/dependencyParser/mate-tools/classes/is2/tag/Options.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/tag/Options.class diff --git a/dependencyParser/mate-tools/classes/is2/tag/POS.class b/dependencyParser/experimental/mate-tools/classes/is2/tag/POS.class index 8ea2052..8ea2052 
100644 --- a/dependencyParser/mate-tools/classes/is2/tag/POS.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/tag/POS.class diff --git a/dependencyParser/mate-tools/classes/is2/tag/Tagger.class b/dependencyParser/experimental/mate-tools/classes/is2/tag/Tagger.class index 11ac1aa..11ac1aa 100644 --- a/dependencyParser/mate-tools/classes/is2/tag/Tagger.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/tag/Tagger.class diff --git a/dependencyParser/experimental/mate-tools/classes/is2/tag/package.html b/dependencyParser/experimental/mate-tools/classes/is2/tag/package.html new file mode 100644 index 0000000..469fdf6 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/classes/is2/tag/package.html @@ -0,0 +1,4 @@ +Package info +<br><br> +This parser includes a tagger into the dependency parser +<br> \ No newline at end of file diff --git a/dependencyParser/mate-tools/classes/is2/tools/IPipe.class b/dependencyParser/experimental/mate-tools/classes/is2/tools/IPipe.class index 37a51d1..37a51d1 100644 --- a/dependencyParser/mate-tools/classes/is2/tools/IPipe.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/tools/IPipe.class diff --git a/dependencyParser/mate-tools/classes/is2/tools/Retrainable.class b/dependencyParser/experimental/mate-tools/classes/is2/tools/Retrainable.class index 3675e0f..3675e0f 100644 --- a/dependencyParser/mate-tools/classes/is2/tools/Retrainable.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/tools/Retrainable.class diff --git a/dependencyParser/mate-tools/classes/is2/tools/Tool.class b/dependencyParser/experimental/mate-tools/classes/is2/tools/Tool.class index 0ba217e..0ba217e 100644 --- a/dependencyParser/mate-tools/classes/is2/tools/Tool.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/tools/Tool.class diff --git a/dependencyParser/mate-tools/classes/is2/tools/ToolIO.class b/dependencyParser/experimental/mate-tools/classes/is2/tools/ToolIO.class index 
a6f87ab..a6f87ab 100644 --- a/dependencyParser/mate-tools/classes/is2/tools/ToolIO.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/tools/ToolIO.class diff --git a/dependencyParser/mate-tools/classes/is2/tools/Train.class b/dependencyParser/experimental/mate-tools/classes/is2/tools/Train.class index 0551298..0551298 100644 --- a/dependencyParser/mate-tools/classes/is2/tools/Train.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/tools/Train.class diff --git a/dependencyParser/mate-tools/classes/is2/util/Convert.class b/dependencyParser/experimental/mate-tools/classes/is2/util/Convert.class index cc00ef1..cc00ef1 100644 --- a/dependencyParser/mate-tools/classes/is2/util/Convert.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/Convert.class diff --git a/dependencyParser/mate-tools/classes/is2/util/Convert0409.class b/dependencyParser/experimental/mate-tools/classes/is2/util/Convert0409.class index ded4be5..ded4be5 100644 --- a/dependencyParser/mate-tools/classes/is2/util/Convert0409.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/Convert0409.class diff --git a/dependencyParser/mate-tools/classes/is2/util/ConvertADJ.class b/dependencyParser/experimental/mate-tools/classes/is2/util/ConvertADJ.class index c080750..c080750 100644 --- a/dependencyParser/mate-tools/classes/is2/util/ConvertADJ.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/ConvertADJ.class diff --git a/dependencyParser/mate-tools/classes/is2/util/ConvertLowerCase0909.class b/dependencyParser/experimental/mate-tools/classes/is2/util/ConvertLowerCase0909.class index d2ffbf4..d2ffbf4 100644 --- a/dependencyParser/mate-tools/classes/is2/util/ConvertLowerCase0909.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/ConvertLowerCase0909.class diff --git a/dependencyParser/mate-tools/classes/is2/util/ConvertTiger2CoNLL.class 
b/dependencyParser/experimental/mate-tools/classes/is2/util/ConvertTiger2CoNLL.class index d9c9cdf..d9c9cdf 100644 --- a/dependencyParser/mate-tools/classes/is2/util/ConvertTiger2CoNLL.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/ConvertTiger2CoNLL.class diff --git a/dependencyParser/mate-tools/classes/is2/util/DB.class b/dependencyParser/experimental/mate-tools/classes/is2/util/DB.class index 50c83a6..50c83a6 100644 --- a/dependencyParser/mate-tools/classes/is2/util/DB.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/DB.class diff --git a/dependencyParser/mate-tools/classes/is2/util/Edges$C.class b/dependencyParser/experimental/mate-tools/classes/is2/util/Edges$C.class index e3c539b..e3c539b 100644 --- a/dependencyParser/mate-tools/classes/is2/util/Edges$C.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/Edges$C.class diff --git a/dependencyParser/mate-tools/classes/is2/util/Edges.class b/dependencyParser/experimental/mate-tools/classes/is2/util/Edges.class index 6fd6eeb..6fd6eeb 100644 --- a/dependencyParser/mate-tools/classes/is2/util/Edges.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/Edges.class diff --git a/dependencyParser/mate-tools/classes/is2/util/Evaluator$1.class b/dependencyParser/experimental/mate-tools/classes/is2/util/Evaluator$1.class index 11c15b6..11c15b6 100644 --- a/dependencyParser/mate-tools/classes/is2/util/Evaluator$1.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/Evaluator$1.class diff --git a/dependencyParser/mate-tools/classes/is2/util/Evaluator$Results.class b/dependencyParser/experimental/mate-tools/classes/is2/util/Evaluator$Results.class index c8a450a..c8a450a 100644 --- a/dependencyParser/mate-tools/classes/is2/util/Evaluator$Results.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/Evaluator$Results.class diff --git a/dependencyParser/mate-tools/classes/is2/util/Evaluator.class 
b/dependencyParser/experimental/mate-tools/classes/is2/util/Evaluator.class index c9a50ff..c9a50ff 100644 --- a/dependencyParser/mate-tools/classes/is2/util/Evaluator.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/Evaluator.class diff --git a/dependencyParser/mate-tools/classes/is2/util/EvaluatorTagger$1.class b/dependencyParser/experimental/mate-tools/classes/is2/util/EvaluatorTagger$1.class index 129c2f1..129c2f1 100644 --- a/dependencyParser/mate-tools/classes/is2/util/EvaluatorTagger$1.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/EvaluatorTagger$1.class diff --git a/dependencyParser/mate-tools/classes/is2/util/EvaluatorTagger$2.class b/dependencyParser/experimental/mate-tools/classes/is2/util/EvaluatorTagger$2.class index 7a634d1..7a634d1 100644 --- a/dependencyParser/mate-tools/classes/is2/util/EvaluatorTagger$2.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/EvaluatorTagger$2.class diff --git a/dependencyParser/mate-tools/classes/is2/util/EvaluatorTagger$Results.class b/dependencyParser/experimental/mate-tools/classes/is2/util/EvaluatorTagger$Results.class index fc57ccc..fc57ccc 100644 --- a/dependencyParser/mate-tools/classes/is2/util/EvaluatorTagger$Results.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/EvaluatorTagger$Results.class diff --git a/dependencyParser/mate-tools/classes/is2/util/EvaluatorTagger.class b/dependencyParser/experimental/mate-tools/classes/is2/util/EvaluatorTagger.class index 20d5d22..20d5d22 100644 --- a/dependencyParser/mate-tools/classes/is2/util/EvaluatorTagger.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/EvaluatorTagger.class diff --git a/dependencyParser/mate-tools/classes/is2/util/ExtractParagraphs.class b/dependencyParser/experimental/mate-tools/classes/is2/util/ExtractParagraphs.class index dc00415..dc00415 100644 --- a/dependencyParser/mate-tools/classes/is2/util/ExtractParagraphs.class +++ 
b/dependencyParser/experimental/mate-tools/classes/is2/util/ExtractParagraphs.class diff --git a/dependencyParser/mate-tools/classes/is2/util/IntStack.class b/dependencyParser/experimental/mate-tools/classes/is2/util/IntStack.class index 4af4316..4af4316 100644 --- a/dependencyParser/mate-tools/classes/is2/util/IntStack.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/IntStack.class diff --git a/dependencyParser/mate-tools/classes/is2/util/Long2Int.class b/dependencyParser/experimental/mate-tools/classes/is2/util/Long2Int.class index c816fd5..c816fd5 100644 --- a/dependencyParser/mate-tools/classes/is2/util/Long2Int.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/Long2Int.class diff --git a/dependencyParser/mate-tools/classes/is2/util/Options.class b/dependencyParser/experimental/mate-tools/classes/is2/util/Options.class index 8071ebb..8071ebb 100644 --- a/dependencyParser/mate-tools/classes/is2/util/Options.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/Options.class diff --git a/dependencyParser/mate-tools/classes/is2/util/OptionsSuper.class b/dependencyParser/experimental/mate-tools/classes/is2/util/OptionsSuper.class index 8f8a181..8f8a181 100644 --- a/dependencyParser/mate-tools/classes/is2/util/OptionsSuper.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/OptionsSuper.class diff --git a/dependencyParser/mate-tools/classes/is2/util/ParserEvaluator$Results.class b/dependencyParser/experimental/mate-tools/classes/is2/util/ParserEvaluator$Results.class index 154033d..154033d 100644 --- a/dependencyParser/mate-tools/classes/is2/util/ParserEvaluator$Results.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/ParserEvaluator$Results.class diff --git a/dependencyParser/mate-tools/classes/is2/util/ParserEvaluator.class b/dependencyParser/experimental/mate-tools/classes/is2/util/ParserEvaluator.class index 72c4f35..72c4f35 100644 --- 
a/dependencyParser/mate-tools/classes/is2/util/ParserEvaluator.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/ParserEvaluator.class diff --git a/dependencyParser/mate-tools/classes/is2/util/Split.class b/dependencyParser/experimental/mate-tools/classes/is2/util/Split.class index d665e7e..d665e7e 100644 --- a/dependencyParser/mate-tools/classes/is2/util/Split.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/Split.class diff --git a/dependencyParser/mate-tools/classes/is2/util/Split2.class b/dependencyParser/experimental/mate-tools/classes/is2/util/Split2.class index 3222aee..3222aee 100644 --- a/dependencyParser/mate-tools/classes/is2/util/Split2.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/Split2.class diff --git a/dependencyParser/mate-tools/classes/is2/util/Split3.class b/dependencyParser/experimental/mate-tools/classes/is2/util/Split3.class index a16bbc6..a16bbc6 100644 --- a/dependencyParser/mate-tools/classes/is2/util/Split3.class +++ b/dependencyParser/experimental/mate-tools/classes/is2/util/Split3.class diff --git a/dependencyParser/experimental/mate-tools/lib/commons-math-2.2.jar b/dependencyParser/experimental/mate-tools/lib/commons-math-2.2.jar new file mode 100644 index 0000000..b29a39c --- /dev/null +++ b/dependencyParser/experimental/mate-tools/lib/commons-math-2.2.jar diff --git a/dependencyParser/experimental/mate-tools/lib/trove-2.0.4.jar b/dependencyParser/experimental/mate-tools/lib/trove-2.0.4.jar new file mode 100644 index 0000000..cb1c8f1 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/lib/trove-2.0.4.jar diff --git a/dependencyParser/mate-tools/src/decoder/ParallelDecoder.java b/dependencyParser/experimental/mate-tools/src/decoder/ParallelDecoder.java index cf4cfa9..cf4cfa9 100755 --- a/dependencyParser/mate-tools/src/decoder/ParallelDecoder.java +++ b/dependencyParser/experimental/mate-tools/src/decoder/ParallelDecoder.java diff --git 
a/dependencyParser/mate-tools/src/decoder/ParallelRearrangeNBest.java b/dependencyParser/experimental/mate-tools/src/decoder/ParallelRearrangeNBest.java index 5a16211..5a16211 100755 --- a/dependencyParser/mate-tools/src/decoder/ParallelRearrangeNBest.java +++ b/dependencyParser/experimental/mate-tools/src/decoder/ParallelRearrangeNBest.java diff --git a/dependencyParser/mate-tools/src/decoder/ParallelRearrangeNBest2.java b/dependencyParser/experimental/mate-tools/src/decoder/ParallelRearrangeNBest2.java index 6795b7b..6795b7b 100644 --- a/dependencyParser/mate-tools/src/decoder/ParallelRearrangeNBest2.java +++ b/dependencyParser/experimental/mate-tools/src/decoder/ParallelRearrangeNBest2.java diff --git a/dependencyParser/mate-tools/src/examples/DependencyParser.java b/dependencyParser/experimental/mate-tools/src/examples/DependencyParser.java index 917dc04..917dc04 100644 --- a/dependencyParser/mate-tools/src/examples/DependencyParser.java +++ b/dependencyParser/experimental/mate-tools/src/examples/DependencyParser.java diff --git a/dependencyParser/mate-tools/src/examples/FullPipelineSpanish.java b/dependencyParser/experimental/mate-tools/src/examples/FullPipelineSpanish.java index 9bfff59..9bfff59 100644 --- a/dependencyParser/mate-tools/src/examples/FullPipelineSpanish.java +++ b/dependencyParser/experimental/mate-tools/src/examples/FullPipelineSpanish.java diff --git a/dependencyParser/mate-tools/src/examples/FullPipelineTest.java b/dependencyParser/experimental/mate-tools/src/examples/FullPipelineTest.java index 4aecdc2..4aecdc2 100644 --- a/dependencyParser/mate-tools/src/examples/FullPipelineTest.java +++ b/dependencyParser/experimental/mate-tools/src/examples/FullPipelineTest.java diff --git a/dependencyParser/mate-tools/src/examples/MorphTagger.java b/dependencyParser/experimental/mate-tools/src/examples/MorphTagger.java index 75bfc28..75bfc28 100644 --- a/dependencyParser/mate-tools/src/examples/MorphTagger.java +++ 
b/dependencyParser/experimental/mate-tools/src/examples/MorphTagger.java diff --git a/dependencyParser/mate-tools/src/examples/ParseOnly.java b/dependencyParser/experimental/mate-tools/src/examples/ParseOnly.java index 23eaf59..23eaf59 100755 --- a/dependencyParser/mate-tools/src/examples/ParseOnly.java +++ b/dependencyParser/experimental/mate-tools/src/examples/ParseOnly.java diff --git a/dependencyParser/mate-tools/src/examples/Pipeline.java b/dependencyParser/experimental/mate-tools/src/examples/Pipeline.java index dcb5a24..dcb5a24 100644 --- a/dependencyParser/mate-tools/src/examples/Pipeline.java +++ b/dependencyParser/experimental/mate-tools/src/examples/Pipeline.java diff --git a/dependencyParser/mate-tools/src/extractors/Extractor.java b/dependencyParser/experimental/mate-tools/src/extractors/Extractor.java index 190ccc6..190ccc6 100644 --- a/dependencyParser/mate-tools/src/extractors/Extractor.java +++ b/dependencyParser/experimental/mate-tools/src/extractors/Extractor.java diff --git a/dependencyParser/mate-tools/src/extractors/ExtractorClusterStacked.java b/dependencyParser/experimental/mate-tools/src/extractors/ExtractorClusterStacked.java index 436bd5c..436bd5c 100755 --- a/dependencyParser/mate-tools/src/extractors/ExtractorClusterStacked.java +++ b/dependencyParser/experimental/mate-tools/src/extractors/ExtractorClusterStacked.java diff --git a/dependencyParser/mate-tools/src/extractors/ExtractorClusterStackedR2.java b/dependencyParser/experimental/mate-tools/src/extractors/ExtractorClusterStackedR2.java index d1776b6..d1776b6 100644 --- a/dependencyParser/mate-tools/src/extractors/ExtractorClusterStackedR2.java +++ b/dependencyParser/experimental/mate-tools/src/extractors/ExtractorClusterStackedR2.java diff --git a/dependencyParser/mate-tools/src/extractors/ExtractorFactory.java b/dependencyParser/experimental/mate-tools/src/extractors/ExtractorFactory.java index 20827d4..20827d4 100644 --- 
a/dependencyParser/mate-tools/src/extractors/ExtractorFactory.java +++ b/dependencyParser/experimental/mate-tools/src/extractors/ExtractorFactory.java diff --git a/dependencyParser/mate-tools/src/extractors/ExtractorReranker.java b/dependencyParser/experimental/mate-tools/src/extractors/ExtractorReranker.java index bf068b2..bf068b2 100644 --- a/dependencyParser/mate-tools/src/extractors/ExtractorReranker.java +++ b/dependencyParser/experimental/mate-tools/src/extractors/ExtractorReranker.java diff --git a/dependencyParser/mate-tools/src/extractors/ParallelExtract.java b/dependencyParser/experimental/mate-tools/src/extractors/ParallelExtract.java index 5e0ec08..5e0ec08 100755 --- a/dependencyParser/mate-tools/src/extractors/ParallelExtract.java +++ b/dependencyParser/experimental/mate-tools/src/extractors/ParallelExtract.java diff --git a/dependencyParser/mate-tools/src/is2/data/Closed.java b/dependencyParser/experimental/mate-tools/src/is2/data/Closed.java index 3e938a8..3e938a8 100755 --- a/dependencyParser/mate-tools/src/is2/data/Closed.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/Closed.java diff --git a/dependencyParser/mate-tools/src/is2/data/Cluster.java b/dependencyParser/experimental/mate-tools/src/is2/data/Cluster.java index 5cc8427..5cc8427 100644 --- a/dependencyParser/mate-tools/src/is2/data/Cluster.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/Cluster.java diff --git a/dependencyParser/mate-tools/src/is2/data/D4.java b/dependencyParser/experimental/mate-tools/src/is2/data/D4.java index d607668..d607668 100644 --- a/dependencyParser/mate-tools/src/is2/data/D4.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/D4.java diff --git a/dependencyParser/mate-tools/src/is2/data/D6.java b/dependencyParser/experimental/mate-tools/src/is2/data/D6.java index 0a17d51..0a17d51 100644 --- a/dependencyParser/mate-tools/src/is2/data/D6.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/D6.java diff --git 
a/dependencyParser/mate-tools/src/is2/data/D7.java b/dependencyParser/experimental/mate-tools/src/is2/data/D7.java index 319c54a..319c54a 100644 --- a/dependencyParser/mate-tools/src/is2/data/D7.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/D7.java diff --git a/dependencyParser/mate-tools/src/is2/data/DPSTree.java b/dependencyParser/experimental/mate-tools/src/is2/data/DPSTree.java index 085c0c7..085c0c7 100644 --- a/dependencyParser/mate-tools/src/is2/data/DPSTree.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/DPSTree.java diff --git a/dependencyParser/mate-tools/src/is2/data/DX.java b/dependencyParser/experimental/mate-tools/src/is2/data/DX.java index 8650038..8650038 100644 --- a/dependencyParser/mate-tools/src/is2/data/DX.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/DX.java diff --git a/dependencyParser/mate-tools/src/is2/data/DataF.java b/dependencyParser/experimental/mate-tools/src/is2/data/DataF.java index 0ec145f..0ec145f 100755 --- a/dependencyParser/mate-tools/src/is2/data/DataF.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/DataF.java diff --git a/dependencyParser/mate-tools/src/is2/data/DataFES.java b/dependencyParser/experimental/mate-tools/src/is2/data/DataFES.java index 751ddc3..751ddc3 100644 --- a/dependencyParser/mate-tools/src/is2/data/DataFES.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/DataFES.java diff --git a/dependencyParser/mate-tools/src/is2/data/DataT.java b/dependencyParser/experimental/mate-tools/src/is2/data/DataT.java index 2ae816d..2ae816d 100644 --- a/dependencyParser/mate-tools/src/is2/data/DataT.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/DataT.java diff --git a/dependencyParser/mate-tools/src/is2/data/Edges.java b/dependencyParser/experimental/mate-tools/src/is2/data/Edges.java index a14db91..a14db91 100644 --- a/dependencyParser/mate-tools/src/is2/data/Edges.java +++ 
b/dependencyParser/experimental/mate-tools/src/is2/data/Edges.java diff --git a/dependencyParser/mate-tools/src/is2/data/F2S.java b/dependencyParser/experimental/mate-tools/src/is2/data/F2S.java index 37ee11a..37ee11a 100755 --- a/dependencyParser/mate-tools/src/is2/data/F2S.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/F2S.java diff --git a/dependencyParser/mate-tools/src/is2/data/F2SD.java b/dependencyParser/experimental/mate-tools/src/is2/data/F2SD.java index 2cdd793..2cdd793 100755 --- a/dependencyParser/mate-tools/src/is2/data/F2SD.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/F2SD.java diff --git a/dependencyParser/mate-tools/src/is2/data/F2SF.java b/dependencyParser/experimental/mate-tools/src/is2/data/F2SF.java index 394352f..394352f 100755 --- a/dependencyParser/mate-tools/src/is2/data/F2SF.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/F2SF.java diff --git a/dependencyParser/mate-tools/src/is2/data/F2SP.java b/dependencyParser/experimental/mate-tools/src/is2/data/F2SP.java index 513fa33..513fa33 100644 --- a/dependencyParser/mate-tools/src/is2/data/F2SP.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/F2SP.java diff --git a/dependencyParser/mate-tools/src/is2/data/F2ST.java b/dependencyParser/experimental/mate-tools/src/is2/data/F2ST.java index 4c30144..4c30144 100644 --- a/dependencyParser/mate-tools/src/is2/data/F2ST.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/F2ST.java diff --git a/dependencyParser/mate-tools/src/is2/data/FV.java b/dependencyParser/experimental/mate-tools/src/is2/data/FV.java index 10c0030..10c0030 100755 --- a/dependencyParser/mate-tools/src/is2/data/FV.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/FV.java diff --git a/dependencyParser/mate-tools/src/is2/data/FVR.java b/dependencyParser/experimental/mate-tools/src/is2/data/FVR.java index c5bc073..c5bc073 100644 --- a/dependencyParser/mate-tools/src/is2/data/FVR.java +++ 
b/dependencyParser/experimental/mate-tools/src/is2/data/FVR.java diff --git a/dependencyParser/mate-tools/src/is2/data/IEncoder.java b/dependencyParser/experimental/mate-tools/src/is2/data/IEncoder.java index 2a784af..2a784af 100755 --- a/dependencyParser/mate-tools/src/is2/data/IEncoder.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/IEncoder.java diff --git a/dependencyParser/mate-tools/src/is2/data/IEncoderPlus.java b/dependencyParser/experimental/mate-tools/src/is2/data/IEncoderPlus.java index b033381..b033381 100644 --- a/dependencyParser/mate-tools/src/is2/data/IEncoderPlus.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/IEncoderPlus.java diff --git a/dependencyParser/mate-tools/src/is2/data/IFV.java b/dependencyParser/experimental/mate-tools/src/is2/data/IFV.java index 010f411..010f411 100755 --- a/dependencyParser/mate-tools/src/is2/data/IFV.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/IFV.java diff --git a/dependencyParser/mate-tools/src/is2/data/Instances.java b/dependencyParser/experimental/mate-tools/src/is2/data/Instances.java index 20493a9..20493a9 100755 --- a/dependencyParser/mate-tools/src/is2/data/Instances.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/Instances.java diff --git a/dependencyParser/mate-tools/src/is2/data/InstancesTagger.java b/dependencyParser/experimental/mate-tools/src/is2/data/InstancesTagger.java index 4cf894a..4cf894a 100644 --- a/dependencyParser/mate-tools/src/is2/data/InstancesTagger.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/InstancesTagger.java diff --git a/dependencyParser/mate-tools/src/is2/data/IntIntHash.java b/dependencyParser/experimental/mate-tools/src/is2/data/IntIntHash.java index 1019507..1019507 100644 --- a/dependencyParser/mate-tools/src/is2/data/IntIntHash.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/IntIntHash.java diff --git a/dependencyParser/mate-tools/src/is2/data/Long2Int.java 
b/dependencyParser/experimental/mate-tools/src/is2/data/Long2Int.java index 1a4a3c4..1a4a3c4 100755 --- a/dependencyParser/mate-tools/src/is2/data/Long2Int.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/Long2Int.java diff --git a/dependencyParser/mate-tools/src/is2/data/Long2IntExact.java b/dependencyParser/experimental/mate-tools/src/is2/data/Long2IntExact.java index debf455..debf455 100644 --- a/dependencyParser/mate-tools/src/is2/data/Long2IntExact.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/Long2IntExact.java diff --git a/dependencyParser/mate-tools/src/is2/data/Long2IntInterface.java b/dependencyParser/experimental/mate-tools/src/is2/data/Long2IntInterface.java index 8401c1f..8401c1f 100755 --- a/dependencyParser/mate-tools/src/is2/data/Long2IntInterface.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/Long2IntInterface.java diff --git a/dependencyParser/mate-tools/src/is2/data/Long2IntQuick.java b/dependencyParser/experimental/mate-tools/src/is2/data/Long2IntQuick.java index 9956173..9956173 100644 --- a/dependencyParser/mate-tools/src/is2/data/Long2IntQuick.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/Long2IntQuick.java diff --git a/dependencyParser/mate-tools/src/is2/data/MFB.java b/dependencyParser/experimental/mate-tools/src/is2/data/MFB.java index 9fa4e3c..9fa4e3c 100755 --- a/dependencyParser/mate-tools/src/is2/data/MFB.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/MFB.java diff --git a/dependencyParser/mate-tools/src/is2/data/MFC.java b/dependencyParser/experimental/mate-tools/src/is2/data/MFC.java index 859a8ce..859a8ce 100644 --- a/dependencyParser/mate-tools/src/is2/data/MFC.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/MFC.java diff --git a/dependencyParser/mate-tools/src/is2/data/MFO.java b/dependencyParser/experimental/mate-tools/src/is2/data/MFO.java index a8bc441..a8bc441 100755 --- a/dependencyParser/mate-tools/src/is2/data/MFO.java 
+++ b/dependencyParser/experimental/mate-tools/src/is2/data/MFO.java diff --git a/dependencyParser/mate-tools/src/is2/data/Open.java b/dependencyParser/experimental/mate-tools/src/is2/data/Open.java index d9bf0e6..d9bf0e6 100755 --- a/dependencyParser/mate-tools/src/is2/data/Open.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/Open.java diff --git a/dependencyParser/mate-tools/src/is2/data/PSTree.java b/dependencyParser/experimental/mate-tools/src/is2/data/PSTree.java index e916548..e916548 100644 --- a/dependencyParser/mate-tools/src/is2/data/PSTree.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/PSTree.java diff --git a/dependencyParser/experimental/mate-tools/src/is2/data/Parameter.java b/dependencyParser/experimental/mate-tools/src/is2/data/Parameter.java new file mode 100644 index 0000000..7b1f870 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/data/Parameter.java @@ -0,0 +1,13 @@ +/** + * + */ +package is2.data; + +/** + * @author Dr. 
Bernd Bohnet, 23.12.2010 + * + * + */ +public class Parameter { + +} diff --git a/dependencyParser/mate-tools/src/is2/data/ParametersFloat.java b/dependencyParser/experimental/mate-tools/src/is2/data/ParametersFloat.java index 17837ef..17837ef 100755 --- a/dependencyParser/mate-tools/src/is2/data/ParametersFloat.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/ParametersFloat.java diff --git a/dependencyParser/mate-tools/src/is2/data/Parse.java b/dependencyParser/experimental/mate-tools/src/is2/data/Parse.java index aa24005..aa24005 100755 --- a/dependencyParser/mate-tools/src/is2/data/Parse.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/Parse.java diff --git a/dependencyParser/experimental/mate-tools/src/is2/data/ParseNBest.java b/dependencyParser/experimental/mate-tools/src/is2/data/ParseNBest.java new file mode 100644 index 0000000..cb02b71 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/data/ParseNBest.java @@ -0,0 +1,103 @@ +package is2.data; + +final public class ParseNBest extends Parse { + + private String signature = null; + + // public float[] scores; + + public ParseNBest() { + } + + public ParseNBest(short[] heads2, short[] types2, float p_new) { + super(heads2, types2, p_new); + } + + public ParseNBest(int i) { + super(i); + } + + /** + * @param sig + * @param readFloat + */ + public ParseNBest(String sig, float score) { + super(sig, score); + } + + /** + * create a total order to provide replicable deterministic results + * + * @param o + * @return + */ + public int compareTo(ParseNBest o) { + if (f1 < o.f1) + return 1; + if (f1 == o.f1) { + if (signature == null) + signature = signature(); + if (o.signature == null) + o.signature = o.signature(); + return o.signature.compareTo(signature); + + } + return -1; + } + + /** + * @return the signature of a parse + */ + @Override + public String signature() { + if (signature != null) + return signature; + signature = super.signature(); + return 
signature; + } + + /** + * @return the signature of a parse + */ + public String signature(short[] heads, short[] labels) { + StringBuilder b = new StringBuilder(heads.length * 2); + for (int k = 0; k < heads.length; k++) { + b.append((char) heads[k]).append((char) labels[k]); + } + signature = b.toString(); + return signature; + } + + /** + * @param heads + * @param types + * @param oldP + * @param ch + * @param s + */ + public String signature(short[] heads, short[] types, short p, short ch, short l) { + StringBuilder b = new StringBuilder(heads.length * 2); + for (int k = 0; k < heads.length; k++) { + + b.append(k == ch ? (char) p : (char) heads[k]).append(k == ch ? (char) l : (char) types[k]); + } + signature = b.toString(); + return signature; + + } + + @Override + public Parse clone() { + ParseNBest p = new ParseNBest(); + p.heads = new short[heads.length]; + p.labels = new short[labels.length]; + + System.arraycopy(heads, 0, p.heads, 0, heads.length); + System.arraycopy(labels, 0, p.labels, 0, labels.length); + + p.f1 = f1; + + return p; + } + +} diff --git a/dependencyParser/experimental/mate-tools/src/is2/data/PipeGen.java b/dependencyParser/experimental/mate-tools/src/is2/data/PipeGen.java new file mode 100755 index 0000000..728666f --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/data/PipeGen.java @@ -0,0 +1,83 @@ +package is2.data; + +public class PipeGen { + + public static final String SENSE = "SENSE", POS = "POS", DIST = "DIST", WORD = "WORD", PRED = "PRED", ARG = "ARG", + FEAT = "F", REL = "REL", TYPE = "TYPE", CHAR = "C", FFEATS = "FF", DIR = "DIR", LA = "LA", RA = "RA"; + + public static final String GPOS = "GPOS", MID = "MID", END = "END", STR = "STR", FM = "FM", NOFEAT = "NOFEAT"; + + public static final String _0 = "0", _4 = "4", _3 = "3", _2 = "2", _1 = "1", _5 = "5", _10 = "10"; + + static public int outValue(int num1, int del) { + String out = "" + num1; + StringBuffer delS = new StringBuffer(); + for (int k = 0; k < del; 
k++) + delS.append('\b'); + del = out.length(); + System.out.print(delS + out); + return del; + } + + static public int outValue(int num1, int del, long last) { + String out = "" + num1 + " (" + (System.currentTimeMillis() - last) / (num1 + 1) + " ms/instance)"; + StringBuffer delS = new StringBuffer(); + for (int k = 0; k < del; k++) + delS.append('\b'); + del = out.length(); + System.out.print(delS + out); + return del; + } + + static public int outValueErr(int num1, float err, float f1, int del, long last) { + + String out = "" + num1 + " (" + (System.currentTimeMillis() - last) / (num1 + 1) + " ms/instance " + + (err / num1) + " err/instance f1=" + f1 + ") "; + StringBuffer delS = new StringBuffer(); + for (int k = 0; k < del; k++) + delS.append('\b'); + del = out.length(); + System.out.print(delS + out); + return del; + } + + static public int outValueErr(int num1, float err, float f1, int del, long last, double upd) { + String out = "" + num1 + " (" + (System.currentTimeMillis() - last) / (num1 + 1) + " ms/instance " + + (err / num1) + " err/instance f1=" + f1 + ") upd " + upd; + StringBuffer delS = new StringBuffer(); + for (int k = 0; k < del; k++) + delS.append('\b'); + del = out.length(); + System.out.print(delS + out); + return del; + } + + static public int outValueErr(int num1, float err, float f1, int del, long last, double upd, String info) { + String out = "" + num1 + " (" + (System.currentTimeMillis() - last) / (num1 + 1) + " ms/instance " + + (err / num1) + " err/instance f1=" + f1 + ") upd " + upd + " " + info; + StringBuffer delS = new StringBuffer(); + for (int k = 0; k < del; k++) + delS.append('\b'); + del = out.length(); + System.out.print(delS + out); + return del; + } + + /** + * @param cnt + * @param l + * @return + */ + public static String getSecondsPerInstnace(int cnt, long l) { + return " " + (l / (cnt * 1000f)) + " seconds/sentnece "; + } + + /** + * @param l + * @return + */ + public static String getUsedTime(long l) { + return 
"Used time " + ((l) / 1000f) + " seconds "; + } + +} diff --git a/dependencyParser/experimental/mate-tools/src/is2/data/PrimeFinder.java b/dependencyParser/experimental/mate-tools/src/is2/data/PrimeFinder.java new file mode 100644 index 0000000..fab0901 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/data/PrimeFinder.java @@ -0,0 +1,51 @@ +/** + * + */ +package is2.data; + +import java.util.Arrays; + +/** + * @author Dr. Bernd Bohnet, 13.05.2010 + * + * + */ +public class PrimeFinder { + + public PrimeFinder() { + } + + public static final int nextPrime(int desiredCapacity) { + int i = Arrays.binarySearch(primeCapacities, desiredCapacity); + if (i < 0) + i = -i - 1; + return primeCapacities[i]; + } + + public static final int largestPrime = 2147483647; + private static final int primeCapacities[] = { 2147483647, 5, 11, 23, 47, 97, 197, 397, 797, 1597, 3203, 6421, + 12853, 25717, 51437, 102877, 205759, 411527, 823117, 1646237, 3292489, 6584983, 13169977, 26339969, + 52679969, 105359939, 210719881, 421439783, 842879579, 1685759167, 433, 877, 1759, 3527, 7057, 14143, 28289, + 56591, 113189, 226379, 452759, 905551, 1811107, 3622219, 7244441, 14488931, 28977863, 57955739, 115911563, + 231823147, 463646329, 927292699, 1854585413, 953, 1907, 3821, 7643, 15287, 30577, 61169, 122347, 244703, + 489407, 978821, 1957651, 3915341, 7830701, 15661423, 31322867, 62645741, 125291483, 250582987, 501165979, + 1002331963, 2004663929, 1039, 2081, 4177, 8363, 16729, 33461, 66923, 133853, 267713, 535481, 1070981, + 2141977, 4283963, 8567929, 17135863, 34271747, 68543509, 137087021, 274174111, 548348231, 1096696463, 31, + 67, 137, 277, 557, 1117, 2237, 4481, 8963, 17929, 35863, 71741, 143483, 286973, 573953, 1147921, 2295859, + 4591721, 9183457, 18366923, 36733847, 73467739, 146935499, 293871013, 587742049, 1175484103, 599, 1201, + 2411, 4831, 9677, 19373, 38747, 77509, 155027, 310081, 620171, 1240361, 2480729, 4961459, 9922933, 19845871, + 39691759, 79383533, 
158767069, 317534141, 635068283, 1270136683, 311, 631, 1277, 2557, 5119, 10243, 20507, + 41017, 82037, 164089, 328213, 656429, 1312867, 2625761, 5251529, 10503061, 21006137, 42012281, 84024581, + 168049163, 336098327, 672196673, 1344393353, 3, 7, 17, 37, 79, 163, 331, 673, 1361, 2729, 5471, 10949, + 21911, 43853, 87719, 175447, 350899, 701819, 1403641, 2807303, 5614657, 11229331, 22458671, 44917381, + 89834777, 179669557, 359339171, 718678369, 1437356741, 43, 89, 179, 359, 719, 1439, 2879, 5779, 11579, + 23159, 46327, 92657, 185323, 370661, 741337, 1482707, 2965421, 5930887, 11861791, 23723597, 47447201, + 94894427, 189788857, 379577741, 759155483, 1518310967, 379, 761, 1523, 3049, 6101, 12203, 24407, 48817, + 97649, 195311, 390647, 781301, 1562611, 3125257, 6250537, 12501169, 25002389, 50004791, 100009607, + 200019221, 400038451, 800076929, 1600153859 }; + + static { + Arrays.sort(primeCapacities); + } + +} diff --git a/dependencyParser/mate-tools/src/is2/data/RandomIndex.java b/dependencyParser/experimental/mate-tools/src/is2/data/RandomIndex.java index 8ab61e9..8ab61e9 100644 --- a/dependencyParser/mate-tools/src/is2/data/RandomIndex.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/RandomIndex.java diff --git a/dependencyParser/mate-tools/src/is2/data/SentenceData09.java b/dependencyParser/experimental/mate-tools/src/is2/data/SentenceData09.java index 386fa39..386fa39 100755 --- a/dependencyParser/mate-tools/src/is2/data/SentenceData09.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/SentenceData09.java diff --git a/dependencyParser/mate-tools/src/is2/data/Thesaurus.java b/dependencyParser/experimental/mate-tools/src/is2/data/Thesaurus.java index bafc3b9..bafc3b9 100644 --- a/dependencyParser/mate-tools/src/is2/data/Thesaurus.java +++ b/dependencyParser/experimental/mate-tools/src/is2/data/Thesaurus.java diff --git a/dependencyParser/mate-tools/src/is2/io/CONLLReader04.java 
b/dependencyParser/experimental/mate-tools/src/is2/io/CONLLReader04.java index 695f10d..695f10d 100644 --- a/dependencyParser/mate-tools/src/is2/io/CONLLReader04.java +++ b/dependencyParser/experimental/mate-tools/src/is2/io/CONLLReader04.java diff --git a/dependencyParser/mate-tools/src/is2/io/CONLLReader06.java b/dependencyParser/experimental/mate-tools/src/is2/io/CONLLReader06.java index 10d1d2d..10d1d2d 100755 --- a/dependencyParser/mate-tools/src/is2/io/CONLLReader06.java +++ b/dependencyParser/experimental/mate-tools/src/is2/io/CONLLReader06.java diff --git a/dependencyParser/mate-tools/src/is2/io/CONLLReader08.java b/dependencyParser/experimental/mate-tools/src/is2/io/CONLLReader08.java index 467e853..467e853 100644 --- a/dependencyParser/mate-tools/src/is2/io/CONLLReader08.java +++ b/dependencyParser/experimental/mate-tools/src/is2/io/CONLLReader08.java diff --git a/dependencyParser/mate-tools/src/is2/io/CONLLReader09.java b/dependencyParser/experimental/mate-tools/src/is2/io/CONLLReader09.java index cee897a..cee897a 100755 --- a/dependencyParser/mate-tools/src/is2/io/CONLLReader09.java +++ b/dependencyParser/experimental/mate-tools/src/is2/io/CONLLReader09.java diff --git a/dependencyParser/mate-tools/src/is2/io/CONLLWriter06.java b/dependencyParser/experimental/mate-tools/src/is2/io/CONLLWriter06.java index 24446b5..24446b5 100755 --- a/dependencyParser/mate-tools/src/is2/io/CONLLWriter06.java +++ b/dependencyParser/experimental/mate-tools/src/is2/io/CONLLWriter06.java diff --git a/dependencyParser/mate-tools/src/is2/io/CONLLWriter09.java b/dependencyParser/experimental/mate-tools/src/is2/io/CONLLWriter09.java index c28ab24..c28ab24 100755 --- a/dependencyParser/mate-tools/src/is2/io/CONLLWriter09.java +++ b/dependencyParser/experimental/mate-tools/src/is2/io/CONLLWriter09.java diff --git a/dependencyParser/mate-tools/src/is2/io/IOGenerals.java b/dependencyParser/experimental/mate-tools/src/is2/io/IOGenerals.java index 030bedd..030bedd 100644 --- 
a/dependencyParser/mate-tools/src/is2/io/IOGenerals.java +++ b/dependencyParser/experimental/mate-tools/src/is2/io/IOGenerals.java diff --git a/dependencyParser/mate-tools/src/is2/io/PSReader.java b/dependencyParser/experimental/mate-tools/src/is2/io/PSReader.java index 5e8b1ad..5e8b1ad 100644 --- a/dependencyParser/mate-tools/src/is2/io/PSReader.java +++ b/dependencyParser/experimental/mate-tools/src/is2/io/PSReader.java diff --git a/dependencyParser/experimental/mate-tools/src/is2/io/TigerReader.java b/dependencyParser/experimental/mate-tools/src/is2/io/TigerReader.java new file mode 100644 index 0000000..10fa0ea --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/io/TigerReader.java @@ -0,0 +1,208 @@ +/** + * + */ +package is2.io; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.StringTokenizer; + +import is2.data.PSTree; + +/** + * @author Dr. Bernd Bohnet, 17.01.2011 + * + * Reads a sentences in Penn Tree Bank bracket style and return + * sentences. + */ +public class TigerReader implements PSReader { + + BufferedReader inputReader; + ArrayList<File> psFiles = new ArrayList<File>(); + ArrayList<PSTree> psCache = new ArrayList<PSTree>(); + + String filter[] = null; + int startFilter = -1; + int endFilter = -1; + + public TigerReader() { + } + + public TigerReader(String file) { + + try { + inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "ISO-8859-1"), 32768); + } catch (Exception e) { + e.printStackTrace(); + } + } + + /** + * @param ps + */ + @Override + public void startReading(String file, String[] filter) { + + try { + this.filter = filter; + startFilter = filter == null ? -1 : 1; + endFilter = filter == null ? 
-1 : 1; + + inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "ISO-8859-1"), 32768); + } catch (Exception e) { + e.printStackTrace(); + } + + } + + public static class Line { + String form; + String lemma; + String morph; + String pos; + int parent; + String edge; + + } + + static int stop = 0; + + /** + * @return + */ + @Override + public PSTree getNext() { + + PSTree ps = null; + String l = null; + ArrayList<Line> lines = new ArrayList<Line>(); + try { + int state = 1, terminals = 0, nonterminals = 0; + while ((l = inputReader.readLine()) != null) { + + if (startFilter == 1 && l.startsWith("#BOS " + filter[0])) { + System.out.println("found start " + l); + startFilter = 2; + } + if (endFilter == 1 && l.startsWith("#EOS " + filter[1])) { + System.out.println("found end " + l); + + endFilter = 2; + } + + if (startFilter == 1 || endFilter == 2) + continue; + + if (l.startsWith("#BOS")) { + + state = 2; + continue; + } + if (l.startsWith("#500")) + state = 3; + if (l.startsWith("#EOS")) + state = 4; + if (state < 2) + continue; + + if (state == 4) { + + ps = new PSTree(); + ps.create(terminals, nonterminals); + // System.out.println("terminals "+terminals); + // build ps tree + + int cnt = 0; + // ps.entries[0] =CONLLReader09.ROOT; + // ps.head[0]=-1; + int root = -1; + for (Line line : lines) { + + /* + * if (cnt==terminals) { // insert root root =cnt; + * cnt++; } + */ + ps.entries[cnt] = line.form; + if (cnt < terminals) + ps.pos[cnt] = line.pos; + else + ps.entries[cnt] = line.pos; + ps.lemmas[cnt] = line.lemma; + ps.head[cnt] = line.parent == 0 ? lines.size() - 1 + : line.parent >= 500 ? 
line.parent - 500 + terminals : line.parent; + // ps.head[cnt] = + // line.parent==0?lines.size()-1:line.parent>=500?line.parent-500+terminals:line.parent; + ps.morph[cnt] = line.morph; + cnt++; + + } + + if (root == -1) + root = terminals; + ps.head[cnt - 1] = 0; // root + ps.terminalCount = terminals; + lines.clear(); + state = 1; + + /* + * for(int k=0;k<ps.head.length;k++) { if + * (ps.head[k]<terminals && k!=root) { ps.head[k]=root; // + * DB.println("error "+k+" "+ps.head[k]); } } + */ + // System.out.println(""+ps.toString()); + // if (stop++ == 4)System.exit(0); + return ps; + } + + StringTokenizer t = new StringTokenizer(l, "\t"); + int tc = 0; + Line line = new Line(); + lines.add(line); + while (t.hasMoreTokens()) { + String token = t.nextToken(); + if (token.equals("\t")) + continue; + if (tc == 0) { + if (token.startsWith("#5") || token.startsWith("#6")) { + nonterminals++; + + } else { + terminals++; + + // change it back to the wrong format since the + // conll stuff was derived from this. 
+ // if (token.equals("durchblicken")) + // token="durchblikken"; + line.form = token; + } + + } else if (tc == 1) { + line.lemma = token; + } else if (tc == 2) { + line.pos = token; + } else if (tc == 3) { + line.morph = token; + } else if (tc == 4) { + line.edge = token; + } else if (tc == 5) { + line.parent = Integer.parseInt(token); + } + + if (token.length() > 0) + tc++; + } + + // read till #EOS + + } + } catch (Exception e) { + e.printStackTrace(); + } + return ps; + + } + +} diff --git a/dependencyParser/experimental/mate-tools/src/is2/lemmatizer/Evaluator.java b/dependencyParser/experimental/mate-tools/src/is2/lemmatizer/Evaluator.java new file mode 100755 index 0000000..cc1b423 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/lemmatizer/Evaluator.java @@ -0,0 +1,108 @@ +package is2.lemmatizer; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.Hashtable; +import java.util.Map.Entry; + +import is2.data.SentenceData09; +import is2.io.CONLLReader09; + +public class Evaluator { + + public static void evaluate(String act_file, String pred_file, String format) throws Exception { + + CONLLReader09 goldReader = new CONLLReader09(act_file, CONLLReader09.NO_NORMALIZE); + CONLLReader09 predictedReader = new CONLLReader09(pred_file, CONLLReader09.NO_NORMALIZE); + // predictedReader.startReading(pred_file); + + Hashtable<String, Integer> errors = new Hashtable<String, Integer>(); + + int total = 0, corrL = 0, corrT = 0; + int numsent = 0; + SentenceData09 goldInstance = goldReader.getNext(); + SentenceData09 predInstance = predictedReader.getNext(); + + while (goldInstance != null) { + + int instanceLength = goldInstance.length(); + + if (instanceLength != predInstance.length()) + System.out.println("Lengths do not match on sentence " + numsent); + + String gold[] = goldInstance.lemmas; + String pred[] = predInstance.plemmas; + + boolean whole = true; + boolean wholeL = true; + + // NOTE: 
the first item is the root info added during + // nextInstance(), so we skip it. + + for (int i = 1; i < instanceLength; i++) { + if (gold[i].toLowerCase().equals(pred[i].toLowerCase())) + corrT++; + + if (gold[i].equals(pred[i])) + corrL++; + else { + + // System.out.println("error gold:"+goldPos[i]+" + // pred:"+predPos[i]+" "+goldInstance.forms[i]+" snt + // "+numsent+" i:"+i); + String key = "gold: '" + gold[i] + "' pred: '" + pred[i] + "'"; + Integer cnt = errors.get(key); + if (cnt == null) { + errors.put(key, 1); + } else { + errors.put(key, cnt + 1); + } + } + + } + total += instanceLength - 1; // Subtract one to not score fake root + // token + + if (whole) { + } + if (wholeL) { + } + numsent++; + + goldInstance = goldReader.getNext(); + predInstance = predictedReader.getNext(); + } + ArrayList<Entry<String, Integer>> opsl = new ArrayList<Entry<String, Integer>>(); + for (Entry<String, Integer> e : errors.entrySet()) { + opsl.add(e); + } + + Collections.sort(opsl, new Comparator<Entry<String, Integer>>() { + + @Override + public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) { + + return o1.getValue() == o2.getValue() ? 0 : o1.getValue() > o2.getValue() ? 
1 : -1; + } + + }); + + /* + * for(Entry<String, Integer> e : opsl) { + * System.out.println(e.getKey()+" "+e.getValue()); } + */ + + System.out.println("Tokens: " + total + " Correct: " + corrT + " " + (float) corrT / total + + " correct uppercase " + (float) corrL / total); + } + + public static void main(String[] args) throws Exception { + String format = "CONLL"; + if (args.length > 2) + format = args[2]; + + evaluate(args[0], args[1], format); + } + +} diff --git a/dependencyParser/mate-tools/src/is2/lemmatizer/Lemmatizer.java b/dependencyParser/experimental/mate-tools/src/is2/lemmatizer/Lemmatizer.java index b15aaa7..b15aaa7 100755 --- a/dependencyParser/mate-tools/src/is2/lemmatizer/Lemmatizer.java +++ b/dependencyParser/experimental/mate-tools/src/is2/lemmatizer/Lemmatizer.java diff --git a/dependencyParser/mate-tools/src/is2/lemmatizer/MFO.java b/dependencyParser/experimental/mate-tools/src/is2/lemmatizer/MFO.java index 305e827..305e827 100755 --- a/dependencyParser/mate-tools/src/is2/lemmatizer/MFO.java +++ b/dependencyParser/experimental/mate-tools/src/is2/lemmatizer/MFO.java diff --git a/dependencyParser/experimental/mate-tools/src/is2/lemmatizer/Options.java b/dependencyParser/experimental/mate-tools/src/is2/lemmatizer/Options.java new file mode 100755 index 0000000..30c2567 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/lemmatizer/Options.java @@ -0,0 +1,72 @@ +package is2.lemmatizer; + +import java.io.File; +import java.io.IOException; + +import is2.util.OptionsSuper; + +public final class Options extends OptionsSuper { + + public Options(String[] args) throws IOException { + + for (int i = 0; i < args.length; i++) { + + if (args[i].equals("--help")) + explain(); + + if (args[i].equals("-normalize")) { + normalize = Boolean.parseBoolean(args[++i]); + } else if (args[i].equals("-features")) { + features = args[i + 1]; + i++; + } else if (args[i].equals("-hsize")) { + hsize = Integer.parseInt(args[i + 1]); + i++; + } else if 
(args[i].equals("-len")) { + maxLen = Integer.parseInt(args[i + 1]); + i++; + } else if (args[i].equals("-tmp")) { + tmp = args[i + 1]; + i++; + } else if (args[i].equals("-uc")) { + upper = true; + System.out.println("set uppercase " + upper); + + } else + super.addOption(args, i); + + } + + if (trainfile != null) { + + if (tmp != null) + trainforest = File.createTempFile("train", ".tmp", new File(tmp)); + else + trainforest = File.createTempFile("train", ".tmp"); // ,new + // File("F:\\") + trainforest.deleteOnExit(); + } + + } + + private void explain() { + System.out.println("Usage: "); + System.out.println("java -class mate.jar is2.lemmatizer.Lemmatizer [Options]"); + System.out.println(); + System.out.println("Options:"); + System.out.println(""); + System.out.println(" -train <file> the corpus a model is trained on; default " + this.trainfile); + System.out.println(" -test <file> the input corpus for testing; default " + this.testfile); + System.out.println(" -out <file> the output corpus (result) of a test run; default " + this.outfile); + System.out.println(" -model <file> the parsing model for traing the model is stored in the files"); + System.out.println( + " and for parsing the model is load from this file; default " + this.modelName); + System.out.println( + " -i <number> the number of training iterations; good numbers are 10 for smaller corpora and 6 for bigger; default " + + this.numIters); + System.out.println(" -count <number> the n first sentences of the corpus are take for the training default " + + this.count); + + System.exit(0); + } +} diff --git a/dependencyParser/mate-tools/src/is2/lemmatizer/Pipe.java b/dependencyParser/experimental/mate-tools/src/is2/lemmatizer/Pipe.java index c8b4bba..c8b4bba 100755 --- a/dependencyParser/mate-tools/src/is2/lemmatizer/Pipe.java +++ b/dependencyParser/experimental/mate-tools/src/is2/lemmatizer/Pipe.java diff --git a/dependencyParser/mate-tools/src/is2/lemmatizer/StringEdit.java 
b/dependencyParser/experimental/mate-tools/src/is2/lemmatizer/StringEdit.java index 69fd872..69fd872 100755 --- a/dependencyParser/mate-tools/src/is2/lemmatizer/StringEdit.java +++ b/dependencyParser/experimental/mate-tools/src/is2/lemmatizer/StringEdit.java diff --git a/dependencyParser/experimental/mate-tools/src/is2/mtag/Convert.java b/dependencyParser/experimental/mate-tools/src/is2/mtag/Convert.java new file mode 100755 index 0000000..05b0741 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/mtag/Convert.java @@ -0,0 +1,99 @@ +/** + * + */ +package is2.mtag; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.util.ArrayList; + +/** + * @author Dr. Bernd Bohnet, 20.01.2010 + * + * + */ +public class Convert { + + public static void main(String[] args) throws IOException { + + Options options = new Options(args); + + split(options.trainfile); + + } + + /** + * @param trainfile + * @throws IOException + */ + private static void split(String trainfile) throws IOException { + + String dir = "split"; + boolean success = (new File("split")).mkdir(); + if (success) + System.out.println("Directory: " + dir + " created"); + + ArrayList<String> corpus = new ArrayList<String>(); + + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(trainfile), "UTF-8"), + 32768); + String l = null; + int sentences = 0; + try { + while ((l = reader.readLine()) != null) { + + corpus.add(l); + if (l.length() < 8) + sentences++; + + } + } catch (IOException e) { + e.printStackTrace(); + } + System.out.println("Corpus has " + sentences + " sentences."); + + int partSize = sentences / 20; + System.out.println("Prepare corpus for cross annotations with 20 parts with part size " + partSize + + " number of lines " + corpus.size()); 
+ + for (int k = 0; k < 20; k++) { + BufferedWriter br = new BufferedWriter( + new OutputStreamWriter(new FileOutputStream("split/p-" + k), "UTF-8")); + BufferedWriter rest = new BufferedWriter( + new OutputStreamWriter(new FileOutputStream("split/r-" + k), "UTF-8")); + int skip = k * partSize; + + int countSentences = 0; + int countSentencesWrote = 0; + System.out.println("skip from " + skip + " to " + (skip + partSize - 1)); + for (String x : corpus) { + if (countSentences >= skip && (countSentences < (skip + partSize) || k == 19)) { + rest.write(x); + rest.newLine(); + if (x.length() < 8) + countSentencesWrote++; + } else { + br.write(x); + br.newLine(); + } + + if (x.length() < 8) + countSentences++; + } + System.out.println("wrote for this part " + countSentencesWrote); + br.flush(); + br.close(); + rest.flush(); + rest.close(); + + } + + } + +} diff --git a/dependencyParser/experimental/mate-tools/src/is2/mtag/Evaluator.java b/dependencyParser/experimental/mate-tools/src/is2/mtag/Evaluator.java new file mode 100755 index 0000000..16c7bba --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/mtag/Evaluator.java @@ -0,0 +1,149 @@ +package is2.mtag; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.Hashtable; +import java.util.Map.Entry; + +import is2.data.SentenceData09; +import is2.io.CONLLReader09; + +public class Evaluator { + + public static void evaluate(String act_file, String pred_file, String format) throws Exception { + + CONLLReader09 goldReader = new CONLLReader09(act_file);// DependencyReader.createDependencyReader(); + // boolean labeled = goldReader.startReading(act_file); + + CONLLReader09 predictedReader = new CONLLReader09(); + predictedReader.startReading(pred_file); + + // if (labeled != predLabeled) + // System.out.println("Gold file and predicted file appear to differ on + // whether or not they are labeled. 
Expect problems!!!"); + + int total = 0, totalP = 0, corrT = 0; + int totalD = 0, corrD = 0, err = 0; + int numsent = 0; + SentenceData09 goldInstance = goldReader.getNext(); + SentenceData09 predInstance = predictedReader.getNext(); + + Hashtable<String, Integer> errors = new Hashtable<String, Integer>(); + Hashtable<String, StringBuffer> words = new Hashtable<String, StringBuffer>(); + + while (goldInstance != null) { + + int instanceLength = goldInstance.length(); + + if (instanceLength != predInstance.length()) + System.out.println("Lengths do not match on sentence " + numsent); + + String gold[] = goldInstance.ofeats; + String pred[] = predInstance.pfeats; + + boolean whole = true; + boolean wholeL = true; + + // NOTE: the first item is the root info added during + // nextInstance(), so we skip it. + + for (int i = 1; i < instanceLength; i++) { + if (gold[i].equals(pred[i]) || (gold[i].equals("_") && pred[i] == null)) + corrT++; + else { + // System.out.println("gold:"+goldFeats[i]+" + // pred:"+predFeats[i]+" "+goldInstance.forms[i]+" snt + // "+numsent+" i:"+i); + // for (int k = 1; k < instanceLength; k++) { + + // System.out.print(goldInstance.forms[k]+":"+goldInstance.gpos[k]); + // if (k==i) System.out.print(":"+predInstance.gpos[k]); + // System.out.print(" "); + + // } + // System.out.println(); + String key = "gold: '" + gold[i] + "' pred: '" + pred[i] + "'"; + Integer cnt = errors.get(key); + StringBuffer errWrd = words.get(key); + if (cnt == null) { + errors.put(key, 1); + words.put(key, new StringBuffer().append(goldInstance.forms[i])); + } else { + errors.put(key, cnt + 1); + errWrd.append(" " + goldInstance.forms[i]); + } + err++; + + } + String[] gf = gold[i].split("|"); + int eq = 0; + + if (pred[i] != null) { + String[] pf = pred[i].split("|"); + totalP += pf.length; + + if (pf.length > gf.length) { + } else { + } + + for (String g : gf) { + for (String p : pf) { + if (g.equals(p)) { + eq++; + break; + } + } + } + } else { + } + totalD += 
gf.length; + corrD += eq; + } + total += instanceLength - 1; // Subtract one to not score fake root + // token + + if (whole) { + } + if (wholeL) { + } + numsent++; + + goldInstance = goldReader.getNext(); + predInstance = predictedReader.getNext(); + } + + ArrayList<Entry<String, Integer>> opsl = new ArrayList<Entry<String, Integer>>(); + for (Entry<String, Integer> e : errors.entrySet()) { + opsl.add(e); + } + + Collections.sort(opsl, new Comparator<Entry<String, Integer>>() { + + @Override + public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) { + + return o1.getValue() == o2.getValue() ? 0 : o1.getValue() > o2.getValue() ? -1 : 1; + } + + }); + + System.out.println("10 top most errors:"); + + System.out.println("Tokens: " + total + " Correct: " + corrT + " " + (float) corrT / total + " R " + + ((float) corrD / totalD) + " tP " + totalP + " tG " + totalD + " P " + (float) corrD / totalP); + System.out.println("err: " + err + " total " + total + " corr " + corrT); + // System.out.println("Unlabeled Complete Correct: " + + // ((double)corrsent/numsent)); + + } + + public static void main(String[] args) throws Exception { + String format = "CONLL"; + if (args.length > 2) + format = args[2]; + + evaluate(args[0], args[1], format); + } + +} diff --git a/dependencyParser/mate-tools/src/is2/mtag/ExtractorM.java b/dependencyParser/experimental/mate-tools/src/is2/mtag/ExtractorM.java index e84f859..e84f859 100644 --- a/dependencyParser/mate-tools/src/is2/mtag/ExtractorM.java +++ b/dependencyParser/experimental/mate-tools/src/is2/mtag/ExtractorM.java diff --git a/dependencyParser/mate-tools/src/is2/mtag/MFO.java b/dependencyParser/experimental/mate-tools/src/is2/mtag/MFO.java index e315ba4..e315ba4 100755 --- a/dependencyParser/mate-tools/src/is2/mtag/MFO.java +++ b/dependencyParser/experimental/mate-tools/src/is2/mtag/MFO.java diff --git a/dependencyParser/experimental/mate-tools/src/is2/mtag/Options.java 
b/dependencyParser/experimental/mate-tools/src/is2/mtag/Options.java new file mode 100755 index 0000000..20969ff --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/mtag/Options.java @@ -0,0 +1,54 @@ +package is2.mtag; + +import is2.util.OptionsSuper; + +public final class Options extends OptionsSuper { + + public Options(String[] args) { + + for (int i = 0; i < args.length; i++) { + + if (args[i].equals("--help")) + explain(); + + if (args[i].equals("-nonormalize")) { + normalize = false; + } else if (args[i].equals("-features")) { + features = args[i + 1]; + i++; + } else if (args[i].equals("-hsize")) { + hsize = Integer.parseInt(args[i + 1]); + i++; + } else if (args[i].equals("-len")) { + maxLen = Integer.parseInt(args[i + 1]); + i++; + } else + super.addOption(args, i); + } + } + + private void explain() { + System.out.println("Usage: "); + System.out.println("java -cp anna.jar is2.mtag.Tagger [Options]"); + System.out.println(); + System.out.println("Example: "); + System.out.println( + " java -cp mate.jar is2.mtag.Tagger -model eps3.model -train corpora/conll08st/train/train.closed -test corpora/conll08st/devel/devel.closed -out b3.test -eval corpora/conll08st/devel/devel.closed -count 2000 -i 6"); + System.out.println(""); + System.out.println("Options:"); + System.out.println(""); + System.out.println(" -train <file> the corpus a model is trained on; default " + this.trainfile); + System.out.println(" -test <file> the input corpus for testing; default " + this.testfile); + System.out.println(" -out <file> the output corpus (result) of a test run; default " + this.outfile); + System.out.println(" -model <file> the parsing model for traing the model is stored in the files"); + System.out.println( + " and for parsing the model is load from this file; default " + this.modelName); + System.out.println( + " -i <number> the number of training iterations; good numbers are 10 for smaller corpora and 6 for bigger; default " + + this.numIters); + 
System.out.println(" -count <number> the n first sentences of the corpus are take for the training default " + + this.count); + + System.exit(0); + } +} diff --git a/dependencyParser/mate-tools/src/is2/mtag/Pipe.java b/dependencyParser/experimental/mate-tools/src/is2/mtag/Pipe.java index 75fb3fe..75fb3fe 100755 --- a/dependencyParser/mate-tools/src/is2/mtag/Pipe.java +++ b/dependencyParser/experimental/mate-tools/src/is2/mtag/Pipe.java diff --git a/dependencyParser/mate-tools/src/is2/mtag/Tagger.java b/dependencyParser/experimental/mate-tools/src/is2/mtag/Tagger.java index 05aa8d7..05aa8d7 100644 --- a/dependencyParser/mate-tools/src/is2/mtag/Tagger.java +++ b/dependencyParser/experimental/mate-tools/src/is2/mtag/Tagger.java diff --git a/dependencyParser/mate-tools/src/is2/parser/Closed.java b/dependencyParser/experimental/mate-tools/src/is2/parser/Closed.java index af491aa..af491aa 100755 --- a/dependencyParser/mate-tools/src/is2/parser/Closed.java +++ b/dependencyParser/experimental/mate-tools/src/is2/parser/Closed.java diff --git a/dependencyParser/experimental/mate-tools/src/is2/parser/D5.java b/dependencyParser/experimental/mate-tools/src/is2/parser/D5.java new file mode 100644 index 0000000..407b4e1 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/parser/D5.java @@ -0,0 +1,293 @@ +/** + * + */ +package is2.parser; + +import is2.data.DX; +import is2.data.IFV; +import is2.data.Long2IntInterface; + +/** + * @author Dr. 
Bernd Bohnet, 30.10.2010 + * + * + */ +final public class D5 extends DX { + + public long shift; + private long h; + + /* + * (non-Javadoc) + * + * @see is2.parser52L.DX#cz2() + */ + final public void cz2() { + + if (v0 < 0 || v1 < 0) { + shift = 0; + h = -1; + return; + } + + h = v0 | v1 << (shift = a0); + shift += a1; + + } + + /* + * (non-Javadoc) + * + * @see is2.parser52L.DX#cz3() + */ + @Override + final public void cz3() { + + if (v0 < 0 || v1 < 0 || v2 < 0) { + shift = 0; + h = -1; + return; + + } + + h = v0 | v1 << (shift = a0) | v2 << (shift += a1); + shift = shift + a2; + + } + + /* + * (non-Javadoc) + * + * @see is2.parser52L.DX#cz4() + */ + @Override + final public void cz4() { + if (v0 < 0 || v1 < 0 || v2 < 0 || v3 < 0) { + shift = 0; + h = -1; + return; + } + + h = v0 | v1 << (shift = a0) | v2 << (shift += a1) | v3 << (shift += a2); + shift = shift + a3; + + } + + /* + * (non-Javadoc) + * + * @see is2.parser52L.DX#cz5() + */ + @Override + final public void cz5() { + + if (v0 < 0 || v1 < 0 || v2 < 0 || v3 < 0 || v4 < 0) { + shift = 0; + h = -1; + return; + } + + h = v0 | v1 << (shift = a0) | v2 << (shift += a1) | v3 << (shift += a2) | v4 << (shift += a3); + shift = shift + a4; + + } + + /* + * (non-Javadoc) + * + * @see is2.parser52L.DX#cz6() + */ + @Override + final public void cz6() { + + if (v0 < 0 || v1 < 0 || v2 < 0 || v3 < 0 || v4 < 0 || v5 < 0) { + shift = 0; + h = -1; + return; + } + + h = v0 | v1 << (shift = a0) | v2 << (shift += a1) | v3 << (shift += a2) | v4 << (shift += a3) + | v5 << (shift += a4); + shift = shift + a5; + + } + + /* + * (non-Javadoc) + * + * @see is2.parser52L.DX#cz7() + */ + @Override + final public void cz7() { + + if (v0 < 0 || v1 < 0 || v2 < 0 || v3 < 0 || v4 < 0 || v5 < 0 || v6 < 0) { + shift = 0; + h = -1; + return; + } + + h = v0 | v1 << (shift = a0) | v2 << (shift += a1) | v3 << (shift += a2) | v4 << (shift += a3) + | v5 << (shift += a4) | v6 << (shift += a5); + shift = shift + a6; + + } + + /* + * (non-Javadoc) + 
* + * @see is2.parser52L.DX#cz8() + */ + @Override + final public void cz8() { + + if (v0 < 0 || v1 < 0 || v2 < 0 || v3 < 0 || v4 < 0 || v5 < 0 || v6 < 0 || v7 < 0) { + h = -1; + shift = 0; + return; + } + + h = v0 | v1 << (shift = a0) | v2 << (shift += a1) | v3 << (shift += a2) | v4 << (shift += a3) + | v5 << (shift += a4) | v6 << (shift += a5) | v7 << (shift += a6); + shift = shift + a7; + + } + + /* + * (non-Javadoc) + * + * @see is2.parser52L.DX#clean() + */ + @Override + final public void clean() { + v0 = 0; + v1 = 0; + v2 = 0; + v3 = 0; + v4 = 0; + v5 = 0; + v6 = 0; + v7 = 0; + v8 = 0; + shift = 0; + h = 0; + } + + public final Long2IntInterface _li; + + public D5(Long2IntInterface li) { + _li = li; + } + + /* + * (non-Javadoc) + * + * @see is2.parser52L.DX#cs(int, int) + */ + @Override + final public long cs(int b, int v) { + if (h < 0) { + h = -1; + shift = 0; + return -1; + } + + h |= (long) v << shift; + shift += b; + if (shift > 64) { + System.out.println("shift too large " + shift); + new Exception().printStackTrace(); + } + + return h; + + } + + /* + * (non-Javadoc) + * + * @see is2.parser52L.DX#csa(int, int) + */ + @Override + final public long csa(int b, int v) { + if (h < 0) { + h = -1; + shift = 0; + return -1; + } + + h |= (long) v << shift; + shift += b; + if (shift > 64) { + System.out.println("shift too large " + shift); + new Exception().printStackTrace(); + } + + return h; + + } + + /* + * (non-Javadoc) + * + * @see is2.parser52L.DX#csa(int, int, is2.data.IFV) + */ + @Override + final public void csa(int b, int v, IFV f) { + if (h < 0) { + h = -1; + shift = 0; + return; + } + + h |= (long) v << shift; + shift += b; + if (shift > 64) { + System.out.println("shift too large " + shift); + new Exception().printStackTrace(); + } + + f.add(_li.l2i(h)); + } + + /* + * (non-Javadoc) + * + * @see is2.parser52L.DX#getVal() + */ + @Override + public long getVal() { + if (h < 0) { + h = -1; + shift = 0; + return h; + } + return h; + } + + /* + * 
(non-Javadoc) + * + * @see is2.parser52L.DX#map(is2.data.IFV, long) + */ + @Override + public void map(IFV f, long l) { + if (l > 0) + f.add(_li.l2i(l)); + } + + /* + * (non-Javadoc) + * + * @see is2.data.DX#computeLabeValue(short, short) + */ + @Override + public int computeLabeValue(int label, int shift) { + return label << shift; + } + + @Override + public void fix() { + + } + +} \ No newline at end of file diff --git a/dependencyParser/experimental/mate-tools/src/is2/parser/Decoder.java b/dependencyParser/experimental/mate-tools/src/is2/parser/Decoder.java new file mode 100755 index 0000000..1fe2340 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/parser/Decoder.java @@ -0,0 +1,243 @@ +package is2.parser; + +import java.util.ArrayList; +import java.util.concurrent.ExecutorService; + +import is2.data.DataFES; +import is2.data.Parse; + +/** + * @author Bernd Bohnet, 01.09.2009 + * + * This methods do the actual work and they build the dependency trees. + */ +final public class Decoder { + + public static final boolean TRAINING = true; + public static long timeDecotder; + public static long timeRearrange; + + /** + * Threshold for rearrange edges non-projective + */ + public static float NON_PROJECTIVITY_THRESHOLD = 0.3F; + + static ExecutorService executerService = java.util.concurrent.Executors.newFixedThreadPool(Parser.THREADS); + + // do not initialize + private Decoder() { + }; + + /** + * Build a dependency tree based on the data + * + * @param pos + * part-of-speech tags + * @param x + * the data + * @param projective + * projective or non-projective + * @param edges + * the edges + * @return a parse tree + * @throws InterruptedException + */ + public static Parse decode(short[] pos, DataFES x, boolean projective, boolean training) + throws InterruptedException { + + long ts = System.nanoTime(); + + if (executerService.isShutdown()) + executerService = java.util.concurrent.Executors.newCachedThreadPool(); + final int n = pos.length; + + 
final Open O[][][][] = new Open[n][n][2][]; + final Closed C[][][][] = new Closed[n][n][2][]; + + ArrayList<ParallelDecoder> pe = new ArrayList<ParallelDecoder>(); + + for (int i = 0; i < Parser.THREADS; i++) + pe.add(new ParallelDecoder(pos, x, O, C, n)); + + for (short k = 1; k < n; k++) { + + // provide the threads the data + for (short s = 0; s < n; s++) { + short t = (short) (s + k); + if (t >= n) + break; + + ParallelDecoder.add(s, t); + } + + executerService.invokeAll(pe); + } + + float bestSpanScore = (-1.0F / 0.0F); + Closed bestSpan = null; + for (int m = 1; m < n; m++) + if (C[0][n - 1][1][m].p > bestSpanScore) { + bestSpanScore = C[0][n - 1][1][m].p; + bestSpan = C[0][n - 1][1][m]; + } + + // build the dependency tree from the chart + Parse out = new Parse(pos.length); + + bestSpan.create(out); + + out.heads[0] = -1; + out.labels[0] = 0; + + timeDecotder += (System.nanoTime() - ts); + + ts = System.nanoTime(); + + if (!projective) + rearrange(pos, out.heads, out.labels, x, training); + + timeRearrange += (System.nanoTime() - ts); + + return out; + } + + public static Parse[] decodeAll(short[] pos, DataFES x, boolean projective, boolean training) + throws InterruptedException { + + long ts = System.nanoTime(); + + if (executerService.isShutdown()) + executerService = java.util.concurrent.Executors.newCachedThreadPool(); + final int n = pos.length; + + final Open O[][][][] = new Open[n][n][2][]; + final Closed C[][][][] = new Closed[n][n][2][]; + + ArrayList<ParallelDecoder> pe = new ArrayList<ParallelDecoder>(); + + for (int i = 0; i < Parser.THREADS; i++) + pe.add(new ParallelDecoder(pos, x, O, C, n)); + + for (short k = 1; k < n; k++) { + + // provide the threads the data + for (short s = 0; s < n; s++) { + short t = (short) (s + k); + if (t >= n) + break; + + ParallelDecoder.add(s, t); + } + + executerService.invokeAll(pe); + } + + Parse[] out = new Parse[n - 1]; + + // float bestSpanScore = (-1.0F / 0.0F); + // Closed bestSpan = null; + for (int m = 
1; m < n; m++) { + // if (C[0][n - 1][1][m].p > bestSpanScore) { + // bestSpanScore = C[0][n - 1][1][m].p; + // bestSpan = C[0][n - 1][1][m]; + // } + out[m - 1] = new Parse(pos.length); + C[0][n - 1][1][m].create(out[m - 1]); + out[m - 1].heads[0] = -1; + out[m - 1].labels[0] = 0; + } + + // build the dependency tree from the chart + // Parse out= new Parse(pos.length); + + // bestSpan.create(out); + + // out.heads[0]=-1; + // out.labels[0]=0; + + timeDecotder += (System.nanoTime() - ts); + + ts = System.nanoTime(); + + if (!projective) + for (Parse p : out) + rearrange(pos, p.heads, p.labels, x, training); + // if (!projective) rearrange(pos, out.heads, out.labels,x,training); + + timeRearrange += (System.nanoTime() - ts); + + return out; + } + + /** + * This is the parallel non-projective edge re-arranger + * + * @param pos + * part-of-speech tags + * @param heads + * parent child relation + * @param labs + * edge labels + * @param x + * the data + * @param edges + * the existing edges defined by part-of-speech tags + * @throws InterruptedException + */ + public static void rearrange(short[] pos, short[] heads, short[] labs, DataFES x, boolean training) + throws InterruptedException { + + int threads = (pos.length > Parser.THREADS) ? 
Parser.THREADS : pos.length; + + // wh what to change, nPar - new parent, nType - new type + short wh = -1, nPar = -1, nType = -1; + ArrayList<ParallelRearrange> pe = new ArrayList<ParallelRearrange>(); + + while (true) { + boolean[][] isChild = new boolean[heads.length][heads.length]; + for (int i = 1, l1 = 1; i < heads.length; i++, l1 = i) + while ((l1 = heads[l1]) != -1) + isChild[l1][i] = true; + + float max = Float.NEGATIVE_INFINITY; + float p = Extractor.encode3(pos, heads, labs, x); + + pe.clear(); + for (int i = 0; i < threads; i++) + pe.add(new ParallelRearrange(isChild, pos, x, heads, labs)); + + for (int ch = 1; ch < heads.length; ch++) { + + for (short pa = 0; pa < heads.length; pa++) { + if (ch == pa || pa == heads[ch] || isChild[ch][pa]) + continue; + + ParallelRearrange.add(p, (short) ch, pa); + } + } + executerService.invokeAll(pe); + + for (ParallelRearrange.PA rp : ParallelRearrange.order) + if (max < rp.max) { + max = rp.max; + wh = rp.wh; + nPar = rp.nPar; + nType = rp.nType; + } + ParallelRearrange.order.clear(); + + if (max <= NON_PROJECTIVITY_THRESHOLD) + break; // bb: changed from 0.0 + + heads[wh] = nPar; + labs[wh] = nType; + + } + } + + public static String getInfo() { + + return "Decoder non-projectivity threshold: " + NON_PROJECTIVITY_THRESHOLD; + } + +} diff --git a/dependencyParser/experimental/mate-tools/src/is2/parser/Edges.java b/dependencyParser/experimental/mate-tools/src/is2/parser/Edges.java new file mode 100644 index 0000000..39a0190 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/parser/Edges.java @@ -0,0 +1,208 @@ +/** + * + */ +package is2.parser; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Map.Entry; + +/** + * @author Dr. 
Bernd Bohnet, 13.05.2009; + * + * + */ +public final class Edges { + + private static short[][][] edges; + private static HashMap<Short, Integer> labelCount = new HashMap<Short, Integer>(); + + private static HashMap<String, Integer> slabelCount = new HashMap<String, Integer>(); + + static short[] def = new short[1]; + + private Edges() { + } + + /** + * @param length + */ + public static void init(int length) { + edges = new short[length][length][]; + } + + public static void findDefault() { + + int best = 0; + + for (Entry<Short, Integer> e : labelCount.entrySet()) { + + if (best < e.getValue()) { + best = e.getValue(); + def[0] = e.getKey(); + } + } + + // labelCount=null; + // String[] types = new String[mf.getFeatureCounter().get(PipeGen.REL)]; + // for (Entry<String, Integer> e : + // MFO.getFeatureSet().get(PipeGen.REL).entrySet()) types[e.getValue()] + // = e.getKey(); + + is2.util.DB.println("set default label to " + def[0] + " "); + + // System.out.println("found default "+def[0]); + + } + + final static public void put(int pos1, int pos2, short label) { + putD(pos1, pos2, label); + // putD(pos2, pos1,!dir, label); + } + + final static public void putD(int pos1, int pos2, short label) { + + Integer lc = labelCount.get(label); + if (lc == null) + labelCount.put(label, 1); + else + labelCount.put(label, lc + 1); + + String key = pos1 + "-" + pos2 + label; + Integer lcs = slabelCount.get(key); + if (lcs == null) + slabelCount.put(key, 1); + else + slabelCount.put(key, lcs + 1); + + if (edges[pos1][pos2] == null) { + edges[pos1][pos2] = new short[1]; + edges[pos1][pos2][0] = label; + + // edgesh[pos1][pos2][dir?0:1] = new TIntHashSet(2); + // edgesh[pos1][pos2][dir?0:1].add(label); + } else { + short labels[] = edges[pos1][pos2]; + for (short l : labels) { + // contains label already? 
+ if (l == label) + return; + } + + short[] nlabels = new short[labels.length + 1]; + System.arraycopy(labels, 0, nlabels, 0, labels.length); + nlabels[labels.length] = label; + edges[pos1][pos2] = nlabels; + + // edgesh[pos1][pos2][dir?0:1].add(label); + } + } + + final static public short[] get(int pos1, int pos2) { + + if (pos1 < 0 || pos2 < 0 || edges[pos1][pos2] == null) + return def; + return edges[pos1][pos2]; + } + + /** + * @param dis + */ + static public void write(DataOutputStream d) throws IOException { + + int len = edges.length; + d.writeShort(len); + + for (int p1 = 0; p1 < len; p1++) { + for (int p2 = 0; p2 < len; p2++) { + if (edges[p1][p2] == null) + d.writeShort(0); + else { + d.writeShort(edges[p1][p2].length); + for (int l = 0; l < edges[p1][p2].length; l++) { + d.writeShort(edges[p1][p2][l]); + } + + } + } + } + + d.writeShort(def[0]); + + } + + /** + * @param dis + */ + public static void read(DataInputStream d) throws IOException { + int len = d.readShort(); + + edges = new short[len][len][]; + for (int p1 = 0; p1 < len; p1++) { + for (int p2 = 0; p2 < len; p2++) { + int ll = d.readShort(); + if (ll == 0) { + edges[p1][p2] = null; + } else { + edges[p1][p2] = new short[ll]; + for (int l = 0; l < ll; l++) { + edges[p1][p2][l] = d.readShort(); + } + } + } + } + + def[0] = d.readShort(); + + } + + public static void print() { + for(int i = 0; i < edges.length; ++i) + for(int j = 0; j < edges[i].length; ++j) + if(edges[i][j] != null) + System.out.println("edges[" + i + "][" + j + "] = " + Arrays.toString(edges[i][j])); + + assert def.length == 0; + System.out.println("def = [" + def[0] + "]"); + } + + public static class C implements Comparator<Short> { + + public C() { + super(); + } + + String _key; + + public C(String key) { + super(); + _key = key; + } + + /* + * (non-Javadoc) + * + * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) + */ + @Override + public int compare(Short l1, Short l2) { + + // int c1 = 
labelCount.get(l1); + // int c2 = labelCount.get(l2); + // if (true) return c1==c2?0:c1>c2?-1:1; + + int x1 = slabelCount.get(_key + l1.shortValue()); + int x2 = slabelCount.get(_key + l2.shortValue()); + // System.out.println(x1+" "+x2); + + return x1 == x2 ? 0 : x1 > x2 ? -1 : 1; + + } + + } + +} diff --git a/dependencyParser/experimental/mate-tools/src/is2/parser/Evaluator.java b/dependencyParser/experimental/mate-tools/src/is2/parser/Evaluator.java new file mode 100755 index 0000000..f0d45ec --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/parser/Evaluator.java @@ -0,0 +1,100 @@ +package is2.parser; + +import is2.data.SentenceData09; +import is2.io.CONLLReader09; + +public class Evaluator { + + public static final String PUNCT = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; + + public static class Results { + + public int total; + public int corr; + public float las; + public float ula; + + } + + public static Results evaluate(String act_file, String pred_file) throws Exception { + + CONLLReader09 goldReader = new CONLLReader09(act_file, -1); + CONLLReader09 predictedReader = new CONLLReader09(pred_file, -1); + + int total = 0, corr = 0, corrL = 0; + int numsent = 0, corrsent = 0, corrsentL = 0; + SentenceData09 goldInstance = goldReader.getNext(); + SentenceData09 predInstance = predictedReader.getNext(); + + while (goldInstance != null) { + + int instanceLength = goldInstance.length(); + + if (instanceLength != predInstance.length()) + System.out.println("Lengths do not match on sentence " + numsent); + + int[] goldHeads = goldInstance.heads; + String[] goldLabels = goldInstance.labels; + int[] predHeads = predInstance.heads; + String[] predLabels = predInstance.labels; + + boolean whole = true; + boolean wholeL = true; + + // NOTE: the first item is the root info added during + // nextInstance(), so we skip it. 
+ + int punc = 0; + for (int i = 1; i < instanceLength; i++) { + if (predHeads[i] == goldHeads[i]) { + corr++; + + if (goldLabels[i].equals(predLabels[i])) + corrL++; + else { + // System.out.println(numsent+" error gold + // "+goldLabels[i]+" "+predLabels[i]+" head + // "+goldHeads[i]+" child "+i); + wholeL = false; + } + } else { + // System.out.println(numsent+"error gold "+goldLabels[i]+" + // "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + whole = false; + wholeL = false; + } + } + total += ((instanceLength - 1) - punc); // Subtract one to not score + // fake root token + + if (whole) + corrsent++; + if (wholeL) + corrsentL++; + numsent++; + + goldInstance = goldReader.getNext(); + predInstance = predictedReader.getNext(); + } + + Results r = new Results(); + + r.total = total; + r.corr = corr; + r.las = (float) Math.round(((double) corrL / total) * 100000) / 1000; + r.ula = (float) Math.round(((double) corr / total) * 100000) / 1000; + System.out.print("Total: " + total + " \tCorrect: " + corr + " "); + System.out.println("LAS: " + (double) Math.round(((double) corrL / total) * 100000) / 1000 + " \tTotal: " + + (double) Math.round(((double) corrsentL / numsent) * 100000) / 1000 + " \tULA: " + + (double) Math.round(((double) corr / total) * 100000) / 1000 + " \tTotal: " + + (double) Math.round(((double) corrsent / numsent) * 100000) / 1000); + + return r; + } + + public static float round(double v) { + + return Math.round(v * 10000F) / 10000F; + } + +} diff --git a/dependencyParser/mate-tools/src/is2/parser/Extractor.java b/dependencyParser/experimental/mate-tools/src/is2/parser/Extractor.java index 3ba9cc9..3ba9cc9 100755 --- a/dependencyParser/mate-tools/src/is2/parser/Extractor.java +++ b/dependencyParser/experimental/mate-tools/src/is2/parser/Extractor.java diff --git a/dependencyParser/experimental/mate-tools/src/is2/parser/MFO.java b/dependencyParser/experimental/mate-tools/src/is2/parser/MFO.java new file mode 100755 index 0000000..5a2de73 --- 
/dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/parser/MFO.java @@ -0,0 +1,267 @@ +package is2.parser; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map.Entry; +import is2.data.IEncoderPlus; +import is2.util.DB; + +/** + * Map Features, do not map long to integer + * + * @author Bernd Bohnet, 20.09.2009 + */ + +final public class MFO implements IEncoderPlus { + + /** The features and its values */ + static private final HashMap<String, HashMap<String, Integer>> m_featureSets = new HashMap<String, HashMap<String, Integer>>(); + + /** The feature class and the number of values */ + static private final HashMap<String, Integer> m_featureCounters = new HashMap<String, Integer>(); + + /** The number of bits needed to encode a feature */ + static final HashMap<String, Integer> m_featureBits = new HashMap<String, Integer>(); + + /** Integer counter for long2int */ + static private int count = 0; + + /** Stop growing */ + public boolean stop = false; + + final public static String NONE = "<None>"; + + public MFO() { + } + + public int size() { + return count; + } + + /** + * Register an attribute class, if it not exists and add a possible value + * + * @param type + * @param type2 + */ + @Override + final public int register(String a, String v) { + + HashMap<String, Integer> fs = getFeatureSet().get(a); + if (fs == null) { + fs = new HashMap<String, Integer>(); + getFeatureSet().put(a, fs); + fs.put(NONE, 0); + getFeatureCounter().put(a, 1); + } + Integer c = getFeatureCounter().get(a); + + Integer i = fs.get(v); + if (i == null) { + fs.put(v, c); + c++; + getFeatureCounter().put(a, c); + return c - 1; + } else + return i; + } + + /** + * Calculates the number of bits needed to encode a feature + */ + public void calculateBits() { + + for (Entry<String, Integer> e : getFeatureCounter().entrySet()) { + int bits = (int) Math.ceil((Math.log(e.getValue() + 1) / 
Math.log(2))); + m_featureBits.put(e.getKey(), bits); + } + + // System.out.println("total number of needed bits "+total); + } + + @Override + public String toString() { + + StringBuffer content = new StringBuffer(); + for (Entry<String, Integer> e : getFeatureCounter().entrySet()) { + content.append(e.getKey() + " " + e.getValue()); + content.append(':'); + // HashMap<String,Integer> vs = getFeatureSet().get(e.getKey()); + content.append(getFeatureBits(e.getKey())); + + /* + * if (vs.size()<120) for(Entry<String,Integer> e2 : vs.entrySet()) + * { content.append(e2.getKey()+" ("+e2.getValue()+") "); } + */ + content.append('\n'); + + } + return content.toString(); + } + + static final public short getFeatureBits(String a) { + if (m_featureBits.get(a) == null) + return 0; + return (short) m_featureBits.get(a).intValue(); + } + + /** + * Get the integer place holder of the string value v of the type a + * + * @param t + * the type + * @param v + * the value + * @return the integer place holder of v + */ + @Override + final public int getValue(String t, String v) { + + if (m_featureSets.get(t) == null) + return -1; + Integer vi = m_featureSets.get(t).get(v); + if (vi == null) + return -1; // stop && + return vi.intValue(); + } + + /** + * Static version of getValue + * + * @see getValue + */ + static final public int getValueS(String a, String v) { + + if (m_featureSets.get(a) == null) + return -1; + Integer vi = m_featureSets.get(a).get(v); + if (vi == null) + return -1; // stop && + return vi.intValue(); + } + + public int hasValue(String a, String v) { + + Integer vi = m_featureSets.get(a).get(v); + if (vi == null) + return -1; + return vi.intValue(); + } + + public static String printBits(int k) { + StringBuffer s = new StringBuffer(); + for (int i = 0; i < 31; i++) { + s.append((k & 0x00000001) == 1 ? '1' : '0'); + k = k >> 1; + + } + s.reverse(); + return s.toString(); + } + + /** + * Maps a long to a integer value. 
This is very useful to save memory for + * sparse data long values + * + * @param l + * @return the integer + */ + static public int misses = 0; + static public int good = 0; + + /** + * Write the data + * + * @param dos + * @throws IOException + */ + static public void writeData(DataOutputStream dos) throws IOException { + dos.writeInt(getFeatureSet().size()); + // DB.println("write"+getFeatureSet().size()); + for (Entry<String, HashMap<String, Integer>> e : getFeatureSet().entrySet()) { + dos.writeUTF(e.getKey()); + dos.writeInt(e.getValue().size()); + + for (Entry<String, Integer> e2 : e.getValue().entrySet()) { + + if (e2.getKey() == null) + DB.println("key " + e2.getKey() + " value " + e2.getValue() + " e -key " + e.getKey()); + dos.writeUTF(e2.getKey()); + dos.writeInt(e2.getValue()); + + } + + } + } + + public void read(DataInputStream din) throws IOException { + + int size = din.readInt(); + for (int i = 0; i < size; i++) { + String k = din.readUTF(); + int size2 = din.readInt(); + + HashMap<String, Integer> h = new HashMap<String, Integer>(); + getFeatureSet().put(k, h); + for (int j = 0; j < size2; j++) { + h.put(din.readUTF(), din.readInt()); + } + getFeatureCounter().put(k, size2); + } + + count = size; + // stop(); + calculateBits(); + } + + /** + * Clear the data + */ + static public void clearData() { + getFeatureSet().clear(); + m_featureBits.clear(); + getFeatureSet().clear(); + } + + @Override + public HashMap<String, Integer> getFeatureCounter() { + return m_featureCounters; + } + + static public HashMap<String, HashMap<String, Integer>> getFeatureSet() { + return m_featureSets; + } + + static public String[] reverse(HashMap<String, Integer> v) { + String[] set = new String[v.size()]; + for (Entry<String, Integer> e : v.entrySet()) { + set[e.getValue()] = e.getKey(); + } + return set; + } + + private static <K, V> String mapToString(HashMap<K, V> m) { + int counter = 0; + StringBuilder s = new StringBuilder(); + for(K k: m.keySet()) { + 
s.append(", " + k + ": " + m.get(k)); + ++counter; + if(counter == Parser.maxPrint) break; + } + if(s.length() < 3) return "{}"; + else if(counter == Parser.maxPrint) return "{" + s.substring(2) + ",...} (exceeds maximum print length)"; + else return "{" + s.substring(2) + "}"; + } + + @Override + public void print() { + for(String s: m_featureSets.keySet()) + System.out.println("m_featureSets[" + s + "] = " + mapToString(m_featureSets.get(s))); + System.out.println("m_featureCounters = " + mapToString(m_featureCounters)); + System.out.println("m_featureBits = " + mapToString(m_featureBits)); + System.out.println("count = " + count); + System.out.println("stop = " + stop); + } +} diff --git a/dependencyParser/mate-tools/src/is2/parser/Open.java b/dependencyParser/experimental/mate-tools/src/is2/parser/Open.java index 2f68e07..2f68e07 100755 --- a/dependencyParser/mate-tools/src/is2/parser/Open.java +++ b/dependencyParser/experimental/mate-tools/src/is2/parser/Open.java diff --git a/dependencyParser/experimental/mate-tools/src/is2/parser/Options.java b/dependencyParser/experimental/mate-tools/src/is2/parser/Options.java new file mode 100755 index 0000000..bd550ec --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/parser/Options.java @@ -0,0 +1,70 @@ +package is2.parser; + +import is2.util.OptionsSuper; + +public final class Options extends OptionsSuper { + + public Options(String[] args) { + + for (int i = 0; i < args.length; i++) { + + if (args[i].equals("--help")) + explain(); + + if (args[i].equals("-decode")) { + decodeProjective = args[i + 1].equals("proj"); + i++; + } else if (args[i].equals("-decodeTH")) { + decodeTH = Double.parseDouble(args[i + 1]); + i++; + } else if (args[i].equals("-nonormalize")) { + normalize = false; + } else if (args[i].equals("-features")) { + features = args[i + 1]; + i++; + } else if (args[i].equals("-hsize")) { + hsize = Integer.parseInt(args[i + 1]); + i++; + } else if (args[i].equals("-len")) { + maxLen = 
Integer.parseInt(args[i + 1]); + i++; + } else if (args[i].equals("-cores")) { + cores = Integer.parseInt(args[i + 1]); + i++; + } else if (args[i].equals("-no2nd")) { + no2nd = true; + } else if (args[i].equals("-few2nd")) { + few2nd = true; + } else + super.addOption(args, i); + + } + + } + + private void explain() { + System.out.println("Usage: "); + System.out.println("java -class mate.jar is2.parser.Parser [Options]"); + System.out.println(); + System.out.println("Example: "); + System.out.println( + " java -class mate.jar is2.parser.Parser -model eps3.model -train corpora/conll08st/train/train.closed -test corpora/conll08st/devel/devel.closed -out b3.test -eval corpora/conll08st/devel/devel.closed -count 2000 -i 6"); + System.out.println(""); + System.out.println("Options:"); + System.out.println(""); + System.out.println(" -train <file> the corpus a model is trained on; default " + this.trainfile); + System.out.println(" -test <file> the input corpus for testing; default " + this.testfile); + System.out.println(" -out <file> the output corpus (result) of a test run; default " + this.outfile); + System.out.println(" -model <file> the parsing model for traing the model is stored in the files"); + System.out.println( + " and for parsing the model is load from this file; default " + this.modelName); + System.out.println( + " -i <number> the number of training iterations; good numbers are 10 for smaller corpora and 6 for bigger; default " + + this.numIters); + System.out.println(" -count <number> the n first sentences of the corpus are take for the training default " + + this.count); + System.out.println(" -format <number> conll format of the year 8 or 9; default " + this.formatTask); + + System.exit(0); + } +} diff --git a/dependencyParser/experimental/mate-tools/src/is2/parser/ParallelDecoder.java b/dependencyParser/experimental/mate-tools/src/is2/parser/ParallelDecoder.java new file mode 100755 index 0000000..ca508fd --- /dev/null +++ 
b/dependencyParser/experimental/mate-tools/src/is2/parser/ParallelDecoder.java @@ -0,0 +1,194 @@ +package is2.parser; + +import java.util.ArrayList; +import java.util.concurrent.Callable; + +import is2.data.DataFES; + +/** + * @author Bernd Bohnet, 30.08.2009 + * + * This class implements a parallel feature extractor. + */ +final public class ParallelDecoder implements Callable<Object> { + // some constants + private static final float INIT_BEST = (-1.0F / 0.0F); + private static final boolean[] DIR = { false, true }; + + // the data space of the weights for a dependency tree + final private DataFES x; + + private short[] pos; + + private Open O[][][][]; + private Closed C[][][][]; + + private int length; + + boolean done = false; + public boolean waiting = false; + + /** + * Initialize the parallel decoder. + * + * @param pos + * part-of-speech + * @param d + * data + * @param edges + * part-of-speech edge mapping + * @param o + * open spans + * @param c + * closed spans + * @param length + * number of words + */ + public ParallelDecoder(short[] pos, DataFES d, Open o[][][][], Closed c[][][][], int length) { + + this.pos = pos; + this.x = d; + + this.O = o; + this.C = c; + this.length = length; + } + + private static class DSet { + short w1, w2; + } + + @Override + public Object call() { + + try { + + while (true) { + + DSet set = get(); + // if (done && set==null) break; + + if (set == null) + return null; + + short s = set.w1, t = set.w2; + + for (short dir = 0; dir < 2; dir++) { + + short[] labs = (dir == 1) ? Edges.get(pos[s], pos[t]) : Edges.get(pos[t], pos[s]); + + O[s][t][dir] = new Open[labs.length]; + + for (int l = 0; l < labs.length; l++) { + + double tRP = INIT_BEST; + + Closed tL = null, tR = null; + + for (int r = s; r < t; r++) { + + if (s == 0 && r != 0) + continue; + + double tLPr = INIT_BEST, tRPr = INIT_BEST; + Closed tLCld = null, tRCld = null; + + if (r == s) + tLPr = dir == 1 ? 
x.sib[s][t][s][l] : x.gra[t][s][s][l]; + else + for (int i = s + 1; i <= r; i++) + if (((dir == 1 ? x.sib[s][t][i][l] : x.gra[t][s][i][l]) + C[s][r][1][i].p) > tLPr) { + tLPr = ((dir == 1 ? x.sib[s][t][i][l] : x.gra[t][s][i][l]) + C[s][r][1][i].p); + tLCld = C[s][r][1][i]; + } + + if (r == t - 1) + tRPr = dir == 1 ? x.gra[s][t][s][l] : x.sib[t][s][s][l]; + else + for (int i = r + 1; i < t; i++) + if (((dir == 1 ? x.gra[s][t][i][l] : x.sib[t][s][i][l]) + + C[r + 1][t][0][i].p) > tRPr) { + tRPr = ((dir == 1 ? x.gra[s][t][i][l] : x.sib[t][s][i][l]) + + C[r + 1][t][0][i].p); + tRCld = C[r + 1][t][0][i]; + } + + if (tLPr + tRPr > tRP) { + tRP = tLPr + tRPr; + tL = tLCld; + tR = tRCld; + } + } + O[s][t][dir][l] = new Open(s, t, dir, labs[l], tL, tR, + (float) (tRP + ((dir == 1) ? x.pl[s][t] : x.pl[t][s]) + + ((dir == 1) ? x.lab[s][t][labs[l]] : x.lab[t][s][labs[l]]))); + } + } + C[s][t][1] = new Closed[length]; + C[s][t][0] = new Closed[length]; + + for (int m = s; m <= t; m++) { + for (boolean d : DIR) { + if ((d && m != s) || !d && (m != t && s != 0)) { + + // create closed structure + + double top = INIT_BEST; + + Open tU = null; + Closed tL = null; + int numLabels = O[(d ? s : m)][(d ? m : t)][d ? 1 : 0].length; + + // for (int l = numLabels-1; l >=0; l--) { + for (int l = 0; l < numLabels; l++) { + + Open hi = O[(d ? s : m)][(d ? m : t)][d ? 1 : 0][l]; + for (int amb = m + (d ? 1 : -1); amb != (d ? t : s) + + (d ? 1 : -1); amb += (d ? 1 : -1)) { + + if ((hi.p + C[d ? m : s][d ? t : m][d ? 1 : 0][amb].p + + x.gra[d ? s : t][m][amb][l]) > top) { + top = (hi.p + C[d ? m : s][d ? t : m][d ? 1 : 0][amb].p + + x.gra[d ? s : t][m][amb][l]); + tU = hi; + tL = C[d ? m : s][d ? t : m][d ? 1 : 0][amb]; + } + + } + + if ((m == (d ? t : s)) && (hi.p + x.gra[d ? s : t][d ? t : s][m][l]) > top) { + top = (hi.p + x.gra[d ? s : t][d ? t : s][m][l]); + tU = hi; + tL = null; + } + } + C[s][t][d ? 1 : 0][m] = new Closed(s, t, m, d ? 
1 : 0, tU, tL, (float) top); + + } + } + } + } + } catch (Exception e) { + e.printStackTrace(); + System.exit(0); + } + return null; + } + + public static ArrayList<DSet> sets = new ArrayList<DSet>(); + + static synchronized private DSet get() { + synchronized (sets) { + if (sets.size() == 0) + return null; + return sets.remove(sets.size() - 1); + } + } + + public static void add(short w1, short w2) { + DSet ds = new DSet(); + ds.w1 = w1; + ds.w2 = w2; + sets.add(ds); + } +} diff --git a/dependencyParser/experimental/mate-tools/src/is2/parser/ParallelExtract.java b/dependencyParser/experimental/mate-tools/src/is2/parser/ParallelExtract.java new file mode 100755 index 0000000..ca85711 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/parser/ParallelExtract.java @@ -0,0 +1,248 @@ +package is2.parser; + +import java.util.ArrayList; +import java.util.concurrent.Callable; + +import is2.data.Cluster; +import is2.data.DataFES; +import is2.data.F2SF; +import is2.data.Instances; +import is2.data.Long2IntInterface; + +/** + * @author Bernd Bohnet, 30.08.2009 + * + * This class implements a parallel feature extractor. 
+ */ +final public class ParallelExtract implements Callable<Object> { + // the data space of the weights for a dependency tree + final DataFES d; + + // the data extractor does the actual work + final Extractor extractor; + + private Instances is; + private int i; + + private F2SF para; + + private Cluster cluster; + + public ParallelExtract(Extractor e, Instances is, int i, DataFES d, F2SF para, Cluster cluster) { + + this.is = is; + extractor = e; + this.d = d; + this.i = i; + this.para = para; + this.cluster = cluster; + } + + public static class DSet { + int w1, w2; + } + + @Override + public Object call() { + + try { + + F2SF f = para; + + short[] pos = is.pposs[i]; + int length = pos.length; + + long[] gvs = new long[50]; + long[] svs = new long[220]; + + while (true) { + + DSet set = get(); + if (set == null) + break; + + int w1 = set.w1; + int w2 = set.w2; + + f.clear(); + extractor.basic(pos, w1, w2, f); + d.pl[w1][w2] = f.getScoreF(); + + f.clear(); + + extractor.basic(pos, w2, w1, f); + d.pl[w2][w1] = f.getScoreF(); + + short[] labels = Edges.get(pos[w1], pos[w2]); + float[] lab = d.lab[w1][w2]; + + final Long2IntInterface li = extractor.li; + + int c = extractor.firstm(is, i, w1, w2, 0, cluster, svs); + + for (int l = 0; l < lab.length; l++) + lab[l] = -100; + + for (short label2 : labels) { + short label = label2; + + f.clear(); + int lv = extractor.d0.computeLabeValue(label, Extractor.s_type); + for (int k = 0; k < c; k++) + if (svs[k] > 0) + f.add(li.l2i(svs[k] + lv)); + + lab[label] = f.getScoreF(); + } + + labels = Edges.get(pos[w2], pos[w1]); + lab = d.lab[w2][w1]; + + for (int l = 0; l < lab.length; l++) + lab[l] = -100; + + for (short label2 : labels) { + int label = label2; + + f.clear(); + int lv = extractor.d0.computeLabeValue(label + Extractor.s_rel1, Extractor.s_type); + for (int k = 0; k < c; k++) + if (svs[k] > 0) + f.add(li.l2i(svs[k] + lv)); + + lab[label] = f.getScoreF(); + } + + int s = w1 < w2 ? w1 : w2; + int e = w1 < w2 ? 
w2 : w1; + + for (int m = 0; m < length; m++) { + + int g = (m == s || e == m) ? -1 : m; + + int cn = extractor.second(is, i, w1, w2, g, 0, cluster, svs); + int cc = extractor.addClusterFeatures(is, i, w1, w2, g, cluster, 0, gvs, 0); + // for(int k=0;k<c;k++) dl1.map(f,svs[k]); + + if (m >= w1) { + labels = Edges.get(pos[w1], pos[w2]); + float[] lab2 = new float[labels.length]; + for (int l = 0; l < labels.length; l++) { + + short label = labels[l]; + + int lx = label + Extractor.s_rel1 * (g < w2 ? 0 : 2); + + f.clear(); + int lv = extractor.d0.computeLabeValue(lx, Extractor.s_type); + for (int k = 0; k < cn; k++) + if (svs[k] > 0) + f.add(li.l2i(svs[k] + lv)); + for (int k = 0; k < cc; k++) + if (gvs[k] > 0) + f.add(li.l2i(gvs[k] + lv)); + + lab2[l] = f.getScoreF(); + } + d.gra[w1][w2][m] = lab2; + } + + if (m <= w2) { + labels = Edges.get(pos[w2], pos[w1]); + float lab2[]; + d.gra[w2][w1][m] = lab2 = new float[labels.length]; + for (int l = 0; l < labels.length; l++) { + + int label = labels[l]; + int lx = label + Extractor.s_rel1 * (1 + (g < w1 ? 0 : 2)); + + f.clear(); + int lv = extractor.d0.computeLabeValue(lx, Extractor.s_type); + for (int k = 0; k < cn; k++) + if (svs[k] > 0) + f.add(li.l2i(svs[k] + lv)); + for (int k = 0; k < cc; k++) + if (gvs[k] > 0) + f.add(li.l2i(gvs[k] + lv)); + + lab2[l] = f.getScoreF(); + + } + } + + g = (m == s || e == m) ? 
-1 : m; + + // int cn = extractor.second(is,i,w1,w2,g,0, cluster, + // svs,Extractor._SIB); + if (m >= w1 && m <= w2) { + labels = Edges.get(pos[w1], pos[w2]); + float lab2[] = new float[labels.length]; + d.sib[w1][w2][m] = lab2; + + for (int l = 0; l < labels.length; l++) { + + short label = labels[l]; + + int lx = label + Extractor.s_rel1 * (8); + f.clear(); + int lv = extractor.d0.computeLabeValue(lx, Extractor.s_type); + for (int k = 0; k < cn; k++) + if (svs[k] > 0) + f.add(li.l2i(svs[k] + lv)); + for (int k = 0; k < cc; k++) + if (gvs[k] > 0) + f.add(li.l2i(gvs[k] + lv)); + + lab2[l] = f.score;// f.getScoreF(); + } + } + if (m >= w1 && m <= w2) { + labels = Edges.get(pos[w2], pos[w1]); + float[] lab2 = new float[labels.length]; + d.sib[w2][w1][m] = lab2; + for (int l = 0; l < labels.length; l++) { + + int label = labels[l]; + + int lx = label + Extractor.s_rel1 * (9); + + f.clear(); + int lv = extractor.d0.computeLabeValue(lx, Extractor.s_type); + for (int k = 0; k < cn; k++) + if (svs[k] > 0) + f.add(li.l2i(svs[k] + lv)); + for (int k = 0; k < cc; k++) + if (gvs[k] > 0) + f.add(li.l2i(gvs[k] + lv)); + + lab2[l] = f.score;// f.getScoreF(); + } + } + } + } + + } catch (Exception e) { + e.printStackTrace(); + } + return null; + } + + static ArrayList<DSet> sets = new ArrayList<DSet>(); + + private DSet get() { + + synchronized (sets) { + if (sets.size() == 0) + return null; + return sets.remove(sets.size() - 1); + } + } + + static public void add(int w1, int w2) { + DSet ds = new DSet(); + ds.w1 = w1; + ds.w2 = w2; + sets.add(ds); + } + +} diff --git a/dependencyParser/experimental/mate-tools/src/is2/parser/ParallelRearrange.java b/dependencyParser/experimental/mate-tools/src/is2/parser/ParallelRearrange.java new file mode 100755 index 0000000..83dcdaa --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/parser/ParallelRearrange.java @@ -0,0 +1,146 @@ +package is2.parser; + +import java.util.ArrayList; +import java.util.concurrent.Callable; + 
+import is2.data.DataFES; + +/** + * @author Dr. Bernd Bohnet, 30.08.2009 + * + * This class implements a parallel edge rearrangement for + * non-projective parsing; The linear method was first suggest by Rayn + * McDonald et. al. 2005. + */ +final public class ParallelRearrange implements Callable<Object> { + + // new parent child combination to explore + final static class PA { + final float p; + final short ch, pa; + public float max; + public short wh; + public short nPar; + public short nType; + + public PA(float p2, short ch2, short pa2) { + p = p2; + ch = ch2; + pa = pa2; + } + } + + // list of parent child combinations + static ArrayList<PA> parents = new ArrayList<PA>(); + static ArrayList<PA> order = new ArrayList<PA>(); + // best new parent child combination, found so far + public float max; + + // some data from the dependency tree + // private EdgesC edges; + private short[] pos; + private DataFES x; + private boolean[][] isChild; + public short[] heads, types; + + // child, new parent, new label + public short wh, nPar, nType; + + /** + * Initialize the parallel rearrange thread + * + * @param isChild2 + * is a child + * @param edgesC + * the part-of-speech edge mapping + * @param pos + * the part-of-speech + * @param x + * the data + * @param s + * the heads + * @param ts + * the types + */ + public ParallelRearrange(boolean[][] isChild2, short[] pos, DataFES x, short[] s, short[] ts) { + + heads = new short[s.length]; + System.arraycopy(s, 0, heads, 0, s.length); + + types = new short[ts.length]; + System.arraycopy(ts, 0, types, 0, ts.length); + + isChild = isChild2; + // edges = edgesC; + this.pos = pos; + this.x = x; + } + + @Override + public Object call() { + + // check the list of new possible parents and children for a better + // combination + while (true) { + PA px = getPA(); + if (px == null) + break; + + float max = 0; + short pa = px.pa, ch = px.ch; + + if (ch == pa || pa == heads[ch] || isChild[ch][pa]) + continue; + + short oldP = 
package is2.parser;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import is2.data.F2SF;
import is2.data.FV;
import is2.data.Instances;
import is2.data.Parse;
import is2.util.DB;

/**
 * Float-valued weight vector with MIRA-style (passive-aggressive) updates and
 * support for parameter averaging.
 */
final public class ParametersFloat extends Parameters {

	/** The current weights. */
	public float[] parameters;
	/** Accumulated weights used for averaging; null after average() has run. */
	public float[] total;

	/**
	 * Create a zero-initialized parameter vector.
	 *
	 * @param size the number of features
	 */
	public ParametersFloat(int size) {
		// Java zero-initializes new float arrays (JLS 4.12.5), so the former
		// explicit fill loop was redundant and has been removed.
		parameters = new float[size];
		total = new float[size];
	}

	/**
	 * Wrap an existing weight vector. 'total' stays null, so averaging is not
	 * available on instances created this way.
	 *
	 * @param p the weights to wrap (not copied)
	 */
	public ParametersFloat(float[] p) {
		parameters = p;
	}

	/**
	 * Replace the weights with their average over avVal updates and release
	 * the accumulator.
	 */
	@Override
	public void average(double avVal) {
		for (int j = 0; j < total.length; j++) {
			parameters[j] = total[j] / ((float) avVal);
		}
		total = null;
	}

	/**
	 * Like {@link #average(double)}, but returns the averaged weights as a new
	 * instance and leaves this instance untouched.
	 */
	public ParametersFloat average2(double avVal) {
		float[] px = new float[this.parameters.length];
		for (int j = 0; j < total.length; j++) {
			px[j] = total[j] / ((float) avVal);
		}
		ParametersFloat pf = new ParametersFloat(px);
		return pf;
	}

	/**
	 * Passive-aggressive update towards the gold feature vector.
	 *
	 * @param act gold feature vector
	 * @param pred predicted feature vector
	 * @param isd the instances (unused here, kept for the interface)
	 * @param instc instance index (unused here)
	 * @param d the predicted parse (unused here)
	 * @param upd update weight for the averaging accumulator
	 * @param e the loss of the prediction
	 */
	@Override
	public void update(FV act, FV pred, Instances isd, int instc, Parse d, double upd, double e) {

		e++;

		// margin between gold and predicted score
		float lam_dist = getScore(act) - getScore(pred);

		float b = (float) e - lam_dist;

		FV dist = act.getDistVector(pred);

		dist.update(parameters, total, hildreth(dist, b), upd, false);
	}

	/**
	 * Single-constraint Hildreth step: the optimal step size b / ||a||^2,
	 * guarded against a (near-)zero norm.
	 */
	protected double hildreth(FV a, double b) {

		double A = a.dotProduct(a);
		if (A <= 0.0000000000000000001)
			return 0.0;
		return b / A;
	}

	/** Score a feature vector with the current weights; null scores as 0. */
	public float getScore(FV fv) {
		if (fv == null)
			return 0.0F;
		return fv.getScore(parameters, false);
	}

	/** Write the weights: the length followed by the raw float values. */
	@Override
	final public void write(DataOutputStream dos) throws IOException {

		dos.writeInt(parameters.length);
		for (float d : parameters)
			dos.writeFloat(d);
	}

	/** Read the weights written by write() and report the non-zero count. */
	@Override
	public void read(DataInputStream dis) throws IOException {

		parameters = new float[dis.readInt()];
		int notZero = 0;
		for (int i = 0; i < parameters.length; i++) {
			parameters[i] = dis.readFloat();
			if (parameters[i] != 0.0F)
				notZero++;
		}

		DB.println("read parameters " + parameters.length + " not zero " + notZero);
	}

	/** @return the number of non-zero weights */
	public int countNZ() {

		int notZero = 0;
		for (float parameter : parameters) {
			if (parameter != 0.0F)
				notZero++;
		}
		return notZero;
	}

	/** @return a fast scorer backed by the current weights */
	@Override
	public F2SF getFV() {
		return new F2SF(parameters);
	}

	/** @return the number of features */
	@Override
	public int size() {
		return parameters.length;
	}

}
package is2.parser;

import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.ExecutorService;

import is2.data.Cluster;
import is2.data.DataFES;
import is2.data.F2SF;
import is2.data.Instances;
import is2.data.Parse;
import is2.data.PipeGen;
import is2.data.SentenceData09;
import is2.io.CONLLReader09;
import is2.util.OptionsSuper;

/**
 * Reads the training corpus, registers all feature parts, builds the POS-based
 * edge filters, and at parse time fills the edge score data in parallel.
 */
final public class Pipe extends PipeGen {

	/** One feature extractor per worker thread. */
	public Extractor[] extractor;
	final public MFO mf = new MFO();

	/** Word clusters, when a cluster file is configured. */
	public Cluster cl;

	private OptionsSuper options;
	/** Accumulated nanoseconds spent in fillVector, for diagnostics. */
	public static long timeExtract;

	public Pipe(OptionsSuper o) {
		options = o;
	}

	/**
	 * Two passes over the corpus: first register all feature parts (labels,
	 * forms, lemmas, POS tags, morphologic features), then read the instances
	 * and build the edge filters from the observed head/dependent POS pairs.
	 *
	 * @param file the CoNLL-09 training file
	 * @param is the container that receives the instances
	 */
	public void createInstances(String file, Instances is) throws Exception {

		CONLLReader09 depReader = new CONLLReader09(file);

		mf.register(REL, "<root-type>");

		// register at least one predicate since the parsing data might not
		// contain predicates as in the Japanese corpus but the development
		// sets contain some

		System.out.print("Registering feature parts of sentence: ");
		int ic = 0;
		int del = 0;
		while (true) {
			SentenceData09 instance = depReader.getNext();
			if (instance == null)
				break;
			ic++;

			if (ic % 1000 == 0) {
				del = outValue(ic, del);
			}

			String[] labs1 = instance.labels;
			for (String element : labs1)
				mf.register(REL, element);

			String[] w = instance.forms;
			for (String element : w)
				mf.register(WORD, depReader.normalize(element));

			w = instance.plemmas;
			for (String element : w)
				mf.register(WORD, depReader.normalize(element));

			w = instance.ppos;
			for (String element : w)
				mf.register(POS, element);

			w = instance.gpos;
			for (String element : w)
				mf.register(POS, element);

			if (instance.feats != null) {
				String fs[][] = instance.feats;
				for (String[] element : fs) {
					w = element;
					if (w == null)
						continue;
					for (String element2 : w)
						mf.register(FEAT, element2);
				}
			}

			// stop after the configured number of training sentences
			if ((ic - 1) > options.count)
				break;
		}
		del = outValue(ic, del);

		System.out.println();
		Extractor.initFeatures();

		Extractor.maxForm = mf.getFeatureCounter().get(WORD);

		if (options.clusterFile == null)
			cl = new Cluster();
		else
			cl = new Cluster(options.clusterFile, mf, 6);

		mf.calculateBits();
		Extractor.initStat(options.featureCreation);

		System.out.println("" + mf.toString());

		for (Extractor e : extractor)
			e.init();

		depReader.startReading(file);

		int num1 = 0;

		is.init(ic, new MFO());

		Edges.init(mf.getFeatureCounter().get(POS));

		System.out.print("Creating edge filters and read corpus: ");
		del = 0;

		while (true) {
			if (num1 % 100 == 0)
				del = outValue(num1, del);

			SentenceData09 instance1 = depReader.getNext(is);

			if (instance1 == null)
				break;

			int last = is.size() - 1;
			short[] pos = is.pposs[last];

			// admit only head-POS / dependent-POS / label triples seen in training
			for (int k = 0; k < is.length(last); k++) {
				if (is.heads[last][k] < 0)
					continue;
				Edges.put(pos[is.heads[last][k]], pos[k], is.labels[last][k]);
			}

			if (!options.allFeatures && num1 > options.count)
				break;

			num1++;

		}
		del = outValue(num1, del);
		System.out.println();
		Edges.findDefault();
	}

	/**
	 * Creates an instance for outputParses.
	 *
	 * @param is the instances container
	 * @param depReader the corpus reader
	 * @return the next sentence, or null at end of input
	 * @throws IOException
	 */
	protected final SentenceData09 nextInstance(Instances is, CONLLReader09 depReader) throws Exception {

		SentenceData09 instance = depReader.getNext(is);
		if (instance == null || instance.forms == null)
			return null;

		return instance;
	}

	public static ExecutorService executerService = java.util.concurrent.Executors.newFixedThreadPool(Parser.THREADS);

	/**
	 * Compute the edge scores of one sentence in parallel.
	 *
	 * @param params the weight vector
	 * @param is the instances
	 * @param inst index of the sentence
	 * @param d the score data to (re)fill; reallocated when too small
	 * @param cluster the word clusters
	 * @return the filled score data
	 */
	public DataFES fillVector(F2SF params, Instances is, int inst, DataFES d, Cluster cluster)
			throws InterruptedException {

		long ts = System.nanoTime();

		if (executerService.isShutdown())
			executerService = java.util.concurrent.Executors.newCachedThreadPool();

		final int length = is.length(inst);
		if (d == null || d.len < length)
			d = new DataFES(length, mf.getFeatureCounter().get(PipeGen.REL).shortValue());

		ArrayList<ParallelExtract> pe = new ArrayList<ParallelExtract>();
		for (int i = 0; i < Parser.THREADS; i++)
			pe.add(new ParallelExtract(extractor[i], is, inst, d, (F2SF) params.clone(), cluster));

		// Enqueue all ordered word pairs. Since w2 starts at w1 + 1 the pair
		// is always distinct; the former 'if (w1 == w2) continue;' guard was
		// unreachable and has been removed.
		for (int w1 = 0; w1 < length; w1++) {
			for (int w2 = w1 + 1; w2 < length; w2++) {
				ParallelExtract.add(w1, w2);
			}
		}
		executerService.invokeAll(pe);

		timeExtract += (System.nanoTime() - ts);

		return d;
	}

	/**
	 * The loss function: each token contributes 0.5 for a wrong head and a
	 * further 0.5 for a wrong label; the root token is not counted. Also sets
	 * p.f1 to the per-token accuracy.
	 *
	 * @return the error count of the parse
	 */
	public double errors(Instances is, int ic, Parse p) {
		short[] act = is.heads[ic];
		double correct = 0;

		// do not count root
		for (int i = 1; i < act.length; i++) {

			if (p.heads[i] == act[i]) {
				correct += 0.5;
				if (p.labels[i] == is.labels[ic][i])
					correct += 0.5;
			}
		}

		double x = ((double) act.length - 1 - correct);

		p.f1 = correct / (act.length - 1);

		return x;
	}
}
--git a/dependencyParser/experimental/mate-tools/src/is2/parser/package.html b/dependencyParser/experimental/mate-tools/src/is2/parser/package.html new file mode 100755 index 0000000..a4f40a2 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/parser/package.html @@ -0,0 +1,11 @@ +Package info +<ul> + <li> separate cluster feature to keep speed since two many features in a function reduce speed drastically. </li> + <li> try second order stacking features </li> + <li> parser stacking features </li> + <li> lots of cluster features </li> + <li> Iteration over edges and not extraction of all edges each time </li> + <li> integrated new structurer writer </li> +</ul> + Change in FS, I observed lots of duplicated grand-children features +<br> \ No newline at end of file diff --git a/dependencyParser/mate-tools/src/is2/parserR2/Decoder.java b/dependencyParser/experimental/mate-tools/src/is2/parserR2/Decoder.java index 2ba175f..2ba175f 100755 --- a/dependencyParser/mate-tools/src/is2/parserR2/Decoder.java +++ b/dependencyParser/experimental/mate-tools/src/is2/parserR2/Decoder.java diff --git a/dependencyParser/experimental/mate-tools/src/is2/parserR2/Options.java b/dependencyParser/experimental/mate-tools/src/is2/parserR2/Options.java new file mode 100755 index 0000000..eb396b4 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/parserR2/Options.java @@ -0,0 +1,92 @@ +package is2.parserR2; + +import is2.util.OptionsSuper; + +public final class Options extends OptionsSuper { + + int start = 0, end = 0; + String prefix_model = "m"; + String prefix_test = "t"; + + public Options(String[] args) { + + for (int i = 0; i < args.length; i++) { + + if (args[i].equals("--help")) + explain(); + + if (args[i].equals("-decode")) { + decodeProjective = args[i + 1].equals("proj"); + i++; + } else if (args[i].equals("-decodeTH")) { + decodeTH = Double.parseDouble(args[i + 1]); + i++; + } else if (args[i].equals("-nonormalize")) { + normalize = false; + } 
else if (args[i].equals("-features")) { + features = args[i + 1]; + i++; + } else if (args[i].equals("-hsize")) { + hsize = Integer.parseInt(args[i + 1]); + i++; + } else if (args[i].equals("-len")) { + maxLen = Integer.parseInt(args[i + 1]); + i++; + } else if (args[i].equals("-cores")) { + cores = Integer.parseInt(args[i + 1]); + i++; + } else if (args[i].equals("-best")) { + best = Integer.parseInt(args[i + 1]); + i++; + } else if (args[i].equals("-start")) { + start = Integer.parseInt(args[i + 1]); + i++; + } else if (args[i].equals("-end")) { + end = Integer.parseInt(args[i + 1]); + i++; + } else if (args[i].equals("-prefix-model")) { + prefix_model = args[i + 1]; + i++; + } else if (args[i].equals("-prefix-test")) { + prefix_test = args[i + 1]; + i++; + } else if (args[i].equals("-mapping")) { + this.useMapping = args[i + 1]; + i++; + } else if (args[i].equals("-no2nd")) { + no2nd = true; + } else if (args[i].equals("-few2nd")) { + few2nd = true; + } else + super.addOption(args, i); + + } + + } + + private void explain() { + System.out.println("Usage: "); + System.out.println("java -class mate.jar is2.parser.Parser [Options]"); + System.out.println(); + System.out.println("Example: "); + System.out.println( + " java -class mate.jar is2.parser.Parser -model eps3.model -train corpora/conll08st/train/train.closed -test corpora/conll08st/devel/devel.closed -out b3.test -eval corpora/conll08st/devel/devel.closed -count 2000 -i 6"); + System.out.println(""); + System.out.println("Options:"); + System.out.println(""); + System.out.println(" -train <file> the corpus a model is trained on; default " + this.trainfile); + System.out.println(" -test <file> the input corpus for testing; default " + this.testfile); + System.out.println(" -out <file> the output corpus (result) of a test run; default " + this.outfile); + System.out.println(" -model <file> the parsing model for traing the model is stored in the files"); + System.out.println( + " and for parsing the model is 
package is2.parserR2;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

import is2.data.F2SF;
import is2.data.FV;
import is2.data.FVR;
import is2.data.Instances;
import is2.data.Parse;
import is2.util.DB;

/**
 * Float-valued weight vector with MIRA-style (passive-aggressive) updates,
 * parameter averaging, and overloads for both FV and FVR feature vectors.
 */
final public class ParametersFloat extends Parameters {

	/** The current weights. */
	public float[] parameters;
	/** Accumulated weights used for averaging; null after average() has run. */
	private float[] total;

	/**
	 * Create a zero-initialized parameter vector.
	 *
	 * @param size the number of features
	 */
	public ParametersFloat(int size) {
		// Java zero-initializes new float arrays (JLS 4.12.5), so the former
		// explicit fill loop was redundant and has been removed.
		parameters = new float[size];
		total = new float[size];
	}

	/**
	 * Wrap an existing weight vector. 'total' stays null, so averaging is not
	 * available on instances created this way.
	 *
	 * @param p the weights to wrap (not copied)
	 */
	public ParametersFloat(float[] p) {
		parameters = p;
	}

	/**
	 * Replace the weights with their average over avVal updates and release
	 * the accumulator.
	 */
	@Override
	public void average(double avVal) {
		for (int j = 0; j < total.length; j++) {
			parameters[j] = total[j] / ((float) avVal);
		}
		total = null;
	}

	/**
	 * Like {@link #average(double)}, but returns the averaged weights as a new
	 * instance and leaves this instance untouched.
	 */
	public ParametersFloat average2(double avVal) {
		float[] px = new float[this.parameters.length];
		for (int j = 0; j < total.length; j++) {
			px[j] = total[j] / ((float) avVal);
		}
		ParametersFloat pf = new ParametersFloat(px);
		return pf;
	}

	/**
	 * Passive-aggressive update where the two scores were computed by the
	 * caller and are passed in directly.
	 *
	 * @param d score of the gold feature vector
	 * @param f score of the predicted feature vector
	 */
	public void update(FV act, FV pred, Instances isd, int instc, Parse dx, double upd, double e, float d, float f) {

		e++;

		float lam_dist = d - f;

		float b = (float) e - lam_dist;

		FV dist = act.getDistVector(pred);

		dist.update(parameters, total, hildreth(dist, b), upd, false);
	}

	/**
	 * Passive-aggressive update towards the gold feature vector; the margin is
	 * computed from the current weights.
	 */
	@Override
	public void update(FV act, FV pred, Instances isd, int instc, Parse dx, double upd, double e) {

		e++;

		float lam_dist = getScore(act) - getScore(pred);

		float b = (float) e - lam_dist;

		FV dist = act.getDistVector(pred);

		dist.update(parameters, total, hildreth(dist, b), upd, false);
	}

	/**
	 * Passive-aggressive update for FVR feature vectors with a precomputed
	 * margin.
	 */
	public void update(FVR act, FVR pred, Instances isd, int instc, Parse dx, double upd, double e, float lam_dist) {

		e++;

		float b = (float) e - lam_dist;

		FVR dist = act.getDistVector(pred);

		dist.update(parameters, total, hildreth(dist, b), upd, false);
	}

	/**
	 * Single-constraint Hildreth step: the optimal step size b / ||a||^2,
	 * guarded against a (near-)zero norm.
	 */
	protected double hildreth(FV a, double b) {

		double A = a.dotProduct(a);
		if (A <= 0.0000000000000000001)
			return 0.0;
		return b / A;
	}

	/** FVR overload of {@link #hildreth(FV, double)}. */
	protected double hildreth(FVR a, double b) {

		double A = a.dotProduct(a);
		if (A <= 0.0000000000000000001)
			return 0.0;
		return b / A;
	}

	/** Score a feature vector with the current weights; null scores as 0. */
	public float getScore(FV fv) {
		if (fv == null)
			return 0.0F;
		return fv.getScore(parameters, false);
	}

	/** FVR overload of {@link #getScore(FV)}. */
	public float getScore(FVR fv) {
		if (fv == null)
			return 0.0F;
		return fv.getScore(parameters, false);
	}

	/** Write the weights: the length followed by the raw float values. */
	@Override
	final public void write(DataOutputStream dos) throws IOException {

		dos.writeInt(parameters.length);
		for (float d : parameters)
			dos.writeFloat(d);
	}

	/** Read the weights written by write() and report the non-zero count. */
	@Override
	public void read(DataInputStream dis) throws IOException {

		parameters = new float[dis.readInt()];
		int notZero = 0;
		for (int i = 0; i < parameters.length; i++) {
			parameters[i] = dis.readFloat();
			if (parameters[i] != 0.0F)
				notZero++;
		}

		DB.println("read parameters " + parameters.length + " not zero " + notZero);
	}

	/** @return the number of non-zero weights */
	public int countNZ() {

		int notZero = 0;
		for (float parameter : parameters) {
			if (parameter != 0.0F)
				notZero++;
		}
		return notZero;
	}

	/** @return a fast scorer backed by the current weights */
	@Override
	public F2SF getFV() {
		return new F2SF(parameters);
	}

	/** @return the number of features */
	@Override
	public int size() {
		return parameters.length;
	}

}
package is2.parserR2;

import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.ExecutorService;

import extractors.Extractor;
import extractors.ParallelExtract;
import is2.data.Cluster;
import is2.data.DataF;
import is2.data.Edges;
import is2.data.F2SF;
import is2.data.Instances;
import is2.data.Long2IntInterface;
import is2.data.MFB;
import is2.data.Parse;
import is2.data.PipeGen;
import is2.data.SentenceData09;
import is2.io.CONLLReader09;
import is2.util.OptionsSuper;

/**
 * Pipeline of the n-best parser: registers feature parts, reads the corpus,
 * builds the direction-aware edge filters, and fills the edge score data in
 * parallel at parse time.
 */
final public class Pipe extends PipeGen {

	// one feature extractor per worker thread
	public Extractor[] extractor;
	final public MFB mf = new MFB();

	// word clusters, when a cluster file is configured
	Cluster cl;

	private OptionsSuper options;
	// accumulated nanoseconds spent in fillVector, for diagnostics
	public static long timeExtract;

	public Pipe(OptionsSuper o) {
		options = o;
	}

	/**
	 * Two passes over the corpus: first register all feature parts (labels,
	 * forms, lemmas, POS tags, morphologic features), then read the instances
	 * and build the edge filters from observed head/dependent POS pairs with
	 * attachment direction.
	 *
	 * @param file the CoNLL-09 training file
	 * @param is the container that receives the instances
	 */
	public void createInstances(String file, Instances is)
	// throws Exception

	{

		CONLLReader09 depReader = new CONLLReader09(file);

		mf.register(REL, "<root-type>");

		// register at least one predicate since the parsing data might not
		// contain predicates as in the Japanese corpus but the development
		// sets contain some

		System.out.print("Registering feature parts of sentence: ");
		int ic = 0;
		int del = 0;
		while (true) {
			SentenceData09 instance = depReader.getNext();
			if (instance == null)
				break;
			ic++;

			if (ic % 1000 == 0) {
				del = outValue(ic, del);
			}

			String[] labs1 = instance.labels;
			for (String element : labs1)
				mf.register(REL, element);

			String[] w = instance.forms;
			for (String element : w)
				mf.register(WORD, depReader.normalize(element));

			w = instance.plemmas;
			for (String element : w)
				mf.register(WORD, depReader.normalize(element));

			w = instance.ppos;
			for (String element : w)
				mf.register(POS, element);

			w = instance.gpos;
			for (String element : w)
				mf.register(POS, element);

			if (instance.feats != null) {
				String fs[][] = instance.feats;
				for (String[] element : fs) {
					w = element;
					if (w == null)
						continue;
					for (String element2 : w)
						mf.register(FEAT, element2);
				}
			}

			// stop after the configured number of training sentences
			if ((ic - 1) > options.count)
				break;
		}
		del = outValue(ic, del);

		for (Extractor e : extractor) {
			e.setMaxForm(mf.getFeatureCounter().get(WORD));
		}

		if (options.clusterFile == null)
			cl = new Cluster();
		else
			cl = new Cluster(options.clusterFile, mf, 6);

		mf.calculateBits();

		System.out.println("" + mf.toString());

		for (Extractor e : extractor) {
			e.initStat();
			e.init();
		}

		depReader.startReading(file);

		int num1 = 0;

		Edges.init(mf.getFeatureCounter().get(POS));

		System.out.print("Creating edge filters and read corpus: ");
		del = 0;

		is.init(ic, new MFB());

		while (true) {
			if (num1 % 100 == 0)
				del = outValue(num1, del);

			SentenceData09 instance1 = depReader.getNext(is);

			if (instance1 == null)
				break;

			int last = is.size() - 1;
			short[] pos = is.pposs[last];

			// admit only head-POS / dependent-POS / direction / label
			// combinations seen in training
			for (int k = 0; k < is.length(last); k++) {
				if (is.heads[last][k] < 0)
					continue;
				Edges.put(pos[is.heads[last][k]], pos[k], k < is.heads[last][k], is.labels[last][k]);
			}

			if (!options.allFeatures && num1 > options.count)
				break;

			num1++;

		}
		del = outValue(num1, del);
		System.out.println();
		Edges.findDefault();
	}

	/**
	 * Read instances only (no feature-part registration); used when the model
	 * already exists.
	 *
	 * @param file the CoNLL-09 file
	 * @param is the container that receives the instances
	 */
	public void getInstances(String file, Instances is) {
		CONLLReader09 depReader = new CONLLReader09(file);

		int ic = options.count + 2;

		is.init(ic, new MFB());

		int num1 = 0, del = 0;
		while (true) {
			if (num1 % 100 == 0)
				del = outValue(num1, del);

			SentenceData09 instance1 = depReader.getNext(is);

			if (instance1 == null)
				break;

			if (!options.allFeatures && num1 > options.count)
				break;

			num1++;

		}
		del = outValue(num1, del);
		System.out.println();

	}

	/**
	 * Creates an instance for outputParses
	 *
	 * @param is the instances container
	 * @param depReader the corpus reader
	 * @return the next sentence, or null at end of input
	 * @throws IOException
	 */
	protected final SentenceData09 nextInstance(Instances is, CONLLReader09 depReader) throws Exception {

		SentenceData09 instance = depReader.getNext(is);
		if (instance == null || instance.forms == null)
			return null;

		return instance;
	}

	public static ExecutorService executerService = java.util.concurrent.Executors.newFixedThreadPool(Parser.THREADS);

	/**
	 * Compute the edge scores of one sentence in parallel.
	 *
	 * @param params the weight vector
	 * @param is the instances
	 * @param inst index of the sentence
	 * @param d the score data to (re)fill; reallocated when too small
	 * @param cluster the word clusters
	 * @param threads number of worker threads
	 * @param li feature-to-index mapping
	 * @return the filled score data
	 */
	public DataF fillVector(F2SF params, Instances is, int inst, DataF d, Cluster cluster, int threads,
			Long2IntInterface li) throws InterruptedException {

		long ts = System.nanoTime();

		if (executerService.isShutdown())
			executerService = java.util.concurrent.Executors.newCachedThreadPool();

		final int length = is.length(inst);
		if (d == null || d.len < length)
			d = new DataF(length, mf.getFeatureCounter().get(PipeGen.REL).shortValue());

		ArrayList<ParallelExtract> pe = new ArrayList<ParallelExtract>();

		for (int i = 0; i < threads; i++) {

			// DB.println(""+((ExtractorClusterStackedR2)extractor[i]).s_dist);
			pe.add(new ParallelExtract(extractor[i], is, inst, d, (F2SF) params.clone(), cluster, li));
		}

		// enqueue every ordered, distinct word pair for the workers
		for (int w1 = 0; w1 < length; w1++) {
			for (int w2 = 0; w2 < length; w2++) {
				if (w1 == w2)
					continue;
				ParallelExtract.add(w1, w2);
			}
		}
		executerService.invokeAll(pe);

		timeExtract += (System.nanoTime() - ts);

		return d;
	}

	/**
	 * the loss function: each token contributes 0.5 for a wrong head and a
	 * further 0.5 for a wrong label; the root token is not counted
	 */
	public double errors(Instances is, int ic, Parse p) {

		// lazily rebuild heads/labels from the compact signature if needed
		if (p.heads == null)
			p.signature2parse(p.signature());
		short[] act = is.heads[ic];
		double correct = 0;

		// do not count root
		for (int i = 1; i < act.length; i++) {
			if (p.heads[i] == act[i]) {
				correct += 0.5;
				if (p.labels[i] == is.labels[ic][i])
					correct += 0.5;
			}
		}

		double x = ((double) act.length - 1 - correct);

		// p.f1 = (double)correct / (double)(act.length-1);

		return x;
	}
}
package is2.parserR2;

import java.util.concurrent.ExecutorService;

import extractors.ExtractorReranker;
import is2.data.Cluster;
import is2.data.Edges;
import is2.data.Instances;
import is2.data.MFB;
import is2.data.PipeGen;
import is2.data.SentenceData09;
import is2.io.CONLLReader09;
import is2.util.OptionsSuper;

/**
 * Pipeline of the reranker: registers the feature parts of the corpus and
 * initializes the reranking feature extractor. Unlike Pipe, it does not read
 * instances or build edge filters beyond initializing the Edges tables.
 */
final public class PipeReranker extends PipeGen {

	// the single reranking feature extractor
	public ExtractorReranker extractor;
	final public MFB mf = new MFB();

	// word clusters, when a cluster file is configured
	Cluster cl;

	private OptionsSuper options;
	// accumulated nanoseconds spent extracting, for diagnostics
	public static long timeExtract;

	public PipeReranker(OptionsSuper o) {
		options = o;
	}

	/**
	 * Pass over the corpus registering all feature parts (labels, forms,
	 * lemmas, POS tags, morphologic features), then initialize the extractor
	 * and the Edges tables.
	 *
	 * @param file the CoNLL-09 training file
	 * @param is the container for instances; only initialized here
	 */
	public void createInstances(String file, Instances is)
	// throws Exception

	{

		CONLLReader09 depReader = new CONLLReader09(file);

		mf.register(REL, "<root-type>");

		// register at least one predicate since the parsing data might not
		// contain predicates as in the Japanese corpus but the development
		// sets contain some

		System.out.print("Registering feature parts of sentence: ");
		int ic = 0;
		int del = 0;
		while (true) {
			SentenceData09 instance = depReader.getNext();
			if (instance == null)
				break;
			ic++;

			if (ic % 1000 == 0) {
				del = outValue(ic, del);
			}

			String[] labs1 = instance.labels;
			for (String element : labs1)
				mf.register(REL, element);

			String[] w = instance.forms;
			for (String element : w)
				mf.register(WORD, depReader.normalize(element));

			w = instance.plemmas;
			for (String element : w)
				mf.register(WORD, depReader.normalize(element));

			w = instance.ppos;
			for (String element : w)
				mf.register(POS, element);

			w = instance.gpos;
			for (String element : w)
				mf.register(POS, element);

			if (instance.feats != null) {
				String fs[][] = instance.feats;
				for (String[] element : fs) {
					w = element;
					if (w == null)
						continue;
					for (String element2 : w)
						mf.register(FEAT, element2);
				}
			}

			// stop after the configured number of training sentences
			if ((ic - 1) > options.count)
				break;
		}
		del = outValue(ic, del);

		System.out.println();
		ExtractorReranker.initFeatures();

		ExtractorReranker.maxForm = mf.getFeatureCounter().get(WORD);

		if (options.clusterFile == null)
			cl = new Cluster();
		else
			cl = new Cluster(options.clusterFile, mf, 6);

		mf.calculateBits();
		ExtractorReranker.initStat();

		System.out.println("" + mf.toString());

		extractor.init();
		depReader.startReading(file);

		int num1 = 0;

		is.init(ic, new MFB());

		Edges.init(mf.getFeatureCounter().get(POS));

		del = 0;

		del = outValue(num1, del);
		System.out.println();
	}

	public static ExecutorService executerService = java.util.concurrent.Executors.newFixedThreadPool(Parser.THREADS);

}
/**
 *
 */
package is2.tag;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import is2.data.IEncoderPlus;
import is2.data.PipeGen;
import is2.util.DB;

/**
 * Word-to-tag lexicon: maps an encoded word form to an encoded POS tag.
 *
 * @author Dr. Bernd Bohnet, 07.01.2011
 */
public class Lexicon {

	public static final String FR = "FR", TAG = "TAG";

	/** word2tag[word][0] holds the encoded tag of the word (0 = none). */
	final byte[][] word2tag;

	public Lexicon(byte[][] w2t) {

		word2tag = w2t;
	}

	/**
	 * Build the lexicon from a tab-separated file (word TAB tag ...),
	 * registering all words and tags with the encoder.
	 *
	 * @param clusterFile the lexicon file
	 * @param mf the encoder used to map strings to ids
	 */
	public Lexicon(String clusterFile, IEncoderPlus mf) {

		final String REGEX = "\t";

		// first pass: register words and tags
		try {
			BufferedReader inputReader = new BufferedReader(
					new InputStreamReader(new FileInputStream(clusterFile), "UTF-8"), 32768);

			int cnt = 0;
			String line;
			while ((line = inputReader.readLine()) != null) {

				try {
					String[] split = line.split(REGEX);
					// int f = Integer.parseInt(split[2]);
					// if (f>2) {
					cnt++;
					mf.register(PipeGen.WORD, split[0]);
					mf.register(TAG, split[1]); // tag

					// NOTE(review): this registers the tag (split[1]) under FR
					// although the comment says frequency (split[2]); the
					// guard also only checks length > 1 - confirm intent.
					if (split.length > 1)
						mf.register(FR, split[1]); // frequency
					// }
				} catch (Exception e) {
					System.out.println("Error in lexicon line " + cnt + " error: " + e.getMessage());
				}
			}
			System.out.println("read number of words from lexicon " + cnt);
			inputReader.close();

		} catch (Exception e) {
			e.printStackTrace();
		}

		word2tag = new byte[mf.getFeatureCounter().get(PipeGen.WORD)][1];
		// second pass: fill the word -> tag table
		try {
			String line;
			BufferedReader inputReader = new BufferedReader(
					new InputStreamReader(new FileInputStream(clusterFile), "UTF-8"), 32768);

			while ((line = inputReader.readLine()) != null) {

				String[] split = line.split(REGEX);
				int w = mf.getValue(PipeGen.WORD, split[0]);
				if (w < 0)
					continue;
				word2tag[w][0] = (byte) mf.getValue(TAG, split[1]);
				// if (split.length>1) word2tag[w][1]= (byte)mf.getValue(FR,
				// split[2]); // frequency
			}
			inputReader.close();
			int fill = 0;
			for (byte[] element : word2tag) {
				if (element[0] != 0)
					fill++;
			}
			System.out.println("filled " + fill + " of " + word2tag.length);

		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Read the lexicon written by {@link #write(DataOutputStream)}.
	 *
	 * @param dis the input stream
	 * @throws IOException
	 */
	public Lexicon(DataInputStream dis) throws IOException {

		word2tag = new byte[dis.readInt()][1];
		for (int i = 0; i < word2tag.length; i++) {
			word2tag[i][0] = dis.readByte();
			// word2tag[i][1]=dis.readByte();
		}
		DB.println("Read lexicon with " + word2tag.length + " words ");
	}

	/**
	 * Write the lexicon: the word count followed by one tag byte per word.
	 *
	 * @param dos the output stream
	 * @throws IOException
	 */
	public void write(DataOutputStream dos) throws IOException {

		dos.writeInt(word2tag.length);
		for (byte[] i : word2tag) {
			dos.writeByte(i[0]);
			// dos.writeByte(i[1]);
		}

	}

	/**
	 * Look up the tag of a word.
	 *
	 * @param form the encoded word form
	 * @return the encoded tag, or -1 when the form is out of range
	 */
	public int getTag(int form) {
		// FIX: was 'word2tag.length < form', which admitted form == length
		// and threw ArrayIndexOutOfBoundsException.
		if (form < 0 || form >= word2tag.length)
			return -1;
		return word2tag[form][0];
	}

	/**
	 * Look up the confidence of a word's tag.
	 *
	 * @param form the encoded word form
	 * @return the confidence byte, or -1 when unavailable
	 */
	public int getConf(int form) {
		// FIX: bounds check as in getTag; additionally the rows are allocated
		// with length 1 (the confidence column is commented out above), so
		// guard the second dimension instead of always throwing.
		if (form < 0 || form >= word2tag.length || word2tag[form].length < 2)
			return -1;
		return word2tag[form][1];
	}

}
package is2.tag;

import java.io.File;

import is2.util.OptionsSuper;

/**
 * Command-line options of the tagger. Switches not handled here are delegated
 * to the shared handling in OptionsSuper. Also prepares the temporary feature
 * forest file when training is requested.
 */
public final class Options extends OptionsSuper {

	/**
	 * Parse the command line; value-taking switches consume the following
	 * argument.
	 *
	 * @param args the raw command-line arguments
	 */
	public Options(String[] args) {

		for (int i = 0; i < args.length; i++) {
			String[] pair = args[i].split(":");

			if (pair[0].equals("--help"))
				explain();
			else if (pair[0].equals("-train")) {
				train = true;
				trainfile = args[i + 1];
				// FIX: i++ was missing here, so the training file name was
				// re-processed as an option on the next loop iteration;
				// every other value-taking switch below advances i.
				i++;
			} else if (pair[0].equals("-eval")) {
				eval = true;
				goldfile = args[i + 1];
				i++;
			} else if (pair[0].equals("-test")) {
				test = true;
				testfile = args[i + 1];
				i++;
			} else if (pair[0].equals("-i")) {
				numIters = Integer.parseInt(args[i + 1]);
				i++;
			} else if (pair[0].equals("-out")) {
				outfile = args[i + 1];
				i++;
			} else if (pair[0].equals("-decode")) {
				decodeProjective = args[i + 1].equals("proj");
				i++;
			} else if (pair[0].equals("-confidence")) {

				conf = true;
			}

			else if (pair[0].equals("-count")) {
				count = Integer.parseInt(args[i + 1]);
				i++;
			} else if (pair[0].equals("-model")) {
				modelName = args[i + 1];
				i++;
			} else if (pair[0].equals("-tmp")) {
				tmp = args[i + 1];
				i++;
			} else if (pair[0].equals("-format")) {
				// format = args[i+1];
				formatTask = Integer.parseInt(args[i + 1]);
				i++;
			} else if (pair[0].equals("-allfeatures")) {
				allFeatures = true;
			} else if (pair[0].equals("-nonormalize")) {
				normalize = false;
			} else if (pair[0].equals("-nframes")) {
				// format = args[i+1];
				nbframes = args[i + 1];
				i++;

			} else if (pair[0].equals("-pframes")) {
				// format = args[i+1];
				pbframes = args[i + 1];
				i++;
			} else if (pair[0].equals("-nopred")) {
				nopred = true;
			} else if (pair[0].equals("-divide")) {
				keep = true;
			} else if (pair[0].equals("-lexicon")) {
				lexicon = args[i + 1];
				i++;

			} else
				super.addOption(args, i);

		}

		// when training, set up the temporary feature-forest file
		try {

			if (trainfile != null) {

				if (keep && tmp != null) {
					// reuse an existing forest file; fall back to rebuilding
					// it when the file does not exist
					trainforest = new File(tmp);
					if (!trainforest.exists())
						keep = false;

				} else if (tmp != null) {
					trainforest = File.createTempFile("train", ".tmp", new File(tmp));
					trainforest.deleteOnExit();
				} else {
					trainforest = File.createTempFile("train", ".tmp");
					trainforest.deleteOnExit();
				}

			}

		} catch (java.io.IOException e) {
			System.out.println("Unable to create tmp files for feature forests!");
			System.out.println(e);
			System.exit(0);
		}
	}

	// print the usage information and exit
	private void explain() {
		System.out.println("Usage: ");
		System.out.println("java -class mate.jar is2.parser.Parser [Options]");
		System.out.println();
		System.out.println("Example: ");
		System.out.println(
				" java -class mate.jar is2.parser.Parser -model eps3.model -train corpora/conll08st/train/train.closed -test corpora/conll08st/devel/devel.closed -out b3.test -eval corpora/conll08st/devel/devel.closed -count 2000 -i 6");
		System.out.println("");
		System.out.println("Options:");
		System.out.println("");
		System.out.println(" -train <file> the corpus a model is trained on; default " + this.trainfile);
		System.out.println(" -test <file> the input corpus for testing; default " + this.testfile);
		System.out.println(" -out <file> the output corpus (result) of a test run; default " + this.outfile);
		System.out.println(" -model <file> the parsing model for traing the model is stored in the files");
		System.out.println(
				" and for parsing the model is load from this file; default " + this.modelName);
		System.out.println(
				" -i <number> the number of training iterations; good numbers are 10 for smaller corpora and 6 for bigger; default "
						+ this.numIters);
		System.out.println(" -count <number> the n first sentences of the corpus are take for the training default "
				+ this.count);
		System.out.println(" -format <number> conll format of the year 8 or 9; default " + this.formatTask);

		System.exit(0);
	}
}
+++ b/dependencyParser/experimental/mate-tools/src/is2/tag/package.html @@ -0,0 +1,4 @@ +Package info +<br><br> +This parser includes a tagger into the dependency parser +<br> \ No newline at end of file diff --git a/dependencyParser/mate-tools/src/is2/tools/IPipe.java b/dependencyParser/experimental/mate-tools/src/is2/tools/IPipe.java index b6e0e02..b6e0e02 100644 --- a/dependencyParser/mate-tools/src/is2/tools/IPipe.java +++ b/dependencyParser/experimental/mate-tools/src/is2/tools/IPipe.java diff --git a/dependencyParser/experimental/mate-tools/src/is2/tools/Retrainable.java b/dependencyParser/experimental/mate-tools/src/is2/tools/Retrainable.java new file mode 100644 index 0000000..86fbfcc --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/tools/Retrainable.java @@ -0,0 +1,30 @@ +package is2.tools; + +import is2.data.SentenceData09; + +/** + * Provides Methods for the retraining + * + * @author bohnetbd + * + */ +public interface Retrainable { + + /** + * Retrains with a update factor (upd). The retraining stops when the model + * was successful adapted or it gave up after the maximal iterations. + * + * @param sentence + * the data container of the new example. + * @param upd + * the update factor, e.g. 0.01 + * @param iterations + * maximal number of iterations that are tried to adapt the + * system. 
+ * @return success = true -- else false + */ + public boolean retrain(SentenceData09 sentence, float upd, int iterations); + + boolean retrain(SentenceData09 sentence, float upd, int iterations, boolean print); + +} diff --git a/dependencyParser/mate-tools/src/is2/tools/Tool.java b/dependencyParser/experimental/mate-tools/src/is2/tools/Tool.java index 41ead53..41ead53 100644 --- a/dependencyParser/mate-tools/src/is2/tools/Tool.java +++ b/dependencyParser/experimental/mate-tools/src/is2/tools/Tool.java diff --git a/dependencyParser/mate-tools/src/is2/tools/ToolIO.java b/dependencyParser/experimental/mate-tools/src/is2/tools/ToolIO.java index d7b67fe..d7b67fe 100644 --- a/dependencyParser/mate-tools/src/is2/tools/ToolIO.java +++ b/dependencyParser/experimental/mate-tools/src/is2/tools/ToolIO.java diff --git a/dependencyParser/mate-tools/src/is2/tools/Train.java b/dependencyParser/experimental/mate-tools/src/is2/tools/Train.java index 31a7ad8..31a7ad8 100644 --- a/dependencyParser/mate-tools/src/is2/tools/Train.java +++ b/dependencyParser/experimental/mate-tools/src/is2/tools/Train.java diff --git a/dependencyParser/mate-tools/src/is2/util/Convert.java b/dependencyParser/experimental/mate-tools/src/is2/util/Convert.java index b9c820f..b9c820f 100644 --- a/dependencyParser/mate-tools/src/is2/util/Convert.java +++ b/dependencyParser/experimental/mate-tools/src/is2/util/Convert.java diff --git a/dependencyParser/experimental/mate-tools/src/is2/util/Convert0409.java b/dependencyParser/experimental/mate-tools/src/is2/util/Convert0409.java new file mode 100644 index 0000000..b735ad8 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/util/Convert0409.java @@ -0,0 +1,176 @@ +/** + * + */ +package is2.util; + +import is2.data.SentenceData09; +import is2.io.CONLLReader04; +import is2.io.CONLLReader09; +import is2.io.CONLLWriter06; +import is2.io.CONLLWriter09; + +/** + * @author Dr. 
Bernd Bohnet, 01.03.2010 + * + * + */ +public class Convert0409 { + + public static void main(String args[]) throws Exception { + + convert(args[0], args[1]); + + } + + public static void convert(String source, String target) throws Exception { + + CONLLReader04 reader = new CONLLReader04(source); + CONLLWriter09 writer = new CONLLWriter09(target); + + int str = 0; + while (true) { + SentenceData09 i = reader.getNext(); + str++; + if (i == null) + break; + + String[] formsNoRoot = new String[i.length() - 1]; + String[] posNoRoot = new String[formsNoRoot.length]; + String[] lemmas = new String[formsNoRoot.length]; + + String[] org_lemmas = new String[formsNoRoot.length]; + + String[] of = new String[formsNoRoot.length]; + String[] pf = new String[formsNoRoot.length]; + + String[] pposs = new String[formsNoRoot.length]; + String[] labels = new String[formsNoRoot.length]; + String[] fillp = new String[formsNoRoot.length]; + + int[] heads = new int[formsNoRoot.length]; + + for (int j = 0; j < formsNoRoot.length; j++) { + formsNoRoot[j] = i.forms[j + 1]; + if (formsNoRoot[j].length() == 0 || formsNoRoot[j].equals("")) { + System.out.println("error forms " + str); + // System.exit(0); + formsNoRoot[j] = " "; + } + posNoRoot[j] = i.gpos[j + 1]; + if (posNoRoot[j].length() == 0 || posNoRoot[j].equals(" ")) { + System.out.println("error pos " + str); + // System.exit(0); + } + pposs[j] = i.ppos[j + 1]; + if (pposs[j].length() == 0 || pposs[j].equals(" ")) { + System.out.println("error pos " + str); + // System.exit(0); + } + + labels[j] = i.labels[j + 1]; + if (labels[j].length() == 0 || labels[j].equals(" ")) { + System.out.println("error lab " + str); + // System.exit(0); + } + heads[j] = i.heads[j + 1]; + if (heads[j] > posNoRoot.length) { + System.out.println("head out of range " + heads[j] + " " + heads.length + " " + str); + heads[j] = posNoRoot.length; + } + + lemmas[j] = i.plemmas[j + 1]; + if (lemmas[j].length() == 0 || lemmas[j].equals(" ")) { + 
System.out.println("error lab " + str); + // System.exit(0); + } + org_lemmas[j] = i.lemmas[j + 1]; + if (org_lemmas[j].length() == 0 || org_lemmas[j].equals(" ")) { + System.out.println("error lab " + str); + // System.exit(0); + } + of[j] = i.ofeats[j + 1]; + pf[j] = i.pfeats[j + 1]; + if (str == 6099) { + // System.out.println(formsNoRoot[j]+"\t"+posNoRoot[j]+"\t"+pposs[j]+"\t"+labels[j]+"\t"+heads[j]); + } + + // (instance.fillp!=null) fillp[j] = instance.fillp[j+1]; + } + + SentenceData09 i09 = new SentenceData09(formsNoRoot, lemmas, org_lemmas, pposs, pposs, labels, heads, fillp, + of, pf); + + // public SentenceData09(String[] forms, String[] lemmas, String[] + // olemmas,String[] gpos, String[] ppos, String[] labs, int[] heads, + // String[] fillpred) { + // SentenceData09 + // SentenceData09 i2 = new SentenceData09(i.forms, + // i.lemmas,i.org_lemmas,); + + writer.write(i09); + + } + writer.finishWriting(); + + } + + public static void convert0906(String source, String target) throws Exception { + + CONLLReader09 reader = new CONLLReader09(source); + CONLLWriter06 writer = new CONLLWriter06(target); + + while (true) { + SentenceData09 i = reader.getNext(); + + if (i == null) + break; + + String[] formsNoRoot = new String[i.length() - 1]; + String[] posNoRoot = new String[formsNoRoot.length]; + String[] lemmas = new String[formsNoRoot.length]; + + String[] org_lemmas = new String[formsNoRoot.length]; + + String[] of = new String[formsNoRoot.length]; + String[] pf = new String[formsNoRoot.length]; + + String[] pposs = new String[formsNoRoot.length]; + String[] labels = new String[formsNoRoot.length]; + String[] fillp = new String[formsNoRoot.length]; + + int[] heads = new int[formsNoRoot.length]; + + for (int j = 0; j < formsNoRoot.length; j++) { + formsNoRoot[j] = i.forms[j + 1]; + posNoRoot[j] = i.gpos[j + 1]; + pposs[j] = i.ppos[j + 1]; + + labels[j] = i.labels[j + 1]; + heads[j] = i.heads[j + 1]; + lemmas[j] = i.plemmas[j + 1]; + + org_lemmas[j] = 
i.lemmas[j + 1]; + of[j] = i.ofeats[j + 1]; + pf[j] = i.pfeats[j + 1]; + + // (instance.fillp!=null) fillp[j] = instance.fillp[j+1]; + } + + SentenceData09 i09 = new SentenceData09(formsNoRoot, lemmas, org_lemmas, posNoRoot, pposs, labels, heads, + fillp, of, pf); + + // public SentenceData09(String[] forms, String[] lemmas, String[] + // olemmas,String[] gpos, String[] ppos, String[] labs, int[] heads, + // String[] fillpred) { + // SentenceData09 + // SentenceData09 i2 = new SentenceData09(i.forms, + // i.lemmas,i.org_lemmas,); + + writer.write(i09); + + } + writer.finishWriting(); + + } + +} diff --git a/dependencyParser/experimental/mate-tools/src/is2/util/ConvertADJ.java b/dependencyParser/experimental/mate-tools/src/is2/util/ConvertADJ.java new file mode 100644 index 0000000..e6ca6c1 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/util/ConvertADJ.java @@ -0,0 +1,121 @@ +/** + * + */ +package is2.util; + +import is2.data.SentenceData09; +import is2.io.CONLLReader09; +import is2.io.CONLLWriter06; + +/** + * @author Dr. 
Bernd Bohnet, 01.03.2010 + * + * + */ +public class ConvertADJ { + + public static void main(String args[]) throws Exception { + + convert(args[0], args[1]); + + } + + public static void convert(String source, String target) throws Exception { + + CONLLReader09 reader = new CONLLReader09(source); + // CONLLWriter09 writer = new CONLLWriter09(target); + int adj = 0, argadj = 0; + int rb = 0, argrb = 0; + while (true) { + SentenceData09 i = reader.getNext(); + if (i == null) + break; + + for (int k = 0; k < i.length(); k++) { + + if (i.gpos[k].startsWith("JJ")) + adj++; + if (i.gpos[k].startsWith("RB")) + rb++; + + if (i.argposition != null) { + for (int[] element : i.argposition) { + if (element != null) + for (int a = 0; a < element.length; a++) { + if (element[a] == k && i.gpos[k].startsWith("JJ")) + argadj++; + if (element[a] == k && i.gpos[k].startsWith("RB")) + argrb++; + } + + } + } + // (instance.fillp!=null) fillp[j] = instance.fillp[j+1]; + } + + } + System.out.println("adj " + adj + " " + argadj); + System.out.println("rb " + rb + " " + argrb); + + } + + public static void convert0906(String source, String target) throws Exception { + + CONLLReader09 reader = new CONLLReader09(source); + CONLLWriter06 writer = new CONLLWriter06(target); + + while (true) { + SentenceData09 i = reader.getNext(); + + if (i == null) + break; + + String[] formsNoRoot = new String[i.length() - 1]; + String[] posNoRoot = new String[formsNoRoot.length]; + String[] lemmas = new String[formsNoRoot.length]; + + String[] org_lemmas = new String[formsNoRoot.length]; + + String[] of = new String[formsNoRoot.length]; + String[] pf = new String[formsNoRoot.length]; + + String[] pposs = new String[formsNoRoot.length]; + String[] labels = new String[formsNoRoot.length]; + String[] fillp = new String[formsNoRoot.length]; + + int[] heads = new int[formsNoRoot.length]; + + for (int j = 0; j < formsNoRoot.length; j++) { + formsNoRoot[j] = i.forms[j + 1]; + posNoRoot[j] = i.gpos[j + 1]; + 
pposs[j] = i.ppos[j + 1]; + + labels[j] = i.labels[j + 1]; + heads[j] = i.heads[j + 1]; + lemmas[j] = i.plemmas[j + 1]; + + org_lemmas[j] = i.lemmas[j + 1]; + of[j] = i.ofeats[j + 1]; + pf[j] = i.pfeats[j + 1]; + + // (instance.fillp!=null) fillp[j] = instance.fillp[j+1]; + } + + SentenceData09 i09 = new SentenceData09(formsNoRoot, lemmas, org_lemmas, posNoRoot, pposs, labels, heads, + fillp, of, pf); + + // public SentenceData09(String[] forms, String[] lemmas, String[] + // olemmas,String[] gpos, String[] ppos, String[] labs, int[] heads, + // String[] fillpred) { + // SentenceData09 + // SentenceData09 i2 = new SentenceData09(i.forms, + // i.lemmas,i.org_lemmas,); + + writer.write(i09); + + } + writer.finishWriting(); + + } + +} diff --git a/dependencyParser/experimental/mate-tools/src/is2/util/ConvertLowerCase0909.java b/dependencyParser/experimental/mate-tools/src/is2/util/ConvertLowerCase0909.java new file mode 100644 index 0000000..e5842d6 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/util/ConvertLowerCase0909.java @@ -0,0 +1,76 @@ +/** + * + */ +package is2.util; + +import is2.data.SentenceData09; +import is2.io.CONLLReader09; +import is2.io.CONLLWriter09; + +/** + * @author Dr. 
Bernd Bohnet, 01.03.2010 + * + * + */ +public class ConvertLowerCase0909 { + + public static void main(String args[]) throws Exception { + + CONLLReader09 reader = new CONLLReader09(args[0]); + CONLLWriter09 writer = new CONLLWriter09(args[1]); + + while (true) { + SentenceData09 i = reader.getNext(); + if (i == null) + break; + + SentenceData09 i09 = new SentenceData09(i); + i09.createSemantic(i); + + for (int k = 0; k < i09.length(); k++) { + i09.lemmas[k] = i09.lemmas[k].toLowerCase(); + i09.plemmas[k] = i09.plemmas[k].toLowerCase(); + + } + + writer.write(i09); + + } + writer.finishWriting(); + + } + + public static void convert(String source, String target) throws Exception { + + CONLLReader09 reader = new CONLLReader09(source); + CONLLWriter09 writer = new CONLLWriter09(target); + + while (true) { + SentenceData09 i = reader.getNext(); + if (i == null) + break; + + SentenceData09 i09 = new SentenceData09(i); + i09.createSemantic(i); + + for (int k = 0; k < i09.length(); k++) { + i09.lemmas[k] = i09.lemmas[k].toLowerCase(); + i09.plemmas[k] = i09.plemmas[k].toLowerCase(); + + } + + // public SentenceData09(String[] forms, String[] lemmas, String[] + // olemmas,String[] gpos, String[] ppos, String[] labs, int[] heads, + // String[] fillpred) { + // SentenceData09 + // SentenceData09 i2 = new SentenceData09(i.forms, + // i.lemmas,i.org_lemmas,); + + writer.write(i09); + + } + writer.finishWriting(); + + } + +} diff --git a/dependencyParser/experimental/mate-tools/src/is2/util/ConvertTiger2CoNLL.java b/dependencyParser/experimental/mate-tools/src/is2/util/ConvertTiger2CoNLL.java new file mode 100644 index 0000000..e650737 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/util/ConvertTiger2CoNLL.java @@ -0,0 +1,120 @@ +/** + * + */ +package is2.util; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.util.StringTokenizer; 
+ +/** + * @author Dr. Bernd Bohnet, 17.01.2010 + * + * This class removes all information from a conll 2009 file except of + * columns 1 and 2 that contain the word id and the word form. + */ +public class ConvertTiger2CoNLL { + + public static void main(String[] args) throws IOException { + + OptionsSuper options = new OptionsSuper(args, null); + + if (options.trainfile != null) { + System.err.println( + "included sentences " + clean(options.trainfile, options.outfile, options.start, options.count)); + } else + System.err.println("Please proivde the file name -train <file-name>"); + + } + + /** + * @param trainfile + * @throws IOException + */ + private static int clean(String file, String outFile, int start, int numberOfSentences) throws IOException { + + System.err.println("writting to " + outFile); + System.err.println("start " + start + " to " + (start + numberOfSentences)); + int state = 0; + + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"), 32768); + BufferedWriter writer = new BufferedWriter( + new java.io.OutputStreamWriter(new java.io.FileOutputStream(outFile), "UTF-8"), 32768); + String l = null; + try { + + int id = 1, snt = 0, cnt = 0; + + while ((l = reader.readLine()) != null) { + + if (l.startsWith("#BOS")) { + state = 1; // BOS + id = 1; + snt++; + continue; + } + if (l.startsWith("#EOS") && state == 1) { + state = 2; // BOS + cnt++; + + writer.newLine(); + } + + if (start > snt || (start + numberOfSentences) <= snt) { + state = 3; + } + + if (l.startsWith("#5") || l.startsWith("#6") || l.startsWith("#7")) + continue; + if ((start + numberOfSentences) <= snt) + break; + + if (state == 3) + continue; + + if (state == 1) { + + l = l.replace("\t\t", "\t"); + l = l.replace("\t\t", "\t"); + + StringTokenizer t = new StringTokenizer(l, "\t"); + int count = 0; + + writer.write("" + id + "\t"); + + while (t.hasMoreTokens()) { + if (count == 0) { + writer.write(t.nextToken() + "\t"); + } else if (count 
== 1) { + writer.write(t.nextToken() + "\t_\t"); + } else if (count == 2) { + writer.write(t.nextToken() + "\t_\t"); + } else if (count == 3) { + writer.write(t.nextToken().replace(".", "|") + "\t_\t"); + } else { + t.nextToken(); + } + count++; + } + writer.write("_\t_\t_\t_\t_\t_\t_\t_\t_"); + writer.newLine(); + } + id++; + } + writer.flush(); + writer.close(); + reader.close(); + + return cnt; + } catch (IOException e) { + e.printStackTrace(); + } + + return -1; + + } + +} diff --git a/dependencyParser/experimental/mate-tools/src/is2/util/DB.java b/dependencyParser/experimental/mate-tools/src/is2/util/DB.java new file mode 100755 index 0000000..30fd231 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/util/DB.java @@ -0,0 +1,78 @@ +package is2.util; + +import java.util.Calendar; +import java.util.GregorianCalendar; + +public class DB { + + private static final String ARROW = " -> "; + private static final String LEER = " "; + private static final String BIG = " "; + + private static boolean debug = true; + + final static public void println(Object err) { + + if (!debug) + return; + + StackTraceElement[] ste = new Exception().getStackTrace(); + + StringBuffer msg = new StringBuffer(); + msg.append((getDate().append(LEER).substring(0, 10))); + msg.append(' '); + msg.append(ste[1].getClassName() + " " + ste[1].getLineNumber()); + msg.append(':'); + msg.append(ste[1].getMethodName()); + msg.append(ARROW); + + int l = 55 - msg.length(); + if (l < 0) + l = 0; + msg.append(BIG.substring(0, l)); + + // if ((m_depth >= 0) && (m_depth < (BIG.length()) )) { + // vDebugMessage.append(BIG.substring(0, m_depth*2)); + // } + + msg.append(err); + + System.err.println(msg); + + } + + final static public void prints(Object err) { + + if (!debug) + return; + System.err.println(err); + + } + + final private static StringBuffer getDate() { + // if (Preferences.s_debug <= BDebug.FAIL) return s_sb; + + GregorianCalendar s_cal = new GregorianCalendar(); + 
StringBuffer sb = new StringBuffer(); + // sb.append(s_cal.get(Calendar.HOUR_OF_DAY)); + // sb.append('_'); + sb.append(s_cal.get(Calendar.MINUTE)); + sb.append('.'); + sb.append(s_cal.get(Calendar.SECOND)); + sb.append('.'); + sb.append(s_cal.get(Calendar.MILLISECOND)); + + return sb; + } + + public static void setDebug(boolean b) { + debug = b; + + } + + public static boolean getDebug() { + + return debug; + } + +} diff --git a/dependencyParser/experimental/mate-tools/src/is2/util/Edges.java b/dependencyParser/experimental/mate-tools/src/is2/util/Edges.java new file mode 100644 index 0000000..2457cae --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/util/Edges.java @@ -0,0 +1,197 @@ +/** + * + */ +package is2.util; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Map.Entry; + +/** + * @author Dr. Bernd Bohnet, 13.05.2009; + * + * + */ +public final class Edges { + + private static short[][][] edges; + private static HashMap<Short, Integer> labelCount = new HashMap<Short, Integer>(); + + private static HashMap<String, Integer> slabelCount = new HashMap<String, Integer>(); + + static short[] def = new short[1]; + + private Edges() { + } + + /** + * @param length + */ + public static void init(int length) { + edges = new short[length][length][]; + } + + public static void findDefault() { + + int best = 0; + + for (Entry<Short, Integer> e : labelCount.entrySet()) { + + if (best < e.getValue()) { + best = e.getValue(); + def[0] = e.getKey(); + } + } + + // labelCount=null; + // String[] types = new String[mf.getFeatureCounter().get(PipeGen.REL)]; + // for (Entry<String, Integer> e : + // MFO.getFeatureSet().get(PipeGen.REL).entrySet()) types[e.getValue()] + // = e.getKey(); + + is2.util.DB.println("set default label to " + def[0] + " "); + + // System.out.println("found default "+def[0]); + + } + + final static public void 
put(int pos1, int pos2, short label) { + putD(pos1, pos2, label); + // putD(pos2, pos1,!dir, label); + } + + final static public void putD(int pos1, int pos2, short label) { + + Integer lc = labelCount.get(label); + if (lc == null) + labelCount.put(label, 1); + else + labelCount.put(label, lc + 1); + + String key = pos1 + "-" + pos2 + label; + Integer lcs = slabelCount.get(key); + if (lcs == null) + slabelCount.put(key, 1); + else + slabelCount.put(key, lcs + 1); + + if (edges[pos1][pos2] == null) { + edges[pos1][pos2] = new short[1]; + edges[pos1][pos2][0] = label; + + // edgesh[pos1][pos2][dir?0:1] = new TIntHashSet(2); + // edgesh[pos1][pos2][dir?0:1].add(label); + } else { + short labels[] = edges[pos1][pos2]; + for (short l : labels) { + // contains label already? + if (l == label) + return; + } + + short[] nlabels = new short[labels.length + 1]; + System.arraycopy(labels, 0, nlabels, 0, labels.length); + nlabels[labels.length] = label; + edges[pos1][pos2] = nlabels; + + // edgesh[pos1][pos2][dir?0:1].add(label); + } + } + + final static public short[] get(int pos1, int pos2) { + + if (pos1 < 0 || pos2 < 0 || edges[pos1][pos2] == null) + return def; + return edges[pos1][pos2]; + } + + /** + * @param dis + */ + static public void write(DataOutputStream d) throws IOException { + + int len = edges.length; + d.writeShort(len); + + for (int p1 = 0; p1 < len; p1++) { + for (int p2 = 0; p2 < len; p2++) { + if (edges[p1][p2] == null) + d.writeShort(0); + else { + d.writeShort(edges[p1][p2].length); + for (int l = 0; l < edges[p1][p2].length; l++) { + d.writeShort(edges[p1][p2][l]); + } + + } + } + } + + d.writeShort(def[0]); + + } + + /** + * @param dis + */ + public static void read(DataInputStream d) throws IOException { + int len = d.readShort(); + + edges = new short[len][len][]; + for (int p1 = 0; p1 < len; p1++) { + for (int p2 = 0; p2 < len; p2++) { + int ll = d.readShort(); + if (ll == 0) { + edges[p1][p2] = null; + } else { + edges[p1][p2] = new short[ll]; + 
for (int l = 0; l < ll; l++) { + edges[p1][p2][l] = d.readShort(); + } + } + } + } + + def[0] = d.readShort(); + + } + + public static class C implements Comparator<Short> { + + public C() { + super(); + } + + String _key; + + public C(String key) { + super(); + _key = key; + } + + /* + * (non-Javadoc) + * + * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) + */ + @Override + public int compare(Short l1, Short l2) { + + // int c1 = labelCount.get(l1); + // int c2 = labelCount.get(l2); + // if (true) return c1==c2?0:c1>c2?-1:1; + + int x1 = slabelCount.get(_key + l1.shortValue()); + int x2 = slabelCount.get(_key + l2.shortValue()); + // System.out.println(x1+" "+x2); + + return x1 == x2 ? 0 : x1 > x2 ? -1 : 1; + + } + + } + +} diff --git a/dependencyParser/mate-tools/src/is2/util/Evaluator.java b/dependencyParser/experimental/mate-tools/src/is2/util/Evaluator.java index f75fc54..f75fc54 100644 --- a/dependencyParser/mate-tools/src/is2/util/Evaluator.java +++ b/dependencyParser/experimental/mate-tools/src/is2/util/Evaluator.java diff --git a/dependencyParser/mate-tools/src/is2/util/EvaluatorTagger.java b/dependencyParser/experimental/mate-tools/src/is2/util/EvaluatorTagger.java index c1f88f4..c1f88f4 100644 --- a/dependencyParser/mate-tools/src/is2/util/EvaluatorTagger.java +++ b/dependencyParser/experimental/mate-tools/src/is2/util/EvaluatorTagger.java diff --git a/dependencyParser/experimental/mate-tools/src/is2/util/ExtractParagraphs.java b/dependencyParser/experimental/mate-tools/src/is2/util/ExtractParagraphs.java new file mode 100644 index 0000000..aa65d8d --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/util/ExtractParagraphs.java @@ -0,0 +1,74 @@ +package is2.util; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; + 
+public class ExtractParagraphs { + + /** + * + * @param args + * @throws IOException + */ + public static void main(String args[]) throws IOException { + + if (args.length < 1) { + System.out.println("Please provide a file name."); + System.exit(0); + } + + File file = new File(args[0]); + file.isDirectory(); + String[] dirs = file.list(); + + BufferedWriter write = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[1]), "UTF-8"), + 32768); + int cnt = 0; + + for (String fileName : dirs) { + BufferedReader reader = new BufferedReader( + new InputStreamReader(new FileInputStream(args[0] + fileName), "UTF-8"), 32768); + + int state = 0; + + String s; + while ((s = reader.readLine()) != null) { + + if (s.startsWith("<P>") || s.startsWith("<p>")) { + state = 1; // paragraph start + continue; + } + + if (s.startsWith("</P>") || s.startsWith("</p>")) { + state = 2; // paragraph end + write.newLine(); + } + + if (state == 1) { + String sp[] = s.split("\\. "); + for (String p : sp) { + write.write(p); + // if (sp.length>1) write.newLine(); + } + cnt++; + } + } + + // if (cnt>5000) break; + + reader.close(); + } + write.flush(); + write.close(); + + System.out.println("Extract " + cnt + " lines "); + + } + +} diff --git a/dependencyParser/experimental/mate-tools/src/is2/util/IntStack.java b/dependencyParser/experimental/mate-tools/src/is2/util/IntStack.java new file mode 100644 index 0000000..b291d16 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/util/IntStack.java @@ -0,0 +1,90 @@ +/** + * + */ +package is2.util; + +/** + * @author Dr. Bernd Bohnet, 01.06.2011 + * + * + */ +final public class IntStack { + + final public int[] stack; + public int position = -1; + + public IntStack(int size) { + if (size <= 0) + stack = new int[1]; + else + stack = new int[size + 1]; + } + + public IntStack(IntStack s) { + stack = s.stack; + position = s.position; + } + + public int peek() { + return position == -1 ? 
-1 : stack[position]; + } + + public void push(int i) { + // if (i ==2)new Exception().printStackTrace(); + stack[++position] = i; + } + + public int pop() { + return position == -1 ? -1 : stack[position--]; + } + + public int size() { + return position + 1; + } + + public boolean isEmpty() { + return position == -1 ? true : false; + } + + public int get(int p) { + return stack[p]; + } + + public void clear() { + position = -1; + } + + /** + * @param b + */ + public void addAll(IntStack b) { + + position = b.position; + if (position < 0) + return; + + for (int k = 0; k <= position; k++) + stack[k] = b.stack[k]; + + } + + public boolean contains(int s) { + ; + + for (int k = 0; k <= position; k++) + if (stack[k] == s) + return true; + + return false; + } + + @Override + public String toString() { + StringBuffer s = new StringBuffer(); + for (int k = position; k >= 0; k--) { + s.append(k).append(":").append(this.stack[k]).append(" "); + } + return s.toString(); + } + +} diff --git a/dependencyParser/experimental/mate-tools/src/is2/util/Long2Int.java b/dependencyParser/experimental/mate-tools/src/is2/util/Long2Int.java new file mode 100644 index 0000000..e6ef45c --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/util/Long2Int.java @@ -0,0 +1,81 @@ +package is2.util; + +import is2.data.Long2IntInterface; + +/** + * @author Bernd Bohnet, 01.09.2009 + * + * Maps for the Hash Kernel the long values to the int values. 
+ */ +final public class Long2Int implements Long2IntInterface { + + public Long2Int() { + size = 115911564; + } + + public Long2Int(int s) { + size = s; + } + + /** Integer counter for long2int */ + final private int size; // 0x03ffffff //0x07ffffff + + /* + * (non-Javadoc) + * + * @see is2.sp09k9992.Long2IntIterface#size() + */ + @Override + public int size() { + return size; + } + + /* + * (non-Javadoc) + * + * @see is2.sp09k9992.Long2IntIterface#start() has no meaning for this + * implementation + */ + final public void start() { + } + + /* + * (non-Javadoc) + * + * @see is2.sp09k9992.Long2IntIterface#l2i(long) + */ + @Override + final public int l2i(long l) { + if (l < 0) + return -1; + + // this works well LAS 88.138 + // int r= (int)(( l ^ (l&0xffffffff00000000L) >>> 29 ));//0x811c9dc5 ^ + // // 29 + // return Math.abs(r % size); + // this works a bit better and good with 0x03ffffff + // + /* + * long r= l;//26 l = (l>>12)&0xfffffffffffff000L; r ^= l;//38 l = + * (l>>11)&0xffffffffffffc000L; r ^= l;//49 l = (l>>9)& + * 0xffffffffffff0000L; //53 r ^= l;//58 l = (l>>7)&0xfffffffffffc0000L; + * //62 r ^=l;//65 int x = (int)r; x = x % size; // return x >= 0 ? x : + * -x ;// Math.abs(r % size); + * + */ + // 26 0x03ffffff + // together with 0x07ffffff 27 88.372 + long r = l;// 27 + l = (l >> 13) & 0xffffffffffffe000L; + r ^= l; // 40 + l = (l >> 11) & 0xffffffffffff0000L; + r ^= l; // 51 + l = (l >> 9) & 0xfffffffffffc0000L; // 53 + r ^= l; // 60 + l = (l >> 7) & 0xfffffffffff00000L; // 62 + r ^= l; // 67 + int x = ((int) r) % size; + + return x >= 0 ? 
x : -x; + } +} diff --git a/dependencyParser/experimental/mate-tools/src/is2/util/Options.java b/dependencyParser/experimental/mate-tools/src/is2/util/Options.java new file mode 100644 index 0000000..30b53b0 --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/util/Options.java @@ -0,0 +1,133 @@ +package is2.util; + +import java.io.File; + +public final class Options extends OptionsSuper { + + public Options(String[] args) { + + for (int i = 0; i < args.length; i++) { + String[] pair = args[i].split(":"); + + if (pair[0].equals("--help")) + explain(); + else if (pair[0].equals("-train")) { + train = true; + trainfile = args[i + 1]; + } else if (pair[0].equals("-eval")) { + eval = true; + goldfile = args[i + 1]; + i++; + } else if (pair[0].equals("-test")) { + test = true; + testfile = args[i + 1]; + i++; + } else if (pair[0].equals("-i")) { + numIters = Integer.parseInt(args[i + 1]); + i++; + } else if (pair[0].equals("-out")) { + outfile = args[i + 1]; + i++; + } else if (pair[0].equals("-decode")) { + decodeProjective = args[i + 1].equals("proj"); + i++; + } else if (pair[0].equals("-confidence")) { + + conf = true; + } + + else if (pair[0].equals("-count")) { + count = Integer.parseInt(args[i + 1]); + i++; + } else if (pair[0].equals("-model")) { + modelName = args[i + 1]; + i++; + } else if (pair[0].equals("-device")) { + device = args[i + 1]; + i++; + } else if (pair[0].equals("-tmp")) { + tmp = args[i + 1]; + i++; + } else if (pair[0].equals("-format")) { + // format = args[i+1]; + formatTask = Integer.parseInt(args[i + 1]); + i++; + } else if (pair[0].equals("-allfeatures")) { + allFeatures = true; + } else if (pair[0].equals("-nonormalize")) { + normalize = false; + } else if (pair[0].equals("-nframes")) { + // format = args[i+1]; + nbframes = args[i + 1]; + i++; + + } else if (pair[0].equals("-pframes")) { + // format = args[i+1]; + pbframes = args[i + 1]; + i++; + } else if (pair[0].equals("-nopred")) { + nopred = true; + } else if 
(pair[0].equals("-divide")) { + keep = true; + } else if (pair[0].equals("-lexicon")) { + lexicon = args[i + 1]; + i++; + + } else + super.addOption(args, i); + + } + + try { + + if (trainfile != null) { + + if (keep && tmp != null) { + trainforest = new File(tmp); + if (!trainforest.exists()) + keep = false; + + } else if (tmp != null) { + trainforest = File.createTempFile("train", ".tmp", new File(tmp)); + trainforest.deleteOnExit(); + } else { + trainforest = File.createTempFile("train", ".tmp"); // ,new + // File("F:\\") + trainforest.deleteOnExit(); + } + + } + + } catch (java.io.IOException e) { + System.out.println("Unable to create tmp files for feature forests!"); + System.out.println(e); + System.exit(0); + } + } + + private void explain() { + System.out.println("Usage: "); + System.out.println("java -class mate.jar is2.parser.Parser [Options]"); + System.out.println(); + System.out.println("Example: "); + System.out.println( + " java -class mate.jar is2.parser.Parser -model eps3.model -train corpora/conll08st/train/train.closed -test corpora/conll08st/devel/devel.closed -out b3.test -eval corpora/conll08st/devel/devel.closed -count 2000 -i 6"); + System.out.println(""); + System.out.println("Options:"); + System.out.println(""); + System.out.println(" -train <file> the corpus a model is trained on; default " + this.trainfile); + System.out.println(" -test <file> the input corpus for testing; default " + this.testfile); + System.out.println(" -out <file> the output corpus (result) of a test run; default " + this.outfile); + System.out.println(" -model <file> the parsing model for traing the model is stored in the files"); + System.out.println( + " and for parsing the model is load from this file; default " + this.modelName); + System.out.println( + " -i <number> the number of training iterations; good numbers are 10 for smaller corpora and 6 for bigger; default " + + this.numIters); + System.out.println(" -count <number> the n first sentences of the 
/**
 * Base class holding every command-line option shared by the mate-tools
 * components (parser, lemmatizer, tagger). Subclasses add component-specific
 * flags by overriding/extending {@link #addOption(String[], int)}.
 *
 * All option fields are deliberately public mutable state, matching the
 * project's existing style.
 */
public class OptionsSuper {

	public String trainfile = null;
	public String testfile = null;
	public File trainforest = null;

	public String nbframes = null;
	public String pbframes = null;

	public boolean nopred = false;
	public boolean upper = false;

	public boolean train = false;
	public boolean eval = false;
	public boolean test = false;
	public boolean keep = false;
	public boolean flt = false;
	public boolean loadTaggerModels = false;

	public String modelName = "prs.mdl";
	public String modelTaggerName = null;

	public String useMapping = null;
	public String device = "C:";
	public String tmp = null;
	public boolean createForest = true;
	public boolean decodeProjective = false;
	public double decodeTH = 0.3d;
	public String format = "CONLL";
	public int formatTask = 9;
	public int numIters = 10;
	public int best = 1000;
	public String outfile = "dp.conll";
	public String charset = "UTF-8";
	public String phraseTrain = null;
	public String phraseTest = null;
	public String goldfile = null;
	public String gout = "sec23.gld";
	public String features = null;
	public String lexicon = null;
	public int hsize = 0x07ffffff;
	public int maxLen = 2000;
	public int maxForms = Integer.MAX_VALUE;
	public int beam = 4;
	public float prune = -100000000;

	public String third = "";
	public String second = "";
	public String first = "";

	public int cross = 10;

	// public boolean secondOrder = true;
	public boolean useRelationalFeatures = false;
	public int count = 10000000;
	public int cores = Integer.MAX_VALUE;
	public int start = 0;
	public int minOccureForms = 0;
	public int tt = 30; // tagger averaging
	public boolean allFeatures = false;
	public boolean normalize = false;
	public boolean no2nd = false;
	public boolean noLemmas = false;
	public boolean few2nd = false, noLinear = false, noMorph = false;
	public String clusterFile;

	// output confidence values
	public boolean conf = false;
	public String phraseFormat = "penn"; // tiger | penn
	public boolean average = true;
	public boolean label = false;
	public boolean stack = false;
	public boolean oneRoot = false;

	public String significant1 = null, significant2 = null;

	// horizontal stacking
	public int minLength = 0, maxLength = Integer.MAX_VALUE;
	public boolean overwritegold = false;

	public static final int MULTIPLICATIVE = 1, SHIFT = 2;
	public int featureCreation = MULTIPLICATIVE;

	/**
	 * Parses a full argument array.
	 *
	 * @param args  the command-line arguments
	 * @param dummy unused; only distinguishes this constructor's signature
	 */
	public OptionsSuper(String[] args, String dummy) {

		for (int i = 0; i < args.length; i++) {
			i = addOption(args, i);
		}

	}

	/** Creates an option holder with all defaults; fields may be set directly. */
	public OptionsSuper() {
	}

	/**
	 * Consumes the option starting at position {@code i} of {@code args},
	 * setting the corresponding field(s).
	 *
	 * @param args the command-line arguments
	 * @param i    index of the option name in {@code args}
	 * @return the index of the last argument consumed by this option, so the
	 *         caller's loop increment moves past it
	 */
	public int addOption(String args[], int i) {

		if (args[i].equals("-train")) {
			train = true;
			trainfile = args[i + 1];
			i++; // FIX: advance past the consumed file name, like every other
					// value-taking option below; previously the file name was
					// re-examined (and ignored) as an option on the next pass.
		} else if (args[i].equals("-eval")) {
			eval = true;
			goldfile = args[i + 1];
			i++;
		} else if (args[i].equals("-gout")) {
			gout = args[i + 1];
			i++;
		} else if (args[i].equals("-test")) {
			test = true;
			testfile = args[i + 1];
			i++;
		} else if (args[i].equals("-sig1")) {
			significant1 = args[i + 1];
			i++;
		} else if (args[i].equals("-sig2")) {
			significant2 = args[i + 1];
			i++;
		} else if (args[i].equals("-i")) {
			numIters = Integer.parseInt(args[i + 1]);
			i++;
		} else if (args[i].equals("-out")) {
			outfile = args[i + 1];
			i++;
		} else if (args[i].equals("-cluster")) {
			clusterFile = args[i + 1];
			i++;
		}

		else if (args[i].equals("-count")) {
			count = Integer.parseInt(args[i + 1]);
			i++;
		} else if (args[i].equals("-model")) {
			modelName = args[i + 1];
			i++;
		} else if (args[i].equals("-tmodel")) {
			this.modelTaggerName = args[i + 1];
			i++;
		} else if (args[i].equals("-nonormalize")) {
			normalize = false;
		} else if (args[i].equals("-float")) {
			flt = true;
		} else if (args[i].equals("-hsize")) {
			hsize = Integer.parseInt(args[i + 1]);
			i++;
		} else if (args[i].equals("-charset")) {
			charset = args[++i];
		} else if (args[i].equals("-pstrain")) {
			this.phraseTrain = args[i + 1];
			i++;
		} else if (args[i].equals("-pstest")) {
			this.phraseTest = args[i + 1];
			i++;
		} else if (args[i].equals("-len")) {
			maxLen = Integer.parseInt(args[i + 1]);
			i++;
		} else if (args[i].equals("-cores")) {
			cores = Integer.parseInt(args[i + 1]);
			i++;
		} else if (args[i].equals("-start")) {
			start = Integer.parseInt(args[i + 1]);
			i++;
		} else if (args[i].equals("-max")) {
			maxLength = Integer.parseInt(args[i + 1]);
			i++;
		} else if (args[i].equals("-min")) {
			minLength = Integer.parseInt(args[i + 1]);
			i++;
		} else if (args[i].equals("-noLemmas")) {
			noLemmas = true;
		} else if (args[i].equals("-noavg")) {
			this.average = false;
		} else if (args[i].equals("-label")) {
			label = true;
		} else if (args[i].equals("-stack")) {
			stack = true;
		} else if (args[i].equals("-overwritegold")) {
			overwritegold = true;
		} else if (args[i].equals("-format")) {
			formatTask = Integer.parseInt(args[++i]);
		} else if (args[i].equals("-tt")) {
			tt = Integer.parseInt(args[++i]);
		} else if (args[i].equals("-min-occure-forms")) {
			minOccureForms = Integer.parseInt(args[++i]);
		} else if (args[i].equals("-loadTaggerModels")) {
			this.loadTaggerModels = true; // (removed a stray empty statement)
		} else if (args[i].equals("-feature_creation")) {
			this.featureCreation = args[++i].equals("shift") ? SHIFT : MULTIPLICATIVE;
		}

		return i;

	}

	/** Renders the most important option values for logging. */
	@Override
	public String toString() {
		StringBuilder sb = new StringBuilder();
		sb.append("FLAGS [");
		sb.append("train-file: " + trainfile);
		sb.append(" | ");
		sb.append("test-file: " + testfile);
		sb.append(" | ");
		sb.append("gold-file: " + goldfile);
		sb.append(" | ");
		sb.append("output-file: " + outfile);
		sb.append(" | ");
		sb.append("model-name: " + modelName);
		sb.append(" | ");
		sb.append("train: " + train);
		sb.append(" | ");
		sb.append("test: " + test);
		sb.append(" | ");
		sb.append("eval: " + eval);
		sb.append(" | ");
		sb.append("training-iterations: " + numIters);
		sb.append(" | ");
		sb.append("decode-type: " + decodeProjective);
		sb.append(" | ");
		sb.append("create-forest: " + createForest);
		sb.append(" | ");
		sb.append("format: " + format);

		sb.append("]\n");
		return sb.toString();
	}

}
while (goldInstance != null) { + + int instanceLength = goldInstance.length(); + + if (instanceLength != predInstance.length()) + System.out.println("Lengths do not match on sentence " + numsent); + + int[] goldHeads = goldInstance.heads; + String[] goldLabels = goldInstance.labels; + int[] predHeads = predInstance.pheads; + String[] predLabels = predInstance.plabels; + + boolean whole = true; + boolean wholeL = true; + + // NOTE: the first item is the root info added during + // nextInstance(), so we skip it. + + int punc = 0; + for (int i = 1; i < instanceLength; i++) { + if (predHeads[i] == goldHeads[i]) { + corr++; + + if (goldLabels[i].equals(predLabels[i])) + corrL++; + else { + // System.out.println(numsent+" error gold + // "+goldLabels[i]+" "+predLabels[i]+" head + // "+goldHeads[i]+" child "+i); + wholeL = false; + } + } else { + // System.out.println(numsent+"error gold "+goldLabels[i]+" + // "+predLabels[i]+" head "+goldHeads[i]+" child "+i); + whole = false; + wholeL = false; + } + } + total += ((instanceLength - 1) - punc); // Subtract one to not score + // fake root token + + if (whole) + corrsent++; + if (wholeL) + corrsentL++; + numsent++; + + goldInstance = goldReader.getNext(); + predInstance = predictedReader.getNext(); + } + + Results r = new Results(); + + r.total = total; + r.corr = corr; + r.las = (float) Math.round(((double) corrL / total) * 100000) / 1000; + r.ula = (float) Math.round(((double) corr / total) * 100000) / 1000; + System.out.print("Total: " + total + " \tCorrect: " + corr + " "); + System.out.println("LAS: " + (double) Math.round(((double) corrL / total) * 100000) / 1000 + " \tTotal: " + + (double) Math.round(((double) corrsentL / numsent) * 100000) / 1000 + " \tULA: " + + (double) Math.round(((double) corr / total) * 100000) / 1000 + " \tTotal: " + + (double) Math.round(((double) corrsent / numsent) * 100000) / 1000); + + return r; + } + + public static float round(double v) { + + return Math.round(v * 10000F) / 10000F; + } 
+ +} diff --git a/dependencyParser/experimental/mate-tools/src/is2/util/Split.java b/dependencyParser/experimental/mate-tools/src/is2/util/Split.java new file mode 100755 index 0000000..ea1151b --- /dev/null +++ b/dependencyParser/experimental/mate-tools/src/is2/util/Split.java @@ -0,0 +1,89 @@ +package is2.util; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.Reader; +import java.nio.channels.Channels; +import java.nio.channels.FileChannel; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.util.StringTokenizer; + +public class Split { + + /** + * Splits a tokenized sentences into one word per line format: + * + * Input > I am an text . > Sentence two ... + * + * Output: I _ _ _ ... am _ _ _ ... ... + * + * @param args + * @throws IOException + */ + public static void main(String args[]) throws IOException { + + if (args.length != 1) { + System.out.println("Please provide a file name."); + System.exit(0); + } + + String filename = args[0]; + // Charset charset = Charset.forName("UTF-8"); + + FileInputStream in = new FileInputStream(filename); + FileChannel channel = in.getChannel(); + CharsetDecoder decoder = Charset.defaultCharset().newDecoder();// charset.newDecoder(); + Reader infile = Channels.newReader(channel, decoder, 16 * 1024); + BufferedReader bInfile = new BufferedReader(infile); + + // DataOutputStream dos = new DataOutputStream(new + // BufferedOutputStream(new FileOutputStream(options.modelName))); + + String s; + while ((s = bInfile.readLine()) != null) { + + // do the first tokens contain a colon? + int colon = 0; + for (int k = 0; k < 12; k++) { + if (s.length() <= k) + break; + if (s.charAt(k) == ':') { + + colon++; + break; + } + if (s.charAt(k) == ' ') + break; + } + + String prefix = colon > 0 ? 
/**
 * Splits tokenized sentences (one sentence per line) into a one-token-per-line
 * file, writing a blank line between sentences.
 *
 * Input: "I am an text ." per line. Output: one token per line, sentences
 * separated by an empty line. Reads UTF-8 and writes ISO-8859-1.
 */
public class Split2 {

	/**
	 * @param args args[0] = input file (UTF-8), args[1] = output file
	 *             (ISO-8859-1)
	 * @throws IOException if either file cannot be read or written
	 */
	public static void main(String args[]) throws IOException {

		// FIX: args[1] is dereferenced below, so two arguments are required;
		// the original only rejected an empty argument list.
		if (args.length < 2) {
			System.out.println("Please provide input and output file names.");
			System.exit(0);
		}

		BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(args[0]), "UTF-8"), 32768);
		BufferedWriter write = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[1]), "ISO-8859-1"));

		String s;
		int cnt = 0;
		while ((s = reader.readLine()) != null) {
			StringTokenizer t = new StringTokenizer(s);
			while (t.hasMoreTokens()) {
				String tk = t.nextToken();
				for (int c : tk.toCharArray()) {
					// FIX: the original condition (c < 0 && c >= 255) was always
					// false (a char widened to int is never negative), so the
					// warning about characters that cannot be represented in
					// ISO-8859-1 never fired. Code points above 0xFF are the
					// ones the output charset cannot encode.
					if (c > 255)
						System.out.println("contain sign " + c + " " + cnt);
				}
				write.write(tk);
				write.newLine();
				cnt++;
			}
			// Blank line marks the sentence boundary.
			write.newLine();
		}
		reader.close();
		write.flush();
		write.close();

	}

}
+ * + * @param args + * @throws IOException + */ + public static void main(String args[]) throws IOException { + + if (args.length < 1) { + System.out.println("Please provide a file name."); + System.exit(0); + } + + BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(args[0]), "UTF-8"), 32768); + BufferedWriter write = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[1]), "UTF-8"), + 32768); + + String s; + while ((s = reader.readLine()) != null) { + StringTokenizer t = new StringTokenizer(s); + while (t.hasMoreTokens()) { + String tk = t.nextToken(); + write.write(tk); + write.newLine(); + } + write.newLine(); + } + reader.close(); + write.flush(); + write.close(); + + } + +} diff --git a/dependencyParser/mate-tools/src/is2/data/Parameter.java b/dependencyParser/mate-tools/src/is2/data/Parameter.java deleted file mode 100644 index 7b1f870..0000000 --- a/dependencyParser/mate-tools/src/is2/data/Parameter.java +++ /dev/null @@ -1,13 +0,0 @@ -/** - * - */ -package is2.data; - -/** - * @author Dr. 
Bernd Bohnet, 23.12.2010 - * - * - */ -public class Parameter { - -} diff --git a/dependencyParser/mate-tools/src/is2/data/ParseNBest.java b/dependencyParser/mate-tools/src/is2/data/ParseNBest.java deleted file mode 100644 index cb02b71..0000000 --- a/dependencyParser/mate-tools/src/is2/data/ParseNBest.java +++ /dev/null @@ -1,103 +0,0 @@ -package is2.data; - -final public class ParseNBest extends Parse { - - private String signature = null; - - // public float[] scores; - - public ParseNBest() { - } - - public ParseNBest(short[] heads2, short[] types2, float p_new) { - super(heads2, types2, p_new); - } - - public ParseNBest(int i) { - super(i); - } - - /** - * @param sig - * @param readFloat - */ - public ParseNBest(String sig, float score) { - super(sig, score); - } - - /** - * create a total order to provide replicable deterministic results - * - * @param o - * @return - */ - public int compareTo(ParseNBest o) { - if (f1 < o.f1) - return 1; - if (f1 == o.f1) { - if (signature == null) - signature = signature(); - if (o.signature == null) - o.signature = o.signature(); - return o.signature.compareTo(signature); - - } - return -1; - } - - /** - * @return the signature of a parse - */ - @Override - public String signature() { - if (signature != null) - return signature; - signature = super.signature(); - return signature; - } - - /** - * @return the signature of a parse - */ - public String signature(short[] heads, short[] labels) { - StringBuilder b = new StringBuilder(heads.length * 2); - for (int k = 0; k < heads.length; k++) { - b.append((char) heads[k]).append((char) labels[k]); - } - signature = b.toString(); - return signature; - } - - /** - * @param heads - * @param types - * @param oldP - * @param ch - * @param s - */ - public String signature(short[] heads, short[] types, short p, short ch, short l) { - StringBuilder b = new StringBuilder(heads.length * 2); - for (int k = 0; k < heads.length; k++) { - - b.append(k == ch ? 
(char) p : (char) heads[k]).append(k == ch ? (char) l : (char) types[k]); - } - signature = b.toString(); - return signature; - - } - - @Override - public Parse clone() { - ParseNBest p = new ParseNBest(); - p.heads = new short[heads.length]; - p.labels = new short[labels.length]; - - System.arraycopy(heads, 0, p.heads, 0, heads.length); - System.arraycopy(labels, 0, p.labels, 0, labels.length); - - p.f1 = f1; - - return p; - } - -} diff --git a/dependencyParser/mate-tools/src/is2/data/PipeGen.java b/dependencyParser/mate-tools/src/is2/data/PipeGen.java deleted file mode 100755 index 728666f..0000000 --- a/dependencyParser/mate-tools/src/is2/data/PipeGen.java +++ /dev/null @@ -1,83 +0,0 @@ -package is2.data; - -public class PipeGen { - - public static final String SENSE = "SENSE", POS = "POS", DIST = "DIST", WORD = "WORD", PRED = "PRED", ARG = "ARG", - FEAT = "F", REL = "REL", TYPE = "TYPE", CHAR = "C", FFEATS = "FF", DIR = "DIR", LA = "LA", RA = "RA"; - - public static final String GPOS = "GPOS", MID = "MID", END = "END", STR = "STR", FM = "FM", NOFEAT = "NOFEAT"; - - public static final String _0 = "0", _4 = "4", _3 = "3", _2 = "2", _1 = "1", _5 = "5", _10 = "10"; - - static public int outValue(int num1, int del) { - String out = "" + num1; - StringBuffer delS = new StringBuffer(); - for (int k = 0; k < del; k++) - delS.append('\b'); - del = out.length(); - System.out.print(delS + out); - return del; - } - - static public int outValue(int num1, int del, long last) { - String out = "" + num1 + " (" + (System.currentTimeMillis() - last) / (num1 + 1) + " ms/instance)"; - StringBuffer delS = new StringBuffer(); - for (int k = 0; k < del; k++) - delS.append('\b'); - del = out.length(); - System.out.print(delS + out); - return del; - } - - static public int outValueErr(int num1, float err, float f1, int del, long last) { - - String out = "" + num1 + " (" + (System.currentTimeMillis() - last) / (num1 + 1) + " ms/instance " - + (err / num1) + " err/instance f1=" + f1 + 
") "; - StringBuffer delS = new StringBuffer(); - for (int k = 0; k < del; k++) - delS.append('\b'); - del = out.length(); - System.out.print(delS + out); - return del; - } - - static public int outValueErr(int num1, float err, float f1, int del, long last, double upd) { - String out = "" + num1 + " (" + (System.currentTimeMillis() - last) / (num1 + 1) + " ms/instance " - + (err / num1) + " err/instance f1=" + f1 + ") upd " + upd; - StringBuffer delS = new StringBuffer(); - for (int k = 0; k < del; k++) - delS.append('\b'); - del = out.length(); - System.out.print(delS + out); - return del; - } - - static public int outValueErr(int num1, float err, float f1, int del, long last, double upd, String info) { - String out = "" + num1 + " (" + (System.currentTimeMillis() - last) / (num1 + 1) + " ms/instance " - + (err / num1) + " err/instance f1=" + f1 + ") upd " + upd + " " + info; - StringBuffer delS = new StringBuffer(); - for (int k = 0; k < del; k++) - delS.append('\b'); - del = out.length(); - System.out.print(delS + out); - return del; - } - - /** - * @param cnt - * @param l - * @return - */ - public static String getSecondsPerInstnace(int cnt, long l) { - return " " + (l / (cnt * 1000f)) + " seconds/sentnece "; - } - - /** - * @param l - * @return - */ - public static String getUsedTime(long l) { - return "Used time " + ((l) / 1000f) + " seconds "; - } - -} diff --git a/dependencyParser/mate-tools/src/is2/data/PrimeFinder.java b/dependencyParser/mate-tools/src/is2/data/PrimeFinder.java deleted file mode 100644 index fab0901..0000000 --- a/dependencyParser/mate-tools/src/is2/data/PrimeFinder.java +++ /dev/null @@ -1,51 +0,0 @@ -/** - * - */ -package is2.data; - -import java.util.Arrays; - -/** - * @author Dr. 
Bernd Bohnet, 13.05.2010 - * - * - */ -public class PrimeFinder { - - public PrimeFinder() { - } - - public static final int nextPrime(int desiredCapacity) { - int i = Arrays.binarySearch(primeCapacities, desiredCapacity); - if (i < 0) - i = -i - 1; - return primeCapacities[i]; - } - - public static final int largestPrime = 2147483647; - private static final int primeCapacities[] = { 2147483647, 5, 11, 23, 47, 97, 197, 397, 797, 1597, 3203, 6421, - 12853, 25717, 51437, 102877, 205759, 411527, 823117, 1646237, 3292489, 6584983, 13169977, 26339969, - 52679969, 105359939, 210719881, 421439783, 842879579, 1685759167, 433, 877, 1759, 3527, 7057, 14143, 28289, - 56591, 113189, 226379, 452759, 905551, 1811107, 3622219, 7244441, 14488931, 28977863, 57955739, 115911563, - 231823147, 463646329, 927292699, 1854585413, 953, 1907, 3821, 7643, 15287, 30577, 61169, 122347, 244703, - 489407, 978821, 1957651, 3915341, 7830701, 15661423, 31322867, 62645741, 125291483, 250582987, 501165979, - 1002331963, 2004663929, 1039, 2081, 4177, 8363, 16729, 33461, 66923, 133853, 267713, 535481, 1070981, - 2141977, 4283963, 8567929, 17135863, 34271747, 68543509, 137087021, 274174111, 548348231, 1096696463, 31, - 67, 137, 277, 557, 1117, 2237, 4481, 8963, 17929, 35863, 71741, 143483, 286973, 573953, 1147921, 2295859, - 4591721, 9183457, 18366923, 36733847, 73467739, 146935499, 293871013, 587742049, 1175484103, 599, 1201, - 2411, 4831, 9677, 19373, 38747, 77509, 155027, 310081, 620171, 1240361, 2480729, 4961459, 9922933, 19845871, - 39691759, 79383533, 158767069, 317534141, 635068283, 1270136683, 311, 631, 1277, 2557, 5119, 10243, 20507, - 41017, 82037, 164089, 328213, 656429, 1312867, 2625761, 5251529, 10503061, 21006137, 42012281, 84024581, - 168049163, 336098327, 672196673, 1344393353, 3, 7, 17, 37, 79, 163, 331, 673, 1361, 2729, 5471, 10949, - 21911, 43853, 87719, 175447, 350899, 701819, 1403641, 2807303, 5614657, 11229331, 22458671, 44917381, - 89834777, 179669557, 359339171, 718678369, 
1437356741, 43, 89, 179, 359, 719, 1439, 2879, 5779, 11579, - 23159, 46327, 92657, 185323, 370661, 741337, 1482707, 2965421, 5930887, 11861791, 23723597, 47447201, - 94894427, 189788857, 379577741, 759155483, 1518310967, 379, 761, 1523, 3049, 6101, 12203, 24407, 48817, - 97649, 195311, 390647, 781301, 1562611, 3125257, 6250537, 12501169, 25002389, 50004791, 100009607, - 200019221, 400038451, 800076929, 1600153859 }; - - static { - Arrays.sort(primeCapacities); - } - -} diff --git a/dependencyParser/mate-tools/src/is2/io/TigerReader.java b/dependencyParser/mate-tools/src/is2/io/TigerReader.java deleted file mode 100644 index 10fa0ea..0000000 --- a/dependencyParser/mate-tools/src/is2/io/TigerReader.java +++ /dev/null @@ -1,208 +0,0 @@ -/** - * - */ -package is2.io; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.StringTokenizer; - -import is2.data.PSTree; - -/** - * @author Dr. Bernd Bohnet, 17.01.2011 - * - * Reads a sentences in Penn Tree Bank bracket style and return - * sentences. - */ -public class TigerReader implements PSReader { - - BufferedReader inputReader; - ArrayList<File> psFiles = new ArrayList<File>(); - ArrayList<PSTree> psCache = new ArrayList<PSTree>(); - - String filter[] = null; - int startFilter = -1; - int endFilter = -1; - - public TigerReader() { - } - - public TigerReader(String file) { - - try { - inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "ISO-8859-1"), 32768); - } catch (Exception e) { - e.printStackTrace(); - } - } - - /** - * @param ps - */ - @Override - public void startReading(String file, String[] filter) { - - try { - this.filter = filter; - startFilter = filter == null ? -1 : 1; - endFilter = filter == null ? 
-1 : 1; - - inputReader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "ISO-8859-1"), 32768); - } catch (Exception e) { - e.printStackTrace(); - } - - } - - public static class Line { - String form; - String lemma; - String morph; - String pos; - int parent; - String edge; - - } - - static int stop = 0; - - /** - * @return - */ - @Override - public PSTree getNext() { - - PSTree ps = null; - String l = null; - ArrayList<Line> lines = new ArrayList<Line>(); - try { - int state = 1, terminals = 0, nonterminals = 0; - while ((l = inputReader.readLine()) != null) { - - if (startFilter == 1 && l.startsWith("#BOS " + filter[0])) { - System.out.println("found start " + l); - startFilter = 2; - } - if (endFilter == 1 && l.startsWith("#EOS " + filter[1])) { - System.out.println("found end " + l); - - endFilter = 2; - } - - if (startFilter == 1 || endFilter == 2) - continue; - - if (l.startsWith("#BOS")) { - - state = 2; - continue; - } - if (l.startsWith("#500")) - state = 3; - if (l.startsWith("#EOS")) - state = 4; - if (state < 2) - continue; - - if (state == 4) { - - ps = new PSTree(); - ps.create(terminals, nonterminals); - // System.out.println("terminals "+terminals); - // build ps tree - - int cnt = 0; - // ps.entries[0] =CONLLReader09.ROOT; - // ps.head[0]=-1; - int root = -1; - for (Line line : lines) { - - /* - * if (cnt==terminals) { // insert root root =cnt; - * cnt++; } - */ - ps.entries[cnt] = line.form; - if (cnt < terminals) - ps.pos[cnt] = line.pos; - else - ps.entries[cnt] = line.pos; - ps.lemmas[cnt] = line.lemma; - ps.head[cnt] = line.parent == 0 ? lines.size() - 1 - : line.parent >= 500 ? 
line.parent - 500 + terminals : line.parent; - // ps.head[cnt] = - // line.parent==0?lines.size()-1:line.parent>=500?line.parent-500+terminals:line.parent; - ps.morph[cnt] = line.morph; - cnt++; - - } - - if (root == -1) - root = terminals; - ps.head[cnt - 1] = 0; // root - ps.terminalCount = terminals; - lines.clear(); - state = 1; - - /* - * for(int k=0;k<ps.head.length;k++) { if - * (ps.head[k]<terminals && k!=root) { ps.head[k]=root; // - * DB.println("error "+k+" "+ps.head[k]); } } - */ - // System.out.println(""+ps.toString()); - // if (stop++ == 4)System.exit(0); - return ps; - } - - StringTokenizer t = new StringTokenizer(l, "\t"); - int tc = 0; - Line line = new Line(); - lines.add(line); - while (t.hasMoreTokens()) { - String token = t.nextToken(); - if (token.equals("\t")) - continue; - if (tc == 0) { - if (token.startsWith("#5") || token.startsWith("#6")) { - nonterminals++; - - } else { - terminals++; - - // change it back to the wrong format since the - // conll stuff was derived from this. 
- // if (token.equals("durchblicken")) - // token="durchblikken"; - line.form = token; - } - - } else if (tc == 1) { - line.lemma = token; - } else if (tc == 2) { - line.pos = token; - } else if (tc == 3) { - line.morph = token; - } else if (tc == 4) { - line.edge = token; - } else if (tc == 5) { - line.parent = Integer.parseInt(token); - } - - if (token.length() > 0) - tc++; - } - - // read till #EOS - - } - } catch (Exception e) { - e.printStackTrace(); - } - return ps; - - } - -} diff --git a/dependencyParser/mate-tools/src/is2/lemmatizer/Evaluator.java b/dependencyParser/mate-tools/src/is2/lemmatizer/Evaluator.java deleted file mode 100755 index cc1b423..0000000 --- a/dependencyParser/mate-tools/src/is2/lemmatizer/Evaluator.java +++ /dev/null @@ -1,108 +0,0 @@ -package is2.lemmatizer; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.Hashtable; -import java.util.Map.Entry; - -import is2.data.SentenceData09; -import is2.io.CONLLReader09; - -public class Evaluator { - - public static void evaluate(String act_file, String pred_file, String format) throws Exception { - - CONLLReader09 goldReader = new CONLLReader09(act_file, CONLLReader09.NO_NORMALIZE); - CONLLReader09 predictedReader = new CONLLReader09(pred_file, CONLLReader09.NO_NORMALIZE); - // predictedReader.startReading(pred_file); - - Hashtable<String, Integer> errors = new Hashtable<String, Integer>(); - - int total = 0, corrL = 0, corrT = 0; - int numsent = 0; - SentenceData09 goldInstance = goldReader.getNext(); - SentenceData09 predInstance = predictedReader.getNext(); - - while (goldInstance != null) { - - int instanceLength = goldInstance.length(); - - if (instanceLength != predInstance.length()) - System.out.println("Lengths do not match on sentence " + numsent); - - String gold[] = goldInstance.lemmas; - String pred[] = predInstance.plemmas; - - boolean whole = true; - boolean wholeL = true; - - // NOTE: the first item is the root info 
added during - // nextInstance(), so we skip it. - - for (int i = 1; i < instanceLength; i++) { - if (gold[i].toLowerCase().equals(pred[i].toLowerCase())) - corrT++; - - if (gold[i].equals(pred[i])) - corrL++; - else { - - // System.out.println("error gold:"+goldPos[i]+" - // pred:"+predPos[i]+" "+goldInstance.forms[i]+" snt - // "+numsent+" i:"+i); - String key = "gold: '" + gold[i] + "' pred: '" + pred[i] + "'"; - Integer cnt = errors.get(key); - if (cnt == null) { - errors.put(key, 1); - } else { - errors.put(key, cnt + 1); - } - } - - } - total += instanceLength - 1; // Subtract one to not score fake root - // token - - if (whole) { - } - if (wholeL) { - } - numsent++; - - goldInstance = goldReader.getNext(); - predInstance = predictedReader.getNext(); - } - ArrayList<Entry<String, Integer>> opsl = new ArrayList<Entry<String, Integer>>(); - for (Entry<String, Integer> e : errors.entrySet()) { - opsl.add(e); - } - - Collections.sort(opsl, new Comparator<Entry<String, Integer>>() { - - @Override - public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) { - - return o1.getValue() == o2.getValue() ? 0 : o1.getValue() > o2.getValue() ? 
1 : -1; - } - - }); - - /* - * for(Entry<String, Integer> e : opsl) { - * System.out.println(e.getKey()+" "+e.getValue()); } - */ - - System.out.println("Tokens: " + total + " Correct: " + corrT + " " + (float) corrT / total - + " correct uppercase " + (float) corrL / total); - } - - public static void main(String[] args) throws Exception { - String format = "CONLL"; - if (args.length > 2) - format = args[2]; - - evaluate(args[0], args[1], format); - } - -} diff --git a/dependencyParser/mate-tools/src/is2/lemmatizer/Options.java b/dependencyParser/mate-tools/src/is2/lemmatizer/Options.java deleted file mode 100755 index 30c2567..0000000 --- a/dependencyParser/mate-tools/src/is2/lemmatizer/Options.java +++ /dev/null @@ -1,72 +0,0 @@ -package is2.lemmatizer; - -import java.io.File; -import java.io.IOException; - -import is2.util.OptionsSuper; - -public final class Options extends OptionsSuper { - - public Options(String[] args) throws IOException { - - for (int i = 0; i < args.length; i++) { - - if (args[i].equals("--help")) - explain(); - - if (args[i].equals("-normalize")) { - normalize = Boolean.parseBoolean(args[++i]); - } else if (args[i].equals("-features")) { - features = args[i + 1]; - i++; - } else if (args[i].equals("-hsize")) { - hsize = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-len")) { - maxLen = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-tmp")) { - tmp = args[i + 1]; - i++; - } else if (args[i].equals("-uc")) { - upper = true; - System.out.println("set uppercase " + upper); - - } else - super.addOption(args, i); - - } - - if (trainfile != null) { - - if (tmp != null) - trainforest = File.createTempFile("train", ".tmp", new File(tmp)); - else - trainforest = File.createTempFile("train", ".tmp"); // ,new - // File("F:\\") - trainforest.deleteOnExit(); - } - - } - - private void explain() { - System.out.println("Usage: "); - System.out.println("java -class mate.jar is2.lemmatizer.Lemmatizer [Options]"); - 
System.out.println(); - System.out.println("Options:"); - System.out.println(""); - System.out.println(" -train <file> the corpus a model is trained on; default " + this.trainfile); - System.out.println(" -test <file> the input corpus for testing; default " + this.testfile); - System.out.println(" -out <file> the output corpus (result) of a test run; default " + this.outfile); - System.out.println(" -model <file> the parsing model for traing the model is stored in the files"); - System.out.println( - " and for parsing the model is load from this file; default " + this.modelName); - System.out.println( - " -i <number> the number of training iterations; good numbers are 10 for smaller corpora and 6 for bigger; default " - + this.numIters); - System.out.println(" -count <number> the n first sentences of the corpus are take for the training default " - + this.count); - - System.exit(0); - } -} diff --git a/dependencyParser/mate-tools/src/is2/mtag/Convert.java b/dependencyParser/mate-tools/src/is2/mtag/Convert.java deleted file mode 100755 index 05b0741..0000000 --- a/dependencyParser/mate-tools/src/is2/mtag/Convert.java +++ /dev/null @@ -1,99 +0,0 @@ -/** - * - */ -package is2.mtag; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.util.ArrayList; - -/** - * @author Dr. 
Bernd Bohnet, 20.01.2010 - * - * - */ -public class Convert { - - public static void main(String[] args) throws IOException { - - Options options = new Options(args); - - split(options.trainfile); - - } - - /** - * @param trainfile - * @throws IOException - */ - private static void split(String trainfile) throws IOException { - - String dir = "split"; - boolean success = (new File("split")).mkdir(); - if (success) - System.out.println("Directory: " + dir + " created"); - - ArrayList<String> corpus = new ArrayList<String>(); - - BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(trainfile), "UTF-8"), - 32768); - String l = null; - int sentences = 0; - try { - while ((l = reader.readLine()) != null) { - - corpus.add(l); - if (l.length() < 8) - sentences++; - - } - } catch (IOException e) { - e.printStackTrace(); - } - System.out.println("Corpus has " + sentences + " sentences."); - - int partSize = sentences / 20; - System.out.println("Prepare corpus for cross annotations with 20 parts with part size " + partSize - + " number of lines " + corpus.size()); - - for (int k = 0; k < 20; k++) { - BufferedWriter br = new BufferedWriter( - new OutputStreamWriter(new FileOutputStream("split/p-" + k), "UTF-8")); - BufferedWriter rest = new BufferedWriter( - new OutputStreamWriter(new FileOutputStream("split/r-" + k), "UTF-8")); - int skip = k * partSize; - - int countSentences = 0; - int countSentencesWrote = 0; - System.out.println("skip from " + skip + " to " + (skip + partSize - 1)); - for (String x : corpus) { - if (countSentences >= skip && (countSentences < (skip + partSize) || k == 19)) { - rest.write(x); - rest.newLine(); - if (x.length() < 8) - countSentencesWrote++; - } else { - br.write(x); - br.newLine(); - } - - if (x.length() < 8) - countSentences++; - } - System.out.println("wrote for this part " + countSentencesWrote); - br.flush(); - br.close(); - rest.flush(); - rest.close(); - - } - - } - -} diff --git 
a/dependencyParser/mate-tools/src/is2/mtag/Evaluator.java b/dependencyParser/mate-tools/src/is2/mtag/Evaluator.java deleted file mode 100755 index 16c7bba..0000000 --- a/dependencyParser/mate-tools/src/is2/mtag/Evaluator.java +++ /dev/null @@ -1,149 +0,0 @@ -package is2.mtag; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.Hashtable; -import java.util.Map.Entry; - -import is2.data.SentenceData09; -import is2.io.CONLLReader09; - -public class Evaluator { - - public static void evaluate(String act_file, String pred_file, String format) throws Exception { - - CONLLReader09 goldReader = new CONLLReader09(act_file);// DependencyReader.createDependencyReader(); - // boolean labeled = goldReader.startReading(act_file); - - CONLLReader09 predictedReader = new CONLLReader09(); - predictedReader.startReading(pred_file); - - // if (labeled != predLabeled) - // System.out.println("Gold file and predicted file appear to differ on - // whether or not they are labeled. Expect problems!!!"); - - int total = 0, totalP = 0, corrT = 0; - int totalD = 0, corrD = 0, err = 0; - int numsent = 0; - SentenceData09 goldInstance = goldReader.getNext(); - SentenceData09 predInstance = predictedReader.getNext(); - - Hashtable<String, Integer> errors = new Hashtable<String, Integer>(); - Hashtable<String, StringBuffer> words = new Hashtable<String, StringBuffer>(); - - while (goldInstance != null) { - - int instanceLength = goldInstance.length(); - - if (instanceLength != predInstance.length()) - System.out.println("Lengths do not match on sentence " + numsent); - - String gold[] = goldInstance.ofeats; - String pred[] = predInstance.pfeats; - - boolean whole = true; - boolean wholeL = true; - - // NOTE: the first item is the root info added during - // nextInstance(), so we skip it. 
- - for (int i = 1; i < instanceLength; i++) { - if (gold[i].equals(pred[i]) || (gold[i].equals("_") && pred[i] == null)) - corrT++; - else { - // System.out.println("gold:"+goldFeats[i]+" - // pred:"+predFeats[i]+" "+goldInstance.forms[i]+" snt - // "+numsent+" i:"+i); - // for (int k = 1; k < instanceLength; k++) { - - // System.out.print(goldInstance.forms[k]+":"+goldInstance.gpos[k]); - // if (k==i) System.out.print(":"+predInstance.gpos[k]); - // System.out.print(" "); - - // } - // System.out.println(); - String key = "gold: '" + gold[i] + "' pred: '" + pred[i] + "'"; - Integer cnt = errors.get(key); - StringBuffer errWrd = words.get(key); - if (cnt == null) { - errors.put(key, 1); - words.put(key, new StringBuffer().append(goldInstance.forms[i])); - } else { - errors.put(key, cnt + 1); - errWrd.append(" " + goldInstance.forms[i]); - } - err++; - - } - String[] gf = gold[i].split("|"); - int eq = 0; - - if (pred[i] != null) { - String[] pf = pred[i].split("|"); - totalP += pf.length; - - if (pf.length > gf.length) { - } else { - } - - for (String g : gf) { - for (String p : pf) { - if (g.equals(p)) { - eq++; - break; - } - } - } - } else { - } - totalD += gf.length; - corrD += eq; - } - total += instanceLength - 1; // Subtract one to not score fake root - // token - - if (whole) { - } - if (wholeL) { - } - numsent++; - - goldInstance = goldReader.getNext(); - predInstance = predictedReader.getNext(); - } - - ArrayList<Entry<String, Integer>> opsl = new ArrayList<Entry<String, Integer>>(); - for (Entry<String, Integer> e : errors.entrySet()) { - opsl.add(e); - } - - Collections.sort(opsl, new Comparator<Entry<String, Integer>>() { - - @Override - public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) { - - return o1.getValue() == o2.getValue() ? 0 : o1.getValue() > o2.getValue() ? 
-1 : 1; - } - - }); - - System.out.println("10 top most errors:"); - - System.out.println("Tokens: " + total + " Correct: " + corrT + " " + (float) corrT / total + " R " - + ((float) corrD / totalD) + " tP " + totalP + " tG " + totalD + " P " + (float) corrD / totalP); - System.out.println("err: " + err + " total " + total + " corr " + corrT); - // System.out.println("Unlabeled Complete Correct: " + - // ((double)corrsent/numsent)); - - } - - public static void main(String[] args) throws Exception { - String format = "CONLL"; - if (args.length > 2) - format = args[2]; - - evaluate(args[0], args[1], format); - } - -} diff --git a/dependencyParser/mate-tools/src/is2/mtag/Options.java b/dependencyParser/mate-tools/src/is2/mtag/Options.java deleted file mode 100755 index 20969ff..0000000 --- a/dependencyParser/mate-tools/src/is2/mtag/Options.java +++ /dev/null @@ -1,54 +0,0 @@ -package is2.mtag; - -import is2.util.OptionsSuper; - -public final class Options extends OptionsSuper { - - public Options(String[] args) { - - for (int i = 0; i < args.length; i++) { - - if (args[i].equals("--help")) - explain(); - - if (args[i].equals("-nonormalize")) { - normalize = false; - } else if (args[i].equals("-features")) { - features = args[i + 1]; - i++; - } else if (args[i].equals("-hsize")) { - hsize = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-len")) { - maxLen = Integer.parseInt(args[i + 1]); - i++; - } else - super.addOption(args, i); - } - } - - private void explain() { - System.out.println("Usage: "); - System.out.println("java -cp anna.jar is2.mtag.Tagger [Options]"); - System.out.println(); - System.out.println("Example: "); - System.out.println( - " java -cp mate.jar is2.mtag.Tagger -model eps3.model -train corpora/conll08st/train/train.closed -test corpora/conll08st/devel/devel.closed -out b3.test -eval corpora/conll08st/devel/devel.closed -count 2000 -i 6"); - System.out.println(""); - System.out.println("Options:"); - System.out.println(""); - 
System.out.println(" -train <file> the corpus a model is trained on; default " + this.trainfile); - System.out.println(" -test <file> the input corpus for testing; default " + this.testfile); - System.out.println(" -out <file> the output corpus (result) of a test run; default " + this.outfile); - System.out.println(" -model <file> the parsing model for traing the model is stored in the files"); - System.out.println( - " and for parsing the model is load from this file; default " + this.modelName); - System.out.println( - " -i <number> the number of training iterations; good numbers are 10 for smaller corpora and 6 for bigger; default " - + this.numIters); - System.out.println(" -count <number> the n first sentences of the corpus are take for the training default " - + this.count); - - System.exit(0); - } -} diff --git a/dependencyParser/mate-tools/src/is2/parser/D5.java b/dependencyParser/mate-tools/src/is2/parser/D5.java deleted file mode 100644 index 407b4e1..0000000 --- a/dependencyParser/mate-tools/src/is2/parser/D5.java +++ /dev/null @@ -1,293 +0,0 @@ -/** - * - */ -package is2.parser; - -import is2.data.DX; -import is2.data.IFV; -import is2.data.Long2IntInterface; - -/** - * @author Dr. 
Bernd Bohnet, 30.10.2010 - * - * - */ -final public class D5 extends DX { - - public long shift; - private long h; - - /* - * (non-Javadoc) - * - * @see is2.parser52L.DX#cz2() - */ - final public void cz2() { - - if (v0 < 0 || v1 < 0) { - shift = 0; - h = -1; - return; - } - - h = v0 | v1 << (shift = a0); - shift += a1; - - } - - /* - * (non-Javadoc) - * - * @see is2.parser52L.DX#cz3() - */ - @Override - final public void cz3() { - - if (v0 < 0 || v1 < 0 || v2 < 0) { - shift = 0; - h = -1; - return; - - } - - h = v0 | v1 << (shift = a0) | v2 << (shift += a1); - shift = shift + a2; - - } - - /* - * (non-Javadoc) - * - * @see is2.parser52L.DX#cz4() - */ - @Override - final public void cz4() { - if (v0 < 0 || v1 < 0 || v2 < 0 || v3 < 0) { - shift = 0; - h = -1; - return; - } - - h = v0 | v1 << (shift = a0) | v2 << (shift += a1) | v3 << (shift += a2); - shift = shift + a3; - - } - - /* - * (non-Javadoc) - * - * @see is2.parser52L.DX#cz5() - */ - @Override - final public void cz5() { - - if (v0 < 0 || v1 < 0 || v2 < 0 || v3 < 0 || v4 < 0) { - shift = 0; - h = -1; - return; - } - - h = v0 | v1 << (shift = a0) | v2 << (shift += a1) | v3 << (shift += a2) | v4 << (shift += a3); - shift = shift + a4; - - } - - /* - * (non-Javadoc) - * - * @see is2.parser52L.DX#cz6() - */ - @Override - final public void cz6() { - - if (v0 < 0 || v1 < 0 || v2 < 0 || v3 < 0 || v4 < 0 || v5 < 0) { - shift = 0; - h = -1; - return; - } - - h = v0 | v1 << (shift = a0) | v2 << (shift += a1) | v3 << (shift += a2) | v4 << (shift += a3) - | v5 << (shift += a4); - shift = shift + a5; - - } - - /* - * (non-Javadoc) - * - * @see is2.parser52L.DX#cz7() - */ - @Override - final public void cz7() { - - if (v0 < 0 || v1 < 0 || v2 < 0 || v3 < 0 || v4 < 0 || v5 < 0 || v6 < 0) { - shift = 0; - h = -1; - return; - } - - h = v0 | v1 << (shift = a0) | v2 << (shift += a1) | v3 << (shift += a2) | v4 << (shift += a3) - | v5 << (shift += a4) | v6 << (shift += a5); - shift = shift + a6; - - } - - /* - * (non-Javadoc) - 
* - * @see is2.parser52L.DX#cz8() - */ - @Override - final public void cz8() { - - if (v0 < 0 || v1 < 0 || v2 < 0 || v3 < 0 || v4 < 0 || v5 < 0 || v6 < 0 || v7 < 0) { - h = -1; - shift = 0; - return; - } - - h = v0 | v1 << (shift = a0) | v2 << (shift += a1) | v3 << (shift += a2) | v4 << (shift += a3) - | v5 << (shift += a4) | v6 << (shift += a5) | v7 << (shift += a6); - shift = shift + a7; - - } - - /* - * (non-Javadoc) - * - * @see is2.parser52L.DX#clean() - */ - @Override - final public void clean() { - v0 = 0; - v1 = 0; - v2 = 0; - v3 = 0; - v4 = 0; - v5 = 0; - v6 = 0; - v7 = 0; - v8 = 0; - shift = 0; - h = 0; - } - - public final Long2IntInterface _li; - - public D5(Long2IntInterface li) { - _li = li; - } - - /* - * (non-Javadoc) - * - * @see is2.parser52L.DX#cs(int, int) - */ - @Override - final public long cs(int b, int v) { - if (h < 0) { - h = -1; - shift = 0; - return -1; - } - - h |= (long) v << shift; - shift += b; - if (shift > 64) { - System.out.println("shift too large " + shift); - new Exception().printStackTrace(); - } - - return h; - - } - - /* - * (non-Javadoc) - * - * @see is2.parser52L.DX#csa(int, int) - */ - @Override - final public long csa(int b, int v) { - if (h < 0) { - h = -1; - shift = 0; - return -1; - } - - h |= (long) v << shift; - shift += b; - if (shift > 64) { - System.out.println("shift too large " + shift); - new Exception().printStackTrace(); - } - - return h; - - } - - /* - * (non-Javadoc) - * - * @see is2.parser52L.DX#csa(int, int, is2.data.IFV) - */ - @Override - final public void csa(int b, int v, IFV f) { - if (h < 0) { - h = -1; - shift = 0; - return; - } - - h |= (long) v << shift; - shift += b; - if (shift > 64) { - System.out.println("shift too large " + shift); - new Exception().printStackTrace(); - } - - f.add(_li.l2i(h)); - } - - /* - * (non-Javadoc) - * - * @see is2.parser52L.DX#getVal() - */ - @Override - public long getVal() { - if (h < 0) { - h = -1; - shift = 0; - return h; - } - return h; - } - - /* - * 
(non-Javadoc) - * - * @see is2.parser52L.DX#map(is2.data.IFV, long) - */ - @Override - public void map(IFV f, long l) { - if (l > 0) - f.add(_li.l2i(l)); - } - - /* - * (non-Javadoc) - * - * @see is2.data.DX#computeLabeValue(short, short) - */ - @Override - public int computeLabeValue(int label, int shift) { - return label << shift; - } - - @Override - public void fix() { - - } - -} \ No newline at end of file diff --git a/dependencyParser/mate-tools/src/is2/parser/Decoder.java b/dependencyParser/mate-tools/src/is2/parser/Decoder.java deleted file mode 100755 index 1fe2340..0000000 --- a/dependencyParser/mate-tools/src/is2/parser/Decoder.java +++ /dev/null @@ -1,243 +0,0 @@ -package is2.parser; - -import java.util.ArrayList; -import java.util.concurrent.ExecutorService; - -import is2.data.DataFES; -import is2.data.Parse; - -/** - * @author Bernd Bohnet, 01.09.2009 - * - * This methods do the actual work and they build the dependency trees. - */ -final public class Decoder { - - public static final boolean TRAINING = true; - public static long timeDecotder; - public static long timeRearrange; - - /** - * Threshold for rearrange edges non-projective - */ - public static float NON_PROJECTIVITY_THRESHOLD = 0.3F; - - static ExecutorService executerService = java.util.concurrent.Executors.newFixedThreadPool(Parser.THREADS); - - // do not initialize - private Decoder() { - }; - - /** - * Build a dependency tree based on the data - * - * @param pos - * part-of-speech tags - * @param x - * the data - * @param projective - * projective or non-projective - * @param edges - * the edges - * @return a parse tree - * @throws InterruptedException - */ - public static Parse decode(short[] pos, DataFES x, boolean projective, boolean training) - throws InterruptedException { - - long ts = System.nanoTime(); - - if (executerService.isShutdown()) - executerService = java.util.concurrent.Executors.newCachedThreadPool(); - final int n = pos.length; - - final Open O[][][][] = new 
Open[n][n][2][]; - final Closed C[][][][] = new Closed[n][n][2][]; - - ArrayList<ParallelDecoder> pe = new ArrayList<ParallelDecoder>(); - - for (int i = 0; i < Parser.THREADS; i++) - pe.add(new ParallelDecoder(pos, x, O, C, n)); - - for (short k = 1; k < n; k++) { - - // provide the threads the data - for (short s = 0; s < n; s++) { - short t = (short) (s + k); - if (t >= n) - break; - - ParallelDecoder.add(s, t); - } - - executerService.invokeAll(pe); - } - - float bestSpanScore = (-1.0F / 0.0F); - Closed bestSpan = null; - for (int m = 1; m < n; m++) - if (C[0][n - 1][1][m].p > bestSpanScore) { - bestSpanScore = C[0][n - 1][1][m].p; - bestSpan = C[0][n - 1][1][m]; - } - - // build the dependency tree from the chart - Parse out = new Parse(pos.length); - - bestSpan.create(out); - - out.heads[0] = -1; - out.labels[0] = 0; - - timeDecotder += (System.nanoTime() - ts); - - ts = System.nanoTime(); - - if (!projective) - rearrange(pos, out.heads, out.labels, x, training); - - timeRearrange += (System.nanoTime() - ts); - - return out; - } - - public static Parse[] decodeAll(short[] pos, DataFES x, boolean projective, boolean training) - throws InterruptedException { - - long ts = System.nanoTime(); - - if (executerService.isShutdown()) - executerService = java.util.concurrent.Executors.newCachedThreadPool(); - final int n = pos.length; - - final Open O[][][][] = new Open[n][n][2][]; - final Closed C[][][][] = new Closed[n][n][2][]; - - ArrayList<ParallelDecoder> pe = new ArrayList<ParallelDecoder>(); - - for (int i = 0; i < Parser.THREADS; i++) - pe.add(new ParallelDecoder(pos, x, O, C, n)); - - for (short k = 1; k < n; k++) { - - // provide the threads the data - for (short s = 0; s < n; s++) { - short t = (short) (s + k); - if (t >= n) - break; - - ParallelDecoder.add(s, t); - } - - executerService.invokeAll(pe); - } - - Parse[] out = new Parse[n - 1]; - - // float bestSpanScore = (-1.0F / 0.0F); - // Closed bestSpan = null; - for (int m = 1; m < n; m++) { - // if 
(C[0][n - 1][1][m].p > bestSpanScore) { - // bestSpanScore = C[0][n - 1][1][m].p; - // bestSpan = C[0][n - 1][1][m]; - // } - out[m - 1] = new Parse(pos.length); - C[0][n - 1][1][m].create(out[m - 1]); - out[m - 1].heads[0] = -1; - out[m - 1].labels[0] = 0; - } - - // build the dependency tree from the chart - // Parse out= new Parse(pos.length); - - // bestSpan.create(out); - - // out.heads[0]=-1; - // out.labels[0]=0; - - timeDecotder += (System.nanoTime() - ts); - - ts = System.nanoTime(); - - if (!projective) - for (Parse p : out) - rearrange(pos, p.heads, p.labels, x, training); - // if (!projective) rearrange(pos, out.heads, out.labels,x,training); - - timeRearrange += (System.nanoTime() - ts); - - return out; - } - - /** - * This is the parallel non-projective edge re-arranger - * - * @param pos - * part-of-speech tags - * @param heads - * parent child relation - * @param labs - * edge labels - * @param x - * the data - * @param edges - * the existing edges defined by part-of-speech tags - * @throws InterruptedException - */ - public static void rearrange(short[] pos, short[] heads, short[] labs, DataFES x, boolean training) - throws InterruptedException { - - int threads = (pos.length > Parser.THREADS) ? 
Parser.THREADS : pos.length; - - // wh what to change, nPar - new parent, nType - new type - short wh = -1, nPar = -1, nType = -1; - ArrayList<ParallelRearrange> pe = new ArrayList<ParallelRearrange>(); - - while (true) { - boolean[][] isChild = new boolean[heads.length][heads.length]; - for (int i = 1, l1 = 1; i < heads.length; i++, l1 = i) - while ((l1 = heads[l1]) != -1) - isChild[l1][i] = true; - - float max = Float.NEGATIVE_INFINITY; - float p = Extractor.encode3(pos, heads, labs, x); - - pe.clear(); - for (int i = 0; i < threads; i++) - pe.add(new ParallelRearrange(isChild, pos, x, heads, labs)); - - for (int ch = 1; ch < heads.length; ch++) { - - for (short pa = 0; pa < heads.length; pa++) { - if (ch == pa || pa == heads[ch] || isChild[ch][pa]) - continue; - - ParallelRearrange.add(p, (short) ch, pa); - } - } - executerService.invokeAll(pe); - - for (ParallelRearrange.PA rp : ParallelRearrange.order) - if (max < rp.max) { - max = rp.max; - wh = rp.wh; - nPar = rp.nPar; - nType = rp.nType; - } - ParallelRearrange.order.clear(); - - if (max <= NON_PROJECTIVITY_THRESHOLD) - break; // bb: changed from 0.0 - - heads[wh] = nPar; - labs[wh] = nType; - - } - } - - public static String getInfo() { - - return "Decoder non-projectivity threshold: " + NON_PROJECTIVITY_THRESHOLD; - } - -} diff --git a/dependencyParser/mate-tools/src/is2/parser/Edges.java b/dependencyParser/mate-tools/src/is2/parser/Edges.java deleted file mode 100644 index 39a0190..0000000 --- a/dependencyParser/mate-tools/src/is2/parser/Edges.java +++ /dev/null @@ -1,208 +0,0 @@ -/** - * - */ -package is2.parser; - -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashMap; -import java.util.Map.Entry; - -/** - * @author Dr. 
Bernd Bohnet, 13.05.2009; - * - * - */ -public final class Edges { - - private static short[][][] edges; - private static HashMap<Short, Integer> labelCount = new HashMap<Short, Integer>(); - - private static HashMap<String, Integer> slabelCount = new HashMap<String, Integer>(); - - static short[] def = new short[1]; - - private Edges() { - } - - /** - * @param length - */ - public static void init(int length) { - edges = new short[length][length][]; - } - - public static void findDefault() { - - int best = 0; - - for (Entry<Short, Integer> e : labelCount.entrySet()) { - - if (best < e.getValue()) { - best = e.getValue(); - def[0] = e.getKey(); - } - } - - // labelCount=null; - // String[] types = new String[mf.getFeatureCounter().get(PipeGen.REL)]; - // for (Entry<String, Integer> e : - // MFO.getFeatureSet().get(PipeGen.REL).entrySet()) types[e.getValue()] - // = e.getKey(); - - is2.util.DB.println("set default label to " + def[0] + " "); - - // System.out.println("found default "+def[0]); - - } - - final static public void put(int pos1, int pos2, short label) { - putD(pos1, pos2, label); - // putD(pos2, pos1,!dir, label); - } - - final static public void putD(int pos1, int pos2, short label) { - - Integer lc = labelCount.get(label); - if (lc == null) - labelCount.put(label, 1); - else - labelCount.put(label, lc + 1); - - String key = pos1 + "-" + pos2 + label; - Integer lcs = slabelCount.get(key); - if (lcs == null) - slabelCount.put(key, 1); - else - slabelCount.put(key, lcs + 1); - - if (edges[pos1][pos2] == null) { - edges[pos1][pos2] = new short[1]; - edges[pos1][pos2][0] = label; - - // edgesh[pos1][pos2][dir?0:1] = new TIntHashSet(2); - // edgesh[pos1][pos2][dir?0:1].add(label); - } else { - short labels[] = edges[pos1][pos2]; - for (short l : labels) { - // contains label already? 
- if (l == label) - return; - } - - short[] nlabels = new short[labels.length + 1]; - System.arraycopy(labels, 0, nlabels, 0, labels.length); - nlabels[labels.length] = label; - edges[pos1][pos2] = nlabels; - - // edgesh[pos1][pos2][dir?0:1].add(label); - } - } - - final static public short[] get(int pos1, int pos2) { - - if (pos1 < 0 || pos2 < 0 || edges[pos1][pos2] == null) - return def; - return edges[pos1][pos2]; - } - - /** - * @param dis - */ - static public void write(DataOutputStream d) throws IOException { - - int len = edges.length; - d.writeShort(len); - - for (int p1 = 0; p1 < len; p1++) { - for (int p2 = 0; p2 < len; p2++) { - if (edges[p1][p2] == null) - d.writeShort(0); - else { - d.writeShort(edges[p1][p2].length); - for (int l = 0; l < edges[p1][p2].length; l++) { - d.writeShort(edges[p1][p2][l]); - } - - } - } - } - - d.writeShort(def[0]); - - } - - /** - * @param dis - */ - public static void read(DataInputStream d) throws IOException { - int len = d.readShort(); - - edges = new short[len][len][]; - for (int p1 = 0; p1 < len; p1++) { - for (int p2 = 0; p2 < len; p2++) { - int ll = d.readShort(); - if (ll == 0) { - edges[p1][p2] = null; - } else { - edges[p1][p2] = new short[ll]; - for (int l = 0; l < ll; l++) { - edges[p1][p2][l] = d.readShort(); - } - } - } - } - - def[0] = d.readShort(); - - } - - public static void print() { - for(int i = 0; i < edges.length; ++i) - for(int j = 0; j < edges[i].length; ++j) - if(edges[i][j] != null) - System.out.println("edges[" + i + "][" + j + "] = " + Arrays.toString(edges[i][j])); - - assert def.length == 0; - System.out.println("def = [" + def[0] + "]"); - } - - public static class C implements Comparator<Short> { - - public C() { - super(); - } - - String _key; - - public C(String key) { - super(); - _key = key; - } - - /* - * (non-Javadoc) - * - * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) - */ - @Override - public int compare(Short l1, Short l2) { - - // int c1 = 
labelCount.get(l1); - // int c2 = labelCount.get(l2); - // if (true) return c1==c2?0:c1>c2?-1:1; - - int x1 = slabelCount.get(_key + l1.shortValue()); - int x2 = slabelCount.get(_key + l2.shortValue()); - // System.out.println(x1+" "+x2); - - return x1 == x2 ? 0 : x1 > x2 ? -1 : 1; - - } - - } - -} diff --git a/dependencyParser/mate-tools/src/is2/parser/Evaluator.java b/dependencyParser/mate-tools/src/is2/parser/Evaluator.java deleted file mode 100755 index f0d45ec..0000000 --- a/dependencyParser/mate-tools/src/is2/parser/Evaluator.java +++ /dev/null @@ -1,100 +0,0 @@ -package is2.parser; - -import is2.data.SentenceData09; -import is2.io.CONLLReader09; - -public class Evaluator { - - public static final String PUNCT = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; - - public static class Results { - - public int total; - public int corr; - public float las; - public float ula; - - } - - public static Results evaluate(String act_file, String pred_file) throws Exception { - - CONLLReader09 goldReader = new CONLLReader09(act_file, -1); - CONLLReader09 predictedReader = new CONLLReader09(pred_file, -1); - - int total = 0, corr = 0, corrL = 0; - int numsent = 0, corrsent = 0, corrsentL = 0; - SentenceData09 goldInstance = goldReader.getNext(); - SentenceData09 predInstance = predictedReader.getNext(); - - while (goldInstance != null) { - - int instanceLength = goldInstance.length(); - - if (instanceLength != predInstance.length()) - System.out.println("Lengths do not match on sentence " + numsent); - - int[] goldHeads = goldInstance.heads; - String[] goldLabels = goldInstance.labels; - int[] predHeads = predInstance.heads; - String[] predLabels = predInstance.labels; - - boolean whole = true; - boolean wholeL = true; - - // NOTE: the first item is the root info added during - // nextInstance(), so we skip it. 
- - int punc = 0; - for (int i = 1; i < instanceLength; i++) { - if (predHeads[i] == goldHeads[i]) { - corr++; - - if (goldLabels[i].equals(predLabels[i])) - corrL++; - else { - // System.out.println(numsent+" error gold - // "+goldLabels[i]+" "+predLabels[i]+" head - // "+goldHeads[i]+" child "+i); - wholeL = false; - } - } else { - // System.out.println(numsent+"error gold "+goldLabels[i]+" - // "+predLabels[i]+" head "+goldHeads[i]+" child "+i); - whole = false; - wholeL = false; - } - } - total += ((instanceLength - 1) - punc); // Subtract one to not score - // fake root token - - if (whole) - corrsent++; - if (wholeL) - corrsentL++; - numsent++; - - goldInstance = goldReader.getNext(); - predInstance = predictedReader.getNext(); - } - - Results r = new Results(); - - r.total = total; - r.corr = corr; - r.las = (float) Math.round(((double) corrL / total) * 100000) / 1000; - r.ula = (float) Math.round(((double) corr / total) * 100000) / 1000; - System.out.print("Total: " + total + " \tCorrect: " + corr + " "); - System.out.println("LAS: " + (double) Math.round(((double) corrL / total) * 100000) / 1000 + " \tTotal: " - + (double) Math.round(((double) corrsentL / numsent) * 100000) / 1000 + " \tULA: " - + (double) Math.round(((double) corr / total) * 100000) / 1000 + " \tTotal: " - + (double) Math.round(((double) corrsent / numsent) * 100000) / 1000); - - return r; - } - - public static float round(double v) { - - return Math.round(v * 10000F) / 10000F; - } - -} diff --git a/dependencyParser/mate-tools/src/is2/parser/MFO.java b/dependencyParser/mate-tools/src/is2/parser/MFO.java deleted file mode 100755 index 5a2de73..0000000 --- a/dependencyParser/mate-tools/src/is2/parser/MFO.java +++ /dev/null @@ -1,267 +0,0 @@ -package is2.parser; - -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map.Entry; -import is2.data.IEncoderPlus; -import is2.util.DB; - -/** - * Map Features, do 
not map long to integer - * - * @author Bernd Bohnet, 20.09.2009 - */ - -final public class MFO implements IEncoderPlus { - - /** The features and its values */ - static private final HashMap<String, HashMap<String, Integer>> m_featureSets = new HashMap<String, HashMap<String, Integer>>(); - - /** The feature class and the number of values */ - static private final HashMap<String, Integer> m_featureCounters = new HashMap<String, Integer>(); - - /** The number of bits needed to encode a feature */ - static final HashMap<String, Integer> m_featureBits = new HashMap<String, Integer>(); - - /** Integer counter for long2int */ - static private int count = 0; - - /** Stop growing */ - public boolean stop = false; - - final public static String NONE = "<None>"; - - public MFO() { - } - - public int size() { - return count; - } - - /** - * Register an attribute class, if it not exists and add a possible value - * - * @param type - * @param type2 - */ - @Override - final public int register(String a, String v) { - - HashMap<String, Integer> fs = getFeatureSet().get(a); - if (fs == null) { - fs = new HashMap<String, Integer>(); - getFeatureSet().put(a, fs); - fs.put(NONE, 0); - getFeatureCounter().put(a, 1); - } - Integer c = getFeatureCounter().get(a); - - Integer i = fs.get(v); - if (i == null) { - fs.put(v, c); - c++; - getFeatureCounter().put(a, c); - return c - 1; - } else - return i; - } - - /** - * Calculates the number of bits needed to encode a feature - */ - public void calculateBits() { - - for (Entry<String, Integer> e : getFeatureCounter().entrySet()) { - int bits = (int) Math.ceil((Math.log(e.getValue() + 1) / Math.log(2))); - m_featureBits.put(e.getKey(), bits); - } - - // System.out.println("total number of needed bits "+total); - } - - @Override - public String toString() { - - StringBuffer content = new StringBuffer(); - for (Entry<String, Integer> e : getFeatureCounter().entrySet()) { - content.append(e.getKey() + " " + e.getValue()); - 
content.append(':'); - // HashMap<String,Integer> vs = getFeatureSet().get(e.getKey()); - content.append(getFeatureBits(e.getKey())); - - /* - * if (vs.size()<120) for(Entry<String,Integer> e2 : vs.entrySet()) - * { content.append(e2.getKey()+" ("+e2.getValue()+") "); } - */ - content.append('\n'); - - } - return content.toString(); - } - - static final public short getFeatureBits(String a) { - if (m_featureBits.get(a) == null) - return 0; - return (short) m_featureBits.get(a).intValue(); - } - - /** - * Get the integer place holder of the string value v of the type a - * - * @param t - * the type - * @param v - * the value - * @return the integer place holder of v - */ - @Override - final public int getValue(String t, String v) { - - if (m_featureSets.get(t) == null) - return -1; - Integer vi = m_featureSets.get(t).get(v); - if (vi == null) - return -1; // stop && - return vi.intValue(); - } - - /** - * Static version of getValue - * - * @see getValue - */ - static final public int getValueS(String a, String v) { - - if (m_featureSets.get(a) == null) - return -1; - Integer vi = m_featureSets.get(a).get(v); - if (vi == null) - return -1; // stop && - return vi.intValue(); - } - - public int hasValue(String a, String v) { - - Integer vi = m_featureSets.get(a).get(v); - if (vi == null) - return -1; - return vi.intValue(); - } - - public static String printBits(int k) { - StringBuffer s = new StringBuffer(); - for (int i = 0; i < 31; i++) { - s.append((k & 0x00000001) == 1 ? '1' : '0'); - k = k >> 1; - - } - s.reverse(); - return s.toString(); - } - - /** - * Maps a long to a integer value. 
This is very useful to save memory for - * sparse data long values - * - * @param l - * @return the integer - */ - static public int misses = 0; - static public int good = 0; - - /** - * Write the data - * - * @param dos - * @throws IOException - */ - static public void writeData(DataOutputStream dos) throws IOException { - dos.writeInt(getFeatureSet().size()); - // DB.println("write"+getFeatureSet().size()); - for (Entry<String, HashMap<String, Integer>> e : getFeatureSet().entrySet()) { - dos.writeUTF(e.getKey()); - dos.writeInt(e.getValue().size()); - - for (Entry<String, Integer> e2 : e.getValue().entrySet()) { - - if (e2.getKey() == null) - DB.println("key " + e2.getKey() + " value " + e2.getValue() + " e -key " + e.getKey()); - dos.writeUTF(e2.getKey()); - dos.writeInt(e2.getValue()); - - } - - } - } - - public void read(DataInputStream din) throws IOException { - - int size = din.readInt(); - for (int i = 0; i < size; i++) { - String k = din.readUTF(); - int size2 = din.readInt(); - - HashMap<String, Integer> h = new HashMap<String, Integer>(); - getFeatureSet().put(k, h); - for (int j = 0; j < size2; j++) { - h.put(din.readUTF(), din.readInt()); - } - getFeatureCounter().put(k, size2); - } - - count = size; - // stop(); - calculateBits(); - } - - /** - * Clear the data - */ - static public void clearData() { - getFeatureSet().clear(); - m_featureBits.clear(); - getFeatureSet().clear(); - } - - @Override - public HashMap<String, Integer> getFeatureCounter() { - return m_featureCounters; - } - - static public HashMap<String, HashMap<String, Integer>> getFeatureSet() { - return m_featureSets; - } - - static public String[] reverse(HashMap<String, Integer> v) { - String[] set = new String[v.size()]; - for (Entry<String, Integer> e : v.entrySet()) { - set[e.getValue()] = e.getKey(); - } - return set; - } - - private static <K, V> String mapToString(HashMap<K, V> m) { - int counter = 0; - StringBuilder s = new StringBuilder(); - for(K k: m.keySet()) { - 
s.append(", " + k + ": " + m.get(k)); - ++counter; - if(counter == Parser.maxPrint) break; - } - if(s.length() < 3) return "{}"; - else if(counter == Parser.maxPrint) return "{" + s.substring(2) + ",...} (exceeds maximum print length)"; - else return "{" + s.substring(2) + "}"; - } - - @Override - public void print() { - for(String s: m_featureSets.keySet()) - System.out.println("m_featureSets[" + s + "] = " + mapToString(m_featureSets.get(s))); - System.out.println("m_featureCounters = " + mapToString(m_featureCounters)); - System.out.println("m_featureBits = " + mapToString(m_featureBits)); - System.out.println("count = " + count); - System.out.println("stop = " + stop); - } -} diff --git a/dependencyParser/mate-tools/src/is2/parser/Options.java b/dependencyParser/mate-tools/src/is2/parser/Options.java deleted file mode 100755 index bd550ec..0000000 --- a/dependencyParser/mate-tools/src/is2/parser/Options.java +++ /dev/null @@ -1,70 +0,0 @@ -package is2.parser; - -import is2.util.OptionsSuper; - -public final class Options extends OptionsSuper { - - public Options(String[] args) { - - for (int i = 0; i < args.length; i++) { - - if (args[i].equals("--help")) - explain(); - - if (args[i].equals("-decode")) { - decodeProjective = args[i + 1].equals("proj"); - i++; - } else if (args[i].equals("-decodeTH")) { - decodeTH = Double.parseDouble(args[i + 1]); - i++; - } else if (args[i].equals("-nonormalize")) { - normalize = false; - } else if (args[i].equals("-features")) { - features = args[i + 1]; - i++; - } else if (args[i].equals("-hsize")) { - hsize = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-len")) { - maxLen = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-cores")) { - cores = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-no2nd")) { - no2nd = true; - } else if (args[i].equals("-few2nd")) { - few2nd = true; - } else - super.addOption(args, i); - - } - - } - - private void explain() { - 
System.out.println("Usage: "); - System.out.println("java -class mate.jar is2.parser.Parser [Options]"); - System.out.println(); - System.out.println("Example: "); - System.out.println( - " java -class mate.jar is2.parser.Parser -model eps3.model -train corpora/conll08st/train/train.closed -test corpora/conll08st/devel/devel.closed -out b3.test -eval corpora/conll08st/devel/devel.closed -count 2000 -i 6"); - System.out.println(""); - System.out.println("Options:"); - System.out.println(""); - System.out.println(" -train <file> the corpus a model is trained on; default " + this.trainfile); - System.out.println(" -test <file> the input corpus for testing; default " + this.testfile); - System.out.println(" -out <file> the output corpus (result) of a test run; default " + this.outfile); - System.out.println(" -model <file> the parsing model for traing the model is stored in the files"); - System.out.println( - " and for parsing the model is load from this file; default " + this.modelName); - System.out.println( - " -i <number> the number of training iterations; good numbers are 10 for smaller corpora and 6 for bigger; default " - + this.numIters); - System.out.println(" -count <number> the n first sentences of the corpus are take for the training default " - + this.count); - System.out.println(" -format <number> conll format of the year 8 or 9; default " + this.formatTask); - - System.exit(0); - } -} diff --git a/dependencyParser/mate-tools/src/is2/parser/ParallelDecoder.java b/dependencyParser/mate-tools/src/is2/parser/ParallelDecoder.java deleted file mode 100755 index ca508fd..0000000 --- a/dependencyParser/mate-tools/src/is2/parser/ParallelDecoder.java +++ /dev/null @@ -1,194 +0,0 @@ -package is2.parser; - -import java.util.ArrayList; -import java.util.concurrent.Callable; - -import is2.data.DataFES; - -/** - * @author Bernd Bohnet, 30.08.2009 - * - * This class implements a parallel feature extractor. 
- */ -final public class ParallelDecoder implements Callable<Object> { - // some constants - private static final float INIT_BEST = (-1.0F / 0.0F); - private static final boolean[] DIR = { false, true }; - - // the data space of the weights for a dependency tree - final private DataFES x; - - private short[] pos; - - private Open O[][][][]; - private Closed C[][][][]; - - private int length; - - boolean done = false; - public boolean waiting = false; - - /** - * Initialize the parallel decoder. - * - * @param pos - * part-of-speech - * @param d - * data - * @param edges - * part-of-speech edge mapping - * @param o - * open spans - * @param c - * closed spans - * @param length - * number of words - */ - public ParallelDecoder(short[] pos, DataFES d, Open o[][][][], Closed c[][][][], int length) { - - this.pos = pos; - this.x = d; - - this.O = o; - this.C = c; - this.length = length; - } - - private static class DSet { - short w1, w2; - } - - @Override - public Object call() { - - try { - - while (true) { - - DSet set = get(); - // if (done && set==null) break; - - if (set == null) - return null; - - short s = set.w1, t = set.w2; - - for (short dir = 0; dir < 2; dir++) { - - short[] labs = (dir == 1) ? Edges.get(pos[s], pos[t]) : Edges.get(pos[t], pos[s]); - - O[s][t][dir] = new Open[labs.length]; - - for (int l = 0; l < labs.length; l++) { - - double tRP = INIT_BEST; - - Closed tL = null, tR = null; - - for (int r = s; r < t; r++) { - - if (s == 0 && r != 0) - continue; - - double tLPr = INIT_BEST, tRPr = INIT_BEST; - Closed tLCld = null, tRCld = null; - - if (r == s) - tLPr = dir == 1 ? x.sib[s][t][s][l] : x.gra[t][s][s][l]; - else - for (int i = s + 1; i <= r; i++) - if (((dir == 1 ? x.sib[s][t][i][l] : x.gra[t][s][i][l]) + C[s][r][1][i].p) > tLPr) { - tLPr = ((dir == 1 ? x.sib[s][t][i][l] : x.gra[t][s][i][l]) + C[s][r][1][i].p); - tLCld = C[s][r][1][i]; - } - - if (r == t - 1) - tRPr = dir == 1 ? 
x.gra[s][t][s][l] : x.sib[t][s][s][l]; - else - for (int i = r + 1; i < t; i++) - if (((dir == 1 ? x.gra[s][t][i][l] : x.sib[t][s][i][l]) - + C[r + 1][t][0][i].p) > tRPr) { - tRPr = ((dir == 1 ? x.gra[s][t][i][l] : x.sib[t][s][i][l]) - + C[r + 1][t][0][i].p); - tRCld = C[r + 1][t][0][i]; - } - - if (tLPr + tRPr > tRP) { - tRP = tLPr + tRPr; - tL = tLCld; - tR = tRCld; - } - } - O[s][t][dir][l] = new Open(s, t, dir, labs[l], tL, tR, - (float) (tRP + ((dir == 1) ? x.pl[s][t] : x.pl[t][s]) - + ((dir == 1) ? x.lab[s][t][labs[l]] : x.lab[t][s][labs[l]]))); - } - } - C[s][t][1] = new Closed[length]; - C[s][t][0] = new Closed[length]; - - for (int m = s; m <= t; m++) { - for (boolean d : DIR) { - if ((d && m != s) || !d && (m != t && s != 0)) { - - // create closed structure - - double top = INIT_BEST; - - Open tU = null; - Closed tL = null; - int numLabels = O[(d ? s : m)][(d ? m : t)][d ? 1 : 0].length; - - // for (int l = numLabels-1; l >=0; l--) { - for (int l = 0; l < numLabels; l++) { - - Open hi = O[(d ? s : m)][(d ? m : t)][d ? 1 : 0][l]; - for (int amb = m + (d ? 1 : -1); amb != (d ? t : s) - + (d ? 1 : -1); amb += (d ? 1 : -1)) { - - if ((hi.p + C[d ? m : s][d ? t : m][d ? 1 : 0][amb].p - + x.gra[d ? s : t][m][amb][l]) > top) { - top = (hi.p + C[d ? m : s][d ? t : m][d ? 1 : 0][amb].p - + x.gra[d ? s : t][m][amb][l]); - tU = hi; - tL = C[d ? m : s][d ? t : m][d ? 1 : 0][amb]; - } - - } - - if ((m == (d ? t : s)) && (hi.p + x.gra[d ? s : t][d ? t : s][m][l]) > top) { - top = (hi.p + x.gra[d ? s : t][d ? t : s][m][l]); - tU = hi; - tL = null; - } - } - C[s][t][d ? 1 : 0][m] = new Closed(s, t, m, d ? 
1 : 0, tU, tL, (float) top); - - } - } - } - } - } catch (Exception e) { - e.printStackTrace(); - System.exit(0); - } - return null; - } - - public static ArrayList<DSet> sets = new ArrayList<DSet>(); - - static synchronized private DSet get() { - synchronized (sets) { - if (sets.size() == 0) - return null; - return sets.remove(sets.size() - 1); - } - } - - public static void add(short w1, short w2) { - DSet ds = new DSet(); - ds.w1 = w1; - ds.w2 = w2; - sets.add(ds); - } -} diff --git a/dependencyParser/mate-tools/src/is2/parser/ParallelExtract.java b/dependencyParser/mate-tools/src/is2/parser/ParallelExtract.java deleted file mode 100755 index ca85711..0000000 --- a/dependencyParser/mate-tools/src/is2/parser/ParallelExtract.java +++ /dev/null @@ -1,248 +0,0 @@ -package is2.parser; - -import java.util.ArrayList; -import java.util.concurrent.Callable; - -import is2.data.Cluster; -import is2.data.DataFES; -import is2.data.F2SF; -import is2.data.Instances; -import is2.data.Long2IntInterface; - -/** - * @author Bernd Bohnet, 30.08.2009 - * - * This class implements a parallel feature extractor. 
- */ -final public class ParallelExtract implements Callable<Object> { - // the data space of the weights for a dependency tree - final DataFES d; - - // the data extractor does the actual work - final Extractor extractor; - - private Instances is; - private int i; - - private F2SF para; - - private Cluster cluster; - - public ParallelExtract(Extractor e, Instances is, int i, DataFES d, F2SF para, Cluster cluster) { - - this.is = is; - extractor = e; - this.d = d; - this.i = i; - this.para = para; - this.cluster = cluster; - } - - public static class DSet { - int w1, w2; - } - - @Override - public Object call() { - - try { - - F2SF f = para; - - short[] pos = is.pposs[i]; - int length = pos.length; - - long[] gvs = new long[50]; - long[] svs = new long[220]; - - while (true) { - - DSet set = get(); - if (set == null) - break; - - int w1 = set.w1; - int w2 = set.w2; - - f.clear(); - extractor.basic(pos, w1, w2, f); - d.pl[w1][w2] = f.getScoreF(); - - f.clear(); - - extractor.basic(pos, w2, w1, f); - d.pl[w2][w1] = f.getScoreF(); - - short[] labels = Edges.get(pos[w1], pos[w2]); - float[] lab = d.lab[w1][w2]; - - final Long2IntInterface li = extractor.li; - - int c = extractor.firstm(is, i, w1, w2, 0, cluster, svs); - - for (int l = 0; l < lab.length; l++) - lab[l] = -100; - - for (short label2 : labels) { - short label = label2; - - f.clear(); - int lv = extractor.d0.computeLabeValue(label, Extractor.s_type); - for (int k = 0; k < c; k++) - if (svs[k] > 0) - f.add(li.l2i(svs[k] + lv)); - - lab[label] = f.getScoreF(); - } - - labels = Edges.get(pos[w2], pos[w1]); - lab = d.lab[w2][w1]; - - for (int l = 0; l < lab.length; l++) - lab[l] = -100; - - for (short label2 : labels) { - int label = label2; - - f.clear(); - int lv = extractor.d0.computeLabeValue(label + Extractor.s_rel1, Extractor.s_type); - for (int k = 0; k < c; k++) - if (svs[k] > 0) - f.add(li.l2i(svs[k] + lv)); - - lab[label] = f.getScoreF(); - } - - int s = w1 < w2 ? w1 : w2; - int e = w1 < w2 ? 
w2 : w1; - - for (int m = 0; m < length; m++) { - - int g = (m == s || e == m) ? -1 : m; - - int cn = extractor.second(is, i, w1, w2, g, 0, cluster, svs); - int cc = extractor.addClusterFeatures(is, i, w1, w2, g, cluster, 0, gvs, 0); - // for(int k=0;k<c;k++) dl1.map(f,svs[k]); - - if (m >= w1) { - labels = Edges.get(pos[w1], pos[w2]); - float[] lab2 = new float[labels.length]; - for (int l = 0; l < labels.length; l++) { - - short label = labels[l]; - - int lx = label + Extractor.s_rel1 * (g < w2 ? 0 : 2); - - f.clear(); - int lv = extractor.d0.computeLabeValue(lx, Extractor.s_type); - for (int k = 0; k < cn; k++) - if (svs[k] > 0) - f.add(li.l2i(svs[k] + lv)); - for (int k = 0; k < cc; k++) - if (gvs[k] > 0) - f.add(li.l2i(gvs[k] + lv)); - - lab2[l] = f.getScoreF(); - } - d.gra[w1][w2][m] = lab2; - } - - if (m <= w2) { - labels = Edges.get(pos[w2], pos[w1]); - float lab2[]; - d.gra[w2][w1][m] = lab2 = new float[labels.length]; - for (int l = 0; l < labels.length; l++) { - - int label = labels[l]; - int lx = label + Extractor.s_rel1 * (1 + (g < w1 ? 0 : 2)); - - f.clear(); - int lv = extractor.d0.computeLabeValue(lx, Extractor.s_type); - for (int k = 0; k < cn; k++) - if (svs[k] > 0) - f.add(li.l2i(svs[k] + lv)); - for (int k = 0; k < cc; k++) - if (gvs[k] > 0) - f.add(li.l2i(gvs[k] + lv)); - - lab2[l] = f.getScoreF(); - - } - } - - g = (m == s || e == m) ? 
-1 : m; - - // int cn = extractor.second(is,i,w1,w2,g,0, cluster, - // svs,Extractor._SIB); - if (m >= w1 && m <= w2) { - labels = Edges.get(pos[w1], pos[w2]); - float lab2[] = new float[labels.length]; - d.sib[w1][w2][m] = lab2; - - for (int l = 0; l < labels.length; l++) { - - short label = labels[l]; - - int lx = label + Extractor.s_rel1 * (8); - f.clear(); - int lv = extractor.d0.computeLabeValue(lx, Extractor.s_type); - for (int k = 0; k < cn; k++) - if (svs[k] > 0) - f.add(li.l2i(svs[k] + lv)); - for (int k = 0; k < cc; k++) - if (gvs[k] > 0) - f.add(li.l2i(gvs[k] + lv)); - - lab2[l] = f.score;// f.getScoreF(); - } - } - if (m >= w1 && m <= w2) { - labels = Edges.get(pos[w2], pos[w1]); - float[] lab2 = new float[labels.length]; - d.sib[w2][w1][m] = lab2; - for (int l = 0; l < labels.length; l++) { - - int label = labels[l]; - - int lx = label + Extractor.s_rel1 * (9); - - f.clear(); - int lv = extractor.d0.computeLabeValue(lx, Extractor.s_type); - for (int k = 0; k < cn; k++) - if (svs[k] > 0) - f.add(li.l2i(svs[k] + lv)); - for (int k = 0; k < cc; k++) - if (gvs[k] > 0) - f.add(li.l2i(gvs[k] + lv)); - - lab2[l] = f.score;// f.getScoreF(); - } - } - } - } - - } catch (Exception e) { - e.printStackTrace(); - } - return null; - } - - static ArrayList<DSet> sets = new ArrayList<DSet>(); - - private DSet get() { - - synchronized (sets) { - if (sets.size() == 0) - return null; - return sets.remove(sets.size() - 1); - } - } - - static public void add(int w1, int w2) { - DSet ds = new DSet(); - ds.w1 = w1; - ds.w2 = w2; - sets.add(ds); - } - -} diff --git a/dependencyParser/mate-tools/src/is2/parser/ParallelRearrange.java b/dependencyParser/mate-tools/src/is2/parser/ParallelRearrange.java deleted file mode 100755 index 83dcdaa..0000000 --- a/dependencyParser/mate-tools/src/is2/parser/ParallelRearrange.java +++ /dev/null @@ -1,146 +0,0 @@ -package is2.parser; - -import java.util.ArrayList; -import java.util.concurrent.Callable; - -import is2.data.DataFES; - -/** - * 
@author Dr. Bernd Bohnet, 30.08.2009 - * - * This class implements a parallel edge rearrangement for - * non-projective parsing; The linear method was first suggest by Rayn - * McDonald et. al. 2005. - */ -final public class ParallelRearrange implements Callable<Object> { - - // new parent child combination to explore - final static class PA { - final float p; - final short ch, pa; - public float max; - public short wh; - public short nPar; - public short nType; - - public PA(float p2, short ch2, short pa2) { - p = p2; - ch = ch2; - pa = pa2; - } - } - - // list of parent child combinations - static ArrayList<PA> parents = new ArrayList<PA>(); - static ArrayList<PA> order = new ArrayList<PA>(); - // best new parent child combination, found so far - public float max; - - // some data from the dependency tree - // private EdgesC edges; - private short[] pos; - private DataFES x; - private boolean[][] isChild; - public short[] heads, types; - - // child, new parent, new label - public short wh, nPar, nType; - - /** - * Initialize the parallel rearrange thread - * - * @param isChild2 - * is a child - * @param edgesC - * the part-of-speech edge mapping - * @param pos - * the part-of-speech - * @param x - * the data - * @param s - * the heads - * @param ts - * the types - */ - public ParallelRearrange(boolean[][] isChild2, short[] pos, DataFES x, short[] s, short[] ts) { - - heads = new short[s.length]; - System.arraycopy(s, 0, heads, 0, s.length); - - types = new short[ts.length]; - System.arraycopy(ts, 0, types, 0, ts.length); - - isChild = isChild2; - // edges = edgesC; - this.pos = pos; - this.x = x; - } - - @Override - public Object call() { - - // check the list of new possible parents and children for a better - // combination - while (true) { - PA px = getPA(); - if (px == null) - break; - - float max = 0; - short pa = px.pa, ch = px.ch; - - if (ch == pa || pa == heads[ch] || isChild[ch][pa]) - continue; - - short oldP = heads[ch], oldT = types[ch]; - - heads[ch] 
= pa; - - short[] labels = Edges.get(pos[pa], pos[ch]); - - for (short label : labels) { - - types[ch] = label; - - float p_new = Extractor.encode3(pos, heads, types, x); - - if (max < p_new - px.p) { - max = p_new - px.p; - wh = ch; - nPar = pa; - nType = label; - px.max = max; - px.wh = ch; - px.nPar = pa; - px.nType = label; - } - } - heads[ch] = oldP; - types[ch] = oldT; - } - return null; - } - - /** - * Add a child-parent combination which are latter explored for - * rearrangement - * - * @param p2 - * @param ch2 - * @param pa - */ - static public void add(float p2, short ch2, short pa) { - PA px = new PA(p2, ch2, pa); - parents.add(px); - order.add(px); - } - - static private PA getPA() { - synchronized (parents) { - if (parents.size() == 0) - return null; - return parents.remove(parents.size() - 1); - } - } - -} diff --git a/dependencyParser/mate-tools/src/is2/parser/ParametersFloat.java b/dependencyParser/mate-tools/src/is2/parser/ParametersFloat.java deleted file mode 100755 index faf795d..0000000 --- a/dependencyParser/mate-tools/src/is2/parser/ParametersFloat.java +++ /dev/null @@ -1,138 +0,0 @@ -package is2.parser; - -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; - -import is2.data.F2SF; -import is2.data.FV; -import is2.data.Instances; -import is2.data.Parse; -import is2.util.DB; - -final public class ParametersFloat extends Parameters { - - public float[] parameters; - public float[] total; - - public ParametersFloat(int size) { - parameters = new float[size]; - total = new float[size]; - for (int i = 0; i < parameters.length; i++) { - parameters[i] = 0F; - total[i] = 0F; - } - } - - /** - * @param parameters2 - */ - public ParametersFloat(float[] p) { - parameters = p; - } - - @Override - public void average(double avVal) { - for (int j = 0; j < total.length; j++) { - parameters[j] = total[j] / ((float) avVal); - } - total = null; - } - - public ParametersFloat average2(double avVal) { - float[] px = 
new float[this.parameters.length]; - for (int j = 0; j < total.length; j++) { - px[j] = total[j] / ((float) avVal); - } - ParametersFloat pf = new ParametersFloat(px); - return pf; - } - - @Override - public void update(FV act, FV pred, Instances isd, int instc, Parse d, double upd, double e) { - - e++; - - float lam_dist = getScore(act) - getScore(pred); - - float b = (float) e - lam_dist; - - FV dist = act.getDistVector(pred); - - dist.update(parameters, total, hildreth(dist, b), upd, false); - } - - protected double hildreth(FV a, double b) { - - double A = a.dotProduct(a); - if (A <= 0.0000000000000000001) - return 0.0; - return b / A; - } - - public float getScore(FV fv) { - if (fv == null) - return 0.0F; - return fv.getScore(parameters, false); - - } - - @Override - final public void write(DataOutputStream dos) throws IOException { - - dos.writeInt(parameters.length); - for (float d : parameters) - dos.writeFloat(d); - - } - - @Override - public void read(DataInputStream dis) throws IOException { - - parameters = new float[dis.readInt()]; - int notZero = 0; - for (int i = 0; i < parameters.length; i++) { - parameters[i] = dis.readFloat(); - if (parameters[i] != 0.0F) - notZero++; - } - - DB.println("read parameters " + parameters.length + " not zero " + notZero); - - } - - public int countNZ() { - - int notZero = 0; - for (float parameter : parameters) { - if (parameter != 0.0F) - notZero++; - } - return notZero; - - // DB.println("read parameters "+parameters.length+" not zero - // "+notZero); - - } - - /* - * (non-Javadoc) - * - * @see is2.sp09k99995.Parameters#getFV() - */ - @Override - public F2SF getFV() { - return new F2SF(parameters); - } - - /* - * (non-Javadoc) - * - * @see is2.sp09k99999.Parameters#size() - */ - @Override - public int size() { - return parameters.length; - } - -} diff --git a/dependencyParser/mate-tools/src/is2/parser/Pipe.java b/dependencyParser/mate-tools/src/is2/parser/Pipe.java deleted file mode 100755 index f7f3782..0000000 --- 
a/dependencyParser/mate-tools/src/is2/parser/Pipe.java +++ /dev/null @@ -1,224 +0,0 @@ -package is2.parser; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.concurrent.ExecutorService; - -import is2.data.Cluster; -import is2.data.DataFES; -import is2.data.F2SF; -import is2.data.Instances; -import is2.data.Parse; -import is2.data.PipeGen; -import is2.data.SentenceData09; -import is2.io.CONLLReader09; -import is2.util.OptionsSuper; - -final public class Pipe extends PipeGen { - - public Extractor[] extractor; - final public MFO mf = new MFO(); - - public Cluster cl; - - private OptionsSuper options; - public static long timeExtract; - - public Pipe(OptionsSuper o) { - options = o; - } - - public void createInstances(String file, Instances is) throws Exception { - - CONLLReader09 depReader = new CONLLReader09(file); - - mf.register(REL, "<root-type>"); - - // register at least one predicate since the parsing data might not - // contain predicates as in - // the Japaness corpus but the development sets contains some - - System.out.print("Registering feature parts of sentence: "); - int ic = 0; - int del = 0; - while (true) { - SentenceData09 instance = depReader.getNext(); - if (instance == null) - break; - ic++; - - if (ic % 1000 == 0) { - del = outValue(ic, del); - } - - String[] labs1 = instance.labels; - for (String element : labs1) - mf.register(REL, element); - - String[] w = instance.forms; - for (String element : w) - mf.register(WORD, depReader.normalize(element)); - - w = instance.plemmas; - for (String element : w) - mf.register(WORD, depReader.normalize(element)); - - w = instance.ppos; - for (String element : w) - mf.register(POS, element); - - w = instance.gpos; - for (String element : w) - mf.register(POS, element); - - if (instance.feats != null) { - String fs[][] = instance.feats; - for (String[] element : fs) { - w = element; - if (w == null) - continue; - for (String element2 : w) - mf.register(FEAT, element2); - } - } - 
- if ((ic - 1) > options.count) - break; - } - del = outValue(ic, del); - - System.out.println(); - Extractor.initFeatures(); - - Extractor.maxForm = mf.getFeatureCounter().get(WORD); - - if (options.clusterFile == null) - cl = new Cluster(); - else - cl = new Cluster(options.clusterFile, mf, 6); - - mf.calculateBits(); - Extractor.initStat(options.featureCreation); - - System.out.println("" + mf.toString()); - - for (Extractor e : extractor) - e.init(); - - depReader.startReading(file); - - int num1 = 0; - - is.init(ic, new MFO()); - - Edges.init(mf.getFeatureCounter().get(POS)); - - System.out.print("Creating edge filters and read corpus: "); - del = 0; - - while (true) { - if (num1 % 100 == 0) - del = outValue(num1, del); - - SentenceData09 instance1 = depReader.getNext(is); - - if (instance1 == null) - break; - - int last = is.size() - 1; - short[] pos = is.pposs[last]; - - for (int k = 0; k < is.length(last); k++) { - if (is.heads[last][k] < 0) - continue; - Edges.put(pos[is.heads[last][k]], pos[k], is.labels[last][k]); - // Edges.put(pos[k],pos[is.heads[last][k]], is.labels[last][k]); - } - - if (!options.allFeatures && num1 > options.count) - break; - - num1++; - - } - del = outValue(num1, del); - System.out.println(); - Edges.findDefault(); - } - - /** - * Creates an instance for outputParses - * - * @param is - * @return - * @throws IOException - */ - protected final SentenceData09 nextInstance(Instances is, CONLLReader09 depReader) throws Exception { - - SentenceData09 instance = depReader.getNext(is); - if (instance == null || instance.forms == null) - return null; - - return instance; - } - - public static ExecutorService executerService = java.util.concurrent.Executors.newFixedThreadPool(Parser.THREADS); - - public DataFES fillVector(F2SF params, Instances is, int inst, DataFES d, Cluster cluster) - throws InterruptedException { - - long ts = System.nanoTime(); - - if (executerService.isShutdown()) - executerService = 
java.util.concurrent.Executors.newCachedThreadPool(); - - final int length = is.length(inst); - if (d == null || d.len < length) - d = new DataFES(length, mf.getFeatureCounter().get(PipeGen.REL).shortValue()); - - ArrayList<ParallelExtract> pe = new ArrayList<ParallelExtract>(); - for (int i = 0; i < Parser.THREADS; i++) - pe.add(new ParallelExtract(extractor[i], is, inst, d, (F2SF) params.clone(), cluster)); - - for (int w1 = 0; w1 < length; w1++) { - for (int w2 = w1 + 1; w2 < length; w2++) { - - if (w1 == w2) - continue; - - ParallelExtract.add(w1, w2); - - } - } - // for(int i=0;i<efp.length;i++) efp[i].start(); - // for(int i=0;i<efp.length;i++) efp[i].join(); - executerService.invokeAll(pe); - - timeExtract += (System.nanoTime() - ts); - - return d; - } - - public double errors(Instances is, int ic, Parse p) { - short[] act = is.heads[ic]; - double correct = 0; - - // do not count root - for (int i = 1; i < act.length; i++) { - - // if (is.ppos[ic] ==null ) System.out.println("mf - // null"+is.ppos[ic][i]); - if (p.heads[i] == act[i]) { - correct += 0.5; - if (p.labels[i] == is.labels[ic][i]) - correct += 0.5; - } - } - - double x = ((double) act.length - 1 - correct); - - p.f1 = correct / (act.length - 1); - - return x; - } -} diff --git a/dependencyParser/mate-tools/src/is2/parserR2/Options.java b/dependencyParser/mate-tools/src/is2/parserR2/Options.java deleted file mode 100755 index eb396b4..0000000 --- a/dependencyParser/mate-tools/src/is2/parserR2/Options.java +++ /dev/null @@ -1,92 +0,0 @@ -package is2.parserR2; - -import is2.util.OptionsSuper; - -public final class Options extends OptionsSuper { - - int start = 0, end = 0; - String prefix_model = "m"; - String prefix_test = "t"; - - public Options(String[] args) { - - for (int i = 0; i < args.length; i++) { - - if (args[i].equals("--help")) - explain(); - - if (args[i].equals("-decode")) { - decodeProjective = args[i + 1].equals("proj"); - i++; - } else if (args[i].equals("-decodeTH")) { - decodeTH = 
Double.parseDouble(args[i + 1]); - i++; - } else if (args[i].equals("-nonormalize")) { - normalize = false; - } else if (args[i].equals("-features")) { - features = args[i + 1]; - i++; - } else if (args[i].equals("-hsize")) { - hsize = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-len")) { - maxLen = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-cores")) { - cores = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-best")) { - best = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-start")) { - start = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-end")) { - end = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-prefix-model")) { - prefix_model = args[i + 1]; - i++; - } else if (args[i].equals("-prefix-test")) { - prefix_test = args[i + 1]; - i++; - } else if (args[i].equals("-mapping")) { - this.useMapping = args[i + 1]; - i++; - } else if (args[i].equals("-no2nd")) { - no2nd = true; - } else if (args[i].equals("-few2nd")) { - few2nd = true; - } else - super.addOption(args, i); - - } - - } - - private void explain() { - System.out.println("Usage: "); - System.out.println("java -class mate.jar is2.parser.Parser [Options]"); - System.out.println(); - System.out.println("Example: "); - System.out.println( - " java -class mate.jar is2.parser.Parser -model eps3.model -train corpora/conll08st/train/train.closed -test corpora/conll08st/devel/devel.closed -out b3.test -eval corpora/conll08st/devel/devel.closed -count 2000 -i 6"); - System.out.println(""); - System.out.println("Options:"); - System.out.println(""); - System.out.println(" -train <file> the corpus a model is trained on; default " + this.trainfile); - System.out.println(" -test <file> the input corpus for testing; default " + this.testfile); - System.out.println(" -out <file> the output corpus (result) of a test run; default " + this.outfile); - System.out.println(" -model <file> the 
parsing model for traing the model is stored in the files"); - System.out.println( - " and for parsing the model is load from this file; default " + this.modelName); - System.out.println( - " -i <number> the number of training iterations; good numbers are 10 for smaller corpora and 6 for bigger; default " - + this.numIters); - System.out.println(" -count <number> the n first sentences of the corpus are take for the training default " - + this.count); - System.out.println(" -format <number> conll format of the year 8 or 9; default " + this.formatTask); - - System.exit(0); - } -} diff --git a/dependencyParser/mate-tools/src/is2/parserR2/ParametersFloat.java b/dependencyParser/mate-tools/src/is2/parserR2/ParametersFloat.java deleted file mode 100755 index 2ba0aaa..0000000 --- a/dependencyParser/mate-tools/src/is2/parserR2/ParametersFloat.java +++ /dev/null @@ -1,178 +0,0 @@ -package is2.parserR2; - -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; - -import is2.data.F2SF; -import is2.data.FV; -import is2.data.FVR; -import is2.data.Instances; -import is2.data.Parse; -import is2.util.DB; - -final public class ParametersFloat extends Parameters { - - public float[] parameters; - private float[] total; - - public ParametersFloat(int size) { - parameters = new float[size]; - total = new float[size]; - for (int i = 0; i < parameters.length; i++) { - parameters[i] = 0F; - total[i] = 0F; - } - } - - /** - * @param parameters2 - */ - public ParametersFloat(float[] p) { - parameters = p; - } - - @Override - public void average(double avVal) { - for (int j = 0; j < total.length; j++) { - parameters[j] = total[j] / ((float) avVal); - } - total = null; - } - - public ParametersFloat average2(double avVal) { - float[] px = new float[this.parameters.length]; - for (int j = 0; j < total.length; j++) { - px[j] = total[j] / ((float) avVal); - } - ParametersFloat pf = new ParametersFloat(px); - return pf; - } - - public void update(FV act, 
FV pred, Instances isd, int instc, Parse dx, double upd, double e, float d, float f) { - - e++; - - float lam_dist = d - f; - - float b = (float) e - lam_dist; - - FV dist = act.getDistVector(pred); - - dist.update(parameters, total, hildreth(dist, b), upd, false); - } - - @Override - public void update(FV act, FV pred, Instances isd, int instc, Parse dx, double upd, double e) { - - e++; - - float lam_dist = getScore(act) - getScore(pred); - - float b = (float) e - lam_dist; - - FV dist = act.getDistVector(pred); - - dist.update(parameters, total, hildreth(dist, b), upd, false); - } - - public void update(FVR act, FVR pred, Instances isd, int instc, Parse dx, double upd, double e, float lam_dist) { - - e++; - - float b = (float) e - lam_dist; - - FVR dist = act.getDistVector(pred); - - dist.update(parameters, total, hildreth(dist, b), upd, false); - } - - protected double hildreth(FV a, double b) { - - double A = a.dotProduct(a); - if (A <= 0.0000000000000000001) - return 0.0; - return b / A; - } - - protected double hildreth(FVR a, double b) { - - double A = a.dotProduct(a); - if (A <= 0.0000000000000000001) - return 0.0; - return b / A; - } - - public float getScore(FV fv) { - if (fv == null) - return 0.0F; - return fv.getScore(parameters, false); - - } - - public float getScore(FVR fv) { // xx - if (fv == null) - return 0.0F; - return fv.getScore(parameters, false); - - } - - @Override - final public void write(DataOutputStream dos) throws IOException { - - dos.writeInt(parameters.length); - for (float d : parameters) - dos.writeFloat(d); - - } - - @Override - public void read(DataInputStream dis) throws IOException { - - parameters = new float[dis.readInt()]; - int notZero = 0; - for (int i = 0; i < parameters.length; i++) { - parameters[i] = dis.readFloat(); - if (parameters[i] != 0.0F) - notZero++; - } - - DB.println("read parameters " + parameters.length + " not zero " + notZero); - - } - - public int countNZ() { - - int notZero = 0; - for (float parameter : 
parameters) { - if (parameter != 0.0F) - notZero++; - } - return notZero; - - // DB.println("read parameters "+parameters.length+" not zero - // "+notZero); - - } - - /* - * (non-Javadoc) - * - * @see is2.sp09k99995.Parameters#getFV() - */ - @Override - public F2SF getFV() { - return new F2SF(parameters); - } - - /* - * (non-Javadoc) - * - * @see is2.sp09k99999.Parameters#size() - */ - @Override - public int size() { - return parameters.length; - } - -} diff --git a/dependencyParser/mate-tools/src/is2/parserR2/Pipe.java b/dependencyParser/mate-tools/src/is2/parserR2/Pipe.java deleted file mode 100755 index 81ce59a..0000000 --- a/dependencyParser/mate-tools/src/is2/parserR2/Pipe.java +++ /dev/null @@ -1,261 +0,0 @@ -package is2.parserR2; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.concurrent.ExecutorService; - -import extractors.Extractor; -import extractors.ParallelExtract; -import is2.data.Cluster; -import is2.data.DataF; -import is2.data.Edges; -import is2.data.F2SF; -import is2.data.Instances; -import is2.data.Long2IntInterface; -import is2.data.MFB; -import is2.data.Parse; -import is2.data.PipeGen; -import is2.data.SentenceData09; -import is2.io.CONLLReader09; -import is2.util.OptionsSuper; - -final public class Pipe extends PipeGen { - - public Extractor[] extractor; - final public MFB mf = new MFB(); - - Cluster cl; - - private OptionsSuper options; - public static long timeExtract; - - public Pipe(OptionsSuper o) { - options = o; - } - - public void createInstances(String file, Instances is) - // throws Exception - - { - - CONLLReader09 depReader = new CONLLReader09(file); - - mf.register(REL, "<root-type>"); - - // register at least one predicate since the parsing data might not - // contain predicates as in - // the Japaness corpus but the development sets contains some - - System.out.print("Registering feature parts of sentence: "); - int ic = 0; - int del = 0; - while (true) { - SentenceData09 instance = 
depReader.getNext(); - if (instance == null) - break; - ic++; - - if (ic % 1000 == 0) { - del = outValue(ic, del); - } - - String[] labs1 = instance.labels; - for (String element : labs1) - mf.register(REL, element); - - String[] w = instance.forms; - for (String element : w) - mf.register(WORD, depReader.normalize(element)); - - w = instance.plemmas; - for (String element : w) - mf.register(WORD, depReader.normalize(element)); - - w = instance.ppos; - for (String element : w) - mf.register(POS, element); - - w = instance.gpos; - for (String element : w) - mf.register(POS, element); - - if (instance.feats != null) { - String fs[][] = instance.feats; - for (String[] element : fs) { - w = element; - if (w == null) - continue; - for (String element2 : w) - mf.register(FEAT, element2); - } - } - - if ((ic - 1) > options.count) - break; - } - del = outValue(ic, del); - - for (Extractor e : extractor) { - e.setMaxForm(mf.getFeatureCounter().get(WORD)); - } - - if (options.clusterFile == null) - cl = new Cluster(); - else - cl = new Cluster(options.clusterFile, mf, 6); - - mf.calculateBits(); - - System.out.println("" + mf.toString()); - - for (Extractor e : extractor) { - e.initStat(); - e.init(); - } - - depReader.startReading(file); - - int num1 = 0; - - Edges.init(mf.getFeatureCounter().get(POS)); - - System.out.print("Creating edge filters and read corpus: "); - del = 0; - - is.init(ic, new MFB()); - - while (true) { - if (num1 % 100 == 0) - del = outValue(num1, del); - - SentenceData09 instance1 = depReader.getNext(is); - - if (instance1 == null) - break; - - int last = is.size() - 1; - short[] pos = is.pposs[last]; - - for (int k = 0; k < is.length(last); k++) { - if (is.heads[last][k] < 0) - continue; - Edges.put(pos[is.heads[last][k]], pos[k], k < is.heads[last][k], is.labels[last][k]); - } - - if (!options.allFeatures && num1 > options.count) - break; - - num1++; - - } - del = outValue(num1, del); - System.out.println(); - Edges.findDefault(); - } - - public 
void getInstances(String file, Instances is) { - CONLLReader09 depReader = new CONLLReader09(file); - - int ic = options.count + 2; - - is.init(ic, new MFB()); - - int num1 = 0, del = 0; - while (true) { - if (num1 % 100 == 0) - del = outValue(num1, del); - - SentenceData09 instance1 = depReader.getNext(is); - - if (instance1 == null) - break; - - if (!options.allFeatures && num1 > options.count) - break; - - num1++; - - } - del = outValue(num1, del); - System.out.println(); - - } - - /** - * Creates an instance for outputParses - * - * @param is - * @return - * @throws IOException - */ - protected final SentenceData09 nextInstance(Instances is, CONLLReader09 depReader) throws Exception { - - SentenceData09 instance = depReader.getNext(is); - if (instance == null || instance.forms == null) - return null; - - return instance; - } - - public static ExecutorService executerService = java.util.concurrent.Executors.newFixedThreadPool(Parser.THREADS); - - public DataF fillVector(F2SF params, Instances is, int inst, DataF d, Cluster cluster, int threads, - Long2IntInterface li) throws InterruptedException { - - long ts = System.nanoTime(); - - if (executerService.isShutdown()) - executerService = java.util.concurrent.Executors.newCachedThreadPool(); - - final int length = is.length(inst); - if (d == null || d.len < length) - d = new DataF(length, mf.getFeatureCounter().get(PipeGen.REL).shortValue()); - - ArrayList<ParallelExtract> pe = new ArrayList<ParallelExtract>(); - - for (int i = 0; i < threads; i++) { - - // DB.println(""+((ExtractorClusterStackedR2)extractor[i]).s_dist); - pe.add(new ParallelExtract(extractor[i], is, inst, d, (F2SF) params.clone(), cluster, li)); - } - - for (int w1 = 0; w1 < length; w1++) { - for (int w2 = 0; w2 < length; w2++) { - if (w1 == w2) - continue; - ParallelExtract.add(w1, w2); - } - } - executerService.invokeAll(pe); - - timeExtract += (System.nanoTime() - ts); - - return d; - } - - /** - * the loss function - */ - public double 
errors(Instances is, int ic, Parse p) { - - if (p.heads == null) - p.signature2parse(p.signature()); - short[] act = is.heads[ic]; - double correct = 0; - - // do not count root - for (int i = 1; i < act.length; i++) { - if (p.heads[i] == act[i]) { - correct += 0.5; - if (p.labels[i] == is.labels[ic][i]) - correct += 0.5; - } - } - - double x = ((double) act.length - 1 - correct); - - // p.f1 = (double)correct / (double)(act.length-1); - - return x; - } -} diff --git a/dependencyParser/mate-tools/src/is2/parserR2/PipeReranker.java b/dependencyParser/mate-tools/src/is2/parserR2/PipeReranker.java deleted file mode 100644 index 622fe1c..0000000 --- a/dependencyParser/mate-tools/src/is2/parserR2/PipeReranker.java +++ /dev/null @@ -1,123 +0,0 @@ -package is2.parserR2; - -import java.util.concurrent.ExecutorService; - -import extractors.ExtractorReranker; -import is2.data.Cluster; -import is2.data.Edges; -import is2.data.Instances; -import is2.data.MFB; -import is2.data.PipeGen; -import is2.data.SentenceData09; -import is2.io.CONLLReader09; -import is2.util.OptionsSuper; - -final public class PipeReranker extends PipeGen { - - public ExtractorReranker extractor; - final public MFB mf = new MFB(); - - Cluster cl; - - private OptionsSuper options; - public static long timeExtract; - - public PipeReranker(OptionsSuper o) { - options = o; - } - - public void createInstances(String file, Instances is) - // throws Exception - - { - - CONLLReader09 depReader = new CONLLReader09(file); - - mf.register(REL, "<root-type>"); - - // register at least one predicate since the parsing data might not - // contain predicates as in - // the Japaness corpus but the development sets contains some - - System.out.print("Registering feature parts of sentence: "); - int ic = 0; - int del = 0; - while (true) { - SentenceData09 instance = depReader.getNext(); - if (instance == null) - break; - ic++; - - if (ic % 1000 == 0) { - del = outValue(ic, del); - } - - String[] labs1 = instance.labels; - 
for (String element : labs1) - mf.register(REL, element); - - String[] w = instance.forms; - for (String element : w) - mf.register(WORD, depReader.normalize(element)); - - w = instance.plemmas; - for (String element : w) - mf.register(WORD, depReader.normalize(element)); - - w = instance.ppos; - for (String element : w) - mf.register(POS, element); - - w = instance.gpos; - for (String element : w) - mf.register(POS, element); - - if (instance.feats != null) { - String fs[][] = instance.feats; - for (String[] element : fs) { - w = element; - if (w == null) - continue; - for (String element2 : w) - mf.register(FEAT, element2); - } - } - - if ((ic - 1) > options.count) - break; - } - del = outValue(ic, del); - - System.out.println(); - ExtractorReranker.initFeatures(); - - ExtractorReranker.maxForm = mf.getFeatureCounter().get(WORD); - - if (options.clusterFile == null) - cl = new Cluster(); - else - cl = new Cluster(options.clusterFile, mf, 6); - - mf.calculateBits(); - ExtractorReranker.initStat(); - - System.out.println("" + mf.toString()); - - extractor.init(); - depReader.startReading(file); - - int num1 = 0; - - is.init(ic, new MFB()); - - Edges.init(mf.getFeatureCounter().get(POS)); - - del = 0; - - del = outValue(num1, del); - System.out.println(); - } - - public static ExecutorService executerService = java.util.concurrent.Executors.newFixedThreadPool(Parser.THREADS); - -} diff --git a/dependencyParser/mate-tools/src/is2/tag/Lexicon.java b/dependencyParser/mate-tools/src/is2/tag/Lexicon.java deleted file mode 100644 index f719f26..0000000 --- a/dependencyParser/mate-tools/src/is2/tag/Lexicon.java +++ /dev/null @@ -1,150 +0,0 @@ -/** - * - */ -package is2.tag; - -import java.io.BufferedReader; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; - -import is2.data.IEncoderPlus; -import is2.data.PipeGen; -import is2.util.DB; - -/** - * @author Dr. 
Bernd Bohnet, 07.01.2011 - * - * - */ -public class Lexicon { - - public static final String FR = "FR", TAG = "TAG"; - - final byte[][] word2tag; - - public Lexicon(byte[][] w2t) { - - word2tag = w2t; - } - - public Lexicon(String clusterFile, IEncoderPlus mf) { - - final String REGEX = "\t"; - - // register words - try { - BufferedReader inputReader = new BufferedReader( - new InputStreamReader(new FileInputStream(clusterFile), "UTF-8"), 32768); - - int cnt = 0; - String line; - while ((line = inputReader.readLine()) != null) { - - try { - String[] split = line.split(REGEX); - // int f = Integer.parseInt(split[2]); - // if (f>2) { - cnt++; - mf.register(PipeGen.WORD, split[0]); - mf.register(TAG, split[1]); // tag - - if (split.length > 1) - mf.register(FR, split[1]); // frequency - // } - } catch (Exception e) { - System.out.println("Error in lexicon line " + cnt + " error: " + e.getMessage()); - } - } - System.out.println("read number of words from lexicon " + cnt); - inputReader.close(); - - } catch (Exception e) { - e.printStackTrace(); - } - - word2tag = new byte[mf.getFeatureCounter().get(PipeGen.WORD)][1]; - // insert words - try { - String line; - BufferedReader inputReader = new BufferedReader( - new InputStreamReader(new FileInputStream(clusterFile), "UTF-8"), 32768); - - while ((line = inputReader.readLine()) != null) { - - String[] split = line.split(REGEX); - int w = mf.getValue(PipeGen.WORD, split[0]); - if (w < 0) - continue; - word2tag[w][0] = (byte) mf.getValue(TAG, split[1]); - // if (split.length>1) word2tag[w][1]= (byte)mf.getValue(FR, - // split[2]); // frequency - } - inputReader.close(); - int fill = 0; - for (byte[] element : word2tag) { - if (element[0] != 0) - fill++; - } - System.out.println("filled " + fill + " of " + word2tag.length); - - } catch (Exception e) { - e.printStackTrace(); - } - } - - /** - * Read the cluster - * - * @param dos - * @throws IOException - */ - public Lexicon(DataInputStream dis) throws IOException { - - 
word2tag = new byte[dis.readInt()][1]; - for (int i = 0; i < word2tag.length; i++) { - word2tag[i][0] = dis.readByte(); - // word2tag[i][1]=dis.readByte(); - } - DB.println("Read lexicon with " + word2tag.length + " words "); - } - - /** - * Write the cluster - * - * @param dos - * @throws IOException - */ - public void write(DataOutputStream dos) throws IOException { - - dos.writeInt(word2tag.length); - for (byte[] i : word2tag) { - dos.writeByte(i[0]); - // dos.writeByte(i[1]); - } - - } - - /** - * @param form - * @return - */ - public int getTag(int form) { - if (word2tag.length < form || form < 0) - return -1; - return word2tag[form][0]; - } - - /** - * @param form - * @return - */ - public int getConf(int form) { - if (word2tag.length < form || form < 0) - return -1; - return word2tag[form][1]; - } - -} diff --git a/dependencyParser/mate-tools/src/is2/tag/Options.java b/dependencyParser/mate-tools/src/is2/tag/Options.java deleted file mode 100644 index 0998c70..0000000 --- a/dependencyParser/mate-tools/src/is2/tag/Options.java +++ /dev/null @@ -1,132 +0,0 @@ -package is2.tag; - -import java.io.File; - -import is2.util.OptionsSuper; - -public final class Options extends OptionsSuper { - - public Options(String[] args) { - - for (int i = 0; i < args.length; i++) { - String[] pair = args[i].split(":"); - - if (pair[0].equals("--help")) - explain(); - else if (pair[0].equals("-train")) { - train = true; - trainfile = args[i + 1]; - } else if (pair[0].equals("-eval")) { - eval = true; - goldfile = args[i + 1]; - i++; - } else if (pair[0].equals("-test")) { - test = true; - testfile = args[i + 1]; - i++; - } else if (pair[0].equals("-i")) { - numIters = Integer.parseInt(args[i + 1]); - i++; - } else if (pair[0].equals("-out")) { - outfile = args[i + 1]; - i++; - } else if (pair[0].equals("-decode")) { - decodeProjective = args[i + 1].equals("proj"); - i++; - } else if (pair[0].equals("-confidence")) { - - conf = true; - } - - else if (pair[0].equals("-count")) { - 
count = Integer.parseInt(args[i + 1]); - i++; - } else if (pair[0].equals("-model")) { - modelName = args[i + 1]; - i++; - } else if (pair[0].equals("-tmp")) { - tmp = args[i + 1]; - i++; - } else if (pair[0].equals("-format")) { - // format = args[i+1]; - formatTask = Integer.parseInt(args[i + 1]); - i++; - } else if (pair[0].equals("-allfeatures")) { - allFeatures = true; - } else if (pair[0].equals("-nonormalize")) { - normalize = false; - } else if (pair[0].equals("-nframes")) { - // format = args[i+1]; - nbframes = args[i + 1]; - i++; - - } else if (pair[0].equals("-pframes")) { - // format = args[i+1]; - pbframes = args[i + 1]; - i++; - } else if (pair[0].equals("-nopred")) { - nopred = true; - } else if (pair[0].equals("-divide")) { - keep = true; - } else if (pair[0].equals("-lexicon")) { - lexicon = args[i + 1]; - i++; - - } else - super.addOption(args, i); - - } - - try { - - if (trainfile != null) { - - if (keep && tmp != null) { - trainforest = new File(tmp); - if (!trainforest.exists()) - keep = false; - - } else if (tmp != null) { - trainforest = File.createTempFile("train", ".tmp", new File(tmp)); - trainforest.deleteOnExit(); - } else { - trainforest = File.createTempFile("train", ".tmp"); // ,new - // File("F:\\") - trainforest.deleteOnExit(); - } - - } - - } catch (java.io.IOException e) { - System.out.println("Unable to create tmp files for feature forests!"); - System.out.println(e); - System.exit(0); - } - } - - private void explain() { - System.out.println("Usage: "); - System.out.println("java -class mate.jar is2.parser.Parser [Options]"); - System.out.println(); - System.out.println("Example: "); - System.out.println( - " java -class mate.jar is2.parser.Parser -model eps3.model -train corpora/conll08st/train/train.closed -test corpora/conll08st/devel/devel.closed -out b3.test -eval corpora/conll08st/devel/devel.closed -count 2000 -i 6"); - System.out.println(""); - System.out.println("Options:"); - System.out.println(""); - 
System.out.println(" -train <file> the corpus a model is trained on; default " + this.trainfile); - System.out.println(" -test <file> the input corpus for testing; default " + this.testfile); - System.out.println(" -out <file> the output corpus (result) of a test run; default " + this.outfile); - System.out.println(" -model <file> the parsing model for traing the model is stored in the files"); - System.out.println( - " and for parsing the model is load from this file; default " + this.modelName); - System.out.println( - " -i <number> the number of training iterations; good numbers are 10 for smaller corpora and 6 for bigger; default " - + this.numIters); - System.out.println(" -count <number> the n first sentences of the corpus are take for the training default " - + this.count); - System.out.println(" -format <number> conll format of the year 8 or 9; default " + this.formatTask); - - System.exit(0); - } -} diff --git a/dependencyParser/mate-tools/src/is2/tag/package.html b/dependencyParser/mate-tools/src/is2/tag/package.html deleted file mode 100644 index 469fdf6..0000000 --- a/dependencyParser/mate-tools/src/is2/tag/package.html +++ /dev/null @@ -1,4 +0,0 @@ -Package info -<br><br> -This parser includes a tagger into the dependency parser -<br> \ No newline at end of file diff --git a/dependencyParser/mate-tools/src/is2/tools/Retrainable.java b/dependencyParser/mate-tools/src/is2/tools/Retrainable.java deleted file mode 100644 index 86fbfcc..0000000 --- a/dependencyParser/mate-tools/src/is2/tools/Retrainable.java +++ /dev/null @@ -1,30 +0,0 @@ -package is2.tools; - -import is2.data.SentenceData09; - -/** - * Provides Methods for the retraining - * - * @author bohnetbd - * - */ -public interface Retrainable { - - /** - * Retrains with a update factor (upd). The retraining stops when the model - * was successful adapted or it gave up after the maximal iterations. - * - * @param sentence - * the data container of the new example. 
- * @param upd - * the update factor, e.g. 0.01 - * @param iterations - * maximal number of iterations that are tried to adapt the - * system. - * @return success = true -- else false - */ - public boolean retrain(SentenceData09 sentence, float upd, int iterations); - - boolean retrain(SentenceData09 sentence, float upd, int iterations, boolean print); - -} diff --git a/dependencyParser/mate-tools/src/is2/util/Convert0409.java b/dependencyParser/mate-tools/src/is2/util/Convert0409.java deleted file mode 100644 index b735ad8..0000000 --- a/dependencyParser/mate-tools/src/is2/util/Convert0409.java +++ /dev/null @@ -1,176 +0,0 @@ -/** - * - */ -package is2.util; - -import is2.data.SentenceData09; -import is2.io.CONLLReader04; -import is2.io.CONLLReader09; -import is2.io.CONLLWriter06; -import is2.io.CONLLWriter09; - -/** - * @author Dr. Bernd Bohnet, 01.03.2010 - * - * - */ -public class Convert0409 { - - public static void main(String args[]) throws Exception { - - convert(args[0], args[1]); - - } - - public static void convert(String source, String target) throws Exception { - - CONLLReader04 reader = new CONLLReader04(source); - CONLLWriter09 writer = new CONLLWriter09(target); - - int str = 0; - while (true) { - SentenceData09 i = reader.getNext(); - str++; - if (i == null) - break; - - String[] formsNoRoot = new String[i.length() - 1]; - String[] posNoRoot = new String[formsNoRoot.length]; - String[] lemmas = new String[formsNoRoot.length]; - - String[] org_lemmas = new String[formsNoRoot.length]; - - String[] of = new String[formsNoRoot.length]; - String[] pf = new String[formsNoRoot.length]; - - String[] pposs = new String[formsNoRoot.length]; - String[] labels = new String[formsNoRoot.length]; - String[] fillp = new String[formsNoRoot.length]; - - int[] heads = new int[formsNoRoot.length]; - - for (int j = 0; j < formsNoRoot.length; j++) { - formsNoRoot[j] = i.forms[j + 1]; - if (formsNoRoot[j].length() == 0 || formsNoRoot[j].equals("")) { - 
System.out.println("error forms " + str); - // System.exit(0); - formsNoRoot[j] = " "; - } - posNoRoot[j] = i.gpos[j + 1]; - if (posNoRoot[j].length() == 0 || posNoRoot[j].equals(" ")) { - System.out.println("error pos " + str); - // System.exit(0); - } - pposs[j] = i.ppos[j + 1]; - if (pposs[j].length() == 0 || pposs[j].equals(" ")) { - System.out.println("error pos " + str); - // System.exit(0); - } - - labels[j] = i.labels[j + 1]; - if (labels[j].length() == 0 || labels[j].equals(" ")) { - System.out.println("error lab " + str); - // System.exit(0); - } - heads[j] = i.heads[j + 1]; - if (heads[j] > posNoRoot.length) { - System.out.println("head out of range " + heads[j] + " " + heads.length + " " + str); - heads[j] = posNoRoot.length; - } - - lemmas[j] = i.plemmas[j + 1]; - if (lemmas[j].length() == 0 || lemmas[j].equals(" ")) { - System.out.println("error lab " + str); - // System.exit(0); - } - org_lemmas[j] = i.lemmas[j + 1]; - if (org_lemmas[j].length() == 0 || org_lemmas[j].equals(" ")) { - System.out.println("error lab " + str); - // System.exit(0); - } - of[j] = i.ofeats[j + 1]; - pf[j] = i.pfeats[j + 1]; - if (str == 6099) { - // System.out.println(formsNoRoot[j]+"\t"+posNoRoot[j]+"\t"+pposs[j]+"\t"+labels[j]+"\t"+heads[j]); - } - - // (instance.fillp!=null) fillp[j] = instance.fillp[j+1]; - } - - SentenceData09 i09 = new SentenceData09(formsNoRoot, lemmas, org_lemmas, pposs, pposs, labels, heads, fillp, - of, pf); - - // public SentenceData09(String[] forms, String[] lemmas, String[] - // olemmas,String[] gpos, String[] ppos, String[] labs, int[] heads, - // String[] fillpred) { - // SentenceData09 - // SentenceData09 i2 = new SentenceData09(i.forms, - // i.lemmas,i.org_lemmas,); - - writer.write(i09); - - } - writer.finishWriting(); - - } - - public static void convert0906(String source, String target) throws Exception { - - CONLLReader09 reader = new CONLLReader09(source); - CONLLWriter06 writer = new CONLLWriter06(target); - - while (true) { - 
SentenceData09 i = reader.getNext(); - - if (i == null) - break; - - String[] formsNoRoot = new String[i.length() - 1]; - String[] posNoRoot = new String[formsNoRoot.length]; - String[] lemmas = new String[formsNoRoot.length]; - - String[] org_lemmas = new String[formsNoRoot.length]; - - String[] of = new String[formsNoRoot.length]; - String[] pf = new String[formsNoRoot.length]; - - String[] pposs = new String[formsNoRoot.length]; - String[] labels = new String[formsNoRoot.length]; - String[] fillp = new String[formsNoRoot.length]; - - int[] heads = new int[formsNoRoot.length]; - - for (int j = 0; j < formsNoRoot.length; j++) { - formsNoRoot[j] = i.forms[j + 1]; - posNoRoot[j] = i.gpos[j + 1]; - pposs[j] = i.ppos[j + 1]; - - labels[j] = i.labels[j + 1]; - heads[j] = i.heads[j + 1]; - lemmas[j] = i.plemmas[j + 1]; - - org_lemmas[j] = i.lemmas[j + 1]; - of[j] = i.ofeats[j + 1]; - pf[j] = i.pfeats[j + 1]; - - // (instance.fillp!=null) fillp[j] = instance.fillp[j+1]; - } - - SentenceData09 i09 = new SentenceData09(formsNoRoot, lemmas, org_lemmas, posNoRoot, pposs, labels, heads, - fillp, of, pf); - - // public SentenceData09(String[] forms, String[] lemmas, String[] - // olemmas,String[] gpos, String[] ppos, String[] labs, int[] heads, - // String[] fillpred) { - // SentenceData09 - // SentenceData09 i2 = new SentenceData09(i.forms, - // i.lemmas,i.org_lemmas,); - - writer.write(i09); - - } - writer.finishWriting(); - - } - -} diff --git a/dependencyParser/mate-tools/src/is2/util/ConvertADJ.java b/dependencyParser/mate-tools/src/is2/util/ConvertADJ.java deleted file mode 100644 index e6ca6c1..0000000 --- a/dependencyParser/mate-tools/src/is2/util/ConvertADJ.java +++ /dev/null @@ -1,121 +0,0 @@ -/** - * - */ -package is2.util; - -import is2.data.SentenceData09; -import is2.io.CONLLReader09; -import is2.io.CONLLWriter06; - -/** - * @author Dr. 
Bernd Bohnet, 01.03.2010 - * - * - */ -public class ConvertADJ { - - public static void main(String args[]) throws Exception { - - convert(args[0], args[1]); - - } - - public static void convert(String source, String target) throws Exception { - - CONLLReader09 reader = new CONLLReader09(source); - // CONLLWriter09 writer = new CONLLWriter09(target); - int adj = 0, argadj = 0; - int rb = 0, argrb = 0; - while (true) { - SentenceData09 i = reader.getNext(); - if (i == null) - break; - - for (int k = 0; k < i.length(); k++) { - - if (i.gpos[k].startsWith("JJ")) - adj++; - if (i.gpos[k].startsWith("RB")) - rb++; - - if (i.argposition != null) { - for (int[] element : i.argposition) { - if (element != null) - for (int a = 0; a < element.length; a++) { - if (element[a] == k && i.gpos[k].startsWith("JJ")) - argadj++; - if (element[a] == k && i.gpos[k].startsWith("RB")) - argrb++; - } - - } - } - // (instance.fillp!=null) fillp[j] = instance.fillp[j+1]; - } - - } - System.out.println("adj " + adj + " " + argadj); - System.out.println("rb " + rb + " " + argrb); - - } - - public static void convert0906(String source, String target) throws Exception { - - CONLLReader09 reader = new CONLLReader09(source); - CONLLWriter06 writer = new CONLLWriter06(target); - - while (true) { - SentenceData09 i = reader.getNext(); - - if (i == null) - break; - - String[] formsNoRoot = new String[i.length() - 1]; - String[] posNoRoot = new String[formsNoRoot.length]; - String[] lemmas = new String[formsNoRoot.length]; - - String[] org_lemmas = new String[formsNoRoot.length]; - - String[] of = new String[formsNoRoot.length]; - String[] pf = new String[formsNoRoot.length]; - - String[] pposs = new String[formsNoRoot.length]; - String[] labels = new String[formsNoRoot.length]; - String[] fillp = new String[formsNoRoot.length]; - - int[] heads = new int[formsNoRoot.length]; - - for (int j = 0; j < formsNoRoot.length; j++) { - formsNoRoot[j] = i.forms[j + 1]; - posNoRoot[j] = i.gpos[j + 1]; - 
pposs[j] = i.ppos[j + 1]; - - labels[j] = i.labels[j + 1]; - heads[j] = i.heads[j + 1]; - lemmas[j] = i.plemmas[j + 1]; - - org_lemmas[j] = i.lemmas[j + 1]; - of[j] = i.ofeats[j + 1]; - pf[j] = i.pfeats[j + 1]; - - // (instance.fillp!=null) fillp[j] = instance.fillp[j+1]; - } - - SentenceData09 i09 = new SentenceData09(formsNoRoot, lemmas, org_lemmas, posNoRoot, pposs, labels, heads, - fillp, of, pf); - - // public SentenceData09(String[] forms, String[] lemmas, String[] - // olemmas,String[] gpos, String[] ppos, String[] labs, int[] heads, - // String[] fillpred) { - // SentenceData09 - // SentenceData09 i2 = new SentenceData09(i.forms, - // i.lemmas,i.org_lemmas,); - - writer.write(i09); - - } - writer.finishWriting(); - - } - -} diff --git a/dependencyParser/mate-tools/src/is2/util/ConvertLowerCase0909.java b/dependencyParser/mate-tools/src/is2/util/ConvertLowerCase0909.java deleted file mode 100644 index e5842d6..0000000 --- a/dependencyParser/mate-tools/src/is2/util/ConvertLowerCase0909.java +++ /dev/null @@ -1,76 +0,0 @@ -/** - * - */ -package is2.util; - -import is2.data.SentenceData09; -import is2.io.CONLLReader09; -import is2.io.CONLLWriter09; - -/** - * @author Dr. 
Bernd Bohnet, 01.03.2010 - * - * - */ -public class ConvertLowerCase0909 { - - public static void main(String args[]) throws Exception { - - CONLLReader09 reader = new CONLLReader09(args[0]); - CONLLWriter09 writer = new CONLLWriter09(args[1]); - - while (true) { - SentenceData09 i = reader.getNext(); - if (i == null) - break; - - SentenceData09 i09 = new SentenceData09(i); - i09.createSemantic(i); - - for (int k = 0; k < i09.length(); k++) { - i09.lemmas[k] = i09.lemmas[k].toLowerCase(); - i09.plemmas[k] = i09.plemmas[k].toLowerCase(); - - } - - writer.write(i09); - - } - writer.finishWriting(); - - } - - public static void convert(String source, String target) throws Exception { - - CONLLReader09 reader = new CONLLReader09(source); - CONLLWriter09 writer = new CONLLWriter09(target); - - while (true) { - SentenceData09 i = reader.getNext(); - if (i == null) - break; - - SentenceData09 i09 = new SentenceData09(i); - i09.createSemantic(i); - - for (int k = 0; k < i09.length(); k++) { - i09.lemmas[k] = i09.lemmas[k].toLowerCase(); - i09.plemmas[k] = i09.plemmas[k].toLowerCase(); - - } - - // public SentenceData09(String[] forms, String[] lemmas, String[] - // olemmas,String[] gpos, String[] ppos, String[] labs, int[] heads, - // String[] fillpred) { - // SentenceData09 - // SentenceData09 i2 = new SentenceData09(i.forms, - // i.lemmas,i.org_lemmas,); - - writer.write(i09); - - } - writer.finishWriting(); - - } - -} diff --git a/dependencyParser/mate-tools/src/is2/util/ConvertTiger2CoNLL.java b/dependencyParser/mate-tools/src/is2/util/ConvertTiger2CoNLL.java deleted file mode 100644 index e650737..0000000 --- a/dependencyParser/mate-tools/src/is2/util/ConvertTiger2CoNLL.java +++ /dev/null @@ -1,120 +0,0 @@ -/** - * - */ -package is2.util; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.StringTokenizer; - -/** - * @author Dr. 
Bernd Bohnet, 17.01.2010 - * - * This class removes all information from a conll 2009 file except of - * columns 1 and 2 that contain the word id and the word form. - */ -public class ConvertTiger2CoNLL { - - public static void main(String[] args) throws IOException { - - OptionsSuper options = new OptionsSuper(args, null); - - if (options.trainfile != null) { - System.err.println( - "included sentences " + clean(options.trainfile, options.outfile, options.start, options.count)); - } else - System.err.println("Please proivde the file name -train <file-name>"); - - } - - /** - * @param trainfile - * @throws IOException - */ - private static int clean(String file, String outFile, int start, int numberOfSentences) throws IOException { - - System.err.println("writting to " + outFile); - System.err.println("start " + start + " to " + (start + numberOfSentences)); - int state = 0; - - BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"), 32768); - BufferedWriter writer = new BufferedWriter( - new java.io.OutputStreamWriter(new java.io.FileOutputStream(outFile), "UTF-8"), 32768); - String l = null; - try { - - int id = 1, snt = 0, cnt = 0; - - while ((l = reader.readLine()) != null) { - - if (l.startsWith("#BOS")) { - state = 1; // BOS - id = 1; - snt++; - continue; - } - if (l.startsWith("#EOS") && state == 1) { - state = 2; // BOS - cnt++; - - writer.newLine(); - } - - if (start > snt || (start + numberOfSentences) <= snt) { - state = 3; - } - - if (l.startsWith("#5") || l.startsWith("#6") || l.startsWith("#7")) - continue; - if ((start + numberOfSentences) <= snt) - break; - - if (state == 3) - continue; - - if (state == 1) { - - l = l.replace("\t\t", "\t"); - l = l.replace("\t\t", "\t"); - - StringTokenizer t = new StringTokenizer(l, "\t"); - int count = 0; - - writer.write("" + id + "\t"); - - while (t.hasMoreTokens()) { - if (count == 0) { - writer.write(t.nextToken() + "\t"); - } else if (count == 1) { - 
writer.write(t.nextToken() + "\t_\t"); - } else if (count == 2) { - writer.write(t.nextToken() + "\t_\t"); - } else if (count == 3) { - writer.write(t.nextToken().replace(".", "|") + "\t_\t"); - } else { - t.nextToken(); - } - count++; - } - writer.write("_\t_\t_\t_\t_\t_\t_\t_\t_"); - writer.newLine(); - } - id++; - } - writer.flush(); - writer.close(); - reader.close(); - - return cnt; - } catch (IOException e) { - e.printStackTrace(); - } - - return -1; - - } - -} diff --git a/dependencyParser/mate-tools/src/is2/util/DB.java b/dependencyParser/mate-tools/src/is2/util/DB.java deleted file mode 100755 index 30fd231..0000000 --- a/dependencyParser/mate-tools/src/is2/util/DB.java +++ /dev/null @@ -1,78 +0,0 @@ -package is2.util; - -import java.util.Calendar; -import java.util.GregorianCalendar; - -public class DB { - - private static final String ARROW = " -> "; - private static final String LEER = " "; - private static final String BIG = " "; - - private static boolean debug = true; - - final static public void println(Object err) { - - if (!debug) - return; - - StackTraceElement[] ste = new Exception().getStackTrace(); - - StringBuffer msg = new StringBuffer(); - msg.append((getDate().append(LEER).substring(0, 10))); - msg.append(' '); - msg.append(ste[1].getClassName() + " " + ste[1].getLineNumber()); - msg.append(':'); - msg.append(ste[1].getMethodName()); - msg.append(ARROW); - - int l = 55 - msg.length(); - if (l < 0) - l = 0; - msg.append(BIG.substring(0, l)); - - // if ((m_depth >= 0) && (m_depth < (BIG.length()) )) { - // vDebugMessage.append(BIG.substring(0, m_depth*2)); - // } - - msg.append(err); - - System.err.println(msg); - - } - - final static public void prints(Object err) { - - if (!debug) - return; - System.err.println(err); - - } - - final private static StringBuffer getDate() { - // if (Preferences.s_debug <= BDebug.FAIL) return s_sb; - - GregorianCalendar s_cal = new GregorianCalendar(); - StringBuffer sb = new StringBuffer(); - // 
sb.append(s_cal.get(Calendar.HOUR_OF_DAY)); - // sb.append('_'); - sb.append(s_cal.get(Calendar.MINUTE)); - sb.append('.'); - sb.append(s_cal.get(Calendar.SECOND)); - sb.append('.'); - sb.append(s_cal.get(Calendar.MILLISECOND)); - - return sb; - } - - public static void setDebug(boolean b) { - debug = b; - - } - - public static boolean getDebug() { - - return debug; - } - -} diff --git a/dependencyParser/mate-tools/src/is2/util/Edges.java b/dependencyParser/mate-tools/src/is2/util/Edges.java deleted file mode 100644 index 2457cae..0000000 --- a/dependencyParser/mate-tools/src/is2/util/Edges.java +++ /dev/null @@ -1,197 +0,0 @@ -/** - * - */ -package is2.util; - -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.IOException; -import java.util.Comparator; -import java.util.HashMap; -import java.util.Map.Entry; - -/** - * @author Dr. Bernd Bohnet, 13.05.2009; - * - * - */ -public final class Edges { - - private static short[][][] edges; - private static HashMap<Short, Integer> labelCount = new HashMap<Short, Integer>(); - - private static HashMap<String, Integer> slabelCount = new HashMap<String, Integer>(); - - static short[] def = new short[1]; - - private Edges() { - } - - /** - * @param length - */ - public static void init(int length) { - edges = new short[length][length][]; - } - - public static void findDefault() { - - int best = 0; - - for (Entry<Short, Integer> e : labelCount.entrySet()) { - - if (best < e.getValue()) { - best = e.getValue(); - def[0] = e.getKey(); - } - } - - // labelCount=null; - // String[] types = new String[mf.getFeatureCounter().get(PipeGen.REL)]; - // for (Entry<String, Integer> e : - // MFO.getFeatureSet().get(PipeGen.REL).entrySet()) types[e.getValue()] - // = e.getKey(); - - is2.util.DB.println("set default label to " + def[0] + " "); - - // System.out.println("found default "+def[0]); - - } - - final static public void put(int pos1, int pos2, short label) { - putD(pos1, pos2, label); - // putD(pos2, 
pos1,!dir, label); - } - - final static public void putD(int pos1, int pos2, short label) { - - Integer lc = labelCount.get(label); - if (lc == null) - labelCount.put(label, 1); - else - labelCount.put(label, lc + 1); - - String key = pos1 + "-" + pos2 + label; - Integer lcs = slabelCount.get(key); - if (lcs == null) - slabelCount.put(key, 1); - else - slabelCount.put(key, lcs + 1); - - if (edges[pos1][pos2] == null) { - edges[pos1][pos2] = new short[1]; - edges[pos1][pos2][0] = label; - - // edgesh[pos1][pos2][dir?0:1] = new TIntHashSet(2); - // edgesh[pos1][pos2][dir?0:1].add(label); - } else { - short labels[] = edges[pos1][pos2]; - for (short l : labels) { - // contains label already? - if (l == label) - return; - } - - short[] nlabels = new short[labels.length + 1]; - System.arraycopy(labels, 0, nlabels, 0, labels.length); - nlabels[labels.length] = label; - edges[pos1][pos2] = nlabels; - - // edgesh[pos1][pos2][dir?0:1].add(label); - } - } - - final static public short[] get(int pos1, int pos2) { - - if (pos1 < 0 || pos2 < 0 || edges[pos1][pos2] == null) - return def; - return edges[pos1][pos2]; - } - - /** - * @param dis - */ - static public void write(DataOutputStream d) throws IOException { - - int len = edges.length; - d.writeShort(len); - - for (int p1 = 0; p1 < len; p1++) { - for (int p2 = 0; p2 < len; p2++) { - if (edges[p1][p2] == null) - d.writeShort(0); - else { - d.writeShort(edges[p1][p2].length); - for (int l = 0; l < edges[p1][p2].length; l++) { - d.writeShort(edges[p1][p2][l]); - } - - } - } - } - - d.writeShort(def[0]); - - } - - /** - * @param dis - */ - public static void read(DataInputStream d) throws IOException { - int len = d.readShort(); - - edges = new short[len][len][]; - for (int p1 = 0; p1 < len; p1++) { - for (int p2 = 0; p2 < len; p2++) { - int ll = d.readShort(); - if (ll == 0) { - edges[p1][p2] = null; - } else { - edges[p1][p2] = new short[ll]; - for (int l = 0; l < ll; l++) { - edges[p1][p2][l] = d.readShort(); - } - } - } - } 
- - def[0] = d.readShort(); - - } - - public static class C implements Comparator<Short> { - - public C() { - super(); - } - - String _key; - - public C(String key) { - super(); - _key = key; - } - - /* - * (non-Javadoc) - * - * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) - */ - @Override - public int compare(Short l1, Short l2) { - - // int c1 = labelCount.get(l1); - // int c2 = labelCount.get(l2); - // if (true) return c1==c2?0:c1>c2?-1:1; - - int x1 = slabelCount.get(_key + l1.shortValue()); - int x2 = slabelCount.get(_key + l2.shortValue()); - // System.out.println(x1+" "+x2); - - return x1 == x2 ? 0 : x1 > x2 ? -1 : 1; - - } - - } - -} diff --git a/dependencyParser/mate-tools/src/is2/util/ExtractParagraphs.java b/dependencyParser/mate-tools/src/is2/util/ExtractParagraphs.java deleted file mode 100644 index aa65d8d..0000000 --- a/dependencyParser/mate-tools/src/is2/util/ExtractParagraphs.java +++ /dev/null @@ -1,74 +0,0 @@ -package is2.util; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; - -public class ExtractParagraphs { - - /** - * - * @param args - * @throws IOException - */ - public static void main(String args[]) throws IOException { - - if (args.length < 1) { - System.out.println("Please provide a file name."); - System.exit(0); - } - - File file = new File(args[0]); - file.isDirectory(); - String[] dirs = file.list(); - - BufferedWriter write = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[1]), "UTF-8"), - 32768); - int cnt = 0; - - for (String fileName : dirs) { - BufferedReader reader = new BufferedReader( - new InputStreamReader(new FileInputStream(args[0] + fileName), "UTF-8"), 32768); - - int state = 0; - - String s; - while ((s = reader.readLine()) != null) { - - if (s.startsWith("<P>") || 
s.startsWith("<p>")) { - state = 1; // paragraph start - continue; - } - - if (s.startsWith("</P>") || s.startsWith("</p>")) { - state = 2; // paragraph end - write.newLine(); - } - - if (state == 1) { - String sp[] = s.split("\\. "); - for (String p : sp) { - write.write(p); - // if (sp.length>1) write.newLine(); - } - cnt++; - } - } - - // if (cnt>5000) break; - - reader.close(); - } - write.flush(); - write.close(); - - System.out.println("Extract " + cnt + " lines "); - - } - -} diff --git a/dependencyParser/mate-tools/src/is2/util/IntStack.java b/dependencyParser/mate-tools/src/is2/util/IntStack.java deleted file mode 100644 index b291d16..0000000 --- a/dependencyParser/mate-tools/src/is2/util/IntStack.java +++ /dev/null @@ -1,90 +0,0 @@ -/** - * - */ -package is2.util; - -/** - * @author Dr. Bernd Bohnet, 01.06.2011 - * - * - */ -final public class IntStack { - - final public int[] stack; - public int position = -1; - - public IntStack(int size) { - if (size <= 0) - stack = new int[1]; - else - stack = new int[size + 1]; - } - - public IntStack(IntStack s) { - stack = s.stack; - position = s.position; - } - - public int peek() { - return position == -1 ? -1 : stack[position]; - } - - public void push(int i) { - // if (i ==2)new Exception().printStackTrace(); - stack[++position] = i; - } - - public int pop() { - return position == -1 ? -1 : stack[position--]; - } - - public int size() { - return position + 1; - } - - public boolean isEmpty() { - return position == -1 ? 
true : false; - } - - public int get(int p) { - return stack[p]; - } - - public void clear() { - position = -1; - } - - /** - * @param b - */ - public void addAll(IntStack b) { - - position = b.position; - if (position < 0) - return; - - for (int k = 0; k <= position; k++) - stack[k] = b.stack[k]; - - } - - public boolean contains(int s) { - ; - - for (int k = 0; k <= position; k++) - if (stack[k] == s) - return true; - - return false; - } - - @Override - public String toString() { - StringBuffer s = new StringBuffer(); - for (int k = position; k >= 0; k--) { - s.append(k).append(":").append(this.stack[k]).append(" "); - } - return s.toString(); - } - -} diff --git a/dependencyParser/mate-tools/src/is2/util/Long2Int.java b/dependencyParser/mate-tools/src/is2/util/Long2Int.java deleted file mode 100644 index e6ef45c..0000000 --- a/dependencyParser/mate-tools/src/is2/util/Long2Int.java +++ /dev/null @@ -1,81 +0,0 @@ -package is2.util; - -import is2.data.Long2IntInterface; - -/** - * @author Bernd Bohnet, 01.09.2009 - * - * Maps for the Hash Kernel the long values to the int values. 
- */ -final public class Long2Int implements Long2IntInterface { - - public Long2Int() { - size = 115911564; - } - - public Long2Int(int s) { - size = s; - } - - /** Integer counter for long2int */ - final private int size; // 0x03ffffff //0x07ffffff - - /* - * (non-Javadoc) - * - * @see is2.sp09k9992.Long2IntIterface#size() - */ - @Override - public int size() { - return size; - } - - /* - * (non-Javadoc) - * - * @see is2.sp09k9992.Long2IntIterface#start() has no meaning for this - * implementation - */ - final public void start() { - } - - /* - * (non-Javadoc) - * - * @see is2.sp09k9992.Long2IntIterface#l2i(long) - */ - @Override - final public int l2i(long l) { - if (l < 0) - return -1; - - // this works well LAS 88.138 - // int r= (int)(( l ^ (l&0xffffffff00000000L) >>> 29 ));//0x811c9dc5 ^ - // // 29 - // return Math.abs(r % size); - // this works a bit better and good with 0x03ffffff - // - /* - * long r= l;//26 l = (l>>12)&0xfffffffffffff000L; r ^= l;//38 l = - * (l>>11)&0xffffffffffffc000L; r ^= l;//49 l = (l>>9)& - * 0xffffffffffff0000L; //53 r ^= l;//58 l = (l>>7)&0xfffffffffffc0000L; - * //62 r ^=l;//65 int x = (int)r; x = x % size; // return x >= 0 ? x : - * -x ;// Math.abs(r % size); - * - */ - // 26 0x03ffffff - // together with 0x07ffffff 27 88.372 - long r = l;// 27 - l = (l >> 13) & 0xffffffffffffe000L; - r ^= l; // 40 - l = (l >> 11) & 0xffffffffffff0000L; - r ^= l; // 51 - l = (l >> 9) & 0xfffffffffffc0000L; // 53 - r ^= l; // 60 - l = (l >> 7) & 0xfffffffffff00000L; // 62 - r ^= l; // 67 - int x = ((int) r) % size; - - return x >= 0 ? 
x : -x; - } -} diff --git a/dependencyParser/mate-tools/src/is2/util/Options.java b/dependencyParser/mate-tools/src/is2/util/Options.java deleted file mode 100644 index 30b53b0..0000000 --- a/dependencyParser/mate-tools/src/is2/util/Options.java +++ /dev/null @@ -1,133 +0,0 @@ -package is2.util; - -import java.io.File; - -public final class Options extends OptionsSuper { - - public Options(String[] args) { - - for (int i = 0; i < args.length; i++) { - String[] pair = args[i].split(":"); - - if (pair[0].equals("--help")) - explain(); - else if (pair[0].equals("-train")) { - train = true; - trainfile = args[i + 1]; - } else if (pair[0].equals("-eval")) { - eval = true; - goldfile = args[i + 1]; - i++; - } else if (pair[0].equals("-test")) { - test = true; - testfile = args[i + 1]; - i++; - } else if (pair[0].equals("-i")) { - numIters = Integer.parseInt(args[i + 1]); - i++; - } else if (pair[0].equals("-out")) { - outfile = args[i + 1]; - i++; - } else if (pair[0].equals("-decode")) { - decodeProjective = args[i + 1].equals("proj"); - i++; - } else if (pair[0].equals("-confidence")) { - - conf = true; - } - - else if (pair[0].equals("-count")) { - count = Integer.parseInt(args[i + 1]); - i++; - } else if (pair[0].equals("-model")) { - modelName = args[i + 1]; - i++; - } else if (pair[0].equals("-device")) { - device = args[i + 1]; - i++; - } else if (pair[0].equals("-tmp")) { - tmp = args[i + 1]; - i++; - } else if (pair[0].equals("-format")) { - // format = args[i+1]; - formatTask = Integer.parseInt(args[i + 1]); - i++; - } else if (pair[0].equals("-allfeatures")) { - allFeatures = true; - } else if (pair[0].equals("-nonormalize")) { - normalize = false; - } else if (pair[0].equals("-nframes")) { - // format = args[i+1]; - nbframes = args[i + 1]; - i++; - - } else if (pair[0].equals("-pframes")) { - // format = args[i+1]; - pbframes = args[i + 1]; - i++; - } else if (pair[0].equals("-nopred")) { - nopred = true; - } else if (pair[0].equals("-divide")) { - keep = 
true; - } else if (pair[0].equals("-lexicon")) { - lexicon = args[i + 1]; - i++; - - } else - super.addOption(args, i); - - } - - try { - - if (trainfile != null) { - - if (keep && tmp != null) { - trainforest = new File(tmp); - if (!trainforest.exists()) - keep = false; - - } else if (tmp != null) { - trainforest = File.createTempFile("train", ".tmp", new File(tmp)); - trainforest.deleteOnExit(); - } else { - trainforest = File.createTempFile("train", ".tmp"); // ,new - // File("F:\\") - trainforest.deleteOnExit(); - } - - } - - } catch (java.io.IOException e) { - System.out.println("Unable to create tmp files for feature forests!"); - System.out.println(e); - System.exit(0); - } - } - - private void explain() { - System.out.println("Usage: "); - System.out.println("java -class mate.jar is2.parser.Parser [Options]"); - System.out.println(); - System.out.println("Example: "); - System.out.println( - " java -class mate.jar is2.parser.Parser -model eps3.model -train corpora/conll08st/train/train.closed -test corpora/conll08st/devel/devel.closed -out b3.test -eval corpora/conll08st/devel/devel.closed -count 2000 -i 6"); - System.out.println(""); - System.out.println("Options:"); - System.out.println(""); - System.out.println(" -train <file> the corpus a model is trained on; default " + this.trainfile); - System.out.println(" -test <file> the input corpus for testing; default " + this.testfile); - System.out.println(" -out <file> the output corpus (result) of a test run; default " + this.outfile); - System.out.println(" -model <file> the parsing model for traing the model is stored in the files"); - System.out.println( - " and for parsing the model is load from this file; default " + this.modelName); - System.out.println( - " -i <number> the number of training iterations; good numbers are 10 for smaller corpora and 6 for bigger; default " - + this.numIters); - System.out.println(" -count <number> the n first sentences of the corpus are take for the training default " - 
+ this.count); - System.out.println(" -format <number> conll format of the year 8 or 9; default " + this.formatTask); - - System.exit(0); - } -} diff --git a/dependencyParser/mate-tools/src/is2/util/OptionsSuper.java b/dependencyParser/mate-tools/src/is2/util/OptionsSuper.java deleted file mode 100755 index f6370f7..0000000 --- a/dependencyParser/mate-tools/src/is2/util/OptionsSuper.java +++ /dev/null @@ -1,231 +0,0 @@ -package is2.util; - -import java.io.File; - -public class OptionsSuper { - - public String trainfile = null; - public String testfile = null; - public File trainforest = null; - - public String nbframes = null; - public String pbframes = null; - - public boolean nopred = false; - public boolean upper = false; - - public boolean train = false; - public boolean eval = false; - public boolean test = false; - public boolean keep = false; - public boolean flt = false; - public boolean loadTaggerModels = false; - - public String modelName = "prs.mdl"; - public String modelTaggerName = null; - - public String useMapping = null; - public String device = "C:"; - public String tmp = null; - public boolean createForest = true; - public boolean decodeProjective = false; - public double decodeTH = 0.3d; - public String format = "CONLL"; - public int formatTask = 9; - public int numIters = 10; - public int best = 1000; - public String outfile = "dp.conll"; - public String charset = "UTF-8"; - public String phraseTrain = null; - public String phraseTest = null; - public String goldfile = null; - public String gout = "sec23.gld"; - public String features = null; - public String lexicon = null; - public int hsize = 0x07ffffff; - public int maxLen = 2000; - public int maxForms = Integer.MAX_VALUE; - public int beam = 4; - public float prune = -100000000; - - public String third = ""; - public String second = ""; - public String first = ""; - - public int cross = 10; - - // public boolean secondOrder = true; - public boolean useRelationalFeatures = false; - public int 
count = 10000000; - public int cores = Integer.MAX_VALUE; - public int start = 0; - public int minOccureForms = 0; - public int tt = 30; // tagger averaging - public boolean allFeatures = false; - public boolean normalize = false; - public boolean no2nd = false; - public boolean noLemmas = false; - public boolean few2nd = false, noLinear = false, noMorph = false; - public String clusterFile; - - // output confidence values - public boolean conf = false; - public String phraseFormat = "penn"; // tiger | penn - public boolean average = true; - public boolean label = false; - public boolean stack = false; - public boolean oneRoot = false; - - public String significant1 = null, significant2 = null; - - // horizontal stacking - public int minLength = 0, maxLength = Integer.MAX_VALUE; - public boolean overwritegold = false; - - public static final int MULTIPLICATIVE = 1, SHIFT = 2; - public int featureCreation = MULTIPLICATIVE; - - public OptionsSuper(String[] args, String dummy) { - - for (int i = 0; i < args.length; i++) { - i = addOption(args, i); - } - - } - - public OptionsSuper() { - } - - public int addOption(String args[], int i) { - - if (args[i].equals("-train")) { - train = true; - trainfile = args[i + 1]; - } else if (args[i].equals("-eval")) { - eval = true; - goldfile = args[i + 1]; - i++; - } else if (args[i].equals("-gout")) { - gout = args[i + 1]; - i++; - } else if (args[i].equals("-test")) { - test = true; - testfile = args[i + 1]; - i++; - } else if (args[i].equals("-sig1")) { - significant1 = args[i + 1]; - i++; - } else if (args[i].equals("-sig2")) { - significant2 = args[i + 1]; - i++; - } else if (args[i].equals("-i")) { - numIters = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-out")) { - outfile = args[i + 1]; - i++; - } else if (args[i].equals("-cluster")) { - clusterFile = args[i + 1]; - i++; - } - - else if (args[i].equals("-count")) { - count = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-model")) 
{ - modelName = args[i + 1]; - i++; - } else if (args[i].equals("-tmodel")) { - this.modelTaggerName = args[i + 1]; - i++; - } else if (args[i].equals("-nonormalize")) { - normalize = false; - } else if (args[i].equals("-float")) { - flt = true; - } else if (args[i].equals("-hsize")) { - hsize = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-charset")) { - charset = args[++i]; - } else if (args[i].equals("-pstrain")) { - this.phraseTrain = args[i + 1]; - i++; - } else if (args[i].equals("-pstest")) { - this.phraseTest = args[i + 1]; - i++; - } else if (args[i].equals("-len")) { - maxLen = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-cores")) { - cores = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-start")) { - start = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-max")) { - maxLength = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-min")) { - minLength = Integer.parseInt(args[i + 1]); - i++; - } else if (args[i].equals("-noLemmas")) { - noLemmas = true; - } else if (args[i].equals("-noavg")) { - this.average = false; - } else if (args[i].equals("-label")) { - label = true; - } else if (args[i].equals("-stack")) { - stack = true; - } else if (args[i].equals("-overwritegold")) { - overwritegold = true; - } else if (args[i].equals("-format")) { - formatTask = Integer.parseInt(args[++i]); - } else if (args[i].equals("-tt")) { - tt = Integer.parseInt(args[++i]); - } else if (args[i].equals("-min-occure-forms")) { - minOccureForms = Integer.parseInt(args[++i]); - } else if (args[i].equals("-loadTaggerModels")) { - this.loadTaggerModels = true; - ; - - } else if (args[i].equals("-feature_creation")) { - this.featureCreation = args[++i].equals("shift") ? 
SHIFT : MULTIPLICATIVE; - } - - return i; - - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("FLAGS ["); - sb.append("train-file: " + trainfile); - sb.append(" | "); - sb.append("test-file: " + testfile); - sb.append(" | "); - sb.append("gold-file: " + goldfile); - sb.append(" | "); - sb.append("output-file: " + outfile); - sb.append(" | "); - sb.append("model-name: " + modelName); - sb.append(" | "); - sb.append("train: " + train); - sb.append(" | "); - sb.append("test: " + test); - sb.append(" | "); - sb.append("eval: " + eval); - sb.append(" | "); - sb.append("training-iterations: " + numIters); - sb.append(" | "); - sb.append("decode-type: " + decodeProjective); - sb.append(" | "); - sb.append("create-forest: " + createForest); - sb.append(" | "); - sb.append("format: " + format); - - sb.append("]\n"); - return sb.toString(); - } - -} \ No newline at end of file diff --git a/dependencyParser/mate-tools/src/is2/util/ParserEvaluator.java b/dependencyParser/mate-tools/src/is2/util/ParserEvaluator.java deleted file mode 100644 index 95e8949..0000000 --- a/dependencyParser/mate-tools/src/is2/util/ParserEvaluator.java +++ /dev/null @@ -1,100 +0,0 @@ -package is2.util; - -import is2.data.SentenceData09; -import is2.io.CONLLReader09; - -public class ParserEvaluator { - - public static final String PUNCT = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; - - public static class Results { - - public int total; - public int corr; - public float las; - public float ula; - - } - - public static Results evaluate(String act_file, String pred_file) throws Exception { - - CONLLReader09 goldReader = new CONLLReader09(act_file, -1); - CONLLReader09 predictedReader = new CONLLReader09(pred_file, -1); - - int total = 0, corr = 0, corrL = 0; - int numsent = 0, corrsent = 0, corrsentL = 0; - SentenceData09 goldInstance = goldReader.getNext(); - SentenceData09 predInstance = predictedReader.getNext(); - - while (goldInstance != null) { - - int 
instanceLength = goldInstance.length(); - - if (instanceLength != predInstance.length()) - System.out.println("Lengths do not match on sentence " + numsent); - - int[] goldHeads = goldInstance.heads; - String[] goldLabels = goldInstance.labels; - int[] predHeads = predInstance.pheads; - String[] predLabels = predInstance.plabels; - - boolean whole = true; - boolean wholeL = true; - - // NOTE: the first item is the root info added during - // nextInstance(), so we skip it. - - int punc = 0; - for (int i = 1; i < instanceLength; i++) { - if (predHeads[i] == goldHeads[i]) { - corr++; - - if (goldLabels[i].equals(predLabels[i])) - corrL++; - else { - // System.out.println(numsent+" error gold - // "+goldLabels[i]+" "+predLabels[i]+" head - // "+goldHeads[i]+" child "+i); - wholeL = false; - } - } else { - // System.out.println(numsent+"error gold "+goldLabels[i]+" - // "+predLabels[i]+" head "+goldHeads[i]+" child "+i); - whole = false; - wholeL = false; - } - } - total += ((instanceLength - 1) - punc); // Subtract one to not score - // fake root token - - if (whole) - corrsent++; - if (wholeL) - corrsentL++; - numsent++; - - goldInstance = goldReader.getNext(); - predInstance = predictedReader.getNext(); - } - - Results r = new Results(); - - r.total = total; - r.corr = corr; - r.las = (float) Math.round(((double) corrL / total) * 100000) / 1000; - r.ula = (float) Math.round(((double) corr / total) * 100000) / 1000; - System.out.print("Total: " + total + " \tCorrect: " + corr + " "); - System.out.println("LAS: " + (double) Math.round(((double) corrL / total) * 100000) / 1000 + " \tTotal: " - + (double) Math.round(((double) corrsentL / numsent) * 100000) / 1000 + " \tULA: " - + (double) Math.round(((double) corr / total) * 100000) / 1000 + " \tTotal: " - + (double) Math.round(((double) corrsent / numsent) * 100000) / 1000); - - return r; - } - - public static float round(double v) { - - return Math.round(v * 10000F) / 10000F; - } - -} diff --git 
a/dependencyParser/mate-tools/src/is2/util/Split.java b/dependencyParser/mate-tools/src/is2/util/Split.java deleted file mode 100755 index ea1151b..0000000 --- a/dependencyParser/mate-tools/src/is2/util/Split.java +++ /dev/null @@ -1,89 +0,0 @@ -package is2.util; - -import java.io.BufferedReader; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.Reader; -import java.nio.channels.Channels; -import java.nio.channels.FileChannel; -import java.nio.charset.Charset; -import java.nio.charset.CharsetDecoder; -import java.util.StringTokenizer; - -public class Split { - - /** - * Splits a tokenized sentences into one word per line format: - * - * Input > I am an text . > Sentence two ... - * - * Output: I _ _ _ ... am _ _ _ ... ... - * - * @param args - * @throws IOException - */ - public static void main(String args[]) throws IOException { - - if (args.length != 1) { - System.out.println("Please provide a file name."); - System.exit(0); - } - - String filename = args[0]; - // Charset charset = Charset.forName("UTF-8"); - - FileInputStream in = new FileInputStream(filename); - FileChannel channel = in.getChannel(); - CharsetDecoder decoder = Charset.defaultCharset().newDecoder();// charset.newDecoder(); - Reader infile = Channels.newReader(channel, decoder, 16 * 1024); - BufferedReader bInfile = new BufferedReader(infile); - - // DataOutputStream dos = new DataOutputStream(new - // BufferedOutputStream(new FileOutputStream(options.modelName))); - - String s; - while ((s = bInfile.readLine()) != null) { - - // do the first tokens contain a colon? - int colon = 0; - for (int k = 0; k < 12; k++) { - if (s.length() <= k) - break; - if (s.charAt(k) == ':') { - - colon++; - break; - } - if (s.charAt(k) == ' ') - break; - } - - String prefix = colon > 0 ? 
s.substring(0, s.indexOf(":")) + "_" : ""; - - if (colon > 0) { - s = s.substring(s.indexOf(":") + 1); - } - - StringTokenizer t = new StringTokenizer(s); - int i = 1; - boolean found = false; - while (t.hasMoreTokens()) { - found = true; - String tk = t.nextToken(); - if (tk.contains("=")) - continue; - System.out.print(prefix + i + "\t"); - System.out.print(tk); - System.out.println("\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_"); - i++; - } - if (found) - System.out.println(); - - } - bInfile.close(); - in.close(); - - } - -} diff --git a/dependencyParser/mate-tools/src/is2/util/Split2.java b/dependencyParser/mate-tools/src/is2/util/Split2.java deleted file mode 100644 index 4ed4004..0000000 --- a/dependencyParser/mate-tools/src/is2/util/Split2.java +++ /dev/null @@ -1,56 +0,0 @@ -package is2.util; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.util.StringTokenizer; - -public class Split2 { - - /** - * Splits a tokenized sentences into one word per line format: - * - * Input > I am an text . > Sentence two ... - * - * Output: I _ _ _ ... am _ _ _ ... ... 
- * - * @param args - * @throws IOException - */ - public static void main(String args[]) throws IOException { - - if (args.length < 1) { - System.out.println("Please provide a file name."); - System.exit(0); - } - - BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(args[0]), "UTF-8"), 32768); - BufferedWriter write = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[1]), "ISO-8859-1")); - - String s; - int cnt = 0; - while ((s = reader.readLine()) != null) { - StringTokenizer t = new StringTokenizer(s); - while (t.hasMoreTokens()) { - String tk = t.nextToken(); - for (int c : tk.toCharArray()) { - if (c < 0 && c >= 255) - System.out.println("contain sign " + c + " " + cnt); - } - write.write(tk); - write.newLine(); - cnt++; - } - write.newLine(); - } - reader.close(); - write.flush(); - write.close(); - - } - -} diff --git a/dependencyParser/mate-tools/src/is2/util/Split3.java b/dependencyParser/mate-tools/src/is2/util/Split3.java deleted file mode 100644 index 2cf7cf2..0000000 --- a/dependencyParser/mate-tools/src/is2/util/Split3.java +++ /dev/null @@ -1,51 +0,0 @@ -package is2.util; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.util.StringTokenizer; - -public class Split3 { - - /** - * Splits a tokenized sentences into one word per line format: - * - * Input > I am an text . > Sentence two ... - * - * Output: I _ _ _ ... am _ _ _ ... ... 
- * - * @param args - * @throws IOException - */ - public static void main(String args[]) throws IOException { - - if (args.length < 1) { - System.out.println("Please provide a file name."); - System.exit(0); - } - - BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(args[0]), "UTF-8"), 32768); - BufferedWriter write = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args[1]), "UTF-8"), - 32768); - - String s; - while ((s = reader.readLine()) != null) { - StringTokenizer t = new StringTokenizer(s); - while (t.hasMoreTokens()) { - String tk = t.nextToken(); - write.write(tk); - write.newLine(); - } - write.newLine(); - } - reader.close(); - write.flush(); - write.close(); - - } - -}