package is2.parser;

import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.ExecutorService;

import is2.data.Cluster;
import is2.data.DataFES;
import is2.data.F2SF;
import is2.data.Instances;
import is2.data.Parse;
import is2.data.PipeGen;
import is2.data.SentenceData09;
import is2.io.CONLLReader09;
import is2.util.OptionsSuper;

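/**
 * Parser pipe: reads the training corpus, registers its feature parts
 * (words, lemmas, POS tags, morphological features, relation labels),
 * builds the edge filters, and extracts the feature scores for the word
 * pairs of an instance in parallel.
 */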
final public class Pipe extends PipeGen {

	// One feature extractor per thread.
	public Extractor[] extractor;

	// Maps feature parts (words, POS tags, labels, ...) to indexes.
	final public MFO mf = new MFO();

	// Word clusters, if a cluster file was provided.
	public Cluster cl;

	private OptionsSuper options;

	// Accumulated feature extraction time in nanoseconds.
	public static long timeExtract;

	public Pipe(OptionsSuper o) {
		options = o;
	}

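	/**
	 * Reads the corpus twice: the first pass registers the feature parts
	 * (words, lemmas, POS tags, morphological features, relation labels) in
	 * the feature mapping, the second pass builds the edge filters and stores
	 * the instances.
	 *
	 * @param file the CoNLL-09 training file
	 * @param is   the instance store to fill
	 * @throws Exception if reading the corpus fails
	 */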
	public void createInstances(String file, Instances is) throws Exception {

		CONLLReader09 depReader = new CONLLReader09(file);

		mf.register(REL, "<root-type>");

		// Register at least one predicate, since the parsing data might not
		// contain predicates (as in the Japanese corpus) while the development
		// set contains some.

		System.out.print("Registering feature parts of the sentences: ");
		int ic = 0;
		int del = 0;
		while (true) {
			SentenceData09 instance = depReader.getNext();
			if (instance == null)
				break;
			ic++;

			if (ic % 1000 == 0) {
				del = outValue(ic, del);
			}

			String[] labs1 = instance.labels;
			for (String element : labs1)
				mf.register(REL, element);

			String[] w = instance.forms;
			for (String element : w)
				mf.register(WORD, depReader.normalize(element));

			w = instance.plemmas;
			for (String element : w)
				mf.register(WORD, depReader.normalize(element));

			w = instance.ppos;
			for (String element : w)
				mf.register(POS, element);

			w = instance.gpos;
			for (String element : w)
				mf.register(POS, element);

			if (instance.feats != null) {
				String[][] fs = instance.feats;
				for (String[] fe : fs) {
					if (fe == null)
						continue;
					for (String f : fe)
						mf.register(FEAT, f);
				}
			}

			if ((ic - 1) > options.count)
				break;
		}
		del = outValue(ic, del);

		System.out.println();
		Extractor.initFeatures();

		Extractor.maxForm = mf.getFeatureCounter().get(WORD);

		if (options.clusterFile == null)
			cl = new Cluster();
		else
			cl = new Cluster(options.clusterFile, mf, 6);

		mf.calculateBits();
		Extractor.initStat(options.featureCreation);

		System.out.println(mf.toString());

		for (Extractor e : extractor)
			e.init();

		depReader.startReading(file);

		int num1 = 0;

		is.init(ic, new MFO());

		Edges.init(mf.getFeatureCounter().get(POS));

		// Second pass: record, for each observed head/dependent POS pair, the
		// relation labels seen in the training data; these edge filters are
		// later used to prune the label candidates.
		System.out.print("Creating edge filters and reading the corpus: ");
		del = 0;

		while (true) {
			if (num1 % 100 == 0)
				del = outValue(num1, del);

			SentenceData09 instance1 = depReader.getNext(is);

			if (instance1 == null)
				break;

			int last = is.size() - 1;
			short[] pos = is.pposs[last];

			for (int k = 0; k < is.length(last); k++) {
				if (is.heads[last][k] < 0)
					continue;
				Edges.put(pos[is.heads[last][k]], pos[k], is.labels[last][k]);
			}

			if (!options.allFeatures && num1 > options.count)
				break;

			num1++;

		}
		del = outValue(num1, del);
		System.out.println();
		Edges.findDefault();
	}

	/**
	 * Reads the next instance for outputParses.
	 *
	 * @param is        the instance store
	 * @param depReader the reader to take the next sentence from
	 * @return the next sentence, or null at the end of the corpus
	 * @throws Exception if reading fails
	 */
	protected final SentenceData09 nextInstance(Instances is, CONLLReader09 depReader) throws Exception {

		SentenceData09 instance = depReader.getNext(is);
		if (instance == null || instance.forms == null)
			return null;

		return instance;
	}

	// Shared thread pool for the parallel feature extraction.
	public static ExecutorService executorService = java.util.concurrent.Executors.newFixedThreadPool(Parser.THREADS);

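	/**
	 * Computes the feature scores for all word pairs of an instance: one
	 * ParallelExtract task per thread consumes the shared queue of (w1, w2)
	 * pairs and writes the scores into the DataFES container.
	 *
	 * @param params  the feature weights; each task works on its own clone
	 * @param is      the instance store
	 * @param inst    the index of the instance to score
	 * @param d       a container to reuse when large enough, or null to
	 *                allocate a new one
	 * @param cluster the word clusters
	 * @return the filled score container
	 * @throws InterruptedException if the executor is interrupted
	 */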
	public DataFES fillVector(F2SF params, Instances is, int inst, DataFES d, Cluster cluster)
			throws InterruptedException {

		long ts = System.nanoTime();

		// Recreate the pool if a previous run shut it down, with the same
		// fixed size as the initial pool.
		if (executorService.isShutdown())
			executorService = java.util.concurrent.Executors.newFixedThreadPool(Parser.THREADS);

		final int length = is.length(inst);
		if (d == null || d.len < length)
			d = new DataFES(length, mf.getFeatureCounter().get(PipeGen.REL).shortValue());

		ArrayList<ParallelExtract> pe = new ArrayList<ParallelExtract>();
		for (int i = 0; i < Parser.THREADS; i++)
			pe.add(new ParallelExtract(extractor[i], is, inst, d, (F2SF) params.clone(), cluster));

		// Queue every word pair (w1, w2) with w1 < w2 for feature extraction.
		for (int w1 = 0; w1 < length; w1++)
			for (int w2 = w1 + 1; w2 < length; w2++)
				ParallelExtract.add(w1, w2);

		executorService.invokeAll(pe);

		timeExtract += (System.nanoTime() - ts);

		return d;
	}

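	/**
	 * Counts the errors of a predicted parse against the gold annotation.
	 * Each token (the root at index 0 is skipped) contributes 0.5 for a
	 * correct head and another 0.5 for a correct label on top of a correct
	 * head. For example, a 5-token sentence (act.length == 6) with 4 correct
	 * heads, 3 of them also correctly labeled, gives correct = 4 * 0.5 +
	 * 3 * 0.5 = 3.5, hence 5 - 3.5 = 1.5 errors and p.f1 = 3.5 / 5 = 0.7.
	 *
	 * @param is the instance store holding the gold annotation
	 * @param ic the index of the instance
	 * @param p  the predicted parse; its f1 field is set as a side effect
	 * @return the (fractional) number of head and label errors
	 */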
	public double errors(Instances is, int ic, Parse p) {
		short[] act = is.heads[ic];
		double correct = 0;

		// do not count root
		for (int i = 1; i < act.length; i++) {

			if (p.heads[i] == act[i]) {
				correct += 0.5;
				if (p.labels[i] == is.labels[ic][i])
					correct += 0.5;
			}
		}

		double x = ((double) act.length - 1 - correct);

		p.f1 = correct / (act.length - 1);

		return x;
	}
}
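
// Usage sketch (hypothetical driver code, not part of this class;
// options.trainfile, new Instances() and params are assumptions, not
// confirmed by this file): a training run would roughly look like
//
//   OptionsSuper options = ...;                 // parsed command line
//   Pipe pipe = new Pipe(options);
//   pipe.extractor = new Extractor[Parser.THREADS];
//   // ... create and initialize the extractors ...
//   Instances is = new Instances();
//   pipe.createInstances(options.trainfile, is);
//   // score the word pairs of the first instance with weight vector params
//   DataFES d = pipe.fillVector(params, is, 0, null, pipe.cl);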