// Cluster.java 3.78 KB
/**
 *
 */
package is2.data;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

import is2.util.DB;

/**
 * Lookup table from word-form ids to the ids of the cluster paths of the
 * word. For every word id two values are kept: the id of the shortened path
 * (feature type {@link #SPATH}) and the id of the full path (feature type
 * {@link #LPATH}). The table can be built from a tab-separated cluster file
 * or read from / written to a binary stream.
 *
 * @author Dr. Bernd Bohnet, 28.10.2010
 */
final public class Cluster {

	public static final String LPATH = "LP";
	public static final String SPATH = "SP";

	/** Separator between the cluster path and the word in a cluster file line. */
	private static final String SEPARATOR = "\t";

	/** Buffer size in characters used when reading a cluster file. */
	private static final int BUFFER_SIZE = 32768;

	// [word][p] p = [0:short-path | 1:long-path]
	final private short[][] word2path;

	/**
	 * Creates an empty cluster table; every lookup returns -1.
	 */
	public Cluster() {
		word2path = new short[0][0];
	}

	/**
	 * Builds the table from a cluster file. Each line is expected to hold a
	 * cluster path and a word separated by a tab. The file is read twice:
	 * the first pass registers paths and words with the encoder, the second
	 * pass stores the path ids for each word id.
	 * <p>
	 * Read errors are reported on the console and not propagated. Lines
	 * that cannot be processed are reported during the first pass and
	 * skipped. Path ids are narrowed to {@code short} when stored.
	 *
	 * @param clusterFile
	 *            name of the UTF-8 encoded cluster file
	 * @param mf
	 *            the encoder used to register and look up paths and words
	 * @param ls
	 *            maximal length of the short path; longer paths are cut to
	 *            their first {@code ls} characters
	 */
	public Cluster(String clusterFile, IEncoderPlus mf, int ls) {

		// first pass: register paths and words
		try {
			BufferedReader inputReader = openReader(clusterFile);
			try {
				int cnt = 0;
				String line;
				while ((line = inputReader.readLine()) != null) {

					cnt++;
					try {
						String[] split = line.split(SEPARATOR);
						mf.register(SPATH, shortPath(split[0], ls));
						mf.register(LPATH, split[0]);
						mf.register(PipeGen.WORD, split[1]);
					} catch (Exception e) {
						System.out.println("Error in cluster line " + cnt + " error: " + e.getMessage());
					}
				}
				System.out.println("read number of clusters " + cnt);
			} finally {
				// close the file even when reading fails half way
				inputReader.close();
			}

		} catch (Exception e) {
			e.printStackTrace();
		}

		word2path = new short[mf.getFeatureCounter().get(PipeGen.WORD)][2];

		// second pass: insert the path ids of each word
		try {
			BufferedReader inputReader = openReader(clusterFile);
			try {
				String line;
				while ((line = inputReader.readLine()) != null) {

					String[] split = line.split(SEPARATOR);
					if (split.length < 2) {
						// malformed line, already reported by the first pass;
						// skip it instead of giving up on all following lines
						continue;
					}
					int wd = mf.getValue(PipeGen.WORD, split[1]);
					if (wd < 0 || wd >= word2path.length) {
						// no usable id for this word, nothing to store
						continue;
					}
					word2path[wd][0] = (short) mf.getValue(SPATH, shortPath(split[0], ls));
					word2path[wd][1] = (short) mf.getValue(LPATH, split[0]);
				}
			} finally {
				inputReader.close();
			}

			// count the words that received a short path id
			int fill = 0;
			for (short[] element : word2path) {
				if (element[0] != 0) {
					fill++;
				}
			}
			System.out.println("filled " + fill + " of " + word2path.length);

		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Read the cluster
	 * 
	 * @param dis
	 *            the stream to read from; expected to hold the number of
	 *            words followed by two shorts per word, as produced by
	 *            {@link #write(DataOutputStream)}
	 * @throws IOException
	 *             if reading from the stream fails
	 */
	public Cluster(DataInputStream dis) throws IOException {

		word2path = new short[dis.readInt()][2];
		for (int i = 0; i < word2path.length; i++) {
			word2path[i][0] = dis.readShort();
			word2path[i][1] = dis.readShort();
		}
		DB.println("Read cluster with " + word2path.length + " words ");
	}

	/**
	 * Write the cluster
	 * 
	 * @param dos
	 *            the stream to write to; receives the number of words
	 *            followed by the two path ids of each word
	 * @throws IOException
	 *             if writing to the stream fails
	 */
	public void write(DataOutputStream dos) throws IOException {

		dos.writeInt(word2path.length);
		for (short[] i : word2path) {
			dos.writeShort(i[0]);
			dos.writeShort(i[1]);
		}

	}

	/**
	 * Get the path id stored in column 0 for a word form.
	 * <p>
	 * NOTE(review): despite the method name, column 0 is the one the
	 * cluster-file constructor fills with the {@link #SPATH} (short path)
	 * id. The behavior is kept unchanged; confirm whether the long path
	 * was intended before changing it.
	 * 
	 * @param form
	 *            the id of a word form
	 * @return the stored path id, or -1 if the id is outside the table or
	 *         no path is stored for the word
	 */
	final public int getLP(int form) {
		return getLP(form, 0);
	}

	/**
	 * Get one of the two path ids stored for a word form.
	 * 
	 * @param form
	 *            the id of a word form
	 * @param l
	 *            the column to read: 0 or 1
	 * @return the stored path id, or -1 if the id is outside the table or
	 *         the stored value is 0
	 * @throws ArrayIndexOutOfBoundsException
	 *             if {@code l} is neither 0 nor 1 and {@code form} is inside
	 *             the table
	 */
	final public int getLP(int form, int l) {
		// form == word2path.length is already outside the table
		if (form < 0 || form >= word2path.length) {
			return -1;
		}
		short path = word2path[form][l];
		return path == 0 ? -1 : path;
	}

	/**
	 * @return the number of word ids covered by the table
	 */
	final public int size() {
		return word2path.length;
	}

	/** Opens the cluster file as a buffered UTF-8 reader. */
	private static BufferedReader openReader(String clusterFile) throws IOException {
		return new BufferedReader(new InputStreamReader(new FileInputStream(clusterFile), "UTF-8"), BUFFER_SIZE);
	}

	/** Returns the path itself if it is shorter than ls, else its first ls characters. */
	private static String shortPath(String path, int ls) {
		return path.length() < ls ? path : path.substring(0, ls);
	}
}