// Cluster.java 3.64 KB
/**
 * 
 */
package is2.data;



import is2.util.DB;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

/**
 * Lookup table from word-form ids to the ids of their cluster paths.
 *
 * <p>The table is either built from a tab-separated cluster file (first column: the
 * cluster path, second column: the word form) or read back from a binary stream that
 * was produced by {@link #write(DataOutputStream)}.
 *
 * <p>Path ids are stored as {@code short}; ids handed out by the encoder that do not
 * fit into a {@code short} are truncated by the narrowing cast.
 *
 * @author Dr. Bernd Bohnet, 28.10.2010
 */
final public class Cluster {

	public static final String LPATH = "LP";
	public static final String SPATH = "SP";

	/** Column separator of the cluster file. */
	private static final String SEPARATOR = "\t";

	/** Buffer size (in chars) used when reading the cluster file. */
	private static final int READ_BUFFER_SIZE = 32768;

	// [word][p]  p = [0: id of the short path (SPATH) | 1: id of the long path (LPATH)]
	final private short[][] word2path;

	/**
	 * Creates an empty cluster table; every lookup on it returns -1.
	 */
	public Cluster() {
		word2path = new short[0][0];
	}

	/**
	 * Builds the table from a cluster file in two passes: the first pass registers all
	 * paths and word forms with the encoder, the second pass stores the resulting path
	 * ids per word id.
	 *
	 * <p>I/O problems are reported on stderr and do not abort construction; malformed
	 * lines are reported on stdout during the first pass and skipped in both passes.
	 *
	 * @param clusterFile path of the UTF-8 encoded, tab-separated cluster file
	 * @param mf the encoder used to register and look up paths and word forms
	 * @param ls maximal length of the short path; longer paths are cut to this prefix
	 */
	public Cluster(String clusterFile, IEncoderPlus mf, int ls) {

		registerWords(clusterFile, mf, ls);

		word2path = new short[mf.getFeatureCounter().get(PipeGen.WORD)][2];

		insertWords(clusterFile, mf, ls);
	}

	/**
	 * Reads a cluster table previously stored with {@link #write(DataOutputStream)}.
	 *
	 * @param dis the stream to read from
	 * @throws IOException if reading fails or the stored size is negative
	 */
	public Cluster(DataInputStream dis) throws IOException {

		final int size = dis.readInt();
		if (size < 0) {
			// Report corrupt input through the declared exception type instead of
			// letting the array allocation fail with NegativeArraySizeException.
			throw new IOException("Invalid cluster size: " + size);
		}

		word2path = new short[size][2];
		for (int i = 0; i < word2path.length; i++) {
			word2path[i][0] = dis.readShort();
			word2path[i][1] = dis.readShort();
		}
		//DB.println("Read cluster with "+word2path.length+" words ");
	}

	/**
	 * Writes the cluster table: the number of words followed by the two path ids of
	 * every word.
	 *
	 * @param dos the stream to write to
	 * @throws IOException if writing fails
	 */
	public void write(DataOutputStream dos) throws IOException {

		dos.writeInt(word2path.length);
		for (short[] paths : word2path) {
			dos.writeShort(paths[0]);
			dos.writeShort(paths[1]);
		}
	}

	/**
	 * Returns the path id stored in column 0 for a word form.
	 *
	 * <p>NOTE(review): despite the method name, column 0 is filled with the SPATH
	 * (short path) id by the file-based constructor; use {@code getLP(form, 1)} for the
	 * LPATH id. Kept as is because callers may depend on it - confirm the intent.
	 *
	 * @param form the id of a word form
	 * @return the stored path id, or -1 if the word form is unknown or has no path
	 */
	final public int getLP(int form) {
		if (form < 0 || word2path.length <= form || word2path[form].length <= 0) return -1;
		return word2path[form][0] == 0 ? -1 : word2path[form][0];
	}

	/**
	 * Returns one of the path ids stored for a word form.
	 *
	 * @param form the id of a word form
	 * @param l the column to read: 0 for the short path id, 1 for the long path id
	 * @return the stored path id, or -1 if the word form is unknown or has no path
	 */
	final public int getLP(int form, int l) {
		// '<=' (not '<'): form == word2path.length is already out of range.
		if (form < 0 || word2path.length <= form) return -1;
		return word2path[form][l] == 0 ? -1 : word2path[form][l];
	}

	/**
	 * @return the number of word forms covered by the table
	 */
	final public int size() {
		return word2path.length;
	}

	/**
	 * First pass: registers the short path, the long path and the word form of every
	 * line of the cluster file with the encoder.
	 */
	private static void registerWords(String clusterFile, IEncoderPlus mf, int ls) {
		try {
			BufferedReader inputReader = openReader(clusterFile);
			try {
				int cnt = 0;
				String line;
				while ((line = inputReader.readLine()) != null) {

					cnt++;
					try {
						String[] split = line.split(SEPARATOR);
						mf.register(SPATH, shortPath(split[0], ls));
						mf.register(LPATH, split[0]);
						mf.register(PipeGen.WORD, split[1]);
					} catch (Exception e) {
						System.out.println("Error in cluster line "+cnt+" error: "+e.getMessage());
					}
				}
				System.out.println("read number of clusters "+cnt);
			} finally {
				// Release the file handle even if reading fails half-way.
				inputReader.close();
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Second pass: looks up the ids registered in the first pass and stores them in the
	 * table. Lines without a word column and words whose id lies outside the table are
	 * skipped so that one bad line does not abort filling the remaining entries.
	 */
	private void insertWords(String clusterFile, IEncoderPlus mf, int ls) {
		try {
			BufferedReader inputReader = openReader(clusterFile);
			try {
				String line;
				while ((line = inputReader.readLine()) != null) {

					String[] split = line.split(SEPARATOR);
					if (split.length < 2) continue; // malformed, already reported in the first pass

					int wd = mf.getValue(PipeGen.WORD, split[1]);
					if (wd < 0 || wd >= word2path.length) continue;

					word2path[wd][0] = (short) mf.getValue(SPATH, shortPath(split[0], ls));
					word2path[wd][1] = (short) mf.getValue(LPATH, split[0]);
				}
			} finally {
				inputReader.close();
			}

			int fill = 0;
			for (int l = 0; l < word2path.length; l++) {
				if (word2path[l][0] != 0) fill++;
			}
			System.out.println("filled "+fill+" of "+word2path.length);

		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/** Cuts a path to its first {@code ls} characters if it is at least that long. */
	private static String shortPath(String path, int ls) {
		return path.length() < ls ? path : path.substring(0, ls);
	}

	/** Opens the cluster file as a buffered UTF-8 reader. */
	private static BufferedReader openReader(String clusterFile) throws IOException {
		return new BufferedReader(new InputStreamReader(new FileInputStream(clusterFile), "UTF-8"), READ_BUFFER_SIZE);
	}
}