// CONLLWriter09.java (7.51 KB)
package is2.io;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.StringTokenizer;

import is2.data.SentenceData09;
import is2.util.DB;

/**
 * Writes sentences as tab-separated token lines (CoNLL-style) or, when the
 * output format equals {@code IOGenerals.F_ONE_LINE}, as a single line of
 * space-separated predicted lemmas per sentence.
 *
 * <p>
 * In the column format each token line consists of: id, form, lemma, plemma,
 * gpos, ppos, ofeats, pfeats, head, phead, label, plabel, fillp, followed by
 * a predicate column and one argument column per predicate. Missing values
 * are written as {@link #DASH}. Every sentence is terminated by an empty
 * line.
 *
 * <p>
 * The class also offers a small command line tool, see
 * {@link #main(String[])}.
 */
public class CONLLWriter09 extends IOGenerals {

	/**
	 * Selected output format. {@link #write(SentenceData09, boolean)} only
	 * distinguishes {@code IOGenerals.F_ONE_LINE} from every other value.
	 */
	int format = 0;

	/** Placeholder written for a column that has no value. */
	public static final String DASH = "_";

	/**
	 * Values for the {@code root} parameter of
	 * {@link #write(SentenceData09, boolean)}: {@code NO_ROOT} asks for the
	 * artificial root token to be dropped, {@code ROOT} keeps it.
	 */
	public static final boolean NO_ROOT = true, ROOT = false;

	/** Destination of all output; stays {@code null} for the no-arg constructor. */
	protected BufferedWriter writer;

	/** Creates a writer without a destination; {@link #writer} is left unset. */
	public CONLLWriter09() {
	}

	/**
	 * Command line helper for checking tab-separated files. The mode is chosen
	 * by the number of arguments:
	 * <ul>
	 * <li>2 arguments {@code <in> <out>}: copies {@code in} to {@code out}
	 * with every line trimmed and reports whether anything was trimmed.</li>
	 * <li>3 arguments {@code <ignored> <file1> <file2>}: compares the number
	 * of tab-separated fields of both files line by line.</li>
	 * <li>any other non-zero number of arguments {@code <dir> ...}: reports
	 * the files directly inside {@code dir} that contain a line ending with a
	 * tab.</li>
	 * </ul>
	 *
	 * @param args
	 *            see above; with no arguments a usage message is printed
	 * @throws IOException
	 *             if a file cannot be read or written
	 */
	public static void main(String args[]) throws IOException {

		if (args.length == 0) {
			// The original fell through to args[0] and died with an
			// ArrayIndexOutOfBoundsException.
			System.err.println("usage: <in> <out> | <ignored> <file1> <file2> | <dir>");
			return;
		}

		if (args.length == 2) {
			trimLines(new File(args[0]), new File(args[1]));
		} else if (args.length == 3) {
			// args[0] is not used in this mode.
			compareFieldCounts(new File(args[1]), new File(args[2]));
		} else {
			reportTrailingTabs(args[0]);
		}
	}

	/**
	 * Copies {@code source} to {@code target} with leading and trailing
	 * whitespace removed from every line. Prints a note if any line was
	 * changed and whether any line ended with a tab.
	 */
	private static void trimLines(File source, File target) throws IOException {

		boolean found = false;
		boolean tab = false;

		BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(source), "UTF-8"),
				32768);
		try {
			BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(target), "UTF8"));
			try {
				String l;
				while ((l = reader.readLine()) != null) {
					String x = l.trim();
					// Test the untrimmed line: trim() strips trailing tabs,
					// so the trimmed text can never end with one.
					if (l.endsWith("\t"))
						tab = true;
					if (!l.equals(x))
						found = true;
					out.write(x);
					out.newLine();
				}
				out.flush();
			} finally {
				out.close();
			}
		} finally {
			reader.close();
		}

		if (found)
			DB.println("found diff. found tab? " + tab);
	}

	/**
	 * Compares two files line by line and stops at the first line whose number
	 * of tab-separated fields differs. Also reports when the files do not have
	 * the same number of lines.
	 */
	private static void compareFieldCounts(File f1, File f2) throws IOException {

		BufferedReader ir1 = new BufferedReader(new InputStreamReader(new FileInputStream(f1), "UTF-8"), 32768);
		try {
			BufferedReader ir2 = new BufferedReader(new InputStreamReader(new FileInputStream(f2), "UTF-8"), 32768);
			try {
				int line = 0, alltabs1 = 0, alltabs2 = 0;
				while (true) {
					String l1 = ir1.readLine();
					String l2 = ir2.readLine();

					if ((l1 == null) != (l2 == null))
						DB.println("files do not end at the same line ");
					// Stop as soon as either file is exhausted; tokenizing a
					// null line would throw a NullPointerException.
					if (l1 == null || l2 == null)
						break;

					int tabs1 = new StringTokenizer(l1, "\t").countTokens();
					int tabs2 = new StringTokenizer(l2, "\t").countTokens();
					alltabs1 += tabs1;
					alltabs2 += tabs2;
					line++;

					if (tabs1 != tabs2) {
						DB.println("number of tabs different in line " + line + " file1-tabs " + tabs1 + " file2-tabs "
								+ tabs2);
						// Returning ends main normally (exit status 0) and
						// lets the finally blocks close both readers.
						return;
					}
				}
				DB.println("checked lines " + line + " with tabs in file 1 " + alltabs1 + " in file2 " + alltabs2);
			} finally {
				ir2.close();
			}
		} finally {
			ir1.close();
		}
	}

	/**
	 * Scans every regular file directly inside {@code dirName} and reports
	 * those that contain a line ending with a tab character.
	 */
	private static void reportTrailingTabs(String dirName) throws IOException {

		String[] names = new File(dirName).list();
		if (names == null) {
			// File.list() returns null when the path is not a directory or
			// cannot be read.
			System.err.println("not a readable directory: " + dirName);
			return;
		}

		for (String fx : names) {
			File entry = new File(dirName + File.separatorChar + fx);
			if (!entry.isFile())
				continue; // sub-directories cannot be opened as a stream

			BufferedReader ir = new BufferedReader(new InputStreamReader(new FileInputStream(entry), "UTF-8"),
					32768);
			try {
				System.out.println("check file " + fx);
				String l;
				while ((l = ir.readLine()) != null) {
					if (l.endsWith("\t")) {
						DB.println("found tab in file " + fx);
						break;
					}
				}
			} finally {
				ir.close();
			}
		}
	}

	/**
	 * Opens {@code file} for writing in UTF-8.
	 *
	 * <p>
	 * NOTE(review): a failure to open the file is only printed; the exception
	 * is not propagated and {@link #writer} stays {@code null}, so the first
	 * later write fails with a NullPointerException. Kept as is because
	 * callers may rely on the constructor not throwing.
	 *
	 * @param file
	 *            path of the output file
	 */
	public CONLLWriter09(String file) {

		try {
			writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "UTF8"));
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Wraps {@code writer} in a {@link BufferedWriter} and writes to it.
	 *
	 * @param writer
	 *            destination of the output
	 */
	public CONLLWriter09(Writer writer) {
		this.writer = new BufferedWriter(writer);
	}

	/**
	 * Opens {@code outfile} for writing, exactly like
	 * {@link #CONLLWriter09(String)}.
	 *
	 * @param outfile
	 *            path of the output file
	 * @param formatTask
	 *            currently ignored; use {@link #setOutputFormat(int)} to select
	 *            the output format
	 */
	public CONLLWriter09(String outfile, int formatTask) {
		this(outfile);
	}

	/**
	 * Writes {@code inst} with {@link #NO_ROOT}, i.e. a leading root token is
	 * dropped.
	 *
	 * @param inst
	 *            the sentence to write
	 * @throws IOException
	 *             if writing fails
	 */
	public void write(SentenceData09 inst) throws IOException {
		write(inst, NO_ROOT);
	}

	/**
	 * Writes one sentence in the selected output format.
	 *
	 * <p>
	 * If {@code root} is true and the first token is a root token (its form
	 * or lemma starts with {@code "<root"}), that token is skipped and
	 * generated ids equal the token index; otherwise all tokens are written
	 * and generated ids are index + 1. Ids stored in {@code inst.id} are
	 * written unchanged.
	 *
	 * @param inst
	 *            the sentence to write
	 * @param root
	 *            true: remove root node
	 * @throws IOException
	 *             if writing fails
	 */
	public void write(SentenceData09 inst, boolean root) throws IOException {

		boolean skipRoot = root && inst.length() > 0 && (inst.forms[0].startsWith("<root")
				|| (inst.lemmas != null && inst.lemmas[0] != null && inst.lemmas[0].startsWith("<root")));
		int first = skipRoot ? 1 : 0; // index of the first token written
		int mod = skipRoot ? 0 : 1; // offset added to the index for generated ids

		if (format == IOGenerals.F_ONE_LINE) {
			for (int i = first; i < inst.length(); i++) {
				if (i > first)
					writer.write(" ");
				writer.write(inst.plemmas[i]);
			}
			writer.newLine();
			return;
		}

		for (int i = first; i < inst.length(); i++) {

			String id = valueAt(inst.id, i);
			writeColumn(id != null ? id : Integer.toString(i + mod)); // id

			writer.write(inst.forms[i]); // form (mandatory, no placeholder)
			writer.write('\t');

			writeColumn(valueAt(inst.lemmas, i)); // lemma
			writeColumn(valueAt(inst.plemmas, i)); // plemma
			writeColumn(valueAt(inst.gpos, i)); // gpos
			writeColumn(valueAt(inst.ppos, i)); // ppos
			writeColumn(valueAt(inst.ofeats, i)); // ofeats
			writeColumn(valueAt(inst.pfeats, i)); // pfeats
			writeColumn(Integer.toString(inst.heads[i])); // head
			writeColumn(inst.pheads != null ? Integer.toString(inst.pheads[i]) : null); // phead
			writeColumn(valueAt(inst.labels, i)); // rel
			writeColumn(valueAt(inst.plabels, i)); // prel

			// fillp carries no trailing tab: each semantic column below
			// writes its own leading tab.
			String fillp = valueAt(inst.fillp, i);
			writer.write(fillp != null ? fillp : DASH);

			writeSemanticColumns(inst, i);
			writer.newLine();
		}
		writer.newLine(); // empty line terminates the sentence
		writer.flush();
	}

	/** Returns {@code column[i]}, or {@code null} if the whole column is absent. */
	private static String valueAt(String[] column, int i) {
		return column == null ? null : column[i];
	}

	/** Writes {@code value}, or {@link #DASH} if it is null, followed by a tab. */
	private void writeColumn(String value) throws IOException {
		writer.write(value != null ? value : DASH);
		writer.write('\t');
	}

	/**
	 * Writes the predicate column and the argument columns of token {@code i},
	 * each preceded by a tab.
	 */
	private void writeSemanticColumns(SentenceData09 inst, int i) throws IOException {

		if (inst.sem == null) {
			writer.write('\t');
			writer.write(DASH);
			return;
		}

		// the predicate(s) located at this token
		boolean foundPred = false;
		for (int p = 0; p < inst.sem.length; p++) {
			if (inst.semposition[p] == i) {
				foundPred = true;
				writer.write('\t');
				writer.write(inst.sem[p]);
			}
		}
		if (!foundPred) {
			writer.write('\t');
			writer.write(DASH);
		}

		// one column per predicate: the label of the first argument of that
		// predicate positioned at this token, or a dash
		for (int p = 0; p < inst.sem.length; p++) {

			boolean found = false;
			if (inst.arg != null && inst.arg.length > p && inst.arg[p] != null) {
				for (int a = 0; a < inst.arg[p].length; a++) {
					if (i == inst.argposition[p][a]) {
						writer.write('\t');
						writer.write(inst.arg[p][a]);
						found = true;
						break;
					}
				}
			}
			if (!found) {
				writer.write('\t');
				writer.write(DASH);
			}
		}
	}

	/**
	 * Flushes and closes the underlying writer.
	 *
	 * @throws IOException
	 *             if flushing or closing fails
	 */
	public void finishWriting() throws IOException {
		writer.flush();
		writer.close();
	}

	/**
	 * Sets the output format such as CoNLL or one line for the lemmata of the
	 * sentence (see F_xxxx constants).
	 * 
	 * @param formatTask
	 *            the format constant to use for subsequent writes
	 */
	public void setOutputFormat(int formatTask) {
		format = formatTask;
	}

}