Blame view

tools/mate-tools/src/is2/util/Split2.java 1.51 KB
Jan Lupa authored
1
2
3
4
5
6
7
8
9
package is2.util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
Jan Lupa authored
10
11
12
13
14
import java.io.Reader;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
Jan Lupa authored
15
16
17
18
19
20
21
import java.util.StringTokenizer;

public class Split2 {

	/**
	 * Splits a file of whitespace-tokenized sentences (one sentence per line)
	 * into a one-token-per-line file.
	 *
	 * Input (read as UTF-8):
	 * > I am a text .
	 * > Sentence two ...
	 *
	 * Output (written as ISO-8859-1), with an empty line after every input line:
	 * I
	 * am
	 * ...
	 *
	 * Every character that does not fit into ISO-8859-1 (code above 255) is
	 * reported on standard out together with the running index of its token.
	 *
	 * @param args args[0] is the input file name, args[1] the output file name
	 * @throws IOException if one of the files cannot be opened, read or written
	 */
	public static void main(String args[]) throws IOException {
		// Both file names are required: args[1] is used for the output below.
		if (args.length < 2) {
			System.out.println("Please provide an input file name and an output file name.");
			// Exit status is left at 0 so existing callers see the same status as before.
			System.exit(0);
		}

		// try-with-resources closes (and thereby flushes) both streams even when an I/O error occurs.
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream(args[0]), "UTF-8"), 32768);
				BufferedWriter writer = new BufferedWriter(
						new OutputStreamWriter(new FileOutputStream(args[1]), "ISO-8859-1"))) {
			writeOneTokenPerLine(reader, writer);
		}
	}

	/**
	 * Copies every whitespace separated token of the input onto its own output
	 * line and terminates each input line with an additional empty line.
	 *
	 * @param reader source of the tokenized sentences
	 * @param writer destination of the one-token-per-line text
	 * @throws IOException if reading or writing fails
	 */
	private static void writeOneTokenPerLine(BufferedReader reader, BufferedWriter writer) throws IOException {
		int tokenCount = 0; // running index over all tokens of the file
		String line;
		while ((line = reader.readLine()) != null) {
			StringTokenizer tokens = new StringTokenizer(line);
			while (tokens.hasMoreTokens()) {
				String token = tokens.nextToken();
				for (int c : token.toCharArray()) {
					// ISO-8859-1 covers the codes 0..255 only; anything above cannot be encoded.
					if (c > 255) {
						System.out.println("contain sign " + c + " " + tokenCount);
					}
				}
				writer.write(token);
				writer.newLine();
				tokenCount++;
			}
			// An empty line marks the end of the sentence.
			writer.newLine();
		}
	}
}