// ExtractParagraphs.java 1.72 KB
package is2.util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.util.StringTokenizer;

/**
 * Command-line tool that copies the text found between paragraph tags in
 * every regular file of a directory into one UTF-8 encoded output file.
 *
 * <p>A line starting with {@code <P>} or {@code <p>} opens a paragraph and a
 * line starting with {@code </P>} or {@code </p>} closes it. The tag lines
 * themselves are never copied. Every closing tag line emits one line break,
 * so each paragraph ends up on a single output line.
 */
public class ExtractParagraphs {

	/** Character encoding used for both the input files and the output file. */
	private static final String ENCODING = "UTF-8";

	/** Buffer size, in characters, of the buffered reader and writer. */
	private static final int BUFFER_SIZE = 32768;

	/**
	 * Extracts the paragraph text of all files in a directory.
	 *
	 * @param args {@code args[0]} is the input directory (with or without a
	 *             trailing separator), {@code args[1]} is the output file
	 * @throws IOException if the input directory cannot be listed or any file
	 *                     cannot be read or written
	 */
	public static void main(String args[]) throws IOException {

		// Both arguments are required: args[1] is used as the output file.
		if (args.length < 2) {
			System.err.println("Usage: ExtractParagraphs <input-directory> <output-file>");
			System.exit(1);
		}

		File dir = new File(args[0]);
		// File.list() returns null when the path is not a directory or cannot be read.
		String[] fileNames = dir.list();
		if (fileNames == null) {
			throw new IOException("Not a readable directory: " + args[0]);
		}

		int cnt = 0;
		BufferedWriter write = new BufferedWriter(
				new OutputStreamWriter(new FileOutputStream(args[1]), ENCODING), BUFFER_SIZE);
		try {
			for (String fileName : fileNames) {
				// Resolve against the directory so a missing trailing separator does not matter.
				File input = new File(dir, fileName);
				if (!input.isFile()) {
					continue; // sub-directories cannot be opened as a stream
				}
				cnt += extractFile(input, write);
			}
			write.flush();
		} finally {
			// Release the output file even if reading or writing failed.
			write.close();
		}

		System.out.println("Extract "+cnt+" lines ");
	}

	/**
	 * Copies the paragraph lines of one file to the writer.
	 *
	 * @param input file to read, decoded as UTF-8
	 * @param write destination for the extracted text; not closed by this method
	 * @return number of text lines found inside paragraphs
	 * @throws IOException if reading or writing fails
	 */
	private static int extractFile(File input, BufferedWriter write) throws IOException {
		BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream(input), ENCODING), BUFFER_SIZE);
		int lines = 0;
		try {
			boolean inParagraph = false;
			String s;
			while ((s = reader.readLine()) != null) {

				if (s.startsWith("<P>") || s.startsWith("<p>")) {
					inParagraph = true; // paragraph start
					continue;
				}

				if (s.startsWith("</P>") || s.startsWith("</p>")) {
					inParagraph = false; // paragraph end
					write.newLine();
					continue;
				}

				if (inParagraph) {
					// NOTE(review): the ". " separators are dropped and consecutive lines are
					// joined without a space, exactly as before - confirm this output format
					// is what downstream consumers expect.
					for (String p : s.split("\\. ")) {
						write.write(p);
					}
					lines++;
				}
			}
		} finally {
			reader.close();
		}
		return lines;
	}

}