package gate.creole.vu.util; import java.io.*; /** * Created by IntelliJ IDEA. * User: marta * Date: Jan 17, 2005 * Time: 12:23:49 PM * Utility to transform minipar output to a gate output. */ public class MiniparToGate { public static void main (String args[]){ MiniparToGate mtg = new MiniparToGate(); mtg.run(); } public void run(){ //this is the directory containing the files processed by MInipar File directory = new File("D:\\Test\\in\\TestCorpus\\"); File currentFile; File[] allFiles = directory.listFiles(); for (int i = 0; i < allFiles.length; i++){ currentFile = allFiles[i]; System.out.println("Current File " + currentFile.getName()); processFile(currentFile); } } private void processFile(File file){ String line, out, rel, token, pos, head, tag; //this is the directory where the trasnformed files will be written. File newFile = new File("D:\\Test\\out\\TestCorpus\\" + file.getName().replaceAll(".txt", ".xml")); int govStart, sentenceNr = 0; boolean firstSentence = true; try{ BufferedReader fr = new BufferedReader(new FileReader(file)); BufferedWriter fw = new BufferedWriter(new FileWriter(newFile)); fw.write(""); fw.newLine(); fw.write(""); line = fr.readLine(); while (line != null){ if (firstSentence){ firstSentence = false; fw.write(""); sentenceNr = 1; } else { //at new sentence println. if ( line.startsWith(">")){ fw.write(""); fw.newLine(); sentenceNr = sentenceNr + 1; fw.write(""); } } //processes only lines that do not start with tehse symbols if (!(line.startsWith(">") | line.startsWith(")")| line.startsWith("E"))) { //eliminate "gov" govStart = line.indexOf("(gov"); if (govStart > -1) { line = line.substring(0, govStart); } line = line.trim(); line = line.replaceAll("\t", " "); out = "" + token + ""; fw.write(out); } line = fr.readLine(); } fw.write(""); fw.write(""); fr.close(); fw.flush(); fw.close(); } catch (Exception exc){ exc.printStackTrace(); } } }