From 1ed8f98cfc84530725600672e69a2762c69c97f3 Mon Sep 17 00:00:00 2001 From: bniton <bartek.niton@gmail.com> Date: Mon, 4 Jun 2018 11:54:51 +0200 Subject: [PATCH] Removed unused depparse layer. --- src/main/java/pl/waw/ipipan/zil/core/md/Main.java | 22 +++++----------------- src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java | 26 +++----------------------- src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java | 77 ++++------------------------------------------------------------------------- 3 files changed, 12 insertions(+), 113 deletions(-) diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/Main.java b/src/main/java/pl/waw/ipipan/zil/core/md/Main.java index 32dea8d..d41fff9 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/Main.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/Main.java @@ -21,11 +21,9 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; -import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.io.PrintWriter; import java.util.ArrayList; import java.util.EnumMap; import java.util.HashMap; @@ -152,14 +150,6 @@ public class Main { File inputDir = new File(args[0]); File outputDir = new File(args[1]); - File defsOutputFile = new File(args[1], "definitions.csv"); - PrintWriter defsWriter = null; - try { - defsWriter = new PrintWriter(defsOutputFile); - } catch (FileNotFoundException e1) { - // TODO Auto-generated catch block - e1.printStackTrace(); - } if (!inputDir.isDirectory()) { logger.error(inputDir + " is not a directory!"); @@ -188,15 +178,13 @@ public class Main { try { File targetDir = createTargetTextDir(inputDir, outputDir, teiDir); TEICorpusText teiText = TeiLoader.readTeiText(teiDir); - annotateTeiText(teiText, teiDir, defsWriter); + annotateTeiText(teiText, teiDir); TeiSaver.saveTeiText(teiText, targetDir, GZIP_OUTPUT); } catch (IOException e) { logger.error("Error processing text in dir:" + teiDir + " Error details: " + e.getLocalizedMessage(), e); errors++; } } - - defsWriter.close(); logger.info(all + " texts processed succesfully."); if (errors > 0) @@ -231,9 +219,9 @@ public class Main { * @param thriftText text to annotate with mentions * @throws MultiserviceException when an error occures */ - public static void annotateThriftText(TText thriftText, PrintWriter defsWriter) throws MultiserviceException { + public static void annotateThriftText(TText thriftText) throws MultiserviceException { Text responseText = ThriftLoader.loadTextFromThrift(thriftText); - Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter); + Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence); ThriftSaver.updateThriftText(responseText, thriftText); } @@ -244,9 +232,9 @@ public class Main { * @param teiText text to annotate with mentions * @throws TEIException when an error occurs */ - public static void annotateTeiText(TEICorpusText teiText, File textDir, PrintWriter defsWriter) throws TEIException { + public static void annotateTeiText(TEICorpusText teiText, File textDir) throws TEIException { Text responseText = TeiLoader.loadTextFromTei(teiText, textDir); - Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter); + Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence); TeiSaver.updateTeiText(responseText, teiText); } diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java index 1c4197c..5fb6a06 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java @@ -29,44 +29,24 @@ public class Detector { HeadDetector headModel, ZeroSubjectDetector zeroSubjectModel, NominalMentionDetector nominalMentionModel, - Map<ValenceDicts,Map<String,ArrayList<String>>> valence, - PrintWriter defsWriter) { + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { text.clearMentions(); logger.debug("Detecting mentions in text " + text.getId()); for (Paragraph p : text) for (Sentence s : p) - detectMentionsInSentence(s, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter); + detectMentionsInSentence(s, headModel, zeroSubjectModel, nominalMentionModel, valence); } private static void detectMentionsInSentence(Sentence sentence, HeadDetector headModel, ZeroSubjectDetector zeroSubjectModel, NominalMentionDetector nominalMentionModel, - Map<ValenceDicts,Map<String,ArrayList<String>>> valence, - PrintWriter defsWriter) { - // adding mentions -// addMentionsByTokenCtag(sentence); -// addMentionsBySyntacticWordsCtag(sentence); -// addMentionsByNamedEntities(sentence); -// addMentionsByGroups(sentence, valence); -// //addMentionsByDeppParse(sentence); -// addSpeakerMentionsInSpoken(sentence); - + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { // zero subject detection zeroSubjectModel.addZeroSubjectMentions(sentence); List<Token> heads = headModel.detectHeads(sentence); nominalMentionModel.addNominalMentions(sentence, valence, heads); - - // removing mentions - // removeTo(sentence); to nic nie daje, jeszcze ponizsze spradzic -// Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence)); -// Cleaner.cleanUnnecessarySentenceMentions(sentence); -// Cleaner.cleanFrazeos(sentence); - - - // updating mention heads - // updateMentionHeads(sentence); } /** diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java b/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java index df7a1ef..67462e8 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java @@ -8,18 +8,11 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException; import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; import java.io.File; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Paths; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; -import org.json.JSONArray; -import org.json.JSONObject; - public class TeiLoader { @@ -36,68 +29,33 @@ public class TeiLoader { public static Text loadTextFromTei(TEICorpusText teiText, File textDir) { Text text = new Text(teiText.getCorpusHeader().getId()); - String textId = textDir.getName(); - - System.out.println(textId); - - byte[] encoded; - JSONArray jsonParagraphs = null; - try { - //encoded = Files.readAllBytes(Paths.get("/home/bniton/Projekty/koreferencja/COTHEC/clients/python/data/train-JSON-Concraft-old_grammar/"+textId+".json")); - encoded = Files.readAllBytes(Paths.get("/home/bartek/Projekty/COTHEC/clients_data/"+textId+".json")); - String jsonContent = new String(encoded, StandardCharsets.UTF_8); - JSONObject jsonObject = new JSONObject(jsonContent); - - jsonParagraphs = jsonObject.getJSONArray("paragraphs"); - } catch (IOException e) { - // TODO Auto-generated catch block - //e.printStackTrace(); - logger.debug("No depparse layer."); - } - logger.debug("Loading tei text " + text.getId() + "..."); List<TEIParagraph> teiParagraphs = teiText.getParagraphs(); for (int i=0; i < teiParagraphs.size(); i++) { TEIParagraph teiP = teiParagraphs.get(i); - JSONObject jsonP = null; - if (jsonParagraphs != null) { - jsonP = new JSONObject(jsonParagraphs.get(i).toString()); - } - loadParagraph(text, teiP, jsonP); + loadParagraph(text, teiP); } logger.debug("Tei text loaded."); return text; } - private static void loadParagraph(Text text, TEIParagraph teiP, JSONObject jsonP) { + private static void loadParagraph(Text text, TEIParagraph teiP) { Paragraph p = new Paragraph(); text.add(p); List<TEISentence> teiSentences = teiP.getSentences(); - JSONArray jsonSentences = null; - if (jsonP != null) { - jsonSentences = jsonP.getJSONArray("sentences"); - } - for (int i=0; i < teiSentences.size(); i++) { TEISentence teiS = teiSentences.get(i); - JSONObject jsonS = null; - if (jsonP != null) { - if (i < jsonSentences.length()) { - jsonS = new JSONObject(jsonSentences.get(i).toString()); - } - } - - loadSentence(p, teiS, jsonS); + loadSentence(p, teiS); } } - private static void loadSentence(Paragraph p, TEISentence teiS, JSONObject jsonS) { + private static void loadSentence(Paragraph p, TEISentence teiS) { Sentence s = new Sentence(); p.add(s); @@ -114,33 +72,6 @@ public class TeiLoader { loadSyntacticGroup(s, g, teiMorph2Segment); for (TEIMention m : teiS.getAllMentions()) loadMentions(s, m, teiMorph2Segment); - - if (jsonS != null && s.size() == jsonS.getJSONArray("tokens").length()) { - JSONArray relations = jsonS.getJSONArray("dependencyParse"); - for (int i=0; i<relations.length(); i++) { - loadRelation(s, new JSONObject(relations.get(i).toString())); - } - } else { - //System.out.println(s.toStringWithoutMentions()); - } - } - - private static void loadRelation(Sentence s, JSONObject jsonRelation) { - String label = jsonRelation.getString("label"); - if (!label.equals("root") && !jsonRelation.getString("startTokenId").isEmpty() && - jsonRelation.get("startTokenId").getClass() == String.class) { - String[] sourceIdParts = jsonRelation.getString("startTokenId").split("\\."); - String[] targetIdParts = jsonRelation.getString("endTokenId").split("\\."); - - int sourceId = Integer.parseInt(sourceIdParts[sourceIdParts.length-1]); - int targetId = Integer.parseInt(targetIdParts[sourceIdParts.length-1]); - - Token source = s.get(sourceId); - Token target = s.get(targetId); - - source.addRelation(new Relation(label, target)); - target.setReturnRelation(new Relation(label, source)); - } } private static void loadMentions(Sentence s, TEIMention m, -- libgit2 0.22.2