Commit 1ed8f98cfc84530725600672e69a2762c69c97f3
1 parent: 3d23a642
Removed unused depparse layer.
Showing 3 changed files with 12 additions and 113 deletions
src/main/java/pl/waw/ipipan/zil/core/md/Main.java
@@ -21,11 +21,9 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; | @@ -21,11 +21,9 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; | ||
21 | import java.io.BufferedReader; | 21 | import java.io.BufferedReader; |
22 | import java.io.File; | 22 | import java.io.File; |
23 | import java.io.FileInputStream; | 23 | import java.io.FileInputStream; |
24 | -import java.io.FileNotFoundException; | ||
25 | import java.io.IOException; | 24 | import java.io.IOException; |
26 | import java.io.InputStream; | 25 | import java.io.InputStream; |
27 | import java.io.InputStreamReader; | 26 | import java.io.InputStreamReader; |
28 | -import java.io.PrintWriter; | ||
29 | import java.util.ArrayList; | 27 | import java.util.ArrayList; |
30 | import java.util.EnumMap; | 28 | import java.util.EnumMap; |
31 | import java.util.HashMap; | 29 | import java.util.HashMap; |
@@ -152,14 +150,6 @@ public class Main { | @@ -152,14 +150,6 @@ public class Main { | ||
152 | 150 | ||
153 | File inputDir = new File(args[0]); | 151 | File inputDir = new File(args[0]); |
154 | File outputDir = new File(args[1]); | 152 | File outputDir = new File(args[1]); |
155 | - File defsOutputFile = new File(args[1], "definitions.csv"); | ||
156 | - PrintWriter defsWriter = null; | ||
157 | - try { | ||
158 | - defsWriter = new PrintWriter(defsOutputFile); | ||
159 | - } catch (FileNotFoundException e1) { | ||
160 | - // TODO Auto-generated catch block | ||
161 | - e1.printStackTrace(); | ||
162 | - } | ||
163 | 153 | ||
164 | if (!inputDir.isDirectory()) { | 154 | if (!inputDir.isDirectory()) { |
165 | logger.error(inputDir + " is not a directory!"); | 155 | logger.error(inputDir + " is not a directory!"); |
@@ -188,15 +178,13 @@ public class Main { | @@ -188,15 +178,13 @@ public class Main { | ||
188 | try { | 178 | try { |
189 | File targetDir = createTargetTextDir(inputDir, outputDir, teiDir); | 179 | File targetDir = createTargetTextDir(inputDir, outputDir, teiDir); |
190 | TEICorpusText teiText = TeiLoader.readTeiText(teiDir); | 180 | TEICorpusText teiText = TeiLoader.readTeiText(teiDir); |
191 | - annotateTeiText(teiText, teiDir, defsWriter); | 181 | + annotateTeiText(teiText, teiDir); |
192 | TeiSaver.saveTeiText(teiText, targetDir, GZIP_OUTPUT); | 182 | TeiSaver.saveTeiText(teiText, targetDir, GZIP_OUTPUT); |
193 | } catch (IOException e) { | 183 | } catch (IOException e) { |
194 | logger.error("Error processing text in dir:" + teiDir + " Error details: " + e.getLocalizedMessage(), e); | 184 | logger.error("Error processing text in dir:" + teiDir + " Error details: " + e.getLocalizedMessage(), e); |
195 | errors++; | 185 | errors++; |
196 | } | 186 | } |
197 | } | 187 | } |
198 | - | ||
199 | - defsWriter.close(); | ||
200 | 188 | ||
201 | logger.info(all + " texts processed succesfully."); | 189 | logger.info(all + " texts processed succesfully."); |
202 | if (errors > 0) | 190 | if (errors > 0) |
@@ -231,9 +219,9 @@ public class Main { | @@ -231,9 +219,9 @@ public class Main { | ||
231 | * @param thriftText text to annotate with mentions | 219 | * @param thriftText text to annotate with mentions |
232 | * @throws MultiserviceException when an error occures | 220 | * @throws MultiserviceException when an error occures |
233 | */ | 221 | */ |
234 | - public static void annotateThriftText(TText thriftText, PrintWriter defsWriter) throws MultiserviceException { | 222 | + public static void annotateThriftText(TText thriftText) throws MultiserviceException { |
235 | Text responseText = ThriftLoader.loadTextFromThrift(thriftText); | 223 | Text responseText = ThriftLoader.loadTextFromThrift(thriftText); |
236 | - Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter); | 224 | + Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence); |
237 | ThriftSaver.updateThriftText(responseText, thriftText); | 225 | ThriftSaver.updateThriftText(responseText, thriftText); |
238 | } | 226 | } |
239 | 227 | ||
@@ -244,9 +232,9 @@ public class Main { | @@ -244,9 +232,9 @@ public class Main { | ||
244 | * @param teiText text to annotate with mentions | 232 | * @param teiText text to annotate with mentions |
245 | * @throws TEIException when an error occurs | 233 | * @throws TEIException when an error occurs |
246 | */ | 234 | */ |
247 | - public static void annotateTeiText(TEICorpusText teiText, File textDir, PrintWriter defsWriter) throws TEIException { | 235 | + public static void annotateTeiText(TEICorpusText teiText, File textDir) throws TEIException { |
248 | Text responseText = TeiLoader.loadTextFromTei(teiText, textDir); | 236 | Text responseText = TeiLoader.loadTextFromTei(teiText, textDir); |
249 | - Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter); | 237 | + Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence); |
250 | TeiSaver.updateTeiText(responseText, teiText); | 238 | TeiSaver.updateTeiText(responseText, teiText); |
251 | } | 239 | } |
252 | 240 |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
@@ -29,44 +29,24 @@ public class Detector { | @@ -29,44 +29,24 @@ public class Detector { | ||
29 | HeadDetector headModel, | 29 | HeadDetector headModel, |
30 | ZeroSubjectDetector zeroSubjectModel, | 30 | ZeroSubjectDetector zeroSubjectModel, |
31 | NominalMentionDetector nominalMentionModel, | 31 | NominalMentionDetector nominalMentionModel, |
32 | - Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | ||
33 | - PrintWriter defsWriter) { | 32 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { |
34 | text.clearMentions(); | 33 | text.clearMentions(); |
35 | logger.debug("Detecting mentions in text " + text.getId()); | 34 | logger.debug("Detecting mentions in text " + text.getId()); |
36 | for (Paragraph p : text) | 35 | for (Paragraph p : text) |
37 | for (Sentence s : p) | 36 | for (Sentence s : p) |
38 | - detectMentionsInSentence(s, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter); | 37 | + detectMentionsInSentence(s, headModel, zeroSubjectModel, nominalMentionModel, valence); |
39 | } | 38 | } |
40 | 39 | ||
41 | private static void detectMentionsInSentence(Sentence sentence, | 40 | private static void detectMentionsInSentence(Sentence sentence, |
42 | HeadDetector headModel, | 41 | HeadDetector headModel, |
43 | ZeroSubjectDetector zeroSubjectModel, | 42 | ZeroSubjectDetector zeroSubjectModel, |
44 | NominalMentionDetector nominalMentionModel, | 43 | NominalMentionDetector nominalMentionModel, |
45 | - Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | ||
46 | - PrintWriter defsWriter) { | ||
47 | - // adding mentions | ||
48 | -// addMentionsByTokenCtag(sentence); | ||
49 | -// addMentionsBySyntacticWordsCtag(sentence); | ||
50 | -// addMentionsByNamedEntities(sentence); | ||
51 | -// addMentionsByGroups(sentence, valence); | ||
52 | -// //addMentionsByDeppParse(sentence); | ||
53 | -// addSpeakerMentionsInSpoken(sentence); | ||
54 | - | 44 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { |
55 | // zero subject detection | 45 | // zero subject detection |
56 | zeroSubjectModel.addZeroSubjectMentions(sentence); | 46 | zeroSubjectModel.addZeroSubjectMentions(sentence); |
57 | 47 | ||
58 | List<Token> heads = headModel.detectHeads(sentence); | 48 | List<Token> heads = headModel.detectHeads(sentence); |
59 | nominalMentionModel.addNominalMentions(sentence, valence, heads); | 49 | nominalMentionModel.addNominalMentions(sentence, valence, heads); |
60 | - | ||
61 | - // removing mentions | ||
62 | - // removeTo(sentence); to nic nie daje, jeszcze ponizsze spradzic | ||
63 | -// Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence)); | ||
64 | -// Cleaner.cleanUnnecessarySentenceMentions(sentence); | ||
65 | -// Cleaner.cleanFrazeos(sentence); | ||
66 | - | ||
67 | - | ||
68 | - // updating mention heads | ||
69 | - // updateMentionHeads(sentence); | ||
70 | } | 50 | } |
71 | 51 | ||
72 | /** | 52 | /** |
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java
@@ -8,18 +8,11 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException; | @@ -8,18 +8,11 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException; | ||
8 | import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; | 8 | import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; |
9 | 9 | ||
10 | import java.io.File; | 10 | import java.io.File; |
11 | -import java.io.IOException; | ||
12 | -import java.nio.charset.StandardCharsets; | ||
13 | -import java.nio.file.Files; | ||
14 | -import java.nio.file.Paths; | ||
15 | import java.util.ArrayList; | 11 | import java.util.ArrayList; |
16 | import java.util.HashMap; | 12 | import java.util.HashMap; |
17 | import java.util.List; | 13 | import java.util.List; |
18 | import java.util.Map; | 14 | import java.util.Map; |
19 | 15 | ||
20 | -import org.json.JSONArray; | ||
21 | -import org.json.JSONObject; | ||
22 | - | ||
23 | 16 | ||
24 | public class TeiLoader { | 17 | public class TeiLoader { |
25 | 18 | ||
@@ -36,68 +29,33 @@ public class TeiLoader { | @@ -36,68 +29,33 @@ public class TeiLoader { | ||
36 | public static Text loadTextFromTei(TEICorpusText teiText, File textDir) { | 29 | public static Text loadTextFromTei(TEICorpusText teiText, File textDir) { |
37 | Text text = new Text(teiText.getCorpusHeader().getId()); | 30 | Text text = new Text(teiText.getCorpusHeader().getId()); |
38 | 31 | ||
39 | - String textId = textDir.getName(); | ||
40 | - | ||
41 | - System.out.println(textId); | ||
42 | - | ||
43 | - byte[] encoded; | ||
44 | - JSONArray jsonParagraphs = null; | ||
45 | - try { | ||
46 | - //encoded = Files.readAllBytes(Paths.get("/home/bniton/Projekty/koreferencja/COTHEC/clients/python/data/train-JSON-Concraft-old_grammar/"+textId+".json")); | ||
47 | - encoded = Files.readAllBytes(Paths.get("/home/bartek/Projekty/COTHEC/clients_data/"+textId+".json")); | ||
48 | - String jsonContent = new String(encoded, StandardCharsets.UTF_8); | ||
49 | - JSONObject jsonObject = new JSONObject(jsonContent); | ||
50 | - | ||
51 | - jsonParagraphs = jsonObject.getJSONArray("paragraphs"); | ||
52 | - } catch (IOException e) { | ||
53 | - // TODO Auto-generated catch block | ||
54 | - //e.printStackTrace(); | ||
55 | - logger.debug("No depparse layer."); | ||
56 | - } | ||
57 | - | ||
58 | logger.debug("Loading tei text " + text.getId() + "..."); | 32 | logger.debug("Loading tei text " + text.getId() + "..."); |
59 | 33 | ||
60 | List<TEIParagraph> teiParagraphs = teiText.getParagraphs(); | 34 | List<TEIParagraph> teiParagraphs = teiText.getParagraphs(); |
61 | 35 | ||
62 | for (int i=0; i < teiParagraphs.size(); i++) { | 36 | for (int i=0; i < teiParagraphs.size(); i++) { |
63 | TEIParagraph teiP = teiParagraphs.get(i); | 37 | TEIParagraph teiP = teiParagraphs.get(i); |
64 | - JSONObject jsonP = null; | ||
65 | - if (jsonParagraphs != null) { | ||
66 | - jsonP = new JSONObject(jsonParagraphs.get(i).toString()); | ||
67 | - } | ||
68 | - loadParagraph(text, teiP, jsonP); | 38 | + loadParagraph(text, teiP); |
69 | } | 39 | } |
70 | logger.debug("Tei text loaded."); | 40 | logger.debug("Tei text loaded."); |
71 | 41 | ||
72 | return text; | 42 | return text; |
73 | } | 43 | } |
74 | 44 | ||
75 | - private static void loadParagraph(Text text, TEIParagraph teiP, JSONObject jsonP) { | 45 | + private static void loadParagraph(Text text, TEIParagraph teiP) { |
76 | Paragraph p = new Paragraph(); | 46 | Paragraph p = new Paragraph(); |
77 | text.add(p); | 47 | text.add(p); |
78 | 48 | ||
79 | List<TEISentence> teiSentences = teiP.getSentences(); | 49 | List<TEISentence> teiSentences = teiP.getSentences(); |
80 | 50 | ||
81 | - JSONArray jsonSentences = null; | ||
82 | - if (jsonP != null) { | ||
83 | - jsonSentences = jsonP.getJSONArray("sentences"); | ||
84 | - } | ||
85 | - | ||
86 | for (int i=0; i < teiSentences.size(); i++) { | 51 | for (int i=0; i < teiSentences.size(); i++) { |
87 | TEISentence teiS = teiSentences.get(i); | 52 | TEISentence teiS = teiSentences.get(i); |
88 | 53 | ||
89 | - JSONObject jsonS = null; | ||
90 | - if (jsonP != null) { | ||
91 | - if (i < jsonSentences.length()) { | ||
92 | - jsonS = new JSONObject(jsonSentences.get(i).toString()); | ||
93 | - } | ||
94 | - } | ||
95 | - | ||
96 | - loadSentence(p, teiS, jsonS); | 54 | + loadSentence(p, teiS); |
97 | } | 55 | } |
98 | } | 56 | } |
99 | 57 | ||
100 | - private static void loadSentence(Paragraph p, TEISentence teiS, JSONObject jsonS) { | 58 | + private static void loadSentence(Paragraph p, TEISentence teiS) { |
101 | Sentence s = new Sentence(); | 59 | Sentence s = new Sentence(); |
102 | p.add(s); | 60 | p.add(s); |
103 | 61 | ||
@@ -114,33 +72,6 @@ public class TeiLoader { | @@ -114,33 +72,6 @@ public class TeiLoader { | ||
114 | loadSyntacticGroup(s, g, teiMorph2Segment); | 72 | loadSyntacticGroup(s, g, teiMorph2Segment); |
115 | for (TEIMention m : teiS.getAllMentions()) | 73 | for (TEIMention m : teiS.getAllMentions()) |
116 | loadMentions(s, m, teiMorph2Segment); | 74 | loadMentions(s, m, teiMorph2Segment); |
117 | - | ||
118 | - if (jsonS != null && s.size() == jsonS.getJSONArray("tokens").length()) { | ||
119 | - JSONArray relations = jsonS.getJSONArray("dependencyParse"); | ||
120 | - for (int i=0; i<relations.length(); i++) { | ||
121 | - loadRelation(s, new JSONObject(relations.get(i).toString())); | ||
122 | - } | ||
123 | - } else { | ||
124 | - //System.out.println(s.toStringWithoutMentions()); | ||
125 | - } | ||
126 | - } | ||
127 | - | ||
128 | - private static void loadRelation(Sentence s, JSONObject jsonRelation) { | ||
129 | - String label = jsonRelation.getString("label"); | ||
130 | - if (!label.equals("root") && !jsonRelation.getString("startTokenId").isEmpty() && | ||
131 | - jsonRelation.get("startTokenId").getClass() == String.class) { | ||
132 | - String[] sourceIdParts = jsonRelation.getString("startTokenId").split("\\."); | ||
133 | - String[] targetIdParts = jsonRelation.getString("endTokenId").split("\\."); | ||
134 | - | ||
135 | - int sourceId = Integer.parseInt(sourceIdParts[sourceIdParts.length-1]); | ||
136 | - int targetId = Integer.parseInt(targetIdParts[sourceIdParts.length-1]); | ||
137 | - | ||
138 | - Token source = s.get(sourceId); | ||
139 | - Token target = s.get(targetId); | ||
140 | - | ||
141 | - source.addRelation(new Relation(label, target)); | ||
142 | - target.setReturnRelation(new Relation(label, source)); | ||
143 | - } | ||
144 | } | 75 | } |
145 | 76 | ||
146 | private static void loadMentions(Sentence s, TEIMention m, | 77 | private static void loadMentions(Sentence s, TEIMention m, |