Commit 1ed8f98cfc84530725600672e69a2762c69c97f3
1 parent 3d23a642
Removed unused depparse layer.
Showing 3 changed files with 12 additions and 113 deletions
src/main/java/pl/waw/ipipan/zil/core/md/Main.java
... | ... | @@ -21,11 +21,9 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; |
21 | 21 | import java.io.BufferedReader; |
22 | 22 | import java.io.File; |
23 | 23 | import java.io.FileInputStream; |
24 | -import java.io.FileNotFoundException; | |
25 | 24 | import java.io.IOException; |
26 | 25 | import java.io.InputStream; |
27 | 26 | import java.io.InputStreamReader; |
28 | -import java.io.PrintWriter; | |
29 | 27 | import java.util.ArrayList; |
30 | 28 | import java.util.EnumMap; |
31 | 29 | import java.util.HashMap; |
... | ... | @@ -152,14 +150,6 @@ public class Main { |
152 | 150 | |
153 | 151 | File inputDir = new File(args[0]); |
154 | 152 | File outputDir = new File(args[1]); |
155 | - File defsOutputFile = new File(args[1], "definitions.csv"); | |
156 | - PrintWriter defsWriter = null; | |
157 | - try { | |
158 | - defsWriter = new PrintWriter(defsOutputFile); | |
159 | - } catch (FileNotFoundException e1) { | |
160 | - // TODO Auto-generated catch block | |
161 | - e1.printStackTrace(); | |
162 | - } | |
163 | 153 | |
164 | 154 | if (!inputDir.isDirectory()) { |
165 | 155 | logger.error(inputDir + " is not a directory!"); |
... | ... | @@ -188,15 +178,13 @@ public class Main { |
188 | 178 | try { |
189 | 179 | File targetDir = createTargetTextDir(inputDir, outputDir, teiDir); |
190 | 180 | TEICorpusText teiText = TeiLoader.readTeiText(teiDir); |
191 | - annotateTeiText(teiText, teiDir, defsWriter); | |
181 | + annotateTeiText(teiText, teiDir); | |
192 | 182 | TeiSaver.saveTeiText(teiText, targetDir, GZIP_OUTPUT); |
193 | 183 | } catch (IOException e) { |
194 | 184 | logger.error("Error processing text in dir:" + teiDir + " Error details: " + e.getLocalizedMessage(), e); |
195 | 185 | errors++; |
196 | 186 | } |
197 | 187 | } |
198 | - | |
199 | - defsWriter.close(); | |
200 | 188 | |
201 | 189 | logger.info(all + " texts processed succesfully."); |
202 | 190 | if (errors > 0) |
... | ... | @@ -231,9 +219,9 @@ public class Main { |
231 | 219 | * @param thriftText text to annotate with mentions |
232 | 220 | * @throws MultiserviceException when an error occures |
233 | 221 | */ |
234 | - public static void annotateThriftText(TText thriftText, PrintWriter defsWriter) throws MultiserviceException { | |
222 | + public static void annotateThriftText(TText thriftText) throws MultiserviceException { | |
235 | 223 | Text responseText = ThriftLoader.loadTextFromThrift(thriftText); |
236 | - Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter); | |
224 | + Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence); | |
237 | 225 | ThriftSaver.updateThriftText(responseText, thriftText); |
238 | 226 | } |
239 | 227 | |
... | ... | @@ -244,9 +232,9 @@ public class Main { |
244 | 232 | * @param teiText text to annotate with mentions |
245 | 233 | * @throws TEIException when an error occurs |
246 | 234 | */ |
247 | - public static void annotateTeiText(TEICorpusText teiText, File textDir, PrintWriter defsWriter) throws TEIException { | |
235 | + public static void annotateTeiText(TEICorpusText teiText, File textDir) throws TEIException { | |
248 | 236 | Text responseText = TeiLoader.loadTextFromTei(teiText, textDir); |
249 | - Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter); | |
237 | + Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence); | |
250 | 238 | TeiSaver.updateTeiText(responseText, teiText); |
251 | 239 | } |
252 | 240 | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
... | ... | @@ -29,44 +29,24 @@ public class Detector { |
29 | 29 | HeadDetector headModel, |
30 | 30 | ZeroSubjectDetector zeroSubjectModel, |
31 | 31 | NominalMentionDetector nominalMentionModel, |
32 | - Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | |
33 | - PrintWriter defsWriter) { | |
32 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { | |
34 | 33 | text.clearMentions(); |
35 | 34 | logger.debug("Detecting mentions in text " + text.getId()); |
36 | 35 | for (Paragraph p : text) |
37 | 36 | for (Sentence s : p) |
38 | - detectMentionsInSentence(s, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter); | |
37 | + detectMentionsInSentence(s, headModel, zeroSubjectModel, nominalMentionModel, valence); | |
39 | 38 | } |
40 | 39 | |
41 | 40 | private static void detectMentionsInSentence(Sentence sentence, |
42 | 41 | HeadDetector headModel, |
43 | 42 | ZeroSubjectDetector zeroSubjectModel, |
44 | 43 | NominalMentionDetector nominalMentionModel, |
45 | - Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | |
46 | - PrintWriter defsWriter) { | |
47 | - // adding mentions | |
48 | -// addMentionsByTokenCtag(sentence); | |
49 | -// addMentionsBySyntacticWordsCtag(sentence); | |
50 | -// addMentionsByNamedEntities(sentence); | |
51 | -// addMentionsByGroups(sentence, valence); | |
52 | -// //addMentionsByDeppParse(sentence); | |
53 | -// addSpeakerMentionsInSpoken(sentence); | |
54 | - | |
44 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { | |
55 | 45 | // zero subject detection |
56 | 46 | zeroSubjectModel.addZeroSubjectMentions(sentence); |
57 | 47 | |
58 | 48 | List<Token> heads = headModel.detectHeads(sentence); |
59 | 49 | nominalMentionModel.addNominalMentions(sentence, valence, heads); |
60 | - | |
61 | - // removing mentions | |
62 | - // removeTo(sentence); to nic nie daje, jeszcze ponizsze spradzic | |
63 | -// Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence)); | |
64 | -// Cleaner.cleanUnnecessarySentenceMentions(sentence); | |
65 | -// Cleaner.cleanFrazeos(sentence); | |
66 | - | |
67 | - | |
68 | - // updating mention heads | |
69 | - // updateMentionHeads(sentence); | |
70 | 50 | } |
71 | 51 | |
72 | 52 | /** |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java
... | ... | @@ -8,18 +8,11 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException; |
8 | 8 | import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; |
9 | 9 | |
10 | 10 | import java.io.File; |
11 | -import java.io.IOException; | |
12 | -import java.nio.charset.StandardCharsets; | |
13 | -import java.nio.file.Files; | |
14 | -import java.nio.file.Paths; | |
15 | 11 | import java.util.ArrayList; |
16 | 12 | import java.util.HashMap; |
17 | 13 | import java.util.List; |
18 | 14 | import java.util.Map; |
19 | 15 | |
20 | -import org.json.JSONArray; | |
21 | -import org.json.JSONObject; | |
22 | - | |
23 | 16 | |
24 | 17 | public class TeiLoader { |
25 | 18 | |
... | ... | @@ -36,68 +29,33 @@ public class TeiLoader { |
36 | 29 | public static Text loadTextFromTei(TEICorpusText teiText, File textDir) { |
37 | 30 | Text text = new Text(teiText.getCorpusHeader().getId()); |
38 | 31 | |
39 | - String textId = textDir.getName(); | |
40 | - | |
41 | - System.out.println(textId); | |
42 | - | |
43 | - byte[] encoded; | |
44 | - JSONArray jsonParagraphs = null; | |
45 | - try { | |
46 | - //encoded = Files.readAllBytes(Paths.get("/home/bniton/Projekty/koreferencja/COTHEC/clients/python/data/train-JSON-Concraft-old_grammar/"+textId+".json")); | |
47 | - encoded = Files.readAllBytes(Paths.get("/home/bartek/Projekty/COTHEC/clients_data/"+textId+".json")); | |
48 | - String jsonContent = new String(encoded, StandardCharsets.UTF_8); | |
49 | - JSONObject jsonObject = new JSONObject(jsonContent); | |
50 | - | |
51 | - jsonParagraphs = jsonObject.getJSONArray("paragraphs"); | |
52 | - } catch (IOException e) { | |
53 | - // TODO Auto-generated catch block | |
54 | - //e.printStackTrace(); | |
55 | - logger.debug("No depparse layer."); | |
56 | - } | |
57 | - | |
58 | 32 | logger.debug("Loading tei text " + text.getId() + "..."); |
59 | 33 | |
60 | 34 | List<TEIParagraph> teiParagraphs = teiText.getParagraphs(); |
61 | 35 | |
62 | 36 | for (int i=0; i < teiParagraphs.size(); i++) { |
63 | 37 | TEIParagraph teiP = teiParagraphs.get(i); |
64 | - JSONObject jsonP = null; | |
65 | - if (jsonParagraphs != null) { | |
66 | - jsonP = new JSONObject(jsonParagraphs.get(i).toString()); | |
67 | - } | |
68 | - loadParagraph(text, teiP, jsonP); | |
38 | + loadParagraph(text, teiP); | |
69 | 39 | } |
70 | 40 | logger.debug("Tei text loaded."); |
71 | 41 | |
72 | 42 | return text; |
73 | 43 | } |
74 | 44 | |
75 | - private static void loadParagraph(Text text, TEIParagraph teiP, JSONObject jsonP) { | |
45 | + private static void loadParagraph(Text text, TEIParagraph teiP) { | |
76 | 46 | Paragraph p = new Paragraph(); |
77 | 47 | text.add(p); |
78 | 48 | |
79 | 49 | List<TEISentence> teiSentences = teiP.getSentences(); |
80 | 50 | |
81 | - JSONArray jsonSentences = null; | |
82 | - if (jsonP != null) { | |
83 | - jsonSentences = jsonP.getJSONArray("sentences"); | |
84 | - } | |
85 | - | |
86 | 51 | for (int i=0; i < teiSentences.size(); i++) { |
87 | 52 | TEISentence teiS = teiSentences.get(i); |
88 | 53 | |
89 | - JSONObject jsonS = null; | |
90 | - if (jsonP != null) { | |
91 | - if (i < jsonSentences.length()) { | |
92 | - jsonS = new JSONObject(jsonSentences.get(i).toString()); | |
93 | - } | |
94 | - } | |
95 | - | |
96 | - loadSentence(p, teiS, jsonS); | |
54 | + loadSentence(p, teiS); | |
97 | 55 | } |
98 | 56 | } |
99 | 57 | |
100 | - private static void loadSentence(Paragraph p, TEISentence teiS, JSONObject jsonS) { | |
58 | + private static void loadSentence(Paragraph p, TEISentence teiS) { | |
101 | 59 | Sentence s = new Sentence(); |
102 | 60 | p.add(s); |
103 | 61 | |
... | ... | @@ -114,33 +72,6 @@ public class TeiLoader { |
114 | 72 | loadSyntacticGroup(s, g, teiMorph2Segment); |
115 | 73 | for (TEIMention m : teiS.getAllMentions()) |
116 | 74 | loadMentions(s, m, teiMorph2Segment); |
117 | - | |
118 | - if (jsonS != null && s.size() == jsonS.getJSONArray("tokens").length()) { | |
119 | - JSONArray relations = jsonS.getJSONArray("dependencyParse"); | |
120 | - for (int i=0; i<relations.length(); i++) { | |
121 | - loadRelation(s, new JSONObject(relations.get(i).toString())); | |
122 | - } | |
123 | - } else { | |
124 | - //System.out.println(s.toStringWithoutMentions()); | |
125 | - } | |
126 | - } | |
127 | - | |
128 | - private static void loadRelation(Sentence s, JSONObject jsonRelation) { | |
129 | - String label = jsonRelation.getString("label"); | |
130 | - if (!label.equals("root") && !jsonRelation.getString("startTokenId").isEmpty() && | |
131 | - jsonRelation.get("startTokenId").getClass() == String.class) { | |
132 | - String[] sourceIdParts = jsonRelation.getString("startTokenId").split("\\."); | |
133 | - String[] targetIdParts = jsonRelation.getString("endTokenId").split("\\."); | |
134 | - | |
135 | - int sourceId = Integer.parseInt(sourceIdParts[sourceIdParts.length-1]); | |
136 | - int targetId = Integer.parseInt(targetIdParts[sourceIdParts.length-1]); | |
137 | - | |
138 | - Token source = s.get(sourceId); | |
139 | - Token target = s.get(targetId); | |
140 | - | |
141 | - source.addRelation(new Relation(label, target)); | |
142 | - target.setReturnRelation(new Relation(label, source)); | |
143 | - } | |
144 | 75 | } |
145 | 76 | |
146 | 77 | private static void loadMentions(Sentence s, TEIMention m, |
... | ... |