Commit dfbfe3fdf1b83e7b7fe76d1421ab9d9488227f62
0 parents
Initial commit
Showing
82 changed files
with
3236 additions
and
0 deletions
.gitignore
0 → 100644
pom.xml
0 → 100644
1 | +++ a/pom.xml | |
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>pl.waw.ipipan.zil.core</groupId>
    <artifactId>md</artifactId>
    <version>1.2-SNAPSHOT</version>
    <properties>
        <!-- The Java sources contain non-ASCII string literals (Polish lemmas, an en dash).
             Without an explicit encoding the compiler falls back to the platform charset and
             the build result differs between machines. Assumes the sources are stored as
             UTF-8 - confirm before merging. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <build>
        <plugins>
            <!-- Compile for Java 7 (diamond operator is used in the sources). -->
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                </configuration>
            </plugin>
            <!-- Packages the application and its dependencies into a single executable jar. -->
            <plugin>
                <groupId>org.dstovall</groupId>
                <artifactId>onejar-maven-plugin</artifactId>
                <version>1.4.4</version>
                <executions>
                    <execution>
                        <configuration>
                            <mainClass>pl.waw.ipipan.zil.core.md.Main</mainClass>
                        </configuration>
                        <goals>
                            <goal>one-jar</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
    <dependencies>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
        <dependency>
            <groupId>ipipan.multiservice</groupId>
            <artifactId>MultiserviceUtils</artifactId>
            <version>1.0-SNAPSHOT</version>
        </dependency>
        <dependency>
            <groupId>ipipan</groupId>
            <artifactId>teiapi</artifactId>
            <version>1.0-SNAPSHOT</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
        </dependency>
        <dependency>
            <groupId>nz.ac.waikato.cms.weka</groupId>
            <artifactId>weka-stable</artifactId>
            <version>3.6.10</version>
        </dependency>
    </dependencies>
    <repositories>
        <!-- Hosts the ipipan snapshot artifacts listed above. -->
        <repository>
            <id>zil-maven-repo</id>
            <name>ZIL maven repository</name>
            <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots</url>
        </repository>
    </repositories>
    <pluginRepositories>
        <pluginRepository>
            <id>onejar-maven-plugin.googlecode.com</id>
            <url>http://onejar-maven-plugin.googlecode.com/svn/mavenrepo</url>
        </pluginRepository>
    </pluginRepositories>
</project>
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/Main.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/Main.java | |
1 | +package pl.waw.ipipan.zil.core.md; | |
2 | + | |
3 | +import ipipan.clarin.tei.api.entities.TEICorpusText; | |
4 | +import ipipan.clarin.tei.api.exceptions.TEIException; | |
5 | +import ipipan.clarin.tei.api.io.IOUtils; | |
6 | + | |
7 | +import java.io.File; | |
8 | +import java.io.FileInputStream; | |
9 | +import java.io.IOException; | |
10 | +import java.io.InputStream; | |
11 | + | |
12 | +import org.apache.log4j.Logger; | |
13 | + | |
14 | +import pl.waw.ipipan.multiservice.thrift.types.MultiserviceException; | |
15 | +import pl.waw.ipipan.multiservice.thrift.types.TText; | |
16 | +import pl.waw.ipipan.zil.core.md.detection.Detector; | |
17 | +import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; | |
18 | +import pl.waw.ipipan.zil.core.md.entities.Text; | |
19 | +import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader; | |
20 | +import pl.waw.ipipan.zil.core.md.io.tei.TeiSaver; | |
21 | +import pl.waw.ipipan.zil.core.md.io.thrift.ThriftLoader; | |
22 | +import pl.waw.ipipan.zil.core.md.io.thrift.ThriftSaver; | |
23 | + | |
24 | +/** | |
25 | + * @author Mateusz Kopeć | |
26 | + * | |
27 | + */ | |
28 | +public class Main { | |
29 | + | |
30 | + private final static Logger logger = Logger.getLogger(Main.class); | |
31 | + private final static boolean GZIP_OUTPUT = true; | |
32 | + | |
33 | + private final static String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin"; | |
34 | + | |
35 | + private static ZeroSubjectDetector zeroSubjectModel; | |
36 | + static { | |
37 | + InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL); | |
38 | + zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream); | |
39 | + } | |
40 | + | |
41 | + /** | |
42 | + * Main method for detecting mentions in corpus encoded in Tei format. | |
43 | + * | |
44 | + * @param args | |
45 | + * @throws TEIException | |
46 | + */ | |
47 | + public static void main(String[] args) { | |
48 | + | |
49 | + if (args.length != 2 && args.length != 3) { | |
50 | + logger.error("Wrong usage! should be: " + Main.class.getSimpleName() | |
51 | + + " input_dir result_dir [zero_subject_model]"); | |
52 | + return; | |
53 | + } | |
54 | + | |
55 | + File inputDir = new File(args[0]); | |
56 | + File outputDir = new File(args[1]); | |
57 | + | |
58 | + if (!inputDir.isDirectory()) { | |
59 | + logger.error(inputDir + " is not a directory!"); | |
60 | + return; | |
61 | + } | |
62 | + if (!outputDir.isDirectory()) { | |
63 | + logger.error(outputDir + " is not a directory!"); | |
64 | + return; | |
65 | + } | |
66 | + if (args.length == 3) { | |
67 | + try { | |
68 | + InputStream zeroSubjectDetectionModelStream; | |
69 | + zeroSubjectDetectionModelStream = new FileInputStream(new File(args[2])); | |
70 | + zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream); | |
71 | + if (zeroSubjectModel == null) | |
72 | + throw new IOException(); | |
73 | + } catch (IOException e) { | |
74 | + logger.error("Unable to load model from file: " + args[2] + ": " + e); | |
75 | + return; | |
76 | + } | |
77 | + } | |
78 | + | |
79 | + int all = 0; | |
80 | + int errors = 0; | |
81 | + for (File teiDir : IOUtils.getNKJPDirs(inputDir)) { | |
82 | + all++; | |
83 | + try { | |
84 | + File targetDir = createTargetTextDir(inputDir, outputDir, teiDir); | |
85 | + TEICorpusText teiText = TeiLoader.readTeiText(teiDir); | |
86 | + annotateTeiText(teiText); | |
87 | + TeiSaver.saveTeiText(teiText, targetDir, GZIP_OUTPUT); | |
88 | + } catch (IOException e) { | |
89 | + logger.error("Error processing text in dir:" + teiDir + " Error details: " + e.getLocalizedMessage()); | |
90 | + errors++; | |
91 | + } | |
92 | + } | |
93 | + | |
94 | + logger.info(all + " texts processed succesfully."); | |
95 | + if (errors > 0) | |
96 | + logger.info(errors + " texts not processed."); | |
97 | + logger.info(ZeroSubjectDetector.verbsWithoutSubject + " verbs with zero subject detected."); | |
98 | + logger.info(ZeroSubjectDetector.verbsWithSubject + " verbs with explicit subject detected."); | |
99 | + } | |
100 | + | |
101 | + /** | |
102 | + * Find relative path of text directory in the corpus directory and create | |
103 | + * similar directory structure in the output corpus directory. | |
104 | + * | |
105 | + * @param inputCorpusDir | |
106 | + * @param outputCorpusDir | |
107 | + * @param textDir | |
108 | + * @return | |
109 | + * @throws IOException | |
110 | + */ | |
111 | + private static File createTargetTextDir(File inputCorpusDir, File outputCorpusDir, File textDir) throws IOException { | |
112 | + String relativeDirPath = textDir.toString().substring(inputCorpusDir.toString().length()); | |
113 | + File targetDir = new File(outputCorpusDir, relativeDirPath); | |
114 | + targetDir.mkdirs(); | |
115 | + if (!targetDir.exists() || !targetDir.isDirectory()) | |
116 | + throw new IOException("Failed to create output directory at: " + targetDir); | |
117 | + return targetDir; | |
118 | + } | |
119 | + | |
120 | + /** | |
121 | + * Find mentions in Thrift text and update this Thrift text with mention | |
122 | + * annotation. | |
123 | + * | |
124 | + * @param thriftText | |
125 | + * @throws MultiserviceException | |
126 | + */ | |
127 | + public static void annotateThriftText(TText thriftText) throws MultiserviceException { | |
128 | + Text responseText = ThriftLoader.loadTextFromThrift(thriftText); | |
129 | + Detector.findMentionsInText(responseText, zeroSubjectModel); | |
130 | + ThriftSaver.updateThriftText(responseText, thriftText); | |
131 | + } | |
132 | + | |
133 | + /** | |
134 | + * Find mentions in Tei text and update this Tei text with mention | |
135 | + * annotation. This method does not save this Tei text on disk. | |
136 | + * | |
137 | + * @param teiText | |
138 | + * @param zeroSubjectModel | |
139 | + * @throws TEIException | |
140 | + */ | |
141 | + public static void annotateTeiText(TEICorpusText teiText) throws TEIException { | |
142 | + Text responseText = TeiLoader.loadTextFromTei(teiText); | |
143 | + Detector.findMentionsInText(responseText, zeroSubjectModel); | |
144 | + TeiSaver.updateTeiText(responseText, teiText); | |
145 | + } | |
146 | + | |
147 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java | |
1 | +package pl.waw.ipipan.zil.core.md.detection; | |
2 | + | |
3 | +import java.util.Collection; | |
4 | +import java.util.HashSet; | |
5 | +import java.util.List; | |
6 | +import java.util.Set; | |
7 | + | |
8 | +import pl.waw.ipipan.zil.core.md.entities.Mention; | |
9 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
10 | +import pl.waw.ipipan.zil.core.md.entities.Token; | |
11 | + | |
12 | +public class Cleaner { | |
13 | + public static void cleanUnnecessarySentenceMentions(Sentence sentence) { | |
14 | + List<Mention> mentions = sentence.getMentions(); | |
15 | + Collection<Mention> unnecessaryMentions = new HashSet<Mention>(); | |
16 | + | |
17 | + for (int i = 0; i < mentions.size(); i++) { | |
18 | + Mention m1 = mentions.get(i); | |
19 | + for (int j = i + 1; j < mentions.size(); j++) { | |
20 | + Mention m2 = mentions.get(j); | |
21 | + | |
22 | + Mention lessImportantMention = getLessImportantMention(m1, m2); | |
23 | + Mention moreImportantMention = m1 == lessImportantMention ? m2 | |
24 | + : m1; | |
25 | + | |
26 | + // same mention borders | |
27 | + if (m1.getSegments().equals(m2.getSegments())) { | |
28 | + unnecessaryMentions.add(lessImportantMention); | |
29 | + // System.out.println("Same borders: "+ m1 +", "+ | |
30 | + // m2+": "+getLessImportantMention(m1, m2)+" removed"); | |
31 | + continue; | |
32 | + } | |
33 | + // same mention heads | |
34 | + if (!m1.getHeadSegments().isEmpty() | |
35 | + && !m2.getHeadSegments().isEmpty()) { | |
36 | + if (m1.getHeadSegments().equals(m2.getHeadSegments())) { | |
37 | + | |
38 | + List<Token> segments = moreImportantMention | |
39 | + .getSegments(); | |
40 | + | |
41 | + boolean isConj = false; | |
42 | + for (Token seg : segments) { | |
43 | + if (seg.getChosenInterpretation().getCtag() | |
44 | + .equals("conj")) { | |
45 | + isConj = true; | |
46 | + break; | |
47 | + } | |
48 | + } | |
49 | + | |
50 | + if (!isConj) { | |
51 | + unnecessaryMentions.add(lessImportantMention); | |
52 | + // System.out.println("Same heads: " + m1 + ", " + | |
53 | + // m2 + ": " + lessImportantMention | |
54 | + // + " removed"); | |
55 | + | |
56 | + continue; | |
57 | + } | |
58 | + } | |
59 | + } | |
60 | + | |
61 | + // mention head equals whole other mention | |
62 | + if (m1.getHeadSegments().isEmpty() | |
63 | + && !m2.getHeadSegments().isEmpty()) { | |
64 | + if (m2.getHeadSegments().equals(m1.getSegments())) { | |
65 | + unnecessaryMentions.add(lessImportantMention); | |
66 | + continue; | |
67 | + // System.out.println("head is other mention: " + m1 + | |
68 | + // ", " + m2 + ": " | |
69 | + // + getLessImportantMention(m1, m2) + " removed"); | |
70 | + } | |
71 | + } | |
72 | + | |
73 | + // the same, but other way round | |
74 | + if (m2.getHeadSegments().isEmpty() | |
75 | + && !m1.getHeadSegments().isEmpty()) { | |
76 | + | |
77 | + if (m1.getHeadSegments().equals(m2.getSegments())) { | |
78 | + unnecessaryMentions.add(lessImportantMention); | |
79 | + continue; | |
80 | + // System.out.println("head is other mention: " + m1 + | |
81 | + // ", " + m2 + ": " | |
82 | + // + getLessImportantMention(m1, m2) + " removed"); | |
83 | + } | |
84 | + } | |
85 | + | |
86 | + // nie zawieraja sie w sobie, lecz maja czesc wspolna | |
87 | + boolean intersect = false; | |
88 | + | |
89 | + Set<Token> notInM1 = new HashSet<Token>(m2.getSegments()); | |
90 | + notInM1.removeAll(m1.getSegments()); | |
91 | + if (notInM1.size() < m2.getSegments().size()) | |
92 | + intersect = true; | |
93 | + | |
94 | + Set<Token> notInM2 = new HashSet<Token>(m1.getSegments()); | |
95 | + notInM2.removeAll(m2.getSegments()); | |
96 | + if (notInM2.size() < m1.getSegments().size()) | |
97 | + intersect = true; | |
98 | + | |
99 | + // if (intersect) | |
100 | + // System.out.println(m1+","+m2); | |
101 | + | |
102 | + if (intersect && !notInM1.isEmpty() && !notInM2.isEmpty()) { | |
103 | + unnecessaryMentions.add(lessImportantMention); | |
104 | + continue; | |
105 | + // System.out.println("intersection!" + m1 + ", " + m2 + | |
106 | + // ": " | |
107 | + // + getLessImportantMention(m1, m2) + " removed"); | |
108 | + } | |
109 | + | |
110 | + } | |
111 | + } | |
112 | + | |
113 | + for (Mention m : unnecessaryMentions) | |
114 | + sentence.removeMention(m); | |
115 | + | |
116 | + // heurystyka dla usuwania rzeczy w stylu: [[Ernest][Kwiecien]] | |
117 | + unnecessaryMentions.clear(); | |
118 | + | |
119 | + OUTER: for (Mention m : sentence.getMentions()) { | |
120 | + for (Token seg : m.getSegments()) | |
121 | + if (seg.getOrth().toLowerCase().equals(seg.getOrth())) | |
122 | + continue OUTER; | |
123 | + | |
124 | + //only for children of fully capitalized mentions | |
125 | + Set<Mention> allMentions = new HashSet<Mention>(); | |
126 | + for (Token seg : m.getSegments()) | |
127 | + for (Mention m2 : seg.getMentions()) | |
128 | + if (m.getSegments().containsAll(m2.getSegments())) | |
129 | + allMentions.add(m2); | |
130 | + | |
131 | + allMentions.remove(m); | |
132 | + | |
133 | + unnecessaryMentions.addAll(allMentions); | |
134 | + } | |
135 | + for (Mention m : unnecessaryMentions) | |
136 | + sentence.removeMention(m); | |
137 | + } | |
138 | + | |
139 | + private static Mention getLessImportantMention(Mention m1, Mention m2) { | |
140 | + if (m1.getSegments().size() > m2.getSegments().size()) | |
141 | + return m2; | |
142 | + else | |
143 | + return m1; | |
144 | + } | |
145 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java | |
1 | +package pl.waw.ipipan.zil.core.md.detection; | |
2 | + | |
/**
 * Regular expression alternatives of ctags used by the mention detector
 * (applied with {@link String#matches(String)}).
 */
public class Constants {

    /** Ctags of noun tokens. */
    public static final String MORPHO_NOUN_CTAGS = "subst|depr|ger";

    /** Ctags of verb tokens. */
    public static final String MORPHO_VERB_CTAGS = "fin|bedzie|aglt|impt";

    /** Ctags of personal pronoun tokens. */
    public static final String MORPHO_PRONOUN_CTAGS = "ppron3|ppron12";

    /** Ctags of tokens that become mentions: nouns and pronouns. */
    public static final String MORPHO_CTAGS = MORPHO_NOUN_CTAGS + "|" + MORPHO_PRONOUN_CTAGS;

    /** Ctags of syntactic words that become mentions. */
    public static final String WORDS_CTAGS = "Noun|Ppron.*";
}
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java | |
1 | +package pl.waw.ipipan.zil.core.md.detection; | |
2 | + | |
3 | +import java.util.ArrayList; | |
4 | +import java.util.HashSet; | |
5 | +import java.util.List; | |
6 | +import java.util.Set; | |
7 | + | |
8 | +import org.apache.log4j.Logger; | |
9 | + | |
10 | +import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; | |
11 | +import pl.waw.ipipan.zil.core.md.entities.Mention; | |
12 | +import pl.waw.ipipan.zil.core.md.entities.NamedEntity; | |
13 | +import pl.waw.ipipan.zil.core.md.entities.Paragraph; | |
14 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
15 | +import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup; | |
16 | +import pl.waw.ipipan.zil.core.md.entities.SyntacticWord; | |
17 | +import pl.waw.ipipan.zil.core.md.entities.Text; | |
18 | +import pl.waw.ipipan.zil.core.md.entities.Token; | |
19 | + | |
20 | +public class Detector { | |
21 | + private static Logger logger = Logger.getLogger(Detector.class); | |
22 | + | |
23 | + public static void findMentionsInText(Text text, | |
24 | + ZeroSubjectDetector zeroSubjectModel) { | |
25 | + text.clearMentions(); | |
26 | + logger.debug("Detecting mentions in text " + text.getId()); | |
27 | + for (Paragraph p : text) | |
28 | + for (Sentence s : p) | |
29 | + detectMentionsInSentence(s, zeroSubjectModel); | |
30 | + } | |
31 | + | |
32 | + private static void detectMentionsInSentence(Sentence sentence, | |
33 | + ZeroSubjectDetector zeroSubjectModel) { | |
34 | + // adding mentions | |
35 | + addMentionsByTokenCtag(sentence); | |
36 | + addMentionsBySyntacticWordsCtag(sentence); | |
37 | + addMentionsByNamedEntities(sentence); | |
38 | + addMentionsByGroups(sentence); | |
39 | + addSpeakerMentionsInSpoken(sentence); | |
40 | + | |
41 | + // zero subject detection | |
42 | + zeroSubjectModel.addZeroSubjectMentions(sentence); | |
43 | + | |
44 | + // removing mentions | |
45 | + removeTo(sentence); | |
46 | + Cleaner.cleanUnnecessarySentenceMentions(sentence); | |
47 | + | |
48 | + // updating mention heads | |
49 | + updateMentionHeads(sentence); | |
50 | + } | |
51 | + | |
52 | + /** | |
53 | + * heurystyka ustawiajaca jako glowe pierwszy segment gdy glowy brak | |
54 | + * | |
55 | + * @param sentence | |
56 | + */ | |
57 | + private static void updateMentionHeads(Sentence sentence) { | |
58 | + for (Mention m : sentence.getMentions()) | |
59 | + if (m.getHeadSegments().isEmpty()) | |
60 | + m.addHeadSegment(m.getFirstSegment()); | |
61 | + } | |
62 | + | |
63 | + /** | |
64 | + * heurystyka dla "to" w zdaniu z ""jeśli"/"jeżeli"/"skoro"" | |
65 | + * | |
66 | + * @param sentence | |
67 | + */ | |
68 | + private static void removeTo(Sentence sentence) { | |
69 | + Set<String> orths = new HashSet<String>(); | |
70 | + for (Token morph : sentence) | |
71 | + orths.add(morph.getOrth()); | |
72 | + | |
73 | + if (orths.contains("jeśli") || orths.contains("jeżeli") | |
74 | + || orths.contains("skoro")) { | |
75 | + for (Mention mention : sentence.getMentions()) { | |
76 | + List<Token> mentSegs = mention.getSegments(); | |
77 | + if (mentSegs.size() == 1 | |
78 | + && mentSegs.get(0).getBase().equals("to")) { | |
79 | + sentence.removeMention(mention); | |
80 | + } | |
81 | + } | |
82 | + } | |
83 | + } | |
84 | + | |
85 | + private static void addSpeakerMentionsInSpoken(Sentence sentence) { | |
86 | + // heurystyka dla sp1:, sp2:, MarszałekJAkistam: | |
87 | + if (sentence.size() > 2) { | |
88 | + Token first = sentence.get(0); | |
89 | + Token second = sentence.get(1); | |
90 | + if (second.getOrth().equals(":")) { | |
91 | + sentence.addMention(new Mention(first)); | |
92 | + } | |
93 | + } | |
94 | + } | |
95 | + | |
96 | + /** | |
97 | + * Wyszukuję i oznaczam wszystkie NG* | |
98 | + * | |
99 | + * @param sentence | |
100 | + */ | |
101 | + private static void addMentionsByGroups(Sentence sentence) { | |
102 | + for (SyntacticGroup group : sentence.getGroups()) { | |
103 | + if (group.getType().startsWith("NG")) { | |
104 | + List<Token> segments = group.getTokens(); | |
105 | + List<Token> heads = group.getSemanticHeadTokens(); | |
106 | + | |
107 | + sentence.addMention(new Mention(segments, heads)); | |
108 | + } | |
109 | + } | |
110 | + } | |
111 | + | |
112 | + /** | |
113 | + * Wyszukuję i oznaczam wszystkie NER | |
114 | + * | |
115 | + * @param sentence | |
116 | + */ | |
117 | + private static void addMentionsByNamedEntities(Sentence sentence) { | |
118 | + for (NamedEntity ne : sentence.getNamedEntities()) { | |
119 | + | |
120 | + List<Token> headTokens = new ArrayList<Token>(); | |
121 | + List<Token> tokens = ne.getTokens(); | |
122 | + | |
123 | + boolean containsNoun = false; | |
124 | + for (Token seg : tokens) { | |
125 | + if (seg.getCtag().matches(Constants.MORPHO_NOUN_CTAGS)) { | |
126 | + containsNoun = true; | |
127 | + break; | |
128 | + } | |
129 | + } | |
130 | + if (!containsNoun) | |
131 | + continue; | |
132 | + | |
133 | + sentence.addMention(new Mention(tokens, headTokens)); | |
134 | + } | |
135 | + } | |
136 | + | |
137 | + /** | |
138 | + * @param sentence | |
139 | + */ | |
140 | + private static void addMentionsBySyntacticWordsCtag(Sentence sentence) { | |
141 | + for (SyntacticWord w : sentence.getSyntacticWords()) | |
142 | + if (w.getCtag().matches(Constants.WORDS_CTAGS)) { | |
143 | + List<Token> tokens = w.getTokens(); | |
144 | + if (tokens.size() == 1) { | |
145 | + sentence.addMention(new Mention(tokens.get(0))); | |
146 | + } else { | |
147 | + List<Token> heads = new ArrayList<Token>(); | |
148 | + sentence.addMention(new Mention(tokens, heads)); | |
149 | + } | |
150 | + } | |
151 | + } | |
152 | + | |
153 | + /** | |
154 | + * Wyszukuję wszystkie interesujace czesci mowy jesli jest poziom slow | |
155 | + * skladniowych, to korzystam z niego zamiast morfoskladni | |
156 | + * | |
157 | + * @param sentence | |
158 | + */ | |
159 | + private static void addMentionsByTokenCtag(Sentence sentence) { | |
160 | + for (Token token : sentence) | |
161 | + if (token.getCtag().matches(Constants.MORPHO_CTAGS)) | |
162 | + sentence.addMention(new Mention(token)); | |
163 | + } | |
164 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Constants.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Constants.java | |
1 | +package pl.waw.ipipan.zil.core.md.detection.zero; | |
2 | + | |
3 | +import java.util.Arrays; | |
4 | +import java.util.HashSet; | |
5 | +import java.util.Set; | |
6 | + | |
/** Constants of the zero subject detection package. */
public class Constants {

    /** Ctags treated as verbs. */
    public static final Set<String> VERB_TAGS = new HashSet<>(
            Arrays.asList("fin", "bedzie", "aglt", "praet", "winien"));
}
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/FeatureGeneration.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/FeatureGeneration.java | |
1 | +package pl.waw.ipipan.zil.core.md.detection.zero; | |
2 | + | |
3 | +import ipipan.clarin.tei.api.entities.TEIMention; | |
4 | +import ipipan.clarin.tei.api.entities.TEIMorph; | |
5 | + | |
6 | +import java.util.ArrayList; | |
7 | +import java.util.Arrays; | |
8 | +import java.util.HashMap; | |
9 | +import java.util.HashSet; | |
10 | +import java.util.Iterator; | |
11 | +import java.util.LinkedList; | |
12 | +import java.util.List; | |
13 | +import java.util.Map; | |
14 | +import java.util.Set; | |
15 | + | |
16 | +import pl.waw.ipipan.zil.core.md.entities.Mention; | |
17 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
18 | +import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup; | |
19 | +import pl.waw.ipipan.zil.core.md.entities.SyntacticWord; | |
20 | +import pl.waw.ipipan.zil.core.md.entities.Token; | |
21 | + | |
22 | +public class FeatureGeneration { | |
/**
 * Base forms that end a clause, but only when the clause collected so far
 * already contains a verb.
 */
private static final Set<String> CLAUSE_SPLIT_LEMMAS = new HashSet<>(Arrays.asList("i", "albo", "lub", "oraz",
        "bądź", "ani", "czy", "niż", "tudzież", ",", ";", "-", "–", ":"));

/**
 * Second group of base forms that end a clause under the same condition.
 * "dlatego" is listed twice, which is harmless in a set.
 */
private static final Set<String> CLAUSE_SPLIT_LEMMAS2 = new HashSet<>(Arrays.asList("a", "ale", "lecz", "jednak",
        "jednakże", "zaś", "wszakże", "owszem", "natomiast", "tylko", "dlatego", "jedynie", "przecież",
        "tymczasem", "ponieważ", "więc", "dlatego", "toteż", "zatem"));

/** Base forms that always end a clause. */
private static final Set<String> CLAUSE_SPLIT_LEMMAS_STRICT = new HashSet<>(Arrays.asList("?", "!"));

/** Opening delimiter mapped to its closing counterpart. */
private static final Map<String, String> CLAUSE_SPLIT_LEMMAS_PAIRWISE = new HashMap<>();
static {
    CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("(", ")");
    CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("\"", "\"");
    CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("'", "'");
}

/** Ctags of noun-like tokens (nouns, personal pronouns, gerunds, numerals). */
private static final Set<String> NOUN_TAGS = new HashSet<>(Arrays.asList("subst", "depr", "ppron12", "ppron3",
        "ger", "num", "numcol"));

/** Ctags of personal pronouns. */
private static final Set<String> PRONOUN_TAGS = new HashSet<>(Arrays.asList("ppron12", "ppron3"));

/** Ctags of verbs. */
private static final Set<String> VERB_TAGS = new HashSet<>(Arrays.asList("fin", "bedzie", "aglt", "praet",
        "winien"));

/** Base forms of relative pronouns. */
private static final Set<String> ZAIMKI_WZGLEDNE_LEMMAS = new HashSet<>(Arrays.asList("jaki", "który"));
50 | + | |
51 | + public static void generateFeatures(Map<String, Object> features, Token m, Sentence s, Set<String> quasiVerbs) { | |
52 | + | |
53 | + features.put("verbCtag", m.getChosenInterpretation().getCtag()); | |
54 | + features.put("verbNumber", m.getChosenInterpretation().getNumber()); | |
55 | + features.put("verbGender", m.getChosenInterpretation().getGender()); | |
56 | + features.put("verbPerson", m.getChosenInterpretation().getPerson()); | |
57 | + | |
58 | + features.put("quasi", quasiVerbs.contains(m.getChosenInterpretation().getBase())); | |
59 | + | |
60 | + features.put("nextCtag", getNeighbouringTag(s, m, 1)); | |
61 | + features.put("prevCtag", getNeighbouringTag(s, m, -1)); | |
62 | + | |
63 | + features.put("isPrevPraet", isPrevPraet(m, s)); | |
64 | + features.put("isPrevComma", isPrevComma(m, s)); | |
65 | + features.put("isPrev2Pred", isPrev2Pred(m, s)); | |
66 | + features.put("isNextInf", isNextInf(m, s)); | |
67 | + | |
68 | + List<Token> clause = getClause(s, m); | |
69 | + features.put("sentLength", s.size()); | |
70 | + features.put("clauseLength", clause.size()); | |
71 | + | |
72 | + addFeatures(features, clause, "clause", m); | |
73 | + addFeatures(features, s, "sent", m); | |
74 | + for (int i = 1; i < 6; i++) | |
75 | + addFeatures(features, getWindow(s, m, i, 0), "window_" + i + "_" + 0, m); | |
76 | + for (int i = 1; i < 6; i++) | |
77 | + addFeatures(features, getWindow(s, m, 0, i), "window_" + 0 + "_" + i, m); | |
78 | + for (int i = 1; i < 6; i++) | |
79 | + addFeatures(features, getWindow(s, m, i, i), "window_" + i + "_" + i, m); | |
80 | + } | |
81 | + | |
82 | + private static boolean isNextInf(Token m, Sentence s) { | |
83 | + boolean now = false; | |
84 | + for (Token morph : s) { | |
85 | + if (now) | |
86 | + return morph.getChosenInterpretation().getCtag().equals("inf"); | |
87 | + if (m.equals(morph)) | |
88 | + now = true; | |
89 | + } | |
90 | + return false; | |
91 | + } | |
92 | + | |
93 | + private static boolean isPrev2Pred(Token m, Sentence s) { | |
94 | + Token prev = null; | |
95 | + Token prev2 = null; | |
96 | + for (Token morph : s) { | |
97 | + if (m.equals(morph)) | |
98 | + break; | |
99 | + prev2 = prev; | |
100 | + prev = morph; | |
101 | + } | |
102 | + return (prev != null && prev.getChosenInterpretation().getCtag().equals("pred")) | |
103 | + || (prev2 != null && prev2.getChosenInterpretation().getCtag().equals("pred")); | |
104 | + } | |
105 | + | |
106 | + private static Object isPrevComma(Token m, Sentence s) { | |
107 | + Token prev = null; | |
108 | + for (Token morph : s) { | |
109 | + if (m.equals(morph)) | |
110 | + break; | |
111 | + prev = morph; | |
112 | + } | |
113 | + return prev != null && prev.getChosenInterpretation().getBase().equals(","); | |
114 | + } | |
115 | + | |
116 | + private static String getNeighbouringTag(Sentence s, Token m, int i) { | |
117 | + int idx = s.indexOf(m) + i; | |
118 | + if (idx >= s.size() || idx < 0) | |
119 | + return "None"; | |
120 | + return s.get(idx).getChosenInterpretation().getCtag(); | |
121 | + } | |
122 | + | |
123 | + private static void addFeatures(Map<String, Object> features, List<Token> clause, String prefix, Token m) { | |
124 | + | |
125 | + boolean hasNom = false; // 1 | |
126 | + boolean hasNum = false; // 2 | |
127 | + boolean hasPOG = false; // 3 | |
128 | + | |
129 | + boolean hasNomNum = false; | |
130 | + boolean hasNumPOG = false; | |
131 | + boolean hasNomPOG = false; | |
132 | + boolean hasNomNumPOG = false; | |
133 | + | |
134 | + boolean has2Nom = false; | |
135 | + boolean has2NomPOG = false; | |
136 | + boolean has2POG = false; | |
137 | + | |
138 | + Token prev = null; | |
139 | + for (Token candidate : clause) { | |
140 | + | |
141 | + if (!isNoun(candidate) || isJakJako(prev)) { | |
142 | + prev = candidate; | |
143 | + continue; | |
144 | + } | |
145 | + | |
146 | + // nom, nom2 | |
147 | + if (isNom(candidate)) { | |
148 | + if (hasNom) | |
149 | + has2Nom = true; | |
150 | + hasNom = true; | |
151 | + } | |
152 | + // num | |
153 | + if (agreedNum(candidate, m)) { | |
154 | + hasNum = true; | |
155 | + } | |
156 | + // pog, pog2 | |
157 | + if (agreedGenderOrPerson(candidate, m)) { | |
158 | + if (hasPOG) | |
159 | + has2POG = true; | |
160 | + hasPOG = true; | |
161 | + } | |
162 | + | |
163 | + // nom num, nom num pog | |
164 | + if (isNom(candidate) && agreedNum(candidate, m)) { | |
165 | + if (agreedGenderOrPerson(candidate, m)) | |
166 | + hasNomNumPOG = true; | |
167 | + hasNomNum = true; | |
168 | + } | |
169 | + | |
170 | + // nom pog, num pog | |
171 | + if (agreedGenderOrPerson(candidate, m)) | |
172 | + if (isNom(candidate)) { | |
173 | + if (hasNomPOG) | |
174 | + has2NomPOG = true; | |
175 | + hasNomPOG = true; | |
176 | + } else if (agreedNum(candidate, m)) | |
177 | + hasNumPOG = true; | |
178 | + | |
179 | + prev = candidate; | |
180 | + } | |
181 | + | |
182 | + // features.put("conj_" + prefix, hasConj); | |
183 | + features.put("cand_2_nom_" + prefix, has2Nom); | |
184 | + features.put("cand_2_POG_" + prefix, has2POG); | |
185 | + features.put("cand_2_nom+POG_" + prefix, has2NomPOG); | |
186 | + | |
187 | + features.put("cand_nom_" + prefix, hasNom); | |
188 | + features.put("cand_num_" + prefix, hasNum); | |
189 | + features.put("cand_POG_" + prefix, hasPOG); | |
190 | + | |
191 | + features.put("cand_nom+num_" + prefix, hasNomNum); | |
192 | + features.put("cand_nom+num+POG_" + prefix, hasNomNumPOG); | |
193 | + features.put("cand_nom+POG_" + prefix, hasNomPOG); | |
194 | + features.put("cand_num+POG_" + prefix, hasNumPOG); | |
195 | + } | |
196 | + | |
197 | + private static List<Token> getWindow(Sentence s, Token m, int pre, int post) { | |
198 | + | |
199 | + int idx = s.indexOf(m); | |
200 | + int from = Math.max(0, idx - pre); | |
201 | + int to = Math.min(s.size(), idx + post + 1); | |
202 | + | |
203 | + return new ArrayList<>(s.subList(from, to)); | |
204 | + } | |
205 | + | |
206 | + private static boolean isPrevPraet(Token m, Sentence s) { | |
207 | + Token prev = null; | |
208 | + for (Token morph : s) { | |
209 | + if (m.equals(morph)) | |
210 | + break; | |
211 | + prev = morph; | |
212 | + } | |
213 | + return prev != null && prev.getChosenInterpretation().getCtag().equals("praet"); | |
214 | + } | |
215 | + | |
216 | + /** | |
217 | + * - podział na klauzule: przecinki oraz spójniki bezprzecinkowe: i, albo, | |
218 | + * lub (jak przy streszczeniach: w środku musi być czasownik w formie | |
219 | + * osobowej), | |
220 | + */ | |
221 | + public static List<Token> getClause(Sentence s, Token m2) { | |
222 | + | |
223 | + List<List<Token>> sublists = getClauses(s); | |
224 | + | |
225 | + for (List<Token> sub : sublists) | |
226 | + for (Token m : sub) | |
227 | + if (m.equals(m2)) | |
228 | + return sub; | |
229 | + | |
230 | + return null; | |
231 | + } | |
232 | + | |
233 | + public static List<List<Token>> getClauses(Sentence s) { | |
234 | + | |
235 | + Set<Token> noSplitMorphs = new HashSet<>(); | |
236 | + for (SyntacticGroup g : s.getGroups()) { | |
237 | + for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) { | |
238 | + noSplitMorphs.add(m); | |
239 | + } | |
240 | + } | |
241 | + for (SyntacticWord g : s.getSyntacticWords()) { | |
242 | + for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) { | |
243 | + noSplitMorphs.add(m); | |
244 | + } | |
245 | + } | |
246 | + | |
247 | + LinkedList<List<Token>> sublists = new LinkedList<>(); | |
248 | + List<Token> currentSublist = new ArrayList<>(); | |
249 | + boolean clauseHasVerb = false; | |
250 | + for (Token m : s) { | |
251 | + String base = m.getChosenInterpretation().getBase(); | |
252 | + if (!noSplitMorphs.contains(m) | |
253 | + && (CLAUSE_SPLIT_LEMMAS_STRICT.contains(base) || ((CLAUSE_SPLIT_LEMMAS.contains(base) || CLAUSE_SPLIT_LEMMAS2 | |
254 | + .contains(base)) && clauseHasVerb))) { | |
255 | + sublists.add(currentSublist); | |
256 | + currentSublist = new ArrayList<>(); | |
257 | + clauseHasVerb = false; | |
258 | + } else { | |
259 | + if (isVerb(m)) | |
260 | + clauseHasVerb = true; | |
261 | + } | |
262 | + currentSublist.add(m); | |
263 | + } | |
264 | + if (currentSublist.size() > 0) { | |
265 | + if (clauseHasVerb) | |
266 | + sublists.add(currentSublist); | |
267 | + else | |
268 | + sublists.getLast().addAll(currentSublist); | |
269 | + } | |
270 | + | |
271 | + // merge clause beginning with zaimek wzgl. etc to previous clause | |
272 | + List<Token> prev = null; | |
273 | + Iterator<List<Token>> it = sublists.iterator(); | |
274 | + while (it.hasNext()) { | |
275 | + List<Token> sublist = it.next(); | |
276 | + boolean containsRelPron = false; | |
277 | + int i = 1; | |
278 | + for (Token m : sublist) { | |
279 | + if (i > 2) | |
280 | + break; | |
281 | + if (ZAIMKI_WZGLEDNE_LEMMAS.contains(m.getChosenInterpretation().getBase())) { | |
282 | + containsRelPron = true; | |
283 | + break; | |
284 | + } | |
285 | + i++; | |
286 | + } | |
287 | + if (prev != null && containsRelPron) { | |
288 | + prev.addAll(sublist); | |
289 | + it.remove(); | |
290 | + } else | |
291 | + prev = sublist; | |
292 | + } | |
293 | + | |
294 | + return sublists; | |
295 | + } | |
296 | + | |
297 | + private static boolean agreedNum(Token candidate, Token keyword) { | |
298 | + String keywordNum = keyword.getNumber(); | |
299 | + String wordNum = candidate.getNumber(); | |
300 | + return keywordNum.equals(wordNum); | |
301 | + } | |
302 | + | |
303 | + private static boolean agreedGenderOrPerson(Token candidate, Token keyword) { | |
304 | + if (isPraet(keyword)) { | |
305 | + // praet has number:gender | |
306 | + String keywordGender = keyword.getGender(); | |
307 | + String wordGender = candidate.getGender(); | |
308 | + return keywordGender.equals(wordGender); | |
309 | + } else { | |
310 | + // other verbs have number:person | |
311 | + String keywordPerson = keyword.getPerson(); | |
312 | + String wordPerson = "ter"; // default | |
313 | + if (PRONOUN_TAGS.contains(candidate)) | |
314 | + wordPerson = candidate.getPerson(); | |
315 | + return wordPerson.equals(keywordPerson); | |
316 | + } | |
317 | + } | |
318 | + | |
319 | + private static boolean isJakJako(Token prev) { | |
320 | + String base = prev == null ? null : prev.getBase(); | |
321 | + return prev != null && (base.equals("jak") || base.equals("jako")); | |
322 | + } | |
323 | + | |
324 | + private static boolean isPraet(Token keyword) { | |
325 | + return keyword.getCtag().equals("praet"); | |
326 | + } | |
327 | + | |
328 | + private static boolean isNom(Token candidate) { | |
329 | + return "nom".equals(candidate.getCase()); // dziala dla rzeczownikow | |
330 | + // tylko! | |
331 | + } | |
332 | + | |
333 | + private static boolean isNoun(Token m) { | |
334 | + return NOUN_TAGS.contains(m.getCtag()); | |
335 | + } | |
336 | + | |
337 | + public static boolean isVerb(Token morph) { | |
338 | + return VERB_TAGS.contains(morph.getCtag()); | |
339 | + } | |
340 | + | |
341 | + public static boolean isVerb(Mention m) { | |
342 | + boolean hasOnlyVerbs = true; | |
343 | + for (Token morph : m.getSegments()) | |
344 | + if (!isVerb(morph)) { | |
345 | + hasOnlyVerbs = false; | |
346 | + break; | |
347 | + } | |
348 | + return hasOnlyVerbs; | |
349 | + } | |
350 | + | |
351 | + public static boolean isVerb(TEIMention m) { | |
352 | + boolean hasOnlyVerbs = true; | |
353 | + for (TEIMorph morph : m.getMorphs()) | |
354 | + if (!isVerb(morph)) { | |
355 | + hasOnlyVerbs = false; | |
356 | + break; | |
357 | + } | |
358 | + return hasOnlyVerbs; | |
359 | + } | |
360 | + | |
361 | + private static boolean isVerb(TEIMorph morph) { | |
362 | + return VERB_TAGS.contains(morph.getChosenInterpretation().getCtag()); | |
363 | + } | |
364 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/InstanceCreator.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/InstanceCreator.java | |
1 | +package pl.waw.ipipan.zil.core.md.detection.zero; | |
2 | + | |
3 | +import ipipan.clarin.tei.api.entities.TEICorpusText; | |
4 | +import ipipan.clarin.tei.api.io.IOUtils; | |
5 | +import ipipan.clarin.tei.api.io.TEI_IO; | |
6 | + | |
7 | +import java.io.File; | |
8 | +import java.util.ArrayList; | |
9 | +import java.util.HashSet; | |
10 | +import java.util.List; | |
11 | +import java.util.Map.Entry; | |
12 | +import java.util.Set; | |
13 | +import java.util.TreeMap; | |
14 | +import java.util.TreeSet; | |
15 | + | |
16 | +import org.apache.log4j.Logger; | |
17 | + | |
18 | +import pl.waw.ipipan.zil.core.md.entities.Mention; | |
19 | +import pl.waw.ipipan.zil.core.md.entities.Paragraph; | |
20 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
21 | +import pl.waw.ipipan.zil.core.md.entities.Text; | |
22 | +import pl.waw.ipipan.zil.core.md.entities.Token; | |
23 | +import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader; | |
24 | +import weka.core.Attribute; | |
25 | +import weka.core.FastVector; | |
26 | +import weka.core.Instance; | |
27 | +import weka.core.Instances; | |
28 | + | |
29 | +public class InstanceCreator { | |
30 | + | |
31 | + final private static Logger logger = Logger.getLogger(InstanceCreator.class); | |
32 | + final private static TEI_IO teiIO = TEI_IO.getInstance(); | |
33 | + | |
34 | + public static List<TreeMap<String, Object>> loadExamples(File dataDir, Set<String> quasiVerbs) { | |
35 | + int allTexts = 0; | |
36 | + int exceptions = 0; | |
37 | + int allSentences = 0; | |
38 | + | |
39 | + List<TreeMap<String, Object>> examples = new ArrayList<>(); | |
40 | + for (File textDir : IOUtils.getNKJPDirs(dataDir)) { | |
41 | + try { | |
42 | + allTexts++; | |
43 | + logger.info("Processing text " + textDir); | |
44 | + TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir); | |
45 | + Text text = TeiLoader.loadTextFromTei(ct); | |
46 | + | |
47 | + for (Paragraph p : text) | |
48 | + for (Sentence s : p) { | |
49 | + allSentences++; | |
50 | + loadExamplesFromSentence(quasiVerbs, examples, s); | |
51 | + } | |
52 | + | |
53 | + } catch (Exception e) { | |
54 | + logger.error(e.getLocalizedMessage()); | |
55 | + exceptions++; | |
56 | + } | |
57 | + } | |
58 | + | |
59 | + logger.info(allTexts + " texts found."); | |
60 | + if (exceptions != 0) | |
61 | + logger.error(exceptions + " texts with exceptions."); | |
62 | + logger.info(allSentences + " sentences found."); | |
63 | + | |
64 | + return examples; | |
65 | + } | |
66 | + | |
67 | + public static void loadExamplesFromSentence(Set<String> quasiVerbs, List<TreeMap<String, Object>> examples, | |
68 | + Sentence s) { | |
69 | + | |
70 | + // collect positive examples | |
71 | + Set<Token> positive = new HashSet<>(); | |
72 | + for (Mention m : s.getMentions()) { | |
73 | + if (FeatureGeneration.isVerb(m)) { | |
74 | + positive.addAll(m.getSegments()); | |
75 | + } | |
76 | + } | |
77 | + | |
78 | + for (Token m : s) { | |
79 | + if (!FeatureGeneration.isVerb(m)) | |
80 | + continue; | |
81 | + | |
82 | + TreeMap<String, Object> features = new TreeMap<>(); | |
83 | + if (positive.contains(m)) { | |
84 | + features.put("class", Boolean.valueOf(true)); | |
85 | + } else { | |
86 | + features.put("class", Boolean.valueOf(false)); | |
87 | + } | |
88 | + | |
89 | + FeatureGeneration.generateFeatures(features, m, s, quasiVerbs); | |
90 | + examples.add(features); | |
91 | + } | |
92 | + } | |
93 | + | |
94 | + public static Instances createInstances(List<TreeMap<String, Object>> examples, String classFeatureName) { | |
95 | + | |
96 | + TreeSet<String> booleanAttsOccurred = new TreeSet<>(); | |
97 | + TreeSet<String> doubleAttsOccurred = new TreeSet<>(); | |
98 | + TreeMap<String, Set<String>> att2values = new TreeMap<>(); | |
99 | + for (TreeMap<String, Object> example : examples) { | |
100 | + for (Entry<String, Object> e : example.entrySet()) { | |
101 | + String key = e.getKey(); | |
102 | + Object val = e.getValue(); | |
103 | + if (val instanceof Integer || val instanceof Double) { | |
104 | + doubleAttsOccurred.add(key); | |
105 | + continue; | |
106 | + } | |
107 | + if (val instanceof Boolean) { | |
108 | + booleanAttsOccurred.add(key); | |
109 | + continue; | |
110 | + } | |
111 | + if (!att2values.containsKey(key)) | |
112 | + att2values.put(key, new HashSet<String>()); | |
113 | + att2values.get(key).add(val.toString()); | |
114 | + } | |
115 | + } | |
116 | + | |
117 | + List<Attribute> atts = new ArrayList<>(); | |
118 | + | |
119 | + // double attributes | |
120 | + for (String attName : doubleAttsOccurred) { | |
121 | + Attribute att = new Attribute(attName); | |
122 | + atts.add(att); | |
123 | + } | |
124 | + | |
125 | + // boolean attributes (treated as nominal) | |
126 | + FastVector values = new FastVector(2); | |
127 | + values.addElement("false"); | |
128 | + values.addElement("true"); | |
129 | + for (String attName : booleanAttsOccurred) { | |
130 | + Attribute att = new Attribute(attName, values); | |
131 | + atts.add(att); | |
132 | + } | |
133 | + | |
134 | + // nominal attributes | |
135 | + for (Entry<String, Set<String>> attVals : att2values.entrySet()) { | |
136 | + FastVector vals = new FastVector(attVals.getValue().size()); | |
137 | + for (String val : attVals.getValue()) | |
138 | + vals.addElement(val); | |
139 | + Attribute att = new Attribute(attVals.getKey(), vals); | |
140 | + atts.add(att); | |
141 | + } | |
142 | + | |
143 | + FastVector fvWekaAttributes = new FastVector(atts.size()); | |
144 | + for (Attribute attr : atts) { | |
145 | + fvWekaAttributes.addElement(attr); | |
146 | + } | |
147 | + | |
148 | + Instances data = new Instances("Zero", fvWekaAttributes, 10); | |
149 | + data.setClass(data.attribute(classFeatureName)); | |
150 | + return data; | |
151 | + } | |
152 | + | |
153 | + public static void fillInstances(List<TreeMap<String, Object>> examples, Instances instances) { | |
154 | + for (TreeMap<String, Object> example : examples) { | |
155 | + Instance instance = new Instance(instances.numAttributes()); | |
156 | + | |
157 | + for (Entry<String, Object> e : example.entrySet()) { | |
158 | + Object val = e.getValue(); | |
159 | + String name = e.getKey(); | |
160 | + if (val instanceof Integer) { | |
161 | + instance.setValue(instances.attribute(name), (int) val); | |
162 | + } else if (val instanceof Boolean) { | |
163 | + instance.setValue(instances.attribute(name), ((Boolean) val) ? "true" : "false"); | |
164 | + } else { | |
165 | + int indexOfValue = instances.attribute(name).indexOfValue(val.toString()); | |
166 | + if (indexOfValue == -1) { | |
167 | + logger.debug("Unkown value: " + val.toString() + " of feature: " + name | |
168 | + + ". Marking as missing value."); | |
169 | + instance.setMissing(instances.attribute(name)); | |
170 | + } else | |
171 | + instance.setValue(instances.attribute(name), indexOfValue); | |
172 | + } | |
173 | + } | |
174 | + | |
175 | + instance.setDataset(instances); | |
176 | + instances.add(instance); | |
177 | + } | |
178 | + } | |
179 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Model.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Model.java | |
1 | +package pl.waw.ipipan.zil.core.md.detection.zero; | |
2 | + | |
3 | +import java.io.Serializable; | |
4 | +import java.util.List; | |
5 | +import java.util.Set; | |
6 | +import java.util.TreeMap; | |
7 | + | |
8 | +import org.apache.log4j.Logger; | |
9 | + | |
10 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
11 | +import weka.classifiers.Classifier; | |
12 | +import weka.core.Instance; | |
13 | +import weka.core.Instances; | |
14 | + | |
15 | +public class Model implements Serializable { | |
16 | + | |
17 | + private static final long serialVersionUID = 3351727361273283076L; | |
18 | + private static final Logger logger = Logger.getLogger(Model.class); | |
19 | + | |
20 | + private Classifier classifier; | |
21 | + private Set<String> quasiVerbs; | |
22 | + private Instances instances; | |
23 | + | |
24 | + public Model(Classifier classifier, Instances instances, Set<String> quasiVerbs) { | |
25 | + this.classifier = classifier; | |
26 | + this.instances = instances; | |
27 | + this.quasiVerbs = quasiVerbs; | |
28 | + } | |
29 | + | |
30 | + public boolean isZeroSubject(Instance instance, Sentence sentence) { | |
31 | + try { | |
32 | + double response = this.classifier.classifyInstance(instance); | |
33 | + return response > 0; | |
34 | + } catch (Exception e) { | |
35 | + logger.error("Error classyfing verb in sentence: " + sentence); | |
36 | + return false; | |
37 | + } | |
38 | + } | |
39 | + | |
40 | + public Instances getInstances(List<TreeMap<String, Object>> examples) { | |
41 | + Instances instances = new Instances(this.instances); | |
42 | + InstanceCreator.fillInstances(examples, instances); | |
43 | + return instances; | |
44 | + } | |
45 | + | |
46 | + public Set<String> getQuasiVerbs() { | |
47 | + return quasiVerbs; | |
48 | + } | |
49 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Serializer.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Serializer.java | |
1 | +package pl.waw.ipipan.zil.core.md.detection.zero; | |
2 | + | |
3 | +import java.io.InputStream; | |
4 | + | |
5 | +import weka.core.SerializationHelper; | |
6 | + | |
7 | +public class Serializer { | |
8 | + | |
9 | + public static void saveModel(Model m, String targetModelFilePath) throws Exception { | |
10 | + SerializationHelper.write(targetModelFilePath, m); | |
11 | + } | |
12 | + | |
13 | + public static Model loadModel(String path) throws Exception { | |
14 | + Model m = (Model) SerializationHelper.read(path); | |
15 | + return m; | |
16 | + } | |
17 | + | |
18 | + public static Model loadModelFromStream(InputStream stream) throws Exception { | |
19 | + Model m = (Model) SerializationHelper.read(stream); | |
20 | + return m; | |
21 | + } | |
22 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Trainer.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Trainer.java | |
1 | +package pl.waw.ipipan.zil.core.md.detection.zero; | |
2 | + | |
3 | +import java.io.BufferedReader; | |
4 | +import java.io.File; | |
5 | +import java.io.IOException; | |
6 | +import java.io.InputStream; | |
7 | +import java.io.InputStreamReader; | |
8 | +import java.util.HashSet; | |
9 | +import java.util.List; | |
10 | +import java.util.Random; | |
11 | +import java.util.Set; | |
12 | +import java.util.TreeMap; | |
13 | + | |
14 | +import org.apache.log4j.Logger; | |
15 | + | |
16 | +import weka.classifiers.Evaluation; | |
17 | +import weka.classifiers.rules.JRip; | |
18 | +import weka.classifiers.rules.JRip.RipperRule; | |
19 | +import weka.core.Attribute; | |
20 | +import weka.core.Instance; | |
21 | +import weka.core.Instances; | |
22 | + | |
23 | +public class Trainer { | |
24 | + | |
25 | + final private static Logger logger = Logger.getLogger(Trainer.class); | |
26 | + | |
27 | + private static final boolean DO_CV = false; | |
28 | + private static final String QUASI_LIST_PATH = "/quasi_verbs.txt"; | |
29 | + | |
30 | + public static void main(String[] args) { | |
31 | + | |
32 | + if (args.length != 2) { | |
33 | + logger.error("Wrong number of arguments! Should be: " + Trainer.class.getSimpleName() | |
34 | + + " trainDir targetModelFile"); | |
35 | + return; | |
36 | + } | |
37 | + | |
38 | + File dataDir = new File(args[0]); | |
39 | + String targetModelFilePath = args[1]; | |
40 | + | |
41 | + if (!dataDir.isDirectory()) { | |
42 | + logger.error(dataDir + " is not a directory!"); | |
43 | + return; | |
44 | + } | |
45 | + | |
46 | + Set<String> quasiVerbs = loadQuasiVerbs(); | |
47 | + | |
48 | + List<TreeMap<String, Object>> examples = InstanceCreator.loadExamples(dataDir, quasiVerbs); | |
49 | + Instances instances = InstanceCreator.createInstances(examples, "class"); | |
50 | + InstanceCreator.fillInstances(examples, instances); | |
51 | + | |
52 | + printStats(instances); | |
53 | + | |
54 | + try { | |
55 | + JRip model = new JRip(); | |
56 | + | |
57 | + if (DO_CV) { | |
58 | + logger.info("Crossvalidation..."); | |
59 | + Evaluation eval = new Evaluation(instances); | |
60 | + eval.crossValidateModel(model, instances, 10, new Random(1)); | |
61 | + logger.info(eval.toSummaryString()); | |
62 | + logger.info(eval.toMatrixString()); | |
63 | + logger.info(eval.toClassDetailsString()); | |
64 | + } | |
65 | + | |
66 | + logger.info("Building final classifier..."); | |
67 | + model = new JRip(); | |
68 | + model.buildClassifier(instances); | |
69 | + logger.info(model.getRuleset().size() + " rules generated."); | |
70 | + for (int i = 0; i < model.getRuleset().size(); i++) { | |
71 | + RipperRule v = (RipperRule) model.getRuleset().elementAt(i); | |
72 | + logger.info("\t" + v.toString(instances.classAttribute())); | |
73 | + } | |
74 | + | |
75 | + instances.delete(); | |
76 | + logger.info("Features stats:"); | |
77 | + for (int i = 0; i < instances.numAttributes(); i++) { | |
78 | + Attribute att = instances.attribute(i); | |
79 | + logger.info(i + ".\t" + att.toString()); | |
80 | + } | |
81 | + | |
82 | + logger.info("Saving classifier..."); | |
83 | + Model m = new Model(model, instances, quasiVerbs); | |
84 | + Serializer.saveModel(m, targetModelFilePath); | |
85 | + logger.info("Done."); | |
86 | + | |
87 | + } catch (Exception e) { | |
88 | + logger.error("Error: " + e); | |
89 | + } | |
90 | + } | |
91 | + | |
92 | + private static Set<String> loadQuasiVerbs() { | |
93 | + Set<String> quasiVerbs = new HashSet<>(); | |
94 | + InputStream stream = Trainer.class.getResourceAsStream(QUASI_LIST_PATH); | |
95 | + try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) { | |
96 | + String line = null; | |
97 | + while ((line = br.readLine()) != null) { | |
98 | + quasiVerbs.add(line.trim()); | |
99 | + } | |
100 | + } catch (IOException e) { | |
101 | + logger.error(e.getLocalizedMessage()); | |
102 | + } | |
103 | + return quasiVerbs; | |
104 | + } | |
105 | + | |
106 | + private static void printStats(Instances instances) { | |
107 | + int positive = 0; | |
108 | + int negative = 0; | |
109 | + for (int i = 0; i < instances.numInstances(); i++) { | |
110 | + Instance inst = instances.instance(i); | |
111 | + if (inst.classValue() > 0) | |
112 | + negative++; | |
113 | + else | |
114 | + positive++; | |
115 | + } | |
116 | + logger.info(positive + " positive examples"); | |
117 | + logger.info(negative + " negative examples"); | |
118 | + logger.info((positive + negative) + " examples total"); | |
119 | + logger.info((instances.numAttributes() - 1) + " attributes"); | |
120 | + logger.info(instances.toSummaryString()); | |
121 | + } | |
122 | + | |
123 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/ZeroSubjectDetector.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/ZeroSubjectDetector.java | |
1 | +package pl.waw.ipipan.zil.core.md.detection.zero; | |
2 | + | |
3 | +import java.io.File; | |
4 | +import java.io.InputStream; | |
5 | +import java.util.ArrayList; | |
6 | +import java.util.HashSet; | |
7 | +import java.util.List; | |
8 | +import java.util.Set; | |
9 | +import java.util.TreeMap; | |
10 | + | |
11 | +import org.apache.log4j.Logger; | |
12 | + | |
13 | +import pl.waw.ipipan.zil.core.md.entities.Mention; | |
14 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
15 | +import pl.waw.ipipan.zil.core.md.entities.Token; | |
16 | +import weka.core.Instances; | |
17 | + | |
18 | +public class ZeroSubjectDetector { | |
19 | + final private static Logger logger = Logger.getLogger(ZeroSubjectDetector.class); | |
20 | + | |
21 | + private Model model; | |
22 | + private Set<String> quasiVerbs = new HashSet<>(); | |
23 | + | |
24 | + public static int verbsWithoutSubject = 0; | |
25 | + public static int verbsWithSubject = 0; | |
26 | + | |
27 | + public void addZeroSubjectMentions(Sentence sentence) { | |
28 | + List<TreeMap<String, Object>> examples = new ArrayList<>(); | |
29 | + InstanceCreator.loadExamplesFromSentence(quasiVerbs, examples, sentence); | |
30 | + if (examples.isEmpty()) | |
31 | + return; | |
32 | + | |
33 | + Instances instances = model.getInstances(examples); | |
34 | + | |
35 | + // label instances | |
36 | + List<Boolean> areZeros = new ArrayList<>(); | |
37 | + for (int i = 0; i < instances.numInstances(); i++) { | |
38 | + boolean isZero = model.isZeroSubject(instances.instance(i), sentence); | |
39 | + areZeros.add(isZero); | |
40 | + if (isZero) | |
41 | + verbsWithoutSubject++; | |
42 | + else | |
43 | + verbsWithSubject++; | |
44 | + } | |
45 | + | |
46 | + int i = 0; | |
47 | + for (Token m : sentence) { | |
48 | + if (!FeatureGeneration.isVerb(m)) | |
49 | + continue; | |
50 | + if (areZeros.get(i)) | |
51 | + sentence.addMention(new Mention(m, true)); | |
52 | + i++; | |
53 | + } | |
54 | + } | |
55 | + | |
56 | + public ZeroSubjectDetector(File zeroSubjectDetectionModel) { | |
57 | + try { | |
58 | + this.model = Serializer.loadModel(zeroSubjectDetectionModel.getAbsolutePath()); | |
59 | + this.quasiVerbs = this.model.getQuasiVerbs(); | |
60 | + } catch (Exception e) { | |
61 | + logger.error("Error loading model:" + e); | |
62 | + } | |
63 | + } | |
64 | + | |
65 | + public ZeroSubjectDetector(InputStream zeroSubjectDetectionModelStream) { | |
66 | + try { | |
67 | + this.model = Serializer.loadModelFromStream(zeroSubjectDetectionModelStream); | |
68 | + this.quasiVerbs = this.model.getQuasiVerbs(); | |
69 | + } catch (Exception e) { | |
70 | + logger.error("Error loading model:" + e); | |
71 | + } | |
72 | + } | |
73 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/Interpretation.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Interpretation.java | |
1 | +package pl.waw.ipipan.zil.core.md.entities; | |
2 | + | |
3 | +import pl.waw.ipipan.zil.core.md.detection.zero.Constants; | |
4 | + | |
5 | +public class Interpretation { | |
6 | + private String ctag = "null"; | |
7 | + private String base = "null"; | |
8 | + | |
9 | + private String number = "null"; | |
10 | + private String casee = "null"; | |
11 | + private String gender = "null"; | |
12 | + private String person = "null"; | |
13 | + | |
14 | + public Interpretation(String ctag2, String morph, String base) { | |
15 | + this.ctag = ctag2; | |
16 | + this.base = base; | |
17 | + | |
18 | + String[] spl = morph.split(":"); | |
19 | + if (ctag.equalsIgnoreCase("subst") || ctag.equalsIgnoreCase("depr") || ctag.equalsIgnoreCase("ger")) { | |
20 | + this.number = spl[0]; | |
21 | + this.casee = spl[1]; | |
22 | + this.gender = spl[2]; | |
23 | + } else if (ctag.equalsIgnoreCase("ppron12") || ctag.equalsIgnoreCase("ppron3")) { | |
24 | + this.number = spl[0]; | |
25 | + this.casee = spl[1]; | |
26 | + this.gender = spl[2]; | |
27 | + this.person = spl[3]; | |
28 | + } else if (ctag.equalsIgnoreCase("siebie")) { | |
29 | + this.casee = spl[0]; | |
30 | + } else if (Constants.VERB_TAGS.contains(ctag)) { | |
31 | + this.number = spl[0]; | |
32 | + if (ctag.matches("winien|praet")) | |
33 | + this.gender = spl[1]; | |
34 | + else | |
35 | + this.person = spl[1]; | |
36 | + } | |
37 | + } | |
38 | + | |
39 | + public String getCtag() { | |
40 | + return this.ctag; | |
41 | + } | |
42 | + | |
43 | + public String getNumber() { | |
44 | + return this.number; | |
45 | + } | |
46 | + | |
47 | + public String getGender() { | |
48 | + return this.gender; | |
49 | + } | |
50 | + | |
51 | + public String getCase() { | |
52 | + return this.casee; | |
53 | + } | |
54 | + | |
55 | + public String getBase() { | |
56 | + return this.base; | |
57 | + } | |
58 | + | |
59 | + public String getPerson() { | |
60 | + return this.person; | |
61 | + } | |
62 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java | |
1 | +package pl.waw.ipipan.zil.core.md.entities; | |
2 | + | |
3 | +import java.util.ArrayList; | |
4 | +import java.util.List; | |
5 | + | |
6 | +/** | |
7 | + * @author Mateusz Kopec | |
8 | + * | |
9 | + */ | |
10 | +public class Mention implements Comparable<Mention> { | |
11 | + | |
12 | + private MentionGroup mentionGroup = null; | |
13 | + | |
14 | + private List<Token> segments = new ArrayList<Token>(); | |
15 | + private List<Token> headSegments = new ArrayList<Token>(); | |
16 | + | |
17 | + private boolean isZeroSubject = false; | |
18 | + | |
19 | + // empty if no head info gathered for multi-segment mention | |
20 | + // if single-segment mention, then this segment is head | |
21 | + | |
22 | + public Mention(Token segment) { | |
23 | + this(segment, false); | |
24 | + } | |
25 | + | |
26 | + public Mention(List<Token> segments, List<Token> heads, boolean isZero) { | |
27 | + for (Token s : segments) { | |
28 | + s.addMention(this); | |
29 | + this.segments.add(s); | |
30 | + } | |
31 | + this.headSegments.addAll(heads); | |
32 | + this.isZeroSubject = isZero; | |
33 | + } | |
34 | + | |
35 | + public Mention(List<Token> segments, List<Token> heads) { | |
36 | + this(segments, heads, false); | |
37 | + } | |
38 | + | |
39 | + public Mention(Token token, boolean isZero) { | |
40 | + this.isZeroSubject = isZero; | |
41 | + token.addMention(this); | |
42 | + this.segments.add(token); | |
43 | + this.headSegments.add(token); | |
44 | + } | |
45 | + | |
46 | + public void addSegment(Token s) { | |
47 | + s.addMention(this); | |
48 | + this.segments.add(s); | |
49 | + } | |
50 | + | |
51 | + public void addHeadSegment(Token s) { | |
52 | + this.headSegments.add(s); | |
53 | + } | |
54 | + | |
55 | + public List<Token> getSegments() { | |
56 | + return segments; | |
57 | + } | |
58 | + | |
59 | + public Token getFirstSegment() { | |
60 | + return segments.get(0); | |
61 | + } | |
62 | + | |
63 | + public Token getLastSegment() { | |
64 | + return segments.get(segments.size() - 1); | |
65 | + } | |
66 | + | |
67 | + private Token getLastHeadSegment() { | |
68 | + List<Token> hs = this.getHeadSegments(); | |
69 | + if (hs.size() != 0) | |
70 | + return hs.get(hs.size() - 1); | |
71 | + return null; | |
72 | + } | |
73 | + | |
74 | + public String toString() { | |
75 | + StringBuffer sb = new StringBuffer(); | |
76 | + sb.append("["); | |
77 | + for (Token seg : segments) { | |
78 | + sb.append(seg.toString() + " "); | |
79 | + } | |
80 | + sb.append("]"); | |
81 | + return sb.toString(); | |
82 | + } | |
83 | + | |
84 | + public MentionGroup getMentionGroup() { | |
85 | + return mentionGroup; | |
86 | + } | |
87 | + | |
88 | + public void setMentionGroup(MentionGroup mentionGroup) { | |
89 | + this.mentionGroup = mentionGroup; | |
90 | + } | |
91 | + | |
92 | + public List<Token> getHeadSegments() { | |
93 | + return headSegments; | |
94 | + } | |
95 | + | |
96 | + public int getNoOfParentMentions() { | |
97 | + int result = -1; // because we don't want to count this mention | |
98 | + | |
99 | + // each parenting mention must contain all the segments of this one | |
100 | + for (Mention m : getFirstSegment().getMentions()) { | |
101 | + if (m.getSegments().containsAll(getSegments())) | |
102 | + result++; | |
103 | + } | |
104 | + return result; | |
105 | + } | |
106 | + | |
107 | + public boolean isPronoun() { | |
108 | + return this.segments.get(0).getChosenInterpretation().getCtag().matches("ppron.*"); | |
109 | + } | |
110 | + | |
111 | + @Override | |
112 | + public int hashCode() { | |
113 | + final int prime = 31; | |
114 | + int result = 1; | |
115 | + result = prime * result + ((headSegments == null) ? 0 : headSegments.hashCode()); | |
116 | + result = prime * result + ((segments == null) ? 0 : segments.hashCode()); | |
117 | + return result; | |
118 | + } | |
119 | + | |
120 | + @Override | |
121 | + public boolean equals(Object obj) { | |
122 | + if (this == obj) | |
123 | + return true; | |
124 | + if (obj == null) | |
125 | + return false; | |
126 | + if (getClass() != obj.getClass()) | |
127 | + return false; | |
128 | + Mention other = (Mention) obj; | |
129 | + if (headSegments == null) { | |
130 | + if (other.headSegments != null) | |
131 | + return false; | |
132 | + } else if (!headSegments.equals(other.headSegments)) | |
133 | + return false; | |
134 | + if (segments == null) { | |
135 | + if (other.segments != null) | |
136 | + return false; | |
137 | + } else if (!segments.equals(other.segments)) | |
138 | + return false; | |
139 | + return true; | |
140 | + } | |
141 | + | |
142 | + @Override | |
143 | + public int compareTo(Mention other) { | |
144 | + Token thisLastSegment = getLastSegment(); | |
145 | + Token anotherLastSegment = other.getLastSegment(); | |
146 | + | |
147 | + Sentence thisSentence = thisLastSegment.getSentence(); | |
148 | + Sentence anotherSentence = anotherLastSegment.getSentence(); | |
149 | + | |
150 | + Paragraph thisParagraph = thisSentence == null ? null : thisSentence.getParagraph(); | |
151 | + Paragraph anotherParagraph = anotherSentence == null ? null : anotherSentence.getParagraph(); | |
152 | + | |
153 | + String thisTextId = thisParagraph == null ? null : thisParagraph.getText().getId(); | |
154 | + String anotherTextId = anotherParagraph == null ? null : anotherParagraph.getText().getId(); | |
155 | + | |
156 | + int compare; | |
157 | + // first, compare by ids of texts | |
158 | + if (thisTextId != null && anotherTextId != null) { | |
159 | + compare = thisTextId.compareTo(anotherTextId); | |
160 | + if (compare != 0) | |
161 | + return compare; | |
162 | + } | |
163 | + | |
164 | + // second, compare by paragraph position | |
165 | + if (thisParagraph != null && anotherParagraph != null) { | |
166 | + compare = thisParagraph.getTextPosition().compareTo(anotherParagraph.getTextPosition()); | |
167 | + if (compare != 0) | |
168 | + return compare; | |
169 | + | |
170 | + // third, compare by sentence position | |
171 | + compare = thisSentence.getParagraphPosition().compareTo(anotherSentence.getParagraphPosition()); | |
172 | + if (compare != 0) | |
173 | + return compare; | |
174 | + } | |
175 | + | |
176 | + // fourth, compare by last segments | |
177 | + compare = thisLastSegment.getSentencePosition().compareTo(anotherLastSegment.getSentencePosition()); | |
178 | + if (compare != 0) | |
179 | + return compare; | |
180 | + | |
181 | + // fifth, compare by size | |
182 | + Integer thisSize = getSegments().size(); | |
183 | + Integer anotherSize = other.getSegments().size(); | |
184 | + compare = thisSize.compareTo(anotherSize); | |
185 | + if (compare != 0) | |
186 | + return compare; | |
187 | + | |
188 | + // sixth, compare by last head segments | |
189 | + Token thisLastHeadSegment = getLastHeadSegment(); | |
190 | + Token anotherLastHeadSegment = other.getLastHeadSegment(); | |
191 | + if (thisLastHeadSegment != null && anotherLastHeadSegment != null) { | |
192 | + compare = thisLastHeadSegment.getSentencePosition().compareTo(anotherLastHeadSegment.getSentencePosition()); | |
193 | + } | |
194 | + | |
195 | + // seventh, compare by head segments size | |
196 | + thisSize = getHeadSegments().size(); | |
197 | + anotherSize = other.getHeadSegments().size(); | |
198 | + compare = thisSize.compareTo(anotherSize); | |
199 | + | |
200 | + return compare; | |
201 | + } | |
202 | + | |
203 | + public boolean isZeroSubject() { | |
204 | + return isZeroSubject; | |
205 | + } | |
206 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/MentionGroup.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/MentionGroup.java | |
1 | +package pl.waw.ipipan.zil.core.md.entities; | |
2 | + | |
3 | +import java.util.ArrayList; | |
4 | +import java.util.Comparator; | |
5 | + | |
6 | +public class MentionGroup extends ArrayList<Mention> { | |
7 | + | |
8 | + private static final long serialVersionUID = 7051256137623728016L; | |
9 | + private String dominant; | |
10 | + | |
11 | + public MentionGroup() { | |
12 | + } | |
13 | + | |
14 | + public MentionGroup(Mention currentMention) { | |
15 | + add(currentMention); | |
16 | + } | |
17 | + | |
18 | + public boolean add(Mention m) { | |
19 | + m.setMentionGroup(this); | |
20 | + return super.add(m); | |
21 | + } | |
22 | + | |
23 | + public Mention getLastAddedMention() { | |
24 | + return this.get(this.size() - 1); | |
25 | + } | |
26 | + | |
27 | + public final static Comparator<MentionGroup> getMentionGroupComparator() { | |
28 | + return mentionGroupComparator; | |
29 | + } | |
30 | + | |
31 | + private final static Comparator<MentionGroup> mentionGroupComparator = new Comparator<MentionGroup>() { | |
32 | + | |
33 | + public int compare(MentionGroup mg1, MentionGroup mg2) { | |
34 | + Mention m1 = mg1.getLastAddedMention(); | |
35 | + Mention m2 = mg2.getLastAddedMention(); | |
36 | + return m1.compareTo(m2); | |
37 | + } | |
38 | + | |
39 | + }; | |
40 | + | |
41 | + public void setDominant(String string) { | |
42 | + this.dominant = string; | |
43 | + } | |
44 | + | |
45 | + public String getDominant() { | |
46 | + return this.dominant; | |
47 | + } | |
48 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/NamedEntity.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/NamedEntity.java | |
1 | +package pl.waw.ipipan.zil.core.md.entities; | |
2 | + | |
3 | +import java.util.Iterator; | |
4 | +import java.util.List; | |
5 | + | |
6 | +public class NamedEntity implements Comparable<NamedEntity> { | |
7 | + | |
8 | + private List<Token> tokens; | |
9 | + | |
10 | + public NamedEntity(List<Token> tokens) { | |
11 | + this.tokens = tokens; | |
12 | + } | |
13 | + | |
14 | + public List<Token> getTokens() { | |
15 | + return this.tokens; | |
16 | + } | |
17 | + | |
18 | + @Override | |
19 | + public int compareTo(NamedEntity o) { | |
20 | + Iterator<Token> it1 = getTokens().iterator(); | |
21 | + Iterator<Token> it2 = o.getTokens().iterator(); | |
22 | + while (it1.hasNext() && it2.hasNext()) { | |
23 | + Token t1 = it1.next(); | |
24 | + Token t2 = it2.next(); | |
25 | + if (t1.compareTo(t2) != 0) | |
26 | + return t1.compareTo(t2); | |
27 | + } | |
28 | + if (it1.hasNext()) | |
29 | + return 1; | |
30 | + if (it2.hasNext()) | |
31 | + return -1; | |
32 | + | |
33 | + return 0; | |
34 | + } | |
35 | + | |
36 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/Paragraph.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Paragraph.java | |
1 | +package pl.waw.ipipan.zil.core.md.entities; | |
2 | + | |
3 | +import java.util.ArrayList; | |
4 | + | |
5 | +public class Paragraph extends ArrayList<Sentence>{ | |
6 | + | |
7 | + private static final long serialVersionUID = 4871431562737902082L; | |
8 | + | |
9 | + private Text text; | |
10 | + private int textPosition; | |
11 | + | |
12 | + public boolean add(Sentence s) { | |
13 | + s.setParagraphPosition(this.size()); | |
14 | + s.setParagraph(this); | |
15 | + return super.add(s); | |
16 | + } | |
17 | + | |
18 | + public String toString() { | |
19 | + StringBuffer sb = new StringBuffer(); | |
20 | + for (Sentence sentence : this) | |
21 | + sb.append(sentence.toString()+"\n"); | |
22 | + return sb.toString(); | |
23 | + } | |
24 | + | |
25 | + public Text getText() { | |
26 | + return this.text; | |
27 | + } | |
28 | + | |
29 | + public void setText(Text text) { | |
30 | + this.text = text; | |
31 | + } | |
32 | + | |
33 | + public Integer getTextPosition() { | |
34 | + return this.textPosition; | |
35 | + } | |
36 | + | |
37 | + public void setTextPosition(int textPos) { | |
38 | + this.textPosition = textPos; | |
39 | + } | |
40 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java | |
1 | +package pl.waw.ipipan.zil.core.md.entities; | |
2 | + | |
3 | +import java.util.ArrayList; | |
4 | +import java.util.List; | |
5 | +import java.util.Set; | |
6 | +import java.util.TreeSet; | |
7 | + | |
8 | +public class Sentence extends ArrayList<Token> { | |
9 | + | |
10 | + private static final long serialVersionUID = -7300822552646737716L; | |
11 | + | |
12 | + private Paragraph paragraph; | |
13 | + private int paragraphPosition; | |
14 | + | |
15 | + private Set<Mention> mentions = new TreeSet<>(); | |
16 | + private Set<SyntacticWord> syntacticWords = new TreeSet<>(); | |
17 | + private Set<SyntacticGroup> syntacticGroups = new TreeSet<>(); | |
18 | + private Set<NamedEntity> namedEntities = new TreeSet<>(); | |
19 | + | |
20 | + public boolean add(Token s) { | |
21 | + s.setSentencePosition(this.size()); | |
22 | + s.setSentence(this); | |
23 | + return super.add(s); | |
24 | + } | |
25 | + | |
26 | + public void setParagraphPosition(int paragraphPosition) { | |
27 | + this.paragraphPosition = paragraphPosition; | |
28 | + } | |
29 | + | |
30 | + public Integer getParagraphPosition() { | |
31 | + return this.paragraphPosition; | |
32 | + } | |
33 | + | |
34 | + public void setParagraph(Paragraph paragraph) { | |
35 | + this.paragraph = paragraph; | |
36 | + } | |
37 | + | |
38 | + public Paragraph getParagraph() { | |
39 | + return this.paragraph; | |
40 | + } | |
41 | + | |
42 | + public void removeMention(Mention mention) { | |
43 | + mentions.remove(mention); | |
44 | + for (Token s : mention.getSegments()) | |
45 | + s.removeMention(mention); | |
46 | + } | |
47 | + | |
48 | + public void clearMentions() { | |
49 | + for (Mention mention : mentions) | |
50 | + for (Token s : mention.getSegments()) | |
51 | + s.removeMention(mention); | |
52 | + mentions.clear(); | |
53 | + } | |
54 | + | |
55 | + public String toStringWithoutMentions() { | |
56 | + StringBuffer sb = new StringBuffer(); | |
57 | + for (Token seg : this) { | |
58 | + if (!seg.toString().matches("\\[.*\\]")) { | |
59 | + sb.append(seg.toString()); | |
60 | + sb.append(" "); | |
61 | + } | |
62 | + } | |
63 | + return sb.toString(); | |
64 | + } | |
65 | + | |
66 | + public String toString() { | |
67 | + StringBuffer sb = new StringBuffer(); | |
68 | + for (Token seg : this) { | |
69 | + for (@SuppressWarnings("unused") | |
70 | + Mention m : seg.getMentionsStartingBeforeSegment()) | |
71 | + sb.append("["); | |
72 | + sb.append(seg.toString()); | |
73 | + for (@SuppressWarnings("unused") | |
74 | + Mention m : seg.getMentionsEndingAfterSegment()) | |
75 | + sb.append("]"); | |
76 | + sb.append(" "); | |
77 | + } | |
78 | + return sb.toString(); | |
79 | + } | |
80 | + | |
81 | + public List<Mention> getMentions() { | |
82 | + return new ArrayList<Mention>(mentions); | |
83 | + } | |
84 | + | |
85 | + public List<SyntacticWord> getSyntacticWords() { | |
86 | + return new ArrayList<>(syntacticWords); | |
87 | + } | |
88 | + | |
89 | + public List<NamedEntity> getNamedEntities() { | |
90 | + return new ArrayList<>(namedEntities); | |
91 | + } | |
92 | + | |
93 | + public List<SyntacticGroup> getGroups() { | |
94 | + return new ArrayList<>(syntacticGroups); | |
95 | + } | |
96 | + | |
97 | + public void addMention(Mention mention) { | |
98 | + mentions.add(mention); | |
99 | + } | |
100 | + | |
101 | + public void addSyntacticWord(SyntacticWord syntacticWord) { | |
102 | + syntacticWords.add(syntacticWord); | |
103 | + } | |
104 | + | |
105 | + public void addSyntacticGroup(SyntacticGroup syntacticGroup) { | |
106 | + syntacticGroups.add(syntacticGroup); | |
107 | + } | |
108 | + | |
109 | + public void addNamedEntity(NamedEntity namedEntity) { | |
110 | + namedEntities.add(namedEntity); | |
111 | + } | |
112 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java | |
1 | +package pl.waw.ipipan.zil.core.md.entities; | |
2 | + | |
3 | +import java.util.Iterator; | |
4 | +import java.util.List; | |
5 | + | |
6 | +public class SyntacticGroup implements Comparable<SyntacticGroup> { | |
7 | + | |
8 | + private String type; | |
9 | + private List<Token> tokens; | |
10 | + private List<Token> headTokens; | |
11 | + | |
12 | + public SyntacticGroup(String type, List<Token> tokens, | |
13 | + List<Token> headTokens) { | |
14 | + this.type = type; | |
15 | + this.tokens = tokens; | |
16 | + this.headTokens = headTokens; | |
17 | + } | |
18 | + | |
19 | + public String getType() { | |
20 | + return type; | |
21 | + } | |
22 | + | |
23 | + public List<Token> getTokens() { | |
24 | + return tokens; | |
25 | + } | |
26 | + | |
27 | + public List<Token> getSemanticHeadTokens() { | |
28 | + return headTokens; | |
29 | + } | |
30 | + | |
31 | + @Override | |
32 | + public int compareTo(SyntacticGroup o) { | |
33 | + Iterator<Token> it1 = getTokens().iterator(); | |
34 | + Iterator<Token> it2 = o.getTokens().iterator(); | |
35 | + while (it1.hasNext() && it2.hasNext()) { | |
36 | + Token t1 = it1.next(); | |
37 | + Token t2 = it2.next(); | |
38 | + if (t1.compareTo(t2) != 0) | |
39 | + return t1.compareTo(t2); | |
40 | + } | |
41 | + it1 = getSemanticHeadTokens().iterator(); | |
42 | + it2 = o.getSemanticHeadTokens().iterator(); | |
43 | + while (it1.hasNext() && it2.hasNext()) { | |
44 | + Token t1 = it1.next(); | |
45 | + Token t2 = it2.next(); | |
46 | + if (t1.compareTo(t2) != 0) | |
47 | + return t1.compareTo(t2); | |
48 | + } | |
49 | + if (it1.hasNext()) | |
50 | + return 1; | |
51 | + if (it2.hasNext()) | |
52 | + return -1; | |
53 | + | |
54 | + return getType().compareTo(o.getType()); | |
55 | + } | |
56 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java | |
1 | +package pl.waw.ipipan.zil.core.md.entities; | |
2 | + | |
3 | +import java.util.ArrayList; | |
4 | +import java.util.Iterator; | |
5 | +import java.util.List; | |
6 | + | |
7 | +public class SyntacticWord implements Comparable<SyntacticWord> { | |
8 | + | |
9 | + private String ctag; | |
10 | + private List<Token> tokens = new ArrayList<>(); | |
11 | + | |
12 | + public SyntacticWord(String ctag, List<Token> tokens) { | |
13 | + this.ctag = ctag; | |
14 | + this.tokens = tokens; | |
15 | + } | |
16 | + | |
17 | + public String getCtag() { | |
18 | + return ctag; | |
19 | + } | |
20 | + | |
21 | + public List<Token> getTokens() { | |
22 | + return tokens; | |
23 | + } | |
24 | + | |
25 | + @Override | |
26 | + public int compareTo(SyntacticWord o) { | |
27 | + Iterator<Token> it1 = getTokens().iterator(); | |
28 | + Iterator<Token> it2 = o.getTokens().iterator(); | |
29 | + while (it1.hasNext() && it2.hasNext()) { | |
30 | + Token t1 = it1.next(); | |
31 | + Token t2 = it2.next(); | |
32 | + if (t1.compareTo(t2) != 0) | |
33 | + return t1.compareTo(t2); | |
34 | + } | |
35 | + if (it1.hasNext()) | |
36 | + return 1; | |
37 | + if (it2.hasNext()) | |
38 | + return -1; | |
39 | + | |
40 | + return getCtag().compareTo(o.getCtag()); | |
41 | + } | |
42 | + | |
43 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/Text.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Text.java | |
1 | +package pl.waw.ipipan.zil.core.md.entities; | |
2 | + | |
3 | +import java.util.ArrayList; | |
4 | + | |
5 | +public class Text extends ArrayList<Paragraph> implements Comparable<Text> { | |
6 | + | |
7 | + private static final long serialVersionUID = 3433069117444647544L; | |
8 | + | |
9 | + private String id; | |
10 | + | |
11 | + public boolean add(Paragraph p) { | |
12 | + p.setTextPosition(this.size()); | |
13 | + p.setText(this); | |
14 | + return super.add(p); | |
15 | + } | |
16 | + | |
17 | + public String getId() { | |
18 | + return id; | |
19 | + } | |
20 | + | |
21 | + public void setId(String id) { | |
22 | + this.id = id; | |
23 | + } | |
24 | + | |
25 | + public Text(String id) { | |
26 | + setId(id); | |
27 | + } | |
28 | + | |
29 | + public String toString() { | |
30 | + StringBuffer sb = new StringBuffer(); | |
31 | + for (Paragraph par : this) | |
32 | + sb.append(par.toString() + "\n\n"); | |
33 | + return sb.toString(); | |
34 | + } | |
35 | + | |
36 | + public int compareTo(Text o) { | |
37 | + return getId().compareTo(o.getId()); | |
38 | + } | |
39 | + | |
40 | + public void clearMentions() { | |
41 | + for (Paragraph p : this) | |
42 | + for (Sentence sent : p) | |
43 | + sent.clearMentions(); | |
44 | + } | |
45 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/Token.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Token.java | |
1 | +package pl.waw.ipipan.zil.core.md.entities; | |
2 | + | |
3 | +import java.util.ArrayList; | |
4 | +import java.util.Collection; | |
5 | +import java.util.Collections; | |
6 | +import java.util.HashSet; | |
7 | +import java.util.List; | |
8 | +import java.util.Set; | |
9 | + | |
10 | +public class Token implements Comparable<Token> { | |
11 | + private Sentence sentence; | |
12 | + private int sentencePosition; | |
13 | + | |
14 | + private Set<Mention> mentions = null; | |
15 | + | |
16 | + private String orth; | |
17 | + private Interpretation chosenInterpretation; | |
18 | + private Collection<Interpretation> allInterpretations = new HashSet<Interpretation>(); | |
19 | + | |
20 | + public Integer getSentencePosition() { | |
21 | + return sentencePosition; | |
22 | + } | |
23 | + | |
24 | + public void setSentencePosition(int sentencePosition) { | |
25 | + this.sentencePosition = sentencePosition; | |
26 | + } | |
27 | + | |
28 | + public Sentence getSentence() { | |
29 | + return sentence; | |
30 | + } | |
31 | + | |
32 | + public void setSentence(Sentence sentence) { | |
33 | + this.sentence = sentence; | |
34 | + } | |
35 | + | |
36 | + public void setOrth(String orth2) { | |
37 | + this.orth = orth2; | |
38 | + } | |
39 | + | |
40 | + public String getOrth() { | |
41 | + return this.orth; | |
42 | + } | |
43 | + | |
44 | + public void addChosenInterpretation(Interpretation chosenIterpretation) { | |
45 | + setChosenInterpretation(chosenIterpretation); | |
46 | + addInterpretation(chosenIterpretation); | |
47 | + } | |
48 | + | |
49 | + public void setChosenInterpretation(Interpretation chosenIterpretation) { | |
50 | + this.chosenInterpretation = chosenIterpretation; | |
51 | + } | |
52 | + | |
53 | + public Interpretation getChosenInterpretation() { | |
54 | + return this.chosenInterpretation; | |
55 | + } | |
56 | + | |
57 | + public String getBase() { | |
58 | + return this.getChosenInterpretation().getBase(); | |
59 | + } | |
60 | + | |
61 | + public String getNumber() { | |
62 | + return this.getChosenInterpretation().getNumber(); | |
63 | + } | |
64 | + | |
65 | + public String getGender() { | |
66 | + return this.getChosenInterpretation().getGender(); | |
67 | + } | |
68 | + | |
69 | + public String getCase() { | |
70 | + return this.getChosenInterpretation().getCase(); | |
71 | + } | |
72 | + | |
73 | + public String getPerson() { | |
74 | + return this.getChosenInterpretation().getPerson(); | |
75 | + } | |
76 | + | |
77 | + public void addInterpretation(Interpretation inter) { | |
78 | + this.allInterpretations.add(inter); | |
79 | + } | |
80 | + | |
81 | + public String toString() { | |
82 | + return orth; | |
83 | + } | |
84 | + | |
85 | + public void addMention(Mention mention) { | |
86 | + if (this.mentions == null) | |
87 | + this.mentions = new HashSet<Mention>(); | |
88 | + | |
89 | + this.mentions.add(mention); | |
90 | + } | |
91 | + | |
92 | + public void removeMention(Mention mention) { | |
93 | + this.mentions.remove(mention); | |
94 | + } | |
95 | + | |
96 | + public Set<Mention> getMentions() { | |
97 | + if (this.mentions == null) | |
98 | + return new HashSet<Mention>(); | |
99 | + return this.mentions; | |
100 | + } | |
101 | + | |
102 | + public List<Mention> getMentionsStartingBeforeSegment() { | |
103 | + List<Mention> result = new ArrayList<Mention>(); | |
104 | + for (Mention m : getMentions()) | |
105 | + if (m.getFirstSegment().equals(this)) | |
106 | + result.add(m); | |
107 | + | |
108 | + Collections.sort(result); | |
109 | + Collections.reverse(result); | |
110 | + return result; | |
111 | + } | |
112 | + | |
113 | + public List<Mention> getMentionsEndingAfterSegment() { | |
114 | + List<Mention> result = new ArrayList<Mention>(); | |
115 | + for (Mention m : getMentions()) | |
116 | + if (m.getLastSegment().equals(this)) | |
117 | + result.add(m); | |
118 | + | |
119 | + Collections.sort(result); | |
120 | + Collections.reverse(result); | |
121 | + return result; | |
122 | + } | |
123 | + | |
124 | + public String getCtag() { | |
125 | + return getChosenInterpretation().getCtag(); | |
126 | + } | |
127 | + | |
128 | + @Override | |
129 | + public int compareTo(Token o) { | |
130 | + return getSentencePosition().compareTo(o.getSentencePosition()); | |
131 | + } | |
132 | + | |
133 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java | |
1 | +package pl.waw.ipipan.zil.core.md.io.tei; | |
2 | + | |
3 | +import ipipan.clarin.tei.api.entities.TEICorpusText; | |
4 | +import ipipan.clarin.tei.api.entities.TEIGroup; | |
5 | +import ipipan.clarin.tei.api.entities.TEIInterpretation; | |
6 | +import ipipan.clarin.tei.api.entities.TEIMention; | |
7 | +import ipipan.clarin.tei.api.entities.TEIMorph; | |
8 | +import ipipan.clarin.tei.api.entities.TEINamedEntity; | |
9 | +import ipipan.clarin.tei.api.entities.TEIParagraph; | |
10 | +import ipipan.clarin.tei.api.entities.TEISentence; | |
11 | +import ipipan.clarin.tei.api.entities.TEISyntacticEntity; | |
12 | +import ipipan.clarin.tei.api.entities.TEIWord; | |
13 | +import ipipan.clarin.tei.api.exceptions.TEIException; | |
14 | +import ipipan.clarin.tei.api.io.TEI_IO; | |
15 | + | |
16 | +import java.io.File; | |
17 | +import java.util.ArrayList; | |
18 | +import java.util.HashMap; | |
19 | +import java.util.List; | |
20 | +import java.util.Map; | |
21 | + | |
22 | +import org.apache.log4j.Logger; | |
23 | + | |
24 | +import pl.waw.ipipan.zil.core.md.entities.Interpretation; | |
25 | +import pl.waw.ipipan.zil.core.md.entities.Mention; | |
26 | +import pl.waw.ipipan.zil.core.md.entities.NamedEntity; | |
27 | +import pl.waw.ipipan.zil.core.md.entities.Paragraph; | |
28 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
29 | +import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup; | |
30 | +import pl.waw.ipipan.zil.core.md.entities.SyntacticWord; | |
31 | +import pl.waw.ipipan.zil.core.md.entities.Text; | |
32 | +import pl.waw.ipipan.zil.core.md.entities.Token; | |
33 | + | |
34 | +public class TeiLoader { | |
35 | + | |
36 | + private static Logger logger = Logger.getLogger(TeiLoader.class); | |
37 | + private static TEI_IO teiAPI = TEI_IO.getInstance(); | |
38 | + | |
39 | + public static TEICorpusText readTeiText(File teiDir) throws TEIException { | |
40 | + return teiAPI.readFromNKJPDirectory(teiDir); | |
41 | + } | |
42 | + | |
43 | + public static Text loadTextFromTei(TEICorpusText teiText) { | |
44 | + Text text = new Text(teiText.getCorpusHeader().getId()); | |
45 | + | |
46 | + logger.debug("Loading tei text " + text.getId() + "..."); | |
47 | + for (TEIParagraph teiP : teiText.getParagraphs()) | |
48 | + loadParagraph(text, teiP); | |
49 | + logger.debug("Tei text loaded."); | |
50 | + | |
51 | + return text; | |
52 | + } | |
53 | + | |
54 | + private static void loadParagraph(Text text, TEIParagraph teiP) { | |
55 | + Paragraph p = new Paragraph(); | |
56 | + text.add(p); | |
57 | + for (TEISentence teiS : teiP.getSentences()) | |
58 | + loadSentence(p, teiS); | |
59 | + } | |
60 | + | |
61 | + private static void loadSentence(Paragraph p, TEISentence teiS) { | |
62 | + Sentence s = new Sentence(); | |
63 | + p.add(s); | |
64 | + Map<TEIMorph, Token> teiMorph2Segment = new HashMap<>(); | |
65 | + for (TEIMorph teiM : teiS.getMorphs()) { | |
66 | + Token token = loadToken(s, teiM); | |
67 | + teiMorph2Segment.put(teiM, token); | |
68 | + } | |
69 | + for (TEINamedEntity ne : teiS.getAllNamedEntities()) | |
70 | + loadNE(s, ne, teiMorph2Segment); | |
71 | + for (TEIWord w : teiS.getAllWords()) | |
72 | + loadSyntacticWord(s, w, teiMorph2Segment); | |
73 | + for (TEIGroup g : teiS.getAllGroups()) | |
74 | + loadSyntacticGroup(s, g, teiMorph2Segment); | |
75 | + for (TEIMention m : teiS.getAllMentions()) | |
76 | + loadMentions(s, m, teiMorph2Segment); | |
77 | + } | |
78 | + | |
79 | + private static void loadMentions(Sentence s, TEIMention m, | |
80 | + Map<TEIMorph, Token> teiMorph2Segment) { | |
81 | + List<Token> tokens = new ArrayList<>(); | |
82 | + for (TEIMorph mo : m.getMorphs()) | |
83 | + tokens.add(teiMorph2Segment.get(mo)); | |
84 | + List<Token> headTokens = new ArrayList<>(); | |
85 | + for (TEIMorph mo : m.getHeadMorphs()) | |
86 | + headTokens.add(teiMorph2Segment.get(mo)); | |
87 | + s.addMention(new Mention(tokens, headTokens, m.isZeroSubject())); | |
88 | + } | |
89 | + | |
90 | + private static void loadSyntacticGroup(Sentence s, TEIGroup g, | |
91 | + Map<TEIMorph, Token> teiMorph2Segment) { | |
92 | + String type = g.getType(); | |
93 | + | |
94 | + List<Token> tokens = new ArrayList<>(); | |
95 | + for (TEIMorph m : g.getLeaves()) | |
96 | + tokens.add(teiMorph2Segment.get(m)); | |
97 | + | |
98 | + List<Token> headTokens = new ArrayList<>(); | |
99 | + TEISyntacticEntity semanticHead = g; | |
100 | + while (semanticHead.isGroup() | |
101 | + && semanticHead.asGroup().getSemanticHead() != null) | |
102 | + semanticHead = semanticHead.asGroup().getSemanticHead(); | |
103 | + for (TEIMorph m : semanticHead.getLeaves()) | |
104 | + headTokens.add(teiMorph2Segment.get(m)); | |
105 | + | |
106 | + s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens)); | |
107 | + } | |
108 | + | |
109 | + private static void loadSyntacticWord(Sentence s, TEIWord w, | |
110 | + Map<TEIMorph, Token> teiMorph2Segment) { | |
111 | + String ctag = w.getInterpretation().getCtag(); | |
112 | + List<Token> tokens = new ArrayList<>(); | |
113 | + for (TEIMorph m : w.getAllMorphs()) | |
114 | + tokens.add(teiMorph2Segment.get(m)); | |
115 | + s.addSyntacticWord(new SyntacticWord(ctag, tokens)); | |
116 | + } | |
117 | + | |
118 | + private static void loadNE(Sentence s, TEINamedEntity ne, | |
119 | + Map<TEIMorph, Token> teiMorph2Segment) { | |
120 | + List<Token> tokens = new ArrayList<>(); | |
121 | + for (TEIMorph m : ne.getLeaves()) | |
122 | + tokens.add(teiMorph2Segment.get(m)); | |
123 | + s.addNamedEntity(new NamedEntity(tokens)); | |
124 | + } | |
125 | + | |
126 | + private static Token loadToken(Sentence s, TEIMorph teiM) { | |
127 | + Token seg = new Token(); | |
128 | + s.add(seg); | |
129 | + | |
130 | + seg.setOrth(teiM.getOrth()); | |
131 | + TEIInterpretation interp = teiM.getChosenInterpretation(); | |
132 | + Interpretation chosenIterpretation = new Interpretation( | |
133 | + interp.getCtag(), interp.getMorph(), interp.getBase()); | |
134 | + seg.addChosenInterpretation(chosenIterpretation); | |
135 | + | |
136 | + for (TEIInterpretation interp2 : teiM.getAllInterpretations()) { | |
137 | + Interpretation inter = new Interpretation(interp2.getCtag(), | |
138 | + interp2.getMorph(), interp.getBase()); | |
139 | + seg.addInterpretation(inter); | |
140 | + } | |
141 | + | |
142 | + return seg; | |
143 | + } | |
144 | + | |
145 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiSaver.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiSaver.java | |
1 | +package pl.waw.ipipan.zil.core.md.io.tei; | |
2 | + | |
3 | +import ipipan.clarin.tei.api.entities.AnnotationLayer; | |
4 | +import ipipan.clarin.tei.api.entities.EntitiesFactory; | |
5 | +import ipipan.clarin.tei.api.entities.TEICoreference; | |
6 | +import ipipan.clarin.tei.api.entities.TEICorpusText; | |
7 | +import ipipan.clarin.tei.api.entities.TEIMention; | |
8 | +import ipipan.clarin.tei.api.entities.TEIMorph; | |
9 | +import ipipan.clarin.tei.api.entities.TEIParagraph; | |
10 | +import ipipan.clarin.tei.api.entities.TEISentence; | |
11 | +import ipipan.clarin.tei.api.exceptions.TEIException; | |
12 | +import ipipan.clarin.tei.api.io.TEI_IO; | |
13 | +import ipipan.clarin.tei.api.io.TEI_IO.CompressionMethod; | |
14 | + | |
15 | +import java.io.File; | |
16 | +import java.util.ArrayList; | |
17 | +import java.util.HashMap; | |
18 | +import java.util.Iterator; | |
19 | +import java.util.List; | |
20 | +import java.util.Map; | |
21 | + | |
22 | +import org.apache.log4j.Logger; | |
23 | + | |
24 | +import pl.waw.ipipan.zil.core.md.entities.Mention; | |
25 | +import pl.waw.ipipan.zil.core.md.entities.Paragraph; | |
26 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
27 | +import pl.waw.ipipan.zil.core.md.entities.Text; | |
28 | +import pl.waw.ipipan.zil.core.md.entities.Token; | |
29 | + | |
30 | +public class TeiSaver { | |
31 | + | |
32 | + private static Logger logger = Logger.getLogger(TeiSaver.class); | |
33 | + private static TEI_IO teiAPI = TEI_IO.getInstance(); | |
34 | + final private static EntitiesFactory ef = EntitiesFactory.getInstance(); | |
35 | + | |
36 | + public static void saveTeiText(TEICorpusText teiText, File targetDir, boolean gzip) throws TEIException { | |
37 | + logger.debug("Saving text in " + targetDir); | |
38 | + CompressionMethod cm = gzip ? CompressionMethod.GZIP : CompressionMethod.NONE; | |
39 | + teiAPI.writeToNKJPDirectory(teiText, targetDir, cm); | |
40 | + } | |
41 | + | |
42 | + public static void updateTeiText(Text t, TEICorpusText teiText) throws TEIException { | |
43 | + Map<Mention, TEIMention> mention2mention = new HashMap<Mention, TEIMention>(); | |
44 | + | |
45 | + Iterator<Paragraph> pIt = t.iterator(); | |
46 | + Iterator<TEIParagraph> pItTei = teiText.getParagraphs().iterator(); | |
47 | + int mentionId = 0; | |
48 | + while (pIt.hasNext() && pItTei.hasNext()) { | |
49 | + Paragraph p = pIt.next(); | |
50 | + TEIParagraph pTei = pItTei.next(); | |
51 | + | |
52 | + mentionId = updateTeiParagraph(mention2mention, mentionId, p, pTei); | |
53 | + } | |
54 | + checkIterators(pIt, pItTei, "paragraph"); | |
55 | + | |
56 | + teiText.addAnnotationLayer(AnnotationLayer.MENTIONS, | |
57 | + EntitiesFactory.getInstance().createHeader(AnnotationLayer.MENTIONS)); | |
58 | + | |
59 | + // clear coreference as we have new mentions it became invalid | |
60 | + teiText.getAnnotationLayers().remove(AnnotationLayer.COREFERENCE); | |
61 | + teiText.setCoreferences(new ArrayList<TEICoreference>()); | |
62 | + | |
63 | + logger.debug(mentionId + " mentions added"); | |
64 | + } | |
65 | + | |
66 | + private static int updateTeiParagraph(Map<Mention, TEIMention> mention2mention, int mentionId, Paragraph p, | |
67 | + TEIParagraph pTei) throws TEIException { | |
68 | + Iterator<Sentence> sIt = p.iterator(); | |
69 | + Iterator<TEISentence> sItTei = pTei.getSentences().iterator(); | |
70 | + | |
71 | + while (sIt.hasNext() && sItTei.hasNext()) { | |
72 | + Sentence s = sIt.next(); | |
73 | + TEISentence sTei = sItTei.next(); | |
74 | + mentionId = updateTeiSentence(mention2mention, mentionId, s, sTei); | |
75 | + } | |
76 | + checkIterators(sIt, sItTei, "sentence"); | |
77 | + return mentionId; | |
78 | + } | |
79 | + | |
80 | + private static int updateTeiSentence(Map<Mention, TEIMention> mention2mention, int mentionId, Sentence s, | |
81 | + TEISentence sTei) throws TEIException { | |
82 | + sTei.getAllMentions().clear(); | |
83 | + | |
84 | + Map<Token, TEIMorph> seg2morph = new HashMap<Token, TEIMorph>(); | |
85 | + | |
86 | + Iterator<Token> segIt = s.iterator(); | |
87 | + Iterator<TEIMorph> segItTei = sTei.getMorphs().iterator(); | |
88 | + | |
89 | + while (segIt.hasNext() && segItTei.hasNext()) { | |
90 | + seg2morph.put(segIt.next(), segItTei.next()); | |
91 | + } | |
92 | + checkIterators(segIt, segItTei, "token"); | |
93 | + | |
94 | + List<TEIMention> mentions = new ArrayList<TEIMention>(); | |
95 | + | |
96 | + for (Mention m : s.getMentions()) { | |
97 | + List<TEIMorph> morphs = new ArrayList<TEIMorph>(); | |
98 | + List<TEIMorph> heads = new ArrayList<TEIMorph>(); | |
99 | + | |
100 | + for (Token seg : m.getSegments()) | |
101 | + morphs.add(seg2morph.get(seg)); | |
102 | + | |
103 | + for (Token seg : m.getHeadSegments()) | |
104 | + heads.add(seg2morph.get(seg)); | |
105 | + | |
106 | + TEIMention mention = ef.createMention("mention_" + mentionId++, morphs, heads, m.isZeroSubject()); | |
107 | + mentions.add(mention); | |
108 | + mention2mention.put(m, mention); | |
109 | + } | |
110 | + sTei.setMentions(mentions); | |
111 | + return mentionId; | |
112 | + } | |
113 | + | |
114 | + private static void checkIterators(Iterator<? extends Object> one, Iterator<? extends Object> other, String level) | |
115 | + throws TEIException { | |
116 | + if (one.hasNext() || other.hasNext()) | |
117 | + throw new TEIException("Problem mapping tei to thrift for level " + level); | |
118 | + } | |
119 | + | |
120 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java | |
1 | +package pl.waw.ipipan.zil.core.md.io.thrift; | |
2 | + | |
3 | +import java.util.ArrayList; | |
4 | +import java.util.HashMap; | |
5 | +import java.util.List; | |
6 | +import java.util.Map; | |
7 | + | |
8 | +import org.apache.log4j.Logger; | |
9 | + | |
10 | +import pl.waw.ipipan.multiservice.thrift.types.MultiserviceException; | |
11 | +import pl.waw.ipipan.multiservice.thrift.types.TInterpretation; | |
12 | +import pl.waw.ipipan.multiservice.thrift.types.TNamedEntity; | |
13 | +import pl.waw.ipipan.multiservice.thrift.types.TParagraph; | |
14 | +import pl.waw.ipipan.multiservice.thrift.types.TSentence; | |
15 | +import pl.waw.ipipan.multiservice.thrift.types.TSyntacticGroup; | |
16 | +import pl.waw.ipipan.multiservice.thrift.types.TSyntacticWord; | |
17 | +import pl.waw.ipipan.multiservice.thrift.types.TText; | |
18 | +import pl.waw.ipipan.multiservice.thrift.types.TToken; | |
19 | +import pl.waw.ipipan.zil.core.md.entities.Interpretation; | |
20 | +import pl.waw.ipipan.zil.core.md.entities.NamedEntity; | |
21 | +import pl.waw.ipipan.zil.core.md.entities.Paragraph; | |
22 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
23 | +import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup; | |
24 | +import pl.waw.ipipan.zil.core.md.entities.SyntacticWord; | |
25 | +import pl.waw.ipipan.zil.core.md.entities.Text; | |
26 | +import pl.waw.ipipan.zil.core.md.entities.Token; | |
27 | + | |
28 | +/** | |
29 | + * Converts a multiservice thrift text ({@link TText}) into the internal | |
30 | + * {@link Text} model: paragraphs, sentences, tokens with their | |
31 | + * interpretations, named entities, syntactic words and syntactic groups. | |
32 | + */ | |
33 | +public class ThriftLoader { | |
34 | + | |
35 | +    private static final Logger logger = Logger.getLogger(ThriftLoader.class); | |
36 | + | |
37 | +    /** | |
38 | +     * Builds an internal {@link Text} from a thrift text. | |
39 | +     * | |
40 | +     * @param thriftText thrift text whose paragraphs are converted in order | |
41 | +     * @return the new text; its id is the thrift header id, or the literal | |
42 | +     *         string "null" when the thrift text has no header | |
43 | +     * @throws MultiserviceException if some token has neither a chosen nor a | |
44 | +     *             first candidate interpretation with a non-empty base form | |
45 | +     */ | |
46 | +    public static Text loadTextFromThrift(TText thriftText) | |
47 | +            throws MultiserviceException { | |
48 | +        Text text = new Text(thriftText.getTextHeader() == null ? "null" | |
49 | +                : thriftText.getTextHeader().getId()); | |
50 | + | |
51 | +        logger.debug("Loading text " + text.getId() + " from thrift format..."); | |
52 | +        for (TParagraph thriftParagraph : thriftText.getParagraphs()) | |
53 | +            loadParagraph(text, thriftParagraph); | |
54 | +        logger.debug("Thrift text loaded."); | |
55 | + | |
56 | +        return text; | |
57 | +    } | |
58 | + | |
59 | +    /** Appends a new paragraph to {@code text} and loads all of its sentences. */ | |
60 | +    private static void loadParagraph(Text text, TParagraph thriftParagraph) | |
61 | +            throws MultiserviceException { | |
62 | +        Paragraph p = new Paragraph(); | |
63 | +        text.add(p); | |
64 | + | |
65 | +        for (TSentence thriftSent : thriftParagraph.getSentences()) | |
66 | +            loadSentence(p, thriftSent); | |
67 | +    } | |
68 | + | |
69 | +    /** | |
70 | +     * Appends a new sentence to {@code p} and fills it with tokens, followed by | |
71 | +     * the named entities, syntactic words and syntactic groups that are set on | |
72 | +     * the thrift sentence. | |
73 | +     */ | |
74 | +    private static void loadSentence(Paragraph p, TSentence thriftSent) | |
75 | +            throws MultiserviceException { | |
76 | +        Sentence s = new Sentence(); | |
77 | +        p.add(s); | |
78 | + | |
79 | +        Map<String, Object> thriftId2Entity = getThriftId2EntityMap(thriftSent); | |
80 | + | |
81 | +        // Tokens are loaded first: names, words and groups are resolved to | |
82 | +        // tokens through this thrift-id -> token map. | |
83 | +        Map<String, Token> thriftTokenId2Token = new HashMap<>(); | |
84 | +        for (TToken thriftToken : thriftSent.getTokens()) { | |
85 | +            Token token = loadToken(s, thriftToken); | |
86 | +            thriftTokenId2Token.put(thriftToken.getId(), token); | |
87 | +        } | |
88 | +        if (thriftSent.isSetNames()) | |
89 | +            for (TNamedEntity ne : thriftSent.getNames()) | |
90 | +                loadNE(s, ne, thriftId2Entity, thriftTokenId2Token); | |
91 | +        if (thriftSent.isSetWords()) | |
92 | +            for (TSyntacticWord w : thriftSent.getWords()) | |
93 | +                loadSyntacticWord(s, w, thriftId2Entity, thriftTokenId2Token); | |
94 | +        if (thriftSent.isSetGroups()) | |
95 | +            for (TSyntacticGroup g : thriftSent.getGroups()) | |
96 | +                loadSyntacticGroup(s, g, thriftId2Entity, thriftTokenId2Token); | |
97 | +    } | |
98 | + | |
99 | +    /** | |
100 | +     * Adds a {@link SyntacticGroup} to {@code s}, built from the group's type, | |
101 | +     * all tokens under the group and the tokens under its semantic head. | |
102 | +     */ | |
103 | +    private static void loadSyntacticGroup(Sentence s, TSyntacticGroup g, | |
104 | +            Map<String, Object> thriftId2Entity, | |
105 | +            Map<String, Token> thriftTokenId2Token) { | |
106 | +        String type = g.getType(); | |
107 | +        List<Token> tokens = getUnderlyingSegments(g, thriftId2Entity, | |
108 | +                thriftTokenId2Token, false); | |
109 | +        List<Token> headTokens = getUnderlyingSegments(g, thriftId2Entity, | |
110 | +                thriftTokenId2Token, true); | |
111 | +        s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens)); | |
112 | +    } | |
113 | + | |
114 | +    /** | |
115 | +     * Adds a {@link SyntacticWord} to {@code s}, built from the ctag of the | |
116 | +     * word's chosen interpretation and the tokens under the word. | |
117 | +     */ | |
118 | +    private static void loadSyntacticWord(Sentence s, TSyntacticWord w, | |
119 | +            Map<String, Object> thriftId2Entity, | |
120 | +            Map<String, Token> thriftTokenId2Token) { | |
121 | +        String ctag = w.getChosenInterpretation().getCtag(); | |
122 | +        List<Token> tokens = getUnderlyingSegments(w, thriftId2Entity, | |
123 | +                thriftTokenId2Token, false); | |
124 | +        s.addSyntacticWord(new SyntacticWord(ctag, tokens)); | |
125 | +    } | |
126 | + | |
127 | +    /** Adds a {@link NamedEntity} made of the tokens under {@code ne} to {@code s}. */ | |
128 | +    private static void loadNE(Sentence s, TNamedEntity ne, | |
129 | +            Map<String, Object> thriftId2Entity, | |
130 | +            Map<String, Token> thriftTokenId2Token) { | |
131 | +        List<Token> tokens = getUnderlyingSegments(ne, thriftId2Entity, | |
132 | +                thriftTokenId2Token, false); | |
133 | +        s.addNamedEntity(new NamedEntity(tokens)); | |
134 | +    } | |
135 | + | |
136 | +    /** | |
137 | +     * Indexes the tokens of the sentence and, when set, its words, named | |
138 | +     * entities and groups by thrift id, so that child ids can be resolved. | |
139 | +     */ | |
140 | +    private static Map<String, Object> getThriftId2EntityMap( | |
141 | +            TSentence thriftSent) { | |
142 | +        Map<String, Object> idToEntity = new HashMap<>(); | |
143 | +        for (TToken tok : thriftSent.getTokens()) | |
144 | +            idToEntity.put(tok.getId(), tok); | |
145 | +        if (thriftSent.isSetWords()) | |
146 | +            for (TSyntacticWord w : thriftSent.getWords()) | |
147 | +                idToEntity.put(w.getId(), w); | |
148 | +        if (thriftSent.isSetNames()) | |
149 | +            for (TNamedEntity ne : thriftSent.getNames()) | |
150 | +                idToEntity.put(ne.getId(), ne); | |
151 | +        if (thriftSent.isSetGroups()) | |
152 | +            for (TSyntacticGroup group : thriftSent.getGroups()) | |
153 | +                idToEntity.put(group.getId(), group); | |
154 | +        return idToEntity; | |
155 | +    } | |
156 | + | |
157 | +    /** | |
158 | +     * Appends a new {@link Token} to {@code s}, copying the orth, the chosen | |
159 | +     * interpretation and every interpretation of the thrift token. | |
160 | +     * | |
161 | +     * @throws MultiserviceException if the token has no usable interpretation | |
162 | +     *             (see {@link #getTokenChosenInt(TToken)}) | |
163 | +     */ | |
164 | +    private static Token loadToken(Sentence s, TToken thriftToken) | |
165 | +            throws MultiserviceException { | |
166 | +        Token seg = new Token(); | |
167 | +        s.add(seg); | |
168 | + | |
169 | +        seg.setOrth(thriftToken.getOrth()); | |
170 | +        TInterpretation chosen = getTokenChosenInt(thriftToken); | |
171 | +        seg.addChosenInterpretation(new Interpretation(chosen.getCtag(), | |
172 | +                chosen.getMsd(), chosen.getBase())); | |
173 | + | |
174 | +        for (TInterpretation interp : thriftToken.getInterpretations()) { | |
175 | +            // Each interpretation keeps its own base form; reusing the chosen | |
176 | +            // interpretation's base here would give every reading the same lemma. | |
177 | +            seg.addInterpretation(new Interpretation(interp.getCtag(), | |
178 | +                    interp.getMsd(), interp.getBase())); | |
179 | +        } | |
180 | +        return seg; | |
181 | +    } | |
182 | + | |
183 | +    /** | |
184 | +     * Picks the interpretation to treat as chosen for {@code token}: its chosen | |
185 | +     * interpretation when that has a non-empty base form, otherwise its first | |
186 | +     * candidate interpretation. | |
187 | +     * | |
188 | +     * @throws MultiserviceException if neither of the two has a non-empty base | |
189 | +     */ | |
190 | +    private static TInterpretation getTokenChosenInt(TToken token) | |
191 | +            throws MultiserviceException { | |
192 | +        TInterpretation chosen = token.getChosenInterpretation(); | |
193 | +        if (hasBase(chosen)) | |
194 | +            return chosen; | |
195 | + | |
196 | +        List<TInterpretation> candidates = token.getCandidateInterpretations(); | |
197 | +        if (candidates == null || candidates.isEmpty() | |
198 | +                || !hasBase(candidates.get(0))) | |
199 | +            throw new MultiserviceException( | |
200 | +                    "No proper chosen or candidate interpretation for segment: " | |
201 | +                            + token.getId()); | |
202 | +        return candidates.get(0); | |
203 | +    } | |
204 | + | |
205 | +    /** Tells whether {@code interp} is non-null and has a non-empty base form. */ | |
206 | +    private static boolean hasBase(TInterpretation interp) { | |
207 | +        return interp != null && interp.getBase() != null | |
208 | +                && !interp.getBase().isEmpty(); | |
209 | +    } | |
210 | + | |
211 | +    /** | |
212 | +     * Recursively resolves {@code entity} to the tokens underneath it. | |
213 | +     * | |
214 | +     * @param entity thrift token, word, named entity or group; anything else | |
215 | +     *            (including {@code null} for an unknown id) yields no tokens | |
216 | +     * @param idToEntity thrift id to thrift entity index of the sentence | |
217 | +     * @param tokenId2Segment thrift token id to already loaded token | |
218 | +     * @param headsOnly when true, a syntactic group is followed only through | |
219 | +     *            its semantic head instead of through all of its children | |
220 | +     * @return tokens in the order in which the children are listed | |
221 | +     */ | |
222 | +    private static List<Token> getUnderlyingSegments(Object entity, | |
223 | +            Map<String, Object> idToEntity, Map<String, Token> tokenId2Segment, | |
224 | +            boolean headsOnly) { | |
225 | +        List<Token> result = new ArrayList<>(); | |
226 | + | |
227 | +        if (entity instanceof TToken) { | |
228 | +            result.add(tokenId2Segment.get(((TToken) entity).getId())); | |
229 | +            return result; | |
230 | +        } | |
231 | + | |
232 | +        List<String> childIds = new ArrayList<>(); | |
233 | +        if (entity instanceof TSyntacticWord) { | |
234 | +            childIds = ((TSyntacticWord) entity).getChildIds(); | |
235 | +        } else if (entity instanceof TNamedEntity) { | |
236 | +            childIds = ((TNamedEntity) entity).getChildIds(); | |
237 | +        } else if (entity instanceof TSyntacticGroup) { | |
238 | +            if (headsOnly) | |
239 | +                childIds.add(((TSyntacticGroup) entity).getSemanticHeadId()); | |
240 | +            else | |
241 | +                childIds = ((TSyntacticGroup) entity).getChildIds(); | |
242 | +        } | |
243 | + | |
244 | +        for (String id : childIds) | |
245 | +            result.addAll(getUnderlyingSegments(idToEntity.get(id), idToEntity, | |
246 | +                    tokenId2Segment, headsOnly)); | |
247 | + | |
248 | +        return result; | |
249 | +    } | |
250 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftSaver.java
0 → 100644
1 | +++ a/src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftSaver.java | |
1 | +package pl.waw.ipipan.zil.core.md.io.thrift; | |
2 | + | |
3 | +import java.util.ArrayList; | |
4 | +import java.util.HashMap; | |
5 | +import java.util.Iterator; | |
6 | +import java.util.List; | |
7 | +import java.util.Map; | |
8 | + | |
9 | +import org.apache.log4j.Logger; | |
10 | + | |
11 | +import pl.waw.ipipan.multiservice.thrift.types.MultiserviceException; | |
12 | +import pl.waw.ipipan.multiservice.thrift.types.TMention; | |
13 | +import pl.waw.ipipan.multiservice.thrift.types.TParagraph; | |
14 | +import pl.waw.ipipan.multiservice.thrift.types.TSentence; | |
15 | +import pl.waw.ipipan.multiservice.thrift.types.TText; | |
16 | +import pl.waw.ipipan.multiservice.thrift.types.TToken; | |
17 | +import pl.waw.ipipan.zil.core.md.entities.Mention; | |
18 | +import pl.waw.ipipan.zil.core.md.entities.Paragraph; | |
19 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
20 | +import pl.waw.ipipan.zil.core.md.entities.Text; | |
21 | +import pl.waw.ipipan.zil.core.md.entities.Token; | |
22 | + | |
23 | +/** | |
24 | + * Copies the mentions of an internal {@link Text} into a thrift text | |
25 | + * ({@link TText}), pairing the paragraphs, sentences and tokens of both | |
26 | + * structures by position. | |
27 | + */ | |
28 | +public class ThriftSaver { | |
29 | + | |
30 | +    private static final Logger logger = Logger.getLogger(ThriftSaver.class); | |
31 | + | |
32 | +    /** | |
33 | +     * Replaces the mentions of every sentence of {@code text} with thrift | |
34 | +     * mentions built from the matching sentence of {@code responseText}. | |
35 | +     * Mention ids are "m-0", "m-1", ... and are numbered across the whole text. | |
36 | +     * | |
37 | +     * @param responseText internal text that holds the mentions | |
38 | +     * @param text thrift text updated in place | |
39 | +     * @throws MultiserviceException if the two texts differ in the number of | |
40 | +     *             paragraphs, sentences or tokens, or if a mention refers to a | |
41 | +     *             token that has no thrift counterpart in its sentence | |
42 | +     */ | |
43 | +    public static void updateThriftText(Text responseText, TText text) | |
44 | +            throws MultiserviceException { | |
45 | + | |
46 | +        logger.debug("Updating thrift text..."); | |
47 | +        Map<Mention, TMention> teiMention2ThriftMention = new HashMap<>(); | |
48 | + | |
49 | +        Iterator<TParagraph> thrPI = text.getParagraphsIterator(); | |
50 | +        Iterator<Paragraph> teiPI = responseText.iterator(); | |
51 | +        int freeMentionId = 0; | |
52 | +        while (thrPI.hasNext() && teiPI.hasNext()) { | |
53 | +            TParagraph thrP = thrPI.next(); | |
54 | +            Paragraph teiP = teiPI.next(); | |
55 | + | |
56 | +            freeMentionId = updateThriftParagraph(teiMention2ThriftMention, | |
57 | +                    freeMentionId, thrP, teiP); | |
58 | +        } | |
59 | +        checkIterators(thrPI, teiPI, "paragraph"); | |
60 | +    } | |
61 | + | |
62 | +    /** | |
63 | +     * Updates all sentences of one paragraph pair. | |
64 | +     * | |
65 | +     * @return the next unused mention number | |
66 | +     */ | |
67 | +    private static int updateThriftParagraph( | |
68 | +            Map<Mention, TMention> teiMention2ThriftMention, int freeMentionId, | |
69 | +            TParagraph thrP, Paragraph teiP) throws MultiserviceException { | |
70 | +        Iterator<TSentence> thrSI = thrP.getSentencesIterator(); | |
71 | +        Iterator<Sentence> teiSI = teiP.iterator(); | |
72 | +        while (thrSI.hasNext() && teiSI.hasNext()) { | |
73 | +            TSentence thrS = thrSI.next(); | |
74 | +            Sentence teiS = teiSI.next(); | |
75 | +            freeMentionId = updateThriftSentence(teiMention2ThriftMention, | |
76 | +                    freeMentionId, thrS, teiS); | |
77 | +        } | |
78 | +        checkIterators(thrSI, teiSI, "sentence"); | |
79 | +        return freeMentionId; | |
80 | +    } | |
81 | + | |
82 | +    /** | |
83 | +     * Resets the mentions of {@code thrS} and adds one thrift mention per | |
84 | +     * mention of {@code teiS}, recording each pair in | |
85 | +     * {@code teiMention2ThriftMention}. | |
86 | +     * | |
87 | +     * @param id number to use for the first mention created here | |
88 | +     * @return the next unused mention number | |
89 | +     */ | |
90 | +    private static int updateThriftSentence( | |
91 | +            Map<Mention, TMention> teiMention2ThriftMention, int id, | |
92 | +            TSentence thrS, Sentence teiS) throws MultiserviceException { | |
93 | +        thrS.unsetMentions(); | |
94 | +        thrS.setMentions(new ArrayList<TMention>()); | |
95 | + | |
96 | +        // Tokens of both sentences are paired by position. | |
97 | +        Map<Token, TToken> teiMorph2ThriftToken = new HashMap<>(); | |
98 | +        Iterator<TToken> thrMI = thrS.getTokensIterator(); | |
99 | +        Iterator<Token> teiMI = teiS.iterator(); | |
100 | +        while (thrMI.hasNext() && teiMI.hasNext()) { | |
101 | +            teiMorph2ThriftToken.put(teiMI.next(), thrMI.next()); | |
102 | +        } | |
103 | +        checkIterators(thrMI, teiMI, "morph"); | |
104 | + | |
105 | +        for (Mention m : teiS.getMentions()) { | |
106 | +            List<String> childIds = new ArrayList<>(); | |
107 | +            List<String> headIds = new ArrayList<>(); | |
108 | +            for (Token ch : m.getSegments()) | |
109 | +                childIds.add(thriftIdOf(ch, teiMorph2ThriftToken)); | |
110 | +            for (Token h : m.getHeadSegments()) | |
111 | +                headIds.add(thriftIdOf(h, teiMorph2ThriftToken)); | |
112 | + | |
113 | +            TMention tm = new TMention("m-" + (id++), headIds, childIds, | |
114 | +                    m.isZeroSubject()); | |
115 | +            teiMention2ThriftMention.put(m, tm); | |
116 | +            thrS.addToMentions(tm); | |
117 | +        } | |
118 | +        return id; | |
119 | +    } | |
120 | + | |
121 | +    /** | |
122 | +     * Looks up the thrift id of the token paired with {@code token}. | |
123 | +     * | |
124 | +     * @throws MultiserviceException if {@code token} was not paired with any | |
125 | +     *             thrift token of the sentence (instead of failing with a | |
126 | +     *             NullPointerException) | |
127 | +     */ | |
128 | +    private static String thriftIdOf(Token token, | |
129 | +            Map<Token, TToken> teiMorph2ThriftToken) | |
130 | +            throws MultiserviceException { | |
131 | +        TToken thriftToken = teiMorph2ThriftToken.get(token); | |
132 | +        if (thriftToken == null) | |
133 | +            throw new MultiserviceException( | |
134 | +                    "Mention refers to a token missing from its sentence: " | |
135 | +                            + token); | |
136 | +        return thriftToken.getId(); | |
137 | +    } | |
138 | + | |
139 | +    /** | |
140 | +     * Verifies that two iterators advanced in lockstep are both exhausted. | |
141 | +     * | |
142 | +     * @throws MultiserviceException if either still has elements, i.e. the | |
143 | +     *             two structures differ in size at {@code level} | |
144 | +     */ | |
145 | +    private static void checkIterators(Iterator<? extends Object> one, | |
146 | +            Iterator<? extends Object> other, String level) | |
147 | +            throws MultiserviceException { | |
148 | +        if (one.hasNext() || other.hasNext()) | |
149 | +            throw new MultiserviceException( | |
150 | +                    "Problem mapping interal text representation to thrift for level " | |
151 | +                            + level); | |
152 | +    } | |
153 | + | |
154 | +} | |
... | ... |
src/main/resources/log4j.properties
0 → 100644
1 | +++ a/src/main/resources/log4j.properties | |
1 | +log4j.appender.stderr=org.apache.log4j.ConsoleAppender | |
2 | +log4j.appender.stderr.layout=org.apache.log4j.PatternLayout | |
3 | +log4j.appender.stderr.layout.ConversionPattern=[%p] [%C{1}] %m%n | |
4 | + | |
5 | +log4j.logger.ipipan=INFO, stderr | |
6 | +log4j.logger.pl.waw.ipipan=INFO, stderr | |
7 | +log4j.logger.org.apache.thrift=INFO, stderr | |
0 | 8 | \ No newline at end of file |
... | ... |
src/main/resources/quasi_verbs.txt
0 → 100644
1 | +++ a/src/main/resources/quasi_verbs.txt | |
1 | +bawić | |
2 | +brać | |
3 | +brak | |
4 | +brakować | |
5 | +być | |
6 | +bywać | |
7 | +chcieć | |
8 | +chodzić | |
9 | +ciągnąć | |
10 | +ciec | |
11 | +czas | |
12 | +czuć | |
13 | +dobiec | |
14 | +dobiegać | |
15 | +dochodzić | |
16 | +docierać | |
17 | +dojść | |
18 | +dotrzeć | |
19 | +dusić | |
20 | +godzić | |
21 | +gotować | |
22 | +gryźć | |
23 | +grzmieć | |
24 | +iść | |
25 | +jechać | |
26 | +kłuć | |
27 | +kończyć | |
28 | +kręcić | |
29 | +kropić | |
30 | +lać | |
31 | +łamać | |
32 | +lecieć | |
33 | +mieć | |
34 | +mieszać | |
35 | +móc | |
36 | +można | |
37 | +musieć | |
38 | +należeć | |
39 | +nieść | |
40 | +nosić | |
41 | +nudzić | |
42 | +nudzić | |
43 | +obejść | |
44 | +odbijać | |
45 | +odchodzić | |
46 | +odejmować | |
47 | +odejść | |
48 | +odrzucać | |
49 | +odrzucić | |
50 | +okazać | |
51 | +okazywać | |
52 | +opłacać | |
53 | +opłacić | |
54 | +oznaczać | |
55 | +pachnieć | |
56 | +padać | |
57 | +palić | |
58 | +palić | |
59 | +paść | |
60 | +piec | |
61 | +podobać | |
62 | +pogorszyć | |
63 | +pójść | |
64 | +ponieść | |
65 | +poprawiać | |
66 | +pora | |
67 | +potwierdzać | |
68 | +potwierdzić | |
69 | +powinno | |
70 | +pozostać | |
71 | +pozostawać | |
72 | +prosić | |
73 | +przechodzić | |
74 | +przestać | |
75 | +przybyć | |
76 | +przybywać | |
77 | +przyjąć | |
78 | +przyjmować | |
79 | +przypominać | |
80 | +przypomnieć | |
81 | +robić | |
82 | +rozerwać | |
83 | +rozumieć | |
84 | +składać | |
85 | +skończyć | |
86 | +skręcać | |
87 | +skręcić | |
88 | +słychać | |
89 | +śnić | |
90 | +śpieszyć | |
91 | +stać | |
92 | +stać | |
93 | +stanąć | |
94 | +strzelić | |
95 | +swędzić | |
96 | +świecić | |
97 | +szkoda | |
98 | +trafiać | |
99 | +trafić | |
100 | +trząść | |
101 | +trzeba | |
102 | +ucieszyć | |
103 | +uczynić | |
104 | +udać | |
105 | +udawać | |
106 | +uderzać | |
107 | +uderzyć | |
108 | +układać | |
109 | +ułożyć | |
110 | +warto | |
111 | +wiadomo | |
112 | +widać | |
113 | +wieść | |
114 | +wolno | |
115 | +wstyd | |
116 | +wychodzić | |
117 | +wydać | |
118 | +wydawać | |
119 | +wyjaśniać | |
120 | +wyjaśnić | |
121 | +wyjść | |
122 | +wypadać | |
123 | +wypaść | |
124 | +wypogadzać | |
125 | +wyrzucić | |
126 | +wystarczyć | |
127 | +wziąć | |
128 | +zabraknąć | |
129 | +zacząć | |
130 | +zaczynać | |
131 | +zagotować | |
132 | +zainteresować | |
133 | +zakręcić | |
134 | +żal | |
135 | +zależeć | |
136 | +zanieść | |
137 | +zanieść | |
138 | +zanosić | |
139 | +zanosić | |
140 | +zapowiadać | |
141 | +zarzucać | |
142 | +zastanowić | |
143 | +zbierać | |
144 | +zdarzać | |
145 | +zdziwić | |
146 | +zebrać | |
147 | +zemrzeć | |
148 | +złożyć | |
149 | +znać | |
150 | +zrobić | |
... | ... |
src/main/resources/zero_subject_model.bin
0 → 100644
No preview for this file type
src/test/java/pl/waw/ipipan/zil/core/md/MentionDetectorTest.java
0 → 100644
1 | +++ a/src/test/java/pl/waw/ipipan/zil/core/md/MentionDetectorTest.java | |
1 | +package pl.waw.ipipan.zil.core.md; | |
2 | + | |
3 | +import java.io.IOException; | |
4 | + | |
5 | +import org.junit.Rule; | |
6 | +import org.junit.Test; | |
7 | +import org.junit.rules.TemporaryFolder; | |
8 | + | |
9 | +/** Smoke test that runs {@link Main} on the bundled example TEI texts. */ | |
10 | +public class MentionDetectorTest { | |
11 | + | |
12 | +    /** Temporary folder rule; the run is pointed at a new folder created in it. */ | |
13 | +    @Rule | |
14 | +    public TemporaryFolder results = new TemporaryFolder(); | |
15 | + | |
16 | +    /** | |
17 | +     * Calls {@code Main.main} with the {@code /example_test_tei/} resource | |
18 | +     * directory and a new temporary folder as its two arguments. There are no | |
19 | +     * assertions, so the test only fails when the run throws. | |
20 | +     */ | |
21 | +    @Test | |
22 | +    public final void test() throws IOException { | |
23 | +        String exampleTeiDir = MentionDetectorTest.class | |
24 | +                .getResource("/example_test_tei/").getFile(); | |
25 | +        String resultsDir = results.newFolder().getAbsolutePath(); | |
26 | +        Main.main(new String[] { exampleTeiDir, resultsDir }); | |
27 | +    } | |
28 | +} | |
... | ... |
src/test/java/pl/waw/ipipan/zil/core/md/detection/zero/TrainerTest.java
0 → 100644
1 | +++ a/src/test/java/pl/waw/ipipan/zil/core/md/detection/zero/TrainerTest.java | |
1 | +package pl.waw.ipipan.zil.core.md.detection.zero; | |
2 | + | |
3 | +import java.io.File; | |
4 | +import java.io.IOException; | |
5 | + | |
6 | +import org.junit.Rule; | |
7 | +import org.junit.Test; | |
8 | +import org.junit.rules.TemporaryFolder; | |
9 | + | |
10 | +/** Smoke test that runs {@link Trainer} on the bundled example training TEI texts. */ | |
11 | +public class TrainerTest { | |
12 | + | |
13 | +    /** Temporary folder rule; the model file path is placed in a new folder created in it. */ | |
14 | +    @Rule | |
15 | +    public TemporaryFolder results = new TemporaryFolder(); | |
16 | + | |
17 | +    /** | |
18 | +     * Calls {@code Trainer.main} with three arguments: the | |
19 | +     * {@code /example_train_tei/} resource directory, the path of a | |
20 | +     * {@code model.bin} file inside a new temporary folder, and the | |
21 | +     * {@code /example_model/quasi_verbs.txt} resource. There are no | |
22 | +     * assertions, so the test only fails when the run throws. | |
23 | +     */ | |
24 | +    @Test | |
25 | +    public final void test() throws IOException { | |
26 | +        String trainTeiDir = TrainerTest.class | |
27 | +                .getResource("/example_train_tei/").getFile(); | |
28 | +        String modelFile = new File(results.newFolder(), "model.bin") | |
29 | +                .getAbsolutePath(); | |
30 | +        String quasiVerbsFile = TrainerTest.class | |
31 | +                .getResource("/example_model/quasi_verbs.txt").getFile(); | |
32 | +        Trainer.main(new String[] { trainTeiDir, modelFile, quasiVerbsFile }); | |
33 | +    } | |
34 | +} | |
... | ... |
src/test/resources/example_model/model.bin
0 → 100644
No preview for this file type
src/test/resources/example_model/quasi_verbs.txt
0 → 100644
1 | +++ a/src/test/resources/example_model/quasi_verbs.txt | |
1 | +bawić | |
2 | +brać | |
3 | +brak | |
4 | +brakować | |
5 | +być | |
6 | +bywać | |
7 | +chcieć | |
8 | +chodzić | |
9 | +ciągnąć | |
10 | +ciec | |
11 | +czas | |
12 | +czuć | |
13 | +dobiec | |
14 | +dobiegać | |
15 | +dochodzić | |
16 | +docierać | |
17 | +dojść | |
18 | +dotrzeć | |
19 | +dusić | |
20 | +godzić | |
21 | +gotować | |
22 | +gryźć | |
23 | +grzmieć | |
24 | +iść | |
25 | +jechać | |
26 | +kłuć | |
27 | +kończyć | |
28 | +kręcić | |
29 | +kropić | |
30 | +lać | |
31 | +łamać | |
32 | +lecieć | |
33 | +mieć | |
34 | +mieszać | |
35 | +móc | |
36 | +można | |
37 | +musieć | |
38 | +należeć | |
39 | +nieść | |
40 | +nosić | |
41 | +nudzić | |
42 | +nudzić | |
43 | +obejść | |
44 | +odbijać | |
45 | +odchodzić | |
46 | +odejmować | |
47 | +odejść | |
48 | +odrzucać | |
49 | +odrzucić | |
50 | +okazać | |
51 | +okazywać | |
52 | +opłacać | |
53 | +opłacić | |
54 | +oznaczać | |
55 | +pachnieć | |
56 | +padać | |
57 | +palić | |
58 | +palić | |
59 | +paść | |
60 | +piec | |
61 | +podobać | |
62 | +pogorszyć | |
63 | +pójść | |
64 | +ponieść | |
65 | +poprawiać | |
66 | +pora | |
67 | +potwierdzać | |
68 | +potwierdzić | |
69 | +powinno | |
70 | +pozostać | |
71 | +pozostawać | |
72 | +prosić | |
73 | +przechodzić | |
74 | +przestać | |
75 | +przybyć | |
76 | +przybywać | |
77 | +przyjąć | |
78 | +przyjmować | |
79 | +przypominać | |
80 | +przypomnieć | |
81 | +robić | |
82 | +rozerwać | |
83 | +rozumieć | |
84 | +składać | |
85 | +skończyć | |
86 | +skręcać | |
87 | +skręcić | |
88 | +słychać | |
89 | +śnić | |
90 | +śpieszyć | |
91 | +stać | |
92 | +stać | |
93 | +stanąć | |
94 | +strzelić | |
95 | +swędzić | |
96 | +świecić | |
97 | +szkoda | |
98 | +trafiać | |
99 | +trafić | |
100 | +trząść | |
101 | +trzeba | |
102 | +ucieszyć | |
103 | +uczynić | |
104 | +udać | |
105 | +udawać | |
106 | +uderzać | |
107 | +uderzyć | |
108 | +układać | |
109 | +ułożyć | |
110 | +warto | |
111 | +wiadomo | |
112 | +widać | |
113 | +wieść | |
114 | +wolno | |
115 | +wstyd | |
116 | +wychodzić | |
117 | +wydać | |
118 | +wydawać | |
119 | +wyjaśniać | |
120 | +wyjaśnić | |
121 | +wyjść | |
122 | +wypadać | |
123 | +wypaść | |
124 | +wypogadzać | |
125 | +wyrzucić | |
126 | +wystarczyć | |
127 | +wziąć | |
128 | +zabraknąć | |
129 | +zacząć | |
130 | +zaczynać | |
131 | +zagotować | |
132 | +zainteresować | |
133 | +zakręcić | |
134 | +żal | |
135 | +zależeć | |
136 | +zanieść | |
137 | +zanieść | |
138 | +zanosić | |
139 | +zanosić | |
140 | +zapowiadać | |
141 | +zarzucać | |
142 | +zastanowić | |
143 | +zbierać | |
144 | +zdarzać | |
145 | +zdziwić | |
146 | +zebrać | |
147 | +zemrzeć | |
148 | +złożyć | |
149 | +znać | |
150 | +zrobić | |
... | ... |
src/test/resources/example_test_tei/1/ann_groups.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_test_tei/1/ann_morphosyntax.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_test_tei/1/ann_named.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_test_tei/1/ann_segmentation.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_test_tei/1/ann_words.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_test_tei/1/header.xml
0 → 100644
1 | +++ a/src/test/resources/example_test_tei/1/header.xml | |
1 | +<?xml version="1.0" encoding="UTF-8" standalone="no"?> | |
2 | +<teiHeader xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xml:lang="en"> | |
3 | + <fileDesc> | |
4 | + <titleStmt> | |
5 | + <title>Paragraphs: p-279,p-280,p-281,p-282,p-283,p-284,p-285,p-286,p-287 from NKJP text with id: IJPPAN_PolPr_TS00264</title> | |
6 | + </titleStmt> | |
7 | + </fileDesc> | |
8 | + <profileDesc> | |
9 | + <textClass> | |
10 | + <catRef scheme="#taxonomy-CORE" target="Dzienniki"/> | |
11 | + </textClass> | |
12 | + </profileDesc> | |
13 | + <revisionDesc/> | |
14 | +</teiHeader> | |
... | ... |
src/test/resources/example_test_tei/1/text.xml
0 → 100644
1 | +++ a/src/test/resources/example_test_tei/1/text.xml | |
1 | +<?xml version="1.0" ?> | |
2 | +<teiCorpus xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:nkjp="http://www.nkjp.pl/ns/1.0"> | |
3 | + <xi:include href="PCC_header.xml"/> | |
4 | + <TEI> | |
5 | + <xi:include href="header.xml"/> | |
6 | + <text> | |
7 | + <body> | |
8 | + <p xml:id="p-1">– Sensownym rozwiązaniem będzie zmiana istniejącego oświetlenia na typ uliczny, czyli na wysokie słupy. W tym roku nie mamy jednak na to pieniędzy – mówi Anita Tyszkiewicz-Zimałka, rzecznik Urzędu Miasta w Raciborzu.</p> | |
9 | + <p xml:id="p-2">Przyjęto więc salomonowe rozwiązanie ograniczenia nakładów do minimum. Na odcinku od kładki dla pieszych do restauracji „Zamkowa” co druga latarnia będzie zdemontowana – A elementy z nich będą służyły do naprawiania pozostałych – wyjaśnia rzecznik.</p> | |
10 | + <p xml:id="p-3">Jacek Bombor</p> | |
11 | + <p xml:id="p-4">W ekstraklasie Francji prowadzący w tabeli zespół Jacka Bąka RC Lens wygrał wyjazdowe spotkanie z Montpellier. Sukces gości jest tym cenniejszy, że od 33 minuty grali oni w osłabieniu, bez Ferdinanda Coly, który ukarany został czerwoną kartką.</p> | |
12 | + <p xml:id="p-5">Montpellier – RC Lens 1:2 (0:1). Fugier (88) – Diouf (43), Pedron (65). Czerwona kartka: Coly (Lens)</p> | |
13 | + <p xml:id="p-6">Paris St Germain – Sedan 3:0 (1:0). Arteta (23, karny), Alex (82), Cisse (90). Czerwona kartka: Elzeard (Sedan).</p> | |
14 | + <p xml:id="p-7">AJ Auxerre – Nantes 2:1 (1:1). Cisse (19), Gonzales (78) – Moldovan (26). Czerwona kartka: Cetto (Nantes).</p> | |
15 | + <p xml:id="p-8">Lorient – Troyes 1:0 (0:0). Feindouno (60).</p> | |
16 | + <p xml:id="p-9">Metz – Girondins Bordeaux 1:2 (1:0). Desire Job (36) – Pauleta (71), Vikash Dhorasoo (83).</p> | |
17 | + </body> | |
18 | + </text> | |
19 | + </TEI> | |
20 | +</teiCorpus> | |
0 | 21 | \ No newline at end of file |
... | ... |
src/test/resources/example_test_tei/2/ann_groups.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_test_tei/2/ann_morphosyntax.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_test_tei/2/ann_named.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_test_tei/2/ann_segmentation.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_test_tei/2/ann_words.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_test_tei/2/header.xml
0 → 100644
1 | +++ a/src/test/resources/example_test_tei/2/header.xml | |
1 | +<?xml version="1.0" encoding="UTF-8" standalone="no"?> | |
2 | +<teiHeader xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xml:lang="en"> | |
3 | + <fileDesc> | |
4 | + <titleStmt> | |
5 | + <title>Paragraphs: p-328,p-329 from NKJP text with id: PWN_3102000000066</title> | |
6 | + </titleStmt> | |
7 | + </fileDesc> | |
8 | + <profileDesc> | |
9 | + <textClass> | |
10 | + <catRef scheme="#taxonomy-CORE" target="Literatura faktu"/> | |
11 | + </textClass> | |
12 | + </profileDesc> | |
13 | + <revisionDesc/> | |
14 | +</teiHeader> | |
... | ... |
src/test/resources/example_test_tei/2/text.xml
0 → 100644
1 | +++ a/src/test/resources/example_test_tei/2/text.xml | |
1 | +<?xml version="1.0" ?> | |
2 | +<teiCorpus xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:nkjp="http://www.nkjp.pl/ns/1.0"> | |
3 | + <xi:include href="PCC_header.xml"/> | |
4 | + <TEI> | |
5 | + <xi:include href="header.xml"/> | |
6 | + <text> | |
7 | + <body> | |
8 | + <p xml:id="p-1">To, że mściciele mieli prawo bezkarnie zabić nie tylko mordercę, ale i jego synów, zostało zapisane czarno na białym. Ale nie koniec na tym. Sens formułki et ille ac filii eius soli sint faidosi polega na zawężeniu kręgu osób, które mścicielom wolno zabić. Wiąże się to z poprzednią częścią zdania: ośmiokrotność zwykłego wergeldu morderca ma zapłacić sam, bez udziału dalszych krewnych. Wolno stąd wnosić, że gdyby nie zapłacono "zwykłego" wergeldu in simplo, którego trzecią część musieli pokryć boczni krewni zbrodniarza, byliby oni razem z mordercą i jego domownikami wystawieni na wróżdę strony poszkodowanej.</p> | |
9 | + <p xml:id="p-2">Tytuł XVIII Prawa Sasów poświęcony jest odpowiedzialności karnej pana za zabójstwo popełnione przez lita, a właściwie temu, jak można się od tej odpowiedzialności uwolnić: „Jeżeli lit z rozkazu lub z poduszczenia swojego pana zabije jakiegoś człowieka, na przykład nobila, to pan płaci główszczyznę lub podlega wróżdzie; jeżeli zaś [lit] popełni ten czyn bez wiedzy pana, to ma być przez pana wyzwolony, i [wtedy] krewni ofiary mają się mścić na nim samym [to jest sprawcy] i na pozostałych siedmiu jego krewnych, a pan lita musi przysiąc z jedenastoma współprzysiężnikami, że nie był wtajemniczony w zbrodnię" (Litus si per iuissum vel consilium domini sui hominem occiderit, ut puta nobilem, dominus conpositionem persolvat vel faidam portet; si autem absque conscientia domini hoc fecerit, dimittatur a domino, et vindicetur in illo et aliis VII consanguineis eius a propinquis occisi, et dominus liti se in hoc conscium non esse cum XI iuret).</p> | |
10 | + </body> | |
11 | + </text> | |
12 | + </TEI> | |
13 | +</teiCorpus> | |
0 | 14 | \ No newline at end of file |
... | ... |
src/test/resources/example_test_tei/3/ann_groups.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_test_tei/3/ann_morphosyntax.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_test_tei/3/ann_named.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_test_tei/3/ann_segmentation.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_test_tei/3/ann_words.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_test_tei/3/header.xml
0 → 100644
1 | +++ a/src/test/resources/example_test_tei/3/header.xml | |
1 | +<?xml version="1.0" encoding="UTF-8" standalone="no"?> | |
2 | +<teiHeader xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xml:lang="en"> | |
3 | + <fileDesc> | |
4 | + <titleStmt> | |
5 | + <title>Paragraphs: p-38,p-39,p-40,p-41,p-42,p-43,p-44,p-45 from NKJP text with id: IJPPAN_p00111b00010a</title> | |
6 | + </titleStmt> | |
7 | + </fileDesc> | |
8 | + <profileDesc> | |
9 | + <textClass> | |
10 | + <catRef scheme="#taxonomy-CORE" target="Literatura piękna"/> | |
11 | + </textClass> | |
12 | + </profileDesc> | |
13 | + <revisionDesc/> | |
14 | +</teiHeader> | |
... | ... |
src/test/resources/example_test_tei/3/text.xml
0 → 100644
1 | +++ a/src/test/resources/example_test_tei/3/text.xml | |
1 | +<?xml version="1.0" ?> | |
2 | +<teiCorpus xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:nkjp="http://www.nkjp.pl/ns/1.0"> | |
3 | + <xi:include href="PCC_header.xml"/> | |
4 | + <TEI> | |
5 | + <xi:include href="header.xml"/> | |
6 | + <text> | |
7 | + <body> | |
8 | + <p xml:id="p-1">Wrócił niedługo potem i szepnął coś do Margaret.</p> | |
9 | + <p xml:id="p-2">Oboje odwrócili się w kierunku majora Kovalsky’ego.</p> | |
10 | + <p xml:id="p-3">Działo się coś złego, i to bardzo.</p> | |
11 | + <p xml:id="p-4">Zdążył nawet wyciągnąć pistolet i postrzelić Smitha, ale Margaret była szybsza. Ciosem dłoni powaliła go na ziemię. Tracąc przytomność pomyślał, że nie spodziewał się tyle siły w tak wątłym ciele.</p> | |
12 | + <p xml:id="p-5">VII</p> | |
13 | + <p xml:id="p-6">Gdy się obudził nie miał lewej ręki. Z rany sączyła się krew. Obok leżał, dysząc ciężko, John Smith. Również krwawił, tyle, że na niebiesko. "Wszystko na opak w tym pojebanym miejscu" – pomyślał Kovalsky i znów zemdlał.</p> | |
14 | + <p xml:id="p-7">Gdy ocknął się drugi raz, Smith wyglądał trochę lepiej, a całą twarz miał we krwi. Czerwonej. Ręka majora obficie krwawiła do jakiegoś naczynia. Opodal uwijała się Margaret, która sprawiedliwie rozdzielała krwawy posiłek między siebie i Johna Smitha. Oboje sprawiali wrażenie bardzo szczęśliwych.</p> | |
15 | + <p xml:id="p-8">Margaret podeszła do Kovalsky’ego i pogłaskała po policzku. – Kochany, to był cudowny pomysł z tym wyścigiem. Naprawdę świetny. Nawet nie przypuszczałam... Nie przypuszczaliśmy... Jeden z nich się przewrócił i rozciął dłoń. Zaczął ssać i krwawienie ustało. A potem drugi, ale skaleczył się w nogę. Nie mógł sobie pomóc, więc myśmy to zrobili. Och... – Margaret jęknęła zmysłowo, a Kovalsky’ego znów ogarnęła ciemność.</p> | |
16 | + </body> | |
17 | + </text> | |
18 | + </TEI> | |
19 | +</teiCorpus> | |
0 | 20 | \ No newline at end of file |
... | ... |
src/test/resources/example_train_tei/1/ann_coreference.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/1/ann_groups.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/1/ann_mentions.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/1/ann_morphosyntax.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/1/ann_named.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/1/ann_segmentation.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/1/ann_words.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/1/header.xml
0 → 100644
1 | +++ a/src/test/resources/example_train_tei/1/header.xml | |
1 | +<?xml version="1.0" encoding="UTF-8" standalone="no"?> | |
2 | +<teiHeader xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xml:lang="en"> | |
3 | + <fileDesc> | |
4 | + <titleStmt> | |
5 | + <title>Paragraphs: p-57,p-58,p-59,p-60 from NKJP text with id: IPIPAN_1301919980826</title> | |
6 | + </titleStmt> | |
7 | + </fileDesc> | |
8 | + <profileDesc> | |
9 | + <textClass> | |
10 | + <catRef scheme="#taxonomy-CORE" target="Dzienniki"/> | |
11 | + </textClass> | |
12 | + </profileDesc> | |
13 | + <revisionDesc/> | |
14 | +</teiHeader> | |
... | ... |
src/test/resources/example_train_tei/1/text.xml
0 → 100644
1 | +++ a/src/test/resources/example_train_tei/1/text.xml | |
1 | +<?xml version="1.0" ?> | |
2 | +<teiCorpus xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:nkjp="http://www.nkjp.pl/ns/1.0"> | |
3 | + <xi:include href="PCC_header.xml"/> | |
4 | + <TEI> | |
5 | + <xi:include href="header.xml"/> | |
6 | + <text> | |
7 | + <body> | |
8 | + <p xml:id="p-1">W spotkaniu weźmie udział blisko 7 tysięcy braci z całej Europy, ale tylko 206 z nich będzie ubiegało się o tytuł Europejskiego Króla Kurkowego. - Wezmę udział w strzelaniu, choć moje szanse są marne. Wynika to przede wszystkim z moich obowiązków gospodarza spotkań; w tym nawale pracy ciężko mi będzie się skupić na strzelaniu - przewiduje Zdzisław Maj, prezes krakowskiego Bractwa Kurkowego, panujący Król Kurkowy.</p> | |
9 | + <p xml:id="p-2">Strzelanie o tytuł Europejskiego Króla Kurkowego będzie się odbywało w kilku etapach. Do finału zostanie dopuszczonych 27 braci - jeden z nich otrzyma tytuł Europejskiego Króla Kurkowego odbierając go obecnie panującemu Wilfriedowi Stammermannowi. - Król nie otrzymuje żadnych nagród finansowych, ale taki tytuł jest ogromnym zaszczytem; król jest np. zapraszany na posiedzenia Parlamentu Europejskiego - mówi Zdzisław Maj.</p> | |
10 | + <p xml:id="p-3">Największą atrakcją 12. Europejskich Spotkań Bractw Strzeleckich będzie wielka parada, która rozpocznie się w niedzielę o godz. 13. Kilkuset braci w historycznych strojach przejdzie z Błoń na Rynek ulicami: Piłsudskiego, Straszewskiego, Franciszkańską i Grodzką.</p> | |
11 | + <p xml:id="p-4">Początki istnienia Bractwa Kurkowego w Krakowie sięgają XIII wieku. Skupiało ono znamienitych obywateli, kupców i rzemieślników pragnących wspomóc obronność miasta. Wielkim świętem bractwa był turniej, który odbywał się na strzelnicy zwanej Celestatem. Zawody trwały zwykle trzy dni. Strzelano do drewnianego kura umocowanego na wysokiej żerdzi. Brat, który zdołał celnym strzałem strącić ostatni jego fragment zdobywał miano Króla Kurkowego. Z tym tytułem wiązały się nie tylko honory, ale także przywileje: Rada Miejska zwalniała jego posiadacza m.in. z obowiązku płacenia podatków (ten zwyczaj utrzymał się do dziś).</p> | |
12 | + </body> | |
13 | + </text> | |
14 | + </TEI> | |
15 | +</teiCorpus> | |
0 | 16 | \ No newline at end of file |
... | ... |
src/test/resources/example_train_tei/2/ann_coreference.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/2/ann_groups.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/2/ann_mentions.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/2/ann_morphosyntax.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/2/ann_named.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/2/ann_segmentation.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/2/ann_words.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/2/header.xml
0 → 100644
1 | +++ a/src/test/resources/example_train_tei/2/header.xml | |
1 | +<?xml version="1.0" encoding="UTF-8" standalone="no"?> | |
2 | +<teiHeader xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xml:lang="en"> | |
3 | + <fileDesc> | |
4 | + <titleStmt> | |
5 | + <title>Paragraphs: p-437,p-438,p-439,p-440,p-441,p-442,p-443,p-444,p-445 from NKJP text with id: IJPPAN_PolPr_SlP00841</title> | |
6 | + </titleStmt> | |
7 | + </fileDesc> | |
8 | + <profileDesc> | |
9 | + <textClass> | |
10 | + <catRef scheme="#taxonomy-CORE" target="Dzienniki"/> | |
11 | + </textClass> | |
12 | + </profileDesc> | |
13 | + <revisionDesc/> | |
14 | +</teiHeader> | |
... | ... |
src/test/resources/example_train_tei/2/text.xml
0 → 100644
1 | +++ a/src/test/resources/example_train_tei/2/text.xml | |
1 | +<?xml version="1.0" ?> | |
2 | +<teiCorpus xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:nkjp="http://www.nkjp.pl/ns/1.0"> | |
3 | + <xi:include href="PCC_header.xml"/> | |
4 | + <TEI> | |
5 | + <xi:include href="header.xml"/> | |
6 | + <text> | |
7 | + <body> | |
8 | + <p xml:id="p-1">Ernest i Agnieszka nie planowali, że będą mieli wielką, babską rodzinę. Ale tak wyszło. – I całe szczęście. Lepiej się dogaduję z dziewczętami – cieszy się Ernest Kwiecień.</p> | |
9 | + <p xml:id="p-2">W Wigilię do jego obowiązków, poza dostarczeniem choinki, należeć będzie zmywanie naczyń. Agnieszka zrobi pierogi, ugotuje barszcz z uszkami, usmaży karpia. Córki upieką ciasta. Potem przyjdzie czas na prezenty. Może to nawet będą empetrójki, o których marzą starsze dziewczyny.</p> | |
10 | + <p xml:id="p-3">Jodełek sadzimy mniej</p> | |
11 | + <p xml:id="p-4">Leśniczy, od którego pan Ernest przywozi choinkę, mieszka kilka kilometrów od domu Kwietniów. On także nie wyobraża sobie świąt bez prawdziwego świerku. – I musi być kiczowaty – uśmiecha się Gabriel Grobelny, nadleśniczy wałbrzyski.</p> | |
12 | + <p xml:id="p-5">To znaczy, że powinny na nim wisieć ozdoby zrobione przez dzieci, przechowywane latami, wyciągane na tę jedyną okazję.</p> | |
13 | + <p xml:id="p-6">Pan Gabriel ma dwóch synów i trzy córki. W domu została najmłodsza, 12-letnia, ale na święta zjadą wszyscy. I ubiorą choinkę. – Żona rozwiesi anielskie włosy, ja podłączę lampki – w domu nadleśniczego podział świątecznych ról jest określony.</p> | |
14 | + <p xml:id="p-7">W dolnośląskich lasach najwięcej jest świerków. Na plantacjach sadzą także coraz popularniejsze jodły z miękkimi igłami.</p> | |
15 | + <p xml:id="p-8">– Ale i tych jodełek sadzimy już mniej. To nie lata dziewięćdziesiąte, gdy sprzedawaliśmy prawie wszystkie wyhodowane drzewka – wspomina nadleśniczy.</p> | |
16 | + <p xml:id="p-9">U Grobelnego choinkę można sobie wybrać. – Mamy rodziny, w których co roku ojciec przyjeżdża z synem, by samemu ściąć drzewko. Taką mają tradycję – dodaje pan Gabriel.</p> | |
17 | + </body> | |
18 | + </text> | |
19 | + </TEI> | |
20 | +</teiCorpus> | |
0 | 21 | \ No newline at end of file |
... | ... |
src/test/resources/example_train_tei/3/ann_coreference.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/3/ann_groups.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/3/ann_mentions.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/3/ann_morphosyntax.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/3/ann_named.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/3/ann_segmentation.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/3/ann_words.xml.gz
0 → 100644
No preview for this file type
src/test/resources/example_train_tei/3/header.xml
0 → 100644
1 | +++ a/src/test/resources/example_train_tei/3/header.xml | |
1 | +<?xml version="1.0" encoding="UTF-8" standalone="no"?> | |
2 | +<teiHeader xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xml:lang="en"> | |
3 | + <fileDesc> | |
4 | + <titleStmt> | |
5 | + <title>Paragraphs: p-6,p-7,p-8,p-9 from NKJP text with id: PELCRA_1303919960926</title> | |
6 | + </titleStmt> | |
7 | + </fileDesc> | |
8 | + <profileDesc> | |
9 | + <textClass> | |
10 | + <catRef scheme="#taxonomy-CORE" target="Dzienniki"/> | |
11 | + </textClass> | |
12 | + </profileDesc> | |
13 | + <revisionDesc/> | |
14 | +</teiHeader> | |
... | ... |
src/test/resources/example_train_tei/3/text.xml
0 → 100644
1 | +++ a/src/test/resources/example_train_tei/3/text.xml | |
1 | +<?xml version="1.0" ?> | |
2 | +<teiCorpus xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:nkjp="http://www.nkjp.pl/ns/1.0"> | |
3 | + <xi:include href="PCC_header.xml"/> | |
4 | + <TEI> | |
5 | + <xi:include href="header.xml"/> | |
6 | + <text> | |
7 | + <body> | |
8 | + <p xml:id="p-1">Cena życia</p> | |
9 | + <p xml:id="p-2">Z tego pogromu ocalało kilkudziesięciu Żydów, a wśród nich rodzina Mosze Sonensona. Przed wojną była to w skali miasteczka rodzina bogata. Sonensonowie mieli garbarnię. Nie udało mi się dociec, u kogo mianowicie przechowywali się Sonensonowie oraz pozostali Żydzi w czasie okupacji niemieckiej. Faktem pozostaje natomiast, że okupację tę przeżyli. Faktem oczywistym pozostaje i to, że liczne rodziny polskie - w Ejszyszkach i w pobliskich okolicach - przechowywały Żydów. Parę kilometrów od Ejszyszek, w Korkucianach (w folwarku Lebiedniki), żołnierz AK Kazimierz Korkuć w czasie wojny w swoim domu przechowywał 28 Żydów. Od studni do piwnic domu był przekopany tunel, dzięki czemu mieli wodę. Natomiast w skali siatki AK Kazimierz Korkuć przechowywał około 70 Żydów. Rodzina Świeczków również przechowywała Żydów. W tamtych stronach liczne rodziny polskie postępowały podobnie.</p> | |
10 | + <p xml:id="p-3">Prawdą jest również i to, że Żydzi za swe przechowanie płacili. Płacili za utrzymanie i chyba jeszcze - za ryzyko. O tym dzisiaj raczej tu się nie mówi, ale prawdopodobnie różnie z tym było: jedni za pieniądze, inni - z odruchu serca. Ryzykowali i Polacy, i Żydzi. Te rachunki mogły wyglądać bardzo różnie.</p> | |
11 | + <p xml:id="p-4">Mieszkam w jednej z podwileńskich wsi. Otóż w tej mojej wsi pewien gospodarz - Polak - przechowywał w czasie wojny młodą Żydówkę. Spodobała mu się, z czego wynikł dramat. Zdenerwowana żona doniosła na policję. Aresztowano Żydówkę razem z gospodarzem, przerażona kobieta próbowała ocalić męża. Zanim uzbierała potrzebną sumę na łapówkę, było już za późno - rozstrzelano nie tylko Żydówkę, ale i gospodarza. Czy żonę tego straceńca można nazwać antysemitką?</p> | |
12 | + </body> | |
13 | + </text> | |
14 | + </TEI> | |
15 | +</teiCorpus> | |
0 | 16 | \ No newline at end of file |
... | ... |