Commit dfbfe3fdf1b83e7b7fe76d1421ab9d9488227f62

Authored by Mateusz Kopeć
0 parents

Initial commit

Showing 82 changed files with 3236 additions and 0 deletions
.gitignore 0 → 100644
  1 +++ a/.gitignore
  1 +/target/
  2 +.classpath
  3 +.project
  4 +.settings
... ...
pom.xml 0 → 100644
  1 +++ a/pom.xml
  1 +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  2 + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 + <modelVersion>4.0.0</modelVersion>
  4 + <groupId>pl.waw.ipipan.zil.core</groupId>
  5 + <artifactId>md</artifactId>
  6 + <version>1.2-SNAPSHOT</version>
  7 + <build>
  8 + <plugins>
  9 + <plugin>
  10 + <artifactId>maven-compiler-plugin</artifactId>
  11 + <version>2.3.2</version>
  12 + <configuration>
  13 + <source>1.7</source>
  14 + <target>1.7</target>
  15 + </configuration>
  16 + </plugin>
  17 + <plugin>
  18 + <groupId>org.dstovall</groupId>
  19 + <artifactId>onejar-maven-plugin</artifactId>
  20 + <version>1.4.4</version>
  21 + <executions>
  22 + <execution>
  23 + <configuration>
  24 + <mainClass>pl.waw.ipipan.zil.core.md.Main</mainClass>
  25 + </configuration>
  26 + <goals>
  27 + <goal>one-jar</goal>
  28 + </goals>
  29 + </execution>
  30 + </executions>
  31 + </plugin>
  32 + </plugins>
  33 + </build>
  34 + <dependencies>
  35 + <dependency>
  36 + <groupId>log4j</groupId>
  37 + <artifactId>log4j</artifactId>
  38 + <version>1.2.17</version>
  39 + </dependency>
  40 + <dependency>
  41 + <groupId>ipipan.multiservice</groupId>
  42 + <artifactId>MultiserviceUtils</artifactId>
  43 + <version>1.0-SNAPSHOT</version>
  44 + </dependency>
  45 + <dependency>
  46 + <groupId>ipipan</groupId>
  47 + <artifactId>teiapi</artifactId>
  48 + <version>1.0-SNAPSHOT</version>
  49 + </dependency>
  50 + <dependency>
  51 + <groupId>junit</groupId>
  52 + <artifactId>junit</artifactId>
  53 + <version>4.11</version>
  54 + </dependency>
  55 + <dependency>
  56 + <groupId>nz.ac.waikato.cms.weka</groupId>
  57 + <artifactId>weka-stable</artifactId>
  58 + <version>3.6.10</version>
  59 + </dependency>
  60 + </dependencies>
  61 + <repositories>
  62 + <repository>
  63 + <id>zil-maven-repo</id>
  64 + <name>ZIL maven repository</name>
  65 + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots</url>
  66 + </repository>
  67 + </repositories>
  68 + <pluginRepositories>
  69 + <pluginRepository>
  70 + <id>onejar-maven-plugin.googlecode.com</id>
  71 + <url>http://onejar-maven-plugin.googlecode.com/svn/mavenrepo</url>
  72 + </pluginRepository>
  73 + </pluginRepositories>
  74 +</project>
... ...
src/main/java/pl/waw/ipipan/zil/core/md/Main.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/Main.java
  1 +package pl.waw.ipipan.zil.core.md;
  2 +
  3 +import ipipan.clarin.tei.api.entities.TEICorpusText;
  4 +import ipipan.clarin.tei.api.exceptions.TEIException;
  5 +import ipipan.clarin.tei.api.io.IOUtils;
  6 +
  7 +import java.io.File;
  8 +import java.io.FileInputStream;
  9 +import java.io.IOException;
  10 +import java.io.InputStream;
  11 +
  12 +import org.apache.log4j.Logger;
  13 +
  14 +import pl.waw.ipipan.multiservice.thrift.types.MultiserviceException;
  15 +import pl.waw.ipipan.multiservice.thrift.types.TText;
  16 +import pl.waw.ipipan.zil.core.md.detection.Detector;
  17 +import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
  18 +import pl.waw.ipipan.zil.core.md.entities.Text;
  19 +import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader;
  20 +import pl.waw.ipipan.zil.core.md.io.tei.TeiSaver;
  21 +import pl.waw.ipipan.zil.core.md.io.thrift.ThriftLoader;
  22 +import pl.waw.ipipan.zil.core.md.io.thrift.ThriftSaver;
  23 +
  24 +/**
  25 + * @author Mateusz Kopeć
  26 + *
  27 + */
  28 +public class Main {
  29 +
  30 + private final static Logger logger = Logger.getLogger(Main.class);
  31 + private final static boolean GZIP_OUTPUT = true;
  32 +
  33 + private final static String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin";
  34 +
  35 + private static ZeroSubjectDetector zeroSubjectModel;
  36 + static {
  37 + InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL);
  38 + zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream);
  39 + }
  40 +
  41 + /**
  42 + * Main method for detecting mentions in corpus encoded in Tei format.
  43 + *
  44 + * @param args
  45 + * @throws TEIException
  46 + */
  47 + public static void main(String[] args) {
  48 +
  49 + if (args.length != 2 && args.length != 3) {
  50 + logger.error("Wrong usage! should be: " + Main.class.getSimpleName()
  51 + + " input_dir result_dir [zero_subject_model]");
  52 + return;
  53 + }
  54 +
  55 + File inputDir = new File(args[0]);
  56 + File outputDir = new File(args[1]);
  57 +
  58 + if (!inputDir.isDirectory()) {
  59 + logger.error(inputDir + " is not a directory!");
  60 + return;
  61 + }
  62 + if (!outputDir.isDirectory()) {
  63 + logger.error(outputDir + " is not a directory!");
  64 + return;
  65 + }
  66 + if (args.length == 3) {
  67 + try {
  68 + InputStream zeroSubjectDetectionModelStream;
  69 + zeroSubjectDetectionModelStream = new FileInputStream(new File(args[2]));
  70 + zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream);
  71 + if (zeroSubjectModel == null)
  72 + throw new IOException();
  73 + } catch (IOException e) {
  74 + logger.error("Unable to load model from file: " + args[2] + ": " + e);
  75 + return;
  76 + }
  77 + }
  78 +
  79 + int all = 0;
  80 + int errors = 0;
  81 + for (File teiDir : IOUtils.getNKJPDirs(inputDir)) {
  82 + all++;
  83 + try {
  84 + File targetDir = createTargetTextDir(inputDir, outputDir, teiDir);
  85 + TEICorpusText teiText = TeiLoader.readTeiText(teiDir);
  86 + annotateTeiText(teiText);
  87 + TeiSaver.saveTeiText(teiText, targetDir, GZIP_OUTPUT);
  88 + } catch (IOException e) {
  89 + logger.error("Error processing text in dir:" + teiDir + " Error details: " + e.getLocalizedMessage());
  90 + errors++;
  91 + }
  92 + }
  93 +
  94 + logger.info(all + " texts processed succesfully.");
  95 + if (errors > 0)
  96 + logger.info(errors + " texts not processed.");
  97 + logger.info(ZeroSubjectDetector.verbsWithoutSubject + " verbs with zero subject detected.");
  98 + logger.info(ZeroSubjectDetector.verbsWithSubject + " verbs with explicit subject detected.");
  99 + }
  100 +
  101 + /**
  102 + * Find relative path of text directory in the corpus directory and create
  103 + * similar directory structure in the output corpus directory.
  104 + *
  105 + * @param inputCorpusDir
  106 + * @param outputCorpusDir
  107 + * @param textDir
  108 + * @return
  109 + * @throws IOException
  110 + */
  111 + private static File createTargetTextDir(File inputCorpusDir, File outputCorpusDir, File textDir) throws IOException {
  112 + String relativeDirPath = textDir.toString().substring(inputCorpusDir.toString().length());
  113 + File targetDir = new File(outputCorpusDir, relativeDirPath);
  114 + targetDir.mkdirs();
  115 + if (!targetDir.exists() || !targetDir.isDirectory())
  116 + throw new IOException("Failed to create output directory at: " + targetDir);
  117 + return targetDir;
  118 + }
  119 +
  120 + /**
  121 + * Find mentions in Thrift text and update this Thrift text with mention
  122 + * annotation.
  123 + *
  124 + * @param thriftText
  125 + * @throws MultiserviceException
  126 + */
  127 + public static void annotateThriftText(TText thriftText) throws MultiserviceException {
  128 + Text responseText = ThriftLoader.loadTextFromThrift(thriftText);
  129 + Detector.findMentionsInText(responseText, zeroSubjectModel);
  130 + ThriftSaver.updateThriftText(responseText, thriftText);
  131 + }
  132 +
  133 + /**
  134 + * Find mentions in Tei text and update this Tei text with mention
  135 + * annotation. This method does not save this Tei text on disk.
  136 + *
  137 + * @param teiText
  138 + * @param zeroSubjectModel
  139 + * @throws TEIException
  140 + */
  141 + public static void annotateTeiText(TEICorpusText teiText) throws TEIException {
  142 + Text responseText = TeiLoader.loadTextFromTei(teiText);
  143 + Detector.findMentionsInText(responseText, zeroSubjectModel);
  144 + TeiSaver.updateTeiText(responseText, teiText);
  145 + }
  146 +
  147 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java
  1 +package pl.waw.ipipan.zil.core.md.detection;
  2 +
  3 +import java.util.Collection;
  4 +import java.util.HashSet;
  5 +import java.util.List;
  6 +import java.util.Set;
  7 +
  8 +import pl.waw.ipipan.zil.core.md.entities.Mention;
  9 +import pl.waw.ipipan.zil.core.md.entities.Sentence;
  10 +import pl.waw.ipipan.zil.core.md.entities.Token;
  11 +
  12 +public class Cleaner {
  13 + public static void cleanUnnecessarySentenceMentions(Sentence sentence) {
  14 + List<Mention> mentions = sentence.getMentions();
  15 + Collection<Mention> unnecessaryMentions = new HashSet<Mention>();
  16 +
  17 + for (int i = 0; i < mentions.size(); i++) {
  18 + Mention m1 = mentions.get(i);
  19 + for (int j = i + 1; j < mentions.size(); j++) {
  20 + Mention m2 = mentions.get(j);
  21 +
  22 + Mention lessImportantMention = getLessImportantMention(m1, m2);
  23 + Mention moreImportantMention = m1 == lessImportantMention ? m2
  24 + : m1;
  25 +
  26 + // same mention borders
  27 + if (m1.getSegments().equals(m2.getSegments())) {
  28 + unnecessaryMentions.add(lessImportantMention);
  29 + // System.out.println("Same borders: "+ m1 +", "+
  30 + // m2+": "+getLessImportantMention(m1, m2)+" removed");
  31 + continue;
  32 + }
  33 + // same mention heads
  34 + if (!m1.getHeadSegments().isEmpty()
  35 + && !m2.getHeadSegments().isEmpty()) {
  36 + if (m1.getHeadSegments().equals(m2.getHeadSegments())) {
  37 +
  38 + List<Token> segments = moreImportantMention
  39 + .getSegments();
  40 +
  41 + boolean isConj = false;
  42 + for (Token seg : segments) {
  43 + if (seg.getChosenInterpretation().getCtag()
  44 + .equals("conj")) {
  45 + isConj = true;
  46 + break;
  47 + }
  48 + }
  49 +
  50 + if (!isConj) {
  51 + unnecessaryMentions.add(lessImportantMention);
  52 + // System.out.println("Same heads: " + m1 + ", " +
  53 + // m2 + ": " + lessImportantMention
  54 + // + " removed");
  55 +
  56 + continue;
  57 + }
  58 + }
  59 + }
  60 +
  61 + // mention head equals whole other mention
  62 + if (m1.getHeadSegments().isEmpty()
  63 + && !m2.getHeadSegments().isEmpty()) {
  64 + if (m2.getHeadSegments().equals(m1.getSegments())) {
  65 + unnecessaryMentions.add(lessImportantMention);
  66 + continue;
  67 + // System.out.println("head is other mention: " + m1 +
  68 + // ", " + m2 + ": "
  69 + // + getLessImportantMention(m1, m2) + " removed");
  70 + }
  71 + }
  72 +
  73 + // the same, but other way round
  74 + if (m2.getHeadSegments().isEmpty()
  75 + && !m1.getHeadSegments().isEmpty()) {
  76 +
  77 + if (m1.getHeadSegments().equals(m2.getSegments())) {
  78 + unnecessaryMentions.add(lessImportantMention);
  79 + continue;
  80 + // System.out.println("head is other mention: " + m1 +
  81 + // ", " + m2 + ": "
  82 + // + getLessImportantMention(m1, m2) + " removed");
  83 + }
  84 + }
  85 +
  86 + // nie zawieraja sie w sobie, lecz maja czesc wspolna
  87 + boolean intersect = false;
  88 +
  89 + Set<Token> notInM1 = new HashSet<Token>(m2.getSegments());
  90 + notInM1.removeAll(m1.getSegments());
  91 + if (notInM1.size() < m2.getSegments().size())
  92 + intersect = true;
  93 +
  94 + Set<Token> notInM2 = new HashSet<Token>(m1.getSegments());
  95 + notInM2.removeAll(m2.getSegments());
  96 + if (notInM2.size() < m1.getSegments().size())
  97 + intersect = true;
  98 +
  99 + // if (intersect)
  100 + // System.out.println(m1+","+m2);
  101 +
  102 + if (intersect && !notInM1.isEmpty() && !notInM2.isEmpty()) {
  103 + unnecessaryMentions.add(lessImportantMention);
  104 + continue;
  105 + // System.out.println("intersection!" + m1 + ", " + m2 +
  106 + // ": "
  107 + // + getLessImportantMention(m1, m2) + " removed");
  108 + }
  109 +
  110 + }
  111 + }
  112 +
  113 + for (Mention m : unnecessaryMentions)
  114 + sentence.removeMention(m);
  115 +
  116 + // heurystyka dla usuwania rzeczy w stylu: [[Ernest][Kwiecien]]
  117 + unnecessaryMentions.clear();
  118 +
  119 + OUTER: for (Mention m : sentence.getMentions()) {
  120 + for (Token seg : m.getSegments())
  121 + if (seg.getOrth().toLowerCase().equals(seg.getOrth()))
  122 + continue OUTER;
  123 +
  124 + //only for children of fully capitalized mentions
  125 + Set<Mention> allMentions = new HashSet<Mention>();
  126 + for (Token seg : m.getSegments())
  127 + for (Mention m2 : seg.getMentions())
  128 + if (m.getSegments().containsAll(m2.getSegments()))
  129 + allMentions.add(m2);
  130 +
  131 + allMentions.remove(m);
  132 +
  133 + unnecessaryMentions.addAll(allMentions);
  134 + }
  135 + for (Mention m : unnecessaryMentions)
  136 + sentence.removeMention(m);
  137 + }
  138 +
  139 + private static Mention getLessImportantMention(Mention m1, Mention m2) {
  140 + if (m1.getSegments().size() > m2.getSegments().size())
  141 + return m2;
  142 + else
  143 + return m1;
  144 + }
  145 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java
  1 +package pl.waw.ipipan.zil.core.md.detection;
  2 +
/**
 * Regular expressions over part-of-speech tags (ctags) used by the mention
 * detection code via {@link String#matches(String)}.
 */
public class Constants {
    /** Alternation of noun ctags. */
    public static final String MORPHO_NOUN_CTAGS = "subst|depr|ger";

    /** Alternation of verb ctags. */
    public static final String MORPHO_VERB_CTAGS = "fin|bedzie|aglt|impt";

    /** Alternation of personal pronoun ctags. */
    public static final String MORPHO_PRONOUN_CTAGS = "ppron3|ppron12";

    /** Noun ctags and pronoun ctags joined into a single alternation. */
    public static final String MORPHO_CTAGS = MORPHO_NOUN_CTAGS + "|" + MORPHO_PRONOUN_CTAGS;

    /** Pattern for ctags of syntactic words. */
    public static final String WORDS_CTAGS = "Noun|Ppron.*";
}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
  1 +package pl.waw.ipipan.zil.core.md.detection;
  2 +
  3 +import java.util.ArrayList;
  4 +import java.util.HashSet;
  5 +import java.util.List;
  6 +import java.util.Set;
  7 +
  8 +import org.apache.log4j.Logger;
  9 +
  10 +import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
  11 +import pl.waw.ipipan.zil.core.md.entities.Mention;
  12 +import pl.waw.ipipan.zil.core.md.entities.NamedEntity;
  13 +import pl.waw.ipipan.zil.core.md.entities.Paragraph;
  14 +import pl.waw.ipipan.zil.core.md.entities.Sentence;
  15 +import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup;
  16 +import pl.waw.ipipan.zil.core.md.entities.SyntacticWord;
  17 +import pl.waw.ipipan.zil.core.md.entities.Text;
  18 +import pl.waw.ipipan.zil.core.md.entities.Token;
  19 +
  20 +public class Detector {
  21 + private static Logger logger = Logger.getLogger(Detector.class);
  22 +
  23 + public static void findMentionsInText(Text text,
  24 + ZeroSubjectDetector zeroSubjectModel) {
  25 + text.clearMentions();
  26 + logger.debug("Detecting mentions in text " + text.getId());
  27 + for (Paragraph p : text)
  28 + for (Sentence s : p)
  29 + detectMentionsInSentence(s, zeroSubjectModel);
  30 + }
  31 +
  32 + private static void detectMentionsInSentence(Sentence sentence,
  33 + ZeroSubjectDetector zeroSubjectModel) {
  34 + // adding mentions
  35 + addMentionsByTokenCtag(sentence);
  36 + addMentionsBySyntacticWordsCtag(sentence);
  37 + addMentionsByNamedEntities(sentence);
  38 + addMentionsByGroups(sentence);
  39 + addSpeakerMentionsInSpoken(sentence);
  40 +
  41 + // zero subject detection
  42 + zeroSubjectModel.addZeroSubjectMentions(sentence);
  43 +
  44 + // removing mentions
  45 + removeTo(sentence);
  46 + Cleaner.cleanUnnecessarySentenceMentions(sentence);
  47 +
  48 + // updating mention heads
  49 + updateMentionHeads(sentence);
  50 + }
  51 +
  52 + /**
  53 + * heurystyka ustawiajaca jako glowe pierwszy segment gdy glowy brak
  54 + *
  55 + * @param sentence
  56 + */
  57 + private static void updateMentionHeads(Sentence sentence) {
  58 + for (Mention m : sentence.getMentions())
  59 + if (m.getHeadSegments().isEmpty())
  60 + m.addHeadSegment(m.getFirstSegment());
  61 + }
  62 +
  63 + /**
  64 + * heurystyka dla "to" w zdaniu z ""jeśli"/"jeżeli"/"skoro""
  65 + *
  66 + * @param sentence
  67 + */
  68 + private static void removeTo(Sentence sentence) {
  69 + Set<String> orths = new HashSet<String>();
  70 + for (Token morph : sentence)
  71 + orths.add(morph.getOrth());
  72 +
  73 + if (orths.contains("jeśli") || orths.contains("jeżeli")
  74 + || orths.contains("skoro")) {
  75 + for (Mention mention : sentence.getMentions()) {
  76 + List<Token> mentSegs = mention.getSegments();
  77 + if (mentSegs.size() == 1
  78 + && mentSegs.get(0).getBase().equals("to")) {
  79 + sentence.removeMention(mention);
  80 + }
  81 + }
  82 + }
  83 + }
  84 +
  85 + private static void addSpeakerMentionsInSpoken(Sentence sentence) {
  86 + // heurystyka dla sp1:, sp2:, MarszałekJAkistam:
  87 + if (sentence.size() > 2) {
  88 + Token first = sentence.get(0);
  89 + Token second = sentence.get(1);
  90 + if (second.getOrth().equals(":")) {
  91 + sentence.addMention(new Mention(first));
  92 + }
  93 + }
  94 + }
  95 +
  96 + /**
  97 + * Wyszukuję i oznaczam wszystkie NG*
  98 + *
  99 + * @param sentence
  100 + */
  101 + private static void addMentionsByGroups(Sentence sentence) {
  102 + for (SyntacticGroup group : sentence.getGroups()) {
  103 + if (group.getType().startsWith("NG")) {
  104 + List<Token> segments = group.getTokens();
  105 + List<Token> heads = group.getSemanticHeadTokens();
  106 +
  107 + sentence.addMention(new Mention(segments, heads));
  108 + }
  109 + }
  110 + }
  111 +
  112 + /**
  113 + * Wyszukuję i oznaczam wszystkie NER
  114 + *
  115 + * @param sentence
  116 + */
  117 + private static void addMentionsByNamedEntities(Sentence sentence) {
  118 + for (NamedEntity ne : sentence.getNamedEntities()) {
  119 +
  120 + List<Token> headTokens = new ArrayList<Token>();
  121 + List<Token> tokens = ne.getTokens();
  122 +
  123 + boolean containsNoun = false;
  124 + for (Token seg : tokens) {
  125 + if (seg.getCtag().matches(Constants.MORPHO_NOUN_CTAGS)) {
  126 + containsNoun = true;
  127 + break;
  128 + }
  129 + }
  130 + if (!containsNoun)
  131 + continue;
  132 +
  133 + sentence.addMention(new Mention(tokens, headTokens));
  134 + }
  135 + }
  136 +
  137 + /**
  138 + * @param sentence
  139 + */
  140 + private static void addMentionsBySyntacticWordsCtag(Sentence sentence) {
  141 + for (SyntacticWord w : sentence.getSyntacticWords())
  142 + if (w.getCtag().matches(Constants.WORDS_CTAGS)) {
  143 + List<Token> tokens = w.getTokens();
  144 + if (tokens.size() == 1) {
  145 + sentence.addMention(new Mention(tokens.get(0)));
  146 + } else {
  147 + List<Token> heads = new ArrayList<Token>();
  148 + sentence.addMention(new Mention(tokens, heads));
  149 + }
  150 + }
  151 + }
  152 +
  153 + /**
  154 + * Wyszukuję wszystkie interesujace czesci mowy jesli jest poziom slow
  155 + * skladniowych, to korzystam z niego zamiast morfoskladni
  156 + *
  157 + * @param sentence
  158 + */
  159 + private static void addMentionsByTokenCtag(Sentence sentence) {
  160 + for (Token token : sentence)
  161 + if (token.getCtag().matches(Constants.MORPHO_CTAGS))
  162 + sentence.addMention(new Mention(token));
  163 + }
  164 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Constants.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Constants.java
  1 +package pl.waw.ipipan.zil.core.md.detection.zero;
  2 +
  3 +import java.util.Arrays;
  4 +import java.util.HashSet;
  5 +import java.util.Set;
  6 +
/**
 * Constants of the zero subject detection package.
 */
public class Constants {
    /**
     * Part-of-speech tags (ctags) listed as verb forms. The set is read-only:
     * it is shared as a public constant, so attempts to modify it throw
     * {@link UnsupportedOperationException}.
     */
    public static final Set<String> VERB_TAGS = java.util.Collections.unmodifiableSet(new HashSet<String>(
            Arrays.asList("fin", "bedzie", "aglt", "praet", "winien")));
}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/FeatureGeneration.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/FeatureGeneration.java
  1 +package pl.waw.ipipan.zil.core.md.detection.zero;
  2 +
  3 +import ipipan.clarin.tei.api.entities.TEIMention;
  4 +import ipipan.clarin.tei.api.entities.TEIMorph;
  5 +
  6 +import java.util.ArrayList;
  7 +import java.util.Arrays;
  8 +import java.util.HashMap;
  9 +import java.util.HashSet;
  10 +import java.util.Iterator;
  11 +import java.util.LinkedList;
  12 +import java.util.List;
  13 +import java.util.Map;
  14 +import java.util.Set;
  15 +
  16 +import pl.waw.ipipan.zil.core.md.entities.Mention;
  17 +import pl.waw.ipipan.zil.core.md.entities.Sentence;
  18 +import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup;
  19 +import pl.waw.ipipan.zil.core.md.entities.SyntacticWord;
  20 +import pl.waw.ipipan.zil.core.md.entities.Token;
  21 +
  22 +public class FeatureGeneration {
  23 + final private static Set<String> CLAUSE_SPLIT_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "i", "albo",
  24 + "lub", "oraz", "bądź", "ani", "czy", "niż", "tudzież", ",", ";", "-", "–", ":" }));
  25 +
  26 + final private static Set<String> CLAUSE_SPLIT_LEMMAS2 = new HashSet<>(Arrays.asList(new String[] { "a", "ale",
  27 + "lecz", "jednak", "jednakże", "zaś", "wszakże", "owszem", "natomiast", "tylko", "dlatego", "jedynie",
  28 + "przecież", "tymczasem", "ponieważ", "więc", "dlatego", "toteż", "zatem" }));
  29 +
  30 + final private static Set<String> CLAUSE_SPLIT_LEMMAS_STRICT = new HashSet<>(
  31 + Arrays.asList(new String[] { "?", "!" }));
  32 +
  33 + final private static Map<String, String> CLAUSE_SPLIT_LEMMAS_PAIRWISE = new HashMap<>();
  34 + static {
  35 + CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("(", ")");
  36 + CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("\"", "\"");
  37 + CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("'", "'");
  38 + }
  39 +
  40 + final private static Set<String> NOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "subst", "depr", "ppron12",
  41 + "ppron3", "ger", "num", "numcol" }));
  42 +
  43 + final private static Set<String> PRONOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "ppron12", "ppron3" }));
  44 +
  45 + final private static Set<String> VERB_TAGS = new HashSet<>(Arrays.asList(new String[] { "fin", "bedzie", "aglt",
  46 + "praet", "winien" }));
  47 +
  48 + final private static Set<String> ZAIMKI_WZGLEDNE_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "jaki",
  49 + "który" }));
  50 +
  51 + public static void generateFeatures(Map<String, Object> features, Token m, Sentence s, Set<String> quasiVerbs) {
  52 +
  53 + features.put("verbCtag", m.getChosenInterpretation().getCtag());
  54 + features.put("verbNumber", m.getChosenInterpretation().getNumber());
  55 + features.put("verbGender", m.getChosenInterpretation().getGender());
  56 + features.put("verbPerson", m.getChosenInterpretation().getPerson());
  57 +
  58 + features.put("quasi", quasiVerbs.contains(m.getChosenInterpretation().getBase()));
  59 +
  60 + features.put("nextCtag", getNeighbouringTag(s, m, 1));
  61 + features.put("prevCtag", getNeighbouringTag(s, m, -1));
  62 +
  63 + features.put("isPrevPraet", isPrevPraet(m, s));
  64 + features.put("isPrevComma", isPrevComma(m, s));
  65 + features.put("isPrev2Pred", isPrev2Pred(m, s));
  66 + features.put("isNextInf", isNextInf(m, s));
  67 +
  68 + List<Token> clause = getClause(s, m);
  69 + features.put("sentLength", s.size());
  70 + features.put("clauseLength", clause.size());
  71 +
  72 + addFeatures(features, clause, "clause", m);
  73 + addFeatures(features, s, "sent", m);
  74 + for (int i = 1; i < 6; i++)
  75 + addFeatures(features, getWindow(s, m, i, 0), "window_" + i + "_" + 0, m);
  76 + for (int i = 1; i < 6; i++)
  77 + addFeatures(features, getWindow(s, m, 0, i), "window_" + 0 + "_" + i, m);
  78 + for (int i = 1; i < 6; i++)
  79 + addFeatures(features, getWindow(s, m, i, i), "window_" + i + "_" + i, m);
  80 + }
  81 +
  82 + private static boolean isNextInf(Token m, Sentence s) {
  83 + boolean now = false;
  84 + for (Token morph : s) {
  85 + if (now)
  86 + return morph.getChosenInterpretation().getCtag().equals("inf");
  87 + if (m.equals(morph))
  88 + now = true;
  89 + }
  90 + return false;
  91 + }
  92 +
  93 + private static boolean isPrev2Pred(Token m, Sentence s) {
  94 + Token prev = null;
  95 + Token prev2 = null;
  96 + for (Token morph : s) {
  97 + if (m.equals(morph))
  98 + break;
  99 + prev2 = prev;
  100 + prev = morph;
  101 + }
  102 + return (prev != null && prev.getChosenInterpretation().getCtag().equals("pred"))
  103 + || (prev2 != null && prev2.getChosenInterpretation().getCtag().equals("pred"));
  104 + }
  105 +
  106 + private static Object isPrevComma(Token m, Sentence s) {
  107 + Token prev = null;
  108 + for (Token morph : s) {
  109 + if (m.equals(morph))
  110 + break;
  111 + prev = morph;
  112 + }
  113 + return prev != null && prev.getChosenInterpretation().getBase().equals(",");
  114 + }
  115 +
  116 + private static String getNeighbouringTag(Sentence s, Token m, int i) {
  117 + int idx = s.indexOf(m) + i;
  118 + if (idx >= s.size() || idx < 0)
  119 + return "None";
  120 + return s.get(idx).getChosenInterpretation().getCtag();
  121 + }
  122 +
  123 + private static void addFeatures(Map<String, Object> features, List<Token> clause, String prefix, Token m) {
  124 +
  125 + boolean hasNom = false; // 1
  126 + boolean hasNum = false; // 2
  127 + boolean hasPOG = false; // 3
  128 +
  129 + boolean hasNomNum = false;
  130 + boolean hasNumPOG = false;
  131 + boolean hasNomPOG = false;
  132 + boolean hasNomNumPOG = false;
  133 +
  134 + boolean has2Nom = false;
  135 + boolean has2NomPOG = false;
  136 + boolean has2POG = false;
  137 +
  138 + Token prev = null;
  139 + for (Token candidate : clause) {
  140 +
  141 + if (!isNoun(candidate) || isJakJako(prev)) {
  142 + prev = candidate;
  143 + continue;
  144 + }
  145 +
  146 + // nom, nom2
  147 + if (isNom(candidate)) {
  148 + if (hasNom)
  149 + has2Nom = true;
  150 + hasNom = true;
  151 + }
  152 + // num
  153 + if (agreedNum(candidate, m)) {
  154 + hasNum = true;
  155 + }
  156 + // pog, pog2
  157 + if (agreedGenderOrPerson(candidate, m)) {
  158 + if (hasPOG)
  159 + has2POG = true;
  160 + hasPOG = true;
  161 + }
  162 +
  163 + // nom num, nom num pog
  164 + if (isNom(candidate) && agreedNum(candidate, m)) {
  165 + if (agreedGenderOrPerson(candidate, m))
  166 + hasNomNumPOG = true;
  167 + hasNomNum = true;
  168 + }
  169 +
  170 + // nom pog, num pog
  171 + if (agreedGenderOrPerson(candidate, m))
  172 + if (isNom(candidate)) {
  173 + if (hasNomPOG)
  174 + has2NomPOG = true;
  175 + hasNomPOG = true;
  176 + } else if (agreedNum(candidate, m))
  177 + hasNumPOG = true;
  178 +
  179 + prev = candidate;
  180 + }
  181 +
  182 + // features.put("conj_" + prefix, hasConj);
  183 + features.put("cand_2_nom_" + prefix, has2Nom);
  184 + features.put("cand_2_POG_" + prefix, has2POG);
  185 + features.put("cand_2_nom+POG_" + prefix, has2NomPOG);
  186 +
  187 + features.put("cand_nom_" + prefix, hasNom);
  188 + features.put("cand_num_" + prefix, hasNum);
  189 + features.put("cand_POG_" + prefix, hasPOG);
  190 +
  191 + features.put("cand_nom+num_" + prefix, hasNomNum);
  192 + features.put("cand_nom+num+POG_" + prefix, hasNomNumPOG);
  193 + features.put("cand_nom+POG_" + prefix, hasNomPOG);
  194 + features.put("cand_num+POG_" + prefix, hasNumPOG);
  195 + }
  196 +
  197 + private static List<Token> getWindow(Sentence s, Token m, int pre, int post) {
  198 +
  199 + int idx = s.indexOf(m);
  200 + int from = Math.max(0, idx - pre);
  201 + int to = Math.min(s.size(), idx + post + 1);
  202 +
  203 + return new ArrayList<>(s.subList(from, to));
  204 + }
  205 +
  206 + private static boolean isPrevPraet(Token m, Sentence s) {
  207 + Token prev = null;
  208 + for (Token morph : s) {
  209 + if (m.equals(morph))
  210 + break;
  211 + prev = morph;
  212 + }
  213 + return prev != null && prev.getChosenInterpretation().getCtag().equals("praet");
  214 + }
  215 +
  216 + /**
  217 + * - podział na klauzule: przecinki oraz spójniki bezprzecinkowe: i, albo,
  218 + * lub (jak przy streszczeniach: w środku musi być czasownik w formie
  219 + * osobowej),
  220 + */
  221 + public static List<Token> getClause(Sentence s, Token m2) {
  222 +
  223 + List<List<Token>> sublists = getClauses(s);
  224 +
  225 + for (List<Token> sub : sublists)
  226 + for (Token m : sub)
  227 + if (m.equals(m2))
  228 + return sub;
  229 +
  230 + return null;
  231 + }
  232 +
  233 + public static List<List<Token>> getClauses(Sentence s) {
  234 +
  235 + Set<Token> noSplitMorphs = new HashSet<>();
  236 + for (SyntacticGroup g : s.getGroups()) {
  237 + for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) {
  238 + noSplitMorphs.add(m);
  239 + }
  240 + }
  241 + for (SyntacticWord g : s.getSyntacticWords()) {
  242 + for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) {
  243 + noSplitMorphs.add(m);
  244 + }
  245 + }
  246 +
  247 + LinkedList<List<Token>> sublists = new LinkedList<>();
  248 + List<Token> currentSublist = new ArrayList<>();
  249 + boolean clauseHasVerb = false;
  250 + for (Token m : s) {
  251 + String base = m.getChosenInterpretation().getBase();
  252 + if (!noSplitMorphs.contains(m)
  253 + && (CLAUSE_SPLIT_LEMMAS_STRICT.contains(base) || ((CLAUSE_SPLIT_LEMMAS.contains(base) || CLAUSE_SPLIT_LEMMAS2
  254 + .contains(base)) && clauseHasVerb))) {
  255 + sublists.add(currentSublist);
  256 + currentSublist = new ArrayList<>();
  257 + clauseHasVerb = false;
  258 + } else {
  259 + if (isVerb(m))
  260 + clauseHasVerb = true;
  261 + }
  262 + currentSublist.add(m);
  263 + }
  264 + if (currentSublist.size() > 0) {
  265 + if (clauseHasVerb)
  266 + sublists.add(currentSublist);
  267 + else
  268 + sublists.getLast().addAll(currentSublist);
  269 + }
  270 +
  271 + // merge clause beginning with zaimek wzgl. etc to previous clause
  272 + List<Token> prev = null;
  273 + Iterator<List<Token>> it = sublists.iterator();
  274 + while (it.hasNext()) {
  275 + List<Token> sublist = it.next();
  276 + boolean containsRelPron = false;
  277 + int i = 1;
  278 + for (Token m : sublist) {
  279 + if (i > 2)
  280 + break;
  281 + if (ZAIMKI_WZGLEDNE_LEMMAS.contains(m.getChosenInterpretation().getBase())) {
  282 + containsRelPron = true;
  283 + break;
  284 + }
  285 + i++;
  286 + }
  287 + if (prev != null && containsRelPron) {
  288 + prev.addAll(sublist);
  289 + it.remove();
  290 + } else
  291 + prev = sublist;
  292 + }
  293 +
  294 + return sublists;
  295 + }
  296 +
  297 + private static boolean agreedNum(Token candidate, Token keyword) {
  298 + String keywordNum = keyword.getNumber();
  299 + String wordNum = candidate.getNumber();
  300 + return keywordNum.equals(wordNum);
  301 + }
  302 +
  303 + private static boolean agreedGenderOrPerson(Token candidate, Token keyword) {
  304 + if (isPraet(keyword)) {
  305 + // praet has number:gender
  306 + String keywordGender = keyword.getGender();
  307 + String wordGender = candidate.getGender();
  308 + return keywordGender.equals(wordGender);
  309 + } else {
  310 + // other verbs have number:person
  311 + String keywordPerson = keyword.getPerson();
  312 + String wordPerson = "ter"; // default
  313 + if (PRONOUN_TAGS.contains(candidate))
  314 + wordPerson = candidate.getPerson();
  315 + return wordPerson.equals(keywordPerson);
  316 + }
  317 + }
  318 +
  319 + private static boolean isJakJako(Token prev) {
  320 + String base = prev == null ? null : prev.getBase();
  321 + return prev != null && (base.equals("jak") || base.equals("jako"));
  322 + }
  323 +
  324 + private static boolean isPraet(Token keyword) {
  325 + return keyword.getCtag().equals("praet");
  326 + }
  327 +
  328 + private static boolean isNom(Token candidate) {
  329 + return "nom".equals(candidate.getCase()); // dziala dla rzeczownikow
  330 + // tylko!
  331 + }
  332 +
  333 + private static boolean isNoun(Token m) {
  334 + return NOUN_TAGS.contains(m.getCtag());
  335 + }
  336 +
  337 + public static boolean isVerb(Token morph) {
  338 + return VERB_TAGS.contains(morph.getCtag());
  339 + }
  340 +
  341 + public static boolean isVerb(Mention m) {
  342 + boolean hasOnlyVerbs = true;
  343 + for (Token morph : m.getSegments())
  344 + if (!isVerb(morph)) {
  345 + hasOnlyVerbs = false;
  346 + break;
  347 + }
  348 + return hasOnlyVerbs;
  349 + }
  350 +
  351 + public static boolean isVerb(TEIMention m) {
  352 + boolean hasOnlyVerbs = true;
  353 + for (TEIMorph morph : m.getMorphs())
  354 + if (!isVerb(morph)) {
  355 + hasOnlyVerbs = false;
  356 + break;
  357 + }
  358 + return hasOnlyVerbs;
  359 + }
  360 +
  361 + private static boolean isVerb(TEIMorph morph) {
  362 + return VERB_TAGS.contains(morph.getChosenInterpretation().getCtag());
  363 + }
  364 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/InstanceCreator.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/InstanceCreator.java
  1 +package pl.waw.ipipan.zil.core.md.detection.zero;
  2 +
  3 +import ipipan.clarin.tei.api.entities.TEICorpusText;
  4 +import ipipan.clarin.tei.api.io.IOUtils;
  5 +import ipipan.clarin.tei.api.io.TEI_IO;
  6 +
  7 +import java.io.File;
  8 +import java.util.ArrayList;
  9 +import java.util.HashSet;
  10 +import java.util.List;
  11 +import java.util.Map.Entry;
  12 +import java.util.Set;
  13 +import java.util.TreeMap;
  14 +import java.util.TreeSet;
  15 +
  16 +import org.apache.log4j.Logger;
  17 +
  18 +import pl.waw.ipipan.zil.core.md.entities.Mention;
  19 +import pl.waw.ipipan.zil.core.md.entities.Paragraph;
  20 +import pl.waw.ipipan.zil.core.md.entities.Sentence;
  21 +import pl.waw.ipipan.zil.core.md.entities.Text;
  22 +import pl.waw.ipipan.zil.core.md.entities.Token;
  23 +import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader;
  24 +import weka.core.Attribute;
  25 +import weka.core.FastVector;
  26 +import weka.core.Instance;
  27 +import weka.core.Instances;
  28 +
  29 +public class InstanceCreator {
  30 +
  31 + final private static Logger logger = Logger.getLogger(InstanceCreator.class);
  32 + final private static TEI_IO teiIO = TEI_IO.getInstance();
  33 +
  34 + public static List<TreeMap<String, Object>> loadExamples(File dataDir, Set<String> quasiVerbs) {
  35 + int allTexts = 0;
  36 + int exceptions = 0;
  37 + int allSentences = 0;
  38 +
  39 + List<TreeMap<String, Object>> examples = new ArrayList<>();
  40 + for (File textDir : IOUtils.getNKJPDirs(dataDir)) {
  41 + try {
  42 + allTexts++;
  43 + logger.info("Processing text " + textDir);
  44 + TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir);
  45 + Text text = TeiLoader.loadTextFromTei(ct);
  46 +
  47 + for (Paragraph p : text)
  48 + for (Sentence s : p) {
  49 + allSentences++;
  50 + loadExamplesFromSentence(quasiVerbs, examples, s);
  51 + }
  52 +
  53 + } catch (Exception e) {
  54 + logger.error(e.getLocalizedMessage());
  55 + exceptions++;
  56 + }
  57 + }
  58 +
  59 + logger.info(allTexts + " texts found.");
  60 + if (exceptions != 0)
  61 + logger.error(exceptions + " texts with exceptions.");
  62 + logger.info(allSentences + " sentences found.");
  63 +
  64 + return examples;
  65 + }
  66 +
  67 + public static void loadExamplesFromSentence(Set<String> quasiVerbs, List<TreeMap<String, Object>> examples,
  68 + Sentence s) {
  69 +
  70 + // collect positive examples
  71 + Set<Token> positive = new HashSet<>();
  72 + for (Mention m : s.getMentions()) {
  73 + if (FeatureGeneration.isVerb(m)) {
  74 + positive.addAll(m.getSegments());
  75 + }
  76 + }
  77 +
  78 + for (Token m : s) {
  79 + if (!FeatureGeneration.isVerb(m))
  80 + continue;
  81 +
  82 + TreeMap<String, Object> features = new TreeMap<>();
  83 + if (positive.contains(m)) {
  84 + features.put("class", Boolean.valueOf(true));
  85 + } else {
  86 + features.put("class", Boolean.valueOf(false));
  87 + }
  88 +
  89 + FeatureGeneration.generateFeatures(features, m, s, quasiVerbs);
  90 + examples.add(features);
  91 + }
  92 + }
  93 +
  94 + public static Instances createInstances(List<TreeMap<String, Object>> examples, String classFeatureName) {
  95 +
  96 + TreeSet<String> booleanAttsOccurred = new TreeSet<>();
  97 + TreeSet<String> doubleAttsOccurred = new TreeSet<>();
  98 + TreeMap<String, Set<String>> att2values = new TreeMap<>();
  99 + for (TreeMap<String, Object> example : examples) {
  100 + for (Entry<String, Object> e : example.entrySet()) {
  101 + String key = e.getKey();
  102 + Object val = e.getValue();
  103 + if (val instanceof Integer || val instanceof Double) {
  104 + doubleAttsOccurred.add(key);
  105 + continue;
  106 + }
  107 + if (val instanceof Boolean) {
  108 + booleanAttsOccurred.add(key);
  109 + continue;
  110 + }
  111 + if (!att2values.containsKey(key))
  112 + att2values.put(key, new HashSet<String>());
  113 + att2values.get(key).add(val.toString());
  114 + }
  115 + }
  116 +
  117 + List<Attribute> atts = new ArrayList<>();
  118 +
  119 + // double attributes
  120 + for (String attName : doubleAttsOccurred) {
  121 + Attribute att = new Attribute(attName);
  122 + atts.add(att);
  123 + }
  124 +
  125 + // boolean attributes (treated as nominal)
  126 + FastVector values = new FastVector(2);
  127 + values.addElement("false");
  128 + values.addElement("true");
  129 + for (String attName : booleanAttsOccurred) {
  130 + Attribute att = new Attribute(attName, values);
  131 + atts.add(att);
  132 + }
  133 +
  134 + // nominal attributes
  135 + for (Entry<String, Set<String>> attVals : att2values.entrySet()) {
  136 + FastVector vals = new FastVector(attVals.getValue().size());
  137 + for (String val : attVals.getValue())
  138 + vals.addElement(val);
  139 + Attribute att = new Attribute(attVals.getKey(), vals);
  140 + atts.add(att);
  141 + }
  142 +
  143 + FastVector fvWekaAttributes = new FastVector(atts.size());
  144 + for (Attribute attr : atts) {
  145 + fvWekaAttributes.addElement(attr);
  146 + }
  147 +
  148 + Instances data = new Instances("Zero", fvWekaAttributes, 10);
  149 + data.setClass(data.attribute(classFeatureName));
  150 + return data;
  151 + }
  152 +
  153 + public static void fillInstances(List<TreeMap<String, Object>> examples, Instances instances) {
  154 + for (TreeMap<String, Object> example : examples) {
  155 + Instance instance = new Instance(instances.numAttributes());
  156 +
  157 + for (Entry<String, Object> e : example.entrySet()) {
  158 + Object val = e.getValue();
  159 + String name = e.getKey();
  160 + if (val instanceof Integer) {
  161 + instance.setValue(instances.attribute(name), (int) val);
  162 + } else if (val instanceof Boolean) {
  163 + instance.setValue(instances.attribute(name), ((Boolean) val) ? "true" : "false");
  164 + } else {
  165 + int indexOfValue = instances.attribute(name).indexOfValue(val.toString());
  166 + if (indexOfValue == -1) {
  167 + logger.debug("Unkown value: " + val.toString() + " of feature: " + name
  168 + + ". Marking as missing value.");
  169 + instance.setMissing(instances.attribute(name));
  170 + } else
  171 + instance.setValue(instances.attribute(name), indexOfValue);
  172 + }
  173 + }
  174 +
  175 + instance.setDataset(instances);
  176 + instances.add(instance);
  177 + }
  178 + }
  179 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Model.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Model.java
  1 +package pl.waw.ipipan.zil.core.md.detection.zero;
  2 +
  3 +import java.io.Serializable;
  4 +import java.util.List;
  5 +import java.util.Set;
  6 +import java.util.TreeMap;
  7 +
  8 +import org.apache.log4j.Logger;
  9 +
  10 +import pl.waw.ipipan.zil.core.md.entities.Sentence;
  11 +import weka.classifiers.Classifier;
  12 +import weka.core.Instance;
  13 +import weka.core.Instances;
  14 +
  15 +public class Model implements Serializable {
  16 +
  17 + private static final long serialVersionUID = 3351727361273283076L;
  18 + private static final Logger logger = Logger.getLogger(Model.class);
  19 +
  20 + private Classifier classifier;
  21 + private Set<String> quasiVerbs;
  22 + private Instances instances;
  23 +
  24 + public Model(Classifier classifier, Instances instances, Set<String> quasiVerbs) {
  25 + this.classifier = classifier;
  26 + this.instances = instances;
  27 + this.quasiVerbs = quasiVerbs;
  28 + }
  29 +
  30 + public boolean isZeroSubject(Instance instance, Sentence sentence) {
  31 + try {
  32 + double response = this.classifier.classifyInstance(instance);
  33 + return response > 0;
  34 + } catch (Exception e) {
  35 + logger.error("Error classyfing verb in sentence: " + sentence);
  36 + return false;
  37 + }
  38 + }
  39 +
  40 + public Instances getInstances(List<TreeMap<String, Object>> examples) {
  41 + Instances instances = new Instances(this.instances);
  42 + InstanceCreator.fillInstances(examples, instances);
  43 + return instances;
  44 + }
  45 +
  46 + public Set<String> getQuasiVerbs() {
  47 + return quasiVerbs;
  48 + }
  49 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Serializer.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Serializer.java
  1 +package pl.waw.ipipan.zil.core.md.detection.zero;
  2 +
  3 +import java.io.InputStream;
  4 +
  5 +import weka.core.SerializationHelper;
  6 +
  7 +public class Serializer {
  8 +
  9 + public static void saveModel(Model m, String targetModelFilePath) throws Exception {
  10 + SerializationHelper.write(targetModelFilePath, m);
  11 + }
  12 +
  13 + public static Model loadModel(String path) throws Exception {
  14 + Model m = (Model) SerializationHelper.read(path);
  15 + return m;
  16 + }
  17 +
  18 + public static Model loadModelFromStream(InputStream stream) throws Exception {
  19 + Model m = (Model) SerializationHelper.read(stream);
  20 + return m;
  21 + }
  22 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Trainer.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Trainer.java
  1 +package pl.waw.ipipan.zil.core.md.detection.zero;
  2 +
  3 +import java.io.BufferedReader;
  4 +import java.io.File;
  5 +import java.io.IOException;
  6 +import java.io.InputStream;
  7 +import java.io.InputStreamReader;
  8 +import java.util.HashSet;
  9 +import java.util.List;
  10 +import java.util.Random;
  11 +import java.util.Set;
  12 +import java.util.TreeMap;
  13 +
  14 +import org.apache.log4j.Logger;
  15 +
  16 +import weka.classifiers.Evaluation;
  17 +import weka.classifiers.rules.JRip;
  18 +import weka.classifiers.rules.JRip.RipperRule;
  19 +import weka.core.Attribute;
  20 +import weka.core.Instance;
  21 +import weka.core.Instances;
  22 +
  23 +public class Trainer {
  24 +
  25 + final private static Logger logger = Logger.getLogger(Trainer.class);
  26 +
  27 + private static final boolean DO_CV = false;
  28 + private static final String QUASI_LIST_PATH = "/quasi_verbs.txt";
  29 +
  30 + public static void main(String[] args) {
  31 +
  32 + if (args.length != 2) {
  33 + logger.error("Wrong number of arguments! Should be: " + Trainer.class.getSimpleName()
  34 + + " trainDir targetModelFile");
  35 + return;
  36 + }
  37 +
  38 + File dataDir = new File(args[0]);
  39 + String targetModelFilePath = args[1];
  40 +
  41 + if (!dataDir.isDirectory()) {
  42 + logger.error(dataDir + " is not a directory!");
  43 + return;
  44 + }
  45 +
  46 + Set<String> quasiVerbs = loadQuasiVerbs();
  47 +
  48 + List<TreeMap<String, Object>> examples = InstanceCreator.loadExamples(dataDir, quasiVerbs);
  49 + Instances instances = InstanceCreator.createInstances(examples, "class");
  50 + InstanceCreator.fillInstances(examples, instances);
  51 +
  52 + printStats(instances);
  53 +
  54 + try {
  55 + JRip model = new JRip();
  56 +
  57 + if (DO_CV) {
  58 + logger.info("Crossvalidation...");
  59 + Evaluation eval = new Evaluation(instances);
  60 + eval.crossValidateModel(model, instances, 10, new Random(1));
  61 + logger.info(eval.toSummaryString());
  62 + logger.info(eval.toMatrixString());
  63 + logger.info(eval.toClassDetailsString());
  64 + }
  65 +
  66 + logger.info("Building final classifier...");
  67 + model = new JRip();
  68 + model.buildClassifier(instances);
  69 + logger.info(model.getRuleset().size() + " rules generated.");
  70 + for (int i = 0; i < model.getRuleset().size(); i++) {
  71 + RipperRule v = (RipperRule) model.getRuleset().elementAt(i);
  72 + logger.info("\t" + v.toString(instances.classAttribute()));
  73 + }
  74 +
  75 + instances.delete();
  76 + logger.info("Features stats:");
  77 + for (int i = 0; i < instances.numAttributes(); i++) {
  78 + Attribute att = instances.attribute(i);
  79 + logger.info(i + ".\t" + att.toString());
  80 + }
  81 +
  82 + logger.info("Saving classifier...");
  83 + Model m = new Model(model, instances, quasiVerbs);
  84 + Serializer.saveModel(m, targetModelFilePath);
  85 + logger.info("Done.");
  86 +
  87 + } catch (Exception e) {
  88 + logger.error("Error: " + e);
  89 + }
  90 + }
  91 +
  92 + private static Set<String> loadQuasiVerbs() {
  93 + Set<String> quasiVerbs = new HashSet<>();
  94 + InputStream stream = Trainer.class.getResourceAsStream(QUASI_LIST_PATH);
  95 + try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) {
  96 + String line = null;
  97 + while ((line = br.readLine()) != null) {
  98 + quasiVerbs.add(line.trim());
  99 + }
  100 + } catch (IOException e) {
  101 + logger.error(e.getLocalizedMessage());
  102 + }
  103 + return quasiVerbs;
  104 + }
  105 +
  106 + private static void printStats(Instances instances) {
  107 + int positive = 0;
  108 + int negative = 0;
  109 + for (int i = 0; i < instances.numInstances(); i++) {
  110 + Instance inst = instances.instance(i);
  111 + if (inst.classValue() > 0)
  112 + negative++;
  113 + else
  114 + positive++;
  115 + }
  116 + logger.info(positive + " positive examples");
  117 + logger.info(negative + " negative examples");
  118 + logger.info((positive + negative) + " examples total");
  119 + logger.info((instances.numAttributes() - 1) + " attributes");
  120 + logger.info(instances.toSummaryString());
  121 + }
  122 +
  123 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/ZeroSubjectDetector.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/ZeroSubjectDetector.java
  1 +package pl.waw.ipipan.zil.core.md.detection.zero;
  2 +
  3 +import java.io.File;
  4 +import java.io.InputStream;
  5 +import java.util.ArrayList;
  6 +import java.util.HashSet;
  7 +import java.util.List;
  8 +import java.util.Set;
  9 +import java.util.TreeMap;
  10 +
  11 +import org.apache.log4j.Logger;
  12 +
  13 +import pl.waw.ipipan.zil.core.md.entities.Mention;
  14 +import pl.waw.ipipan.zil.core.md.entities.Sentence;
  15 +import pl.waw.ipipan.zil.core.md.entities.Token;
  16 +import weka.core.Instances;
  17 +
  18 +public class ZeroSubjectDetector {
  19 + final private static Logger logger = Logger.getLogger(ZeroSubjectDetector.class);
  20 +
  21 + private Model model;
  22 + private Set<String> quasiVerbs = new HashSet<>();
  23 +
  24 + public static int verbsWithoutSubject = 0;
  25 + public static int verbsWithSubject = 0;
  26 +
  27 + public void addZeroSubjectMentions(Sentence sentence) {
  28 + List<TreeMap<String, Object>> examples = new ArrayList<>();
  29 + InstanceCreator.loadExamplesFromSentence(quasiVerbs, examples, sentence);
  30 + if (examples.isEmpty())
  31 + return;
  32 +
  33 + Instances instances = model.getInstances(examples);
  34 +
  35 + // label instances
  36 + List<Boolean> areZeros = new ArrayList<>();
  37 + for (int i = 0; i < instances.numInstances(); i++) {
  38 + boolean isZero = model.isZeroSubject(instances.instance(i), sentence);
  39 + areZeros.add(isZero);
  40 + if (isZero)
  41 + verbsWithoutSubject++;
  42 + else
  43 + verbsWithSubject++;
  44 + }
  45 +
  46 + int i = 0;
  47 + for (Token m : sentence) {
  48 + if (!FeatureGeneration.isVerb(m))
  49 + continue;
  50 + if (areZeros.get(i))
  51 + sentence.addMention(new Mention(m, true));
  52 + i++;
  53 + }
  54 + }
  55 +
  56 + public ZeroSubjectDetector(File zeroSubjectDetectionModel) {
  57 + try {
  58 + this.model = Serializer.loadModel(zeroSubjectDetectionModel.getAbsolutePath());
  59 + this.quasiVerbs = this.model.getQuasiVerbs();
  60 + } catch (Exception e) {
  61 + logger.error("Error loading model:" + e);
  62 + }
  63 + }
  64 +
  65 + public ZeroSubjectDetector(InputStream zeroSubjectDetectionModelStream) {
  66 + try {
  67 + this.model = Serializer.loadModelFromStream(zeroSubjectDetectionModelStream);
  68 + this.quasiVerbs = this.model.getQuasiVerbs();
  69 + } catch (Exception e) {
  70 + logger.error("Error loading model:" + e);
  71 + }
  72 + }
  73 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/Interpretation.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Interpretation.java
  1 +package pl.waw.ipipan.zil.core.md.entities;
  2 +
  3 +import pl.waw.ipipan.zil.core.md.detection.zero.Constants;
  4 +
  5 +public class Interpretation {
  6 + private String ctag = "null";
  7 + private String base = "null";
  8 +
  9 + private String number = "null";
  10 + private String casee = "null";
  11 + private String gender = "null";
  12 + private String person = "null";
  13 +
  14 + public Interpretation(String ctag2, String morph, String base) {
  15 + this.ctag = ctag2;
  16 + this.base = base;
  17 +
  18 + String[] spl = morph.split(":");
  19 + if (ctag.equalsIgnoreCase("subst") || ctag.equalsIgnoreCase("depr") || ctag.equalsIgnoreCase("ger")) {
  20 + this.number = spl[0];
  21 + this.casee = spl[1];
  22 + this.gender = spl[2];
  23 + } else if (ctag.equalsIgnoreCase("ppron12") || ctag.equalsIgnoreCase("ppron3")) {
  24 + this.number = spl[0];
  25 + this.casee = spl[1];
  26 + this.gender = spl[2];
  27 + this.person = spl[3];
  28 + } else if (ctag.equalsIgnoreCase("siebie")) {
  29 + this.casee = spl[0];
  30 + } else if (Constants.VERB_TAGS.contains(ctag)) {
  31 + this.number = spl[0];
  32 + if (ctag.matches("winien|praet"))
  33 + this.gender = spl[1];
  34 + else
  35 + this.person = spl[1];
  36 + }
  37 + }
  38 +
  39 + public String getCtag() {
  40 + return this.ctag;
  41 + }
  42 +
  43 + public String getNumber() {
  44 + return this.number;
  45 + }
  46 +
  47 + public String getGender() {
  48 + return this.gender;
  49 + }
  50 +
  51 + public String getCase() {
  52 + return this.casee;
  53 + }
  54 +
  55 + public String getBase() {
  56 + return this.base;
  57 + }
  58 +
  59 + public String getPerson() {
  60 + return this.person;
  61 + }
  62 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java
  1 +package pl.waw.ipipan.zil.core.md.entities;
  2 +
  3 +import java.util.ArrayList;
  4 +import java.util.List;
  5 +
  6 +/**
  7 + * @author Mateusz Kopec
  8 + *
  9 + */
  10 +public class Mention implements Comparable<Mention> {
  11 +
  12 + private MentionGroup mentionGroup = null;
  13 +
  14 + private List<Token> segments = new ArrayList<Token>();
  15 + private List<Token> headSegments = new ArrayList<Token>();
  16 +
  17 + private boolean isZeroSubject = false;
  18 +
  19 + // empty if no head info gathered for multi-segment mention
  20 + // if single-segment mention, then this segment is head
  21 +
  22 + public Mention(Token segment) {
  23 + this(segment, false);
  24 + }
  25 +
  26 + public Mention(List<Token> segments, List<Token> heads, boolean isZero) {
  27 + for (Token s : segments) {
  28 + s.addMention(this);
  29 + this.segments.add(s);
  30 + }
  31 + this.headSegments.addAll(heads);
  32 + this.isZeroSubject = isZero;
  33 + }
  34 +
  35 + public Mention(List<Token> segments, List<Token> heads) {
  36 + this(segments, heads, false);
  37 + }
  38 +
  39 + public Mention(Token token, boolean isZero) {
  40 + this.isZeroSubject = isZero;
  41 + token.addMention(this);
  42 + this.segments.add(token);
  43 + this.headSegments.add(token);
  44 + }
  45 +
  46 + public void addSegment(Token s) {
  47 + s.addMention(this);
  48 + this.segments.add(s);
  49 + }
  50 +
  51 + public void addHeadSegment(Token s) {
  52 + this.headSegments.add(s);
  53 + }
  54 +
  55 + public List<Token> getSegments() {
  56 + return segments;
  57 + }
  58 +
  59 + public Token getFirstSegment() {
  60 + return segments.get(0);
  61 + }
  62 +
  63 + public Token getLastSegment() {
  64 + return segments.get(segments.size() - 1);
  65 + }
  66 +
  67 + private Token getLastHeadSegment() {
  68 + List<Token> hs = this.getHeadSegments();
  69 + if (hs.size() != 0)
  70 + return hs.get(hs.size() - 1);
  71 + return null;
  72 + }
  73 +
  74 + public String toString() {
  75 + StringBuffer sb = new StringBuffer();
  76 + sb.append("[");
  77 + for (Token seg : segments) {
  78 + sb.append(seg.toString() + " ");
  79 + }
  80 + sb.append("]");
  81 + return sb.toString();
  82 + }
  83 +
  84 + public MentionGroup getMentionGroup() {
  85 + return mentionGroup;
  86 + }
  87 +
  88 + public void setMentionGroup(MentionGroup mentionGroup) {
  89 + this.mentionGroup = mentionGroup;
  90 + }
  91 +
  92 + public List<Token> getHeadSegments() {
  93 + return headSegments;
  94 + }
  95 +
  96 + public int getNoOfParentMentions() {
  97 + int result = -1; // because we don't want to count this mention
  98 +
  99 + // each parenting mention must contain all the segments of this one
  100 + for (Mention m : getFirstSegment().getMentions()) {
  101 + if (m.getSegments().containsAll(getSegments()))
  102 + result++;
  103 + }
  104 + return result;
  105 + }
  106 +
  107 + public boolean isPronoun() {
  108 + return this.segments.get(0).getChosenInterpretation().getCtag().matches("ppron.*");
  109 + }
  110 +
  111 + @Override
  112 + public int hashCode() {
  113 + final int prime = 31;
  114 + int result = 1;
  115 + result = prime * result + ((headSegments == null) ? 0 : headSegments.hashCode());
  116 + result = prime * result + ((segments == null) ? 0 : segments.hashCode());
  117 + return result;
  118 + }
  119 +
  120 + @Override
  121 + public boolean equals(Object obj) {
  122 + if (this == obj)
  123 + return true;
  124 + if (obj == null)
  125 + return false;
  126 + if (getClass() != obj.getClass())
  127 + return false;
  128 + Mention other = (Mention) obj;
  129 + if (headSegments == null) {
  130 + if (other.headSegments != null)
  131 + return false;
  132 + } else if (!headSegments.equals(other.headSegments))
  133 + return false;
  134 + if (segments == null) {
  135 + if (other.segments != null)
  136 + return false;
  137 + } else if (!segments.equals(other.segments))
  138 + return false;
  139 + return true;
  140 + }
  141 +
  142 + @Override
  143 + public int compareTo(Mention other) {
  144 + Token thisLastSegment = getLastSegment();
  145 + Token anotherLastSegment = other.getLastSegment();
  146 +
  147 + Sentence thisSentence = thisLastSegment.getSentence();
  148 + Sentence anotherSentence = anotherLastSegment.getSentence();
  149 +
  150 + Paragraph thisParagraph = thisSentence == null ? null : thisSentence.getParagraph();
  151 + Paragraph anotherParagraph = anotherSentence == null ? null : anotherSentence.getParagraph();
  152 +
  153 + String thisTextId = thisParagraph == null ? null : thisParagraph.getText().getId();
  154 + String anotherTextId = anotherParagraph == null ? null : anotherParagraph.getText().getId();
  155 +
  156 + int compare;
  157 + // first, compare by ids of texts
  158 + if (thisTextId != null && anotherTextId != null) {
  159 + compare = thisTextId.compareTo(anotherTextId);
  160 + if (compare != 0)
  161 + return compare;
  162 + }
  163 +
  164 + // second, compare by paragraph position
  165 + if (thisParagraph != null && anotherParagraph != null) {
  166 + compare = thisParagraph.getTextPosition().compareTo(anotherParagraph.getTextPosition());
  167 + if (compare != 0)
  168 + return compare;
  169 +
  170 + // third, compare by sentence position
  171 + compare = thisSentence.getParagraphPosition().compareTo(anotherSentence.getParagraphPosition());
  172 + if (compare != 0)
  173 + return compare;
  174 + }
  175 +
  176 + // fourth, compare by last segments
  177 + compare = thisLastSegment.getSentencePosition().compareTo(anotherLastSegment.getSentencePosition());
  178 + if (compare != 0)
  179 + return compare;
  180 +
  181 + // fifth, compare by size
  182 + Integer thisSize = getSegments().size();
  183 + Integer anotherSize = other.getSegments().size();
  184 + compare = thisSize.compareTo(anotherSize);
  185 + if (compare != 0)
  186 + return compare;
  187 +
  188 + // sixth, compare by last head segments
  189 + Token thisLastHeadSegment = getLastHeadSegment();
  190 + Token anotherLastHeadSegment = other.getLastHeadSegment();
  191 + if (thisLastHeadSegment != null && anotherLastHeadSegment != null) {
  192 + compare = thisLastHeadSegment.getSentencePosition().compareTo(anotherLastHeadSegment.getSentencePosition());
  193 + }
  194 +
  195 + // seventh, compare by head segments size
  196 + thisSize = getHeadSegments().size();
  197 + anotherSize = other.getHeadSegments().size();
  198 + compare = thisSize.compareTo(anotherSize);
  199 +
  200 + return compare;
  201 + }
  202 +
  203 + public boolean isZeroSubject() {
  204 + return isZeroSubject;
  205 + }
  206 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/MentionGroup.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/MentionGroup.java
  1 +package pl.waw.ipipan.zil.core.md.entities;
  2 +
  3 +import java.util.ArrayList;
  4 +import java.util.Comparator;
  5 +
  6 +public class MentionGroup extends ArrayList<Mention> {
  7 +
  8 + private static final long serialVersionUID = 7051256137623728016L;
  9 + private String dominant;
  10 +
  11 + public MentionGroup() {
  12 + }
  13 +
  14 + public MentionGroup(Mention currentMention) {
  15 + add(currentMention);
  16 + }
  17 +
  18 + public boolean add(Mention m) {
  19 + m.setMentionGroup(this);
  20 + return super.add(m);
  21 + }
  22 +
  23 + public Mention getLastAddedMention() {
  24 + return this.get(this.size() - 1);
  25 + }
  26 +
  27 + public final static Comparator<MentionGroup> getMentionGroupComparator() {
  28 + return mentionGroupComparator;
  29 + }
  30 +
  31 + private final static Comparator<MentionGroup> mentionGroupComparator = new Comparator<MentionGroup>() {
  32 +
  33 + public int compare(MentionGroup mg1, MentionGroup mg2) {
  34 + Mention m1 = mg1.getLastAddedMention();
  35 + Mention m2 = mg2.getLastAddedMention();
  36 + return m1.compareTo(m2);
  37 + }
  38 +
  39 + };
  40 +
  41 + public void setDominant(String string) {
  42 + this.dominant = string;
  43 + }
  44 +
  45 + public String getDominant() {
  46 + return this.dominant;
  47 + }
  48 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/NamedEntity.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/NamedEntity.java
  1 +package pl.waw.ipipan.zil.core.md.entities;
  2 +
  3 +import java.util.Iterator;
  4 +import java.util.List;
  5 +
  6 +public class NamedEntity implements Comparable<NamedEntity> {
  7 +
  8 + private List<Token> tokens;
  9 +
  10 + public NamedEntity(List<Token> tokens) {
  11 + this.tokens = tokens;
  12 + }
  13 +
  14 + public List<Token> getTokens() {
  15 + return this.tokens;
  16 + }
  17 +
  18 + @Override
  19 + public int compareTo(NamedEntity o) {
  20 + Iterator<Token> it1 = getTokens().iterator();
  21 + Iterator<Token> it2 = o.getTokens().iterator();
  22 + while (it1.hasNext() && it2.hasNext()) {
  23 + Token t1 = it1.next();
  24 + Token t2 = it2.next();
  25 + if (t1.compareTo(t2) != 0)
  26 + return t1.compareTo(t2);
  27 + }
  28 + if (it1.hasNext())
  29 + return 1;
  30 + if (it2.hasNext())
  31 + return -1;
  32 +
  33 + return 0;
  34 + }
  35 +
  36 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/Paragraph.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Paragraph.java
  1 +package pl.waw.ipipan.zil.core.md.entities;
  2 +
  3 +import java.util.ArrayList;
  4 +
  5 +public class Paragraph extends ArrayList<Sentence>{
  6 +
  7 + private static final long serialVersionUID = 4871431562737902082L;
  8 +
  9 + private Text text;
  10 + private int textPosition;
  11 +
  12 + public boolean add(Sentence s) {
  13 + s.setParagraphPosition(this.size());
  14 + s.setParagraph(this);
  15 + return super.add(s);
  16 + }
  17 +
  18 + public String toString() {
  19 + StringBuffer sb = new StringBuffer();
  20 + for (Sentence sentence : this)
  21 + sb.append(sentence.toString()+"\n");
  22 + return sb.toString();
  23 + }
  24 +
  25 + public Text getText() {
  26 + return this.text;
  27 + }
  28 +
  29 + public void setText(Text text) {
  30 + this.text = text;
  31 + }
  32 +
  33 + public Integer getTextPosition() {
  34 + return this.textPosition;
  35 + }
  36 +
  37 + public void setTextPosition(int textPos) {
  38 + this.textPosition = textPos;
  39 + }
  40 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java
  1 +package pl.waw.ipipan.zil.core.md.entities;
  2 +
  3 +import java.util.ArrayList;
  4 +import java.util.List;
  5 +import java.util.Set;
  6 +import java.util.TreeSet;
  7 +
  8 +public class Sentence extends ArrayList<Token> {
  9 +
  10 + private static final long serialVersionUID = -7300822552646737716L;
  11 +
  12 + private Paragraph paragraph;
  13 + private int paragraphPosition;
  14 +
  15 + private Set<Mention> mentions = new TreeSet<>();
  16 + private Set<SyntacticWord> syntacticWords = new TreeSet<>();
  17 + private Set<SyntacticGroup> syntacticGroups = new TreeSet<>();
  18 + private Set<NamedEntity> namedEntities = new TreeSet<>();
  19 +
  20 + public boolean add(Token s) {
  21 + s.setSentencePosition(this.size());
  22 + s.setSentence(this);
  23 + return super.add(s);
  24 + }
  25 +
  26 + public void setParagraphPosition(int paragraphPosition) {
  27 + this.paragraphPosition = paragraphPosition;
  28 + }
  29 +
  30 + public Integer getParagraphPosition() {
  31 + return this.paragraphPosition;
  32 + }
  33 +
  34 + public void setParagraph(Paragraph paragraph) {
  35 + this.paragraph = paragraph;
  36 + }
  37 +
  38 + public Paragraph getParagraph() {
  39 + return this.paragraph;
  40 + }
  41 +
  42 + public void removeMention(Mention mention) {
  43 + mentions.remove(mention);
  44 + for (Token s : mention.getSegments())
  45 + s.removeMention(mention);
  46 + }
  47 +
  48 + public void clearMentions() {
  49 + for (Mention mention : mentions)
  50 + for (Token s : mention.getSegments())
  51 + s.removeMention(mention);
  52 + mentions.clear();
  53 + }
  54 +
  55 + public String toStringWithoutMentions() {
  56 + StringBuffer sb = new StringBuffer();
  57 + for (Token seg : this) {
  58 + if (!seg.toString().matches("\\[.*\\]")) {
  59 + sb.append(seg.toString());
  60 + sb.append(" ");
  61 + }
  62 + }
  63 + return sb.toString();
  64 + }
  65 +
  66 + public String toString() {
  67 + StringBuffer sb = new StringBuffer();
  68 + for (Token seg : this) {
  69 + for (@SuppressWarnings("unused")
  70 + Mention m : seg.getMentionsStartingBeforeSegment())
  71 + sb.append("[");
  72 + sb.append(seg.toString());
  73 + for (@SuppressWarnings("unused")
  74 + Mention m : seg.getMentionsEndingAfterSegment())
  75 + sb.append("]");
  76 + sb.append(" ");
  77 + }
  78 + return sb.toString();
  79 + }
  80 +
  81 + public List<Mention> getMentions() {
  82 + return new ArrayList<Mention>(mentions);
  83 + }
  84 +
  85 + public List<SyntacticWord> getSyntacticWords() {
  86 + return new ArrayList<>(syntacticWords);
  87 + }
  88 +
  89 + public List<NamedEntity> getNamedEntities() {
  90 + return new ArrayList<>(namedEntities);
  91 + }
  92 +
  93 + public List<SyntacticGroup> getGroups() {
  94 + return new ArrayList<>(syntacticGroups);
  95 + }
  96 +
  97 + public void addMention(Mention mention) {
  98 + mentions.add(mention);
  99 + }
  100 +
  101 + public void addSyntacticWord(SyntacticWord syntacticWord) {
  102 + syntacticWords.add(syntacticWord);
  103 + }
  104 +
  105 + public void addSyntacticGroup(SyntacticGroup syntacticGroup) {
  106 + syntacticGroups.add(syntacticGroup);
  107 + }
  108 +
  109 + public void addNamedEntity(NamedEntity namedEntity) {
  110 + namedEntities.add(namedEntity);
  111 + }
  112 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java
  1 +package pl.waw.ipipan.zil.core.md.entities;
  2 +
  3 +import java.util.Iterator;
  4 +import java.util.List;
  5 +
  6 +public class SyntacticGroup implements Comparable<SyntacticGroup> {
  7 +
  8 + private String type;
  9 + private List<Token> tokens;
  10 + private List<Token> headTokens;
  11 +
  12 + public SyntacticGroup(String type, List<Token> tokens,
  13 + List<Token> headTokens) {
  14 + this.type = type;
  15 + this.tokens = tokens;
  16 + this.headTokens = headTokens;
  17 + }
  18 +
  19 + public String getType() {
  20 + return type;
  21 + }
  22 +
  23 + public List<Token> getTokens() {
  24 + return tokens;
  25 + }
  26 +
  27 + public List<Token> getSemanticHeadTokens() {
  28 + return headTokens;
  29 + }
  30 +
  31 + @Override
  32 + public int compareTo(SyntacticGroup o) {
  33 + Iterator<Token> it1 = getTokens().iterator();
  34 + Iterator<Token> it2 = o.getTokens().iterator();
  35 + while (it1.hasNext() && it2.hasNext()) {
  36 + Token t1 = it1.next();
  37 + Token t2 = it2.next();
  38 + if (t1.compareTo(t2) != 0)
  39 + return t1.compareTo(t2);
  40 + }
  41 + it1 = getSemanticHeadTokens().iterator();
  42 + it2 = o.getSemanticHeadTokens().iterator();
  43 + while (it1.hasNext() && it2.hasNext()) {
  44 + Token t1 = it1.next();
  45 + Token t2 = it2.next();
  46 + if (t1.compareTo(t2) != 0)
  47 + return t1.compareTo(t2);
  48 + }
  49 + if (it1.hasNext())
  50 + return 1;
  51 + if (it2.hasNext())
  52 + return -1;
  53 +
  54 + return getType().compareTo(o.getType());
  55 + }
  56 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java
  1 +package pl.waw.ipipan.zil.core.md.entities;
  2 +
  3 +import java.util.ArrayList;
  4 +import java.util.Iterator;
  5 +import java.util.List;
  6 +
  7 +public class SyntacticWord implements Comparable<SyntacticWord> {
  8 +
  9 + private String ctag;
  10 + private List<Token> tokens = new ArrayList<>();
  11 +
  12 + public SyntacticWord(String ctag, List<Token> tokens) {
  13 + this.ctag = ctag;
  14 + this.tokens = tokens;
  15 + }
  16 +
  17 + public String getCtag() {
  18 + return ctag;
  19 + }
  20 +
  21 + public List<Token> getTokens() {
  22 + return tokens;
  23 + }
  24 +
  25 + @Override
  26 + public int compareTo(SyntacticWord o) {
  27 + Iterator<Token> it1 = getTokens().iterator();
  28 + Iterator<Token> it2 = o.getTokens().iterator();
  29 + while (it1.hasNext() && it2.hasNext()) {
  30 + Token t1 = it1.next();
  31 + Token t2 = it2.next();
  32 + if (t1.compareTo(t2) != 0)
  33 + return t1.compareTo(t2);
  34 + }
  35 + if (it1.hasNext())
  36 + return 1;
  37 + if (it2.hasNext())
  38 + return -1;
  39 +
  40 + return getCtag().compareTo(o.getCtag());
  41 + }
  42 +
  43 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/Text.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Text.java
  1 +package pl.waw.ipipan.zil.core.md.entities;
  2 +
  3 +import java.util.ArrayList;
  4 +
  5 +public class Text extends ArrayList<Paragraph> implements Comparable<Text> {
  6 +
  7 + private static final long serialVersionUID = 3433069117444647544L;
  8 +
  9 + private String id;
  10 +
  11 + public boolean add(Paragraph p) {
  12 + p.setTextPosition(this.size());
  13 + p.setText(this);
  14 + return super.add(p);
  15 + }
  16 +
  17 + public String getId() {
  18 + return id;
  19 + }
  20 +
  21 + public void setId(String id) {
  22 + this.id = id;
  23 + }
  24 +
  25 + public Text(String id) {
  26 + setId(id);
  27 + }
  28 +
  29 + public String toString() {
  30 + StringBuffer sb = new StringBuffer();
  31 + for (Paragraph par : this)
  32 + sb.append(par.toString() + "\n\n");
  33 + return sb.toString();
  34 + }
  35 +
  36 + public int compareTo(Text o) {
  37 + return getId().compareTo(o.getId());
  38 + }
  39 +
  40 + public void clearMentions() {
  41 + for (Paragraph p : this)
  42 + for (Sentence sent : p)
  43 + sent.clearMentions();
  44 + }
  45 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/Token.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Token.java
  1 +package pl.waw.ipipan.zil.core.md.entities;
  2 +
  3 +import java.util.ArrayList;
  4 +import java.util.Collection;
  5 +import java.util.Collections;
  6 +import java.util.HashSet;
  7 +import java.util.List;
  8 +import java.util.Set;
  9 +
  10 +public class Token implements Comparable<Token> {
  11 + private Sentence sentence;
  12 + private int sentencePosition;
  13 +
  14 + private Set<Mention> mentions = null;
  15 +
  16 + private String orth;
  17 + private Interpretation chosenInterpretation;
  18 + private Collection<Interpretation> allInterpretations = new HashSet<Interpretation>();
  19 +
  20 + public Integer getSentencePosition() {
  21 + return sentencePosition;
  22 + }
  23 +
  24 + public void setSentencePosition(int sentencePosition) {
  25 + this.sentencePosition = sentencePosition;
  26 + }
  27 +
  28 + public Sentence getSentence() {
  29 + return sentence;
  30 + }
  31 +
  32 + public void setSentence(Sentence sentence) {
  33 + this.sentence = sentence;
  34 + }
  35 +
  36 + public void setOrth(String orth2) {
  37 + this.orth = orth2;
  38 + }
  39 +
  40 + public String getOrth() {
  41 + return this.orth;
  42 + }
  43 +
  44 + public void addChosenInterpretation(Interpretation chosenIterpretation) {
  45 + setChosenInterpretation(chosenIterpretation);
  46 + addInterpretation(chosenIterpretation);
  47 + }
  48 +
  49 + public void setChosenInterpretation(Interpretation chosenIterpretation) {
  50 + this.chosenInterpretation = chosenIterpretation;
  51 + }
  52 +
  53 + public Interpretation getChosenInterpretation() {
  54 + return this.chosenInterpretation;
  55 + }
  56 +
  57 + public String getBase() {
  58 + return this.getChosenInterpretation().getBase();
  59 + }
  60 +
  61 + public String getNumber() {
  62 + return this.getChosenInterpretation().getNumber();
  63 + }
  64 +
  65 + public String getGender() {
  66 + return this.getChosenInterpretation().getGender();
  67 + }
  68 +
  69 + public String getCase() {
  70 + return this.getChosenInterpretation().getCase();
  71 + }
  72 +
  73 + public String getPerson() {
  74 + return this.getChosenInterpretation().getPerson();
  75 + }
  76 +
  77 + public void addInterpretation(Interpretation inter) {
  78 + this.allInterpretations.add(inter);
  79 + }
  80 +
  81 + public String toString() {
  82 + return orth;
  83 + }
  84 +
  85 + public void addMention(Mention mention) {
  86 + if (this.mentions == null)
  87 + this.mentions = new HashSet<Mention>();
  88 +
  89 + this.mentions.add(mention);
  90 + }
  91 +
  92 + public void removeMention(Mention mention) {
  93 + this.mentions.remove(mention);
  94 + }
  95 +
  96 + public Set<Mention> getMentions() {
  97 + if (this.mentions == null)
  98 + return new HashSet<Mention>();
  99 + return this.mentions;
  100 + }
  101 +
  102 + public List<Mention> getMentionsStartingBeforeSegment() {
  103 + List<Mention> result = new ArrayList<Mention>();
  104 + for (Mention m : getMentions())
  105 + if (m.getFirstSegment().equals(this))
  106 + result.add(m);
  107 +
  108 + Collections.sort(result);
  109 + Collections.reverse(result);
  110 + return result;
  111 + }
  112 +
  113 + public List<Mention> getMentionsEndingAfterSegment() {
  114 + List<Mention> result = new ArrayList<Mention>();
  115 + for (Mention m : getMentions())
  116 + if (m.getLastSegment().equals(this))
  117 + result.add(m);
  118 +
  119 + Collections.sort(result);
  120 + Collections.reverse(result);
  121 + return result;
  122 + }
  123 +
  124 + public String getCtag() {
  125 + return getChosenInterpretation().getCtag();
  126 + }
  127 +
  128 + @Override
  129 + public int compareTo(Token o) {
  130 + return getSentencePosition().compareTo(o.getSentencePosition());
  131 + }
  132 +
  133 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java
  1 +package pl.waw.ipipan.zil.core.md.io.tei;
  2 +
  3 +import ipipan.clarin.tei.api.entities.TEICorpusText;
  4 +import ipipan.clarin.tei.api.entities.TEIGroup;
  5 +import ipipan.clarin.tei.api.entities.TEIInterpretation;
  6 +import ipipan.clarin.tei.api.entities.TEIMention;
  7 +import ipipan.clarin.tei.api.entities.TEIMorph;
  8 +import ipipan.clarin.tei.api.entities.TEINamedEntity;
  9 +import ipipan.clarin.tei.api.entities.TEIParagraph;
  10 +import ipipan.clarin.tei.api.entities.TEISentence;
  11 +import ipipan.clarin.tei.api.entities.TEISyntacticEntity;
  12 +import ipipan.clarin.tei.api.entities.TEIWord;
  13 +import ipipan.clarin.tei.api.exceptions.TEIException;
  14 +import ipipan.clarin.tei.api.io.TEI_IO;
  15 +
  16 +import java.io.File;
  17 +import java.util.ArrayList;
  18 +import java.util.HashMap;
  19 +import java.util.List;
  20 +import java.util.Map;
  21 +
  22 +import org.apache.log4j.Logger;
  23 +
  24 +import pl.waw.ipipan.zil.core.md.entities.Interpretation;
  25 +import pl.waw.ipipan.zil.core.md.entities.Mention;
  26 +import pl.waw.ipipan.zil.core.md.entities.NamedEntity;
  27 +import pl.waw.ipipan.zil.core.md.entities.Paragraph;
  28 +import pl.waw.ipipan.zil.core.md.entities.Sentence;
  29 +import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup;
  30 +import pl.waw.ipipan.zil.core.md.entities.SyntacticWord;
  31 +import pl.waw.ipipan.zil.core.md.entities.Text;
  32 +import pl.waw.ipipan.zil.core.md.entities.Token;
  33 +
  34 +public class TeiLoader {
  35 +
  36 + private static Logger logger = Logger.getLogger(TeiLoader.class);
  37 + private static TEI_IO teiAPI = TEI_IO.getInstance();
  38 +
  39 + public static TEICorpusText readTeiText(File teiDir) throws TEIException {
  40 + return teiAPI.readFromNKJPDirectory(teiDir);
  41 + }
  42 +
  43 + public static Text loadTextFromTei(TEICorpusText teiText) {
  44 + Text text = new Text(teiText.getCorpusHeader().getId());
  45 +
  46 + logger.debug("Loading tei text " + text.getId() + "...");
  47 + for (TEIParagraph teiP : teiText.getParagraphs())
  48 + loadParagraph(text, teiP);
  49 + logger.debug("Tei text loaded.");
  50 +
  51 + return text;
  52 + }
  53 +
  54 + private static void loadParagraph(Text text, TEIParagraph teiP) {
  55 + Paragraph p = new Paragraph();
  56 + text.add(p);
  57 + for (TEISentence teiS : teiP.getSentences())
  58 + loadSentence(p, teiS);
  59 + }
  60 +
  61 + private static void loadSentence(Paragraph p, TEISentence teiS) {
  62 + Sentence s = new Sentence();
  63 + p.add(s);
  64 + Map<TEIMorph, Token> teiMorph2Segment = new HashMap<>();
  65 + for (TEIMorph teiM : teiS.getMorphs()) {
  66 + Token token = loadToken(s, teiM);
  67 + teiMorph2Segment.put(teiM, token);
  68 + }
  69 + for (TEINamedEntity ne : teiS.getAllNamedEntities())
  70 + loadNE(s, ne, teiMorph2Segment);
  71 + for (TEIWord w : teiS.getAllWords())
  72 + loadSyntacticWord(s, w, teiMorph2Segment);
  73 + for (TEIGroup g : teiS.getAllGroups())
  74 + loadSyntacticGroup(s, g, teiMorph2Segment);
  75 + for (TEIMention m : teiS.getAllMentions())
  76 + loadMentions(s, m, teiMorph2Segment);
  77 + }
  78 +
  79 + private static void loadMentions(Sentence s, TEIMention m,
  80 + Map<TEIMorph, Token> teiMorph2Segment) {
  81 + List<Token> tokens = new ArrayList<>();
  82 + for (TEIMorph mo : m.getMorphs())
  83 + tokens.add(teiMorph2Segment.get(mo));
  84 + List<Token> headTokens = new ArrayList<>();
  85 + for (TEIMorph mo : m.getHeadMorphs())
  86 + headTokens.add(teiMorph2Segment.get(mo));
  87 + s.addMention(new Mention(tokens, headTokens, m.isZeroSubject()));
  88 + }
  89 +
  90 + private static void loadSyntacticGroup(Sentence s, TEIGroup g,
  91 + Map<TEIMorph, Token> teiMorph2Segment) {
  92 + String type = g.getType();
  93 +
  94 + List<Token> tokens = new ArrayList<>();
  95 + for (TEIMorph m : g.getLeaves())
  96 + tokens.add(teiMorph2Segment.get(m));
  97 +
  98 + List<Token> headTokens = new ArrayList<>();
  99 + TEISyntacticEntity semanticHead = g;
  100 + while (semanticHead.isGroup()
  101 + && semanticHead.asGroup().getSemanticHead() != null)
  102 + semanticHead = semanticHead.asGroup().getSemanticHead();
  103 + for (TEIMorph m : semanticHead.getLeaves())
  104 + headTokens.add(teiMorph2Segment.get(m));
  105 +
  106 + s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens));
  107 + }
  108 +
  109 + private static void loadSyntacticWord(Sentence s, TEIWord w,
  110 + Map<TEIMorph, Token> teiMorph2Segment) {
  111 + String ctag = w.getInterpretation().getCtag();
  112 + List<Token> tokens = new ArrayList<>();
  113 + for (TEIMorph m : w.getAllMorphs())
  114 + tokens.add(teiMorph2Segment.get(m));
  115 + s.addSyntacticWord(new SyntacticWord(ctag, tokens));
  116 + }
  117 +
  118 + private static void loadNE(Sentence s, TEINamedEntity ne,
  119 + Map<TEIMorph, Token> teiMorph2Segment) {
  120 + List<Token> tokens = new ArrayList<>();
  121 + for (TEIMorph m : ne.getLeaves())
  122 + tokens.add(teiMorph2Segment.get(m));
  123 + s.addNamedEntity(new NamedEntity(tokens));
  124 + }
  125 +
  126 + private static Token loadToken(Sentence s, TEIMorph teiM) {
  127 + Token seg = new Token();
  128 + s.add(seg);
  129 +
  130 + seg.setOrth(teiM.getOrth());
  131 + TEIInterpretation interp = teiM.getChosenInterpretation();
  132 + Interpretation chosenIterpretation = new Interpretation(
  133 + interp.getCtag(), interp.getMorph(), interp.getBase());
  134 + seg.addChosenInterpretation(chosenIterpretation);
  135 +
  136 + for (TEIInterpretation interp2 : teiM.getAllInterpretations()) {
  137 + Interpretation inter = new Interpretation(interp2.getCtag(),
  138 + interp2.getMorph(), interp.getBase());
  139 + seg.addInterpretation(inter);
  140 + }
  141 +
  142 + return seg;
  143 + }
  144 +
  145 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiSaver.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiSaver.java
  1 +package pl.waw.ipipan.zil.core.md.io.tei;
  2 +
  3 +import ipipan.clarin.tei.api.entities.AnnotationLayer;
  4 +import ipipan.clarin.tei.api.entities.EntitiesFactory;
  5 +import ipipan.clarin.tei.api.entities.TEICoreference;
  6 +import ipipan.clarin.tei.api.entities.TEICorpusText;
  7 +import ipipan.clarin.tei.api.entities.TEIMention;
  8 +import ipipan.clarin.tei.api.entities.TEIMorph;
  9 +import ipipan.clarin.tei.api.entities.TEIParagraph;
  10 +import ipipan.clarin.tei.api.entities.TEISentence;
  11 +import ipipan.clarin.tei.api.exceptions.TEIException;
  12 +import ipipan.clarin.tei.api.io.TEI_IO;
  13 +import ipipan.clarin.tei.api.io.TEI_IO.CompressionMethod;
  14 +
  15 +import java.io.File;
  16 +import java.util.ArrayList;
  17 +import java.util.HashMap;
  18 +import java.util.Iterator;
  19 +import java.util.List;
  20 +import java.util.Map;
  21 +
  22 +import org.apache.log4j.Logger;
  23 +
  24 +import pl.waw.ipipan.zil.core.md.entities.Mention;
  25 +import pl.waw.ipipan.zil.core.md.entities.Paragraph;
  26 +import pl.waw.ipipan.zil.core.md.entities.Sentence;
  27 +import pl.waw.ipipan.zil.core.md.entities.Text;
  28 +import pl.waw.ipipan.zil.core.md.entities.Token;
  29 +
  30 +public class TeiSaver {
  31 +
  32 + private static Logger logger = Logger.getLogger(TeiSaver.class);
  33 + private static TEI_IO teiAPI = TEI_IO.getInstance();
  34 + final private static EntitiesFactory ef = EntitiesFactory.getInstance();
  35 +
  36 + public static void saveTeiText(TEICorpusText teiText, File targetDir, boolean gzip) throws TEIException {
  37 + logger.debug("Saving text in " + targetDir);
  38 + CompressionMethod cm = gzip ? CompressionMethod.GZIP : CompressionMethod.NONE;
  39 + teiAPI.writeToNKJPDirectory(teiText, targetDir, cm);
  40 + }
  41 +
  42 + public static void updateTeiText(Text t, TEICorpusText teiText) throws TEIException {
  43 + Map<Mention, TEIMention> mention2mention = new HashMap<Mention, TEIMention>();
  44 +
  45 + Iterator<Paragraph> pIt = t.iterator();
  46 + Iterator<TEIParagraph> pItTei = teiText.getParagraphs().iterator();
  47 + int mentionId = 0;
  48 + while (pIt.hasNext() && pItTei.hasNext()) {
  49 + Paragraph p = pIt.next();
  50 + TEIParagraph pTei = pItTei.next();
  51 +
  52 + mentionId = updateTeiParagraph(mention2mention, mentionId, p, pTei);
  53 + }
  54 + checkIterators(pIt, pItTei, "paragraph");
  55 +
  56 + teiText.addAnnotationLayer(AnnotationLayer.MENTIONS,
  57 + EntitiesFactory.getInstance().createHeader(AnnotationLayer.MENTIONS));
  58 +
  59 + // clear coreference as we have new mentions it became invalid
  60 + teiText.getAnnotationLayers().remove(AnnotationLayer.COREFERENCE);
  61 + teiText.setCoreferences(new ArrayList<TEICoreference>());
  62 +
  63 + logger.debug(mentionId + " mentions added");
  64 + }
  65 +
  66 + private static int updateTeiParagraph(Map<Mention, TEIMention> mention2mention, int mentionId, Paragraph p,
  67 + TEIParagraph pTei) throws TEIException {
  68 + Iterator<Sentence> sIt = p.iterator();
  69 + Iterator<TEISentence> sItTei = pTei.getSentences().iterator();
  70 +
  71 + while (sIt.hasNext() && sItTei.hasNext()) {
  72 + Sentence s = sIt.next();
  73 + TEISentence sTei = sItTei.next();
  74 + mentionId = updateTeiSentence(mention2mention, mentionId, s, sTei);
  75 + }
  76 + checkIterators(sIt, sItTei, "sentence");
  77 + return mentionId;
  78 + }
  79 +
  80 + private static int updateTeiSentence(Map<Mention, TEIMention> mention2mention, int mentionId, Sentence s,
  81 + TEISentence sTei) throws TEIException {
  82 + sTei.getAllMentions().clear();
  83 +
  84 + Map<Token, TEIMorph> seg2morph = new HashMap<Token, TEIMorph>();
  85 +
  86 + Iterator<Token> segIt = s.iterator();
  87 + Iterator<TEIMorph> segItTei = sTei.getMorphs().iterator();
  88 +
  89 + while (segIt.hasNext() && segItTei.hasNext()) {
  90 + seg2morph.put(segIt.next(), segItTei.next());
  91 + }
  92 + checkIterators(segIt, segItTei, "token");
  93 +
  94 + List<TEIMention> mentions = new ArrayList<TEIMention>();
  95 +
  96 + for (Mention m : s.getMentions()) {
  97 + List<TEIMorph> morphs = new ArrayList<TEIMorph>();
  98 + List<TEIMorph> heads = new ArrayList<TEIMorph>();
  99 +
  100 + for (Token seg : m.getSegments())
  101 + morphs.add(seg2morph.get(seg));
  102 +
  103 + for (Token seg : m.getHeadSegments())
  104 + heads.add(seg2morph.get(seg));
  105 +
  106 + TEIMention mention = ef.createMention("mention_" + mentionId++, morphs, heads, m.isZeroSubject());
  107 + mentions.add(mention);
  108 + mention2mention.put(m, mention);
  109 + }
  110 + sTei.setMentions(mentions);
  111 + return mentionId;
  112 + }
  113 +
  114 + private static void checkIterators(Iterator<? extends Object> one, Iterator<? extends Object> other, String level)
  115 + throws TEIException {
  116 + if (one.hasNext() || other.hasNext())
  117 + throw new TEIException("Problem mapping tei to thrift for level " + level);
  118 + }
  119 +
  120 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java
  1 +package pl.waw.ipipan.zil.core.md.io.thrift;
  2 +
  3 +import java.util.ArrayList;
  4 +import java.util.HashMap;
  5 +import java.util.List;
  6 +import java.util.Map;
  7 +
  8 +import org.apache.log4j.Logger;
  9 +
  10 +import pl.waw.ipipan.multiservice.thrift.types.MultiserviceException;
  11 +import pl.waw.ipipan.multiservice.thrift.types.TInterpretation;
  12 +import pl.waw.ipipan.multiservice.thrift.types.TNamedEntity;
  13 +import pl.waw.ipipan.multiservice.thrift.types.TParagraph;
  14 +import pl.waw.ipipan.multiservice.thrift.types.TSentence;
  15 +import pl.waw.ipipan.multiservice.thrift.types.TSyntacticGroup;
  16 +import pl.waw.ipipan.multiservice.thrift.types.TSyntacticWord;
  17 +import pl.waw.ipipan.multiservice.thrift.types.TText;
  18 +import pl.waw.ipipan.multiservice.thrift.types.TToken;
  19 +import pl.waw.ipipan.zil.core.md.entities.Interpretation;
  20 +import pl.waw.ipipan.zil.core.md.entities.NamedEntity;
  21 +import pl.waw.ipipan.zil.core.md.entities.Paragraph;
  22 +import pl.waw.ipipan.zil.core.md.entities.Sentence;
  23 +import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup;
  24 +import pl.waw.ipipan.zil.core.md.entities.SyntacticWord;
  25 +import pl.waw.ipipan.zil.core.md.entities.Text;
  26 +import pl.waw.ipipan.zil.core.md.entities.Token;
  27 +
  28 +public class ThriftLoader {
  29 +
  30 + private static Logger logger = Logger.getLogger(ThriftLoader.class);
  31 +
  32 + public static Text loadTextFromThrift(TText thriftText)
  33 + throws MultiserviceException {
  34 + Text text = new Text(thriftText.getTextHeader() == null ? "null"
  35 + : thriftText.getTextHeader().getId());
  36 +
  37 + logger.debug("Loading text " + text.getId() + " from thrift format...");
  38 + for (TParagraph teiP : thriftText.getParagraphs())
  39 + loadParagraph(text, teiP);
  40 + logger.debug("Thrift text loaded.");
  41 +
  42 + return text;
  43 + }
  44 +
  45 + private static void loadParagraph(Text text, TParagraph teiP)
  46 + throws MultiserviceException {
  47 + Paragraph p = new Paragraph();
  48 + text.add(p);
  49 +
  50 + for (TSentence teiS : teiP.getSentences())
  51 + loadSentence(p, teiS);
  52 + }
  53 +
  54 + private static void loadSentence(Paragraph p, TSentence thriftSent)
  55 + throws MultiserviceException {
  56 + Sentence s = new Sentence();
  57 + p.add(s);
  58 +
  59 + Map<String, Object> thirftId2Entity = getThriftId2EntityMap(thriftSent);
  60 +
  61 + Map<String, Token> thiftTokenId2Token = new HashMap<>();
  62 + for (TToken teiM : thriftSent.getTokens()) {
  63 + Token token = loadToken(s, teiM);
  64 + thiftTokenId2Token.put(teiM.getId(), token);
  65 + }
  66 + if (thriftSent.isSetNames())
  67 + for (TNamedEntity ne : thriftSent.getNames())
  68 + loadNE(s, ne, thirftId2Entity, thiftTokenId2Token);
  69 + if (thriftSent.isSetWords())
  70 + for (TSyntacticWord w : thriftSent.getWords())
  71 + loadSyntacticWord(s, w, thirftId2Entity, thiftTokenId2Token);
  72 + if (thriftSent.isSetGroups())
  73 + for (TSyntacticGroup g : thriftSent.getGroups())
  74 + loadSyntacticGroup(s, g, thirftId2Entity, thiftTokenId2Token);
  75 + }
  76 +
  77 + private static void loadSyntacticGroup(Sentence s, TSyntacticGroup g,
  78 + Map<String, Object> thirftId2Entity,
  79 + Map<String, Token> thiftTokenId2Token) {
  80 + String type = g.getType();
  81 + List<Token> tokens = getUnderlyingSegments(g, thirftId2Entity,
  82 + thiftTokenId2Token, false);
  83 + List<Token> headTokens = getUnderlyingSegments(g, thirftId2Entity,
  84 + thiftTokenId2Token, true);
  85 + s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens));
  86 + }
  87 +
  88 + private static void loadSyntacticWord(Sentence s, TSyntacticWord w,
  89 + Map<String, Object> thirftId2Entity,
  90 + Map<String, Token> thiftTokenId2Token) {
  91 + String ctag = w.getChosenInterpretation().getCtag();
  92 + List<Token> tokens = getUnderlyingSegments(w, thirftId2Entity,
  93 + thiftTokenId2Token, false);
  94 + s.addSyntacticWord(new SyntacticWord(ctag, tokens));
  95 + }
  96 +
  97 + private static void loadNE(Sentence s, TNamedEntity ne,
  98 + Map<String, Object> thirftId2Entity,
  99 + Map<String, Token> thiftTokenId2Token) {
  100 + List<Token> tokens = getUnderlyingSegments(ne, thirftId2Entity,
  101 + thiftTokenId2Token, false);
  102 + s.addNamedEntity(new NamedEntity(tokens));
  103 + }
  104 +
  105 + private static Map<String, Object> getThriftId2EntityMap(
  106 + TSentence thriftSent) {
  107 + Map<String, Object> idToEntity = new HashMap<>();
  108 + for (TToken tok : thriftSent.getTokens())
  109 + idToEntity.put(tok.getId(), tok);
  110 + if (thriftSent.isSetWords())
  111 + for (TSyntacticWord w : thriftSent.getWords())
  112 + idToEntity.put(w.getId(), w);
  113 + if (thriftSent.isSetNames())
  114 + for (TNamedEntity ne : thriftSent.getNames())
  115 + idToEntity.put(ne.getId(), ne);
  116 + if (thriftSent.isSetGroups())
  117 + for (TSyntacticGroup group : thriftSent.getGroups())
  118 + idToEntity.put(group.getId(), group);
  119 + return idToEntity;
  120 + }
  121 +
  122 + private static Token loadToken(Sentence s, TToken teiM)
  123 + throws MultiserviceException {
  124 + Token seg = new Token();
  125 + s.add(seg);
  126 +
  127 + seg.setOrth(teiM.getOrth());
  128 + TInterpretation interp = getTokenChosenInt(teiM);
  129 + Interpretation chosenIterpretation = new Interpretation(
  130 + interp.getCtag(), interp.getMsd(), interp.getBase());
  131 + seg.addChosenInterpretation(chosenIterpretation);
  132 +
  133 + for (TInterpretation interp2 : teiM.getInterpretations()) {
  134 + Interpretation inter = new Interpretation(interp2.getCtag(),
  135 + interp2.getMsd(), interp.getBase());
  136 + seg.addInterpretation(inter);
  137 + }
  138 + return seg;
  139 + }
  140 +
  141 + private static TInterpretation getTokenChosenInt(TToken token)
  142 + throws MultiserviceException {
  143 + TInterpretation interp = token.getChosenInterpretation();
  144 + if (interp == null || interp.getBase() == null
  145 + || interp.getBase().equals("")) {
  146 + if (token.getCandidateInterpretations() == null
  147 + || token.getCandidateInterpretations().size() == 0
  148 + || token.getCandidateInterpretations().get(0).getBase() == null
  149 + || token.getCandidateInterpretations().get(0).getBase()
  150 + .equals(""))
  151 + throw new MultiserviceException(
  152 + "No proper chosen or candidate interpretation for segment: "
  153 + + token.id);
  154 + interp = token.getCandidateInterpretations().get(0);
  155 + }
  156 + return interp;
  157 + }
  158 +
  159 + private static List<Token> getUnderlyingSegments(Object entity,
  160 + Map<String, Object> idToEntity, Map<String, Token> tokenId2Segment,
  161 + boolean headsOnly) {
  162 + List<Token> result = new ArrayList<>();
  163 +
  164 + if (entity instanceof TToken) {
  165 + result.add(tokenId2Segment.get(((TToken) entity).getId()));
  166 + return result;
  167 + }
  168 +
  169 + List<String> childIds = new ArrayList<>();
  170 + if (entity instanceof TSyntacticWord)
  171 + childIds = ((TSyntacticWord) entity).getChildIds();
  172 + else if (entity instanceof TNamedEntity)
  173 + childIds = ((TNamedEntity) entity).getChildIds();
  174 + else if (entity instanceof TSyntacticGroup)
  175 + if (headsOnly) {
  176 + childIds = new ArrayList<String>();
  177 + childIds.add(((TSyntacticGroup) entity).getSemanticHeadId());
  178 + } else
  179 + childIds = ((TSyntacticGroup) entity).getChildIds();
  180 +
  181 + for (String id : childIds)
  182 + result.addAll(getUnderlyingSegments(idToEntity.get(id), idToEntity,
  183 + tokenId2Segment, headsOnly));
  184 +
  185 + return result;
  186 + }
  187 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftSaver.java 0 → 100644
  1 +++ a/src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftSaver.java
  1 +package pl.waw.ipipan.zil.core.md.io.thrift;
  2 +
  3 +import java.util.ArrayList;
  4 +import java.util.HashMap;
  5 +import java.util.Iterator;
  6 +import java.util.List;
  7 +import java.util.Map;
  8 +
  9 +import org.apache.log4j.Logger;
  10 +
  11 +import pl.waw.ipipan.multiservice.thrift.types.MultiserviceException;
  12 +import pl.waw.ipipan.multiservice.thrift.types.TMention;
  13 +import pl.waw.ipipan.multiservice.thrift.types.TParagraph;
  14 +import pl.waw.ipipan.multiservice.thrift.types.TSentence;
  15 +import pl.waw.ipipan.multiservice.thrift.types.TText;
  16 +import pl.waw.ipipan.multiservice.thrift.types.TToken;
  17 +import pl.waw.ipipan.zil.core.md.entities.Mention;
  18 +import pl.waw.ipipan.zil.core.md.entities.Paragraph;
  19 +import pl.waw.ipipan.zil.core.md.entities.Sentence;
  20 +import pl.waw.ipipan.zil.core.md.entities.Text;
  21 +import pl.waw.ipipan.zil.core.md.entities.Token;
  22 +
  23 +public class ThriftSaver {
  24 +
  25 + private static Logger logger = Logger.getLogger(ThriftSaver.class);
  26 +
  27 + public static void updateThriftText(Text responseText, TText text)
  28 + throws MultiserviceException {
  29 +
  30 + logger.debug("Updating thrift text...");
  31 + Map<Mention, TMention> teiMention2ThriftMention = new HashMap<>();
  32 +
  33 + Iterator<TParagraph> thrPI = text.getParagraphsIterator();
  34 + Iterator<Paragraph> teiPI = responseText.iterator();
  35 + int freeMentionId = 0;
  36 + while (thrPI.hasNext() && teiPI.hasNext()) {
  37 + TParagraph thrP = thrPI.next();
  38 + Paragraph teiP = teiPI.next();
  39 +
  40 + freeMentionId = updateThriftParagraph(teiMention2ThriftMention,
  41 + freeMentionId, thrP, teiP);
  42 + }
  43 + checkIterators(thrPI, teiPI, "paragraph");
  44 + }
  45 +
  46 + private static int updateThriftParagraph(
  47 + Map<Mention, TMention> teiMention2ThriftMention, int freeMentionId,
  48 + TParagraph thrP, Paragraph teiP) throws MultiserviceException {
  49 + Iterator<TSentence> thrSI = thrP.getSentencesIterator();
  50 + Iterator<Sentence> teiSI = teiP.iterator();
  51 + while (thrSI.hasNext() && teiSI.hasNext()) {
  52 + TSentence thrS = thrSI.next();
  53 + Sentence teiS = teiSI.next();
  54 + freeMentionId = updateThriftSentence(teiMention2ThriftMention,
  55 + freeMentionId, thrS, teiS);
  56 + }
  57 + checkIterators(thrSI, teiSI, "sentence");
  58 + return freeMentionId;
  59 + }
  60 +
  61 + private static int updateThriftSentence(
  62 + Map<Mention, TMention> teiMention2ThriftMention, int id,
  63 + TSentence thrS, Sentence teiS) throws MultiserviceException {
  64 + thrS.unsetMentions();
  65 + thrS.setMentions(new ArrayList<TMention>());
  66 +
  67 + Map<Token, TToken> teiMorph2ThriftToken = new HashMap<>();
  68 + Iterator<TToken> thrMI = thrS.getTokensIterator();
  69 + Iterator<Token> teiMI = teiS.iterator();
  70 + while (thrMI.hasNext() && teiMI.hasNext()) {
  71 + teiMorph2ThriftToken.put(teiMI.next(), thrMI.next());
  72 + }
  73 + checkIterators(thrMI, teiMI, "morph");
  74 +
  75 + for (Mention m : teiS.getMentions()) {
  76 + List<String> childIds = new ArrayList<>();
  77 + List<String> headIds = new ArrayList<>();
  78 + for (Token ch : m.getSegments())
  79 + childIds.add(teiMorph2ThriftToken.get(ch).getId());
  80 + for (Token h : m.getHeadSegments())
  81 + headIds.add(teiMorph2ThriftToken.get(h).getId());
  82 +
  83 + TMention tm = new TMention("m-" + (id++), headIds, childIds,
  84 + m.isZeroSubject());
  85 + teiMention2ThriftMention.put(m, tm);
  86 + thrS.addToMentions(tm);
  87 + }
  88 + return id;
  89 + }
  90 +
  91 + private static void checkIterators(Iterator<? extends Object> one,
  92 + Iterator<? extends Object> other, String level)
  93 + throws MultiserviceException {
  94 + if (one.hasNext() || other.hasNext())
  95 + throw new MultiserviceException(
  96 + "Problem mapping interal text representation to thrift for level "
  97 + + level);
  98 + }
  99 +
  100 +}
... ...
src/main/resources/log4j.properties 0 → 100644
  1 +++ a/src/main/resources/log4j.properties
  1 +log4j.appender.stderr=org.apache.log4j.ConsoleAppender
  2 +log4j.appender.stderr.layout=org.apache.log4j.PatternLayout
  3 +log4j.appender.stderr.layout.ConversionPattern=[%p] [%C{1}] %m%n
  4 +
  5 +log4j.logger.ipipan=INFO, stderr
  6 +log4j.logger.pl.waw.ipipan=INFO, stderr
  7 +log4j.logger.org.apache.thrift=INFO, stderr
0 8 \ No newline at end of file
... ...
src/main/resources/quasi_verbs.txt 0 → 100644
  1 +++ a/src/main/resources/quasi_verbs.txt
  1 +bawić
  2 +brać
  3 +brak
  4 +brakować
  5 +być
  6 +bywać
  7 +chcieć
  8 +chodzić
  9 +ciągnąć
  10 +ciec
  11 +czas
  12 +czuć
  13 +dobiec
  14 +dobiegać
  15 +dochodzić
  16 +docierać
  17 +dojść
  18 +dotrzeć
  19 +dusić
  20 +godzić
  21 +gotować
  22 +gryźć
  23 +grzmieć
  24 +iść
  25 +jechać
  26 +kłuć
  27 +kończyć
  28 +kręcić
  29 +kropić
  30 +lać
  31 +łamać
  32 +lecieć
  33 +mieć
  34 +mieszać
  35 +móc
  36 +można
  37 +musieć
  38 +należeć
  39 +nieść
  40 +nosić
  41 +nudzić
  42 +nudzić
  43 +obejść
  44 +odbijać
  45 +odchodzić
  46 +odejmować
  47 +odejść
  48 +odrzucać
  49 +odrzucić
  50 +okazać
  51 +okazywać
  52 +opłacać
  53 +opłacić
  54 +oznaczać
  55 +pachnieć
  56 +padać
  57 +palić
  58 +palić
  59 +paść
  60 +piec
  61 +podobać
  62 +pogorszyć
  63 +pójść
  64 +ponieść
  65 +poprawiać
  66 +pora
  67 +potwierdzać
  68 +potwierdzić
  69 +powinno
  70 +pozostać
  71 +pozostawać
  72 +prosić
  73 +przechodzić
  74 +przestać
  75 +przybyć
  76 +przybywać
  77 +przyjąć
  78 +przyjmować
  79 +przypominać
  80 +przypomnieć
  81 +robić
  82 +rozerwać
  83 +rozumieć
  84 +składać
  85 +skończyć
  86 +skręcać
  87 +skręcić
  88 +słychać
  89 +śnić
  90 +śpieszyć
  91 +stać
  92 +stać
  93 +stanąć
  94 +strzelić
  95 +swędzić
  96 +świecić
  97 +szkoda
  98 +trafiać
  99 +trafić
  100 +trząść
  101 +trzeba
  102 +ucieszyć
  103 +uczynić
  104 +udać
  105 +udawać
  106 +uderzać
  107 +uderzyć
  108 +układać
  109 +ułożyć
  110 +warto
  111 +wiadomo
  112 +widać
  113 +wieść
  114 +wolno
  115 +wstyd
  116 +wychodzić
  117 +wydać
  118 +wydawać
  119 +wyjaśniać
  120 +wyjaśnić
  121 +wyjść
  122 +wypadać
  123 +wypaść
  124 +wypogadzać
  125 +wyrzucić
  126 +wystarczyć
  127 +wziąć
  128 +zabraknąć
  129 +zacząć
  130 +zaczynać
  131 +zagotować
  132 +zainteresować
  133 +zakręcić
  134 +żal
  135 +zależeć
  136 +zanieść
  137 +zanieść
  138 +zanosić
  139 +zanosić
  140 +zapowiadać
  141 +zarzucać
  142 +zastanowić
  143 +zbierać
  144 +zdarzać
  145 +zdziwić
  146 +zebrać
  147 +zemrzeć
  148 +złożyć
  149 +znać
  150 +zrobić
... ...
src/main/resources/zero_subject_model.bin 0 → 100644
No preview for this file type
src/test/java/pl/waw/ipipan/zil/core/md/MentionDetectorTest.java 0 → 100644
  1 +++ a/src/test/java/pl/waw/ipipan/zil/core/md/MentionDetectorTest.java
  1 +package pl.waw.ipipan.zil.core.md;
  2 +
  3 +import java.io.IOException;
  4 +
  5 +import org.junit.Rule;
  6 +import org.junit.Test;
  7 +import org.junit.rules.TemporaryFolder;
  8 +
  9 +public class MentionDetectorTest {
  10 +
  11 + @Rule
  12 + public TemporaryFolder results = new TemporaryFolder();
  13 +
  14 + @Test
  15 + public final void test() throws IOException {
  16 + String[] args = {
  17 + MentionDetectorTest.class.getResource("/example_test_tei/")
  18 + .getFile(),
  19 + results.newFolder().getAbsolutePath() };
  20 + Main.main(args);
  21 + }
  22 +}
... ...
src/test/java/pl/waw/ipipan/zil/core/md/detection/zero/TrainerTest.java 0 → 100644
  1 +++ a/src/test/java/pl/waw/ipipan/zil/core/md/detection/zero/TrainerTest.java
  1 +package pl.waw.ipipan.zil.core.md.detection.zero;
  2 +
  3 +import java.io.File;
  4 +import java.io.IOException;
  5 +
  6 +import org.junit.Rule;
  7 +import org.junit.Test;
  8 +import org.junit.rules.TemporaryFolder;
  9 +
  10 +public class TrainerTest {
  11 + @Rule
  12 + public TemporaryFolder results = new TemporaryFolder();
  13 +
  14 + @Test
  15 + public final void test() throws IOException {
  16 + String[] args = {
  17 + TrainerTest.class.getResource("/example_train_tei/").getFile(),
  18 + new File(results.newFolder(), "model.bin").getAbsolutePath(),
  19 + TrainerTest.class.getResource("/example_model/quasi_verbs.txt")
  20 + .getFile() };
  21 + Trainer.main(args);
  22 + }
  23 +}
... ...
src/test/resources/example_model/model.bin 0 → 100644
No preview for this file type
src/test/resources/example_model/quasi_verbs.txt 0 → 100644
  1 +++ a/src/test/resources/example_model/quasi_verbs.txt
  1 +bawić
  2 +brać
  3 +brak
  4 +brakować
  5 +być
  6 +bywać
  7 +chcieć
  8 +chodzić
  9 +ciągnąć
  10 +ciec
  11 +czas
  12 +czuć
  13 +dobiec
  14 +dobiegać
  15 +dochodzić
  16 +docierać
  17 +dojść
  18 +dotrzeć
  19 +dusić
  20 +godzić
  21 +gotować
  22 +gryźć
  23 +grzmieć
  24 +iść
  25 +jechać
  26 +kłuć
  27 +kończyć
  28 +kręcić
  29 +kropić
  30 +lać
  31 +łamać
  32 +lecieć
  33 +mieć
  34 +mieszać
  35 +móc
  36 +można
  37 +musieć
  38 +należeć
  39 +nieść
  40 +nosić
  41 +nudzić
  42 +nudzić
  43 +obejść
  44 +odbijać
  45 +odchodzić
  46 +odejmować
  47 +odejść
  48 +odrzucać
  49 +odrzucić
  50 +okazać
  51 +okazywać
  52 +opłacać
  53 +opłacić
  54 +oznaczać
  55 +pachnieć
  56 +padać
  57 +palić
  58 +palić
  59 +paść
  60 +piec
  61 +podobać
  62 +pogorszyć
  63 +pójść
  64 +ponieść
  65 +poprawiać
  66 +pora
  67 +potwierdzać
  68 +potwierdzić
  69 +powinno
  70 +pozostać
  71 +pozostawać
  72 +prosić
  73 +przechodzić
  74 +przestać
  75 +przybyć
  76 +przybywać
  77 +przyjąć
  78 +przyjmować
  79 +przypominać
  80 +przypomnieć
  81 +robić
  82 +rozerwać
  83 +rozumieć
  84 +składać
  85 +skończyć
  86 +skręcać
  87 +skręcić
  88 +słychać
  89 +śnić
  90 +śpieszyć
  91 +stać
  92 +stać
  93 +stanąć
  94 +strzelić
  95 +swędzić
  96 +świecić
  97 +szkoda
  98 +trafiać
  99 +trafić
  100 +trząść
  101 +trzeba
  102 +ucieszyć
  103 +uczynić
  104 +udać
  105 +udawać
  106 +uderzać
  107 +uderzyć
  108 +układać
  109 +ułożyć
  110 +warto
  111 +wiadomo
  112 +widać
  113 +wieść
  114 +wolno
  115 +wstyd
  116 +wychodzić
  117 +wydać
  118 +wydawać
  119 +wyjaśniać
  120 +wyjaśnić
  121 +wyjść
  122 +wypadać
  123 +wypaść
  124 +wypogadzać
  125 +wyrzucić
  126 +wystarczyć
  127 +wziąć
  128 +zabraknąć
  129 +zacząć
  130 +zaczynać
  131 +zagotować
  132 +zainteresować
  133 +zakręcić
  134 +żal
  135 +zależeć
  136 +zanieść
  137 +zanieść
  138 +zanosić
  139 +zanosić
  140 +zapowiadać
  141 +zarzucać
  142 +zastanowić
  143 +zbierać
  144 +zdarzać
  145 +zdziwić
  146 +zebrać
  147 +zemrzeć
  148 +złożyć
  149 +znać
  150 +zrobić
... ...
src/test/resources/example_test_tei/1/ann_groups.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_test_tei/1/ann_morphosyntax.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_test_tei/1/ann_named.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_test_tei/1/ann_segmentation.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_test_tei/1/ann_words.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_test_tei/1/header.xml 0 → 100644
  1 +++ a/src/test/resources/example_test_tei/1/header.xml
  1 +<?xml version="1.0" encoding="UTF-8" standalone="no"?>
  2 +<teiHeader xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xml:lang="en">
  3 + <fileDesc>
  4 + <titleStmt>
  5 + <title>Paragraphs: p-279,p-280,p-281,p-282,p-283,p-284,p-285,p-286,p-287 from NKJP text with id: IJPPAN_PolPr_TS00264</title>
  6 + </titleStmt>
  7 + </fileDesc>
  8 + <profileDesc>
  9 + <textClass>
  10 + <catRef scheme="#taxonomy-CORE" target="Dzienniki"/>
  11 + </textClass>
  12 + </profileDesc>
  13 + <revisionDesc/>
  14 +</teiHeader>
... ...
src/test/resources/example_test_tei/1/text.xml 0 → 100644
  1 +++ a/src/test/resources/example_test_tei/1/text.xml
  1 +<?xml version="1.0" ?>
  2 +<teiCorpus xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:nkjp="http://www.nkjp.pl/ns/1.0">
  3 + <xi:include href="PCC_header.xml"/>
  4 + <TEI>
  5 + <xi:include href="header.xml"/>
  6 + <text>
  7 + <body>
  8 + <p xml:id="p-1">– Sensownym rozwiązaniem będzie zmiana istniejącego oświetlenia na typ uliczny, czyli na wysokie słupy. W tym roku nie mamy jednak na to pieniędzy – mówi Anita Tyszkiewicz-Zimałka, rzecznik Urzędu Miasta w Raciborzu.</p>
  9 + <p xml:id="p-2">Przyjęto więc salomonowe rozwiązanie ograniczenia nakładów do minimum. Na odcinku od kładki dla pieszych do restauracji „Zamkowa” co druga latarnia będzie zdemontowana – A elementy z nich będą służyły do naprawiania pozostałych – wyjaśnia rzecznik.</p>
  10 + <p xml:id="p-3">Jacek Bombor</p>
  11 + <p xml:id="p-4">W ekstraklasie Francji prowadzący w tabeli zespół Jacka Bąka RC Lens wygrał wyjazdowe spotkanie z Montpellier. Sukces gości jest tym cenniejszy, że od 33 minuty grali oni w osłabieniu, bez Ferdinanda Coly, który ukarany został czerwoną kartką.</p>
  12 + <p xml:id="p-5">Montpellier – RC Lens 1:2 (0:1). Fugier (88) – Diouf (43), Pedron (65). Czerwona kartka: Coly (Lens)</p>
  13 + <p xml:id="p-6">Paris St Germain – Sedan 3:0 (1:0). Arteta (23, karny), Alex (82), Cisse (90). Czerwona kartka: Elzeard (Sedan).</p>
  14 + <p xml:id="p-7">AJ Auxerre – Nantes 2:1 (1:1). Cisse (19), Gonzales (78) – Moldovan (26). Czerwona kartka: Cetto (Nantes).</p>
  15 + <p xml:id="p-8">Lorient – Troyes 1:0 (0:0). Feindouno (60).</p>
  16 + <p xml:id="p-9">Metz – Girondins Bordeaux 1:2 (1:0). Desire Job (36) – Pauleta (71), Vikash Dhorasoo (83).</p>
  17 + </body>
  18 + </text>
  19 + </TEI>
  20 +</teiCorpus>
0 21 \ No newline at end of file
... ...
src/test/resources/example_test_tei/2/ann_groups.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_test_tei/2/ann_morphosyntax.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_test_tei/2/ann_named.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_test_tei/2/ann_segmentation.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_test_tei/2/ann_words.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_test_tei/2/header.xml 0 → 100644
  1 +++ a/src/test/resources/example_test_tei/2/header.xml
  1 +<?xml version="1.0" encoding="UTF-8" standalone="no"?>
  2 +<teiHeader xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xml:lang="en">
  3 + <fileDesc>
  4 + <titleStmt>
  5 + <title>Paragraphs: p-328,p-329 from NKJP text with id: PWN_3102000000066</title>
  6 + </titleStmt>
  7 + </fileDesc>
  8 + <profileDesc>
  9 + <textClass>
  10 + <catRef scheme="#taxonomy-CORE" target="Literatura faktu"/>
  11 + </textClass>
  12 + </profileDesc>
  13 + <revisionDesc/>
  14 +</teiHeader>
... ...
src/test/resources/example_test_tei/2/text.xml 0 → 100644
  1 +++ a/src/test/resources/example_test_tei/2/text.xml
  1 +<?xml version="1.0" ?>
  2 +<teiCorpus xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:nkjp="http://www.nkjp.pl/ns/1.0">
  3 + <xi:include href="PCC_header.xml"/>
  4 + <TEI>
  5 + <xi:include href="header.xml"/>
  6 + <text>
  7 + <body>
  8 + <p xml:id="p-1">To, że mściciele mieli prawo bezkarnie zabić nie tylko mordercę, ale i jego synów, zostało zapisane czarno na białym. Ale nie koniec na tym. Sens formułki et ille ac filii eius soli sint faidosi polega na zawężeniu kręgu osób, które mścicielom wolno zabić. Wiąże się to z poprzednią częścią zdania: ośmiokrotność zwykłego wergeldu morderca ma zapłacić sam, bez udziału dalszych krewnych. Wolno stąd wnosić, że gdyby nie zapłacono "zwykłego" wergeldu in simplo, którego trzecią część musieli pokryć boczni krewni zbrodniarza, byliby oni razem z mordercą i jego domownikami wystawieni na wróżdę strony poszkodowanej.</p>
  9 + <p xml:id="p-2">Tytuł XVIII Prawa Sasów poświęcony jest odpowiedzialności karnej pana za zabójstwo popełnione przez lita, a właściwie temu, jak można się od tej odpowiedzialności uwolnić: „Jeżeli lit z rozkazu lub z poduszczenia swojego pana zabije jakiegoś człowieka, na przykład nobila, to pan płaci główszczyznę lub podlega wróżdzie; jeżeli zaś [lit] popełni ten czyn bez wiedzy pana, to ma być przez pana wyzwolony, i [wtedy] krewni ofiary mają się mścić na nim samym [to jest sprawcy] i na pozostałych siedmiu jego krewnych, a pan lita musi przysiąc z jedenastoma współprzysiężnikami, że nie był wtajemniczony w zbrodnię" (Litus si per iuissum vel consilium domini sui hominem occiderit, ut puta nobilem, dominus conpositionem persolvat vel faidam portet; si autem absque conscientia domini hoc fecerit, dimittatur a domino, et vindicetur in illo et aliis VII consanguineis eius a propinquis occisi, et dominus liti se in hoc conscium non esse cum XI iuret).</p>
  10 + </body>
  11 + </text>
  12 + </TEI>
  13 +</teiCorpus>
0 14 \ No newline at end of file
... ...
src/test/resources/example_test_tei/3/ann_groups.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_test_tei/3/ann_morphosyntax.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_test_tei/3/ann_named.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_test_tei/3/ann_segmentation.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_test_tei/3/ann_words.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_test_tei/3/header.xml 0 → 100644
  1 +++ a/src/test/resources/example_test_tei/3/header.xml
  1 +<?xml version="1.0" encoding="UTF-8" standalone="no"?>
  2 +<teiHeader xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xml:lang="en">
  3 + <fileDesc>
  4 + <titleStmt>
  5 + <title>Paragraphs: p-38,p-39,p-40,p-41,p-42,p-43,p-44,p-45 from NKJP text with id: IJPPAN_p00111b00010a</title>
  6 + </titleStmt>
  7 + </fileDesc>
  8 + <profileDesc>
  9 + <textClass>
  10 + <catRef scheme="#taxonomy-CORE" target="Literatura piękna"/>
  11 + </textClass>
  12 + </profileDesc>
  13 + <revisionDesc/>
  14 +</teiHeader>
... ...
src/test/resources/example_test_tei/3/text.xml 0 → 100644
  1 +++ a/src/test/resources/example_test_tei/3/text.xml
  1 +<?xml version="1.0" ?>
  2 +<teiCorpus xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:nkjp="http://www.nkjp.pl/ns/1.0">
  3 + <xi:include href="PCC_header.xml"/>
  4 + <TEI>
  5 + <xi:include href="header.xml"/>
  6 + <text>
  7 + <body>
  8 + <p xml:id="p-1">Wrócił niedługo potem i szepnął coś do Margaret.</p>
  9 + <p xml:id="p-2">Oboje odwrócili się w kierunku majora Kovalsky’ego.</p>
  10 + <p xml:id="p-3">Działo się coś złego, i to bardzo.</p>
  11 + <p xml:id="p-4">Zdążył nawet wyciągnąć pistolet i postrzelić Smitha, ale Margaret była szybsza. Ciosem dłoni powaliła go na ziemię. Tracąc przytomność pomyślał, że nie spodziewał się tyle siły w tak wątłym ciele.</p>
  12 + <p xml:id="p-5">VII</p>
  13 + <p xml:id="p-6">Gdy się obudził nie miał lewej ręki. Z rany sączyła się krew. Obok leżał, dysząc ciężko, John Smith. Również krwawił, tyle, że na niebiesko. "Wszystko na opak w tym pojebanym miejscu" – pomyślał Kovalsky i znów zemdlał.</p>
  14 + <p xml:id="p-7">Gdy ocknął się drugi raz, Smith wyglądał trochę lepiej, a całą twarz miał we krwi. Czerwonej. Ręka majora obficie krwawiła do jakiegoś naczynia. Opodal uwijała się Margaret, która sprawiedliwie rozdzielała krwawy posiłek między siebie i Johna Smitha. Oboje sprawiali wrażenie bardzo szczęśliwych.</p>
  15 + <p xml:id="p-8">Margaret podeszła do Kovalsky’ego i pogłaskała po policzku. – Kochany, to był cudowny pomysł z tym wyścigiem. Naprawdę świetny. Nawet nie przypuszczałam... Nie przypuszczaliśmy... Jeden z nich się przewrócił i rozciął dłoń. Zaczął ssać i krwawienie ustało. A potem drugi, ale skaleczył się w nogę. Nie mógł sobie pomóc, więc myśmy to zrobili. Och... – Margaret jęknęła zmysłowo, a Kovalsky’ego znów ogarnęła ciemność.</p>
  16 + </body>
  17 + </text>
  18 + </TEI>
  19 +</teiCorpus>
0 20 \ No newline at end of file
... ...
src/test/resources/example_train_tei/1/ann_coreference.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/1/ann_groups.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/1/ann_mentions.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/1/ann_morphosyntax.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/1/ann_named.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/1/ann_segmentation.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/1/ann_words.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/1/header.xml 0 → 100644
  1 +++ a/src/test/resources/example_train_tei/1/header.xml
  1 +<?xml version="1.0" encoding="UTF-8" standalone="no"?>
  2 +<teiHeader xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xml:lang="en">
  3 + <fileDesc>
  4 + <titleStmt>
  5 + <title>Paragraphs: p-57,p-58,p-59,p-60 from NKJP text with id: IPIPAN_1301919980826</title>
  6 + </titleStmt>
  7 + </fileDesc>
  8 + <profileDesc>
  9 + <textClass>
  10 + <catRef scheme="#taxonomy-CORE" target="Dzienniki"/>
  11 + </textClass>
  12 + </profileDesc>
  13 + <revisionDesc/>
  14 +</teiHeader>
... ...
src/test/resources/example_train_tei/1/text.xml 0 → 100644
  1 +++ a/src/test/resources/example_train_tei/1/text.xml
  1 +<?xml version="1.0" ?>
  2 +<teiCorpus xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:nkjp="http://www.nkjp.pl/ns/1.0">
  3 + <xi:include href="PCC_header.xml"/>
  4 + <TEI>
  5 + <xi:include href="header.xml"/>
  6 + <text>
  7 + <body>
  8 + <p xml:id="p-1">W spotkaniu weźmie udział blisko 7 tysięcy braci z całej Europy, ale tylko 206 z nich będzie ubiegało się o tytuł Europejskiego Króla Kurkowego. - Wezmę udział w strzelaniu, choć moje szanse są marne. Wynika to przede wszystkim z moich obowiązków gospodarza spotkań; w tym nawale pracy ciężko mi będzie się skupić na strzelaniu - przewiduje Zdzisław Maj, prezes krakowskiego Bractwa Kurkowego, panujący Król Kurkowy.</p>
  9 + <p xml:id="p-2">Strzelanie o tytuł Europejskiego Króla Kurkowego będzie się odbywało w kilku etapach. Do finału zostanie dopuszczonych 27 braci - jeden z nich otrzyma tytuł Europejskiego Króla Kurkowego odbierając go obecnie panującemu Wilfriedowi Stammermannowi. - Król nie otrzymuje żadnych nagród finansowych, ale taki tytuł jest ogromnym zaszczytem; król jest np. zapraszany na posiedzenia Parlamentu Europejskiego - mówi Zdzisław Maj.</p>
  10 + <p xml:id="p-3">Największą atrakcją 12. Europejskich Spotkań Bractw Strzeleckich będzie wielka parada, która rozpocznie się w niedzielę o godz. 13. Kilkuset braci w historycznych strojach przejdzie z Błoń na Rynek ulicami: Piłsudskiego, Straszewskiego, Franciszkańską i Grodzką.</p>
  11 + <p xml:id="p-4">Początki istnienia Bractwa Kurkowego w Krakowie sięgają XIII wieku. Skupiało ono znamienitych obywateli, kupców i rzemieślników pragnących wspomóc obronność miasta. Wielkim świętem bractwa był turniej, który odbywał się na strzelnicy zwanej Celestatem. Zawody trwały zwykle trzy dni. Strzelano do drewnianego kura umocowanego na wysokiej żerdzi. Brat, który zdołał celnym strzałem strącić ostatni jego fragment zdobywał miano Króla Kurkowego. Z tym tytułem wiązały się nie tylko honory, ale także przywileje: Rada Miejska zwalniała jego posiadacza m.in. z obowiązku płacenia podatków (ten zwyczaj utrzymał się do dziś).</p>
  12 + </body>
  13 + </text>
  14 + </TEI>
  15 +</teiCorpus>
0 16 \ No newline at end of file
... ...
src/test/resources/example_train_tei/2/ann_coreference.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/2/ann_groups.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/2/ann_mentions.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/2/ann_morphosyntax.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/2/ann_named.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/2/ann_segmentation.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/2/ann_words.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/2/header.xml 0 → 100644
  1 +++ a/src/test/resources/example_train_tei/2/header.xml
  1 +<?xml version="1.0" encoding="UTF-8" standalone="no"?>
  2 +<teiHeader xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xml:lang="en">
  3 + <fileDesc>
  4 + <titleStmt>
  5 + <title>Paragraphs: p-437,p-438,p-439,p-440,p-441,p-442,p-443,p-444,p-445 from NKJP text with id: IJPPAN_PolPr_SlP00841</title>
  6 + </titleStmt>
  7 + </fileDesc>
  8 + <profileDesc>
  9 + <textClass>
  10 + <catRef scheme="#taxonomy-CORE" target="Dzienniki"/>
  11 + </textClass>
  12 + </profileDesc>
  13 + <revisionDesc/>
  14 +</teiHeader>
... ...
src/test/resources/example_train_tei/2/text.xml 0 → 100644
  1 +++ a/src/test/resources/example_train_tei/2/text.xml
  1 +<?xml version="1.0" ?>
  2 +<teiCorpus xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:nkjp="http://www.nkjp.pl/ns/1.0">
  3 + <xi:include href="PCC_header.xml"/>
  4 + <TEI>
  5 + <xi:include href="header.xml"/>
  6 + <text>
  7 + <body>
  8 + <p xml:id="p-1">Ernest i Agnieszka nie planowali, że będą mieli wielką, babską rodzinę. Ale tak wyszło. – I całe szczęście. Lepiej się dogaduję z dziewczętami – cieszy się Ernest Kwiecień.</p>
  9 + <p xml:id="p-2">W Wigilię do jego obowiązków, poza dostarczeniem choinki, należeć będzie zmywanie naczyń. Agnieszka zrobi pierogi, ugotuje barszcz z uszkami, usmaży karpia. Córki upieką ciasta. Potem przyjdzie czas na prezenty. Może to nawet będą empetrójki, o których marzą starsze dziewczyny.</p>
  10 + <p xml:id="p-3">Jodełek sadzimy mniej</p>
  11 + <p xml:id="p-4">Leśniczy, od którego pan Ernest przywozi choinkę, mieszka kilka kilometrów od domu Kwietniów. On także nie wyobraża sobie świąt bez prawdziwego świerku. – I musi być kiczowaty – uśmiecha się Gabriel Grobelny, nadleśniczy wałbrzyski.</p>
  12 + <p xml:id="p-5">To znaczy, że powinny na nim wisieć ozdoby zrobione przez dzieci, przechowywane latami, wyciągane na tę jedyną okazję.</p>
  13 + <p xml:id="p-6">Pan Gabriel ma dwóch synów i trzy córki. W domu została najmłodsza, 12-letnia, ale na święta zjadą wszyscy. I ubiorą choinkę. – Żona rozwiesi anielskie włosy, ja podłączę lampki – w domu nadleśniczego podział świątecznych ról jest określony.</p>
  14 + <p xml:id="p-7">W dolnośląskich lasach najwięcej jest świerków. Na plantacjach sadzą także coraz popularniejsze jodły z miękkimi igłami.</p>
  15 + <p xml:id="p-8">– Ale i tych jodełek sadzimy już mniej. To nie lata dziewięćdziesiąte, gdy sprzedawaliśmy prawie wszystkie wyhodowane drzewka – wspomina nadleśniczy.</p>
  16 + <p xml:id="p-9">U Grobelnego choinkę można sobie wybrać. – Mamy rodziny, w których co roku ojciec przyjeżdża z synem, by samemu ściąć drzewko. Taką mają tradycję – dodaje pan Gabriel.</p>
  17 + </body>
  18 + </text>
  19 + </TEI>
  20 +</teiCorpus>
0 21 \ No newline at end of file
... ...
src/test/resources/example_train_tei/3/ann_coreference.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/3/ann_groups.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/3/ann_mentions.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/3/ann_morphosyntax.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/3/ann_named.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/3/ann_segmentation.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/3/ann_words.xml.gz 0 → 100644
No preview for this file type
src/test/resources/example_train_tei/3/header.xml 0 → 100644
  1 +++ a/src/test/resources/example_train_tei/3/header.xml
  1 +<?xml version="1.0" encoding="UTF-8" standalone="no"?>
  2 +<teiHeader xmlns="http://www.tei-c.org/ns/1.0" xmlns:nkjp="http://www.nkjp.pl/ns/1.0" xml:lang="en">
  3 + <fileDesc>
  4 + <titleStmt>
  5 + <title>Paragraphs: p-6,p-7,p-8,p-9 from NKJP text with id: PELCRA_1303919960926</title>
  6 + </titleStmt>
  7 + </fileDesc>
  8 + <profileDesc>
  9 + <textClass>
  10 + <catRef scheme="#taxonomy-CORE" target="Dzienniki"/>
  11 + </textClass>
  12 + </profileDesc>
  13 + <revisionDesc/>
  14 +</teiHeader>
... ...
src/test/resources/example_train_tei/3/text.xml 0 → 100644
  1 +++ a/src/test/resources/example_train_tei/3/text.xml
  1 +<?xml version="1.0" ?>
  2 +<teiCorpus xmlns="http://www.tei-c.org/ns/1.0" xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:nkjp="http://www.nkjp.pl/ns/1.0">
  3 + <xi:include href="PCC_header.xml"/>
  4 + <TEI>
  5 + <xi:include href="header.xml"/>
  6 + <text>
  7 + <body>
  8 + <p xml:id="p-1">Cena życia</p>
  9 + <p xml:id="p-2">Z tego pogromu ocalało kilkudziesięciu Żydów, a wśród nich rodzina Mosze Sonensona. Przed wojną była to w skali miasteczka rodzina bogata. Sonensonowie mieli garbarnię. Nie udało mi się dociec, u kogo mianowicie przechowywali się Sonensonowie oraz pozostali Żydzi w czasie okupacji niemieckiej. Faktem pozostaje natomiast, że okupację tę przeżyli. Faktem oczywistym pozostaje i to, że liczne rodziny polskie - w Ejszyszkach i w pobliskich okolicach - przechowywały Żydów. Parę kilometrów od Ejszyszek, w Korkucianach (w folwarku Lebiedniki), żołnierz AK Kazimierz Korkuć w czasie wojny w swoim domu przechowywał 28 Żydów. Od studni do piwnic domu był przekopany tunel, dzięki czemu mieli wodę. Natomiast w skali siatki AK Kazimierz Korkuć przechowywał około 70 Żydów. Rodzina Świeczków również przechowywała Żydów. W tamtych stronach liczne rodziny polskie postępowały podobnie.</p>
  10 + <p xml:id="p-3">Prawdą jest również i to, że Żydzi za swe przechowanie płacili. Płacili za utrzymanie i chyba jeszcze - za ryzyko. O tym dzisiaj raczej tu się nie mówi, ale prawdopodobnie różnie z tym było: jedni za pieniądze, inni - z odruchu serca. Ryzykowali i Polacy, i Żydzi. Te rachunki mogły wyglądać bardzo różnie.</p>
  11 + <p xml:id="p-4">Mieszkam w jednej z podwileńskich wsi. Otóż w tej mojej wsi pewien gospodarz - Polak - przechowywał w czasie wojny młodą Żydówkę. Spodobała mu się, z czego wynikł dramat. Zdenerwowana żona doniosła na policję. Aresztowano Żydówkę razem z gospodarzem, przerażona kobieta próbowała ocalić męża. Zanim uzbierała potrzebną sumę na łapówkę, było już za późno - rozstrzelano nie tylko Żydówkę, ale i gospodarza. Czy żonę tego straceńca można nazwać antysemitką?</p>
  12 + </body>
  13 + </text>
  14 + </TEI>
  15 +</teiCorpus>
0 16 \ No newline at end of file
... ...