indexing_formats_folia.html 17.2 KB
<!DOCTYPE html>
<!--
 | Generated by Apache Maven Doxia Site Renderer 1.7.4 at 2017-09-25 
 | Rendered using Apache Maven Fluido Skin 1.5
-->
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <meta name="Date-Revision-yyyymmdd" content="20170925" />
    <meta http-equiv="Content-Language" content="en" />
    <title>Multi Tier Annotation Search &#x2013; FoLiA</title>
    <link rel="stylesheet" href="./css/apache-maven-fluido-1.5.min.css" />
    <link rel="stylesheet" href="./css/site.css" />
    <link rel="stylesheet" href="./css/print.css" media="print" />

      
    <script type="text/javascript" src="./js/apache-maven-fluido-1.5.min.js"></script>

                      </head>
        <body class="topBarDisabled">
          
        
    
        <div class="container-fluid">
          <div id="banner">
        <div class="pull-left">
                                <div id="bannerLeft">
                <h2>MTAS</h2>
                </div>
                      </div>
        <div class="pull-right">                  <a href="http://www.meertens.knaw.nl/" id="bannerRight">
                                                                                                <img src="images/meertens.png"  alt="Meertens Instituut" width="93" height="104"/>
                </a>
      </div>
        <div class="clear"><hr/></div>
      </div>

      <div id="breadcrumbs">
        <ul class="breadcrumb">
              
                  <li id="projectVersion">Version: 6.6.1
                          <span class="divider">|</span>
                    </li>
                              <li class="">
                    <a href="index.html" title="Mtas">
        Mtas</a>
                    <span class="divider">/</span>
      </li>
        <li class="active ">FoLiA</li>
                
              
                  <li id="publishDate" class="pull-right">Last Published: 2017-09-25</li>
            
                            </ul>
      </div>

            
      <div class="row-fluid">
        <div id="leftColumn" class="span2">
          <div class="well sidebar-nav">
              
                <ul class="nav nav-list">
                    <li class="nav-header">Mtas</li>
                              
      <li>
  
                          <a href="index.html" title="Introduction">
          <span class="none"></span>
        Introduction</a>
            </li>
                
      <li>
  
                          <a href="features.html" title="Features">
          <span class="none"></span>
        Features</a>
            </li>
                                                                                                      
      <li>
  
                          <a href="installation.html" title="Getting started">
          <span class="icon-chevron-right"></span>
        Getting started</a>
                  </li>
                                                                                                                                                                                                                
      <li>
  
                          <a href="indexing.html" title="Indexing">
          <span class="icon-chevron-down"></span>
        Indexing</a>
                    <ul class="nav nav-list">
                    
      <li>
  
                          <a href="indexing_configuration.html" title="Configuration">
          <span class="none"></span>
        Configuration</a>
            </li>
                    
      <li>
  
                          <a href="indexing_mapping.html" title="Mapping">
          <span class="none"></span>
        Mapping</a>
            </li>
                                                                                                                                                  
      <li>
  
                          <a href="indexing_formats.html" title="Formats">
          <span class="icon-chevron-down"></span>
        Formats</a>
                    <ul class="nav nav-list">
                    
      <li class="active">
  
            <a href="#"><span class="none"></span>FoLiA</a>
          </li>
                    
      <li>
  
                          <a href="indexing_formats_tei.html" title="TEI">
          <span class="none"></span>
        TEI</a>
            </li>
                    
      <li>
  
                          <a href="indexing_formats_chat.html" title="CHAT">
          <span class="none"></span>
        CHAT</a>
            </li>
                    
      <li>
  
                          <a href="indexing_formats_sketch.html" title="Sketch">
          <span class="none"></span>
        Sketch</a>
            </li>
                    
      <li>
  
                          <a href="indexing_formats_crm.html" title="CRM">
          <span class="none"></span>
        CRM</a>
            </li>
              </ul>
        </li>
              </ul>
        </li>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
      <li>
  
                          <a href="search.html" title="Search">
          <span class="icon-chevron-right"></span>
        Search</a>
                  </li>
                
      <li>
  
                          <a href="download.html" title="Download">
          <span class="none"></span>
        Download</a>
            </li>
                              <li class="nav-header">Project Documentation</li>
                                                                                                                                                                                            
      <li>
  
                          <a href="project-info.html" title="Project Information">
          <span class="icon-chevron-right"></span>
        Project Information</a>
                  </li>
                                                                                                                                          
      <li>
  
                          <a href="project-reports.html" title="Project Reports">
          <span class="icon-chevron-right"></span>
        Project Reports</a>
                  </li>
            </ul>
              
                
          <hr />

           <div id="poweredBy">
                            <div class="clear"></div>
                            <div class="clear"></div>
                            <div class="clear"></div>
                            <div class="clear"></div>
                             <a href="http://maven.apache.org/" title="Built by Maven" class="poweredBy">
        <img class="builtBy" alt="Built by Maven" src="./images/logos/maven-feather.png" />
      </a>
                  </div>
          </div>
        </div>
        
                
        <div id="bodyColumn"  class="span10" >
                                  
            <h1>FoLiA</h1>
<p>For indexing <a class="externalLink" href="https://proycon.github.io/folia/">FoLiA</a> resources, the <i>mtas.analysis.parser.MtasFoliaParser</i> extending the abstract <i>MtasXMLParser</i> is available; full examples of configuration files are provided on <a class="externalLink" href="https://github.com/meertensinstituut/mtas/tree/master/conf/parser/mtas">GitHub</a>.</p>

<div class="source">
<div class="source"><pre class="prettyprint">&lt;!-- START CONFIGURATION MTAS PARSER --&gt;
&lt;parser name=&quot;mtas.analysis.parser.MtasFoliaParser&quot;&gt;
...
  &lt;!-- START MAPPINGS --&gt;
  &lt;mappings&gt;
  ...
  &lt;/mappings&gt;
  &lt;!-- END MAPPINGS --&gt;
  ...
&lt;/parser&gt;
&lt;!-- END CONFIGURATION MTAS PARSER --&gt;
</pre></div></div>
<p>The <a href="indexing_configuration.html#configuration">configuration file</a> defining the <a href="indexing_mapping.html">mapping</a> has some specific settings for the FoLiA parser distinguishing several types of elements within the XML-based FoLiA resource: </p>

<ul>
  
<li><a href="indexing_formats_folia.html#word">words</a> : the basic tokenisation layer</li>
  
<li><a href="indexing_formats_folia.html#wordAnnotation">wordAnnotations</a> : annotations occurring within a word</li>
  
<li><a href="indexing_formats_folia.html#group">groups</a> : containing one or multiple words</li>
  
<li><a href="indexing_formats_folia.html#groupAnnotation">groupAnnotations</a> : annotations occurring within a group</li>
  
<li><a href="indexing_formats_folia.html#relation">relations</a> : containing one or multiple references</li>
  
<li><a href="indexing_formats_folia.html#reference">references</a> : elements referring to (typically) words by id</li>
</ul>
<p>Inside the <i>mappings</i> part of the configuration file, all elements are defined that may be mapped onto the index structure : <i>words</i>, <i>wordAnnotations</i>, <i>groups</i>, <i>groupAnnotations</i> and <i>relations</i>. Outside the <i>mappings</i> part the references can be defined, since a reference itself will never be mapped directly onto the index structure. </p>
<p>The use and meaning of the different elements is illustrated and explained by some examples. </p>
<p><a name="word"></a><b>Words</b></p>
<p>In the parser configuration, a word can be defined by </p>

<div class="source">
<div class="source"><pre class="prettyprint">&lt;mapping type=&quot;word&quot; name=&quot;w&quot;&gt;
&lt;/mapping&gt;
</pre></div></div>
<p>This will recognize every occurring w-tag within the FoLiA-resource as a word, defining the basic tokenization to be used in the mapping. To add a token for each occurring word, we have to add a token definition, for example</p>

<div class="source">
<div class="source"><pre class="prettyprint">&lt;mapping type=&quot;word&quot; name=&quot;w&quot;&gt;
  &lt;token type=&quot;string&quot; offset=&quot;false&quot; realoffset=&quot;false&quot; parent=&quot;false&quot;&gt;
    &lt;pre&gt;
      &lt;item type=&quot;name&quot; /&gt;
    &lt;/pre&gt;    
  &lt;/token&gt;
&lt;/mapping&gt;
</pre></div></div>
<p>Here the <i>prefix</i> is chosen to equal the <i>name</i> of the matching tag, and no <i>offset</i>, <i>realoffset</i> or <i>parent</i> will be included. To add tokens only conditionally, and/or to include the value of, for example, a provided attribute, we can define</p>

<div class="source">
<div class="source"><pre class="prettyprint">&lt;mapping type=&quot;word&quot; name=&quot;w&quot;&gt;
  &lt;token type=&quot;string&quot; offset=&quot;false&quot; realoffset=&quot;false&quot; parent=&quot;false&quot;&gt;
    &lt;pre&gt;
      &lt;item type=&quot;name&quot; /&gt;
    &lt;/pre&gt;
    &lt;post&gt;
      &lt;item type=&quot;attribute&quot; name=&quot;class&quot; /&gt;
    &lt;/post&gt;
  &lt;/token&gt;
  &lt;condition&gt;
    &lt;item type=&quot;attribute&quot; name=&quot;class&quot; /&gt;
    &lt;item type=&quot;attribute&quot; name=&quot;class&quot; not=&quot;true&quot; condition=&quot;WORD&quot; /&gt;
  &lt;/condition&gt;
&lt;/mapping&gt;
</pre></div></div>
<p>This will add tokens to the index for all w-tags with the attribute <i>class</i> set and unequal to &#x201c;WORD&#x201d;. The resulting single position tokens will have <i>prefix</i> value &#x201c;w&#x201d; and <i>postfix</i> value equal to the provided <i>class</i>.</p>
<p>If <i>parent</i> was set to <i>true</i>, the id of the first parenting <a href="indexing_formats_folia.html#group">group</a> would have been used as <i>parentId</i> for the resulting token.</p>
<p><a name="wordAnnotation"></a><b>Word annotations</b></p>
<p>All elements occurring within a <a href="indexing_formats_folia.html#word">word</a> can be defined as <i>wordAnnotation</i>. </p>

<div class="source">
<div class="source"><pre class="prettyprint">&lt;mapping type=&quot;wordAnnotation&quot; name=&quot;lemma&quot;&gt;
  &lt;token type=&quot;string&quot; offset=&quot;false&quot; realoffset=&quot;false&quot; parent=&quot;false&quot;&gt;
    &lt;pre&gt;
      &lt;item type=&quot;name&quot; /&gt;
    &lt;/pre&gt;
    &lt;post&gt;
      &lt;item type=&quot;attribute&quot; name=&quot;class&quot; /&gt;
    &lt;/post&gt;
  &lt;/token&gt;
  &lt;condition&gt;
    &lt;item type=&quot;attribute&quot; name=&quot;class&quot; /&gt;
  &lt;/condition&gt;
&lt;/mapping&gt;
</pre></div></div>
<p>As illustrated in the next sample, not only attributes can be used, but also the text value within a matching tag. Furthermore, multiple tokens can be generated from the same matching element. Finally, a filter may be applied.</p>

<div class="source">
<div class="source"><pre class="prettyprint">&lt;mapping type=&quot;wordAnnotation&quot; name=&quot;t&quot;&gt;
  &lt;token type=&quot;string&quot; offset=&quot;false&quot; parent=&quot;true&quot;&gt;
    &lt;pre&gt;
      &lt;item type=&quot;name&quot; /&gt;
    &lt;/pre&gt;
    &lt;post&gt;
      &lt;item type=&quot;text&quot; /&gt;
    &lt;/post&gt;
  &lt;/token&gt;
  &lt;token type=&quot;string&quot; offset=&quot;false&quot; realoffset=&quot;false&quot; parent=&quot;false&quot;&gt;
    &lt;pre&gt;
      &lt;item type=&quot;name&quot; /&gt;
      &lt;item type=&quot;string&quot; value=&quot;_lc&quot; /&gt;
    &lt;/pre&gt;
    &lt;post&gt;
      &lt;item type=&quot;text&quot; filter=&quot;ascii,lowercase&quot; /&gt;
    &lt;/post&gt;
  &lt;/token&gt;  
&lt;/mapping&gt;
</pre></div></div>
<p>If <i>parent</i> is set to true, the id of the first parenting <a href="indexing_formats_folia.html#group">group</a> will be used as <i>parentId</i> for the generated token.</p>
<p><a name="group"></a><b>Groups</b></p>
<p>Elements containing one or multiple <a href="indexing_formats_folia.html#word">words</a> can be defined as <i>group</i>.</p>

<div class="source">
<div class="source"><pre class="prettyprint">&lt;mapping type=&quot;group&quot; name=&quot;s&quot;&gt;
  &lt;token type=&quot;string&quot; offset=&quot;false&quot; parent=&quot;true&quot;&gt;
    &lt;pre&gt;
      &lt;item type=&quot;name&quot; /&gt;
    &lt;/pre&gt;
    &lt;post&gt;
      &lt;item type=&quot;attribute&quot; name=&quot;class&quot; /&gt;
    &lt;/post&gt;
  &lt;/token&gt;
&lt;/mapping&gt;
</pre></div></div>
<p>The id of the first parenting group is used as <i>parentId</i>.</p>
<p><a name="groupAnnotation"></a><b>Group annotations</b></p>
<p>Elements within a <a href="indexing_formats_folia.html#group">group</a> and not containing one or multiple <a href="indexing_formats_folia.html#word">words</a> can be defined as <i>groupAnnotation</i>.</p>

<div class="source">
<div class="source"><pre class="prettyprint">&lt;mapping type=&quot;groupAnnotation&quot; name=&quot;lang&quot;&gt;
  &lt;token type=&quot;string&quot; offset=&quot;false&quot; realoffset=&quot;false&quot; parent=&quot;false&quot;&gt;
    &lt;pre&gt;
      &lt;item type=&quot;name&quot; /&gt;
    &lt;/pre&gt;
    &lt;post&gt;
      &lt;item type=&quot;attribute&quot; name=&quot;class&quot; /&gt;
    &lt;/post&gt;
  &lt;/token&gt;
&lt;/mapping&gt;
</pre></div></div>
<p>If <i>parent</i> was set to <i>true</i>, the id of the first parenting <a href="indexing_formats_folia.html#group">group</a> would have been used as <i>parentId</i> for the resulting token.</p>
<p><a name="relation"></a><b>Relations</b></p>
<p>Elements containing one or multiple <a href="indexing_formats_folia.html#reference">references</a> and not containing one or multiple <a href="indexing_formats_folia.html#word">words</a> can be defined as <i>relation</i>.</p>

<div class="source">
<div class="source"><pre class="prettyprint">&lt;mapping type=&quot;relation&quot; name=&quot;entities&quot;&gt;
&lt;/mapping&gt;
&lt;mapping type=&quot;relation&quot; name=&quot;entity&quot;&gt;
  &lt;token type=&quot;string&quot; offset=&quot;false&quot; realoffset=&quot;false&quot; parent=&quot;false&quot;&gt;
    &lt;pre&gt;
      &lt;item type=&quot;name&quot; /&gt;
    &lt;/pre&gt;
    &lt;post&gt;
      &lt;item type=&quot;attribute&quot; name=&quot;class&quot; /&gt;
    &lt;/post&gt;
    &lt;payload&gt;
      &lt;item type=&quot;attribute&quot; name=&quot;confidence&quot; /&gt;
    &lt;/payload&gt;
  &lt;/token&gt;
  &lt;condition&gt;
    &lt;item type=&quot;ancestor&quot; number=&quot;1&quot; /&gt;
    &lt;item type=&quot;ancestorName&quot; condition=&quot;entities&quot; /&gt;
  &lt;/condition&gt;
&lt;/mapping&gt;
</pre></div></div>
<p><a name="reference"></a><b>References</b></p>
<p>Elements may be defined as <i>reference</i> to a word, for example <i>wref</i> elements referring in the <i>ref</i> attribute to the <i>id</i> of words. </p>

<div class="source">
<div class="source"><pre class="prettyprint">&lt;references&gt;
  &lt;reference name=&quot;wref&quot; ref=&quot;id&quot; /&gt;
&lt;/references&gt;
</pre></div></div>
                  </div>
            </div>
          </div>

    <hr/>

    <footer>
            <div class="container-fluid">
                      <div class="row-fluid">
                                      <p >Copyright &copy;                    2017
                        <a href="http://www.meertens.knaw.nl/">Meertens Institute</a>.
            All rights reserved.    
      </p>
                </div>

        
                </div>
    </footer>
        </body>
</html>