indexing_formats_sketch.html 11.6 KB
<!DOCTYPE html>
<!--
 | Generated by Apache Maven Doxia Site Renderer 1.7.4 at 2017-10-12 
 | Rendered using Apache Maven Fluido Skin 1.5
-->
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <meta name="Date-Revision-yyyymmdd" content="20171012" />
    <meta http-equiv="Content-Language" content="en" />
    <title>Multi Tier Annotation Search &#x2013; Sketch Engine</title>
    <link rel="stylesheet" href="./css/apache-maven-fluido-1.5.min.css" />
    <link rel="stylesheet" href="./css/site.css" />
    <link rel="stylesheet" href="./css/print.css" media="print" />

      
    <script type="text/javascript" src="./js/apache-maven-fluido-1.5.min.js"></script>

                      </head>
        <body class="topBarDisabled">
          
        
    
        <div class="container-fluid">
          <div id="banner">
        <div class="pull-left">
                                <div id="bannerLeft">
                <h2>MTAS</h2>
                </div>
                      </div>
        <div class="pull-right">                  <a href="http://www.meertens.knaw.nl/" id="bannerRight">
                                                                                                <img src="images/meertens.png"  alt="Meertens Instituut" width="93" height="104"/>
                </a>
      </div>
        <div class="clear"><hr/></div>
      </div>

      <div id="breadcrumbs">
        <ul class="breadcrumb">
              
                  <li id="projectVersion">Version: 7.0.1
                          <span class="divider">|</span>
                    </li>
                              <li class="">
                    <a href="index.html" title="Mtas">
        Mtas</a>
                    <span class="divider">/</span>
      </li>
        <li class="active ">Sketch Engine</li>
                
              
                  <li id="publishDate" class="pull-right">Last Published: 2017-10-12</li>
            
                            </ul>
      </div>

            
      <div class="row-fluid">
        <div id="leftColumn" class="span2">
          <div class="well sidebar-nav">
              
                <ul class="nav nav-list">
                    <li class="nav-header">Mtas</li>
                              
      <li>
  
                          <a href="index.html" title="Introduction">
          <span class="none"></span>
        Introduction</a>
            </li>
                
      <li>
  
                          <a href="features.html" title="Features">
          <span class="none"></span>
        Features</a>
            </li>
                                                                                                      
      <li>
  
                          <a href="installation.html" title="Getting started">
          <span class="icon-chevron-right"></span>
        Getting started</a>
                  </li>
                                                                                                                                                                                                                
      <li>
  
                          <a href="indexing.html" title="Indexing">
          <span class="icon-chevron-down"></span>
        Indexing</a>
                    <ul class="nav nav-list">
                    
      <li>
  
                          <a href="indexing_configuration.html" title="Configuration">
          <span class="none"></span>
        Configuration</a>
            </li>
                    
      <li>
  
                          <a href="indexing_mapping.html" title="Mapping">
          <span class="none"></span>
        Mapping</a>
            </li>
                                                                                                                                                  
      <li>
  
                          <a href="indexing_formats.html" title="Formats">
          <span class="icon-chevron-down"></span>
        Formats</a>
                    <ul class="nav nav-list">
                    
      <li>
  
                          <a href="indexing_formats_folia.html" title="FoLiA">
          <span class="none"></span>
        FoLiA</a>
            </li>
                    
      <li>
  
                          <a href="indexing_formats_tei.html" title="TEI">
          <span class="none"></span>
        TEI</a>
            </li>
                    
      <li>
  
                          <a href="indexing_formats_chat.html" title="CHAT">
          <span class="none"></span>
        CHAT</a>
            </li>
                    
      <li class="active">
  
            <a href="#"><span class="none"></span>Sketch</a>
          </li>
                    
      <li>
  
                          <a href="indexing_formats_crm.html" title="CRM">
          <span class="none"></span>
        CRM</a>
            </li>
              </ul>
        </li>
              </ul>
        </li>
                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
      <li>
  
                          <a href="search.html" title="Search">
          <span class="icon-chevron-right"></span>
        Search</a>
                  </li>
                
      <li>
  
                          <a href="download.html" title="Download">
          <span class="none"></span>
        Download</a>
            </li>
                              <li class="nav-header">Project Documentation</li>
                                                                                                                                                                                            
      <li>
  
                          <a href="project-info.html" title="Project Information">
          <span class="icon-chevron-right"></span>
        Project Information</a>
                  </li>
                                                                                                                                          
      <li>
  
                          <a href="project-reports.html" title="Project Reports">
          <span class="icon-chevron-right"></span>
        Project Reports</a>
                  </li>
            </ul>
              
                
          <hr />

           <div id="poweredBy">
                            <div class="clear"></div>
                            <div class="clear"></div>
                            <div class="clear"></div>
                            <div class="clear"></div>
                             <a href="http://maven.apache.org/" title="Built by Maven" class="poweredBy">
        <img class="builtBy" alt="Built by Maven" src="./images/logos/maven-feather.png" />
      </a>
                  </div>
          </div>
        </div>
        
                
        <div id="bodyColumn"  class="span10" >
                                  
            <h1>Sketch Engine</h1>
<p>For indexing <a class="externalLink" href="https://www.sketchengine.co.uk/word-sketch-index-format/">Sketch Engine</a> resources, the <i>mtas.analysis.parser.MtasSketchParser</i> extending the <i>MtasBasicParser</i> is available; full examples of configuration files are provided on <a class="externalLink" href="https://github.com/meertensinstituut/mtas/tree/master/conf/parser/mtas">GitHub</a>.</p>

<div class="source">
<div class="source"><pre class="prettyprint">&lt;!-- START CONFIGURATION MTAS PARSER --&gt;
&lt;parser name=&quot;mtas.analysis.parser.MtasSketchParser&quot;&gt;
...
  &lt;!-- START MAPPINGS --&gt;
  &lt;mappings&gt;
  ...
  &lt;/mappings&gt;
  &lt;!-- END MAPPINGS --&gt;
  ...
&lt;/parser&gt;
&lt;!-- END CONFIGURATION MTAS PARSER --&gt;
</pre></div></div>
<p>The <a href="indexing_configuration.html#configuration">configuration file</a> defining the <a href="indexing_mapping.html">mapping</a> has some specific settings for the Sketch parser distinguishing several types of elements within the XML-based Sketch resource: </p>

<ul>
  
<li><a href="indexing_formats_sketch.html#word">words</a> : the basic tokenisation layer</li>
  
<li><a href="indexing_formats_sketch.html#wordAnnotation">wordAnnotations</a> : annotations occurring within a word</li>
  
<li><a href="indexing_formats_sketch.html#group">groups</a> : containing one or multiple words</li>
</ul>
<p>All these elements are defined inside the <i>mappings</i> part of the configuration file. The use and meaning of the different elements are illustrated and explained by some examples. </p>
<p><a name="word"></a><b>Words</b></p>
<p>All rows in the Sketch resource that do not consist of a start or end tag are assumed to be a set of tab-separated values. Such a row can potentially be interpreted as a <i>word</i>, with each value an associated <i>wordAnnotation</i>. In the parser configuration, conditions can be put on which potential items in the Sketch resource should really be interpreted as a <i>word</i>: </p>

<div class="source">
<div class="source"><pre class="prettyprint">&lt;mapping type=&quot;word&quot;&gt;
  &lt;condition&gt;
    &lt;item type=&quot;ancestorGroupName&quot; not=&quot;true&quot; condition=&quot;field&quot; /&gt;
  &lt;/condition&gt;
&lt;/mapping&gt;
</pre></div></div>
<p>The example above excludes potential words that are contained within a <i>field</i> tag.</p>
<p><a name="wordAnnotation"></a><b>Word annotations</b></p>
<p>Each value in the set of tab-separated values from a word is a potential <i>wordAnnotation</i>. A mapping on such a <i>wordAnnotation</i> can be defined by referring to the position of the value in the <i>word</i> definition.</p>

<div class="source">
<div class="source"><pre class="prettyprint">&lt;mapping type=&quot;wordAnnotation&quot; name=&quot;0&quot;&gt;
  &lt;token type=&quot;string&quot; offset=&quot;false&quot; parent=&quot;false&quot;&gt;
    &lt;pre&gt;
      &lt;item type=&quot;string&quot; value=&quot;t&quot; /&gt;
    &lt;/pre&gt;
    &lt;post&gt;
      &lt;item type=&quot;text&quot; /&gt;
    &lt;/post&gt;
  &lt;/token&gt;
&lt;/mapping&gt;  
</pre></div></div>
<p>The example above will add a token based on the first <i>wordAnnotation</i> value from each <i>word</i>.</p>
<p><a name="group"></a><b>Groups</b></p>
<p>Rows containing start and end tags in the Sketch resource define potential groups. These groups must contain words, and mappings can be configured by referring to their name.</p>

<div class="source">
<div class="source"><pre class="prettyprint">&lt;mapping type=&quot;group&quot; name=&quot;s&quot;&gt;
  &lt;token type=&quot;string&quot; offset=&quot;false&quot;&gt;
    &lt;pre&gt;
      &lt;item type=&quot;name&quot; /&gt;
    &lt;/pre&gt;
    &lt;post&gt;
      &lt;item type=&quot;attribute&quot; name=&quot;class&quot; /&gt;
    &lt;/post&gt;
  &lt;/token&gt;        
&lt;/mapping&gt;
</pre></div></div>
                  </div>
            </div>
          </div>

    <hr/>

    <footer>
            <div class="container-fluid">
                      <div class="row-fluid">
                                      <p >Copyright &copy;                    2017
                        <a href="http://www.meertens.knaw.nl/">Meertens Institute</a>.
            All rights reserved.    
      </p>
                </div>

        
                </div>
    </footer>
        </body>
</html>