<!-- installation_lucene.html (14.2 KB). Repository-viewer labels captured with this page: Edit, Raw, Blame, History. -->

<!DOCTYPE html>
<!--
 | Generated by Apache Maven Doxia Site Renderer 1.7.4 at 2017-09-25
 | Rendered using Apache Maven Fluido Skin 1.5
-->
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <meta name="Date-Revision-yyyymmdd" content="20170925" />
    <meta http-equiv="Content-Language" content="en" />
    <title>Multi Tier Annotation Search &#x2013; Apache Lucene</title>
    <link rel="stylesheet" href="./css/apache-maven-fluido-1.5.min.css" />
    <link rel="stylesheet" href="./css/site.css" />
    <link rel="stylesheet" href="./css/print.css" media="print" />


    <script type="text/javascript" src="./js/apache-maven-fluido-1.5.min.js"></script>

                      </head>
        <body class="topBarDisabled">


        <div class="container-fluid">
          <div id="banner">
        <div class="pull-left">
                                <div id="bannerLeft">
                <h2>MTAS</h2>
                </div>
                      </div>
        <div class="pull-right">                  <a href="http://www.meertens.knaw.nl/" id="bannerRight">
                                                                                                <img src="images/meertens.png"  alt="Meertens Instituut" width="93" height="104"/>
                </a>
      </div>
        <div class="clear"><hr/></div>
      </div>

      <div id="breadcrumbs">
        <ul class="breadcrumb">

                  <li id="projectVersion">Version: 6.6.1
                          <span class="divider">|</span>
                    </li>
                              <li class="">
                    <a href="index.html" title="Mtas">
        Mtas</a>
                    <span class="divider">/</span>
      </li>
        <li class="active ">Apache Lucene</li>


                  <li id="publishDate" class="pull-right">Last Published: 2017-09-25</li>

                            </ul>
      </div>


      <div class="row-fluid">
        <div id="leftColumn" class="span2">
          <div class="well sidebar-nav">

                <ul class="nav nav-list">
                    <li class="nav-header">Mtas</li>

      <li>

                          <a href="index.html" title="Introduction">
          <span class="none"></span>
        Introduction</a>
            </li>

      <li>

                          <a href="features.html" title="Features">
          <span class="none"></span>
        Features</a>
            </li>

      <li>

                          <a href="installation.html" title="Getting started">
          <span class="icon-chevron-down"></span>
        Getting started</a>
                    <ul class="nav nav-list">

      <li class="active">

            <a href="#"><span class="none"></span>Lucene</a>
          </li>

      <li>

                          <a href="installation_solr.html" title="Solr">
          <span class="none"></span>
        Solr</a>
            </li>

      <li>

                          <a href="installation_docker.html" title="Docker">
          <span class="none"></span>
        Docker</a>
            </li>
              </ul>
        </li>

      <li>

                          <a href="indexing.html" title="Indexing">
          <span class="icon-chevron-right"></span>
        Indexing</a>
                  </li>

      <li>

                          <a href="search.html" title="Search">
          <span class="icon-chevron-right"></span>
        Search</a>
                  </li>

      <li>

                          <a href="download.html" title="Download">
          <span class="none"></span>
        Download</a>
            </li>
                              <li class="nav-header">Project Documentation</li>

      <li>

                          <a href="project-info.html" title="Project Information">
          <span class="icon-chevron-right"></span>
        Project Information</a>
                  </li>

      <li>

                          <a href="project-reports.html" title="Project Reports">
          <span class="icon-chevron-right"></span>
        Project Reports</a>
                  </li>
            </ul>


          <hr />

           <div id="poweredBy">
                            <div class="clear"></div>
                            <div class="clear"></div>
                            <div class="clear"></div>
                            <div class="clear"></div>
                             <a href="http://maven.apache.org/" title="Built by Maven" class="poweredBy">
        <img class="builtBy" alt="Built by Maven" src="./images/logos/maven-feather.png" />
      </a>
                  </div>
          </div>
        </div>


        <div id="bodyColumn"  class="span10" >

            <h1>Apache Lucene</h1>
<p>Some code snippets to illustrate the use of Mtas directly with <a class="externalLink" href="https://lucene.apache.org/">Apache Lucene</a>.</p>
<p><b>Create index</b></p>
<p>Create an index from three FoLiA files using the <a href="indexing_configuration.html">configuration</a> file <tt>folia.xml</tt>:</p>

<div class="source">
<div class="source"><pre class="prettyprint">String configFile = &quot;folia.xml&quot;;
HashMap&lt;String,String&gt; files = new HashMap&lt;String,String&gt;();
files.put(&quot;title 1&quot;,&quot;resource1.xml.gz&quot;);
files.put(&quot;title 2&quot;,&quot;resource2.xml.gz&quot;);
files.put(&quot;title 3&quot;,&quot;resource3.xml.gz&quot;);
CreateIndex createIndex = new CreateIndex(configFile, files);
</pre></div></div>
<p><b>Basic search</b></p>
<p>With the created index and for <a href="search_cql.html">CQL</a> expression <tt>[pos=&quot;LID&quot;]</tt></p>

<div class="source">
<div class="source"><pre class="prettyprint">String cql = &quot;[pos=\&quot;LID\&quot;]&quot;;
Directory directory = createIndex.getDirectory();
</pre></div></div>
<p>the number of hits in each document can be computed with</p>

<div class="source">
<div class="source"><pre class="prettyprint">IndexReader indexReader = DirectoryReader.open(directory);
MtasSpanQuery q = createQuery(CreateIndex.FIELD_CONTENT, cql, null, null);
ListIterator&lt;LeafReaderContext&gt; iterator = indexReader.leaves()
    .listIterator();
IndexSearcher searcher = new IndexSearcher(indexReader);
SpanWeight spanweight = ((MtasSpanQuery) q.rewrite(indexReader))
    .createWeight(searcher, false);
// Visit every segment (leaf) of the index and count the query hits per document.
while (iterator.hasNext()) {
  LeafReaderContext lrc = iterator.next();
  Spans spans = spanweight.getSpans(lrc, SpanWeight.Postings.POSITIONS);
  SegmentReader r = (SegmentReader) lrc.reader();
  // No spans are returned when the query has no matches in this segment.
  if (spans != null) {
    while (spans.nextDoc() != Spans.NO_MORE_DOCS) {
      // docId is relative to this segment; lrc.docBase+docId is the index-wide number.
      int docId = spans.docID();
      // Skip deleted documents; liveDocs is only consulted when the segment has deletions.
      if (r.numDocs()==r.maxDoc() || r.getLiveDocs().get(docId)) {
        System.out.print(&quot;Document &quot;+(lrc.docBase+docId)+&quot;: &quot;);
        int hits = 0;
        while (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
          hits++;
        }
        // r is the reader of this segment, so it must be given the segment-relative
        // docId; lrc.docBase+docId would address the wrong document (or be out of
        // range) in every segment whose docBase is not 0.
        System.out.println(hits+&quot; hits in '&quot;+r.document(docId).get(CreateIndex.FIELD_TITLE)+&quot;'&quot;);
      }
    }
  }
}
indexReader.close();
</pre></div></div>
<p><b>Advanced search</b></p>
<p>The provided <tt>collect</tt> method also makes more advanced options available, such as computing the <a href="search_component_termvector.html">termvector</a>:</p>

<div class="source">
<div class="source"><pre class="prettyprint">IndexReader indexReader = DirectoryReader.open(directory);
IndexSearcher searcher = new IndexSearcher(indexReader);
ComponentField fieldStats = new ComponentField(CreateIndex.FIELD_CONTENT, CreateIndex.FIELD_ID);
ArrayList&lt;Integer&gt; fullDocSet = new ArrayList&lt;Integer&gt;(Arrays.asList(new Integer[]{0,1,2}));
ArrayList&lt;Integer&gt; fullDocList = new ArrayList&lt;Integer&gt;();
try {
  // Register a term vector component named wordList on prefix t, limited to 10 entries.
  fieldStats.termVectorList.add(new ComponentTermVector(&quot;wordList&quot;, &quot;t&quot;, null, false, &quot;n,sum&quot;, CodecUtil.STATS_TYPE_SUM, CodecUtil.SORT_DESC, null, 10, null, null, null, null, null, null, null, null, null));
  CodecUtil.collect(CreateIndex.FIELD_CONTENT, searcher, indexReader, fullDocList,
      fullDocSet, fieldStats);
  // Print the collected result of every registered term vector component.
  for (ComponentTermVector ct : fieldStats.termVectorList) {
    HashMap&lt;String, Map&lt;String, Object&gt;&gt; tvList = new HashMap&lt;String, Map&lt;String, Object&gt;&gt;();
    Map&lt;String, ?&gt; tcList = ct.subComponentFunction.dataCollector
        .getResult().getList();
    // Iterate over the entries directly instead of looking every key up again.
    for (Map.Entry&lt;String, ?&gt; entry : tcList.entrySet()) {
      tvList.put(entry.getKey(),
          ((MtasDataItem&lt;?, ?&gt;) entry.getValue()).rewrite(false));
    }
    System.out.println(tvList);
  }
} catch (IllegalAccessException | IllegalArgumentException
    | InvocationTargetException | mtas.parser.function.ParseException e) {
  e.printStackTrace();
} finally {
  // Release the reader opened above, as the basic search example does.
  indexReader.close();
}
</pre></div></div>
<p><b>Appendix</b></p>
<p>Code for class <tt>CreateIndex</tt>:</p>

<div class="source">
<div class="source"><pre class="prettyprint">import java.io.IOException;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;

/**
 * Builds a Lucene index whose content field is analyzed with the Mtas
 * char filter and tokenizer, and exposes the resulting directory.
 */
public class CreateIndex {

  /** Name of the stored field holding the sequential document number. */
  public static String FIELD_ID = &quot;id&quot;;
  /** Name of the stored field holding the document title. */
  public static String FIELD_TITLE = &quot;title&quot;;
  /** Name of the field that is analyzed with the Mtas analyzer. */
  public static String FIELD_CONTENT = &quot;content&quot;;

  /** Directory containing the index; assigned by initialize. */
  private Directory directory;

  /**
   * Creates an in-memory index.
   *
   * @param configFile value handed to the mtas tokenizer as its configFile parameter
   * @param files map from document title to the value indexed in the content field
   * @throws IOException if the index cannot be created or written
   */
  public CreateIndex(String configFile, HashMap&lt;String, String&gt; files)
      throws IOException {
    this(null, configFile, files);
  }

  /**
   * Creates an index at the given location.
   *
   * @param indexPath file system path of the index, or null for an in-memory index
   * @param configFile value handed to the mtas tokenizer as its configFile parameter
   * @param files map from document title to the value indexed in the content field
   * @throws IOException if the index cannot be created or written
   */
  public CreateIndex(String indexPath, String configFile,
      HashMap&lt;String, String&gt; files) throws IOException {
    // Forward indexPath: passing null here would silently ignore the requested
    // location and always build the index in memory.
    initialize(indexPath, configFile, files);
  }

  /**
   * Returns the directory that holds the created index.
   *
   * @return the index directory
   */
  public Directory getDirectory() {
    return directory;
  }

  /**
   * Opens the directory, configures the analyzers and codec, and indexes one
   * document per entry of files.
   *
   * @param indexPath file system path of the index, or null for an in-memory index
   * @param configFile value handed to the mtas tokenizer as its configFile parameter
   * @param files map from document title to the value indexed in the content field
   * @throws IOException if the index cannot be created or written
   */
  private void initialize(String indexPath, String configFile,
      HashMap&lt;String, String&gt; files) throws IOException {
    if (indexPath != null) {
      directory = FSDirectory.open(Paths.get(indexPath));
    } else {
      directory = new RAMDirectory();
    }
    Map&lt;String, String&gt; paramsCharFilterMtas = new HashMap&lt;String, String&gt;();
    paramsCharFilterMtas.put(&quot;type&quot;, &quot;file&quot;);
    Map&lt;String, String&gt; paramsTokenizer = new HashMap&lt;String, String&gt;();
    paramsTokenizer.put(&quot;configFile&quot;, configFile);
    // NOTE(review): the analyzer is built against the hard-coded docker directory;
    // presumably configFile is resolved relative to it - confirm for your setup.
    Analyzer mtasAnalyzer = CustomAnalyzer
        .builder(Paths.get(&quot;docker&quot;).toAbsolutePath())
        .addCharFilter(&quot;mtas&quot;, paramsCharFilterMtas)
        .withTokenizer(&quot;mtas&quot;, paramsTokenizer).build();
    // Only the content field uses the Mtas analyzer; all other fields fall back
    // to the StandardAnalyzer.
    Map&lt;String, Analyzer&gt; analyzerPerField = new HashMap&lt;String, Analyzer&gt;();
    analyzerPerField.put(FIELD_CONTENT, mtasAnalyzer);
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(
        new StandardAnalyzer(), analyzerPerField);
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setUseCompoundFile(false);
    config.setCodec(Codec.forName(&quot;MtasCodec&quot;));
    IndexWriter w = new IndexWriter(directory, config);
    // Start from an empty index so repeated runs do not accumulate documents.
    w.deleteAll();
    int counter = 0;
    for (Map.Entry&lt;String, String&gt; file : files.entrySet()) {
      Document doc = new Document();
      doc.add(new StringField(FIELD_ID, Integer.toString(counter),
          Field.Store.YES));
      doc.add(new StringField(FIELD_TITLE, file.getKey(), Field.Store.YES));
      doc.add(new TextField(FIELD_CONTENT, file.getValue(), Field.Store.YES));
      w.addDocument(doc);
      counter++;
    }
    w.commit();
    w.close();
  }
}
</pre></div></div>
<p>Code for method <tt>createQuery</tt>:</p>

<div class="source">
<div class="source"><pre class="prettyprint">MtasSpanQuery createQuery(String field, String cql,
      MtasSpanQuery ignore, Integer maximumIgnoreLength) throws ParseException {
  Reader reader = new BufferedReader(new StringReader(cql));
  MtasCQLParser p = new MtasCQLParser(reader);
  return p.parse(field, null, null, ignore, maximumIgnoreLength);
}
</pre></div></div>
                  </div>
            </div>
          </div>

    <hr/>

    <footer>
            <div class="container-fluid">
                      <div class="row-fluid">
                                      <p >Copyright &copy;                    2017
                        <a href="http://www.meertens.knaw.nl/">Meertens Institute</a>.
            All rights reserved.
      </p>
                </div>


                </div>
    </footer>
        </body>
</html>