Solr: highlight: hl.simple.pre / post does not appear

With Solr, I'm trying to highlight some text using the hl.formatter option together with hl.simple.pre / hl.simple.post.

My problem is that the hl.simple.pre / hl.simple.post markup is missing from some of the highlighted results, and I do not understand why.

As an example, I call this URL:

http://localhost:8080/solr/Employees/select?q=lastName:anthan&fl=lastName&wt=json&indent=true&hl=true&hl.fl=lastName&hl.simple.pre=<em>&hl.simple.post=</em>

I get:

 ..."highlighting": {
    "NB0094418": {
      "lastName": [
        "Yogan<em>anthan</em>" => OK
      ]
    },
    "NB0104046": {
      "lastName": [
        "Vijayakanthan" => KO, I want Vijayak<em>anthan</em>
      ]
    },
    "NB0144981": {
      "lastName": [
        "Parmananthan" => KO, I want Parman<em>anthan</em>
      ]
    },...

Does anyone have an idea why I have this behavior?

My configuration:

schema.xml

<!-- Field type for name fields: the index side emits n-grams, while the query side keeps the whole input as a single token. -->
<fieldType name="nameType" class="solr.TextField">
    <!-- Index-time chain: n-grams of 2 to 50 characters, then lower-casing, ASCII folding, trimming, and removal of every character matching [^a-z]. -->
    <analyzer type="index">
        <tokenizer class="solr.NGramTokenizerFactory" minGramSize="2" maxGramSize="50" />
        <filter class="solr.LowerCaseFilterFactory" />
        <filter class="solr.ASCIIFoldingFilterFactory" />
        <filter class="solr.TrimFilterFactory" />
        <filter class="solr.PatternReplaceFilterFactory" pattern="([^a-z])" replacement="" replace="all" />
    </analyzer>

    <!-- Query-time chain: KeywordTokenizerFactory (no n-gramming here), followed by the same four filters as the index chain. -->
    <analyzer type="query">
        <tokenizer class="solr.KeywordTokenizerFactory" />
        <filter class="solr.LowerCaseFilterFactory" />
        <filter class="solr.ASCIIFoldingFilterFactory" />
        <filter class="solr.TrimFilterFactory" />
        <filter class="solr.PatternReplaceFilterFactory" pattern="([^a-z])" replacement="" replace="all" />
    </analyzer>
</fieldType>

...
<fields>
    <!-- lastName uses the nameType analysis; it is indexed, stored and required. It is the field queried and highlighted in the example URL. -->
    <field name="lastName" type="nameType" indexed="true" stored="true" required="true" />
</fields>

solrconfig.xml

<!-- Default search handler (default="true"). Its only configured default is echoParams=explicit; no highlighting parameters are set here. -->
<requestHandler name="standard" class="solr.SearchHandler" default="true">
    <lst name="defaults">
        <str name="echoParams">explicit</str>
    </lst>
</requestHandler>

...

<!-- Highlighting component: declares the fragmenters, formatter, encoder, fragListBuilders and fragmentsBuilders that highlighting requests can select. -->
<searchComponent class="solr.HighlightComponent" name="highlight">
    <highlighting>
        <!-- Default fragmenter, with a fragment size of 100. -->
        <fragmenter name="gap" default="true" class="solr.highlight.GapFragmenter">
            <lst name="defaults">
                <int name="hl.fragsize">100</int>
            </lst>
        </fragmenter>

        <!-- Alternative regex-based fragmenter (not the default): fragment size 70, slop 0.5, and the pattern below. -->
        <fragmenter name="regex" class="solr.highlight.RegexFragmenter">
            <lst name="defaults">
                <int name="hl.fragsize">70</int>
                <float name="hl.regex.slop">0.5</float>
                <str name="hl.regex.pattern">[-\w ,/\n\&quot;&apos;]{20,200}</str>
            </lst>
        </fragmenter>

        <!-- Default formatter: hl.simple.pre / hl.simple.post default to an opening and closing em tag. -->
        <formatter name="html" default="true" class="solr.highlight.HtmlFormatter">
            <lst name="defaults">
                <str name="hl.simple.pre"><![CDATA[<em>]]></str>
                <str name="hl.simple.post"><![CDATA[</em>]]></str>
            </lst>
        </formatter>

        <!-- Default encoder. -->
        <encoder name="html" default="true" class="solr.highlight.HtmlEncoder" />

        <!-- Fragment list builders: "simple" is the default, "single" is available by name. -->
        <fragListBuilder name="simple" default="true" class="solr.highlight.SimpleFragListBuilder" />
        <fragListBuilder name="single" class="solr.highlight.SingleFragListBuilder" />

        <!-- Default fragments builder, declared without any parameter defaults. -->
        <fragmentsBuilder name="default" default="true" class="solr.highlight.ScoreOrderFragmentsBuilder">
        </fragmentsBuilder>

        <!-- "colored" fragments builder: hl.tag.pre is a comma-separated list of opening b tags with different background colours; hl.tag.post is the single closing tag. -->
        <fragmentsBuilder name="colored" class="solr.highlight.ScoreOrderFragmentsBuilder">
            <lst name="defaults">
                <str name="hl.tag.pre"><![CDATA[
                <b style="background:yellow">,<b style="background:lawngreen">,
                <b style="background:aquamarine">,<b style="background:magenta">,
                <b style="background:palegreen">,<b style="background:coral">,
                <b style="background:wheat">,<b style="background:khaki">,
                <b style="background:lime">,<b style="background:deepskyblue">]]></str>
                <str name="hl.tag.post"><![CDATA[</b>]]></str>
            </lst>
        </fragmentsBuilder>
    </highlighting>
</searchComponent>
+4
source share
2 answers

. , , , , . , . , , , .

-, , : , , . , . , (, , ), , . 1--1. , 11 . 12 , ngram 9 . 15 ngrams 7 . 18 ngrams 6 . 21 ngrams 5 , 24 4 . ( , , , , , , , , , , , . )

, :

  • Use a WhitespaceTokenizer followed by an NGramFilterFactory instead of NGramTokenizerFactory.
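  • Use the FastVectorHighlighter. This requires the field to be indexed with term vectors (including positions and offsets), and the marker parameters become hl.tag.pre / hl.tag.post instead of hl.simple.pre / hl.simple.post.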

, . , , , WhitespaceTokenizer. , " ", ngrams, "s is", "tes" .., ngrams , . NGramTokenizer JavaDocs , NGramTokenizer.isTokenChar(), , . .

:

WhitespaceSplittingNGramTokenizer.java:

package info.jwismar.solr.plugin;

import java.io.Reader;

import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
import org.apache.lucene.util.Version;

/**
 * An {@link NGramTokenizer} whose {@link #isTokenChar(int)} rejects whitespace.
 *
 * <p>The three constructors only forward to the matching {@code NGramTokenizer}
 * constructors; the sole behavioural difference from the parent class is the
 * overridden {@code isTokenChar}.
 *
 * <p>Requires {@code org.apache.lucene.util.AttributeSource.AttributeFactory} to be
 * imported for the factory-taking constructor to compile.
 */
public class WhitespaceSplittingNGramTokenizer extends NGramTokenizer {

    /**
     * Creates a tokenizer with explicit gram-size bounds.
     *
     * @param version Lucene match version, passed through to the parent
     * @param input   source of the text to tokenize
     * @param minGram minimum gram size, passed through to the parent
     * @param maxGram maximum gram size, passed through to the parent
     */
    public WhitespaceSplittingNGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
        super(version, input, minGram, maxGram);
    }

    /**
     * Creates a tokenizer with explicit gram-size bounds and a caller-supplied attribute factory.
     *
     * @param version Lucene match version, passed through to the parent
     * @param factory attribute factory, passed through to the parent
     * @param input   source of the text to tokenize
     * @param minGram minimum gram size, passed through to the parent
     * @param maxGram maximum gram size, passed through to the parent
     */
    public WhitespaceSplittingNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram,
            int maxGram) {
        super(version, factory, input, minGram, maxGram);
    }

    /**
     * Creates a tokenizer that uses the parent's own gram-size settings for this constructor.
     *
     * @param version Lucene match version, passed through to the parent
     * @param input   source of the text to tokenize
     */
    public WhitespaceSplittingNGramTokenizer(Version version, Reader input) {
        super(version, input);
    }

    /**
     * Treats every non-whitespace code point as a token character.
     *
     * @param chr code point under test
     * @return {@code false} for whitespace (per {@link Character#isWhitespace(int)}), {@code true} otherwise
     */
    @Override
    protected boolean isTokenChar(int chr) {
        // NOTE(review): per the NGramTokenizer Javadoc, only characters accepted here are
        // collected into grams, so grams should not span whitespace; confirm for the Lucene version in use.
        return !Character.isWhitespace(chr);
    }
}

WhitespaceSplittingNGramTokenizerFactory.java:

package info.jwismar.solr.plugin;

import java.io.Reader;
import java.util.Map;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeSource.AttributeFactory;

/**
 * Tokenizer factory that produces {@link WhitespaceSplittingNGramTokenizer} instances.
 *
 * <p>Recognised arguments are {@code minGramSize} and {@code maxGramSize}; when absent they
 * fall back to {@link NGramTokenizer#DEFAULT_MIN_NGRAM_SIZE} and
 * {@link NGramTokenizer#DEFAULT_MAX_NGRAM_SIZE}.
 */
public class WhitespaceSplittingNGramTokenizerFactory extends TokenizerFactory {

    private final int minGramSize;
    private final int maxGramSize;

    /**
     * Creates a new WhitespaceSplittingNGramTokenizerFactory.
     *
     * @param args factory arguments from the schema
     * @throws IllegalArgumentException if {@code args} is non-empty after both gram sizes have been read
     */
    public WhitespaceSplittingNGramTokenizerFactory(Map<String, String> args) {
        super(args);
        this.minGramSize = getInt(args, "minGramSize", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
        this.maxGramSize = getInt(args, "maxGramSize", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);

        // Presumably getInt consumes the keys it reads, so anything still present is an
        // unrecognised parameter; verify against the parent factory class.
        final boolean hasLeftoverArgs = !args.isEmpty();
        if (hasLeftoverArgs) {
            throw new IllegalArgumentException("Unknown parameters: " + args);
        }
    }

    /**
     * Builds a tokenizer configured with this factory's match version and gram-size bounds.
     *
     * @param factory attribute factory handed to the tokenizer
     * @param reader  source of the text to tokenize
     * @return a new {@link WhitespaceSplittingNGramTokenizer}
     */
    @Override
    public Tokenizer create(AttributeFactory factory, Reader reader) {
        final Tokenizer tokenizer = new WhitespaceSplittingNGramTokenizer(
                luceneMatchVersion, factory, reader, minGramSize, maxGramSize);
        return tokenizer;
    }
}

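Compile these two classes into a .jar and put it somewhere Solr can load it from, for example through a lib directive in solrconfig.xml, then restart Solr. (I put solr-ngram-plugin.jar in /opt/solr-ngram-plugin/.)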

solrconfig.xml:

<lib path="/opt/solr-ngram-plugin/solr-ngram-plugin.jar" />

schema.xml (field type):

<!-- Field type that indexes n-grams through the custom WhitespaceSplittingNGramTokenizerFactory plugin. -->
<fieldType name="any_token_ngram" class="solr.TextField">
    <!-- Index-time chain: custom n-gram tokenizer with gram sizes 2 to 30, then lower-casing. -->
    <analyzer type="index">
        <tokenizer class="info.jwismar.solr.plugin.WhitespaceSplittingNGramTokenizerFactory" maxGramSize="30" minGramSize="2"/>
        <filter class="solr.LowerCaseFilterFactory" />
    </analyzer>
    <!-- Query-time chain: standard tokenizer and lower-casing, with no n-gramming. -->
    <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory" />
        <filter class="solr.LowerCaseFilterFactory" />
        <!-- Keeps only the first 30 characters of a query token that is 30 or more characters long; 30 equals the index-side maxGramSize. -->
        <filter class="solr.PatternReplaceFilterFactory"
            pattern="^(.{30})(.*)?" replacement="$1" replace="all" />
    </analyzer>
</fieldType>

schema.xml (fields):

<fields>
    <!-- Stored-only source field (indexed="false"). NOTE(review): the /suggest handler lists this field in qf even though it is not indexed; confirm that is intended. -->
    <field name="property_address_full" type="string" indexed="false" stored="true" />
    <!-- N-gram copy used for matching and highlighting: indexed and stored, norms omitted, with term vectors, positions and offsets enabled. -->
    <field name="property_address_full_any_ngram" type="any_token_ngram" indexed="true"
        stored="true" omitNorms="true" termVectors="true" termPositions="true"
        termOffsets="true"/>
</fields>
<!-- Populates the n-gram field from the source field. -->
<copyField source="property_address_full" dest="property_address_full_any_ngram" />

solrconfig.xml (request handler; the same parameters could instead be passed in the request URL):

<!-- Request handler that returns typeahead suggestions. All settings below are defaults, so a request may override them. -->
<requestHandler name="/suggest" class="solr.SearchHandler">
    <lst name="defaults">
        <str name="echoParams">explicit</str>
        <!-- edismax query over the two address fields, with the source field boosted over the n-gram field. -->
        <str name="defType">edismax</str>
        <str name="rows">10</str>
        <str name="mm">2</str>
        <str name="fl">*,score</str>
        <str name="qf">
            property_address_full^100.0
            property_address_full_any_ngram^10.0
        </str>
        <str name="sort">score desc</str>
        <!-- Highlight the n-gram field using the FastVectorHighlighter; its markers are set with hl.tag.pre / hl.tag.post. -->
        <str name="hl">true</str>
        <str name="hl.fl">property_address_full_any_ngram</str>
        <str name="hl.tag.pre">|-&gt;</str>
        <str name="hl.tag.post">&lt;-|</str>
        <str name="hl.fragsize">1000</str>
        <!-- The Solr parameter is spelled hl.mergeContiguous. NOTE(review): it may only affect the original highlighter, not the FastVectorHighlighter; confirm. -->
        <str name="hl.mergeContiguous">true</str>
        <str name="hl.useFastVectorHighlighter">true</str>
    </lst>
</requestHandler>
+3

, , hl.tag.pre hl.tag.post, , , <em> </em> pre/post...

, hl.tag.pre hl.tag.post ( ). defaults, searchComponent solrconfig.xml.

, searchComponent hl.tag.pre hl.tag.post <lst name="invariant">, .

0

Source: https://habr.com/ru/post/1541238/


All Articles