. , , , , . , . , , , .
-, , : , , . , . , (, , ), , . 1--1. , 11 . 12 , ngram 9 . 15 ngrams 7 . 18 ngrams 6 . 21 ngrams 5 , 24 4 . ( , , , , , , , , , , , . )
, :
WhitespaceTokenizer NGramFilterFactory NGramTokenizerFactory. ( , , .) , , .FastVectorHighlighter. , ( , ), hl.simple.pre hl.tag.pre ( * ).
, . , , , WhitespaceTokenizer. , " ", ngrams, "s is", "tes" .., ngrams , . NGramTokenizer JavaDocs , NGramTokenizer.isTokenChar(), , . .
:
WhitespaceSplittingNGramTokenizer.java:
package info.jwismar.solr.plugin;
import java.io.Reader;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.util.Version;
public class WhitespaceSplittingNGramTokenizer extends NGramTokenizer {
public WhitespaceSplittingNGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
super(version, input, minGram, maxGram);
}
public WhitespaceSplittingNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram,
int maxGram) {
super(version, factory, input, minGram, maxGram);
}
public WhitespaceSplittingNGramTokenizer(Version version, Reader input) {
super(version, input);
}
@Override
protected boolean isTokenChar(int chr) {
return !Character.isWhitespace(chr);
}
}
WhitespaceSplittingNGramTokenizerFactory.java:
package info.jwismar.solr.plugin;
import java.io.Reader;
import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
/**
 * {@link TokenizerFactory} that builds {@link WhitespaceSplittingNGramTokenizer} instances.
 *
 * <p>Recognised configuration keys are {@code minGramSize} and {@code maxGramSize}; when a
 * key is absent the corresponding {@link NGramTokenizer} default constant is used.
 */
public class WhitespaceSplittingNGramTokenizerFactory extends TokenizerFactory {

    /** Lower bound on gram length handed to every tokenizer this factory creates. */
    private final int minGramSize;

    /** Upper bound on gram length handed to every tokenizer this factory creates. */
    private final int maxGramSize;

    /**
     * Reads the gram-size settings from the supplied configuration map.
     *
     * @param args configuration key/value pairs
     * @throws IllegalArgumentException if {@code args} is not empty once the recognised keys
     *         have been read
     */
    public WhitespaceSplittingNGramTokenizerFactory(Map<String, String> args) {
        super(args);
        this.minGramSize = getInt(args, "minGramSize", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
        this.maxGramSize = getInt(args, "maxGramSize", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);

        // Nothing left over means every supplied key was understood.
        // NOTE(review): this presumes the superclass helpers remove the keys they read - confirm.
        if (args.isEmpty()) {
            return;
        }
        throw new IllegalArgumentException("Unknown parameters: " + args);
    }

    /**
     * Creates a tokenizer over {@code reader} using this factory's match version and gram sizes.
     *
     * @param factory attribute factory forwarded to the tokenizer
     * @param reader  character source forwarded to the tokenizer
     * @return a new {@link WhitespaceSplittingNGramTokenizer}
     */
    @Override
    public Tokenizer create(AttributeFactory factory, Reader reader) {
        final Tokenizer tokenizer = new WhitespaceSplittingNGramTokenizer(
                luceneMatchVersion, factory, reader, minGramSize, maxGramSize);
        return tokenizer;
    }
}
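Compile both classes into a .jar and put it somewhere Solr can read it. Then add a lib directive to solrconfig.xml so that Solr loads the jar. (In this example the jar is solr-ngram-plugin.jar, placed in /opt/solr-ngram-plugin/.)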
solrconfig.xml:
<!-- Points Solr at the plugin jar by absolute filesystem path; adjust the path to wherever the jar is deployed. -->
<lib path="/opt/solr-ngram-plugin/solr-ngram-plugin.jar" />
schema.xml (field type):
<!-- Text field type with different analysis chains at index time and at query time. -->
<fieldType name="any_token_ngram" class="solr.TextField">
<!-- Index time: the custom n-gram tokenizer is configured for grams of 2 to 30 characters; output is lower-cased. -->
<analyzer type="index">
<tokenizer class="info.jwismar.solr.plugin.WhitespaceSplittingNGramTokenizerFactory" maxGramSize="30" minGramSize="2"/>
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
<!-- Query time: no n-gramming; the input is split by the standard tokenizer and lower-cased. -->
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory" />
<filter class="solr.LowerCaseFilterFactory" />
<!-- Keeps only the first 30 characters of any token that is at least 30 characters long
     (shorter tokens do not match the pattern and pass through unchanged).
     30 is the same value as maxGramSize in the index analyzer. -->
<filter class="solr.PatternReplaceFilterFactory"
pattern="^(.{30})(.*)?" replacement="$1" replace="all" />
</analyzer>
</fieldType>
schema.xml (fields):
<fields>
<!-- Raw address value: stored for retrieval, declared as not indexed. -->
<field name="property_address_full" type="string" indexed="false" stored="true" />
<!-- N-gram copy of the address: indexed and stored, norms omitted, with term vectors,
     positions and offsets enabled (presumably for vector-based highlighting; confirm
     against the Solr highlighting documentation). -->
<field name="property_address_full_any_ngram" type="any_token_ngram" indexed="true"
stored="true" omitNorms="true" termVectors="true" termPositions="true"
termOffsets="true"/>
</fields>
<!-- Populates the n-gram field from the raw address field. -->
<copyField source="property_address_full" dest="property_address_full_any_ngram" />
solrconfig.xml (request handler; these are defaults, so they can presumably be overridden with URL parameters):
<!-- /suggest: edismax search across the raw address field and its n-gram copy,
     returning the top 10 hits by score with highlighting on the n-gram field. -->
<requestHandler name="/suggest" class="solr.SearchHandler">
  <lst name="defaults">
    <str name="echoParams">explicit</str>
    <str name="defType">edismax</str>
    <str name="rows">10</str>
    <str name="mm">2</str>
    <str name="fl">*,score</str>
    <!-- The raw field is boosted ten times higher than the n-gram field.
         NOTE(review): schema.xml declares property_address_full with indexed="false";
         confirm that field can actually be queried, otherwise its boost has no effect. -->
    <str name="qf">
      property_address_full^100.0
      property_address_full_any_ngram^10.0
    </str>
    <str name="sort">score desc</str>
    <str name="hl">true</str>
    <str name="hl.fl">property_address_full_any_ngram</str>
    <!-- Highlight markers are "|->" and "<-|". They are XML-escaped here because a raw
         '<' inside element text is not well-formed XML and would break parsing of this file. -->
    <str name="hl.tag.pre">|-&gt;</str>
    <str name="hl.tag.post">&lt;-|</str>
    <str name="hl.fragsize">1000</str>
    <str name="hl.mergeContinuous">true</str>
    <str name="hl.useFastVectorHighlighter">true</str>
  </lst>
</requestHandler>