Printing the tokens produced by each analyzer

The following program runs four core Lucene analyzers (WhitespaceAnalyzer, SimpleAnalyzer, StopAnalyzer, and StandardAnalyzer) over two sample strings and prints every token they produce, together with its position increment and character offsets.
package org.apache.lucene.demo;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;
public class AnalysisTxt {

  // Sample text analyzed when no command-line arguments are given.
  private static final String[] examples = {
      "The quick brown fox jumped over the lazy dog",
      "XY&Z Corporation - xyz@example.com"
  };

  // The four core analyzers being compared.
  private static final Analyzer[] analyzers = new Analyzer[] {
      new WhitespaceAnalyzer(),
      new SimpleAnalyzer(),
      new StopAnalyzer(Version.LUCENE_4_10_3),
      new StandardAnalyzer(Version.LUCENE_4_10_3)
  };

  public static void main(String[] args) {
    // Analyze the command-line arguments if any were supplied,
    // otherwise fall back to the built-in examples.
    String[] strings = examples;
    if (args.length > 0) {
      strings = args;
    }
    for (String text : strings) {
      analyze(text);
    }
  }

  private static void analyze(String text) {
    System.out.println("Analyzing \"" + text + "\"");
    for (Analyzer analyzer : analyzers) {
      String name = analyzer.getClass().getSimpleName();
      System.out.println("  " + name + ":");
      System.out.print("    ");
      try {
        AnalyzerUtils.displayTokens(analyzer, text);
      } catch (IOException e) {
        e.printStackTrace();
      }
      System.out.println();
    }
    System.out.println();
  }
}
class AnalyzerUtils {

  public static void displayTokens(Analyzer analyzer, String text) throws IOException {
    // The field name is arbitrary here; it only matters to analyzers
    // that behave differently per field.
    String fieldName = "contents";
    Reader reader = new StringReader(text);
    TokenStream stream = analyzer.tokenStream(fieldName, reader);
    stream.reset();               // must be called before incrementToken()
    displayTokens(stream);
    stream.end();                 // record the final token state
    stream.close();
  }

  public static void displayTokens(TokenStream stream) throws IOException {
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    // Print each token as [term, position increment, start offset ~ end offset].
    while (stream.incrementToken()) {
      System.out.print("[" + term.toString()
          + ", " + posIncrAtt.getPositionIncrement()
          + ", " + offsetAtt.startOffset()
          + " ~ " + offsetAtt.endOffset()
          + "] ");
    }
  }
}
Output (each token is printed as [term, position increment, start offset ~ end offset]):
Analyzing "The quick brown fox jumped over the lazy dog"
WhitespaceAnalyzer:
[The, 1, 0 ~ 3] [quick, 1, 4 ~ 9] [brown, 1, 10 ~ 15] [fox, 1, 16 ~ 19] [jumped, 1, 20 ~ 26] [over, 1, 27 ~ 31] [the, 1, 32 ~ 35] [lazy, 1, 36 ~ 40] [dog, 1, 41 ~ 44]
SimpleAnalyzer:
[the, 1, 0 ~ 3] [quick, 1, 4 ~ 9] [brown, 1, 10 ~ 15] [fox, 1, 16 ~ 19] [jumped, 1, 20 ~ 26] [over, 1, 27 ~ 31] [the, 1, 32 ~ 35] [lazy, 1, 36 ~ 40] [dog, 1, 41 ~ 44]
StopAnalyzer:
[quick, 2, 4 ~ 9] [brown, 1, 10 ~ 15] [fox, 1, 16 ~ 19] [jumped, 1, 20 ~ 26] [over, 1, 27 ~ 31] [lazy, 2, 36 ~ 40] [dog, 1, 41 ~ 44]
StandardAnalyzer:
[quick, 2, 4 ~ 9] [brown, 1, 10 ~ 15] [fox, 1, 16 ~ 19] [jumped, 1, 20 ~ 26] [over, 1, 27 ~ 31] [lazy, 2, 36 ~ 40] [dog, 1, 41 ~ 44]
Analyzing "XY&Z Corporation - xyz@example.com"
WhitespaceAnalyzer:
[XY&Z, 1, 0 ~ 4] [Corporation, 1, 5 ~ 16] [-, 1, 17 ~ 18] [xyz@example.com, 1, 19 ~ 34]
SimpleAnalyzer:
[xy, 1, 0 ~ 2] [z, 1, 3 ~ 4] [corporation, 1, 5 ~ 16] [xyz, 1, 19 ~ 22] [example, 1, 23 ~ 30] [com, 1, 31 ~ 34]
StopAnalyzer:
[xy, 1, 0 ~ 2] [z, 1, 3 ~ 4] [corporation, 1, 5 ~ 16] [xyz, 1, 19 ~ 22] [example, 1, 23 ~ 30] [com, 1, 31 ~ 34]
StandardAnalyzer:
[xy, 1, 0 ~ 2] [z, 1, 3 ~ 4] [corporation, 1, 5 ~ 16] [xyz, 1, 19 ~ 22] [example.com, 1, 23 ~ 34]
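The position increment of 2 on "quick" and "lazy" in the StopAnalyzer and StandardAnalyzer output marks the gap left by the removed stop word "the"; the last line also shows how StandardAnalyzer splits "XY&Z" at the ampersand and the email address at the '@'. The same attribute pattern can also expose the token type that StandardTokenizer assigns to each term (for example <ALPHANUM> or <NUM>). Below is a minimal companion sketch, not part of the original listing: the class name AnalysisTypeTxt is invented here for illustration, and it assumes the same Lucene 4.10.3 setup as above.

package org.apache.lucene.demo;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;

public class AnalysisTypeTxt {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_10_3);
    // The field name "contents" is arbitrary, as in the listing above;
    // the sample string is the second example from that listing.
    TokenStream stream = analyzer.tokenStream("contents",
        new StringReader("XY&Z Corporation - xyz@example.com"));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    stream.reset();                       // required before incrementToken()
    while (stream.incrementToken()) {
      // Print each term together with its reported token type.
      System.out.print("[" + term.toString() + ", " + typeAtt.type() + "] ");
    }
    stream.end();                         // finish consuming the stream
    stream.close();
    analyzer.close();
    System.out.println();
  }
}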