www.pudn.com > lucene-2.0.0-src.zip > GreekAnalyzer.java


package org.apache.lucene.analysis.el; 
 
/** 
 * Copyright 2005 The Apache Software Foundation 
 * 
 * Licensed under the Apache License, Version 2.0 (the "License"); 
 * you may not use this file except in compliance with the License. 
 * You may obtain a copy of the License at 
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0 
 * 
 * Unless required by applicable law or agreed to in writing, software 
 * distributed under the License is distributed on an "AS IS" BASIS, 
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and 
 * limitations under the License. 
 */ 
 
import org.apache.lucene.analysis.Analyzer; 
import org.apache.lucene.analysis.StopFilter; 
import org.apache.lucene.analysis.TokenStream; 
import org.apache.lucene.analysis.standard.StandardTokenizer; 
 
import java.io.Reader; 
import java.util.HashSet; 
import java.util.Hashtable; 
import java.util.Set; 
 
/** 
 * Analyzer for the Greek language. Supports an external list of stopwords (words 
 * that will not be indexed at all). 
 * A default set of stopwords is used unless an alternative list is specified. 
 * 
 * @author  Panagiotis Astithas, past@ebs.gr 
 */ 
public final class GreekAnalyzer extends Analyzer 
{ 
    // the letters are indexes to the charset array (see GreekCharsets.java) 
    private static char A = 6; 
    private static char B = 7; 
    private static char G = 8; 
    private static char D = 9; 
    private static char E = 10; 
    private static char Z = 11; 
    private static char H = 12; 
    private static char TH = 13; 
    private static char I = 14; 
    private static char K = 15; 
    private static char L = 16; 
    private static char M = 17; 
    private static char N = 18; 
    private static char KS = 19; 
    private static char O = 20; 
    private static char P = 21; 
    private static char R = 22; 
    private static char S = 24;	// skip final sigma 
    private static char T = 25; 
    private static char Y = 26; 
    private static char F = 27; 
    private static char X = 28; 
    private static char PS = 29; 
    private static char W = 30; 
 
    /** 
     * List of typical Greek stopwords. 
     */ 
    private static char[][] GREEK_STOP_WORDS = { 
        {O}, 
		{H}, 
		{T, O}, 
        {O, I}, 
		{T, A}, 
		{T, O, Y}, 
		{T, H, S}, 
		{T, W, N}, 
		{T, O, N}, 
		{T, H, N}, 
		{K, A, I}, 
		{K, I}, 
		{K}, 
		{E, I, M, A, I}, 
		{E, I, S, A, I}, 
		{E, I, N, A, I}, 
		{E, I, M, A, S, T, E}, 
		{E, I, S, T, E}, 
		{S, T, O}, 
		{S, T, O, N}, 
		{S, T, H}, 
		{S, T, H, N}, 
		{M, A}, 
		{A, L, L, A}, 
		{A, P, O}, 
		{G, I, A}, 
		{P, R, O, S}, 
		{M, E}, 
		{S, E}, 
		{W, S}, 
		{P, A, R, A}, 
		{A, N, T, I}, 
		{K, A, T, A}, 
		{M, E, T, A}, 
		{TH, A}, 
		{N, A}, 
		{D, E}, 
		{D, E, N}, 
		{M, H}, 
		{M, H, N}, 
		{E, P, I}, 
		{E, N, W}, 
		{E, A, N}, 
		{A, N}, 
		{T, O, T, E}, 
		{P, O, Y}, 
		{P, W, S}, 
		{P, O, I, O, S}, 
		{P, O, I, A}, 
		{P, O, I, O}, 
		{P, O, I, O, I}, 
		{P, O, I, E, S}, 
		{P, O, I, W, N}, 
		{P, O, I, O, Y, S}, 
		{A, Y, T, O, S}, 
		{A, Y, T, H}, 
		{A, Y, T, O}, 
		{A, Y, T, O, I}, 
		{A, Y, T, W, N}, 
		{A, Y, T, O, Y, S}, 
		{A, Y, T, E, S}, 
		{A, Y, T, A}, 
		{E, K, E, I, N, O, S}, 
		{E, K, E, I, N, H}, 
		{E, K, E, I, N, O}, 
		{E, K, E, I, N, O, I}, 
		{E, K, E, I, N, E, S}, 
		{E, K, E, I, N, A}, 
		{E, K, E, I, N, W, N}, 
		{E, K, E, I, N, O, Y, S}, 
		{O, P, W, S}, 
		{O, M, W, S}, 
		{I, S, W, S}, 
		{O, S, O}, 
		{O, T, I} 
    }; 
 
    /** 
     * Contains the stopwords used with the StopFilter. 
     */ 
    private Set stopSet = new HashSet(); 
 
    /** 
     * Charset for Greek letters. 
     * Represents encoding for 24 lowercase Greek letters. 
     * Predefined charsets can be taken from GreekCharSets class 
     */ 
    private char[] charset; 
 
    public GreekAnalyzer() { 
        charset = GreekCharsets.UnicodeGreek; 
        stopSet = StopFilter.makeStopSet( 
                    makeStopWords(GreekCharsets.UnicodeGreek)); 
    } 
 
    /** 
     * Builds an analyzer. 
     */ 
    public GreekAnalyzer(char[] charset) 
    { 
        this.charset = charset; 
        stopSet = StopFilter.makeStopSet(makeStopWords(charset)); 
    } 
 
    /** 
     * Builds an analyzer with the given stop words. 
     */ 
    public GreekAnalyzer(char[] charset, String[] stopwords) 
    { 
        this.charset = charset; 
        stopSet = StopFilter.makeStopSet(stopwords); 
    } 
 
    // Takes greek stop words and translates them to a String array, using 
    // the given charset 
    private static String[] makeStopWords(char[] charset) 
    { 
        String[] res = new String[GREEK_STOP_WORDS.length]; 
        for (int i = 0; i < res.length; i++) 
        { 
            char[] theStopWord = GREEK_STOP_WORDS[i]; 
            // translate the word,using the charset 
            StringBuffer theWord = new StringBuffer(); 
            for (int j = 0; j < theStopWord.length; j++) 
            { 
                theWord.append(charset[theStopWord[j]]); 
            } 
            res[i] = theWord.toString(); 
        } 
        return res; 
    } 
 
    /** 
     * Builds an analyzer with the given stop words. 
     */ 
    public GreekAnalyzer(char[] charset, Hashtable stopwords) 
    { 
        this.charset = charset; 
        stopSet = new HashSet(stopwords.keySet()); 
    } 
 
    /** 
     * Creates a TokenStream which tokenizes all the text in the provided Reader. 
     * 
     * @return  A TokenStream build from a StandardTokenizer filtered with 
     *                  GreekLowerCaseFilter and StopFilter 
     */ 
    public TokenStream tokenStream(String fieldName, Reader reader) 
    { 
    	TokenStream result = new StandardTokenizer(reader); 
        result = new GreekLowerCaseFilter(result, charset); 
        result = new StopFilter(result, stopSet); 
        return result; 
    } 
}