#include "CLucene/StdHeader.h"
#ifndef _lucene_analysis_standard__
#define _lucene_analysis_standard__

using namespace lucene::util;
using namespace lucene::analysis;

namespace lucene{ namespace analysis { namespace standard {

	// Normalizes tokens extracted with StandardTokenizer.
	class StandardFilter: public TokenFilter{
	public:
		// Constructs a filter that reads its tokens from the stream `in`.
		StandardFilter(TokenStream* in);

		~StandardFilter();

		// Returns the next token in the stream, or NULL at EOS.
		//  - Removes 's from the end of words.
		//  - Removes dots from acronyms.
		Token* next();
	};

	// A grammar-based tokenizer constructed with JavaCC.
	//
	// This should be a good tokenizer for most European-language documents.
	//
	// Many applications have specific tokenizer needs. If this tokenizer does
	// not suit your application, please consider copying this source code
	// directory to your project and maintaining your own grammar-based
	// tokenizer.
	class StandardTokenizer: public Tokenizer {
	private:
		// Scanner state flags. Their exact meaning is defined by the
		// implementation file, which is not part of this header.
		bool maybeAcronym;
		bool maybeHost;
		bool maybeNumber;
		bool prevHasDigit;

	public:
		// Character stream used by the tokenizer.
		// NOTE(review): presumably wraps the Reader given to the constructor;
		// being a reference member, it must outlive this object - confirm.
		FastCharStream& rd;

		// NOTE(review): presumably the start offset of the token currently
		// being read - confirm against the implementation.
		int_t start;

		// Constructs a tokenizer for this Reader.
		StandardTokenizer(Reader& reader);

		~StandardTokenizer();

		void close();

		// Returns the next token in the stream, or NULL at EOS.
		// The returned token's type is set to an element of
		// StandardTokenizerConstants#tokenImage.
		Token* next();

		// NOTE(review): undocumented in the original header; by its name it
		// reads an alphanumeric token that begins with `prev` - confirm.
		Token* ReadAlphaNum(const uchar_t prev);

		// Reads for an apostrophe.
		Token* ReadApostrophe(StringBuffer& str, const uchar_t ch);

		// Reads for something@... - it may be a COMPANY name or an EMAIL
		// address.
		Token* ReadAt(const char_t* str, const uchar_t prev);

		// Reads for COMPANYs in the format some&home, at&t.
		Token* ReadCompany(const char_t* str, const uchar_t prev);

		// Reads for EMAILs such as somebody@somewhere.else.com.
		Token* ReadEmail(StringBuffer& str, const uchar_t ch);

		// Reads for a dotted sequence ("some.").
		// It may be a NUMBER like 12.3, an ACRONYM like U.S.A., or a HOST
		// like www.som.com.
		Token* ReadNumber(StringBuffer& str, const uchar_t prev);
	};
// Kinds of token the standard tokenizer distinguishes.
// The numeric values are the same ones the compiler assigned implicitly
// (consecutive from zero); they are only spelled out here for readability.
enum TokenTypes
{
	ALPHANUM   = 0,
	APOSTROPHE = 1,
	ACRONYM    = 2,	// e.g. U.S.A.
	COMPANY    = 3,	// e.g. at&t
	EMAIL      = 4,	// e.g. somebody@somewhere.else.com
	HOST       = 5,	// e.g. www.som.com
	NUM        = 6,	// e.g. 12.3
	_EOF       = 7	// NOTE(review): presumably marks end of input - confirm
};
const static char_t *tokenImage[] = { _T(" "),
_T("