#include "CLucene/StdHeader.h"
#ifndef _lucene_analysis_standard_StandardTokenizer
#define _lucene_analysis_standard_StandardTokenizer

#include "../AnalysisHeader.h"
#include "../Analyzers.h"
#include "StandardTokenizerConstants.h"
#include "CLucene/util/StringBuffer.h"
#include "CLucene/util/FastCharStream.h"
#include "CLucene/util/Reader.h"

// NOTE(review): header-scope using-directives leak into every includer.
// They are kept because code that includes this header may rely on them.
using namespace lucene::analysis;
using namespace lucene::util;

namespace lucene{ namespace analysis { namespace standard {

  // A grammar-based tokenizer constructed with JavaCC.
  //
  // This should be a good tokenizer for most European-language documents.
  //
  // Many applications have specific tokenizer needs. If this tokenizer does
  // not suit your application, please consider copying this source code
  // directory to your project and maintaining your own grammar-based
  // tokenizer.
  class StandardTokenizer: public Tokenizer {
  private:
    // Scanner state flags. NOTE(review): judging by the names only, these
    // record whether the text read so far could still be an ACRONYM, a HOST
    // or a NUMBER, and whether a digit was seen earlier -- confirm against
    // the implementation file.
    bool maybeAcronym;
    bool maybeHost;
    bool maybeNumber;
    bool prevHasDigit;

  public:
    // Character stream used by the tokenizer. NOTE(review): presumably wraps
    // the Reader handed to the constructor -- confirm in the implementation.
    FastCharStream& rd;

    // NOTE(review): presumably the start offset of the token being read --
    // confirm in the implementation.
    int_t start;

    // Constructs a tokenizer for this Reader.
    StandardTokenizer(Reader& reader);

    ~StandardTokenizer();

    void close();

    // Returns the next token in the stream, or NULL at EOS.
    //
    // The returned token's type is set to an element of
    // StandardTokenizerConstants' tokenImage.
    Token* next();

    // NOTE(review): no description in the original header; by its name this
    // reads an alphanumeric token, with prev being a previously read
    // character -- confirm in the implementation.
    Token* ReadAlphaNum(const uchar_t prev);

    // Reads for apostrophe.
    Token* ReadApostrophe(StringBuffer& str, const uchar_t ch);

    // Reads for something@... it may be a COMPANY name or an EMAIL address.
    Token* ReadAt(const char_t* str, const uchar_t prev);

    // Reads for COMPANYs in format some&home, at&t.
    Token* ReadCompany(const char_t* str, const uchar_t prev);

    // Reads for EMAILs somebody@somewhere.else.com.
    Token* ReadEmail(StringBuffer& str, const uchar_t ch);

    // Reads for "some." input.
    // It may be a NUMBER like 12.3, an ACRONYM like U.S.A., or a HOST
    // www.som.com.
    Token* ReadNumber(StringBuffer& str, const uchar_t prev);
  };

}}}

#endif // _lucene_analysis_standard_StandardTokenizer