#include "CLucene/StdHeader.h" #ifndef _lucene_analysis_AnalysisHeader_ #define _lucene_analysis_AnalysisHeader_ #include "CLucene/util/Reader.h" namespace lucene{ namespace analysis{ class Token { private: const int_t startOffset; // start in source text const int_t endOffset; // end in source text const char_t* type; // lexical type public: #ifndef LUCENE_TOKEN_WORD_LENGTH char_t* termText; // the text of the term #else char_t termText[LUCENE_TOKEN_WORD_LENGTH]; // the text of the term #endif // Constructs a Token with the given term text, and start & end offsets. // The type defaults to "word." Token(const char_t* text, const int_t start, const int_t end): //termText( stringDuplicate(text) ), startOffset (start), endOffset (end), type ( _T("word") ) { #ifndef LUCENE_TOKEN_WORD_LENGTH termText = stringDuplicate(text); #else int len = stringLength(text); if ( len>LUCENE_TOKEN_WORD_LENGTH ){ stringNCopy(termText,text,LUCENE_TOKEN_WORD_LENGTH); termText[LUCENE_TOKEN_WORD_LENGTH] = 0; }else stringNCopy(termText,text,(len+1)); #endif } ~Token(){ #ifndef LUCENE_TOKEN_WORD_LENGTH delete[] termText; #endif } // Constructs a Token with the given text, start and end offsets, & type. Token(const char_t* text, const int_t start, const int_t end, const char_t* typ): //termText( stringDuplicate(text) ), startOffset (start), endOffset (end), type ( typ ) //type (stringDuplicate(typ) ) //shouldn't need to duplicate type, already in tokenImage { #ifndef LUCENE_TOKEN_WORD_LENGTH termText = stringDuplicate(text); #else int len = stringLength(text); if ( len>LUCENE_TOKEN_WORD_LENGTH ){ stringNCopy(termText,text,LUCENE_TOKEN_WORD_LENGTH); termText[LUCENE_TOKEN_WORD_LENGTH] = 0; }else stringNCopy(termText,text,(len+1)); #endif } // Returns the Token's term text. const char_t* TermText() { return termText; } // Returns this Token's starting offset, the position of the first character // corresponding to this token in the source text. // // Note that the difference between endOffset() and startOffset() may not be // equal to termText.length(), as the term text may have been altered by a // stemmer or some other filter. int_t StartOffset() const { return startOffset; } // Returns this Token's ending offset, one greater than the position of the // last character corresponding to this token in the source text. int_t EndOffset() const { return endOffset; } // Returns this Token's lexical type. Defaults to "word". const char_t* Type() const { return type; } }; class TokenStream { public: // Returns the next token in the stream, or null at EOS. virtual Token* next() = 0; // Releases resources associated with this stream. virtual void close() = 0; virtual ~TokenStream() { } }; class Analyzer { public: // Creates a TokenStream which tokenizes all the text in the provided // Reader. Default implementation forwards to tokenStream(Reader) for // compatibility with older version. Override to allow Analyzer to choose // strategy based on document and/or field. Must be able to handle null // field name for backward compatibility. virtual TokenStream& tokenStream(const char_t* fieldName, lucene::util::Reader* reader)=0; virtual ~Analyzer(){ } }; class Tokenizer:public TokenStream { protected: // The text source for this Tokenizer. lucene::util::Reader* input; public: // By default, closes the input Reader. virtual void close() { input->close(); } virtual ~Tokenizer(){ } }; class TokenFilter:public TokenStream { protected: // The source of tokens for this filter. 
    class TokenFilter: public TokenStream {
    protected:
        // The source of tokens for this filter.
        TokenStream* input;
        // Whether this filter owns, and must delete, its source stream.
        bool deleteTokenStream;

        TokenFilter(TokenStream* in, bool deleteTS):
            input(in),
            deleteTokenStream(deleteTS)
        {
        }
    public:
        virtual ~TokenFilter(){
            if ( deleteTokenStream )
                delete input;
        }

        // Closes the input TokenStream.
        void close() {
            input->close();
        }
    };
}}
#endif
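// ---------------------------------------------------------------------------
// Usage sketch (illustrative, not part of the original header): draining a
// TokenStream obtained from an Analyzer. SomeAnalyzer and the "contents"
// field name are hypothetical placeholders, and caller ownership of the
// tokens returned by next() is an assumption for this sketch.
//
//   lucene::util::Reader* reader = /* a Reader over the source text */;
//   SomeAnalyzer analyzer;
//   lucene::analysis::TokenStream& stream =
//       analyzer.tokenStream(_T("contents"), reader);
//
//   lucene::analysis::Token* t;
//   while ( (t = stream.next()) != NULL ) { // next() returns NULL at EOS
//       // consume t->TermText(), t->StartOffset(), t->EndOffset() here
//       delete t; // assumed: caller owns the returned Token
//   }
//   stream.close(); // closes the underlying Reader
// ---------------------------------------------------------------------------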