#include "CLucene/StdHeader.h" #include "StandardTokenizer.h" #include "../AnalysisHeader.h" #include "../Analyzers.h" #include "StandardTokenizerConstants.h" #include "CLucene/util/StringBuffer.h" #include "CLucene/util/FastCharStream.h" #include "CLucene/util/Reader.h" using namespace lucene::analysis; using namespace lucene::util; namespace lucene{ namespace analysis { namespace standard { /** Constructs a tokenizer for this Reader. */ StandardTokenizer::StandardTokenizer(Reader& reader): rd(*new FastCharStream(reader)), start (1), maybeAcronym(false), maybeHost(false), maybeNumber(false), prevHasDigit(false) { } StandardTokenizer::~StandardTokenizer(){ delete &rd; } void StandardTokenizer::close(){ } /** Returns the next token in the stream, or NULL at EOS. *

 * The returned token's type is set to an element of {@link
 * StandardTokenizerConstants#tokenImage}. */
Token* StandardTokenizer::next() {
    while(!rd.Eos()) {
        uchar_t ch = rd.GetNext();

        // Skip whitespace.
        if( isSpace((uchar_t)ch)!=0 ) {
            continue;
        }

        // An alphanumeric character starts a new token.
        if( isAlNum((uchar_t)ch)!=0 ) {
            start = rd.Column();
            return ReadAlphaNum(ch);
        }
        continue;
    }
    return NULL;
}

Token* StandardTokenizer::ReadAlphaNum(const uchar_t prev) {
    maybeAcronym = true;
    maybeHost = true;
    maybeNumber = true;

    StringBuffer str;
    str.append(prev);

    uchar_t ch = prev;
    while(!rd.Eos() && isSpace((char_t)ch)==0 ) {
        ch = rd.GetNext();
        if(isAlNum((uchar_t)ch)!=0) {
            if( isDigit(ch)!=0)
                prevHasDigit = true;
            str.append( ch );
        }
        // Punctuation hands off to the specialised readers.
        switch(ch) {
            case '\'':
                return ReadApostrophe(str, ch);
            case '.':
                return ReadNumber(str, ch);
            case '&':
                return ReadCompany(str.getBuffer(), ch);
            case '@':
                return ReadAt(str.getBuffer(), ch);
            case '-':
            case ',':
            case '_':
            case '/':
                return ReadNumber(str, ch);
            default:
                break;
        }
    }
    return new Token(str.getBuffer(), start, rd.Column(),
                     tokenImage[lucene::analysis::standard::ALPHANUM] );
}

//Reads for apostrophe.
Token* StandardTokenizer::ReadApostrophe(StringBuffer& str, const uchar_t ch) {
    str.append( ch );
    while( (isSpace((char_t)rd.Peek())==0 && isAlNum((char_t)rd.Peek())!=0) || rd.Peek() == '\'') {
        str.append( rd.GetNext() );
    }
    // Trim the last character if the lookahead is an apostrophe.
    if(rd.Peek() == '\'')
        str.len--;
    return new Token(str.getBuffer(), start, rd.Column(),
                     tokenImage[lucene::analysis::standard::APOSTROPHE]);
}

//Reads for something@... it may be a COMPANY name or an EMAIL address.
Token* StandardTokenizer::ReadAt(const char_t* str, const uchar_t prev) {
    StringBuffer val(_T(""));
    bool append = true;

    uchar_t ch = prev;
    while(!rd.Eos() && isSpace((char_t)ch)==0) {
        ch = rd.GetNext();
        if( isAlNum((uchar_t)ch)!=0 && append)
            val.append( ch );
        else if(ch == '.' && val.length() > 0){
            // A dot after the part following '@' makes this an email address.
            val.prepend(_T("@"));
            val.prepend(str);
            return ReadEmail(val, ch);
        }else
            append = false;
    }

    if(val.length() > 0){
        val.prepend(_T("@"));
        val.prepend(str);
        return new Token(val.getBuffer(), start, rd.Column(),
                         tokenImage[lucene::analysis::standard::COMPANY]);
    }else{
        return new Token(str, start, rd.Column(),
                         tokenImage[lucene::analysis::standard::ALPHANUM]);
    }
}

//Reads for COMPANYs in format some&home, at&t.
Token* StandardTokenizer::ReadCompany(const char_t* str, const uchar_t prev) {
    bool append = true;
    StringBuffer val(_T(""));

    uchar_t ch = prev;
    while(!rd.Eos() && isSpace((char_t)ch)==0) {
        ch = rd.GetNext();
        if( isAlNum((uchar_t)ch)!=0 && append)
            val.append( ch );
        else
            append = false;
    }

    if(val.length() > 0){
        val.prepend(_T("&"));
        val.prepend(str);
        return new Token(val.getBuffer(), start, rd.Column(),
                         tokenImage[lucene::analysis::standard::COMPANY]);
    }else
        return new Token(str, start, rd.Column(),
                         tokenImage[lucene::analysis::standard::ALPHANUM]);
}

//Reads for EMAILs somebody@somewhere.else.com.
Token* StandardTokenizer::ReadEmail(StringBuffer& str, const uchar_t ch) {
    str.append(ch);
    while( (isSpace((char_t)rd.Peek())==0 && isAlNum((char_t)rd.Peek())!=0) || rd.Peek() == '.') {
        str.append(rd.GetNext());
    }
    // Trim the last character if the lookahead is a dot.
    if(rd.Peek() == '.')
        str.len--;
    return new Token(str.getBuffer(), start, rd.Column(),
                     tokenImage[lucene::analysis::standard::EMAIL]);
}

//Reads past a separator such as '.', '-', ',', '_' or '/'.
//The result may be a NUMBER like 12.3, an ACRONYM like U.S.A., or a HOST like www.som.com.
Token* StandardTokenizer::ReadNumber(StringBuffer& str, const uchar_t prev) {
    uchar_t ch = prev;
    str.append( (char_t)ch );

    StringBuffer val(_T(""));
    bool append = true;
    bool hasDigit = false;

    // Only a '.' separator can still yield an ACRONYM or a HOST.
    if(ch != '.') {
        maybeHost = false;
        maybeAcronym = false;
    }

    while(!rd.Eos() && isSpace((uchar_t)ch)==0) {
        ch = rd.GetNext();
        if( Misc::isLetter(ch)!=0 && append) {
            val.append( ch );
        } else if( isDigit(ch)!=0 && append) {
            // Acronyms can't contain numbers.
            maybeAcronym = false;
            // Check for number.
            hasDigit = true;
            val.append( ch );
        } else if(ch == '_' || ch == '-' || ch == '/' || ch == ',' || (ch == '.' && append)) {
            if(ch != '.') {
                maybeHost = false;
                maybeAcronym = false;
            }
            if( val.length() == 0) {
                append = false;
                continue;
            } else {
                // Track whether digit content alternates between segments.
                maybeNumber = (prevHasDigit != hasDigit);
                prevHasDigit = hasDigit;
                val.prepend(str.getBuffer());
                return ReadNumber(val, ch);
            }
        } else
            append = false;
    } // end of while

    if(maybeAcronym && val.length() == 0){
        val.prepend(str.getBuffer());
        return new Token(val.getBuffer(), start, rd.Column(),
                         tokenImage[lucene::analysis::standard::ACRONYM]);
    }else if(maybeHost){
        val.prepend(str.getBuffer());
        return new Token(val.getBuffer(), start, rd.Column(),
                         tokenImage[lucene::analysis::standard::HOST]);
    }else if(maybeNumber){
        val.prepend(str.getBuffer());
        return new Token(val.getBuffer(), start, rd.Column(),
                         tokenImage[lucene::analysis::standard::NUM]);
    }else {
        // Fall back to an ALPHANUM token, truncated at the first separator.
        int_t idx = stringCSpn( str.getBuffer(), _T("_-./,") );
        str.len = idx;
        return new Token(str.getBuffer(), start, rd.Column(),
                         tokenImage[lucene::analysis::standard::ALPHANUM]);
    }
}
}}}
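/* Usage sketch (illustrative only, not part of the tokenizer itself).
   Assuming the StringReader utility from CLucene/util/Reader.h and the
   Token accessors declared in AnalysisHeader.h behave as elsewhere in this
   codebase, driving the tokenizer might look like this:

       StringReader reader( _T("visit www.som.com or mail somebody@somewhere.else.com") );
       StandardTokenizer tokenizer(reader);
       Token* t;
       while( (t = tokenizer.next()) != NULL ) {
           // t carries the term text, its start/end columns and its
           // tokenImage type (ALPHANUM, HOST, EMAIL, ...); the caller is
           // assumed to own and delete the returned Token.
           delete t;
       }
       tokenizer.close();
*/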