#include "CLucene/StdHeader.h"
#include "FuzzyQuery.h"

#ifndef NO_FUZZY_QUERY

namespace lucene{ namespace search{

  /**
  * Builds an enumeration over the terms of `term`'s field that are similar
  * to `term`'s text.
  *
  * @param reader index reader whose terms are enumerated.
  * @param term   the search term; it is NOT finalized by this object (see the
  *               destructor notes). `field` and `text` point into its memory.
  */
  FuzzyTermEnum::FuzzyTermEnum(IndexReader& reader, Term* term):
      FilteredTermEnum(reader,term),
      searchTerm(term),
      field(term->Field()),
      /* DSR:PROPOSED: I don't see why text should be duplicated when field is not: */
      /* text( stringDuplicate(term->Text()) ),*/
      text(term->Text()),
      textlen(stringLength(text)),
      distance(0),
      fieldMatch(false),
      endEnum(false),
      eWidth(0),
      eHeight(0)
  {
      // The scratch matrix is allocated lazily by editDistance(). It must start
      // out null because the destructor deletes it unconditionally; previously
      // it was left uninitialized, so destroying an enum that never computed a
      // distance deleted an indeterminate pointer.
      e = 0;

      setFuzzyThreshold(0.5);

      // Start the underlying enumeration at the first term of the field.
      Term* t = new Term(field, _T(""));
      TermEnum* terms = &reader.getTerms(t);
      setEnum( terms );
      /* DSR:CL_BUG_LEAK: In CL 0.8.11, t was never finalized. */
      t->finalize();
  }

  /** Releases the edit-distance scratch matrix. */
  FuzzyTermEnum::~FuzzyTermEnum(){
      /* DSR:
      ** - searchTerm is not finalized because the FuzzyQuery that created this
      **   owns the reference (it's finalized in FuzzyQuery::~FuzzyQuery).
      ** - field is not deleted because the memory belongs to searchTerm.
      ** - text is not deleted because the memory belongs to searchTerm.
      */
      /* DSR:CL_BUG_LEAK: In CL 0.8.11, e was not deleted. */
      delete[] e;   // no-op while e is still null
  }

  /** @return true once termCompare() has seen a term from a different field. */
  bool FuzzyTermEnum::EndEnum() {
      return endEnum;
  }

  /** Closes the enumeration by delegating to the base class. */
  void FuzzyTermEnum::close(){
      FilteredTermEnum::close();
  }

  /**
  * Sets the similarity a term must exceed to be accepted, and derives the
  * factor used by difference() to rescale the excess over the threshold.
  *
  * @param value new threshold; SCALE_FACTOR becomes 1 / (1 - value).
  */
  void FuzzyTermEnum::setFuzzyThreshold(float_t value){
      FUZZY_THRESHOLD = value;
      SCALE_FACTOR = 1.0f / (1.0f - FUZZY_THRESHOLD);
  }

  /**
  * Decides whether `term` is similar enough to the search text.
  *
  * For a term in the searched field, `distance` is set to
  * 1 - editDistance / min(length of search text, length of term text) and the
  * term is accepted when that value exceeds FUZZY_THRESHOLD. A term from any
  * other field sets `endEnum` and is rejected.
  *
  * @param term candidate term from the underlying enumeration.
  * @return true if the term is accepted.
  */
  bool FuzzyTermEnum::termCompare(Term* term) {
      if ( stringCompare(field, term->Field())==0 ) {
          // The candidate text is only read, so it is used in place instead of
          // being duplicated and freed for every term.
          const char_t* target = term->Text();
          /* DSR:PROPOSED: make const */
          const int_t targetlen = stringLength(target);

          // An empty string on either side would make the divisor zero. Such a
          // term is never accepted, so reject it without dividing.
          const int_t shorter = min(textlen, targetlen);
          if ( shorter == 0 ) {
              distance = 0;
              return false;
          }

          int_t dist = editDistance(text, target, textlen, targetlen);
          distance = 1 - (static_cast<double>(dist) / static_cast<double>(shorter));
          return (distance > FUZZY_THRESHOLD);
      }
      endEnum = true;
      return false;
  }

  /**
  * @return how far the last computed `distance` lies above FUZZY_THRESHOLD,
  *         multiplied by SCALE_FACTOR.
  */
  float_t FuzzyTermEnum::difference() {
      return static_cast<float_t>((distance - FUZZY_THRESHOLD) * SCALE_FACTOR);
  }

  //static
  /** @return the smallest of the three arguments. */
  int_t FuzzyTermEnum::Min(const int_t a, const int_t b, const int_t c){
      int_t t = (a < b) ? a : b;
      return (t < c) ? t : c;
  }

  /* DSR:PROPOSED: Make args const: */
  /**
  * Computes the edit distance between two strings, where an insertion, a
  * deletion and a substitution each cost 1.
  *
  * The dynamic-programming matrix is kept in the member buffer `e` (row stride
  * `eWidth`) and reused between calls; it only ever grows.
  *
  * @param s first string.
  * @param t second string.
  * @param n number of characters of `s` to consider.
  * @param m number of characters of `t` to consider.
  * @return the edit distance; `m` if `n` is 0 and `n` if `m` is 0.
  */
  int_t FuzzyTermEnum::editDistance(const char_t* s, const char_t* t, const int_t n, const int_t m) {
      int_t i;     // iterates through s
      int_t j;     // iterates through t
      char_t s_i;  // ith character of s

      if (n == 0) return m;
      if (m == 0) return n;

      if (eWidth <= n || eHeight <= m) {
          // Grow the matrix. Allocate the replacement before releasing the old
          // buffer and before touching the stored dimensions, so that a failed
          // allocation leaves e, eWidth and eHeight consistent.
          const int_t newWidth  = max(eWidth, n+1);
          const int_t newHeight = max(eHeight, m+1);
          int_t* grown = new int_t[newWidth*newHeight];
          delete[] e;
          e = grown;
          eWidth = newWidth;
          eHeight = newHeight;
      }
      int_t* d = e; // matrix, cell (i,j) lives at d[i + j*eWidth]

      // init matrix d: distance from the empty prefix
      for (i = 0; i <= n; i++) d[i + (0*eWidth)] = i;
      for (j = 0; j <= m; j++) d[0 + (j*eWidth)] = j;

      // start computing edit distance
      for (i = 1; i <= n; i++) {
          s_i = s[i - 1];
          for (j = 1; j <= m; j++) {
              if (s_i != t[j-1])
                  // characters differ: every move, including the diagonal, costs 1
                  d[i + (j*eWidth)] = Min(d[i-1 + (j*eWidth)],
                                          d[i + ((j-1)*eWidth)],
                                          d[i-1 + ((j-1)*eWidth)])+1;
              else
                  // characters match: the diagonal move is free
                  d[i + (j*eWidth)] = Min(d[i-1 + (j*eWidth)]+1,
                                          d[i + ((j-1)*eWidth)]+1,
                                          d[i-1 + ((j-1)*eWidth)]);
          }
      }

      // we got the result!
      return d[n + (m*eWidth)];
  }

  /**
  * Creates a fuzzy query for `term`.
  *
  * @param term term to match approximately; the handle obtained from
  *             term->pointer() is kept and finalized in the destructor.
  */
  FuzzyQuery::FuzzyQuery(Term* term):
      MultiTermQuery(term),
      fuzzyTerm(term->pointer())
  {
      MultiTermQuery::LUCENE_STYLE_TOSTRING = true;
  }

  /** Finalizes the term handle taken in the constructor. */
  FuzzyQuery::~FuzzyQuery(){
      fuzzyTerm->finalize();
  }

  /**
  * Installs a FuzzyTermEnum over `reader` as this query's term enumeration.
  *
  * @param reader index reader to enumerate.
  */
  void FuzzyQuery::prepare(IndexReader& reader) {
      try {
          setEnum(new FuzzyTermEnum(reader, fuzzyTerm));
      } catch (...) {
          // Any failure is swallowed and the query is left without a new enum.
          // NOTE(review): presumably deliberate best-effort behaviour; confirm
          // whether callers would rather see the error.
      }
  }

  /**
  * @param field field name forwarded to MultiTermQuery::toString.
  * @return the base class's string form followed by "~", as produced by
  *         StringBuffer::ToString().
  */
  const char_t* FuzzyQuery::toString(const char_t* field){
      StringBuffer buffer;
      const char_t* b = MultiTermQuery::toString(field);
      buffer.append ( b );
      delete[] b;   // the base-class string is released here once copied
      buffer.append( _T("~") );
      return buffer.ToString();
  }

  /** @return the literal name "FuzzyQuery". */
  const char_t* FuzzyQuery::getQueryName() const{
      return _T("FuzzyQuery");
  }
}}
#endif