#include "CLucene/StdHeader.h" #ifndef CLUCENE_LITE #include "DocumentWriter.h" #include "CLucene/util/VoidMap.h" #include "CLucene/store/Directory.h" #include "CLucene/document/Document.h" #include "CLucene/document/Field.h" #include "FieldInfos.h" #include "Term.h" #include "TermInfo.h" #include "CLucene/analysis/AnalysisHeader.h" #include "CLucene/util/VoidMap.h" #include "CLucene/search/Similarity.h" #include "TermInfosWriter.h" #include "FieldsWriter.h" //#include //for DEBUG using namespace std; using namespace lucene::util; namespace lucene{ namespace index { /*Posting*/ int_t Posting::getPositionsLength(){ return positionsLength; } Posting::Posting(Term& t, const int_t position): term (*t.pointer()) { freq = 1; positions = new int_t[1]; positionsLength = 1; positions[0] = position; } Posting::~Posting(){ delete[] positions; term.finalize(); } /*DocumentWriter =======================*/ //static const char_t* DocumentWriter::segmentname(const char_t* segment, const char_t* ext ){ char_t* buf = new char_t[MAX_PATH]; stringPrintF(buf,_T("%s%s"), segment,ext ); return buf; } DocumentWriter::DocumentWriter(lucene::store::Directory& d, lucene::analysis::Analyzer& a, const int_t mfl): analyzer(a), directory(d), termBuffer(*new Term( _T(""), _T("") ,true)),// avoid consing maxFieldLength(mfl), fieldInfos(NULL), fieldLengths(NULL) { } DocumentWriter::~DocumentWriter(){ clearPostingTable(); if ( fieldInfos != NULL ) delete fieldInfos; if ( fieldLengths != NULL ) delete[] fieldLengths; termBuffer.finalize(); } void DocumentWriter::clearPostingTable(){ map::iterator itr = postingTable.begin(); while ( itr != postingTable.end() ){ delete itr->second; itr->first->finalize(); itr++; } postingTable.clear(); } void DocumentWriter::addDocument(const char_t* segment, lucene::document::Document& doc) { // write field names fieldInfos = new FieldInfos(); fieldInfos->add(doc); const char_t* buf = segmentname(segment, _T(".fnm")); fieldInfos->write(directory, buf); delete[] buf; // write field values FieldsWriter fieldsWriter(directory, segment, *fieldInfos); _TRY { fieldsWriter.addDocument(doc); } _FINALLY( fieldsWriter.close() ); // invert doc into postingTable clearPostingTable(); // clear postingTable fieldLengths = new int_t[fieldInfos->size()]; // init fieldLengths for ( int_t i=0;isize();i++ ) fieldLengths[i] = 0; invertDocument(doc); // sort postingTable into an array Posting** postings = NULL; int_t postingsLength = 0; sortPostingTable(postings,postingsLength); //DEBUG: /*for (int_t i = 0; i < postingsLength; i++) { Posting* posting = postings[i]; char_t* b = posting->term.toString(); _cout << b << " freq=" << posting->freq; delete b; _cout << " pos=" << posting->positions[0]; for (int_t j = 1; j < posting->freq; j++) _cout <<"," << posting->positions[j]; _cout << endl; }*/ // write postings writePostings(postings,postingsLength, segment); // write norms of indexed fields writeNorms(doc, segment); delete[] postings; } void DocumentWriter::sortPostingTable(Posting**& array, int_t& arraySize) { // copy postingTable into an array arraySize = postingTable.size(); array = new Posting*[arraySize]; map::iterator postings = postingTable.begin(); int_t i=0; while ( postings != postingTable.end() ){ array[i] = (Posting*)postings->second; postings++; i++; } // sort the array quickSort(array, 0, i - 1); } // Tokenizes the fields of a document into Postings. 
// Tokenizes the fields of a document into Postings.
void DocumentWriter::invertDocument(lucene::document::Document& doc){
	lucene::document::DocumentFieldEnumeration* fields = doc.fields();
	while (fields->hasMoreElements()) {
		lucene::document::Field* field = (lucene::document::Field*)fields->nextElement();
		const char_t* fieldName = field->Name();
		int_t fieldNumber = fieldInfos->fieldNumber(fieldName);

		int_t position = fieldLengths[fieldNumber];	// position in field

		if (field->IsIndexed()) {
			if (!field->IsTokenized()) {		// un-tokenized field
				//FEATURE: this is a bug in java too: if using a Reader this fails
				if ( field->StringValue() == NULL ){
					// read the whole Reader and index its contents as one token
					lucene::util::Reader* r = field->ReaderValue();
					int_t rp = r->position();
					r->seek(0);
					int_t rl = r->available();
					char_t* rv = new char_t[rl+1];
					r->read(rv, 0, rl);
					rv[rl] = 0;
					addPosition(fieldName, rv, position++);
					delete[] rv;
					r->seek(rp); //reset position
				} else
					addPosition(fieldName, field->StringValue(), position++);
			} else {
				lucene::util::Reader* reader;	// find or make Reader
				bool delReader = false;
				if (field->ReaderValue() != NULL)
					reader = field->ReaderValue();
				else if (field->StringValue() != NULL){
					reader = new lucene::util::StringReader(field->StringValue());
					delReader = true;
				} else
					_THROWC( "field must have either String or Reader value" );

				// Tokenize field and add to postingTable
				lucene::analysis::TokenStream& stream = analyzer.tokenStream(fieldName, reader);
				_TRY {
					lucene::analysis::Token* t;
					for ( t = (lucene::analysis::Token*)stream.next();
							t != NULL;
							t = (lucene::analysis::Token*)stream.next()) {
						addPosition(fieldName, t->TermText(), position++);
						if (position > maxFieldLength)
							break;	//break before delete; the delete below frees this token
						delete t;
					}
					delete t;	// t is NULL on normal exit, so this only frees the break case
				} _FINALLY (
					stream.close();
					delete &stream;
					if ( delReader ){
						reader->close();
						delete reader;
					}
				);
			}
			fieldLengths[fieldNumber] = position;	// save field length
		}
	}
	delete fields;
}

void DocumentWriter::addPosition(const char_t* field, const char_t* text, const int_t position) {
	termBuffer.set( stringDuplicate(field), stringDuplicate(text), false );

	Posting* ti = postingTable.get(&termBuffer);
	if (ti != NULL) {				// word seen before
		int_t freq = ti->freq;
		if (ti->getPositionsLength() == freq) {	// positions array is full
			int_t* newPositions = new int_t[freq * 2];	// double size
			int_t* positions = ti->positions;
			for (int_t i = 0; i < freq; i++)	// copy old positions to new
				newPositions[i] = positions[i];
			delete[] ti->positions;
			ti->positions = newPositions;
			ti->positionsLength = freq * 2;
		}
		ti->positions[freq] = position;		// add new position
		ti->freq = freq + 1;			// update frequency
	} else {					// word not seen before
		Term* term = new Term( field, text );
		postingTable.put(term->pointer(), new Posting(*term, position));
		term->finalize();
	}
}
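/* quickSort below orders postings[lo..hi] by Term::compareTo. The three
   conditional swaps place the median of postings[lo], postings[mid] and
   postings[hi] at mid before partitioning (median-of-three pivot selection),
   which avoids quadratic behavior on already-sorted input. Illustrative
   trace: with three terms [c, a, b] the swaps alone already produce
   [a, b, c], and the function returns before the partitioning loop runs. */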
//static
void DocumentWriter::quickSort(Posting**& postings, const int_t lo, const int_t hi) {
	if (lo >= hi)
		return;

	int_t mid = (lo + hi) / 2;

	if (postings[lo]->term.compareTo(postings[mid]->term) > 0) {
		Posting* tmp = postings[lo];
		postings[lo] = postings[mid];
		postings[mid] = tmp;
	}

	if (postings[mid]->term.compareTo(postings[hi]->term) > 0) {
		Posting* tmp = postings[mid];
		postings[mid] = postings[hi];
		postings[hi] = tmp;

		if (postings[lo]->term.compareTo(postings[mid]->term) > 0) {
			Posting* tmp2 = postings[lo];
			postings[lo] = postings[mid];
			postings[mid] = tmp2;
		}
	}

	int_t left = lo + 1;
	int_t right = hi - 1;

	if (left >= right)
		return;

	const Term& partition = postings[mid]->term; //not kept, so no need to finalize

	for ( ;; ) {
		while (postings[right]->term.compareTo(partition) > 0)
			--right;
		while (left < right && postings[left]->term.compareTo(partition) <= 0)
			++left;

		if (left < right) {
			Posting* tmp = postings[left];
			postings[left] = postings[right];
			postings[right] = tmp;
			--right;
		} else {
			break;
		}
	}

	quickSort(postings, lo, left);
	quickSort(postings, left + 1, hi);
}

void DocumentWriter::writePostings(Posting** postings, const int_t postingsLength, const char_t* segment){
	OutputStream* freq = NULL;
	OutputStream* prox = NULL;
	TermInfosWriter* tis = NULL;

	_TRY {
		const char_t* buf = segmentname( segment, _T(".frq"));
		freq = &directory.createFile( buf );
		delete[] buf;

		buf = segmentname( segment, _T(".prx"));
		prox = &directory.createFile( buf );
		delete[] buf;

		tis = new TermInfosWriter(directory, segment, *fieldInfos);

		TermInfo* ti = new TermInfo();
		for (int_t i = 0; i < postingsLength; i++) {
			const Posting* posting = postings[i];

			// add an entry to the dictionary with pointers to prox and freq files
			ti->set(1, freq->getFilePointer(), prox->getFilePointer());
			tis->add(posting->term, *ti);

			// add an entry to the freq file
			int_t f = posting->freq;
			if (f == 1)			// optimize freq=1
				freq->writeVInt(1);	// set low bit of doc num.
			else {
				freq->writeVInt(0);	// the document number
				freq->writeVInt(f);	// frequency in doc
			}

			int_t lastPosition = 0;		// write positions
			int_t* positions = posting->positions;
			for (int_t j = 0; j < f; j++) {	// use delta-encoding
				int_t position = positions[j];
				prox->writeVInt(position - lastPosition);
				lastPosition = position;
			}
		}
		delete ti;
	} _FINALLY (
		if (freq != NULL) { freq->close(); delete freq; }
		if (prox != NULL) { prox->close(); delete prox; }
		if (tis  != NULL) { tis->close();  delete tis;  }
	);
}

void DocumentWriter::writeNorms(lucene::document::Document& doc, const char_t* segment) {
	lucene::document::DocumentFieldEnumeration* fields = doc.fields();
	while (fields->hasMoreElements()) {
		lucene::document::Field* field = (lucene::document::Field*)fields->nextElement();
		if (field->IsIndexed()) {
			int_t fieldNumber = fieldInfos->fieldNumber(field->Name());

			char_t* fb = new char_t[MAX_PATH];
			stringPrintF(fb, _T("%s.f%d"), segment, fieldNumber);
			OutputStream& norm = directory.createFile(fb);
			delete[] fb;

			_TRY {
				norm.writeByte(lucene::search::Similarity::normb(fieldLengths[fieldNumber]));
			} _FINALLY (
				norm.close();
				delete &norm;
			);
		}
	}
	delete fields;
}

}}
#endif
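/* Worked example of the postings encoding above (illustrative, not from the
   original source). A term with freq=3 at positions [3, 7, 10] is written as
   the VInts 0, 3 to the .frq file (doc number, then frequency) and as the
   deltas 3, 4, 3 to the .prx file (3-0, 7-3, 10-7); a freq=1 term collapses
   to the single VInt 1 in the .frq file.

   Usage sketch, assuming only the interface visible in this file (the
   segment name "_seg1" and the maxFieldLength of 10000 are made up):

       DocumentWriter writer(directory, analyzer, 10000);
       writer.addDocument(_T("_seg1"), doc);

   This writes _seg1.fnm (field names), the stored field values via
   FieldsWriter, _seg1.frq and _seg1.prx (postings), the term dictionary via
   TermInfosWriter, and one _seg1.fN norm byte per indexed field. */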