CS267
Chris Pollett
Apr 4, 2022
buildIndex_mergeBase(inputTokenizer, memoryLimit) { n := 0; position := 0; memoryConsumption := 0; while (inputTokenizer.hasNext()) { T := inputTokenizer.getNext(); obtain dictionary entry for T; create new entry if necessary; append new position to T's posting list; position++; memoryConsumption++; if (memoryConsumption > memoryLimit) { createIndexPartition(); } } if (memoryConsumption > 0) { createIndexPartition(); } merge index partitions I[0],...,I[n-1] to make final index I_final; } createIndexPartition() { create empty on-disk inverted file I[n]; sort in-memory dictionary entries in lex order; for each term T in dictionary { add T's posting list to I[n]; } delete all in-memory posting lists; write the dictionary to disk; reset the in-memory dictionary; memoryConsumption := 0; n++; } mergeIndexPartitions([I[0], ..., I[n-1]]) { create empty Inverted File I_final; for (k = 0; k < n; k++) { open partition I[k] for sequential processing; } currentIndex := I[0]; // anything other than nil so we go through the loop at least once while (currentIndex != nil) { currentIndex := nil; for (k = 0; k < n; k++) { if (I[k] still has terms left) { if (currentIndex == nil || I[k].currentTerm < currentTerm) { currentIndex := I[k]; currentTerm := I[k].currentTerm; } } } if (currentIndex != nil) { I_final.addPostings(currentTerm, currentIndex.getPostings(currentTerm)); currentIndex.advanceToNextTerm(); } } delete I[0], ..., I[n-1]; }
Which of the following is true?
We can overcome the two limitations of our first algorithm for ranked retrieval by using two heaps: one to manage the query terms and, for each term t, keep track of the next document that contains t; the other one to maintain the set of the top `k` search results seen so far:
rankBM25_DocumentAtATime_WithHeaps((t[1], .. t[n]), k) { // create a min-heap for top k results for(i = 1 to k) { results[i].score := 0; } // create a min-heap for terms for (i = 1 to n) { terms[i].term := t[i]; terms[i].nextDoc := nextDoc(t[i], -infty); } sort terms in increasing order of nextDoc; while (terms[1].nextDoc < infty) { d := terms[1].nextDoc; score := 0; while(terms[1].nextDoc == d) { t := terms[1].term; score += log(N/N_t)*TF_BM25(t,d); terms[1].nextDoc := nextDoc(t,d); REHEAP(terms); // restore heap property for terms } if(score > results[1].score) { results[1].docid := d; results[1].score := score; REHEAP(results); // restore the heap property for results } } remove from results all items with score = 0; sort results in decreasing order of score; return results; }
The complexity of this algorithm is `\Theta(N_q \cdot \log(n) + N_q \cdot \log(k))`, where `N_q` is the total number of postings processed for the query terms: each posting costs one `\log(n)` operation on the term heap, and each candidate document costs at most one `\log(k)` operation on the results heap.