CS267
Chris Pollett
Apr 4, 2022
// Merge-based index construction: accumulate postings in memory until
// memoryLimit is reached, flush each batch to disk as a sorted partition,
// then merge all partitions into the final inverted index.
//   inputTokenizer - stream of tokens from the document collection
//   memoryLimit    - maximum number of postings held in memory at once
buildIndex_mergeBase(inputTokenizer, memoryLimit)
{
    n := 0;                  // number of partitions written so far
    position := 0;           // current token position in the input stream
    memoryConsumption := 0;  // postings accumulated since the last flush
    while (inputTokenizer.hasNext()) {
        T := inputTokenizer.getNext();
        obtain dictionary entry for T;
        create new entry if necessary;
        append new position to T's posting list;
        position++;
        memoryConsumption++;
        if (memoryConsumption > memoryLimit) {
            createIndexPartition(); // flush: resets memoryConsumption, bumps n
        }
    }
    // Flush whatever is still in memory once the input is exhausted.
    if (memoryConsumption > 0) {
        createIndexPartition();
    }
    mergeIndexPartitions([I[0], ..., I[n-1]]); // produces I_final
}
// Flush the in-memory index to disk as partition I[n], then reset the
// in-memory state so indexing can continue. Relies on the caller's
// globals n (partition counter) and memoryConsumption (memory budget).
createIndexPartition()
{
    create empty on disk inverted file I[n];
    // Writing terms in lexicographic order lets the later n-way merge
    // process every partition with a single sequential scan.
    sort in-memory dictionary entries in lex order;
    for each term T in dictionary {
        add T's posting list to I[n];
    }
    delete all in memory posting lists;
    write the dictionary to disk;
    reset the in-memory dictionary;
    memoryConsumption := 0; // memory budget starts fresh for the next batch
    n++;                    // the next flush goes to a new partition
}
// n-way merge of the on-disk partitions I[0..n-1] into a single inverted
// file I_final. Each partition stores its terms in lexicographic order, so
// repeatedly emitting the smallest current term across all partitions
// visits every term in sorted order. A term present in several partitions
// is handled over several iterations; ties pick the lowest partition index
// first (strict < below), so its postings are appended in partition order.
mergeIndexPartitions([I[0], ..., I[n-1]])
{
create empty Inverted File I_final;
for (k = 0; k < n; k++) {
open partition I[k] for sequential processing;
}
currentIndex := I[0];// anything other than nil so go through loop once
while (currentIndex != nil) {
currentIndex = nil;
// Linear scan for the partition whose current term is smallest.
// currentTerm may hold a stale value from the previous iteration; the
// short-circuit || ensures it is only compared after currentIndex (and
// hence currentTerm) has been set during this scan.
for (k = 0; k < n; k++) {
if (I[k] still has terms left) {
if (currentIndex == nil ||
I[k].currentTerm < currentTerm) {
currentIndex := I[k];
currentTerm := I[k].currentTerm;
}
}
}
// currentIndex == nil here means every partition is exhausted,
// which terminates the while loop.
if (currentIndex != nil) {
I_final.addPostings(currentTerm,
currentIndex.getPostings(currentTerm));
currentIndex.advanceToNextTerm();
}
}
delete I[0], ..., I[n-1];
}
Which of the following is true?
We can overcome the two limitations of our first algorithm for ranked retrieval by using two heaps: one to manage the query terms, tracking for each term t the next document that contains t; the other to maintain the set of the top `k` search results seen so far:
// Document-at-a-time BM25 ranking with two min-heaps:
//   terms   - min-heap keyed on nextDoc, so terms[1] always names the
//             smallest docid not yet scored;
//   results - min-heap keyed on score, so results[1] is the weakest of the
//             current top-k candidates and can be evicted in O(log k).
//   (t[1], .. t[n]) - the query terms; k - number of results to return.
rankBM25_DocumentAtATime_WithHeaps((t[1], .. t[n]), k) {
    // create a min-heap for top k results; score 0 marks an empty slot
    for(i = 1 to k) {
        results[i].score := 0;
    }
    // create a min-heap for terms, keyed on each term's next docid
    for (i = 1 to n) {
        terms[i].term := t[i];
        terms[i].nextDoc := nextDoc(t[i], -infty);
    }
    sort terms in increasing order of nextDoc; // establishes the heap property
    while (terms[1].nextDoc < infty) {
        d := terms[1].nextDoc; // smallest unscored docid among all terms
        score := 0;
        // Accumulate the BM25 contribution of every query term occurring in d.
        while(terms[1].nextDoc == d) {
            t := terms[1].term;
            score += log(N/N_t)*TF_BM25(t,d); // IDF weight times BM25 TF component
            terms[1].nextDoc := nextDoc(t,d);
            REHEAP(terms); // restore heap property for terms
        }
        // Keep d only if it beats the weakest current top-k candidate.
        if(score > results[1].score) {
            results[1].docid := d;
            results[1].score := score;
            REHEAP(results); // restore the heap property for results
        }
    }
    remove from results all items with score = 0;
    sort results in decreasing order of score;
    return results;
}
The complexity of this algorithm is `\Theta(N_q \cdot \log(n) + N_q \cdot \log(k))`.