CS267
Chris Pollett
Oct 25, 2023
We can overcome the two limitations of our first algorithm for ranked retrieval by using two heaps: one to manage the query terms and, for each term t, keep track of the next document that contains t; the other one to maintain the set of the top `k` search results seen so far:
// Document-at-a-time BM25 ranking using two min-heaps.
//   (t[1], .. t[n]) : the query terms
//   k               : number of top-scoring results to return
// Returns up to k (docid, score) entries, sorted by decreasing score.
// `terms` is a min-heap keyed on nextDoc, so terms[1] is always the term whose
// next unprocessed document has the smallest docid.
// `results` is a min-heap keyed on score, so results[1] is always the weakest
// of the current top-k candidates.
rankBM25_DocumentAtATime_WithHeaps((t[1], .. t[n]), k) {
    // create a min-heap for top k results
    for (i = 1 to k) {
        results[i].score := 0;
    }
    // create a min-heap for terms
    for (i = 1 to n) {
        terms[i].term := t[i];
        terms[i].nextDoc := nextDoc(t[i], -infty);
    }
    sort terms in increasing order of nextDoc;
    while (terms[1].nextDoc < infty) {
        d := terms[1].nextDoc;
        score := 0;
        // accumulate the contribution of every query term that occurs in d
        while (terms[1].nextDoc == d) {
            t := terms[1].term;
            score += log(N/N_t) * TFBM25(t, d);
            terms[1].nextDoc := nextDoc(t, d); // advance t past d
            REHEAP(terms); // restore the heap property for terms
        }
        // d enters the top k only if it beats the current weakest result
        if (score > results[1].score) {
            results[1].docid := d;
            results[1].score := score;
            REHEAP(results); // restore the heap property for results
        }
    }
    remove from results all items with score = 0; // unused placeholder slots
    sort results in decreasing order of score;
    return results;
}
The complexity of this algorithm is `Theta(N_q cdot log(n) + N_q cdot log(k))`, where `N_q = N_(t_1) + ... + N_(t_n)` is the total number of postings for all query terms.
// Term-at-a-time BM25 ranking.
//   (t[1], t[2], ..., t[n]) : the query terms
//   k                       : number of top-scoring results to return
// Terms are processed one at a time, rarest first. For each term, the
// accumulator list from the previous round (acc) is merged with the term's
// posting list into a new list (acc'), both ordered by docid. Each list ends
// with a sentinel entry whose docid is infty.
rankBM25_TermAtATime((t[1], t[2], ..., t[n]), k) {
    sort(t) in increasing order of N[t[i]];
    acc := {}, acc' := {}; // initialize accumulators:
                           // acc is used for the previous round, acc' for the next
    acc[0].docid := infty; // end-of-list marker
    for i := 1 to n do {
        inPos := 0;  // current position in acc
        outPos := 0; // current position in acc'
        foreach document d in t[i]'s posting list do {
            while acc[inPos].docid < d do {
                // copy previous round to current for docs not containing t[i]
                acc'[outPos++] := acc[inPos++];
            }
            acc'[outPos].docid := d;
            acc'[outPos].score := log(N/N[t[i]]) * TFBM25(t[i], d);
            if (acc[inPos].docid == d) {
                // d already has an accumulator: add its earlier score
                acc'[outPos].score += acc[inPos++].score;
            }
            outPos++;
        }
        while acc[inPos].docid < infty do { // copy remaining acc to acc'
            acc'[outPos++] := acc[inPos++];
        }
        acc'[outPos].docid := infty; // end-of-list marker
        swap acc and acc';
    }
    return the top k items of acc; // select using heap
}
// Term-at-a-time BM25 ranking with accumulator pruning.
//   (t[1], t[2], ..., t[n]) : the query terms
//   k                       : number of top-scoring results to return
//   amax                    : maximum number of accumulators allowed
//   u                       : how often (in postings seen) the threshold T is recomputed
// Terms are processed rarest first. While the accumulator quota lasts, new
// accumulators are created; once it is exhausted, only existing accumulators
// are updated.
rankBM25_TermAtATimeWithPruning((t[1], t[2], ..., t[n]), k, amax, u) {
    // max_f is bounded above by a maximum number max_terms of terms allowed in a document.
    // (assume there is some doc length after which we truncate a document to that length)
    sort(t) in increasing order of N[t[i]];
    acc := {}, acc' := {}; // initialize accumulators.
    acc[0].docid := infty; // end-of-list marker
    for i := 1 to n do {
        max_f := 0;
        quotaLeft := amax - length(acc); // the remaining accumulator quota
        if (N[t[i]] <= quotaLeft) { // plenty o' accumulators
            // do as we did in rankBM25_TermAtATime
            inPos := 0;  // current position in acc
            outPos := 0; // current position in acc'
            foreach document d in t[i]'s posting list do {
                while acc[inPos].docid < d do {
                    // copy previous round to current for docs not containing t[i]
                    acc'[outPos++] := acc[inPos++];
                }
                acc'[outPos].docid := d;
                acc'[outPos].score := log(N/N[t[i]]) * TFBM25(t[i], d);
                if (acc[inPos].docid == d) {
                    acc'[outPos].score += acc[inPos++].score;
                }
                outPos++;
            }
        } else if (quotaLeft == 0) { // no accumulators left
            // Update the existing accumulators in place. acc is 0-indexed and
            // length(acc) is assumed to count only real accumulators, not the
            // infty end-of-list marker.
            for j := 0 to length(acc) - 1 do {
                acc[j].score := acc[j].score + log(N/N[t[i]]) * TFBM25(t[i], acc[j].docid);
            }
            // acc' was not built this round, so skip the merge tail and the swap below.
            continue;
        } else { // still have some accumulators
            for j := 1 to max_terms do { tfStats[j] := 0; } // initialize TF stats
            T := 1; // init threshold for new accumulators
            postingsSeen := 0;
            inPos := 0;  // current position in acc
            outPos := 0; // current position in acc'
            foreach document d in t[i]'s posting list do {
                while acc[inPos].docid < d do {
                    // copy previous round to current for docs not containing t[i]
                    acc'[outPos++] := acc[inPos++];
                }
                if (acc[inPos].docid == d) {
                    acc'[outPos].docid := d;
                    acc'[outPos++].score := acc[inPos++].score + log(N/N[t[i]]) * TFBM25(t[i], d);
                } else if (quotaLeft > 0) {
                    if (f[t[i],d] >= T) { // if happens, make new accumulator
                        acc'[outPos].docid := d;
                        acc'[outPos++].score := log(N/N[t[i]]) * TFBM25(t[i], d);
                        quotaLeft--;
                    }
                    tfStats[f[t[i],d]]++;
                    if (f[t[i],d] > max_f) {
                        max_f := f[t[i],d]; // update largest observed frequency
                    }
                }
                postingsSeen++;
                if (postingsSeen % u == 0) {
                    // q scales the TF counts observed so far to the postings not yet seen
                    q := (N[t[i]] - postingsSeen)/postingsSeen;
                    T := argmin_x{x in Nat| sum_(j=x)^{max_f}(tfStats[j] * q) < quotaLeft};
                }
            }
        }
        // Merge tail: reached only by the two branches that built acc'.
        while acc[inPos].docid < infty do { // copy remaining acc to acc'
            acc'[outPos++] := acc[inPos++];
        }
        acc'[outPos].docid := infty; // end-of-list marker
        swap acc and acc';
    }
    return the top k items of acc; // select using heap
}
Given a list of ordered pairs, `S`, suggest pseudo-code to compute `G(S)`. What is the runtime of your code?
Post your solutions to the Oct 25 In-Class Exercise.