CS267
Chris Pollett
Apr 6, 2022
// rankBM25_TermAtATime: term-at-a-time BM25 scoring.
//
// Processes the query terms one posting list at a time, merging each list
// into a docid-ordered array of accumulators (docid, score).
//
// Parameters:
//   (t[1], ..., t[n]) - the query terms
//   k                 - number of top-scoring accumulators to return
//
// Uses from the enclosing context: N, N[t], TFBM25(t, d), and each term's
// posting list (presumably sorted by increasing docid -- the merge below
// relies on that order).
//
// Returns: the top k items of the final accumulator array.
rankBM25_TermAtATime((t[1], t[2], ..., t[n]), k) {
    sort(t) in increasing order of N[t[i]];

    // Initialize accumulators.
    // acc holds the result of the previous round, acc' receives the next.
    acc := {}, acc' := {};
    acc[0].docid := infty;  // end-of-list marker

    for i := 1 to n do {
        inPos := 0;   // current position in acc
        outPos := 0;  // current position in acc'

        foreach document d in t[i]'s posting list do {
            // Copy previous round to current for docs not containing t[i].
            // The end-of-list marker (infty) guarantees termination.
            while acc[inPos].docid < d do {
                acc'[outPos++] := acc[inPos++];
            }

            // Start a fresh accumulator for d with this term's contribution.
            acc'[outPos].docid := d;
            acc'[outPos].score := log(N/N[t[i]]) * TFBM25(t[i], d);

            // If d already had an accumulator, fold in its earlier score.
            if (acc[inPos].docid == d) {
                acc'[outPos].score += acc[inPos++].score;
            }
            outPos++;
        }

        // Copy remaining acc to acc'.
        // Compare the docid field (not the whole record) with the marker.
        while acc[inPos].docid < infty do {
            acc'[outPos++] := acc[inPos++];
        }
        acc'[outPos].docid := infty;  // end-of-list marker

        swap acc and acc';
    }

    return the top k items of acc;  // select using heap
}
// rankBM25_TermAtATimeWithPruning: term-at-a-time BM25 scoring with a cap
// on the number of accumulators.
//
// Parameters:
//   (t[1], ..., t[n]) - the query terms
//   k                 - number of top-scoring accumulators to return
//   amax              - accumulator limit; the quota left for a term is
//                       amax - length(acc)
//   u                 - the threshold T is re-estimated every u postings
//
// Uses from the enclosing context: N, N[t], f[t,d], TFBM25(t, d), max_terms,
// and each term's posting list (presumably sorted by increasing docid -- the
// merges below rely on that order).
//
// max_f is bounded above by a maximum number max_terms of terms allowed in a
// document. (Assume there is some doc length after which we truncate a
// document to that length.)
//
// Returns: the top k items of the final accumulator array.
rankBM25_TermAtATimeWithPruning((t[1], t[2], ..., t[n]), k, amax, u) {
    sort(t) in increasing order of N[t[i]];

    // Initialize accumulators.
    // acc holds the result of the previous round, acc' receives the next.
    acc := {}, acc' := {};
    acc[0].docid := infty;  // end-of-list marker

    for i := 1 to n do {
        max_f := 0;                       // largest frequency observed for t[i]
        quotaLeft := amax - length(acc);  // the remaining accumulator quota

        if (N[t[i]] <= quotaLeft) {
            // Plenty of accumulators: do as we did in rankBM25_TermAtATime.
            inPos := 0;   // current position in acc
            outPos := 0;  // current position in acc'
            foreach document d in t[i]'s posting list do {
                // Copy previous round to current for docs not containing t[i].
                while acc[inPos].docid < d do {
                    acc'[outPos++] := acc[inPos++];
                }
                acc'[outPos].docid := d;
                acc'[outPos].score := log(N/N[t[i]]) * TFBM25(t[i], d);
                if (acc[inPos].docid == d) {
                    acc'[outPos].score += acc[inPos++].score;
                }
                outPos++;
            }
        } else if (quotaLeft == 0) {
            // No accumulators left: only update the existing ones, in place.
            // Walk from index 0 up to the end-of-list marker so that acc[0]
            // is included and the marker itself is not touched.
            j := 0;
            while acc[j].docid < infty do {
                acc[j].score := acc[j].score
                                + log(N/N[t[i]]) * TFBM25(t[i], acc[j].docid);
                j++;
            }
            // acc was updated in place; nothing was written to acc'. Skip
            // the copy-remaining/swap step below, which would otherwise run
            // with stale inPos/outPos and replace acc with a corrupted acc'.
            continue;
        } else {
            // Still have some accumulators: create new ones only for
            // postings whose term frequency reaches the threshold T.
            for j := 1 to max_terms do { tfStats[j] := 0 }  // initialize TF stats
            T := 1;             // init threshold for new accumulators
            postingsSeen := 0;
            inPos := 0;         // current position in acc
            outPos := 0;        // current position in acc'

            foreach document d in t[i]'s posting list do {
                // Copy previous round to current for docs not containing t[i].
                while acc[inPos].docid < d do {
                    acc'[outPos++] := acc[inPos++];
                }

                if (acc[inPos].docid == d) {
                    // d already has an accumulator: always update it.
                    acc'[outPos].docid := d;
                    acc'[outPos++].score := acc[inPos++].score
                                            + log(N/N[t[i]]) * TFBM25(t[i], d);
                } else if (quotaLeft > 0) {
                    if (f[t[i],d] >= T) {
                        // Frequent enough: make a new accumulator.
                        acc'[outPos].docid := d;
                        acc'[outPos++].score := log(N/N[t[i]]) * TFBM25(t[i], d);
                        quotaLeft--;
                    }
                    tfStats[f[t[i],d]]++;
                    if (f[t[i],d] > max_f) {
                        max_f := f[t[i],d];  // update largest observed frequency
                    }
                }

                postingsSeen++;
                if (postingsSeen % u == 0) {
                    // q scales the counts seen so far to the postings that
                    // remain. T becomes the smallest x for which the
                    // projected number of remaining postings with frequency
                    // >= x is below the remaining quota.
                    q := (N[t[i]] - postingsSeen)/postingsSeen;
                    T := argmin_x{x in Nat | sum_(j=x)^{max_f}(tfStats[j] * q) < quotaLeft};
                }
            }
        }

        // Copy remaining acc to acc'.
        // Compare the docid field (not the whole record) with the marker.
        while acc[inPos].docid < infty do {
            acc'[outPos++] := acc[inPos++];
        }
        acc'[outPos].docid := infty;  // end-of-list marker

        swap acc and acc';
    }

    return the top k items of acc;  // select using heap
}
Given a list of ordered pairs, `S`, suggest pseudo-code to compute `G(S)`. What is the runtime of your code?
Post your solutions to the Apr 6 In-Class Exercise.