CS267
Chris Pollett
Oct. 16, 2019
// Term-at-a-time BM25 ranking.
// Inputs: query terms t[1..n]; k, the number of results to return.
// Processes one term's posting list per round, merging it with the
// accumulator list of the previous round (acc) into a new list (acc').
// Both lists are kept in increasing docid order and are terminated by an
// entry whose docid is infty.
// Assumes each posting list is enumerated in increasing docid order;
// the merge below depends on it.
// Returns: the k accumulators with the highest scores.
rankBM25_TermAtATime((t[1], t[2], ..., t[n]), k) {
    sort(t) in increasing order of N[t[i]];
    acc := {}, acc' := {}; //initialize accumulators.
    //acc used for previous round, acc' for next
    acc[0].docid := infty // end-of-list marker
    for i := 1 to n do {
        inPos := 0; //current pos in acc
        outPos := 0; // current position in acc'
        foreach document d in t[i]'s posting list do {
            while acc[inPos].docid < d do {
                //copy previous round to current for docs not containing t[i]
                acc'[outPos++] := acc[inPos++];
            }
            acc'[outPos].docid := d;
            acc'[outPos].score := log(N/N[t[i]]) * TFBM25(t[i], d);
            if (acc[inPos].docid == d) {
                // d already had an accumulator: fold in its score and
                // consume it, so it is not copied again as a duplicate
                // on the next pass through the while loop above.
                acc'[outPos].score += acc[inPos++].score;
            }
            outPos++;
        }
        while acc[inPos].docid < infty do { // copy remaining acc to acc'
            acc'[outPos++] := acc[inPos++];
        }
        acc'[outPos].docid := infty; //end-of-list-marker
        swap acc and acc'
    }
    return the top k items of acc; //select using heap
}
// Term-at-a-time BM25 ranking with accumulator pruning.
// Inputs: query terms t[1..n]; k, the number of results to return;
// amax, the maximum number of accumulators; u, the number of postings
// between updates of the threshold T.
// Each round handles one term and picks one of three strategies based on
// the remaining accumulator quota:
//   1. the whole posting list fits in the quota -> plain merge;
//   2. no quota left -> only update existing accumulators, in place;
//   3. some quota left -> merge, but create a new accumulator only when
//      the within-document term frequency f[t[i],d] reaches T.
// Assumes each posting list is enumerated in increasing docid order, and
// that length(acc) counts real accumulators only (not the end-of-list
// marker) -- confirm against the definition of length used in the course.
// Returns: the k accumulators with the highest scores.
rankBM25_TermAtATimeWithPruning((t[1], t[2], ..., t[n]), k, amax, u) {
    sort(t) in increasing order of N[t[i]];
    acc := {}, acc' := {}; //initialize accumulators.
    acc[0].docid := infty // end-of-list marker
    for i := 1 to n do {
        quotaLeft := amax - length(acc) // the remaining accumulator quota
        if (N[t[i]] <= quotaLeft) { //plenty o' accumulators
            // do as we did in rankBM25_TermAtATime
            inPos := 0; //current pos in acc
            outPos := 0; // current position in acc'
            foreach document d in t[i]'s posting list do {
                while acc[inPos].docid < d do {
                    //copy previous round to current for docs not containing t[i]
                    acc'[outPos++] := acc[inPos++];
                }
                acc'[outPos].docid := d;
                acc'[outPos].score := log(N/N[t[i]]) * TFBM25(t[i], d);
                if (acc[inPos].docid == d) {
                    // consume the matched accumulator so it is not
                    // copied again as a duplicate
                    acc'[outPos].score += acc[inPos++].score;
                }
                outPos++;
            }
        } else if (quotaLeft == 0) { //no accumulators left
            // Update the existing accumulators in place (0-based, stopping
            // before the end-of-list marker).
            for j := 0 to length(acc) - 1 do {
                acc[j].score := acc[j].score +
                    log(N/N[t[i]]) * TFBM25(t[i], acc[j].docid);
            }
            // acc is already the result of this round: skip the
            // copy-and-swap below, which would otherwise run with stale
            // inPos/outPos values.
            continue;
        } else { //still have some accumulators
            // tfStats is indexed by term frequency, not by result rank
            foreach possible term frequency j do { tfStats[j] := 0 } //initialize TF stats
            T := 1; //init threshold for new accumulators
            postingsSeen := 0;
            inPos := 0; //current pos in acc
            outPos := 0; // current position in acc'
            foreach document d in t[i]'s posting list do {
                while acc[inPos].docid < d do {
                    //copy previous round to current for docs not containing t[i]
                    acc'[outPos++] := acc[inPos++];
                }
                if (acc[inPos].docid == d) {
                    acc'[outPos].docid := d;
                    // plain assignment: acc'[outPos] may still hold data
                    // from an earlier round
                    acc'[outPos++].score := acc[inPos++].score +
                        log(N/N[t[i]]) * TFBM25(t[i], d);
                } else if (quotaLeft > 0) {
                    if (f[t[i],d] ≥ T) { // if happens, make new accumulator
                        acc'[outPos].docid := d;
                        acc'[outPos++].score := log(N/N[t[i]]) * TFBM25(t[i], d);
                        quotaLeft--;
                    }
                    tfStats[f[t[i],d]]++;
                }
                postingsSeen++;
                if (postingsSeen % u == 0) {
                    // q = (postings still to come) / (postings seen so far)
                    q := (N[t[i]] - postingsSeen)/postingsSeen;
                    T := argmin_x{x in Nat|
                        sum_(j=1)^x (tfStats[j] * q) ≥ quotaLeft}
                }
            }
        }
        while acc[inPos].docid < infty do { // copy remaining acc to acc'
            acc'[outPos++] := acc[inPos++];
        }
        acc'[outPos].docid := infty; //end-of-list-marker
        swap acc and acc'
    }
    return the top k items of acc; //select using heap
}
Given a list of ordered pairs, `S`, suggest pseudo-code to compute `G(S)`. What is the runtime of your code?
Post your solutions to the Oct 16 In-Class Exercise.
tau(t, k) :=
    // An infinite k (either sign) maps to itself; any finite k maps to
    // next(t, k-1). The result is the one-point interval [u, u].
    if (k == infty or k == -infty) {
        u := k;
    } else {
        u := next(t, k-1);
    }
    return [u, u]