CS267
Chris Pollett
Oct 2, 2019
// In-memory index construction.
// Reads every token from inputTokenizer and appends its position to the
// in-memory postings list of the corresponding term. Nothing is written to
// disk until all tokens are consumed; then the postings lists are written in
// lexicographic term order, followed by the dictionary.
//   inputTokenizer: token source exposing hasNext() and getNext().
buildIndex (inputTokenizer)
{
    position := 0;    // position of the next token in the input stream
    while (inputTokenizer.hasNext()) {
        T := inputTokenizer.getNext();
        obtain dictionary entry for T; create new entry, if necessary;
        append new posting position to T's posting list;
        position++;
    }
    sort all dictionary entries in lex order;
    for each term T in the dictionary {
        write T's postings list to disk;
    }
    write the dictionary to disk;
    return;
}
// Sort-based index construction.
// Emits one (termID, position) record per token to disk, sorts the records,
// and builds the final index with a single sequential pass over them.
//   inputTokenizer: token source exposing hasNext() and getNext().
buildIndex_sortBased(inputTokenizer)
{
    pos := 0;
    while (inputTokenizer.hasNext()) {
        term := inputTokenizer.getNext();
        look up the dictionary entry for term, creating it if it does not exist yet;
        id := unique termID of term;
        write record R[pos] := (id, pos) to disk;
        pos++;
    }
    // Every token produced exactly one record, so pos is the record count.
    tokenCount := pos;
    // Ordering by termID first and position second groups each term's
    // postings together in increasing position order.
    sort R[0], .., R[tokenCount-1] by termID, then by position among equal termIDs;
    scan R[0], .., R[tokenCount-1] sequentially, creating the final index;
    return;
}
// Merge-based index construction.
// Builds an in-memory index as in buildIndex, but whenever the number of
// buffered postings reaches memoryLimit the in-memory index is flushed to
// disk as partition I[n] by createIndexPartition(). After the input is
// exhausted, the partitions I[0], ..., I[n-1] are merged into I_final.
//   inputTokenizer: token source exposing hasNext() and getNext().
//   memoryLimit:    maximum number of postings held in memory at once.
buildIndex_mergeBase(inputTokenizer, memoryLimit)
{
    n := 0;                    // number of partitions written so far
    position := 0;             // position of the next token in the input stream
    memoryConsumption := 0;    // postings buffered since the last flush
    while (inputTokenizer.hasNext()) {
        T := inputTokenizer.getNext();
        obtain dictionary entry for T;
        create new entry if necessary;
        append new position to T's posting list;
        position++;
        memoryConsumption++;
        // Flush as soon as the limit is reached (>=, not >) so the buffer
        // never holds more than memoryLimit postings.
        if (memoryConsumption >= memoryLimit) {
            createIndexPartition();    // resets memoryConsumption, increments n
        }
    }
    // Flush whatever is left over from the last, partially filled buffer.
    if (memoryConsumption > 0) {
        createIndexPartition();
    }
    mergeIndexPartitions([I[0], ..., I[n-1]]);    // builds the final index I_final
    return;
}
// Flushes the current in-memory index to disk as partition I[n], then clears
// the in-memory state. Reads and updates the caller's n and
// memoryConsumption variables.
createIndexPartition()
{
    sort in-memory dictionary entries in lex order;
    create empty on disk inverted file I[n];
    for each term term in dictionary {    // visited in the sorted order
        add term's posting list to I[n];
    }
    n++;    // the next flush goes to a fresh partition
    // Release the memory used by the partition that was just written.
    delete all in memory posting lists;
    reset the in-memory dictionary;
    memoryConsumption := 0;
}
// n-way merge of the on-disk partitions I[0], ..., I[n-1] into I_final.
// Each round scans all partitions, picks the one whose current term is
// lexicographically smallest, copies that term's postings to I_final, and
// advances that partition. The loop ends once no partition has terms left,
// after which the partitions are deleted.
mergeIndexPartitions([I[0], ..., I[n-1]])
{
    create empty Inverted File I_final;
    for (k := 0; k < n; k++) {
        open partition I[k] for sequential processing;
    }
    currentIndex := 0;    // any non-nil value, just to enter the loop
    while (currentIndex != nil) {
        currentIndex := nil;
        for (k := 0; k < n; k++) {
            if (I[k] still has terms left) {
                // The nil test comes first, so currentTerm is only compared
                // after it has been set in this round. The comparison is
                // strict, so on equal terms the lowest-numbered partition is
                // chosen first.
                if (currentIndex == nil ||
                        I[k].currentTerm < currentTerm) {
                    currentIndex := I[k];
                    currentTerm := I[k].currentTerm;
                }
            }
        }
        // currentIndex stays nil only when every partition is exhausted.
        if (currentIndex != nil) {
            I_final.addPostings(currentTerm,
                    currentIndex.getPostings(currentTerm));
            currentIndex.advanceToNextTerm();
        }
    }
    delete I[0], ..., I[n-1];
}