Index Construction




CS267

Chris Pollett

Mar 17, 2021

Outline

Index Construction

In-memory Index Construction

buildIndex

// Builds an inverted index in memory from a token stream, then writes the
// postings lists and the dictionary to disk.
//   inputTokenizer: token source exposing hasNext() / getNext().
buildIndex (inputTokenizer)
{
   // position is the 0-based offset of the current token in the stream
   position := 0;
   while (inputTokenizer.hasNext()) {
      T := inputTokenizer.getNext();
      obtain dictionary entry for T; create new entry, if necessary;
      append new posting position to T's posting list;
      position++;
   }
   // dictionary entries are sorted before any postings list is written
   sort all dictionary entries in lex order;
   for each term T in the dictionary {
      write T's postings list to disk;
   }
   write the dictionary to disk;
   return;
}

Index-Time Dictionary

Speeding up the Dictionary

In-Class Exercise

Extensible in-memory postings lists

Sort-based Index Construction

Sort-based Index Construction Code

// Sort-based index construction: write one (termID, position) record per
// token to disk, sort the records, then scan them once to create the index.
//   inputTokenizer: token source exposing hasNext() / getNext().
buildIndex_sortBased(inputTokenizer)
{
    // Phase 1: emit one on-disk record per token, in stream order.
    pos := 0;
    while (inputTokenizer.hasNext()) {
        token := inputTokenizer.getNext();
        id := unique termID of token;
        write record R[pos] := (id, pos) to disk;
        pos++;
    }

    // Phase 2: order the records by termID, then by position within a termID.
    recordCount := pos;
    sort R[0], .., R[recordCount-1] by first component; break ties with second component;

    // Phase 3: a single sequential pass over the sorted records.
    perform a sequential scan of R[0], .., R[recordCount-1] creating the final index;
    return;
}

Disk-Based Sorting

Merge-based Index Construction

Merge-Based Index Pseudocode

// Merge-based index construction: accumulate postings in memory, flush them
// to an on-disk partition whenever the memory budget is used up, and merge
// all partitions into the final index at the end.
//   inputTokenizer: token source exposing hasNext() / getNext().
//   memoryLimit:    maximum number of postings held in memory at once
//                   (memoryConsumption is incremented once per posting).
// n and memoryConsumption are also updated by createIndexPartition().
buildIndex_mergeBase(inputTokenizer, memoryLimit) 
{
    n := 0;                   // number of index partitions written so far
    position := 0;            // 0-based offset of the current token
    memoryConsumption := 0;   // postings currently buffered in memory
    while (inputTokenizer.hasNext()) {
        T := inputTokenizer.getNext();
        obtain dictionary entry for T;
        create new entry if necessary;
        append new position to T's posting list;
        position++;
        memoryConsumption++;
        // Flush as soon as the budget is reached; testing with '>' would let
        // the buffer grow to memoryLimit + 1 postings before flushing.
        if (memoryConsumption >= memoryLimit) {
            createIndexPartition();
        }
    }
    // Flush whatever is left over after the last full partition.
    if (memoryConsumption > 0) {
        createIndexPartition();
    }
    // Merge index partitions I[0],...,I[n-1] to make final index I_final.
    mergeIndexPartitions([I[0], ..., I[n-1]]);
    return;
}

// Flushes the current in-memory index to a new on-disk partition I[n] and
// resets the in-memory state. Takes no parameters; it reads and updates the
// variables n and memoryConsumption that buildIndex_mergeBase initializes.
createIndexPartition()
{
    create empty on disk inverted file I[n];
    // Terms are written in lex order.
    sort in-memory dictionary entries in lex order;
    for each term T in dictionary {
        add T's posting list to I[n];
    }
    delete all in memory posting lists;
    write the dictionary to disk;
    reset the in-memory dictionary;
    memoryConsumption := 0;   // the memory buffer is empty again
    n++;                      // the next flush writes partition I[n+1]
    return;
}

// Multiway merge of the on-disk partitions I[0], ..., I[n-1] into a single
// inverted file I_final. Each round picks the partition whose current term is
// smallest, appends that term's postings to I_final, and advances that
// partition. The input partitions are deleted at the end.
mergeIndexPartitions([I[0], ..., I[n-1]])
{
    create empty Inverted File I_final;
    for (k := 0; k < n; k++) {
        open partition I[k] for sequential processing;
    }
    // Any non-nil value works here; it only serves to enter the loop.
    currentIndex := 0;
    while (currentIndex != nil) {
        currentIndex := nil;
        // Find the partition holding the smallest remaining term. The strict
        // '<' means that among equal terms the lowest-numbered partition wins,
        // so a term's postings are appended in partition order.
        for (k := 0; k < n; k++) {
            if (I[k] still has terms left) {
                if (currentIndex == nil || 
                    I[k].currentTerm < currentTerm) {
                    currentIndex := I[k];
                    currentTerm := I[k].currentTerm;
                }
            }
        }
        // currentIndex stays nil only when every partition is exhausted,
        // which also ends the outer loop.
        if (currentIndex != nil) {
            I_final.addPostings(currentTerm,
                currentIndex.getPostings(currentTerm));
            currentIndex.advanceToNextTerm();
        }
    }
    delete I[0], ..., I[n-1];
    return;
}

Remarks on Merge Algorithm