CS267
Chris Pollett
Oct 2, 2019
buildIndex(inputTokenizer) { position := 0; while (inputTokenizer.hasNext()) { T := inputTokenizer.getNext(); obtain dictionary entry for T; create new entry, if necessary; append new posting position to T's posting list; position++; } sort all dictionary entries in lex order; for each term T in the dictionary { write T's postings list to disk; } write the dictionary to disk; return; }
buildIndex_sortBased(inputTokenizer) { position := 0; while (inputTokenizer.hasNext()) { T := inputTokenizer.getNext(); obtain dictionary entry for T; create new entry if necessary; termID := unique termID of T; write record R[position] := (termID, position) to disk; position++; } tokenCount := position; sort R[0], ..., R[tokenCount-1] by first component, breaking ties with second component; perform a sequential scan of R[0], ..., R[tokenCount-1] creating the final index; return; }
buildIndex_mergeBased(inputTokenizer, memoryLimit) { n := 0; position := 0; memoryConsumption := 0; while (inputTokenizer.hasNext()) { T := inputTokenizer.getNext(); obtain dictionary entry for T; create new entry if necessary; append new position to T's posting list; position++; memoryConsumption++; if (memoryConsumption > memoryLimit) { createIndexPartition(); } } if (memoryConsumption > 0) { createIndexPartition(); } merge index partitions I[0], ..., I[n-1] to make final index I_final; } createIndexPartition() { create empty on-disk inverted file I[n]; sort in-memory dictionary entries in lex order; for each term T in dictionary { add T's posting list to I[n]; } delete all in-memory posting lists; reset the in-memory dictionary; memoryConsumption := 0; n++; } mergeIndexPartitions(I[0], ..., I[n-1]) { create empty inverted file I_final; for (k = 0; k < n; k++) { open partition I[k] for sequential processing; } currentIndex := 0; while (currentIndex != nil) { currentIndex := nil; for (k = 0; k < n; k++) { if (I[k] still has terms left) { if (currentIndex == nil || I[k].currentTerm < currentTerm) { currentIndex := I[k]; currentTerm := I[k].currentTerm; } } } if (currentIndex != nil) { I_final.addPostings(currentTerm, currentIndex.getPostings(currentTerm)); currentIndex.advanceToNextTerm(); } } delete I[0], ..., I[n-1]; }