CS267
Chris Pollett
Oct 18, 2023
// buildIndex: single-pass index construction over a token stream.
//
// Input:  inputTokenizer -- token source exposing hasNext() and getNext().
// Effect: for every token, the current position is appended to the
//         postings list of that token's dictionary entry; only after the
//         whole input has been consumed are the postings lists (in
//         lexicographic term order) and then the dictionary written to disk.
buildIndex(inputTokenizer) {
    position := 0;
    while (inputTokenizer.hasNext()) {
        T := inputTokenizer.getNext();
        obtain dictionary entry for T; create new entry, if necessary;
        append new posting position to T's postings list;
        position++;
    }
    // Sorting first means the postings lists are written out in
    // lexicographic order of their terms.
    sort all dictionary entries in lex order;
    for each term T in the dictionary {
        write T's postings list to disk;
    }
    write the dictionary to disk;
    return;
}
// buildIndex_sortBased: sort-based index construction.
//
// Input:  inputTokenizer -- token source exposing hasNext() and getNext().
// Effect: phase 1 writes one (termID, position) record per token to disk;
//         phase 2 sorts those records; phase 3 scans them sequentially to
//         create the final index.
buildIndex_sortBased(inputTokenizer) {
    // Phase 1: emit one record per token occurrence.
    position := 0;
    while (inputTokenizer.hasNext()) {
        term := inputTokenizer.getNext();
        termID := unique termID of term;
        write record R[position] := (termID, position) to disk;
        position++;
    }
    tokenCount := position;

    // Phase 2: order records by termID, then by position within a term.
    sort R[0], .., R[tokenCount-1] by first component;
        break ties with second component;

    // Phase 3: one sequential pass over the sorted records.
    perform a sequential scan of R[0], .., R[tokenCount-1]
        creating the final index;
    return;
}