Chris Pollett >
Students >
Shailesh [Bio] [CS297 Part of Speech Tagging Code] |
Cosine Ranking Algorithm

Cosine similarity is a measure of similarity between two vectors of an inner product space that measures the cosine of the angle between them. Cosine similarity is particularly used in positive space, where the outcome is neatly bounded in [0,1].

<!DOCTYPE html>
<html>
<body>
<?php
/**
 * Ranks documents against a query using length-normalised TF-IDF scores.
 *
 * Each sentence is treated as one document. An inverted index is built from
 * the whitespace-separated terms (matching is case-sensitive), every query
 * term contributes tf * idf to each document that contains it, and the summed
 * score is divided by the document's term count.
 *
 * Note: the normalisation divides by the number of terms in the document,
 * not by the Euclidean norm of its TF-IDF vector, so the result is a
 * length-normalised approximation rather than a strict cosine value.
 *
 * The IDF is smoothed as log2((N + 1) / (df + 1)), so a term that occurs in
 * every document contributes a score of 0.
 *
 * @param array $sentences Documents to index, keyed by document ID.
 * @param array $query     Query terms; defaults to array('live'), the query
 *                         that used to be hard-coded in the function body.
 *
 * @return array Map of document ID => score for every document containing at
 *               least one query term, sorted from highest to lowest score.
 *               Empty when nothing matches.
 */
function getCosineRanking($sentences, $query = array('live'))
{
    // --- Build the inverted index -------------------------------------
    $dictionary = array(); // term => array('df' => int, 'postings' => array(docID => array('tf' => int)))
    $docLengths = array(); // docID => number of terms in that document

    foreach ($sentences as $docID => $doc) {
        // Split on runs of whitespace so repeated spaces do not create
        // empty-string terms that would inflate the document length.
        $terms = preg_split('/\s+/', (string) $doc, -1, PREG_SPLIT_NO_EMPTY);
        $docLengths[$docID] = count($terms);

        foreach ($terms as $term) {
            if (!isset($dictionary[$term])) {
                $dictionary[$term] = array('df' => 0, 'postings' => array());
            }
            if (!isset($dictionary[$term]['postings'][$docID])) {
                // First occurrence of this term in this document.
                $dictionary[$term]['df']++;
                $dictionary[$term]['postings'][$docID] = array('tf' => 0);
            }
            $dictionary[$term]['postings'][$docID]['tf']++;
        }
    }

    // --- Score the query ----------------------------------------------
    $docTotal = count($docLengths);
    $matchDocs = array();

    foreach ($query as $qterm) {
        if (!isset($dictionary[$qterm])) {
            // Term does not occur in any document: nothing to add.
            continue;
        }

        $entry = $dictionary[$qterm];
        // Parentheses are essential: without them PHP evaluates
        // N + (1 / df) + 1 instead of the smoothed ratio.
        $idf = log(($docTotal + 1) / ($entry['df'] + 1), 2);

        foreach ($entry['postings'] as $docID => $posting) {
            if (!isset($matchDocs[$docID])) {
                $matchDocs[$docID] = 0.0;
            }
            $matchDocs[$docID] += $posting['tf'] * $idf;
        }
    }

    // --- Length normalise ---------------------------------------------
    // Only documents with at least one matching term are present here,
    // so their length is always >= 1.
    foreach ($matchDocs as $docID => $score) {
        $matchDocs[$docID] = $score / $docLengths[$docID];
    }

    arsort($matchDocs); // high to low, keeping document IDs as keys

    return $matchDocs;
}
?>
</body>
</html> |