Chris Pollett > Students > Shailesh

    (Print View)

    [Bio]

    [Project Blog]

    [CS297 Proposal]

    [CS297 Presentation-PDF]

    [CS297 Part of Speech Tagging Code]

    [CS297 Cosine Ranking Code]

    [CS297 Wordnet Code]

    [CS297 Report-PDF]

    [CS298 Proposal]

    [CS298 Presentation-PDF]

    [CS298 Project Report-PDF]

                          

























Cosine Ranking Algorithm

Cosine similarity is a measure of similarity between two vectors of an inner product space that measures the cosine of the angle between them. Cosine similarity is particularly used in positive space, where the outcome is neatly bounded in [0,1].

<!DOCTYPE html>
<html>
<body>


<?php
function getCosineRanking($sentences) {
 
 $collection = $sentences;
        $dictionary = array();
        $docCount = array();
		$docID=0;
        foreach($collection as $docID => $doc) {
                $terms = explode(' ', $doc);
                $docCount[$docID] = count($terms);

                foreach($terms as $term) {
                        if(!isset($dictionary[$term])) {
                                $dictionary[$term] = array('df' => 0, 'postings' => array());
                        }
                        if(!isset($dictionary[$term]['postings'][$docID])) {
                                $dictionary[$term]['df']++;
                                $dictionary[$term]['postings'][$docID] = array('tf' => 0);
                        }

                        $dictionary[$term]['postings'][$docID]['tf']++;
                }
        }


        $index = array('docCount' => $docCount, 'dictionary' => $dictionary);
        $docCount = count($index['docCount']);
        $entry = $index['dictionary'][$term];
        foreach($entry['postings'] as  $docID => $postings) {
                echo "Document $docID and term $term give TFIDF: " .
                        ($postings['tf'] * log($docCount / $entry['df'], 2));
                echo "\n";
        }
$query = array('live');


$index = getIndex();
$matchDocs = array();
$docCount = count($index['docCount']);

foreach($query as $qterm) {
        $entry = $index['dictionary'][$qterm];
        foreach($entry['postings'] as $docID => $posting) {
					if(!isset($matchDocs[$docID])) {
						$matchDocs[$docID] = $posting['tf'] * log($docCount + 1 / $entry['df'] + 1, 2);
					} else {
						$matchDocs[$docID] += $posting['tf'] * log($docCount + 1 / $entry['df'] + 1, 2);
					}	
}

// length normalise
foreach($matchDocs as $docID => $score) {
        $matchDocs[$docID] = $score/$index['docCount'][$docID];
}

arsort($matchDocs); // high to low

var_dump($matchDocs);
?>

</body>
</html>