Cosine Ranking Algorithm
Cosine similarity is a measure of similarity between two vectors of an inner product space: it is the cosine of the angle between them. It is most commonly used in positive space, where the result is neatly bounded in [0,1].
<!DOCTYPE html>
<html>
<body>
<?php
/**
 * Ranks documents against a query using length-normalised TF-IDF scores.
 *
 * An inverted index (term => document frequency + per-document term
 * frequency) is built from $sentences, each query term is scored against
 * every document that contains it, and the accumulated score of each
 * document is divided by that document's term count.
 *
 * Side effects: echoes one "TFIDF" line per (query term, matching document)
 * pair and var_dump()s the final ranking.
 *
 * @param array $sentences Documents keyed by document ID; each document is a
 *                         string of terms separated by single spaces.
 * @param array $query     Terms to search for. Defaults to array('live').
 * @return array Map of document ID => score, sorted from highest to lowest.
 *               Only documents containing at least one query term appear.
 */
function getCosineRanking($sentences, $query = array('live')) {
    // --- Build the inverted index -------------------------------------
    $dictionary = array();   // term => array('df' => int, 'postings' => array(docID => array('tf' => int)))
    $docLengths = array();   // docID => number of terms in that document

    foreach ($sentences as $docID => $doc) {
        $terms = explode(' ', $doc);
        $docLengths[$docID] = count($terms);

        foreach ($terms as $term) {
            if (!isset($dictionary[$term])) {
                $dictionary[$term] = array('df' => 0, 'postings' => array());
            }
            // First time this term is seen in this document: it counts
            // once towards the term's document frequency.
            if (!isset($dictionary[$term]['postings'][$docID])) {
                $dictionary[$term]['df']++;
                $dictionary[$term]['postings'][$docID] = array('tf' => 0);
            }
            $dictionary[$term]['postings'][$docID]['tf']++;
        }
    }

    $docCount = count($docLengths);

    // --- Score every document that contains a query term ---------------
    $matchDocs = array();
    foreach ($query as $qterm) {
        // A term that never occurs in the collection contributes nothing.
        if (!isset($dictionary[$qterm])) {
            continue;
        }
        $entry = $dictionary[$qterm];

        // Smoothed inverse document frequency. The parentheses matter:
        // without them "/" binds tighter than "+" and the expression
        // becomes N + (1 / df) + 1. A term present in every document
        // gets an IDF of 0.
        $idf = log(($docCount + 1) / ($entry['df'] + 1), 2);

        foreach ($entry['postings'] as $docID => $posting) {
            echo "Document $docID and term $qterm give TFIDF: " .
                ($posting['tf'] * log($docCount / $entry['df'], 2));
            echo "\n";

            if (!isset($matchDocs[$docID])) {
                $matchDocs[$docID] = 0;
            }
            $matchDocs[$docID] += $posting['tf'] * $idf;
        }
    }

    // Length normalise so long documents are not favoured merely for
    // containing more words. explode() always yields at least one
    // element, so the divisor is never zero.
    foreach ($matchDocs as $docID => $score) {
        $matchDocs[$docID] = $score / $docLengths[$docID];
    }

    arsort($matchDocs); // high to low
    var_dump($matchDocs);

    return $matchDocs;
}
?>
</body>
</html>
|