-1

Could you please review whether the following approach (pseudo-code) is a sound way to calculate the cosine similarity between two vectors:

var vectorA = [2,5,7,8];
var referenceVector= [1,1,1,1];

//Apply weights to vectors (apply positive or negative weights to elements)
var weightageVector = [1,0.5,2,1.5];
var weightedVectA = GetWeightedVector(vectorA);

//normalize each element to a value between 0 and 1
//@see http://stn.spotfire.com/spotfire_client_help/norm/norm_scale_between_0_and_1.htm

as calculated here: http://jsfiddle.net/snehilw/86jqo1sm/4/

var normalizedVectorA = GetNormalizedVector(vectorA); //using the formula above
var cosineSimilarityScore = GetCosineSimilarityScore(referenceVector, normalizedVectorA );

Can someone please advise whether this is the correct approach? It is not giving me correct results.

As requested, here is the code snippet:

// Per-element multipliers: element i of a vector is multiplied by
// defaultVectorWeights[i] when the vector is weighted.
var defaultVectorWeights = [1, 0.5, 2, 1.5];

// Default values for the reference vector (do not change these).
var referenceVector = [1, 1, 1, 1];

// The only vector length the helpers in this file accept.
var supportedVectorLength = referenceVector.length;

/**
 * Min-max scales each element of `vector` against the matching column of
 * `multiDimArray`: (value - columnMin) / (columnMax - columnMin).
 *
 * @param {Array<Array<number>>} multiDimArray - Rows whose columns supply the min/max ranges.
 * @param {Array<number>} vector - Vector to scale; must have `supportedVectorLength` elements.
 * @returns {Array<number>} The scaled vector, or an empty array when `vector`
 *     does not have the supported length.
 */
function getNormalizedVector(multiDimArray, vector) {
    var normalizedVector = [];

    // Unsupported lengths yield an empty result.
    if (vector.length !== supportedVectorLength) {
        return normalizedVector;
    }

    for (var j = 0; j < supportedVectorLength; j++) {
        // Fetch the column range once: every lookup walks all rows of the matrix.
        var minMax = getMinMaxForMultidimensionalArrayColumn(multiDimArray, j);
        var min = minMax[0];
        var max = minMax[1];

        // A constant column has no spread; use the midpoint rather than dividing by zero.
        normalizedVector.push(max === min ? 0.5 : (vector[j] - min) / (max - min));
    }

    return normalizedVector;
}

/**
 * Computes the cosine similarity of two vectors: their dot product divided
 * by the product of their Euclidean magnitudes.
 *
 * @param {Array<number>} vectorA - First vector; must have `supportedVectorLength` elements.
 * @param {Array<number>} vectorB - Second vector; must have `supportedVectorLength` elements.
 * @returns {number|undefined} The similarity score; 0 when either vector has
 *     zero magnitude; `undefined` (after logging) when either vector has an
 *     unsupported length.
 */
function getCosineSimilarityScore(vectorA, vectorB) {
    if (vectorA.length !== supportedVectorLength || vectorB.length !== supportedVectorLength) {
        //TODO: Handle exception/ Fire an event to notify the server about this exception
        console.log("Cosine similarity workload vectors are of unequal lengths");
        return undefined;
    }

    var product = 0;
    var squaredMagnitudeA = 0;
    var squaredMagnitudeB = 0;

    for (var i = 0; i < vectorA.length; i++) {
        product += vectorA[i] * vectorB[i];
        squaredMagnitudeA += vectorA[i] * vectorA[i];
        squaredMagnitudeB += vectorB[i] * vectorB[i];
    }

    var denominator = Math.sqrt(squaredMagnitudeA) * Math.sqrt(squaredMagnitudeB);

    // The angle to a zero vector is undefined; report "no similarity"
    // instead of letting 0 / 0 propagate NaN to callers.
    if (denominator === 0) {
        return 0;
    }

    return product / denominator;
}

/**
 * Multiplies each element of `vector` by the weight at the same index in
 * `defaultVectorWeights`.
 *
 * @param {Array<number>} vector - Vector to weight; must have `supportedVectorLength` elements.
 * @returns {Array<number>} The weighted vector, or an empty array (after
 *     logging) when `vector` has an unsupported length.
 */
function getWeightedVector(vector) {
    var weighted = [];

    if (vector.length != supportedVectorLength) {
        //TODO: Handle exception/ Fire an event to notify the server about this exception
        console.log("Cosine similarity workload vector is of unsupported length");
        return weighted;
    }

    var position = 0;
    while (position < supportedVectorLength) {
        weighted[position] = defaultVectorWeights[position] * vector[position];
        position += 1;
    }

    return weighted;
}

/**
 * Finds the smallest and largest value in one column of a row-major matrix.
 *
 * Runs as a single pass with no library dependency, so it also works where
 * jQuery is not loaded and does not spread the whole column into one
 * Math.min/Math.max call.
 *
 * @param {Array<Array<number>>} multiDimArray - Rows to scan; a null/undefined
 *     value is treated as having no rows.
 * @param {number} column - Zero-based column index; must be below `supportedVectorLength`.
 * @returns {Array<number>} `[min, max]` for the column (`[Infinity, -Infinity]`
 *     when there are no rows), or an empty array (after logging) when
 *     `column` is out of range.
 */
function getMinMaxForMultidimensionalArrayColumn(multiDimArray, column) {
    if (!(column < supportedVectorLength)) {
        //TODO: Handle exception/ Fire an event to notify the server about this exception
        // Message text is kept unchanged for anything that matches on it.
        console.log("Cosine similarity workload vectors are of unequal lengths");
        return [];
    }

    var rows = multiDimArray || [];
    var min = Infinity;
    var max = -Infinity;

    for (var i = 0; i < rows.length; i++) {
        var value = rows[i][column];
        // Math.min/Math.max keep the same number coercion and NaN propagation
        // as applying them to the whole column at once.
        min = Math.min(min, value);
        max = Math.max(max, value);
    }

    return [min, max];
}

/**
 * Scores one query vector: weights it, min-max normalizes it against the
 * weighted matrix, then takes its cosine similarity to `referenceVector`.
 * Intermediate values are written to the console for debugging.
 *
 * @param {Array<Array<number>>} multiDimArray - Matrix of vectors that supplies the normalization ranges.
 * @param {Array<number>} queryVector - Vector to score.
 * @returns {number|undefined} Whatever `getCosineSimilarityScore` returns for
 *     the reference vector and the normalized query vector.
 */
function getAssociateWorkloadScore(multiDimArray, queryVector) {
    // NOTE(review): nothing in this function fills this array, so the debug
    // line below always prints "[]" - confirm whether it should be computed.
    var normalizedMultiDimArr = [];

    //Apply feature scaling
    var weightedQueryVector = getWeightedVector(queryVector);
    var weightedMultiDimArr = getWeightedMultiDimArr(multiDimArray);
    var normalizedQueryVector = getNormalizedVector(weightedMultiDimArr, weightedQueryVector);

    var workloadScore = getCosineSimilarityScore(referenceVector, normalizedQueryVector);

    // Debug output is emitted only after every value has been computed.
    var debugLines = [
        'weightedQueryVector=' + weightedQueryVector,
        'weightedMultiDimArr=' + JSON.stringify(weightedMultiDimArr),
        'normalizedMultiDimArr=' + JSON.stringify(normalizedMultiDimArr),
        'normalizedQueryVector=' + normalizedQueryVector,
        'workloadScore=' + JSON.stringify(workloadScore)
    ];
    debugLines.forEach(function (line) {
        console.log(line);
    });

    return workloadScore;
}

/**
 * Scores every row of the matrix against the matrix itself.
 *
 * @param {Array<Array<number>>} multiDimArray - Matrix whose rows are scored one by one.
 * @returns {Array} One `getAssociateWorkloadScore` result per row, in row order.
 */
function getTeamWorkloadScore(multiDimArray) {
    var scores = [];
    var row = 0;

    while (row < multiDimArray.length) {
        scores[row] = getAssociateWorkloadScore(multiDimArray, multiDimArray[row]);
        row += 1;
    }

    return scores;
}
Rookie
  • 5,179
  • 13
  • 41
  • 65
  • 2
    Please don't rely on external sites (jsfiddle) to host the content of your question. – Has QUIT--Anony-Mousse Feb 27 '15 at 17:56
  • I plan to delete this fiddle once i get the answer as this is a personal project and i do not wish to keep the code out there for too long. That is the reason it is hosted on an external site. Also, I have seen numerous people use jsfiddle in the javascript group, due to which i used this. Many thanks for your comment though, but instead of downvoting, if you could have given your expert advice on the problem, that would have helped and i would have taken the jsfiddle link down immediately. I will take the link off. Thanks for your time. – Rookie Feb 28 '15 at 02:53
  • 1
    How could I help, if I cannot see your code? Also, questions here *must* be self-contained. They're meant to be useful for others, too, not only for you. If you delete the fiddle, the question must be deleted. – Has QUIT--Anony-Mousse Feb 28 '15 at 07:30
  • The question here is self-contained: "Is this a correct approach to apply weights and normalize the vectors, based on the pseudo-code?" The fiddle is posted just in case someone wants to look at it in depth. Thanks for your suggestion, though; I have updated the original post with the code from the fiddle for your review and advice. Thanks! – Rookie Feb 28 '15 at 20:45
  • Usually one would not normalize with min/max for cosine distance, but do an L2 normalization so computing cosines afterwards is cheaper. – Has QUIT--Anony-Mousse Feb 28 '15 at 21:00
  • I have used MinMax to scale the vector values between 0 and 1 as per the following link: http://stn.spotfire.com/spotfire_client_help/norm/norm_scale_between_0_and_1.htm Do you think an L2 norm is to be used instead of this, not sure if this applies in this case..please advise, thanks! – Rookie Feb 28 '15 at 21:12
  • There are dozens of ways of normalization. None is more correct than another. – Has QUIT--Anony-Mousse Feb 28 '15 at 21:28
  • So it sounds like, while an L2 norm is more efficient than min-max scaling, it is not necessarily the root cause of my incorrect results, and min-max should still give me correct results, which is the question. Do you spot any mistakes in the approach? – Rookie Feb 28 '15 at 21:34
  • It's *different*, and there are reasons why it is commonly used with cosine, whereas min/max is commonly used with Euclidean. And no, min/max scaling *will* change the results. – Has QUIT--Anony-Mousse Feb 28 '15 at 21:43

2 Answers2

3

A cosine similarity is just a dot product divided by the product of norms. So why not make a dot product function and a norm function and divide the results? (dotproduct from http://c2.com/cgi/wiki?DotProductInManyProgrammingLanguages)

/**
 * Sums the pairwise products of two vectors.
 * Only the first min(a.length, b.length) positions contribute.
 *
 * @param {Array<number>} a - First vector.
 * @param {Array<number>} b - Second vector.
 * @returns {number} The dot product over the shared positions (0 when either is empty).
 */
function dotproduct(a, b) {
    var limit = Math.min(a.length, b.length);
    var total = 0;
    var idx = 0;

    while (idx < limit) {
        total += a[idx] * b[idx];
        idx += 1;
    }

    return total;
}

/**
 * Euclidean (L2) norm: the square root of the sum of squared elements.
 *
 * @param {Array<number>} a - Input vector.
 * @returns {number} The vector's magnitude (0 for an empty vector).
 */
function norm2(a) {
    var squares = 0;
    var idx = 0;

    while (idx < a.length) {
        squares += a[idx] * a[idx];
        idx += 1;
    }

    return Math.sqrt(squares);
}

/**
 * Cosine similarity: the dot product divided by both Euclidean norms.
 *
 * @param {Array<number>} a - First vector.
 * @param {Array<number>} b - Second vector.
 * @returns {number} The similarity score, or 0 when either vector has zero
 *     magnitude (the angle is undefined there and would otherwise give NaN).
 */
function similarity(a, b) {
    var normA = norm2(a);
    var normB = norm2(b);

    if (normA === 0 || normB === 0) {
        return 0;
    }

    return dotproduct(a, b) / normA / normB;
}

Now similarity([1,0,0], [0,1,1]) == 0

Gavin Haynes
  • 1,721
  • 11
  • 21
  • I have already implemented this function. My question is how to normalize and apply weights to the input vectors. – Rookie Feb 28 '15 at 03:27
  • in your `norm2` for loop, why write such terse code? JS should only run the first instruction after the for and putting the return on the same line really obfuscates things. – dcsan Sep 02 '18 at 18:18
1

If you necessarily need scale-invariance (i.e., the original cosine similarity), then use Gavin's code augmented with checks for zero-vectors

/**
 * Cosine similarity with a guard for zero-magnitude vectors.
 *
 * @param {Array<number>} x - First vector.
 * @param {Array<number>} y - Second vector.
 * @returns {number} dotproduct(x, y) / (|x| * |y|), or 0 when either norm is falsy (0 or NaN).
 */
function cosine_sim(x, y) {
    // Declared locally: assigning undeclared names would create implicit
    // globals and throws a ReferenceError in strict mode / ES modules.
    var xnorm = norm2(x);
    if (!xnorm) {
        return 0;
    }

    var ynorm = norm2(y);
    if (!ynorm) {
        return 0;
    }

    return dotproduct(x, y) / (xnorm * ynorm);
}

If you do not need scale-invariance, just use the dot product (i.e., cosine_sim(x, y) is dotproduct(x, y)).

victor
  • 141
  • 2