Introduction and source code
I am trying to compute the cosine similarity between two sparse vectors of dimension 169647. As input, each vector is given as a string of space-separated `index:value` pairs. Only the non-zero elements of the vector are listed.
x = "1:0.1 43:0.4 100:0.43 10000:0.9"
y = "200:0.5 500:0.34 501:0.34"
First, each of x and y is converted into a dense vector<float> by the function splitVector. Then the similarity is computed by the function cosine_similarity. Never mind the split function; I include it only in case you wish to run the code.
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>
using namespace std;
/// Splits `s` on every occurrence of `c` and appends the pieces to `v`.
///
/// @param s  Text to split.
/// @param c  Delimiter character; it is not included in the pieces.
/// @param v  Output vector. Pieces are appended; existing elements are kept.
///
/// Adjacent delimiters and a trailing delimiter produce empty pieces
/// ("a:" -> {"a", ""}). A non-empty string that does not contain `c` is
/// appended as a single piece. An empty `s` appends nothing.
void split(const std::string& s, char c, std::vector<std::string>& v) {
    if (s.empty()) {
        return;
    }

    std::string::size_type start = 0;
    std::string::size_type pos = s.find(c);
    while (pos != std::string::npos) {
        v.push_back(s.substr(start, pos - start));
        start = pos + 1;
        pos = s.find(c, start);
    }

    // The tail after the last delimiter (or the whole string when there is
    // no delimiter at all) is always a piece of its own.
    v.push_back(s.substr(start));
}
/// Cosine similarity of two dense vectors: dot(A, B) / (|A| * |B|).
///
/// @param A  First vector.
/// @param B  Second vector.
/// @return   The cosine of the angle between A and B, or 0 when either
///           vector has zero norm (including when it is empty), so the
///           result is never the NaN produced by 0/0.
///
/// If the sizes differ, the shorter vector is treated as zero-padded: the
/// dot product covers only the common prefix, while each norm covers the
/// whole of its own vector. No element is read out of bounds.
float cosine_similarity(const std::vector<float> & A, const std::vector<float> & B)
{
    typedef std::vector<float>::size_type index_t;

    // Accumulate in double: the sums may span a very large number of terms
    // and float accumulation loses precision quickly.
    double dot = 0.0;
    double norm_a = 0.0;
    double norm_b = 0.0;

    const index_t common = std::min(A.size(), B.size());
    for (index_t i = 0; i < common; ++i)
    {
        dot += static_cast<double>(A[i]) * B[i];
    }
    for (index_t i = 0; i < A.size(); ++i)
    {
        norm_a += static_cast<double>(A[i]) * A[i];
    }
    for (index_t i = 0; i < B.size(); ++i)
    {
        norm_b += static_cast<double>(B[i]) * B[i];
    }

    if (norm_a == 0.0 || norm_b == 0.0)
    {
        return 0.0f;
    }
    return static_cast<float>(dot / (std::sqrt(norm_a) * std::sqrt(norm_b)));
}
/// Writes sparse "index:value" tokens into a pre-sized dense vector.
///
/// @param v       Tokens of the form "index:value". The index is the text
///                before the first ':' (parsed with atoi); the value is the
///                text after it (parsed with atof, so anything following the
///                leading number is ignored).
/// @param values  Dense destination. It must already be sized to the vector
///                dimension; only the listed positions are overwritten.
///
/// @throws std::invalid_argument  if a token contains no ':'.
/// @throws std::out_of_range      if an index is negative or not smaller
///                                than values.size().
///
/// Tokens are applied in order, so entries preceding a rejected token have
/// already been written when the exception is thrown.
void splitVector(const std::vector<std::string> & v, std::vector<float> & values)
{
    for (std::vector<std::string>::size_type i = 0; i < v.size(); ++i)
    {
        const std::string & token = v[i];

        const std::string::size_type colon = token.find(':');
        if (colon == std::string::npos)
        {
            throw std::invalid_argument(
                "splitVector: expected \"index:value\" but got \"" + token + "\"");
        }

        const int idx = std::atoi(token.substr(0, colon).c_str());
        // Parse the value straight from the token's buffer; no temporary
        // vector of pieces is needed.
        const float val = static_cast<float>(std::atof(token.c_str() + colon + 1));

        if (idx < 0 ||
            static_cast<std::vector<float>::size_type>(idx) >= values.size())
        {
            throw std::out_of_range(
                "splitVector: index out of range in \"" + token + "\"");
        }
        values[static_cast<std::vector<float>::size_type>(idx)] = val;
    }
}
/// Demo driver: expands two sparse inputs into dense vectors and prints
/// their cosine similarity.
int main()
{
    // Sparse inputs, one "index:value" token per non-zero component.
    std::vector<std::string> x {"1:0.1", "43:0.4", "50:0.43", "90:0.9"};
    std::vector<std::string> y {"20:0.5", "40:0.34", "50:0.34"};

    // Dense, zero-filled storage for both vectors.
    const int dimension = 169647;
    std::vector<float> X(dimension, 0.0f);
    std::vector<float> Y(dimension, 0.0f);

    // Scatter the sparse components into their dense positions.
    splitVector(x, X);
    splitVector(y, Y);

    // Compute and report the similarity.
    const float similarity = cosine_similarity(X, Y);
    std::cout << similarity << std::endl;
}
Problem and proposed solution
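Initializing and filling the dense vector<float> is the problem: it takes a large share of the execution time, because every one of the 169647 components is allocated and visited even though only a handful are non-zero. I was thinking of using the std::map<int,float> structure in C++ instead, so that X and Y would be represented by: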
// Sparse form of x and y: key = component index, mapped value = component.
std::map<int, float> x_m{{1, 0.1f}, {43, 0.4f}, {50, 0.43f}, {90, 0.9f}};
std::map<int, float> y_m{{20, 0.5f}, {40, 0.34f}, {50, 0.34f}};
For this purpose I used the following function:
/// Cosine similarity of two sparse vectors stored as index -> value maps.
///
/// @param A  First vector; indices absent from the map count as zero.
/// @param B  Second vector; indices absent from the map count as zero.
/// @return   dot(A, B) / (|A| * |B|), or 0 when either vector has zero norm
///           (including an empty map), so the result is never the NaN
///           produced by 0/0.
///
/// Cost: one pass over each map for the norms, plus one map lookup per
/// entry of the smaller map for the dot product.
float cosine_similarity(const std::map<int,float> & A, const std::map<int,float> & B)
{
    typedef std::map<int,float>::const_iterator iter_t;

    // Accumulate in double to limit rounding error on long sums.
    double norm_a = 0.0;
    for (iter_t it = A.begin(); it != A.end(); ++it)
    {
        norm_a += static_cast<double>(it->second) * it->second;
    }

    double norm_b = 0.0;
    for (iter_t it = B.begin(); it != B.end(); ++it)
    {
        norm_b += static_cast<double>(it->second) * it->second;
    }

    if (norm_a == 0.0 || norm_b == 0.0)
    {
        return 0.0f;
    }

    // Only indices present in both maps contribute to the dot product, so
    // walk the smaller map and probe the larger one.
    const std::map<int,float> & walked = (A.size() <= B.size()) ? A : B;
    const std::map<int,float> & probed = (A.size() <= B.size()) ? B : A;

    double dot = 0.0;
    for (iter_t it = walked.begin(); it != walked.end(); ++it)
    {
        // Single lookup: reuse the iterator instead of calling find twice.
        const iter_t match = probed.find(it->first);
        if (match != probed.end())
        {
            dot += static_cast<double>(it->second) * match->second;
        }
    }

    return static_cast<float>(dot / (std::sqrt(norm_a) * std::sqrt(norm_b)));
}
Question
- Can you help me with the math of the complexity?
- Will the second proposed function that uses maps reduce the complexity?
- What do you think of the solution?