Cosine similarity between two dictionary's values

270 Views Asked by At

I have this dict called queries:

{'q1': ['similar',
  'law',
  'must',
  'obey',
  'construct',
  'aeroelast',
  'model',
  'heat',
  'high',
  'speed',
  'aircraft'],
 'q2': ['structur',
  'aeroelast',
  'problem',
  'associ',
  'flight',
  'high',
  'speed',
  'aircraft'],
 'q3': ['problem', 'heat', 'conduct', 'composit', 'slab', 'solv', 'far']
...
}

And have used this code to transform it into a dict of vectorized arrays:

class RetrievalSystem:
    def __init__(self, docs, num_concepts, min_df=1, alpha=1.0, beta=0.75, gamma=0.15):
        # create a doc-term matrix out of our doc collection
        self.vec = TfidfVectorizer(tokenizer=str.split, min_df=min_df)
        doc_term_mat = self.vec.fit_transform([" ".join(docs[doc_id]) for doc_id in docs])
        self.q_vecs = dict() # query vectors
        
        self.svd = TruncatedSVD(n_components = num_concepts, random_state = 42)
        
        self.doc_vecs = self.svd.fit_transform(doc_term_mat)

    def retrieve_n_rank_docs(self, queries, max_docs=-1):
       
        for query in queries:
            s = self.vec.transform([" ".join(queries[query])])
            s = self.svd.transform(s)
            if query not in self.q_vecs.keys():
                self.q_vecs[query] = s

The max_docs arguments governs the maximum number of documents to be returned by each query. And now self.q_vecs looks like this:

{'q217': array([[ 0.16555858,  0.12041974,  0.10034606,  0.03249144,  0.00843294,
         0.16582048, -0.20520625, -0.05597786, -0.12666519, -0.10517737,
         0.14363559, -0.01525909, -0.16574115, -0.04112081, -0.1374631 ,
         0.05047798,  0.05825697, -0.01779095, -0.05663042, -0.14333234,
        -0.09671375, -0.02205753,  0.03309577, -0.04512224, -0.01605542,
         0.00762974,  0.02407301,  0.00426722,  0.00654344,  0.08085963,
         0.08657383, -0.09913353,  0.01492773, -0.06813004, -0.01151318,
        -0.08565942,  0.03826287, -0.00330817,  0.13141591,  0.04920131,
        -0.08375895,  0.09465868, -0.03466024,  0.01838176, -0.00336209,
         0.02372735, -0.03390722,  0.0440413 ,  0.00371048,  0.09835254,
        -0.01099799,  0.0014484 ,  0.06276236,  0.04311937, -0.0867389 ,
         0.00850617,  0.00496759, -0.17198825,  0.07988587,  0.05727097,
         0.13304752,  0.08784825, -0.06141824, -0.01383098, -0.02348199,
        -0.04522944,  0.05257815,  0.08263177, -0.01140021, -0.05829286,
        -0.04885191,  0.09377792,  0.0190092 ,  0.00947696,  0.05598195,
        -0.03815088, -0.02834209,  0.0281708 , -0.02843137, -0.03210851,
         0.04751607, -0.01162277,  0.02034976, -0.02088302,  0.07665635,
         0.0195319 , -0.0157795 ,  0.01210985, -0.03183579,  0.01161029,
         0.02409737, -0.01007874,  0.10754846,  0.01010833, -0.05662593,
        -0.01729383, -0.03097083,  0.03369774,  0.00572065,  0.02632313]]), 'q99': array([[ 0.10287323, -0.01085065, -0.00967409, -0.04218846,  0.09239141,
         0.07992809, -0.00359886, -0.03796564,  0.01250241,  0.01951022,
        -0.03673524, -0.02372439, -0.03240905, -0.03081271,  0.02817431,
         0.12468386, -0.02051108,  0.12191644,  0.00624408, -0.05094331,
         0.09598166, -0.02341246, -0.0020474 , -0.05629724,  0.03516377,
         0.09028871,  0.02806492, -0.02300581, -0.02998558, -0.00270938,
         0.01611941,  0.04106955,  0.05371339, -0.02561045, -0.01916819,
         0.08158927, -0.03353019, -0.01020131, -0.03670832,  0.02845091,
         0.07133292, -0.0944471 , -0.00662414,  0.0920997 , -0.00206586,
         0.07063442, -0.00814919, -0.00374118, -0.01353651,  0.07968094,
         0.00796783, -0.01397921, -0.07712498, -0.00308536,  0.07785687,
        -0.01220938, -0.06646712,  0.04048088,  0.01321445,  0.00041508,
        -0.04644943,  0.09307773,  0.0188646 , -0.03233048, -0.04803833,
        -0.06355723, -0.00560934, -0.05478746,  0.03196071,  0.08420215,
        -0.07706163, -0.12595219, -0.01330823, -0.00079499, -0.02515943,
         0.00087481, -0.00596035,  0.01680558,  0.0138655 , -0.01290259,
        -0.0497661 , -0.04627047, -0.00239779, -0.06377815, -0.01103349,
         0.00205314, -0.0774958 ,  0.00223332, -0.00976858,  0.02365778,
         0.02600081,  0.01212485,  0.03451618,  0.00642054, -0.00025119,
         0.00898667,  0.00749051,  0.02099796, -0.00906813, -0.06770008]])
...
}

I'd like to take the cosine similarity between the vector representations here and those of self.docs, and then sort the documents these queries came from in descending order of the cosine similarity. So the desired output would look something like this:

{
    'q217': ['d983', 'd554', ..., 'd623'],
    'q99' : ['d716', 'd67', ..., 'd164'],
    ...
}

I have written this code to try to output to cos similarity but it only returns 1 key-value pair:

class RetrievalSystem:
    def __init__(self, docs, num_concepts, min_df=1, alpha=1.0, beta=0.75, gamma=0.15):
        self.alpha, self.beta, self.gamma = alpha, beta, gamma
        
        # create a doc-term matrix out of our doc collection
        self.vec = TfidfVectorizer(tokenizer=str.split, min_df=min_df)
        doc_term_mat = self.vec.fit_transform([" ".join(docs[doc_id]) for doc_id in docs])
        self.q_vecs = dict() # query vectors
        
        self.svd = TruncatedSVD(n_components = num_concepts, random_state = 42)
        
        self.doc_vecs = self.svd.fit_transform(doc_term_mat)
        # YOUR CODE HERE
        #raise NotImplementedError()

    def retrieve_n_rank_docs(self, queries, max_docs=-1):
       
        for query in queries:
            s = self.vec.transform([" ".join(queries[query])])
            s = self.svd.transform(s)
            if query not in self.q_vecs.keys():
                self.q_vecs[query] = s
        
            all_keys = list(self.q_vecs.keys())
            new_d = {}
        
            for i in range(len(all_keys)):
                for j in range(i+1,len(all_keys)):
                    new_d[query] = {1 - spatial.distance.cosine(self.q_vecs[all_keys[i]], self.q_vecs[all_keys[j]])}
1

There are 1 best solutions below

1
Tom McLean On

because the code is not a minimal reproducable example, I cant fully help. But to create a dictionary of cosine similarities of every combination of keys you can do:

import itertools
import numpy as np


q_ves = {
    "q1": np.array([0, 1]),
    "q2": np.array([1, 0]),
    "q3": np.array([0, 2]),
    "q4": np.array([10, 10])
}
new_q = {}
for k1, k2 in list(map(dict, itertools.combinations(
    q_vecs.items(), 2))):
    new_d[(k1, k2)] = 1 - scipy.spatial.distance.cosine(q_vecs[k1], q_vecs[k2])

Which will give you:

{
    ('q1', 'q2'): -0.04468849422512422,
    ('q1', 'q3'): 1,
    ('q1', 'q4'): -0.04468849422512422,
    ('q2', 'q3'): -0.04468849422512422,
    ('q2', 'q4'): 1,
    ('q3', 'q4'): -0.04468849422512422
}

I hope this is what you are after, as I dont understand how you generate the strings for:

{
    'q217': ['d983', 'd554', ..., 'd623'],
    'q99' : ['d716', 'd67', ..., 'd164'],
    ...
}