0

I have this spark Udf for cosine similarity.

def cosineSimilarity(df):
    """Compute the cosine similarity between every pair of documents.

    Args:
        df: Spark DataFrame with an ``id`` column and a ``w2v_vector``
            column (both are renamed below, so both must be present).

    Returns:
        The DataFrame joined with itself, with the left-hand columns renamed
        to ``id_1`` / ``w2v_vector_1``, the right-hand columns renamed to
        ``id_2`` / ``w2v_vector_2``, and an added ``cosinesim`` double
        column. ``cosinesim`` is null when either vector is null.
    """
    from pyspark.sql.functions import udf
    from pyspark.sql.types import DoubleType
    from scipy.spatial import distance

    def _similarity(v1, v2):
        # scipy computes the cosine *distance*; similarity is 1 - distance.
        if v1 is None or v2 is None:
            return None
        return float(1 - distance.cosine(v1, v2))

    cosine = udf(_similarity, DoubleType())

    # The join has no condition, so it pairs every row with every other row
    # (including itself): the output has len(df) ** 2 rows.
    left = (df.withColumnRenamed('id', 'id_1')
              .withColumnRenamed('w2v_vector', 'w2v_vector_1'))
    right = (df.withColumnRenamed('id', 'id_2')
               .withColumnRenamed('w2v_vector', 'w2v_vector_2'))
    crosstabDF = left.join(right)

    # Compare the left vector with the RIGHT one. The original passed
    # 'w2v_vector_1' twice, which scored every vector against itself.
    similardocsDF = crosstabDF.withColumn(
        'cosinesim', cosine('w2v_vector_1', 'w2v_vector_2'))
    return similardocsDF


similardocsDF = cosineSimilarity(w2vdf.select('id', 'w2v_vector'))
similardocsDF.cache().take(4)

My data is like below:

w2vdf.select('id','tokenised_text','w2v_vector').take(1) #tfidfDf.take(1) Out[18]: [Row(id=-33753621, tokenised_text=[u'if', u'you', u'hate', u'dealing', u'with', u'bank', u'tellers', u'or', u'customer', u'service', u'representatives,', u'then', u'the', u'royal', u'bank', u'of', u'scotland', u'might', u'have', u'a', u'solution', u'for', u'you.if', u'this', u'program', u'is', u'successful,', u'it', u'could', u'be', u'a', u'big', u'step', u'forward', u'on', u'the', u'road', u'to', u'automated', u'customer', u'service', u'through', u'the', u'use', u'of', u'ai,', u'notes', u'laurie', u'beaver,', u'research', u'associate', u'for', u'bi', u'intelligence,', u'business', u"insider's", u'premium', u'research', u"service.it's", u'noteworthy', u'that', u'luvo', u'does', u'not', u'operate', u'via', u'a', u'third-party', u'app', u'such', u'as', u'facebook', u'messenger,', u'wechat,', u'or', u'kik,', u'all', u'of', u'which', u'are', u'currently', u'trying', u'to', u'create', u'bots', u'that', u'would', u'assist', u'in', u'customer', u'service', u'within', u'their', u'respective', u'platforms.luvo', u'would', u'be', u'available', u'through', u'the', u'web', u'and', u'through', u'smartphones.', u'it', u'would', u'also', u'use', u'machine', u'learning', u'to', u'learn', u'from', u'its', u'mistakes,', u'which', u'should', u'ultimately', u'help', u'with', u'its', u'response', u'accuracy.down', u'the', u'road,', u'luvo', u'would', u'become', u'a', u'supplement', u'to', u'the', u'human', u'staff.', u'it', u'can', u'currently', u'answer', u'20', u'set', u'questions', u'but', u'as', u'that', u'number', u'grows,', u'it', u'would', u'allow', u'the', u'human', u'employees', u'to', u'more', u'complicated', u'issues.', u'if', u'a', u'problem', u'is', u'beyond', u"luvo's", u'comprehension,', u'then', u'it', u'would', u'refer', u'the', u'customer', u'to', u'a', u'bank', u'employee;', u'however,\xa0a', u'user', u'could', u'choose', u'to', u'speak', u'with', u'a', u'human', u'instead', u'of', 
u'luvo', u'anyway.ai', u'such', u'as', u'luvo,', u'if', u'successful,', u'could', u'help', u'businesses', u'become', u'more', u'efficient', u'and', u'increase', u'their', u'productivity,', u'while', u'simultaneously', u'improving', u'customer', u'service', u'capacity,', u'which', u'would', u'consequently\xa0save', u'money', u'that', u'would', u'otherwise', u'go', u'toward', u'manpower.and', u'this', u'trend', u'is', u'already', u'starting.', u'google,', u'microsoft,', u'and', u'ibm', u'are', u'investing', u'significantly', u'into', u'ai', u'research.', u'furthermore,', u'the', u'global', u'ai', u'market', u'is', u'estimated', u'to', u'grow', u'from', u'approximately', u'$420', u'million', u'in', u'2014', u'to', u'$5.05', u'billion', u'in', u'2020,', u'according', u'to', u'a', u'forecast', u'by', u'research', u'and', u'markets.\xa0the', u'move', u'toward', u'ai', u'would', u'be', u'just', u'one', u'more', u'way', u'in', u'which', u'the', u'digital', u'age', u'is', u'disrupting', u'retail', u'banking.', u'customers,', u'particularly', u'millennials,', u'are', u'increasingly', u'moving', u'toward', u'digital', u'banking,', u'and', u'as', u'a', u'result,', u"they're", u'walking', u'into', u'their', u"banks'", u'traditional', u'brick-and-mortar', u'branches', u'less', u'often', u'than', u'ever', u'before.'], w2v_vector=DenseVector([-0.0394, -0.0388, 0.0368, -0.0455, 0.0602, -0.0734, 0.0515, -0.0064, -0.068, -0.0438, 0.0671, 0.007, -0.0227, -0.0393, -0.0254, -0.024, 0.0115, 0.0415, -0.0116, -0.0169, 0.0545, -0.0439, 0.0414, 0.0312, -0.028, -0.0085, 0.0234, -0.1321, -0.0364, 0.0921, 0.0208, 0.0156, 0.0071, 0.0186, -0.0455, -0.0634, 0.0379, 0.0148, 0.0401, -0.0395, 0.0334, 0.0026, -0.0748, -0.0242, -0.0373, 0.0602, -0.0341, -0.0181, 0.0723, 0.0012, -0.1177, 0.0319, 0.0322, -0.1054, -0.0011, -0.0415, -0.0161, -0.0472, -0.0785, -0.0219, -0.0311, 0.0296, -0.0149, 0.04, 0.0001, 0.0337, 0.0841, -0.0344, -0.0171, 0.0425, -0.0122, 0.0838, 0.034, 0.0054, 0.0171, 0.0209, 0.0286, 
-0.0227, -0.0147, 0.0532, -0.027, -0.0645, -0.0858, -0.1444, 0.0824, 0.0128, -0.0485, -0.0378, -0.0229, 0.0331, -0.0248, 0.0427, -0.0624, -0.0324, -0.0271, 0.0135, 0.0504, 0.0028, -0.0772, 0.0121, -0.09, 0.031, -0.0771, -0.0703, 0.0947, 0.0997, -0.0084, 0.0774, 0.0281, 0.0405, -0.0475, 0.0217, 0.0591, 0.0241, -0.0287, 0.1064, 0.059, -0.06, 0.0422, 0.0908, 0.0341, 0.028, -0.0334, 0.0065, -0.0289, -0.0851, -0.0208, 0.0598, -0.0218, 0.001, 0.0049, 0.0257, 0.0076, -0.0599, 0.006, -0.0494, -0.0081, 0.0066, 0.0131, -0.0299, 0.0159, -0.0383, 0.0402, -0.0571, 0.0359, 0.0009, 0.0404, -0.0207, 0.0044, -0.0089, 0.0306, -0.0405, -0.0012, 0.0159, -0.005, -0.031, -0.0016, -0.0081, 0.0123, -0.0364, 0.0161, -0.0383, -0.0303, -0.0073, -0.0184, 0.0399, 0.0412, 0.0278, 0.0455, -0.0304, 0.0145, -0.0163, 0.0631, -0.0423, 0.0239, 0.0801, -0.0659, -0.0382, 0.0138, 0.051, 0.0056, -0.1605, 0.0018, 0.0077, -0.0076, 0.0119, 0.0397, -0.0823, -0.0462, 0.0465, 0.0735, 0.0283, -0.0205, -0.012, 0.0662, 0.0429, 0.0089, -0.0562, 0.1624, 0.0192, 0.0098, -0.0483, 0.0248, 0.0005, -0.0619, -0.0115, 0.0424, -0.0875, 0.0383, -0.0463, -0.0044, -0.0218, 0.014, -0.0404, -0.0198, -0.0162, -0.018, -0.0377, -0.0291, -0.0273, -0.0713, -0.0047, 0.0263, 0.0809, -0.0477, 0.0056, -0.0563, -0.061, -0.0185, 0.0223, -0.0718, 0.0163, 0.0061, -0.0716, -0.0081, 0.0079, 0.0156, -0.0124, -0.0223, -0.0092, -0.0621, 0.0033, 0.031, 0.0509, -0.0548, -0.0121, -0.0276, 0.0176, -0.04, 0.0382, -0.0737, 0.0202, -0.0314, -0.0702, 0.0685, -0.0928, 0.0698, -0.0484, 0.0541, -0.0539, 0.0895, 0.0076, -0.0134, -0.0116, 0.0227, -0.0361, -0.0729, -0.0068, -0.0501, 0.0137, -0.0134, 0.0039, -0.0463, 0.0289, -0.0336, -0.0731, -0.0362, -0.0195, 0.0466, -0.0132, 0.0336, 0.0108, 0.0219, -0.0702, -0.0117, -0.0285, 0.0644, -0.0806, 0.002, -0.0603, 0.0365, 0.0333, 0.0197, -0.037, 0.0983, 0.0011, 0.0436, 0.0506, -0.0089, -0.0134]))] 

But when I ran the above code, it kept running for 2 hours and still didn't complete. I ran it on a local Spark installation on my Mac. There are 59K observations in total, and the word2vec embedding for each text document has 300 dimensions.

0

1 Answer 1

1

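To sum up the comments on the question: your method of calculating similarities is very inefficient (it's practically brute force). Cross-joining 59K rows with themselves produces roughly 3.5 billion pairs, and each pair is then passed through a Python UDF.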

You should consider using other algorithms, such as DIMSUM: https://databricks.com/blog/2014/10/20/efficient-similarity-algorithm-now-in-spark-twitter.html

It's available in pyspark since 2.0.0: http://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.RowMatrix.columnSimilarities

Sign up to request clarification or add additional context in comments.

Comments

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.