Calculate cosine similarity given 2 sentence strings in python

Calculate cosine similarity given 2 sentence strings in python

To calculate the cosine similarity between two sentence strings in Python, you can follow these steps:

  1. Tokenize the sentences: Split each sentence into individual words or tokens.
  2. Create a vector representation for each sentence.
  3. Calculate the cosine similarity between the two vectors.

Here's a Python code example using the nltk library for tokenization and numpy for vector operations:

import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from collections import Counter

nltk.download('punkt')  # Download the punkt tokenizer data if not already downloaded
# NLTK 3.9+ loads the tokenizer from the 'punkt_tab' resource instead of 'punkt'.
nltk.download('punkt_tab')


def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two equal-length numeric vectors.

    Args:
        vec1: Sequence of numbers.
        vec2: Sequence of numbers with the same length as ``vec1``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        0.0 when either vector has zero norm (the ratio is undefined there
        and would otherwise evaluate to nan).
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product


def sentence_to_vector(sentence, all_words):
    """Convert a sentence into a word-count vector over ``all_words``.

    Args:
        sentence: The sentence string; it is lowercased before tokenizing.
        all_words: Iterable of vocabulary words. The returned vector has one
            entry per word, in the iteration order of ``all_words``.

    Returns:
        A list of integer counts.
    """
    # Counter returns 0 for missing keys, so no membership test is needed.
    counts = Counter(word_tokenize(sentence.lower()))
    return [counts[word] for word in all_words]


# Example sentences
sentence1 = "This is the first sentence."
sentence2 = "This is the second sentence."

# Tokenize and build vocabulary (sorted so the vector layout is reproducible)
all_words = sorted(set(word_tokenize(sentence1.lower()) + word_tokenize(sentence2.lower())))

# Convert sentences to vectors
vector1 = sentence_to_vector(sentence1, all_words)
vector2 = sentence_to_vector(sentence2, all_words)

# Calculate cosine similarity
similarity = cosine_similarity(vector1, vector2)
print(f"Cosine Similarity: {similarity}")

In this code:

  • We tokenize the sentences using nltk.word_tokenize to split them into words.
  • We build a vocabulary containing all unique words from both sentences.
  • We convert each sentence into a vector representation using a Counter to count the occurrences of each word in the sentence.
  • We calculate the cosine similarity between the two vectors using the formula for cosine similarity.

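Make sure to install the required libraries (pip install nltk numpy) and download the tokenizer data by running nltk.download('punkt') if you haven't already. On NLTK 3.9 and later the tokenizer is loaded from the 'punkt_tab' resource instead, so run nltk.download('punkt_tab') there.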

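This code will give you a cosine similarity score between 0 and 1 (word counts are never negative, so the score cannot fall below 0). A score of 1 indicates that the two sentences have the same relative word frequencies (word order is ignored), and 0 indicates that they share no words at all.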

Examples

  1. Calculate cosine similarity using scikit-learn in Python: Description: Utilize scikit-learn library to calculate the cosine similarity between two sentence strings.

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics.pairwise import cosine_similarity


    def cosine_similarity_sentence(s1, s2):
        """Return the cosine similarity of the word-count vectors of s1 and s2."""
        # Fit one shared vocabulary on both sentences; each row holds one sentence's counts.
        count_matrix = CountVectorizer().fit_transform([s1, s2])
        dense_counts = count_matrix.toarray()
        # The result is a 2x2 pairwise matrix; entry [0][1] compares s1 with s2.
        pairwise = cosine_similarity(dense_counts)
        return pairwise[0][1]


    # Example usage
    sentence1 = "This is the first sentence."
    sentence2 = "This is the second sentence."
    similarity = cosine_similarity_sentence(sentence1, sentence2)
    print("Cosine similarity:", similarity)
  2. Calculate cosine similarity using spaCy in Python: Description: Use spaCy library to compute the cosine similarity between two sentence strings.

    from functools import lru_cache

    import spacy


    @lru_cache(maxsize=None)
    def _load_model(name="en_core_web_md"):
        """Load a spaCy pipeline once and reuse it on later calls."""
        return spacy.load(name)


    def cosine_similarity_sentence(s1, s2):
        """Return spaCy's similarity score between the two sentences.

        The "en_core_web_md" pipeline is loaded on the first call only;
        reloading it for every comparison is slow and gives the same result.

        Args:
            s1: First sentence string.
            s2: Second sentence string.

        Returns:
            The value of ``Doc.similarity`` for the two parsed sentences.
        """
        nlp = _load_model()
        return nlp(s1).similarity(nlp(s2))


    # Example usage
    sentence1 = "This is the first sentence."
    sentence2 = "This is the second sentence."
    similarity = cosine_similarity_sentence(sentence1, sentence2)
    print("Cosine similarity:", similarity)
  3. Calculate cosine similarity using gensim in Python: Description: Use gensim library to compute the cosine similarity between two sentence strings.

    import string

    from gensim.models import KeyedVectors


    def cosine_similarity_sentence(s1, s2):
        """Return the cosine similarity between the mean word vectors of s1 and s2.

        ``KeyedVectors.similarity`` compares two single vocabulary keys, so
        passing whole sentences raises ``KeyError``. Each sentence is instead
        split into tokens and the two token lists are compared with
        ``n_similarity``.

        Args:
            s1: First sentence string.
            s2: Second sentence string.

        Returns:
            The similarity score, or 0.0 when either sentence has no token
            present in the vector vocabulary.
        """
        word_vectors = KeyedVectors.load_word2vec_format('path_to_pretrained_word_vectors.bin', binary=True)

        def known_tokens(sentence):
            # Strip surrounding punctuation ("sentence." -> "sentence") and
            # keep only tokens the model has a vector for.
            tokens = (token.strip(string.punctuation) for token in sentence.split())
            return [token for token in tokens if token in word_vectors]

        tokens1, tokens2 = known_tokens(s1), known_tokens(s2)
        if not tokens1 or not tokens2:
            return 0.0
        return word_vectors.n_similarity(tokens1, tokens2)


    # Example usage
    sentence1 = "This is the first sentence."
    sentence2 = "This is the second sentence."
    similarity = cosine_similarity_sentence(sentence1, sentence2)
    print("Cosine similarity:", similarity)
  4. Calculate cosine similarity using numpy in Python: Description: Implement cosine similarity calculation using NumPy for two sentence strings.

    from collections import Counter

    import numpy as np


    def cosine_similarity_sentence(s1, s2):
        """Return the cosine similarity of the word-count vectors of s1 and s2.

        Sentences are split on whitespace only; case and punctuation are kept
        as they are, so "sentence." and "sentence" count as different words.

        Args:
            s1: First sentence string.
            s2: Second sentence string.

        Returns:
            A float between 0.0 and 1.0. Returns 0.0 when either sentence
            contains no words, where the ratio would otherwise be 0/0 (nan).
        """
        counts1, counts2 = Counter(s1.split()), Counter(s2.split())
        # Sorted union of both vocabularies gives a reproducible vector layout.
        vocabulary = sorted(counts1.keys() | counts2.keys())
        v1 = np.array([counts1[word] for word in vocabulary])
        v2 = np.array([counts2[word] for word in vocabulary])
        norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
        if norm_product == 0:
            return 0.0
        return float(np.dot(v1, v2) / norm_product)


    # Example usage
    sentence1 = "This is the first sentence."
    sentence2 = "This is the second sentence."
    similarity = cosine_similarity_sentence(sentence1, sentence2)
    print("Cosine similarity:", similarity)
  5. Calculate cosine similarity using NLTK in Python: Description: Use NLTK library to calculate the cosine similarity between two sentence strings.

    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize


    def cosine_similarity_sentence(s1, s2):
        """Return the cosine similarity of the binary word-presence vectors of s1 and s2.

        Tokens are lowercased; non-alphanumeric tokens and English stop words
        are dropped. For 0/1 vectors the cosine equals
        ``|A & B| / sqrt(|A| * |B|)``, where A and B are the remaining word sets.

        Requires the NLTK stop-word corpus and tokenizer data to be installed.

        Args:
            s1: First sentence string.
            s2: Second sentence string.

        Returns:
            A float between 0.0 and 1.0. Returns 0.0 when either sentence has
            no remaining words (for example, only stop words), instead of
            raising ``ZeroDivisionError``.
        """
        stop_words = set(stopwords.words('english'))

        def content_words(sentence):
            return {
                word.lower()
                for word in word_tokenize(sentence)
                if word.isalnum() and word.lower() not in stop_words
            }

        words1, words2 = content_words(s1), content_words(s2)
        if not words1 or not words2:
            return 0.0
        magnitude1 = len(words1) ** 0.5
        magnitude2 = len(words2) ** 0.5
        return len(words1 & words2) / (magnitude1 * magnitude2)


    # Example usage
    sentence1 = "This is the first sentence."
    sentence2 = "This is the second sentence."
    similarity = cosine_similarity_sentence(sentence1, sentence2)
    print("Cosine similarity:", similarity)
  6. Calculate cosine similarity using TensorFlow in Python: Description: Utilize TensorFlow library to compute the cosine similarity between two sentence strings.

    import tensorflow as tf


    def cosine_similarity_sentence(s1, s2):
        """Return the cosine similarity of the word-count vectors of s1 and s2.

        Fixes over the earlier version of this snippet:
          * ``output_mode="count"`` produces per-word counts; the default
            "int" mode yields sequences of token ids, for which a cosine is
            meaningless.
          * ``tf.keras.losses.cosine_similarity`` is a loss and returns the
            NEGATIVE cosine similarity, so the result is negated.
          * The loss of two 1-D vectors is a scalar, so no ``[0][0]`` indexing.

        Uses ``tf.keras.layers.TextVectorization`` (the non-experimental
        location of the layer; assumes TensorFlow 2.6 or newer).

        Args:
            s1: First sentence string.
            s2: Second sentence string.

        Returns:
            The cosine similarity as a Python float.
        """
        sentences = tf.constant([s1, s2])
        vectorizer = tf.keras.layers.TextVectorization(output_mode="count")
        vectorizer.adapt(sentences)
        counts = tf.cast(vectorizer(sentences), tf.float32)
        negative_similarity = tf.keras.losses.cosine_similarity(counts[0], counts[1])
        return float(-negative_similarity.numpy())


    # Example usage
    sentence1 = "This is the first sentence."
    sentence2 = "This is the second sentence."
    similarity = cosine_similarity_sentence(sentence1, sentence2)
    print("Cosine similarity:", similarity)
  7. Calculate cosine similarity using word embeddings in Python: Description: Compute the cosine similarity between two sentence strings using pre-trained word embeddings.

    import string

    import gensim.downloader as api
    import numpy as np


    def cosine_similarity_sentence(s1, s2):
        """Return the cosine similarity between the mean GloVe vectors of s1 and s2.

        Tokens are lowercased and stripped of surrounding punctuation before
        lookup, because tokens such as "This" or "sentence." would otherwise
        miss the vocabulary (the glove-wiki-gigaword vectors are understood
        to be lowercase-only).

        Args:
            s1: First sentence string.
            s2: Second sentence string.

        Returns:
            A float cosine similarity, or 0.0 when either sentence has no
            token in the vocabulary or a zero mean vector.
        """
        word_vectors = api.load("glove-wiki-gigaword-100")

        def mean_vector(sentence):
            tokens = (token.strip(string.punctuation) for token in sentence.lower().split())
            vectors = [word_vectors[token] for token in tokens if token in word_vectors]
            # np.mean of an empty list is nan, so signal "no known words" explicitly.
            return np.mean(vectors, axis=0) if vectors else None

        v1, v2 = mean_vector(s1), mean_vector(s2)
        if v1 is None or v2 is None:
            return 0.0
        norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
        if norm_product == 0:
            return 0.0
        return float(np.dot(v1, v2) / norm_product)


    # Example usage
    sentence1 = "This is the first sentence."
    sentence2 = "This is the second sentence."
    similarity = cosine_similarity_sentence(sentence1, sentence2)
    print("Cosine similarity:", similarity)
  8. Calculate cosine similarity using Word2Vec in Python: Description: Use Word2Vec model to calculate the cosine similarity between two sentence strings.

    import numpy as np
    from gensim.models import Word2Vec


    def cosine_similarity_sentence(s1, s2):
        """Return the cosine similarity between the mean Word2Vec vectors of s1 and s2.

        ``infer_vector`` belongs to Doc2Vec, not Word2Vec, so each sentence is
        represented here by the average of the vectors of its known words
        (looked up through ``model.wv``).

        Args:
            s1: First sentence string.
            s2: Second sentence string.

        Returns:
            A float cosine similarity, or 0.0 when either sentence has no
            word in the model vocabulary or a zero mean vector.
        """
        model = Word2Vec.load('path_to_pretrained_word2vec_model')
        word_vectors = model.wv

        def mean_vector(sentence):
            vectors = [word_vectors[word] for word in sentence.split() if word in word_vectors]
            # np.mean of an empty list is nan, so signal "no known words" explicitly.
            return np.mean(vectors, axis=0) if vectors else None

        v1, v2 = mean_vector(s1), mean_vector(s2)
        if v1 is None or v2 is None:
            return 0.0
        norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
        if norm_product == 0:
            return 0.0
        return float(np.dot(v1, v2) / norm_product)


    # Example usage
    sentence1 = "This is the first sentence."
    sentence2 = "This is the second sentence."
    similarity = cosine_similarity_sentence(sentence1, sentence2)
    print("Cosine similarity:", similarity)
  9. Calculate cosine similarity using BERT embeddings in Python: Description: Use BERT embeddings to compute the cosine similarity between two sentence strings.

    import numpy as np
    from sentence_transformers import SentenceTransformer


    def cosine_similarity_sentence(s1, s2):
        """Return the cosine similarity between sentence embeddings of s1 and s2.

        Both sentences are encoded in one batch with the
        'bert-base-nli-mean-tokens' model, and the cosine of the two
        embedding vectors is computed with NumPy (which this snippet
        previously used without importing).

        Args:
            s1: First sentence string.
            s2: Second sentence string.

        Returns:
            The cosine similarity as a Python float.
        """
        model = SentenceTransformer('bert-base-nli-mean-tokens')
        v1, v2 = model.encode([s1, s2])
        return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))


    # Example usage
    sentence1 = "This is the first sentence."
    sentence2 = "This is the second sentence."
    similarity = cosine_similarity_sentence(sentence1, sentence2)
    print("Cosine similarity:", similarity)
  10. Calculate cosine similarity using universal sentence encoder in Python: Description: Use the Universal Sentence Encoder to calculate the cosine similarity between two sentence strings.

    import numpy as np
    import tensorflow_hub as hub


    def cosine_similarity_sentence(s1, s2):
        """Return the cosine similarity between Universal Sentence Encoder embeddings.

        The embeddings are divided by their norms explicitly, so the result is
        a true cosine rather than a raw dot product. NumPy is imported here;
        the snippet previously used ``np`` without importing it.

        Args:
            s1: First sentence string.
            s2: Second sentence string.

        Returns:
            The cosine similarity as a Python float.
        """
        embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
        # Encode both sentences in one batch; each row is one sentence embedding.
        v1, v2 = embed([s1, s2]).numpy()
        return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))


    # Example usage
    sentence1 = "This is the first sentence."
    sentence2 = "This is the second sentence."
    similarity = cosine_similarity_sentence(sentence1, sentence2)
    print("Cosine similarity:", similarity)

More Tags

latex cqlsh next.js npapi gpio pgp vlookup drupal-contact-form radians executable

More Python Questions

More Chemical thermodynamics Calculators

More Biochemistry Calculators

More Cat Calculators

More Animal pregnancy Calculators