From 9fd246cb01096664ba2972fb0998bd433301ac00 Mon Sep 17 00:00:00 2001
From: WanderingPenwing
Date: Thu, 26 Sep 2024 10:50:50 +0200
Subject: [PATCH] cosine similarity

---
 main.py | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/main.py b/main.py
index 0d245b5..738636d 100644
--- a/main.py
+++ b/main.py
@@ -1,19 +1,38 @@
+import warnings
 from transformers import AutoTokenizer, AutoModel
 import torch
+import torch.nn.functional as F
+
+# Suppress FutureWarning messages (only this warning category is filtered)
+warnings.simplefilter(action='ignore', category=FutureWarning)
 
 # Load the tokenizer and the model
 tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
 model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
 
-# Prepare a test input sentence (e.g., "Hello, world!")
-input_text = "Hello, world!"
+# Compute a sentence embedding by pooling token embeddings ([CLS] token)
+def get_sentence_embedding(text):
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    # Pooling strategy: use the hidden state of the [CLS] token as the sentence embedding
+    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
+    return cls_embedding
 
-# Tokenize the input text and convert it to input IDs
-inputs = tokenizer(input_text, return_tensors="pt") # Return tensors in PyTorch format
+# Example subject and abstract
+subject = "Artificial Intelligence in Healthcare"
+abstract = """
+Artificial intelligence (AI) is transforming healthcare with its ability to analyze complex medical data and assist in diagnosis.
+AI models, especially in medical imaging, have shown promise in detecting diseases like cancer and predicting patient outcomes.
+"""
 
-# Forward pass through the model
-with torch.no_grad(): # Disable gradient calculation since we are only doing inference
-    outputs = model(**inputs)
+# Get embeddings
+subject_embedding = get_sentence_embedding(subject)
+abstract_embedding = get_sentence_embedding(abstract)
 
-# Output model's hidden states (for the last layer)
-print(outputs.last_hidden_state)
+# Step 2: measure semantic similarity using cosine similarity
+
+# Compute cosine similarity between subject and abstract embeddings
+similarity = F.cosine_similarity(subject_embedding, abstract_embedding)
+print(f"Cosine Similarity: {similarity.item():.4f}")
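
Note: get_sentence_embedding pools only the [CLS] token. The [CLS] vector of a
BERT-style encoder such as SciBERT is often a weak sentence representation without
fine-tuning, so a mask-weighted mean over all token embeddings is a common
alternative pooling strategy. A minimal sketch, reusing the tokenizer and model
objects from main.py above (the function name get_mean_pooled_embedding is
illustrative, not part of the patch):

    import torch

    def get_mean_pooled_embedding(text):
        # Tokenize and run the model exactly as in get_sentence_embedding
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        # Mask-weighted mean: average token embeddings, skipping padding positions
        mask = inputs["attention_mask"].unsqueeze(-1).float()   # (batch, seq_len, 1)
        summed = (outputs.last_hidden_state * mask).sum(dim=1)  # (batch, hidden_size)
        counts = mask.sum(dim=1).clamp(min=1e-9)                # (batch, 1) token counts
        return summed / counts

Swapping this in for get_sentence_embedding leaves the rest of the script unchanged:
F.cosine_similarity still returns a value in [-1, 1], with higher values indicating a
closer semantic match between subject and abstract.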