# hin/main.py
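"""Hin — rank web search results by semantic similarity to a subject.

Pipeline: extract keyphrases from the subject with KeyBERT, build OR-combined
queries from their combinations, fetch results from a SearxNG instance, embed
each result title with SciBERT, and rank by cosine similarity to the subject.
"""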

import os
import warnings
from transformers import AutoTokenizer, AutoModel
from keybert import KeyBERT
import torch
import torch.nn.functional as F
import requests
import progressbar
from itertools import combinations
from datetime import datetime
#subject = input("Enter subject : ")
subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"

current_time = datetime.now().strftime("%m-%d_%H-%M")
file_path = f"logs/run_{current_time}.log"
os.makedirs("logs", exist_ok=True)  # make sure the log directory exists before writing
content = f"# Hin run, {current_time}\n\nSubject : {subject}\n\n"
widgets = [
    ' [', progressbar.Timer(format='elapsed time: %(elapsed)s'), '] ',
    progressbar.Bar('*'),
    ' (', progressbar.ETA(), ') ',
]
# Suppress FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)
print("\n### Fetching Data ###\n")
# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Got tokenizer")
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Got model")
kw_model = KeyBERT()
print("* Got Keybert")
# Function to compute sentence embeddings by pooling token embeddings (CLS token)
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Pooling strategy: use the hidden state of the [CLS] token as the sentence embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
    return cls_embedding
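# Note: SciBERT is not fine-tuned for sentence similarity, so the [CLS] vector
# is only a rough sentence embedding; mean pooling over token embeddings is a
# common alternative, e.g.:
#   mean_embedding = outputs.last_hidden_state.mean(dim=1)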
# Function to compute cosine similarity
def compute_similarity(embedding1, embedding2):
    similarity = F.cosine_similarity(embedding1, embedding2)
    return similarity.item()
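# Example usage of the two helpers together (hypothetical inputs):
#   emb_a = get_sentence_embedding("damped composite plates")
#   emb_b = get_sentence_embedding("viscoelastic damping of laminates")
#   compute_similarity(emb_a, emb_b)  # float in [-1, 1]; higher means more similar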
print("\n### Getting Keywords ###\n")
keywords = kw_model.extract_keywords(subject, keyphrase_ngram_range=(1, 2), stop_words='english', use_mmr=True, diversity=0.7)
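# use_mmr applies Maximal Marginal Relevance: keyphrases are picked to balance
# similarity to the subject against similarity to already-chosen keyphrases;
# diversity=0.7 leans strongly toward varied, non-redundant keyphrases.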
print("* keywords extracted")
sorted_keywords = sorted(keywords, key=lambda x: -x[1])
text_keywords = [x[0] for x in sorted_keywords]
content += f"## Keywords\n\n{text_keywords}\n\n"
queries = []
for r in range(1, len(text_keywords) + 1):
    queries.extend(combinations(text_keywords, r))

final_queries = [subject] + ["\"" + "\" OR \"".join(query) + "\"" for query in queries]
print("* query generated")
print("\n### Fetching Web data ###\n")
# Define the SearxNG instance URL and search query
searxng_url = "https://search.penwing.org/search"
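# Note: SearxNG only answers format=json if the instance enables it
# (e.g. "formats: [html, json]" under "search:" in settings.yml);
# otherwise the request is rejected with HTTP 403.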
results = []
web_bar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), progressbar.Bar()],
                                  maxval=len(final_queries)).start()
progress = 0
content += f"## Queries\n\n"
for query in final_queries :
params = {
"q": query, # Your search query
"format": "json", # Requesting JSON format
"categories": "science", # You can specify categories (optional)
}
response = requests.get(searxng_url, params=params)
if response.status_code == 200:
data = response.json()
# List to store results with similarity scores
scored_results = []
results.extend(data.get("results", []))
content += f"{query};\n"
if query == subject:
test_content = ""
for result in data.get("results", []):
test_content+= result['title'] + "\n---\n"
with open("test.log", 'w') as file:
file.write(test_content)
else:
print(f"Error: {response.status_code}")
progress += 1
web_bar.update(progress)
print("\n\n### Starting result processing (",len(results),") ###\n")
subject_embedding = get_sentence_embedding(subject)
print("* Tokenized subject\n")
scored_results_urls = []
scored_results = []
bar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), progressbar.Bar()],
                              maxval=len(results)).start()
progress = 0
found_original = False
# Process each result: embed the title and score it against the subject
for result in results:
    progress += 1
    bar.update(progress)
    title = result['title']
    url = result['url']
    snippet = result.get('content', "")  # some engines return no snippet
    if title == subject:
        found_original = True
    if url in scored_results_urls:
        continue  # skip duplicate URLs returned by several queries
    scored_results_urls.append(url)
    # Embed the title; embedding the snippet (abstract) is a possible alternative
    #result_embedding = get_sentence_embedding(snippet)
    result_embedding = get_sentence_embedding(title)
    # Compute similarity between the subject and the result
    similarity = compute_similarity(subject_embedding, result_embedding)
    # Store the result with its similarity score
    scored_results.append({
        'title': title,
        'url': url,
        'snippet': snippet,
        'similarity': similarity,
    })
bar.finish()
if found_original:
    print("\n* Found Original Article")
# Sort the results by similarity (highest first)
top_results = sorted(scored_results, key=lambda x: x['similarity'], reverse=True)
print("\n\n### Done ###\n")
# Print the top 10 results
for idx, result in enumerate(top_results[:10], 1):
    print(f"Rank {idx} ({result['similarity']:.4f}):")
    print(f"Title: {result['title']}")
    print(f"URL: {result['url']}")
    print(f"Snippet: {result['snippet']}")
    print("-" * 40)
# Append the full ranked list to the run log
content += "\n## Results\n\n"
for result in top_results:
    content += f"Title: {result['title']}\nURL: {result['url']}\n\n"
# Create and save the file
with open(file_path, 'w') as file:
    file.write(content)