import os
import warnings
from transformers import AutoTokenizer, AutoModel
from keybert import KeyBERT
import torch
import torch.nn.functional as F
import requests
import progressbar
from itertools import combinations
from datetime import datetime

#subject = input("Enter subject : ")
subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"

current_time = datetime.now().strftime("%m-%d_%H-%M")
file_path = f"logs/run_{current_time}.log"
content = f"# Hin run, {current_time}\n\nSubject : {subject}\n\n"

# Make sure the log directory exists before the final write
os.makedirs("logs", exist_ok=True)

# Progress-bar widget layout (elapsed timer, bar, ETA)
widgets = [
    ' [', progressbar.Timer(format='elapsed time: %(elapsed)s'), '] ',
    progressbar.Bar('*'),
    ' (', progressbar.ETA(), ') ',
]

# Suppress FutureWarnings and other warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

print("\n### Fetching Data ###\n")

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Got tokenizer")

model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Got model")

kw_model = KeyBERT()
print("* Got KeyBERT")

# Compute a sentence embedding by pooling token embeddings ([CLS] token)
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Pooling strategy: use the hidden state of the [CLS] token as the sentence embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
    return cls_embedding

# Compute the cosine similarity between two embeddings
def compute_similarity(embedding1, embedding2):
    similarity = F.cosine_similarity(embedding1, embedding2)
    return similarity.item()

print("\n### Getting Keywords ###\n")

keywords = kw_model.extract_keywords(
    subject,
    keyphrase_ngram_range=(1, 2),
    stop_words='english',
    use_mmr=True,
    diversity=0.7,
)
print("* Keywords extracted")

sorted_keywords = sorted(keywords, key=lambda x: -x[1])
text_keywords = [x[0] for x in sorted_keywords]

content += f"## Keywords\n\n{text_keywords}\n\n"

# Build one OR-query per non-empty combination of keywords, plus the raw subject
queries = []
for r in range(1, len(text_keywords) + 1):
    queries.extend(combinations(text_keywords, r))

final_queries = [subject] + ["\"" + "\" OR \"".join(query) + "\"" for query in queries]
print("* Queries generated")

print("\n### Fetching Web data ###\n")

# Define the SearxNG instance URL
searxng_url = "https://search.penwing.org/search"

results = []

web_bar = progressbar.ProgressBar(
    widgets=[progressbar.Percentage(), progressbar.Bar()],
    maxval=len(final_queries),
).start()
progress = 0

content += "## Queries\n\n"

for query in final_queries:
    params = {
        "q": query,               # The search query
        "format": "json",         # Request JSON output
        "categories": "science",  # Restrict to the science category
    }

    response = requests.get(searxng_url, params=params)

    if response.status_code == 200:
        data = response.json()
        results.extend(data.get("results", []))
        content += f"{query};\n"

        # Dump the raw titles returned for the plain-subject query
        if query == subject:
            test_content = ""
            for result in data.get("results", []):
                test_content += result['title'] + "\n---\n"
            with open("test.log", 'w') as file:
                file.write(test_content)
    else:
        print(f"Error: {response.status_code}")

    progress += 1
    web_bar.update(progress)

web_bar.finish()

print(f"\n\n### Starting result processing ({len(results)}) ###\n")

subject_embedding = get_sentence_embedding(subject)
print("* Embedded subject\n")

scored_results_urls = []
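# --- Optional speed-up sketch (not wired into the pipeline) ---
# The loop below embeds one title per forward pass. A minimal batched variant,
# assuming the same tokenizer/model and [CLS] pooling as get_sentence_embedding;
# the name get_batch_embeddings and batch_size=32 are illustrative choices.
def get_batch_embeddings(texts, batch_size=32):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state[:, 0, :])  # [CLS] pooling, as above
    return torch.cat(embeddings, dim=0)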
scored_results = []

bar = progressbar.ProgressBar(
    widgets=[progressbar.Percentage(), progressbar.Bar()],
    maxval=len(results),
).start()
progress = 0

found_original = False

# Score each result against the subject
for result in results:
    progress += 1
    bar.update(progress)

    title = result['title']
    url = result['url']
    snippet = result['content']

    if title == subject:
        found_original = True

    # Skip duplicate URLs returned by overlapping queries
    if url in scored_results_urls:
        continue
    scored_results_urls.append(url)

    # Get embedding for the title (the snippet could be used instead)
    #result_embedding = get_sentence_embedding(snippet)
    result_embedding = get_sentence_embedding(title)

    # Compute similarity between the subject and the result
    similarity = compute_similarity(subject_embedding, result_embedding)

    # Store the result with its similarity score
    scored_results.append({
        'title': title,
        'url': url,
        'snippet': snippet,
        'similarity': similarity,
    })

bar.finish()

if found_original:
    print("\n* Found original article")

# Sort the results by similarity (highest first)
top_results = sorted(scored_results, key=lambda x: x['similarity'], reverse=True)

print("\n\n### Done ###\n")

# Print the top 10 results
for idx, result in enumerate(top_results[:10], 1):
    print(f"Rank {idx} ({result['similarity']:.4f}):")
    print(f"Title: {result['title']}")
    print(f"URL: {result['url']}")
    print(f"Snippet: {result['snippet']}")
    print("-" * 40)

# Append the ranked results to the report
content += "\n## Results\n\n"
for result in top_results:
    content += f"Title: {result['title']}\nURL: {result['url']}\n\n"

# Create and save the log file
with open(file_path, 'w') as file:
    file.write(content)
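# --- Alternative pooling sketch (assumption: mean pooling, not used in this run) ---
# get_sentence_embedding above keeps only the [CLS] token. A common variant is to
# mean-pool all token embeddings, weighted by the attention mask so padding is
# ignored; swapping this in for get_sentence_embedding lets the rankings be compared.
def get_mean_pooled_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state           # (batch_size, seq_len, hidden_size)
    mask = inputs["attention_mask"].unsqueeze(-1).float()  # (batch_size, seq_len, 1)
    summed = (token_embeddings * mask).sum(dim=1)          # sum over real (non-padding) tokens
    counts = mask.sum(dim=1).clamp(min=1e-9)               # avoid division by zero
    return summed / counts                                 # (batch_size, hidden_size)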