hin/main.py

import warnings
from transformers import AutoTokenizer, AutoModel
from keybert import KeyBERT
import torch
import torch.nn.functional as F
import requests
import progressbar
from itertools import combinations
from datetime import datetime

#subject = input("Enter subject : ")
subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"

current_time = datetime.now().strftime("%m-%d_%H-%M")

file_path = f"logs/run_{current_time}.log"

content = f"# Hin run, {current_time}\n\nSubject : {subject}\n\n"

widgets = [' [',
         progressbar.Timer(format= 'elapsed time: %(elapsed)s'),
         '] ',
           progressbar.Bar('*'),' (',
           progressbar.ETA(), ') ',
          ]

# Suppress FutureWarnings and other warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

print("\n### Fetching Data ###\n")

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

print("* Got tokenizer")

model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')

print("* Got model")

kw_model = KeyBERT()

print("* Got Keybert")

# Function to compute sentence embeddings by pooling token embeddings (CLS token)
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Pooling strategy: Use the hidden state of the [CLS] token as the sentence embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
    return cls_embedding

# Function to compute cosine similarity
def compute_similarity(embedding1, embedding2):
    similarity = F.cosine_similarity(embedding1, embedding2)
    return similarity.item()

print("\n### Getting Keywords ###\n")

keywords = kw_model.extract_keywords(subject, keyphrase_ngram_range=(1, 2), stop_words='english', use_mmr=True, diversity=0.7)

print("* keywords extracted")
sorted_keywords = sorted(keywords, key=lambda x: -x[1])
text_keywords = [x[0] for x in sorted_keywords]

content += f"## Keywords\n\n{text_keywords}\n\n"

queries = []

for r in range(1, len(text_keywords) + 1):
    comb = combinations(text_keywords, r)
    queries.extend(comb)

final_queries = [subject] + ["\"" + "\" OR \"".join(query) + "\"" for query in queries]

#final_queries.ins(subject)

print("* query generated")

print("\n### Fetching Web data ###\n")

# Define the SearxNG instance URL and search query
searxng_url = "https://search.penwing.org/search"

results = []

web_bar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), progressbar.Bar()],
        maxval=len(final_queries)).start()

progress = 0

content += f"## Queries\n\n"

for query in final_queries :
    params = {
        "q": query,  # Your search query
        "format": "json",         # Requesting JSON format
        "categories": "science",  # You can specify categories (optional)
    }

    response = requests.get(searxng_url, params=params)

    if response.status_code == 200:
        data = response.json()
    
        # List to store results with similarity scores
        scored_results = []
    
        results.extend(data.get("results", []))

        content += f"{query};\n"

        if query == subject:
            test_content = ""
            for result in data.get("results", []):
                test_content+= result['title'] + "\n---\n"
            with open("test.log", 'w') as file:
                file.write(test_content)
    else:
        print(f"Error: {response.status_code}")

    progress += 1
    web_bar.update(progress)
    
print("\n\n### Starting result processing (",len(results),") ###\n")

subject_embedding = get_sentence_embedding(subject)

print("* Tokenized subject\n")

scored_results_urls = []
scored_results = []

bar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), progressbar.Bar()],
    maxval=len(results)).start()

progress = 0

found_original = False

# Process each result
for result in results :
    progress += 1
    bar.update(progress)
        
    title = result['title']
    url = result['url']
    snippet = result['content']

    if title == subject :
        found_original = True

    if url in scored_results_urls :
        continue
        
    scored_results_urls.append(url)
    
    # Get embedding for the snippet (abstract)
    #result_embedding = get_sentence_embedding(snippet)
    result_embedding = get_sentence_embedding(title)
    
    # Compute similarity between subject and snippet
    similarity = compute_similarity(subject_embedding, result_embedding)
    
    # Store the result with its similarity score
    scored_results.append({
        'title': title,
        'url': url,
        'snippet': snippet,
        'similarity': similarity
    })

if found_original :
    print("\n* Found Original Article")
    

# Sort the results by similarity (highest first)
top_results = sorted(scored_results, key=lambda x: x['similarity'], reverse=True)

print("\n\n### Done ###\n")

# Print the top 10 results
for idx, result in enumerate(top_results[:10], 1):
    print(f"Rank {idx} ({result['similarity']:.4f}):")
    print(f"Title: {result['title']}")
    print(f"URL: {result['url']}")
    print(f"Snippet: {result['snippet']}")
    print("-" * 40)

# Define the file path with the current time in the filename


content += "\n## Results\n\n"

for result in top_results :
    content += f"Title: {result['title']}\nURL: {result['url']}\n\n"

# Create and save the file
with open(file_path, 'w') as file:
    file.write(content)
able to find original article 2024-09-27 14:24:18 +02:00			`import warnings`
			`from transformers import AutoTokenizer, AutoModel`
			`from keybert import KeyBERT`
			`import torch`
			`import torch.nn.functional as F`
			`import requests`
			`import progressbar`
			`from itertools import combinations`
			`from datetime import datetime`

			`#subject = input("Enter subject : ")`
			`subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"`

			`current_time = datetime.now().strftime("%m-%d_%H-%M")`

			`file_path = f"logs/run_{current_time}.log"`

			`content = f"# Hin run, {current_time}\n\nSubject : {subject}\n\n"`

			`widgets = [' [',`
			`progressbar.Timer(format= 'elapsed time: %(elapsed)s'),`
			`'] ',`
			`progressbar.Bar('*'),' (',`
			`progressbar.ETA(), ') ',`
			`]`

			`# Suppress FutureWarnings and other warnings`
			`warnings.simplefilter(action='ignore', category=FutureWarning)`

			`print("\n### Fetching Data ###\n")`

			`# Load the tokenizer and the model`
			`tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')`

			`print("* Got tokenizer")`

			`model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')`

			`print("* Got model")`

			`kw_model = KeyBERT()`

			`print("* Got Keybert")`

			`# Function to compute sentence embeddings by pooling token embeddings (CLS token)`
			`def get_sentence_embedding(text):`
			`inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)`
			`with torch.no_grad():`
			`outputs = model(**inputs)`

			`# Pooling strategy: Use the hidden state of the [CLS] token as the sentence embedding`
			`cls_embedding = outputs.last_hidden_state[:, 0, :] # Shape: (batch_size, hidden_size)`
			`return cls_embedding`

			`# Function to compute cosine similarity`
			`def compute_similarity(embedding1, embedding2):`
			`similarity = F.cosine_similarity(embedding1, embedding2)`
			`return similarity.item()`

			`print("\n### Getting Keywords ###\n")`

			`keywords = kw_model.extract_keywords(subject, keyphrase_ngram_range=(1, 2), stop_words='english', use_mmr=True, diversity=0.7)`

			`print("* keywords extracted")`
			`sorted_keywords = sorted(keywords, key=lambda x: -x[1])`
			`text_keywords = [x[0] for x in sorted_keywords]`

			`content += f"## Keywords\n\n{text_keywords}\n\n"`

			`queries = []`

			`for r in range(1, len(text_keywords) + 1):`
			`comb = combinations(text_keywords, r)`
			`queries.extend(comb)`

			`final_queries = [subject] + ["\"" + "\" OR \"".join(query) + "\"" for query in queries]`

			`#final_queries.ins(subject)`

			`print("* query generated")`

			`print("\n### Fetching Web data ###\n")`

			`# Define the SearxNG instance URL and search query`
			`searxng_url = "https://search.penwing.org/search"`

			`results = []`

			`web_bar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), progressbar.Bar()],`
			`maxval=len(final_queries)).start()`

			`progress = 0`

			`content += f"## Queries\n\n"`

			`for query in final_queries :`
			`params = {`
			`"q": query, # Your search query`
			`"format": "json", # Requesting JSON format`
			`"categories": "science", # You can specify categories (optional)`
			`}`

			`response = requests.get(searxng_url, params=params)`

			`if response.status_code == 200:`
			`data = response.json()`

			`# List to store results with similarity scores`
			`scored_results = []`

			`results.extend(data.get("results", []))`

			`content += f"{query};\n"`

			`if query == subject:`
			`test_content = ""`
			`for result in data.get("results", []):`
			`test_content+= result['title'] + "\n---\n"`
			`with open("test.log", 'w') as file:`
			`file.write(test_content)`
			`else:`
			`print(f"Error: {response.status_code}")`

			`progress += 1`
			`web_bar.update(progress)`

			`print("\n\n### Starting result processing (",len(results),") ###\n")`

			`subject_embedding = get_sentence_embedding(subject)`

			`print("* Tokenized subject\n")`

			`scored_results_urls = []`
			`scored_results = []`

			`bar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), progressbar.Bar()],`
			`maxval=len(results)).start()`

			`progress = 0`

			`found_original = False`

			`# Process each result`
			`for result in results :`
			`progress += 1`
			`bar.update(progress)`

			`title = result['title']`
			`url = result['url']`
			`snippet = result['content']`

			`if title == subject :`
			`found_original = True`

			`if url in scored_results_urls :`
			`continue`

			`scored_results_urls.append(url)`

			`# Get embedding for the snippet (abstract)`
			`#result_embedding = get_sentence_embedding(snippet)`
			`result_embedding = get_sentence_embedding(title)`

			`# Compute similarity between subject and snippet`
			`similarity = compute_similarity(subject_embedding, result_embedding)`

			`# Store the result with its similarity score`
			`scored_results.append({`
			`'title': title,`
			`'url': url,`
			`'snippet': snippet,`
			`'similarity': similarity`
			`})`

			`if found_original :`
			`print("\n* Found Original Article")`


			`# Sort the results by similarity (highest first)`
			`top_results = sorted(scored_results, key=lambda x: x['similarity'], reverse=True)`

			`print("\n\n### Done ###\n")`

			`# Print the top 10 results`
			`for idx, result in enumerate(top_results[:10], 1):`
			`print(f"Rank {idx} ({result['similarity']:.4f}):")`
			`print(f"Title: {result['title']}")`
			`print(f"URL: {result['url']}")`
			`print(f"Snippet: {result['snippet']}")`
			`print("-" * 40)`

			`# Define the file path with the current time in the filename`


			`content += "\n## Results\n\n"`

			`for result in top_results :`
			`content += f"Title: {result['title']}\nURL: {result['url']}\n\n"`

			`# Create and save the file`
			`with open(file_path, 'w') as file:`
			`file.write(content)`