Compare commits


No commits in common. "a266739b51de8339ce3b9a3fd46ab655afb1e3c9" and "5a815643a7854ad4ef52e9ab989e1d63ede1cf9c" have entirely different histories.

14 changed files with 218 additions and 370 deletions

3
.gitignore vendored

@@ -1,3 +1,2 @@
/target
/models
/logs/*
/web_data/*

Pipfile

@@ -9,11 +9,6 @@ torch = "*"
requests = "*"
keybert = "*"
progressbar = "*"
rake-nltk = "*"
nltk = "*"
spacy = "*"
numpy = "*"
gensim = "*"
[dev-packages]

README.md

@@ -1,3 +0,0 @@
# Hin
A searxng/BERT mix to find science papers more efficiently, given a subject

36
evaluate.py Normal file

@@ -0,0 +1,36 @@
import warnings
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Suppress FutureWarnings and other warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')

# Function to compute sentence embeddings by pooling token embeddings (CLS token)
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Pooling strategy: use the hidden state of the [CLS] token as the sentence embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
    return cls_embedding

# Example subject and abstract
subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
abstract = """
The research work presented in this paper aims to optimise the dynamic response of a carbon-epoxy plate by including into the laminate one frequency-dependent interleaved viscoelastic layer. To keep an acceptable bending stiffness, some holes are created in the viscoelastic layer, thus facilitating the resin through layer penetration during the co-curing manufacturing process. Plates including (or not) one perforated (or non-perforated) viscoelastic layer are manufactured and investigated experimentally and numerically. First, static and dynamic tests are performed on sandwich coupons to characterise the stiffness and damping properties of the plates in a given frequency range. Resulting mechanical properties are then used to set-up a finite element model and simulate the plate dynamic response. In parallel, frequency response measurements are carried out on the manufactured plates, then successfully confronted to the numerical results. Finally, a design of experiments is built based on a limited number of numerical simulations to find the configuration of bridges that maximises the damping while keeping a stiffness higher than half the stiffness of the equivalent undamped plate."""

# Get embeddings
subject_embedding = get_sentence_embedding(subject)
abstract_embedding = get_sentence_embedding(abstract)

# Measure semantic similarity: compute cosine similarity between subject and abstract embeddings
similarity = F.cosine_similarity(subject_embedding, abstract_embedding)
print(f"Cosine Similarity: {similarity.item():.4f}")

25
key.py Normal file

@@ -0,0 +1,25 @@
from keybert import KeyBERT
from transformers import AutoTokenizer, AutoModel

# Load the SciBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Tokenizer")
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Scibert model")

# Define a KeyBERT model using SciBERT embeddings
kw_model = KeyBERT(model=model)
print("* Keybert model")

# Define the subject from which to extract keywords
subject = "tig welding of inconel 625 and influences on micro structures"

# Extract keywords from the subject
keywords = kw_model.extract_keywords(subject, keyphrase_ngram_range=(1, 2), stop_words='english', use_maxsum=True)

# Print extracted keywords
for keyword, score in keywords:
    print(f"Keyword: {keyword}, Score: {score:.4f}")

67
main.py

@ -1,67 +0,0 @@
import warnings
from datetime import datetime
import json
import os
from src.scrub import scrub_web
from src.key import create_queries
from src.evaluate import sort_results, CLS_POOLING, MEAN_POOLING, MAX_POOLING
# Suppress FutureWarnings and other warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def hin_fetch(subject, weights, pooling):
current_time = datetime.now().strftime("%m-%d_%H-%M")
data_path = f"web_data/{hash(subject)}.json"
file_path = f"logs/run_{current_time}_{weights}{pooling}.md"
log_content = f"# Hin run, {current_time}\n\nSubject : {subject}\n\n"
results = []
if os.path.exists(data_path) :
log_content += f"## Query results from {data_path}*\n\n"
print(f"* Subject known from {data_path}")
with open(data_path, 'r', encoding='utf-8') as f:
results = json.load(f)
else :
queries, keyword_log = create_queries(subject)
log_content += keyword_log
results, scrub_log = scrub_web(queries)
log_content += scrub_log
with open(data_path, 'w', encoding='utf-8') as f:
json.dump(results, f, ensure_ascii=False, indent=4)
log_content += f"*Stored results in {data_path}*\n\n"
print(f"\n* Stored results in {data_path}")
sorted_results, results_log = sort_results(subject, results, weights, pooling)
log_content += results_log
print("### Done ###\n")
report = "## Results\n"
# Print the top 10 results
for idx, result in enumerate(sorted_results[:10], 1):
report += f"\nRank {idx} ({result['score']:.4f}):\nTitle: {result['title']}\nURL: {result['url']}\nSnippet: {result['snippet']}\n" + "-" * 40
print(report + "\n")
# Create and save the file
with open(file_path, 'w') as file:
file.write(log_content + report)
#subject = input("Enter subject : ")
subject = "State of the art on the identification of wood structure natural frequencies. Influence of the mechanical properties and interest in sensitivity analysis as prospects for reverse identification method of wood elastic properties."
#subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
# hin_fetch(subject, [title_weight, snippet_weight], [title_pooling, snippet_pooling])
hin_fetch(subject, [1,0], [CLS_POOLING, MAX_POOLING])
hin_fetch(subject, [1,0], [MEAN_POOLING,MAX_POOLING])
hin_fetch(subject, [1,0], [MAX_POOLING, MAX_POOLING])
hin_fetch(subject, [0,1], [CLS_POOLING, CLS_POOLING])
hin_fetch(subject, [0,1], [CLS_POOLING, MEAN_POOLING])
hin_fetch(subject, [0,1], [CLS_POOLING, MAX_POOLING])

131
scrub-evaluate.py Normal file

@@ -0,0 +1,131 @@
import warnings
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import requests
import progressbar

# Me
#subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
#query = "composite viscoelastic damping"

# Anne
#subject = "State of the art on the identification of wood structure natural frequencies. Influence of the mechanical properties and interest in sensitivity analysis as prospects for reverse identification method of wood elastic properties."
#query = "wood frequency analysis mechanical properties"

# Axel
#subject = "Characterization of SiC MOSFET using double pulse test method."
#query = "SiC MOSFET double pulse test"

# Paul
#subject = "Thermo-Mechanical Impact of temperature oscillations on bonding and metallization for SiC MOSFETs soldered on ceramic substrate"
#query = "thermo mechanical model discrete bonding SiC MOSFET"

# Jam
subject = "tig welding of inconel 625 and influences on micro structures"
query = "tig welding inconel 625"

widgets = [' [',
           progressbar.Timer(format='elapsed time: %(elapsed)s'),
           '] ',
           progressbar.Bar('*'), ' (',
           progressbar.ETA(), ') ',
           ]

# Suppress FutureWarnings and other warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

print("\n### Fetching Data ###\n")

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Got tokenizer")
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Got model")

# Function to compute sentence embeddings by pooling token embeddings (CLS token)
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Pooling strategy: use the hidden state of the [CLS] token as the sentence embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
    return cls_embedding

# Function to compute cosine similarity
def compute_similarity(embedding1, embedding2):
    similarity = F.cosine_similarity(embedding1, embedding2)
    return similarity.item()

# Define the SearxNG instance URL and search query
searxng_url = "https://search.penwing.org/search"  # Replace with your instance URL
params = {
    "q": query,               # Your search query
    "format": "json",         # Requesting JSON format
    "categories": "science",  # You can specify categories (optional)
}

# Send the request to SearxNG API
response = requests.get(searxng_url, params=params)

# Check if the request was successful
if response.status_code == 200:
    print("* Got search results")
    # Parse the JSON response
    data = response.json()

    subject_embedding = get_sentence_embedding(subject)
    print("* Tokenized subject")

    print("\n### Starting result processing ###\n")

    # List to store results with similarity scores
    scored_results = []

    results = data.get("results", [])
    progress = 0
    bar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), progressbar.Bar()],
                                  maxval=len(results)).start()

    # Process each result
    for result in results:
        title = result['title']
        url = result['url']
        snippet = result['content']

        # Get embedding for the snippet (abstract)
        snippet_embedding = get_sentence_embedding(snippet)

        # Compute similarity between subject and snippet
        similarity = compute_similarity(subject_embedding, snippet_embedding)

        # Store the result with its similarity score
        scored_results.append({
            'title': title,
            'url': url,
            'snippet': snippet,
            'similarity': similarity
        })

        progress += 1
        bar.update(progress)

    # Sort the results by similarity (highest first)
    top_results = sorted(scored_results, key=lambda x: x['similarity'], reverse=True)[:10]

    print("\n### Done ###\n")

    # Print the top 10 results
    for idx, result in enumerate(top_results, 1):
        print(f"Rank {idx} ({result['similarity']:.4f}):")
        print(f"Title: {result['title']}")
        print(f"URL: {result['url']}")
        print(f"Snippet: {result['snippet']}")
        print("-" * 40)
else:
    print(f"Error: {response.status_code}")

25
scrub.py Normal file

@@ -0,0 +1,25 @@
import requests

# Define the SearxNG instance URL and search query
searxng_url = "https://search.penwing.org/search"  # Replace with your instance URL
params = {
    "q": "zig zag theories",  # Your search query
    "format": "json",         # Requesting JSON format
    "categories": "science",  # You can specify categories (optional)
}

# Send the request to SearxNG API
response = requests.get(searxng_url, params=params)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()

    # Print or process the results
    for result in data.get("results", []):
        print(f"Title: {result['title']}")
        print(f"URL: {result['url']}")
        print(f"Snippet: {result['content']}")
        print("-" * 40)
else:
    print(f"Error: {response.status_code}")

shell.nix

@@ -6,12 +6,9 @@ mkShell {
    pipenv
    python3
    stdenv.cc.cc.lib
    zlib
  ];
  shellHook = ''
    export LD_LIBRARY_PATH="${pkgs.stdenv.cc.cc.lib}/lib";
    export LD_LIBRARY_PATH="${pkgs.zlib}/lib:$LD_LIBRARY_PATH";
    alias run="pipenv run python main.py; notify-send -u normal -a 'Hin' 'finished'"
  '';
}

src/evaluate.py

@ -1,169 +0,0 @@
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import progressbar
import math
CLS_POOLING = 1
MEAN_POOLING = 2
MAX_POOLING = 3
print("\n### Fetching SciBert ###\n")
# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Got tokenizer")
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Got model")
def get_subject_output(subject):
subject_inputs = tokenizer(subject, return_tensors="pt", padding=True, truncation=True, max_length=512)
with torch.no_grad():
subject_outputs = model(**subject_inputs)
return subject_outputs
# Function to compute the embedding with a selected pooling method
def compute_similarity(subject_outputs, compare_text, pooling_method):
# Tokenize the input texts
compare_inputs = tokenizer(compare_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
# Compute embeddings for both the subject and the comparison text
with torch.no_grad():
compare_outputs = model(**compare_inputs)
# Pooling strategies
def cls_pooling(output):
return output.last_hidden_state[:, 0, :] # CLS token is at index 0
def mean_pooling(output):
return output.last_hidden_state.mean(dim=1) # Mean of all token embeddings
def max_pooling(output):
return output.last_hidden_state.max(dim=1).values # Max of all token embeddings
# Choose pooling strategy based on the input integer
if pooling_method == CLS_POOLING:
subject_embedding = cls_pooling(subject_outputs)
compare_embedding = cls_pooling(compare_outputs)
elif pooling_method == MEAN_POOLING:
subject_embedding = mean_pooling(subject_outputs)
compare_embedding = mean_pooling(compare_outputs)
elif pooling_method == MAX_POOLING:
subject_embedding = max_pooling(subject_outputs)
compare_embedding = max_pooling(compare_outputs)
else:
raise ValueError("Pooling method must be 1 (CLS), 2 (Mean), or 3 (Max).")
return F.cosine_similarity(subject_embedding, compare_embedding).item()
def score_results(subject, results, weights, pooling):
subject_model_output = get_subject_output(subject)
print("* Tokenized subject\n")
scored_results_urls = []
scored_results = []
print("* Started scoring results\n")
bar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), progressbar.Bar()],
maxval=len(results)).start()
progress = 0
title_score_bounds = [1, 0]
snippet_score_bounds = [1, 0]
title_pooling = pooling[0]
snippet_pooling = pooling[1]
log = f"Weights : {weights};\n\nPooling : {pooling}\n\n"
# Process each result
for result in results :
progress += 1
bar.update(progress)
title = result['title']
url = result['url']
snippet = result['content']
if title == subject :
found_original = True
if url in scored_results_urls :
continue
scored_results_urls.append(url)
# Compute similarity between subject and result
title_score, snippet_score = 1, 1
if weights[0] != 0 :
title_score = compute_similarity(subject_model_output, title, title_pooling)
if weights[1] != 0 :
snippet_score = compute_similarity(subject_model_output, snippet, snippet_pooling)
if title_score < title_score_bounds[0] :
title_score_bounds[0] = title_score
if title_score > title_score_bounds[1] :
title_score_bounds[1] = title_score
if snippet_score < snippet_score_bounds[0] :
snippet_score_bounds[0] = snippet_score
if snippet_score > snippet_score_bounds[1] :
snippet_score_bounds[1] = snippet_score
# Store the result with its similarity score
scored_results.append({
'title': title,
'url': url,
'snippet': snippet,
'title-score': title_score,
'snippet-score': snippet_score
})
log += f"Score bounds : T{title_score_bounds} # S{snippet_score_bounds}\n\n"
print("\n\n* Scored results\n")
normalized_results = []
for result in scored_results:
title_score, snippet_score = 1, 1
if weights[0] != 0 :
title_score = (result['title-score'] - title_score_bounds[0]) / (title_score_bounds[1] - title_score_bounds[0])
if weights[1] != 0 :
snippet_score = (result['snippet-score'] - snippet_score_bounds[0]) / (snippet_score_bounds[1] - snippet_score_bounds[0])
score = math.pow(math.pow(title_score, weights[0]) * math.pow(snippet_score, weights[1]), 1 / (weights[0] + weights[1]))
normalized_results.append({
'title': result['title'],
'url': result['url'],
'snippet': result['snippet'],
'score': score,
})
return normalized_results, log
def sort_results(subject, results, weights, pooling):
print("\n### Starting result processing (",len(results),") ###\n")
log = "\n---\n\n## Scoring\n\n"
scored_results, score_log = score_results(subject, results, weights, pooling)
log += score_log
# Sort the results by similarity (highest first)
sorted_results = sorted(scored_results, key=lambda x: x['score'], reverse=True)
return sorted_results, log

src/key.py

@ -1,33 +0,0 @@
from keybert import KeyBERT
from itertools import combinations
def create_queries(subject) :
print("\n### Getting Keywords ###\n")
kw_model = KeyBERT()
print("* Got Keybert")
keywords = kw_model.extract_keywords(subject, keyphrase_ngram_range=(1, 2), stop_words='english', use_mmr=True, diversity=0.7)
print("* keywords extracted")
sorted_keywords = sorted(keywords, key=lambda x: -x[1])
text_keywords = [x[0] for x in sorted_keywords]
log = f"## Keywords\n\n{text_keywords}\n\n"
queries = []
for r in range(1, len(text_keywords) + 1):
comb = combinations(text_keywords, r)
queries.extend(comb)
final_queries = [subject] + ["\"" + "\" OR \"".join(query) + "\"" for query in queries]
#final_queries.ins(subject)
print("* query generated")
return final_queries, log

src/scrub.py

@ -1,44 +0,0 @@
import requests
import progressbar
searxng_url = "https://search.penwing.org/search"
def scrub_web(queries) :
print("\n### Fetching Web data ###\n")
web_bar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), progressbar.Bar()],
maxval=len(queries)).start()
progress = 0
results = []
log = "## Queries\n\n"
for query in queries :
params = {
"q": query, # Your search query
"format": "json", # Requesting JSON format
"categories": "science", # You can specify categories (optional)
}
response = requests.get(searxng_url, params=params)
if response.status_code == 200:
data = response.json()
# List to store results with similarity scores
scored_results = []
results.extend(data.get("results", []))
log += f"{query};\n"
else:
print(f"Error: {response.status_code}")
progress += 1
web_bar.update(progress)
print("")
return results, log


@ -1,25 +0,0 @@
import requests
subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
searxng_url = "https://search.penwing.org/search"
params = {
"q": subject, # Your search query
"format": "json", # Requesting JSON format
"categories": "science", # You can specify categories (optional)
}
response = requests.get(searxng_url, params=params)
if response.status_code == 200:
data = response.json()
# List to store results with similarity scores
scored_results = []
for result in data.get("results", []):
print(result['title'])
print("---")
else:
print(f"Error: {response.status_code}")


@ -1,19 +0,0 @@
# Me
subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
query = "composite viscoelastic damping"
# Anne
subject = "State of the art on the identification of wood structure natural frequencies. Influence of the mechanical properties and interest in sensitivity analysis as prospects for reverse identification method of wood elastic properties."
query = "wood frequency analysis mechanical properties"
# Axel
subject = "Characterization of SiC MOSFET using double pulse test method."
query = "SiC MOSFET double pulse test"
# Paul
subject = "Thermo-Mechanical Impact of temperature oscillations on bonding and metallization for SiC MOSFETs soldered on ceramic substrate"
query = "thermo mechanical model discrete bonding SiC MOSFET"
# Jam
subject = "tig welding of inconel 625 and influences on micro structures"
query = "tig welding inconel 625"