Compare commits


10 commits

Author SHA1 Message Date
WanderingPenwing a266739b51 added readme 2024-09-27 21:44:41 +02:00
WanderingPenwing cb36ef8bd2 modified gitignore 2024-09-27 21:42:23 +02:00
WanderingPenwing 1bb5922b98 empty web_data folder 2024-09-27 21:41:37 +02:00
WanderingPenwing ebfc48fdcb empty logs folder 2024-09-27 21:40:05 +02:00
WanderingPenwing 3489ade151 better log name 2024-09-27 21:38:44 +02:00
WanderingPenwing b3f0bea0e5 different pooling techniques 2024-09-27 18:18:21 +02:00
WanderingPenwing ef5c154a1e better logs, weighted scores 2024-09-27 17:11:21 +02:00
WanderingPenwing 5a51a383ed split files 2024-09-27 14:53:16 +02:00
WanderingPenwing 7f4bc61fa8 able to find original article 2024-09-27 14:24:18 +02:00
WanderingPenwing 2539b919c2 keyword extraction 2024-09-27 12:11:59 +02:00
14 changed files with 370 additions and 218 deletions

3
.gitignore vendored
@@ -1,2 +1,3 @@
-/target
 /models
+/logs/*
+/web_data/*

Pipfile

@@ -9,6 +9,11 @@ torch = "*"
 requests = "*"
 keybert = "*"
 progressbar = "*"
+rake-nltk = "*"
+nltk = "*"
+spacy = "*"
+numpy = "*"
+gensim = "*"

 [dev-packages]

3
README.md Normal file
@@ -0,0 +1,3 @@
# Hin
A searxng/BERT mix to find science papers more efficiently, given a subject

@@ -1,36 +0,0 @@
import warnings
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Suppress FutureWarnings and other warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')

# Function to compute sentence embeddings by pooling token embeddings (CLS token)
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Pooling strategy: Use the hidden state of the [CLS] token as the sentence embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
    return cls_embedding

# Example subject and abstract
subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
abstract = """
The research work presented in this paper aims to optimise the dynamic response of a carbon-epoxy plate by including into the laminate one frequency-dependent interleaved viscoelastic layer. To keep an acceptable bending stiffness, some holes are created in the viscoelastic layer, thus facilitating the resin through layer penetration during the co-curing manufacturing process. Plates including (or not) one perforated (or non-perforated) viscoelastic layer are manufactured and investigated experimentally and numerically. First, static and dynamic tests are performed on sandwich coupons to characterise the stiffness and damping properties of the plates in a given frequency range. Resulting mechanical properties are then used to set-up a finite element model and simulate the plate dynamic response. In parallel, frequency response measurements are carried out on the manufactured plates, then successfully confronted to the numerical results. Finally, a design of experiments is built based on a limited number of numerical simulations to find the configuration of bridges that maximises the damping while keeping a stiffness higher than half the stiffness of the equivalent undamped plate."""

# Get embeddings
subject_embedding = get_sentence_embedding(subject)
abstract_embedding = get_sentence_embedding(abstract)

# 2. **Measure Semantic Similarity Using Cosine Similarity**
# Compute cosine similarity between subject and abstract embeddings
similarity = F.cosine_similarity(subject_embedding, abstract_embedding)
print(f"Cosine Similarity: {similarity.item():.4f}")

25
key.py
@@ -1,25 +0,0 @@
from keybert import KeyBERT
from transformers import AutoTokenizer, AutoModel

# Load the SciBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Tokenizer")
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Scibert model")

# Define a KeyBERT model using SciBERT embeddings
kw_model = KeyBERT(model=model)
print("* Keybert model")

# Define the subject from which to extract keywords
subject = "tig welding of inconel 625 and influences on micro structures"

# Extract keywords from the subject
keywords = kw_model.extract_keywords(subject, keyphrase_ngram_range=(1, 2), stop_words='english', use_maxsum=True)

# Print extracted keywords
for keyword, score in keywords:
    print(f"Keyword: {keyword}, Score: {score:.4f}")

67
main.py Normal file
@@ -0,0 +1,67 @@
import warnings
from datetime import datetime
import json
import os

from src.scrub import scrub_web
from src.key import create_queries
from src.evaluate import sort_results, CLS_POOLING, MEAN_POOLING, MAX_POOLING

# Suppress FutureWarnings and other warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

def hin_fetch(subject, weights, pooling):
    current_time = datetime.now().strftime("%m-%d_%H-%M")
    data_path = f"web_data/{hash(subject)}.json"
    file_path = f"logs/run_{current_time}_{weights}{pooling}.md"

    log_content = f"# Hin run, {current_time}\n\nSubject : {subject}\n\n"
    results = []

    if os.path.exists(data_path) :
        log_content += f"## Query results from {data_path}*\n\n"
        print(f"* Subject known from {data_path}")
        with open(data_path, 'r', encoding='utf-8') as f:
            results = json.load(f)
    else :
        queries, keyword_log = create_queries(subject)
        log_content += keyword_log

        results, scrub_log = scrub_web(queries)
        log_content += scrub_log

        with open(data_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=4)
        log_content += f"*Stored results in {data_path}*\n\n"
        print(f"\n* Stored results in {data_path}")

    sorted_results, results_log = sort_results(subject, results, weights, pooling)
    log_content += results_log

    print("### Done ###\n")

    report = "## Results\n"
    # Print the top 10 results
    for idx, result in enumerate(sorted_results[:10], 1):
        report += f"\nRank {idx} ({result['score']:.4f}):\nTitle: {result['title']}\nURL: {result['url']}\nSnippet: {result['snippet']}\n" + "-" * 40
    print(report + "\n")

    # Create and save the file
    with open(file_path, 'w') as file:
        file.write(log_content + report)

#subject = input("Enter subject : ")
subject = "State of the art on the identification of wood structure natural frequencies. Influence of the mechanical properties and interest in sensitivity analysis as prospects for reverse identification method of wood elastic properties."
#subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"

# hin_fetch(subject, [title_weight, snippet_weight], [title_pooling, snippet_pooling])
hin_fetch(subject, [1,0], [CLS_POOLING, MAX_POOLING])
hin_fetch(subject, [1,0], [MEAN_POOLING,MAX_POOLING])
hin_fetch(subject, [1,0], [MAX_POOLING, MAX_POOLING])
hin_fetch(subject, [0,1], [CLS_POOLING, CLS_POOLING])
hin_fetch(subject, [0,1], [CLS_POOLING, MEAN_POOLING])
hin_fetch(subject, [0,1], [CLS_POOLING, MAX_POOLING])
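
A note on the cache key above: Python's built-in hash() is salted per interpreter process (PYTHONHASHSEED), so web_data/{hash(subject)}.json will generally not be found again on the next run and the web scrub will be repeated. A minimal sketch of a stable alternative, using hashlib (an assumption for illustration, not part of this commit):

import hashlib

def cache_key(subject):
    # sha1 of the UTF-8 subject is stable across runs, unlike built-in hash()
    return hashlib.sha1(subject.encode("utf-8")).hexdigest()

data_path = f"web_data/{cache_key(subject)}.json"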

@@ -1,131 +0,0 @@
import warnings
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import requests
import progressbar

# Me
#subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
#query = "composite viscoelastic damping"

# Anne
#subject = "State of the art on the identification of wood structure natural frequencies. Influence of the mechanical properties and interest in sensitivity analysis as prospects for reverse identification method of wood elastic properties."
#query = "wood frequency analysis mechanical properties"

# Axel
#subject = "Characterization of SiC MOSFET using double pulse test method."
#query = "SiC MOSFET double pulse test"

# Paul
#subject = "Thermo-Mechanical Impact of temperature oscillations on bonding and metallization for SiC MOSFETs soldered on ceramic substrate"
#query = "thermo mechanical model discrete bonding SiC MOSFET"

# Jam
subject = "tig welding of inconel 625 and influences on micro structures"
query = "tig welding inconel 625"

widgets = [' [',
           progressbar.Timer(format= 'elapsed time: %(elapsed)s'),
           '] ',
           progressbar.Bar('*'),' (',
           progressbar.ETA(), ') ',
          ]

# Suppress FutureWarnings and other warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

print("\n### Fetching Data ###\n")

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Got tokenizer")
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Got model")

# Function to compute sentence embeddings by pooling token embeddings (CLS token)
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Pooling strategy: Use the hidden state of the [CLS] token as the sentence embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
    return cls_embedding

# Function to compute cosine similarity
def compute_similarity(embedding1, embedding2):
    similarity = F.cosine_similarity(embedding1, embedding2)
    return similarity.item()

# Define the SearxNG instance URL and search query
searxng_url = "https://search.penwing.org/search"  # Replace with your instance URL
params = {
    "q": query,  # Your search query
    "format": "json",  # Requesting JSON format
    "categories": "science",  # You can specify categories (optional)
}

# Send the request to SearxNG API
response = requests.get(searxng_url, params=params)

# Check if the request was successful
if response.status_code == 200:
    print("* Got search results")
    # Parse the JSON response
    data = response.json()

    subject_embedding = get_sentence_embedding(subject)
    print("* Tokenized subject")

    print("\n### Starting result processing ###\n")

    # List to store results with similarity scores
    scored_results = []
    results = data.get("results", [])

    progress = 0
    bar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), progressbar.Bar()],
                                  maxval=len(results)).start()

    # Process each result
    for result in results :
        title = result['title']
        url = result['url']
        snippet = result['content']

        # Get embedding for the snippet (abstract)
        snippet_embedding = get_sentence_embedding(snippet)

        # Compute similarity between subject and snippet
        similarity = compute_similarity(subject_embedding, snippet_embedding)

        # Store the result with its similarity score
        scored_results.append({
            'title': title,
            'url': url,
            'snippet': snippet,
            'similarity': similarity
        })

        progress += 1
        bar.update(progress)

    # Sort the results by similarity (highest first)
    top_results = sorted(scored_results, key=lambda x: x['similarity'], reverse=True)[:10]

    print("\n### Done ###\n")

    # Print the top 10 results
    for idx, result in enumerate(top_results, 1):
        print(f"Rank {idx} ({result['similarity']:.4f}):")
        print(f"Title: {result['title']}")
        print(f"URL: {result['url']}")
        print(f"Snippet: {result['snippet']}")
        print("-" * 40)
else:
    print(f"Error: {response.status_code}")

@@ -1,25 +0,0 @@
import requests

# Define the SearxNG instance URL and search query
searxng_url = "https://search.penwing.org/search"  # Replace with your instance URL
params = {
    "q": "zig zag theories",  # Your search query
    "format": "json",  # Requesting JSON format
    "categories": "science",  # You can specify categories (optional)
}

# Send the request to SearxNG API
response = requests.get(searxng_url, params=params)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()

    # Print or process the results
    for result in data.get("results", []):
        print(f"Title: {result['title']}")
        print(f"URL: {result['url']}")
        print(f"Snippet: {result['content']}")
        print("-" * 40)
else:
    print(f"Error: {response.status_code}")

shell.nix

@@ -6,9 +6,12 @@ mkShell {
     pipenv
     python3
     stdenv.cc.cc.lib
+    zlib
   ];
   shellHook = ''
     export LD_LIBRARY_PATH="${pkgs.stdenv.cc.cc.lib}/lib";
+    export LD_LIBRARY_PATH="${pkgs.zlib}/lib:$LD_LIBRARY_PATH";
+    alias run="pipenv run python main.py; notify-send -u normal -a 'Hin' 'finished'"
   '';
 }

169
src/evaluate.py Normal file
@@ -0,0 +1,169 @@
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import progressbar
import math

CLS_POOLING = 1
MEAN_POOLING = 2
MAX_POOLING = 3

print("\n### Fetching SciBert ###\n")

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Got tokenizer")
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Got model")

def get_subject_output(subject):
    subject_inputs = tokenizer(subject, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        subject_outputs = model(**subject_inputs)
    return subject_outputs

# Function to compute the embedding with a selected pooling method
def compute_similarity(subject_outputs, compare_text, pooling_method):
    # Tokenize the input texts
    compare_inputs = tokenizer(compare_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Compute embeddings for both the subject and the comparison text
    with torch.no_grad():
        compare_outputs = model(**compare_inputs)

    # Pooling strategies
    def cls_pooling(output):
        return output.last_hidden_state[:, 0, :]  # CLS token is at index 0

    def mean_pooling(output):
        return output.last_hidden_state.mean(dim=1)  # Mean of all token embeddings

    def max_pooling(output):
        return output.last_hidden_state.max(dim=1).values  # Max of all token embeddings

    # Choose pooling strategy based on the input integer
    if pooling_method == CLS_POOLING:
        subject_embedding = cls_pooling(subject_outputs)
        compare_embedding = cls_pooling(compare_outputs)
    elif pooling_method == MEAN_POOLING:
        subject_embedding = mean_pooling(subject_outputs)
        compare_embedding = mean_pooling(compare_outputs)
    elif pooling_method == MAX_POOLING:
        subject_embedding = max_pooling(subject_outputs)
        compare_embedding = max_pooling(compare_outputs)
    else:
        raise ValueError("Pooling method must be 1 (CLS), 2 (Mean), or 3 (Max).")

    return F.cosine_similarity(subject_embedding, compare_embedding).item()

def score_results(subject, results, weights, pooling):
    subject_model_output = get_subject_output(subject)
    print("* Tokenized subject\n")

    scored_results_urls = []
    scored_results = []

    print("* Started scoring results\n")

    bar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), progressbar.Bar()],
                                  maxval=len(results)).start()
    progress = 0

    title_score_bounds = [1, 0]
    snippet_score_bounds = [1, 0]

    title_pooling = pooling[0]
    snippet_pooling = pooling[1]

    log = f"Weights : {weights};\n\nPooling : {pooling}\n\n"

    # Process each result
    for result in results :
        progress += 1
        bar.update(progress)

        title = result['title']
        url = result['url']
        snippet = result['content']

        if title == subject :
            found_original = True

        if url in scored_results_urls :
            continue
        scored_results_urls.append(url)

        # Compute similarity between subject and result
        title_score, snippet_score = 1, 1
        if weights[0] != 0 :
            title_score = compute_similarity(subject_model_output, title, title_pooling)
        if weights[1] != 0 :
            snippet_score = compute_similarity(subject_model_output, snippet, snippet_pooling)

        if title_score < title_score_bounds[0] :
            title_score_bounds[0] = title_score
        if title_score > title_score_bounds[1] :
            title_score_bounds[1] = title_score

        if snippet_score < snippet_score_bounds[0] :
            snippet_score_bounds[0] = snippet_score
        if snippet_score > snippet_score_bounds[1] :
            snippet_score_bounds[1] = snippet_score

        # Store the result with its similarity score
        scored_results.append({
            'title': title,
            'url': url,
            'snippet': snippet,
            'title-score': title_score,
            'snippet-score': snippet_score
        })

    log += f"Score bounds : T{title_score_bounds} # S{snippet_score_bounds}\n\n"

    print("\n\n* Scored results\n")

    normalized_results = []
    for result in scored_results:
        title_score, snippet_score = 1, 1
        if weights[0] != 0 :
            title_score = (result['title-score'] - title_score_bounds[0]) / (title_score_bounds[1] - title_score_bounds[0])
        if weights[1] != 0 :
            snippet_score = (result['snippet-score'] - snippet_score_bounds[0]) / (snippet_score_bounds[1] - snippet_score_bounds[0])

        score = math.pow(math.pow(title_score, weights[0]) * math.pow(snippet_score, weights[1]), 1 / (weights[0] + weights[1]))
        normalized_results.append({
            'title': result['title'],
            'url': result['url'],
            'snippet': result['snippet'],
            'score': score,
        })

    return normalized_results, log

def sort_results(subject, results, weights, pooling):
    print("\n### Starting result processing (",len(results),") ###\n")

    log = "\n---\n\n## Scoring\n\n"

    scored_results, score_log = score_results(subject, results, weights, pooling)
    log += score_log

    # Sort the results by similarity (highest first)
    sorted_results = sorted(scored_results, key=lambda x: x['score'], reverse=True)

    return sorted_results, log
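
The final score computed in score_results is a weighted geometric mean of the min-max-normalized title and snippet similarities. A small sketch restating that formula (combined_score is a hypothetical helper, named here for illustration), with a sanity check that a [1, 0] weighting reduces to the title score alone, since the unused component is pinned to 1 and drops out:

import math

def combined_score(title_score, snippet_score, weights):
    # Weighted geometric mean, as in score_results above
    return math.pow(
        math.pow(title_score, weights[0]) * math.pow(snippet_score, weights[1]),
        1 / (weights[0] + weights[1]),
    )

assert abs(combined_score(0.8, 1.0, [1, 0]) - 0.8) < 1e-9
assert abs(combined_score(0.9, 0.4, [1, 1]) - math.sqrt(0.9 * 0.4)) < 1e-9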

33
src/key.py Normal file
@@ -0,0 +1,33 @@
from keybert import KeyBERT
from itertools import combinations

def create_queries(subject) :
    print("\n### Getting Keywords ###\n")

    kw_model = KeyBERT()
    print("* Got Keybert")

    keywords = kw_model.extract_keywords(subject, keyphrase_ngram_range=(1, 2), stop_words='english', use_mmr=True, diversity=0.7)
    print("* keywords extracted")

    sorted_keywords = sorted(keywords, key=lambda x: -x[1])
    text_keywords = [x[0] for x in sorted_keywords]

    log = f"## Keywords\n\n{text_keywords}\n\n"

    queries = []
    for r in range(1, len(text_keywords) + 1):
        comb = combinations(text_keywords, r)
        queries.extend(comb)

    final_queries = [subject] + ["\"" + "\" OR \"".join(query) + "\"" for query in queries]
    #final_queries.ins(subject)

    print("* query generated")

    return final_queries, log
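
Because create_queries builds every non-empty subset of the keyword list, the number of OR-queries grows as 2^n - 1: with KeyBERT's default of five keywords that is 31 queries, plus the raw subject, which scrub_web then iterates over. An illustrative check, with made-up keywords:

from itertools import combinations

keywords = ["wood", "natural frequencies", "elastic properties", "sensitivity analysis", "identification"]
queries = []
for r in range(1, len(keywords) + 1):
    queries.extend(combinations(keywords, r))
print(len(queries))  # 2**5 - 1 = 31 OR-queries, plus the raw subject = 32 requests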

44
src/scrub.py Normal file
@@ -0,0 +1,44 @@
import requests
import progressbar

searxng_url = "https://search.penwing.org/search"

def scrub_web(queries) :
    print("\n### Fetching Web data ###\n")

    web_bar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), progressbar.Bar()],
                                      maxval=len(queries)).start()
    progress = 0

    results = []
    log = "## Queries\n\n"

    for query in queries :
        params = {
            "q": query,  # Your search query
            "format": "json",  # Requesting JSON format
            "categories": "science",  # You can specify categories (optional)
        }

        response = requests.get(searxng_url, params=params)

        if response.status_code == 200:
            data = response.json()

            # List to store results with similarity scores
            scored_results = []
            results.extend(data.get("results", []))
            log += f"{query};\n"
        else:
            print(f"Error: {response.status_code}")

        progress += 1
        web_bar.update(progress)

    print("")

    return results, log
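
Note that scrub_web simply concatenates the result lists of all queries, so the same paper can appear several times; deduplication by URL happens later in score_results. A minimal standalone request against the same instance (assuming it has the JSON output format enabled) would look like:

import requests

response = requests.get("https://search.penwing.org/search", params={
    "q": "\"wood\" OR \"natural frequencies\"",  # one generated OR-query
    "format": "json",
    "categories": "science",
})
if response.status_code == 200:
    for hit in response.json().get("results", [])[:3]:
        print(hit["title"], hit["url"])  # each hit also carries a 'content' snippet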

25
src/test.py Normal file
@@ -0,0 +1,25 @@
import requests

subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"

searxng_url = "https://search.penwing.org/search"
params = {
    "q": subject,  # Your search query
    "format": "json",  # Requesting JSON format
    "categories": "science",  # You can specify categories (optional)
}

response = requests.get(searxng_url, params=params)

if response.status_code == 200:
    data = response.json()

    # List to store results with similarity scores
    scored_results = []

    for result in data.get("results", []):
        print(result['title'])
        print("---")
else:
    print(f"Error: {response.status_code}")

19
subjects Normal file
@@ -0,0 +1,19 @@
# Me
subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
query = "composite viscoelastic damping"

# Anne
subject = "State of the art on the identification of wood structure natural frequencies. Influence of the mechanical properties and interest in sensitivity analysis as prospects for reverse identification method of wood elastic properties."
query = "wood frequency analysis mechanical properties"

# Axel
subject = "Characterization of SiC MOSFET using double pulse test method."
query = "SiC MOSFET double pulse test"

# Paul
subject = "Thermo-Mechanical Impact of temperature oscillations on bonding and metallization for SiC MOSFETs soldered on ceramic substrate"
query = "thermo mechanical model discrete bonding SiC MOSFET"

# Jam
subject = "tig welding of inconel 625 and influences on micro structures"
query = "tig welding inconel 625"