scrub+evaluate

WanderingPenwing 2024-09-27 10:49:44 +02:00
parent 9fd246cb01
commit 5a815643a7
6 changed files with 220 additions and 38 deletions

Pipfile (3 additions)

@@ -6,6 +6,9 @@ name = "pypi"
 [packages]
 transformers = "*"
 torch = "*"
+requests = "*"
+keybert = "*"
+progressbar = "*"
 [dev-packages]

evaluate.py (new file, 36 additions)

@@ -0,0 +1,36 @@
import warnings
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
# Suppress FutureWarnings and other warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
# Function to compute sentence embeddings by pooling token embeddings (CLS token)
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Pooling strategy: use the hidden state of the [CLS] token as the sentence embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
    return cls_embedding
# Example subject and abstract
subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
abstract = """
The research work presented in this paper aims to optimise the dynamic response of a carbon-epoxy plate by including into the laminate one frequency-dependent interleaved viscoelastic layer. To keep an acceptable bending stiffness, some holes are created in the viscoelastic layer, thus facilitating the resin through layer penetration during the co-curing manufacturing process. Plates including (or not) one perforated (or non-perforated) viscoelastic layer are manufactured and investigated experimentally and numerically. First, static and dynamic tests are performed on sandwich coupons to characterise the stiffness and damping properties of the plates in a given frequency range. Resulting mechanical properties are then used to set-up a finite element model and simulate the plate dynamic response. In parallel, frequency response measurements are carried out on the manufactured plates, then successfully confronted to the numerical results. Finally, a design of experiments is built based on a limited number of numerical simulations to find the configuration of bridges that maximises the damping while keeping a stiffness higher than half the stiffness of the equivalent undamped plate."""
# Get embeddings
subject_embedding = get_sentence_embedding(subject)
abstract_embedding = get_sentence_embedding(abstract)
# Measure semantic similarity: compute cosine similarity between subject and abstract embeddings
similarity = F.cosine_similarity(subject_embedding, abstract_embedding)
print(f"Cosine Similarity: {similarity.item():.4f}")

key.py (new file, 25 additions)

@@ -0,0 +1,25 @@
from keybert import KeyBERT
from transformers import AutoTokenizer, AutoModel
# Load the SciBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Tokenizer")
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Scibert model")
# Define a KeyBERT model using SciBERT embeddings
kw_model = KeyBERT(model=model)
print("* Keybert model")
# Define the subject from which to extract keywords
subject = "tig welding of inconel 625 and influences on micro structures"
# Extract keywords from the subject
keywords = kw_model.extract_keywords(subject, keyphrase_ngram_range=(1, 2), stop_words='english', use_maxsum=True)
# Print extracted keywords
for keyword, score in keywords:
    print(f"Keyword: {keyword}, Score: {score:.4f}")

main.py (deleted, 38 deletions)

@@ -1,38 +0,0 @@
import warnings
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
# Suppress FutureWarnings and other warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
# Function to compute sentence embeddings by pooling token embeddings (CLS token)
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Pooling strategy: use the hidden state of the [CLS] token as the sentence embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
    return cls_embedding
# Example subject and abstract
subject = "Artificial Intelligence in Healthcare"
abstract = """
Artificial intelligence (AI) is transforming healthcare with its ability to analyze complex medical data and assist in diagnosis.
AI models, especially in medical imaging, have shown promise in detecting diseases like cancer and predicting patient outcomes.
"""
# Get embeddings
subject_embedding = get_sentence_embedding(subject)
abstract_embedding = get_sentence_embedding(abstract)
# Measure semantic similarity: compute cosine similarity between subject and abstract embeddings
similarity = F.cosine_similarity(subject_embedding, abstract_embedding)
print(f"Cosine Similarity: {similarity.item():.4f}")

scrub-evaluate.py (new file, 131 additions)

@@ -0,0 +1,131 @@
import warnings
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import requests
import progressbar
# Me
#subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
#query = "composite viscoelastic damping"
# Anne
#subject = "State of the art on the identification of wood structure natural frequencies. Influence of the mechanical properties and interest in sensitivity analysis as prospects for reverse identification method of wood elastic properties."
#query = "wood frequency analysis mechanical properties"
# Axel
#subject = "Characterization of SiC MOSFET using double pulse test method."
#query = "SiC MOSFET double pulse test"
# Paul
#subject = "Thermo-Mechanical Impact of temperature oscillations on bonding and metallization for SiC MOSFETs soldered on ceramic substrate"
#query = "thermo mechanical model discrete bonding SiC MOSFET"
# Jam
subject = "tig welding of inconel 625 and influences on micro structures"
query = "tig welding inconel 625"
# Progress bar widgets (used when processing results below)
widgets = [
    ' [', progressbar.Timer(format='elapsed time: %(elapsed)s'), '] ',
    progressbar.Bar('*'),
    ' (', progressbar.ETA(), ') ',
]
# Suppress FutureWarnings and other warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
print("\n### Fetching Data ###\n")
# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Got tokenizer")
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Got model")
# Function to compute sentence embeddings by pooling token embeddings (CLS token)
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Pooling strategy: use the hidden state of the [CLS] token as the sentence embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
    return cls_embedding
# Function to compute cosine similarity
def compute_similarity(embedding1, embedding2):
    similarity = F.cosine_similarity(embedding1, embedding2)
    return similarity.item()
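# Hypothetical helper (sketch, assuming the same tokenizer/model as above):
# embedding snippets one by one is slow, while the tokenizer can pad a whole
# list of texts and the model can embed them in a single forward pass.
def get_batch_embeddings(texts):
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)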
# Define the SearxNG instance URL and search query
searxng_url = "https://search.penwing.org/search" # Replace with your instance URL
params = {
    "q": query,               # Your search query
    "format": "json",         # Requesting JSON format
    "categories": "science",  # You can specify categories (optional)
}
# Send the request to SearxNG API
response = requests.get(searxng_url, params=params)
# Check if the request was successful
if response.status_code == 200:
    print("* Got search results")
    # Parse the JSON response
    data = response.json()
    subject_embedding = get_sentence_embedding(subject)
    print("* Embedded subject")

    print("\n### Starting result processing ###\n")

    # List to store results with similarity scores
    scored_results = []
    results = data.get("results", [])
    progress = 0
    bar = progressbar.ProgressBar(widgets=widgets, maxval=len(results)).start()

    # Process each result
    for result in results:
        title = result['title']
        url = result['url']
        snippet = result['content']

        # Get embedding for the snippet (abstract)
        snippet_embedding = get_sentence_embedding(snippet)

        # Compute similarity between subject and snippet
        similarity = compute_similarity(subject_embedding, snippet_embedding)

        # Store the result with its similarity score
        scored_results.append({
            'title': title,
            'url': url,
            'snippet': snippet,
            'similarity': similarity
        })

        progress += 1
        bar.update(progress)
    bar.finish()

    # Sort the results by similarity (highest first) and keep the top 10
    top_results = sorted(scored_results, key=lambda x: x['similarity'], reverse=True)[:10]

    print("\n### Done ###\n")

    # Print the top 10 results
    for idx, result in enumerate(top_results, 1):
        print(f"Rank {idx} ({result['similarity']:.4f}):")
        print(f"Title: {result['title']}")
        print(f"URL: {result['url']}")
        print(f"Snippet: {result['snippet']}")
        print("-" * 40)
else:
    print(f"Error: {response.status_code}")

scrub.py (new file, 25 additions)

@@ -0,0 +1,25 @@
import requests
# Define the SearxNG instance URL and search query
searxng_url = "https://search.penwing.org/search" # Replace with your instance URL
params = {
    "q": "zig zag theories",  # Your search query
    "format": "json",         # Requesting JSON format
    "categories": "science",  # You can specify categories (optional)
}
# Send the request to SearxNG API
response = requests.get(searxng_url, params=params)
# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()

    # Print or process the results
    for result in data.get("results", []):
        print(f"Title: {result['title']}")
        print(f"URL: {result['url']}")
        print(f"Snippet: {result['content']}")
        print("-" * 40)
else:
    print(f"Error: {response.status_code}")