From 5a815643a7854ad4ef52e9ab989e1d63ede1cf9c Mon Sep 17 00:00:00 2001
From: WanderingPenwing
Date: Fri, 27 Sep 2024 10:49:44 +0200
Subject: [PATCH] scrub+evaluate

Add a SearxNG scraper (scrub.py), a SciBERT-based relevance scorer
(evaluate.py), KeyBERT keyword extraction (key.py), and a combined
search-and-rank script (scrub-evaluate.py); retire the main.py
prototype and declare the new dependencies in the Pipfile.
---
 Pipfile           |   3 ++
 evaluate.py       |  36 +++++++++++++
 key.py            |  25 +++++++++
 main.py           |  38 --------------
 scrub-evaluate.py | 131 ++++++++++++++++++++++++++++++++++++++++++++++
 scrub.py          |  25 +++++++++
 6 files changed, 220 insertions(+), 38 deletions(-)
 create mode 100644 evaluate.py
 create mode 100644 key.py
 delete mode 100644 main.py
 create mode 100644 scrub-evaluate.py
 create mode 100644 scrub.py

diff --git a/Pipfile b/Pipfile
index 87d345c..edb203f 100644
--- a/Pipfile
+++ b/Pipfile
@@ -6,6 +6,9 @@ name = "pypi"
 [packages]
 transformers = "*"
 torch = "*"
+requests = "*"
+keybert = "*"
+progressbar = "*"
 
 [dev-packages]
 
diff --git a/evaluate.py b/evaluate.py
new file mode 100644
index 0000000..d6c87bb
--- /dev/null
+++ b/evaluate.py
@@ -0,0 +1,36 @@
+import warnings
+from transformers import AutoTokenizer, AutoModel
+import torch
+import torch.nn.functional as F
+
+# Suppress FutureWarnings and other warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+# Load the tokenizer and the model
+tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
+model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
+
+# Function to compute a sentence embedding by pooling token embeddings (CLS token)
+def get_sentence_embedding(text):
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    # Pooling strategy: use the hidden state of the [CLS] token as the sentence embedding
+    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
+    return cls_embedding
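+
+# (Aside, a hedged sketch rather than part of this script: for encoders like
+# SciBERT that have no sentence-level training objective, mean pooling over the
+# attention mask is a common alternative to the bare [CLS] vector, e.g.
+#   mask = inputs['attention_mask'].unsqueeze(-1).float()
+#   mean_embedding = (outputs.last_hidden_state * mask).sum(1) / mask.sum(1)
+# Either way the result is a (batch_size, hidden_size) tensor.)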
+
+# Example subject and abstract
+subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
+abstract = """
+The research work presented in this paper aims to optimise the dynamic response of a carbon-epoxy plate by including into the laminate one frequency-dependent interleaved viscoelastic layer. To keep an acceptable bending stiffness, some holes are created in the viscoelastic layer, thus facilitating the resin through layer penetration during the co-curing manufacturing process.
+Plates including (or not) one perforated (or non-perforated) viscoelastic layer are manufactured and investigated experimentally and numerically. First, static and dynamic tests are performed on sandwich coupons to characterise the stiffness and damping properties of the plates in a given frequency range.
+Resulting mechanical properties are then used to set-up a finite element model and simulate the plate dynamic response. In parallel, frequency response measurements are carried out on the manufactured plates, then successfully confronted to the numerical results.
+Finally, a design of experiments is built based on a limited number of numerical simulations to find the configuration of bridges that maximises the damping while keeping a stiffness higher than half the stiffness of the equivalent undamped plate."""
+
+# Get embeddings
+subject_embedding = get_sentence_embedding(subject)
+abstract_embedding = get_sentence_embedding(abstract)
+
+# Measure semantic similarity: compute cosine similarity between subject and abstract embeddings
+similarity = F.cosine_similarity(subject_embedding, abstract_embedding)
+print(f"Cosine Similarity: {similarity.item():.4f}")
diff --git a/key.py b/key.py
new file mode 100644
index 0000000..3cb12a5
--- /dev/null
+++ b/key.py
@@ -0,0 +1,25 @@
+from keybert import KeyBERT
+from transformers import pipeline
+
+# Load SciBERT through a feature-extraction pipeline: KeyBERT accepts a
+# transformers pipeline as an embedding backend, whereas a bare AutoModel
+# is not among its supported backends.
+scibert = pipeline("feature-extraction", model='allenai/scibert_scivocab_uncased')
+
+print("* Scibert pipeline")
+
+# Define a KeyBERT model using SciBERT embeddings
+kw_model = KeyBERT(model=scibert)
+
+print("* Keybert model")
+
+# Define the subject from which to extract keywords
+subject = "tig welding of inconel 625 and influences on micro structures"
+
+# Extract 1-2 word keyphrases, diversified with Max Sum Distance
+keywords = kw_model.extract_keywords(subject, keyphrase_ngram_range=(1, 2), stop_words='english', use_maxsum=True)
+
+# Print the extracted keywords
+for keyword, score in keywords:
+    print(f"Keyword: {keyword}, Score: {score:.4f}")
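+
+# Note: extract_keywords also exposes top_n (default 5) and nr_candidates
+# (default 20); with use_maxsum=True it picks the top_n least-redundant
+# keyphrases out of the nr_candidates most similar candidates.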
diff --git a/main.py b/main.py
deleted file mode 100644
index 738636d..0000000
--- a/main.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import warnings
-from transformers import AutoTokenizer, AutoModel
-import torch
-import torch.nn.functional as F
-
-# Suppress FutureWarnings and other warnings
-warnings.simplefilter(action='ignore', category=FutureWarning)
-
-# Load the tokenizer and the model
-tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
-model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
-
-# Function to compute sentence embeddings by pooling token embeddings (CLS token)
-def get_sentence_embedding(text):
-    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
-    with torch.no_grad():
-        outputs = model(**inputs)
-
-    # Pooling strategy: Use the hidden state of the [CLS] token as the sentence embedding
-    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
-    return cls_embedding
-
-# Example subject and abstract
-subject = "Artificial Intelligence in Healthcare"
-abstract = """
-Artificial intelligence (AI) is transforming healthcare with its ability to analyze complex medical data and assist in diagnosis.
-AI models, especially in medical imaging, have shown promise in detecting diseases like cancer and predicting patient outcomes.
-"""
-
-# Get embeddings
-subject_embedding = get_sentence_embedding(subject)
-abstract_embedding = get_sentence_embedding(abstract)
-
-# 2. Measure Semantic Similarity Using Cosine Similarity
-
-# Compute cosine similarity between subject and abstract embeddings
-similarity = F.cosine_similarity(subject_embedding, abstract_embedding)
-print(f"Cosine Similarity: {similarity.item():.4f}")
diff --git a/scrub-evaluate.py b/scrub-evaluate.py
new file mode 100644
index 0000000..d21884c
--- /dev/null
+++ b/scrub-evaluate.py
@@ -0,0 +1,131 @@
+import warnings
+from transformers import AutoTokenizer, AutoModel
+import torch
+import torch.nn.functional as F
+import requests
+import progressbar
+
+
+# Me
+#subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
+#query = "composite viscoelastic damping"
+
+# Anne
+#subject = "State of the art on the identification of wood structure natural frequencies. Influence of the mechanical properties and interest in sensitivity analysis as prospects for reverse identification method of wood elastic properties."
+#query = "wood frequency analysis mechanical properties"
+
+# Axel
+#subject = "Characterization of SiC MOSFET using double pulse test method."
+#query = "SiC MOSFET double pulse test"
+
+# Paul
+#subject = "Thermo-Mechanical Impact of temperature oscillations on bonding and metallization for SiC MOSFETs soldered on ceramic substrate"
+#query = "thermo mechanical model discrete bonding SiC MOSFET"
+
+# Jam
+subject = "tig welding of inconel 625 and influences on micro structures"
+query = "tig welding inconel 625"
+
+widgets = [' [',
+           progressbar.Timer(format='elapsed time: %(elapsed)s'),
+           '] ',
+           progressbar.Bar('*'), ' (',
+           progressbar.ETA(), ') ',
+           ]
+
+# Suppress FutureWarnings and other warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+print("\n### Fetching Data ###\n")
+
+# Load the tokenizer and the model
+tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
+
+print("* Got tokenizer")
+
+model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
+
+print("* Got model")
+
+# Function to compute sentence embeddings by pooling token embeddings (CLS token)
+def get_sentence_embedding(text):
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    # Pooling strategy: use the hidden state of the [CLS] token as the sentence embedding
+    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
+    return cls_embedding
+
+# Function to compute cosine similarity
+def compute_similarity(embedding1, embedding2):
+    similarity = F.cosine_similarity(embedding1, embedding2)
+    return similarity.item()
+
+# Define the SearxNG instance URL and search query
+searxng_url = "https://search.penwing.org/search"  # Replace with your instance URL
+params = {
+    "q": query,  # Your search query
+    "format": "json",  # Requesting JSON format
+    "categories": "science",  # You can specify categories (optional)
+}
+
+# Send the request to the SearxNG API
+response = requests.get(searxng_url, params=params)
+
+# Check if the request was successful
+if response.status_code == 200:
+    print("* Got search results")
+    # Parse the JSON response
+    data = response.json()
+
+    subject_embedding = get_sentence_embedding(subject)
+
+    print("* Embedded subject")
+
+    print("\n### Starting result processing ###\n")
+    # List to store results with similarity scores
+    scored_results = []
+
+    results = data.get("results", [])
+    progress = 0
+
+    bar = progressbar.ProgressBar(widgets=widgets, maxval=len(results)).start()
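+
+    # (Aside, a hedged sketch: the tokenizer also accepts a list of strings,
+    # so the loop below could embed every snippet in one padded batch, e.g.
+    #   snippets = [r['content'] for r in results]
+    #   inputs = tokenizer(snippets, return_tensors="pt", padding=True,
+    #                      truncation=True, max_length=512)
+    # at the cost of holding all snippets in memory at once.)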
+
+    # Process each result
+    for result in results:
+        title = result['title']
+        url = result['url']
+        snippet = result['content']
+
+        # Get the embedding for the snippet (abstract)
+        snippet_embedding = get_sentence_embedding(snippet)
+
+        # Compute similarity between subject and snippet
+        similarity = compute_similarity(subject_embedding, snippet_embedding)
+
+        # Store the result with its similarity score
+        scored_results.append({
+            'title': title,
+            'url': url,
+            'snippet': snippet,
+            'similarity': similarity
+        })
+
+        progress += 1
+        bar.update(progress)
+
+    bar.finish()
+
+    # Sort the results by similarity (highest first) and keep the top 10
+    top_results = sorted(scored_results, key=lambda x: x['similarity'], reverse=True)[:10]
+
+    print("\n### Done ###\n")
+    # Print the top 10 results
+    for idx, result in enumerate(top_results, 1):
+        print(f"Rank {idx} ({result['similarity']:.4f}):")
+        print(f"Title: {result['title']}")
+        print(f"URL: {result['url']}")
+        print(f"Snippet: {result['snippet']}")
+        print("-" * 40)
+else:
+    print(f"Error: {response.status_code}")
diff --git a/scrub.py b/scrub.py
new file mode 100644
index 0000000..95c96b4
--- /dev/null
+++ b/scrub.py
@@ -0,0 +1,25 @@
+import requests
+
+# Define the SearxNG instance URL and search query
+searxng_url = "https://search.penwing.org/search"  # Replace with your instance URL
+params = {
+    "q": "zig zag theories",  # Your search query
+    "format": "json",  # Requesting JSON format
+    "categories": "science",  # You can specify categories (optional)
+}
+
+# Send the request to the SearxNG API
+response = requests.get(searxng_url, params=params)
+
+# Check if the request was successful
+if response.status_code == 200:
+    # Parse the JSON response
+    data = response.json()
+    # Print or process the results
+    for result in data.get("results", []):
+        print(f"Title: {result['title']}")
+        print(f"URL: {result['url']}")
+        print(f"Snippet: {result['content']}")
+        print("-" * 40)
+else:
+    print(f"Error: {response.status_code}")
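+
+# Note: the target instance must have the json format enabled (search.formats
+# in the SearxNG settings); instances without it typically reject the request,
+# in which case the else branch above prints the returned status code.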