scrub+evaluate

This commit is contained in:
parent 9fd246cb01
commit 5a815643a7
Pipfile (3 additions)

@@ -6,6 +6,9 @@ name = "pypi"
 [packages]
 transformers = "*"
 torch = "*"
+requests = "*"
+keybert = "*"
+progressbar = "*"

 [dev-packages]
evaluate.py (new file, 36 lines)

@@ -0,0 +1,36 @@
+import warnings
+from transformers import AutoTokenizer, AutoModel
+import torch
+import torch.nn.functional as F
+
+# Suppress FutureWarnings and other warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+# Load the tokenizer and the model
+tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
+model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
+
+# Function to compute sentence embeddings by pooling token embeddings (CLS token)
+def get_sentence_embedding(text):
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    # Pooling strategy: use the hidden state of the [CLS] token as the sentence embedding
+    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
+    return cls_embedding
+
+# Example subject and abstract
+subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
+abstract = """
+The research work presented in this paper aims to optimise the dynamic response of a carbon-epoxy plate by including into the laminate one frequency-dependent interleaved viscoelastic layer. To keep an acceptable bending stiffness, some holes are created in the viscoelastic layer, thus facilitating the resin through-layer penetration during the co-curing manufacturing process. Plates including (or not) one perforated (or non-perforated) viscoelastic layer are manufactured and investigated experimentally and numerically. First, static and dynamic tests are performed on sandwich coupons to characterise the stiffness and damping properties of the plates in a given frequency range. Resulting mechanical properties are then used to set up a finite element model and simulate the plate dynamic response. In parallel, frequency response measurements are carried out on the manufactured plates, then successfully confronted to the numerical results. Finally, a design of experiments is built based on a limited number of numerical simulations to find the configuration of bridges that maximises the damping while keeping a stiffness higher than half the stiffness of the equivalent undamped plate."""
+
+# Get embeddings
+subject_embedding = get_sentence_embedding(subject)
+abstract_embedding = get_sentence_embedding(abstract)
+
+# Measure semantic similarity using cosine similarity
+
+# Compute cosine similarity between subject and abstract embeddings
+similarity = F.cosine_similarity(subject_embedding, abstract_embedding)
+print(f"Cosine Similarity: {similarity.item():.4f}")
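An aside on the pooling choice in evaluate.py: SciBERT was pretrained without a sentence-level objective, so the [CLS] vector is not guaranteed to be the best sentence summary; mean pooling over the non-padding tokens is the usual alternative to compare against. A minimal sketch, assuming the same tokenizer and model as above (the helper name get_mean_embedding is ours, not part of this commit):

import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')

def get_mean_embedding(text):
    # Hypothetical alternative to get_sentence_embedding: average token embeddings
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    mask = inputs['attention_mask'].unsqueeze(-1)           # (batch, seq_len, 1)
    summed = (outputs.last_hidden_state * mask).sum(dim=1)  # (batch, hidden_size)
    return summed / mask.sum(dim=1).clamp(min=1)            # avoid division by zero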
key.py (new file, 25 lines)

@@ -0,0 +1,25 @@
+from keybert import KeyBERT
+from transformers import AutoTokenizer, AutoModel
+
+# Load the SciBERT model and tokenizer
+tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
+
+print("* Tokenizer")
+
+model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
+
+print("* Scibert model")
+
+# Define a KeyBERT model using SciBERT embeddings
+kw_model = KeyBERT(model=model)
+
+print("* Keybert model")
+# Define the subject from which to extract keywords
+subject = "tig welding of inconel 625 and influences on micro structures"
+
+# Extract keywords from the subject
+keywords = kw_model.extract_keywords(subject, keyphrase_ngram_range=(1, 2), stop_words='english', use_maxsum=True)
+
+# Print extracted keywords
+for keyword, score in keywords:
+    print(f"Keyword: {keyword}, Score: {score:.4f}")
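key.py selects candidates with Max Sum similarity (use_maxsum=True). KeyBERT also supports Maximal Marginal Relevance, which penalises redundancy among the returned phrases; swapping one flag is enough to compare the two strategies. A sketch under the same model setup as key.py (the diversity value 0.5 and top_n=5 are arbitrary choices of ours, not from the commit):

from keybert import KeyBERT
from transformers import AutoModel

model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
kw_model = KeyBERT(model=model)

subject = "tig welding of inconel 625 and influences on micro structures"

# MMR trades relevance against diversity among the selected keyphrases
keywords = kw_model.extract_keywords(
    subject,
    keyphrase_ngram_range=(1, 2),
    stop_words='english',
    use_mmr=True,
    diversity=0.5,  # 0 = pure relevance, 1 = maximum diversity
    top_n=5,
)
for keyword, score in keywords:
    print(f"Keyword: {keyword}, Score: {score:.4f}")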
main.py (deleted, 38 lines)

@@ -1,38 +0,0 @@
-import warnings
-from transformers import AutoTokenizer, AutoModel
-import torch
-import torch.nn.functional as F
-
-# Suppress FutureWarnings and other warnings
-warnings.simplefilter(action='ignore', category=FutureWarning)
-
-# Load the tokenizer and the model
-tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
-model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
-
-# Function to compute sentence embeddings by pooling token embeddings (CLS token)
-def get_sentence_embedding(text):
-    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
-    with torch.no_grad():
-        outputs = model(**inputs)
-
-    # Pooling strategy: Use the hidden state of the [CLS] token as the sentence embedding
-    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
-    return cls_embedding
-
-# Example subject and abstract
-subject = "Artificial Intelligence in Healthcare"
-abstract = """
-Artificial intelligence (AI) is transforming healthcare with its ability to analyze complex medical data and assist in diagnosis.
-AI models, especially in medical imaging, have shown promise in detecting diseases like cancer and predicting patient outcomes.
-"""
-
-# Get embeddings
-subject_embedding = get_sentence_embedding(subject)
-abstract_embedding = get_sentence_embedding(abstract)
-
-# 2. **Measure Semantic Similarity Using Cosine Similarity**
-
-# Compute cosine similarity between subject and abstract embeddings
-similarity = F.cosine_similarity(subject_embedding, abstract_embedding)
-print(f"Cosine Similarity: {similarity.item():.4f}")
scrub-evaluate.py (new file, 131 lines)

@@ -0,0 +1,131 @@
+import warnings
+from transformers import AutoTokenizer, AutoModel
+import torch
+import torch.nn.functional as F
+import requests
+import progressbar
+
+
+# Me
+#subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
+#query = "composite viscoelastic damping"
+
+# Anne
+#subject = "State of the art on the identification of wood structure natural frequencies. Influence of the mechanical properties and interest in sensitivity analysis as prospects for reverse identification method of wood elastic properties."
+#query = "wood frequency analysis mechanical properties"
+
+# Axel
+#subject = "Characterization of SiC MOSFET using double pulse test method."
+#query = "SiC MOSFET double pulse test"
+
+# Paul
+#subject = "Thermo-Mechanical Impact of temperature oscillations on bonding and metallization for SiC MOSFETs soldered on ceramic substrate"
+#query = "thermo mechanical model discrete bonding SiC MOSFET"
+
+# Jam
+subject = "tig welding of inconel 625 and influences on micro structures"
+query = "tig welding inconel 625"
+
+widgets = [' [',
+           progressbar.Timer(format='elapsed time: %(elapsed)s'),
+           '] ',
+           progressbar.Bar('*'), ' (',
+           progressbar.ETA(), ') ',
+           ]
+
+# Suppress FutureWarnings and other warnings
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+print("\n### Fetching Data ###\n")
+
+# Load the tokenizer and the model
+tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
+
+print("* Got tokenizer")
+
+model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
+
+print("* Got model")
+
+# Function to compute sentence embeddings by pooling token embeddings (CLS token)
+def get_sentence_embedding(text):
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    # Pooling strategy: use the hidden state of the [CLS] token as the sentence embedding
+    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
+    return cls_embedding
+
+# Function to compute cosine similarity
+def compute_similarity(embedding1, embedding2):
+    similarity = F.cosine_similarity(embedding1, embedding2)
+    return similarity.item()
+
+# Define the SearxNG instance URL and search query
+searxng_url = "https://search.penwing.org/search"  # Replace with your instance URL
+params = {
+    "q": query,  # Your search query
+    "format": "json",  # Requesting JSON format
+    "categories": "science",  # You can specify categories (optional)
+}
+
+# Send the request to SearxNG API
+response = requests.get(searxng_url, params=params)
+
+# Check if the request was successful
+if response.status_code == 200:
+    print("* Got search results")
+    # Parse the JSON response
+    data = response.json()
+
+    subject_embedding = get_sentence_embedding(subject)
+
+    print("* Tokenized subject")
+
+    print("\n### Starting result processing ###\n")
+    # List to store results with similarity scores
+    scored_results = []
+
+    results = data.get("results", [])
+    progress = 0
+
+    bar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), progressbar.Bar()],
+                                  maxval=len(results)).start()
+
+    # Process each result
+    for result in results:
+        title = result['title']
+        url = result['url']
+        snippet = result['content']
+
+        # Get embedding for the snippet (abstract)
+        snippet_embedding = get_sentence_embedding(snippet)
+
+        # Compute similarity between subject and snippet
+        similarity = compute_similarity(subject_embedding, snippet_embedding)
+
+        # Store the result with its similarity score
+        scored_results.append({
+            'title': title,
+            'url': url,
+            'snippet': snippet,
+            'similarity': similarity
+        })
+
+        progress += 1
+        bar.update(progress)
+
+    # Sort the results by similarity (highest first)
+    top_results = sorted(scored_results, key=lambda x: x['similarity'], reverse=True)[:10]
+
+    print("\n### Done ###\n")
+    # Print the top 10 results
+    for idx, result in enumerate(top_results, 1):
+        print(f"Rank {idx} ({result['similarity']:.4f}):")
+        print(f"Title: {result['title']}")
+        print(f"URL: {result['url']}")
+        print(f"Snippet: {result['snippet']}")
+        print("-" * 40)
+else:
+    print(f"Error: {response.status_code}")
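scrub-evaluate.py runs one forward pass per snippet. Since the tokenizer already supports padding, all snippets could instead go through the model as a single batch, which is usually much faster even on CPU; F.cosine_similarity then broadcasts the single subject embedding against the whole batch. A sketch of the idea, reusing tokenizer, model, subject_embedding and a non-empty results list from the script above:

import torch
import torch.nn.functional as F

# One batched forward pass instead of a Python loop of single passes
snippets = [result['content'] for result in results]
inputs = tokenizer(snippets, return_tensors="pt", padding=True, truncation=True, max_length=512)
with torch.no_grad():
    outputs = model(**inputs)
snippet_embeddings = outputs.last_hidden_state[:, 0, :]  # (num_snippets, hidden_size)

# (1, hidden_size) against (num_snippets, hidden_size) broadcasts to (num_snippets,)
similarities = F.cosine_similarity(subject_embedding, snippet_embeddings)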
scrub.py (new file, 25 lines)

@@ -0,0 +1,25 @@
+import requests
+
+# Define the SearxNG instance URL and search query
+searxng_url = "https://search.penwing.org/search"  # Replace with your instance URL
+params = {
+    "q": "zig zag theories",  # Your search query
+    "format": "json",  # Requesting JSON format
+    "categories": "science",  # You can specify categories (optional)
+}
+
+# Send the request to SearxNG API
+response = requests.get(searxng_url, params=params)
+
+# Check if the request was successful
+if response.status_code == 200:
+    # Parse the JSON response
+    data = response.json()
+    # Print or process the results
+    for result in data.get("results", []):
+        print(f"Title: {result['title']}")
+        print(f"URL: {result['url']}")
+        print(f"Snippet: {result['content']}")
+        print("-" * 40)
+else:
+    print(f"Error: {response.status_code}")
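One operational caveat with both scrub scripts: a SearxNG instance only answers format=json when the json format is enabled under search.formats in its settings.yml; otherwise the request is typically rejected (a 403 in our experience, though that may vary by version). A bare requests.get can also hang on an unresponsive instance. A hedged sketch of a more defensive call, using the same URL and params as scrub.py:

import requests

searxng_url = "https://search.penwing.org/search"
params = {
    "q": "zig zag theories",
    "format": "json",
    "categories": "science",
}

try:
    # A timeout keeps the script from hanging on a dead or slow instance
    response = requests.get(searxng_url, params=params, timeout=10)
    response.raise_for_status()  # raises on 4xx/5xx instead of silently continuing
except requests.RequestException as exc:
    print(f"Request failed: {exc}")
else:
    for result in response.json().get("results", []):
        print(f"Title: {result['title']}")
        print(f"URL: {result['url']}")
        print("-" * 40)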