Compare commits


10 commits

Author SHA1 Message Date
WanderingPenwing a266739b51 added readme 2024-09-27 21:44:41 +02:00
WanderingPenwing cb36ef8bd2 modified gitignore 2024-09-27 21:42:23 +02:00
WanderingPenwing 1bb5922b98 empty web_data folder 2024-09-27 21:41:37 +02:00
WanderingPenwing ebfc48fdcb empty logs folder 2024-09-27 21:40:05 +02:00
WanderingPenwing 3489ade151 better log name 2024-09-27 21:38:44 +02:00
WanderingPenwing b3f0bea0e5 different pooling techniques 2024-09-27 18:18:21 +02:00
WanderingPenwing ef5c154a1e better logs, weighted scores 2024-09-27 17:11:21 +02:00
WanderingPenwing 5a51a383ed split files 2024-09-27 14:53:16 +02:00
WanderingPenwing 7f4bc61fa8 able to find original article 2024-09-27 14:24:18 +02:00
WanderingPenwing 2539b919c2 keyword extraction 2024-09-27 12:11:59 +02:00
14 changed files with 370 additions and 218 deletions

3
.gitignore vendored
@@ -1,2 +1,3 @@
-/target
 /models
+/logs/*
+/web_data/*

Pipfile

@@ -9,6 +9,11 @@ torch = "*"
 requests = "*"
 keybert = "*"
 progressbar = "*"
+rake-nltk = "*"
+nltk = "*"
+spacy = "*"
+numpy = "*"
+gensim = "*"

 [dev-packages]

3
README.md Normal file
@@ -0,0 +1,3 @@
# Hin
A searxng/BERT mix to find science papers more efficiently, given a subject

@@ -1,36 +0,0 @@
import warnings
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Suppress FutureWarnings and other warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')

# Function to compute sentence embeddings by pooling token embeddings (CLS token)
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Pooling strategy: Use the hidden state of the [CLS] token as the sentence embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
    return cls_embedding

# Example subject and abstract
subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
abstract = """
The research work presented in this paper aims to optimise the dynamic response of a carbon-epoxy plate by including into the laminate one frequency-dependent interleaved viscoelastic layer. To keep an acceptable bending stiffness, some holes are created in the viscoelastic layer, thus facilitating the resin through layer penetration during the co-curing manufacturing process. Plates including (or not) one perforated (or non-perforated) viscoelastic layer are manufactured and investigated experimentally and numerically. First, static and dynamic tests are performed on sandwich coupons to characterise the stiffness and damping properties of the plates in a given frequency range. Resulting mechanical properties are then used to set-up a finite element model and simulate the plate dynamic response. In parallel, frequency response measurements are carried out on the manufactured plates, then successfully confronted to the numerical results. Finally, a design of experiments is built based on a limited number of numerical simulations to find the configuration of bridges that maximises the damping while keeping a stiffness higher than half the stiffness of the equivalent undamped plate."""

# Get embeddings
subject_embedding = get_sentence_embedding(subject)
abstract_embedding = get_sentence_embedding(abstract)

# 2. **Measure Semantic Similarity Using Cosine Similarity**
# Compute cosine similarity between subject and abstract embeddings
similarity = F.cosine_similarity(subject_embedding, abstract_embedding)
print(f"Cosine Similarity: {similarity.item():.4f}")

25
key.py
@@ -1,25 +0,0 @@
from keybert import KeyBERT
from transformers import AutoTokenizer, AutoModel

# Load the SciBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Tokenizer")
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Scibert model")

# Define a KeyBERT model using SciBERT embeddings
kw_model = KeyBERT(model=model)
print("* Keybert model")

# Define the subject from which to extract keywords
subject = "tig welding of inconel 625 and influences on micro structures"

# Extract keywords from the subject
keywords = kw_model.extract_keywords(subject, keyphrase_ngram_range=(1, 2), stop_words='english', use_maxsum=True)

# Print extracted keywords
for keyword, score in keywords:
    print(f"Keyword: {keyword}, Score: {score:.4f}")

67
main.py Normal file
@@ -0,0 +1,67 @@
import warnings
from datetime import datetime
import json
import os

from src.scrub import scrub_web
from src.key import create_queries
from src.evaluate import sort_results, CLS_POOLING, MEAN_POOLING, MAX_POOLING

# Suppress FutureWarnings and other warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

def hin_fetch(subject, weights, pooling):
    current_time = datetime.now().strftime("%m-%d_%H-%M")
    data_path = f"web_data/{hash(subject)}.json"
    file_path = f"logs/run_{current_time}_{weights}{pooling}.md"

    log_content = f"# Hin run, {current_time}\n\nSubject : {subject}\n\n"
    results = []

    if os.path.exists(data_path) :
        log_content += f"## Query results from {data_path}*\n\n"
        print(f"* Subject known from {data_path}")
        with open(data_path, 'r', encoding='utf-8') as f:
            results = json.load(f)
    else :
        queries, keyword_log = create_queries(subject)
        log_content += keyword_log

        results, scrub_log = scrub_web(queries)
        log_content += scrub_log

        with open(data_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=4)
        log_content += f"*Stored results in {data_path}*\n\n"
        print(f"\n* Stored results in {data_path}")

    sorted_results, results_log = sort_results(subject, results, weights, pooling)
    log_content += results_log

    print("### Done ###\n")

    report = "## Results\n"
    # Print the top 10 results
    for idx, result in enumerate(sorted_results[:10], 1):
        report += f"\nRank {idx} ({result['score']:.4f}):\nTitle: {result['title']}\nURL: {result['url']}\nSnippet: {result['snippet']}\n" + "-" * 40
    print(report + "\n")

    # Create and save the file
    with open(file_path, 'w') as file:
        file.write(log_content + report)

#subject = input("Enter subject : ")
subject = "State of the art on the identification of wood structure natural frequencies. Influence of the mechanical properties and interest in sensitivity analysis as prospects for reverse identification method of wood elastic properties."
#subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"

# hin_fetch(subject, [title_weight, snippet_weight], [title_pooling, snippet_pooling])
hin_fetch(subject, [1,0], [CLS_POOLING, MAX_POOLING])
hin_fetch(subject, [1,0], [MEAN_POOLING,MAX_POOLING])
hin_fetch(subject, [1,0], [MAX_POOLING, MAX_POOLING])
hin_fetch(subject, [0,1], [CLS_POOLING, CLS_POOLING])
hin_fetch(subject, [0,1], [CLS_POOLING, MEAN_POOLING])
hin_fetch(subject, [0,1], [CLS_POOLING, MAX_POOLING])
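
A note on the cache key above: Python's built-in hash() is salted per interpreter process (PYTHONHASHSEED), so web_data/{hash(subject)}.json will generally not be found again on the next run and the web scrub will be repeated. A minimal sketch of a stable alternative, using hashlib (an assumption for illustration, not part of this commit):

import hashlib

def cache_key(subject):
    # sha1 of the UTF-8 subject is stable across runs, unlike built-in hash()
    return hashlib.sha1(subject.encode("utf-8")).hexdigest()

data_path = f"web_data/{cache_key(subject)}.json"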

@@ -1,131 +0,0 @@
import warnings
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import requests
import progressbar

# Me
#subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
#query = "composite viscoelastic damping"

# Anne
#subject = "State of the art on the identification of wood structure natural frequencies. Influence of the mechanical properties and interest in sensitivity analysis as prospects for reverse identification method of wood elastic properties."
#query = "wood frequency analysis mechanical properties"

# Axel
#subject = "Characterization of SiC MOSFET using double pulse test method."
#query = "SiC MOSFET double pulse test"

# Paul
#subject = "Thermo-Mechanical Impact of temperature oscillations on bonding and metallization for SiC MOSFETs soldered on ceramic substrate"
#query = "thermo mechanical model discrete bonding SiC MOSFET"

# Jam
subject = "tig welding of inconel 625 and influences on micro structures"
query = "tig welding inconel 625"

widgets = [' [',
           progressbar.Timer(format= 'elapsed time: %(elapsed)s'),
           '] ',
           progressbar.Bar('*'),' (',
           progressbar.ETA(), ') ',
          ]

# Suppress FutureWarnings and other warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

print("\n### Fetching Data ###\n")

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Got tokenizer")
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Got model")

# Function to compute sentence embeddings by pooling token embeddings (CLS token)
def get_sentence_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Pooling strategy: Use the hidden state of the [CLS] token as the sentence embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
    return cls_embedding

# Function to compute cosine similarity
def compute_similarity(embedding1, embedding2):
    similarity = F.cosine_similarity(embedding1, embedding2)
    return similarity.item()

# Define the SearxNG instance URL and search query
searxng_url = "https://search.penwing.org/search"  # Replace with your instance URL
params = {
    "q": query,  # Your search query
    "format": "json",  # Requesting JSON format
    "categories": "science",  # You can specify categories (optional)
}

# Send the request to SearxNG API
response = requests.get(searxng_url, params=params)

# Check if the request was successful
if response.status_code == 200:
    print("* Got search results")
    # Parse the JSON response
    data = response.json()

    subject_embedding = get_sentence_embedding(subject)
    print("* Tokenized subject")

    print("\n### Starting result processing ###\n")

    # List to store results with similarity scores
    scored_results = []
    results = data.get("results", [])

    progress = 0
    bar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), progressbar.Bar()],
                                  maxval=len(results)).start()

    # Process each result
    for result in results :
        title = result['title']
        url = result['url']
        snippet = result['content']

        # Get embedding for the snippet (abstract)
        snippet_embedding = get_sentence_embedding(snippet)

        # Compute similarity between subject and snippet
        similarity = compute_similarity(subject_embedding, snippet_embedding)

        # Store the result with its similarity score
        scored_results.append({
            'title': title,
            'url': url,
            'snippet': snippet,
            'similarity': similarity
        })

        progress += 1
        bar.update(progress)

    # Sort the results by similarity (highest first)
    top_results = sorted(scored_results, key=lambda x: x['similarity'], reverse=True)[:10]

    print("\n### Done ###\n")

    # Print the top 10 results
    for idx, result in enumerate(top_results, 1):
        print(f"Rank {idx} ({result['similarity']:.4f}):")
        print(f"Title: {result['title']}")
        print(f"URL: {result['url']}")
        print(f"Snippet: {result['snippet']}")
        print("-" * 40)
else:
    print(f"Error: {response.status_code}")

@@ -1,25 +0,0 @@
import requests

# Define the SearxNG instance URL and search query
searxng_url = "https://search.penwing.org/search"  # Replace with your instance URL
params = {
    "q": "zig zag theories",  # Your search query
    "format": "json",  # Requesting JSON format
    "categories": "science",  # You can specify categories (optional)
}

# Send the request to SearxNG API
response = requests.get(searxng_url, params=params)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()

    # Print or process the results
    for result in data.get("results", []):
        print(f"Title: {result['title']}")
        print(f"URL: {result['url']}")
        print(f"Snippet: {result['content']}")
        print("-" * 40)
else:
    print(f"Error: {response.status_code}")

shell.nix

@@ -6,9 +6,12 @@ mkShell {
     pipenv
     python3
     stdenv.cc.cc.lib
+    zlib
   ];
   shellHook = ''
     export LD_LIBRARY_PATH="${pkgs.stdenv.cc.cc.lib}/lib";
+    export LD_LIBRARY_PATH="${pkgs.zlib}/lib:$LD_LIBRARY_PATH";
+    alias run="pipenv run python main.py; notify-send -u normal -a 'Hin' 'finished'"
   '';
 }

169
src/evaluate.py Normal file
@@ -0,0 +1,169 @@
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import progressbar
import math

CLS_POOLING = 1
MEAN_POOLING = 2
MAX_POOLING = 3

print("\n### Fetching SciBert ###\n")

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Got tokenizer")
model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
print("* Got model")

def get_subject_output(subject):
    subject_inputs = tokenizer(subject, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        subject_outputs = model(**subject_inputs)
    return subject_outputs

# Function to compute the embedding with a selected pooling method
def compute_similarity(subject_outputs, compare_text, pooling_method):
    # Tokenize the input texts
    compare_inputs = tokenizer(compare_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Compute embeddings for both the subject and the comparison text
    with torch.no_grad():
        compare_outputs = model(**compare_inputs)

    # Pooling strategies
    def cls_pooling(output):
        return output.last_hidden_state[:, 0, :]  # CLS token is at index 0

    def mean_pooling(output):
        return output.last_hidden_state.mean(dim=1)  # Mean of all token embeddings

    def max_pooling(output):
        return output.last_hidden_state.max(dim=1).values  # Max of all token embeddings

    # Choose pooling strategy based on the input integer
    if pooling_method == CLS_POOLING:
        subject_embedding = cls_pooling(subject_outputs)
        compare_embedding = cls_pooling(compare_outputs)
    elif pooling_method == MEAN_POOLING:
        subject_embedding = mean_pooling(subject_outputs)
        compare_embedding = mean_pooling(compare_outputs)
    elif pooling_method == MAX_POOLING:
        subject_embedding = max_pooling(subject_outputs)
        compare_embedding = max_pooling(compare_outputs)
    else:
        raise ValueError("Pooling method must be 1 (CLS), 2 (Mean), or 3 (Max).")

    return F.cosine_similarity(subject_embedding, compare_embedding).item()

def score_results(subject, results, weights, pooling):
    subject_model_output = get_subject_output(subject)
    print("* Tokenized subject\n")

    scored_results_urls = []
    scored_results = []

    print("* Started scoring results\n")

    bar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), progressbar.Bar()],
                                  maxval=len(results)).start()
    progress = 0

    title_score_bounds = [1, 0]
    snippet_score_bounds = [1, 0]

    title_pooling = pooling[0]
    snippet_pooling = pooling[1]

    log = f"Weights : {weights};\n\nPooling : {pooling}\n\n"

    # Process each result
    for result in results :
        progress += 1
        bar.update(progress)

        title = result['title']
        url = result['url']
        snippet = result['content']

        if title == subject :
            found_original = True

        if url in scored_results_urls :
            continue
        scored_results_urls.append(url)

        # Compute similarity between subject and result
        title_score, snippet_score = 1, 1
        if weights[0] != 0 :
            title_score = compute_similarity(subject_model_output, title, title_pooling)
        if weights[1] != 0 :
            snippet_score = compute_similarity(subject_model_output, snippet, snippet_pooling)

        if title_score < title_score_bounds[0] :
            title_score_bounds[0] = title_score
        if title_score > title_score_bounds[1] :
            title_score_bounds[1] = title_score

        if snippet_score < snippet_score_bounds[0] :
            snippet_score_bounds[0] = snippet_score
        if snippet_score > snippet_score_bounds[1] :
            snippet_score_bounds[1] = snippet_score

        # Store the result with its similarity score
        scored_results.append({
            'title': title,
            'url': url,
            'snippet': snippet,
            'title-score': title_score,
            'snippet-score': snippet_score
        })

    log += f"Score bounds : T{title_score_bounds} # S{snippet_score_bounds}\n\n"

    print("\n\n* Scored results\n")

    normalized_results = []
    for result in scored_results:
        title_score, snippet_score = 1, 1
        if weights[0] != 0 :
            title_score = (result['title-score'] - title_score_bounds[0]) / (title_score_bounds[1] - title_score_bounds[0])
        if weights[1] != 0 :
            snippet_score = (result['snippet-score'] - snippet_score_bounds[0]) / (snippet_score_bounds[1] - snippet_score_bounds[0])

        score = math.pow(math.pow(title_score, weights[0]) * math.pow(snippet_score, weights[1]), 1 / (weights[0] + weights[1]))
        normalized_results.append({
            'title': result['title'],
            'url': result['url'],
            'snippet': result['snippet'],
            'score': score,
        })

    return normalized_results, log

def sort_results(subject, results, weights, pooling):
    print("\n### Starting result processing (",len(results),") ###\n")

    log = "\n---\n\n## Scoring\n\n"

    scored_results, score_log = score_results(subject, results, weights, pooling)
    log += score_log

    # Sort the results by similarity (highest first)
    sorted_results = sorted(scored_results, key=lambda x: x['score'], reverse=True)

    return sorted_results, log
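
The final score computed in score_results is a weighted geometric mean of the min-max-normalized title and snippet similarities. A small sketch restating that formula (combined_score is a hypothetical helper, named here for illustration), with a sanity check that a [1, 0] weighting reduces to the title score alone, since the unused component is pinned to 1 and drops out:

import math

def combined_score(title_score, snippet_score, weights):
    # Weighted geometric mean, as in score_results above
    return math.pow(
        math.pow(title_score, weights[0]) * math.pow(snippet_score, weights[1]),
        1 / (weights[0] + weights[1]),
    )

assert abs(combined_score(0.8, 1.0, [1, 0]) - 0.8) < 1e-9
assert abs(combined_score(0.9, 0.4, [1, 1]) - math.sqrt(0.9 * 0.4)) < 1e-9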

33
src/key.py Normal file
@@ -0,0 +1,33 @@
from keybert import KeyBERT
from itertools import combinations

def create_queries(subject) :
    print("\n### Getting Keywords ###\n")

    kw_model = KeyBERT()
    print("* Got Keybert")

    keywords = kw_model.extract_keywords(subject, keyphrase_ngram_range=(1, 2), stop_words='english', use_mmr=True, diversity=0.7)
    print("* keywords extracted")

    sorted_keywords = sorted(keywords, key=lambda x: -x[1])
    text_keywords = [x[0] for x in sorted_keywords]

    log = f"## Keywords\n\n{text_keywords}\n\n"

    queries = []
    for r in range(1, len(text_keywords) + 1):
        comb = combinations(text_keywords, r)
        queries.extend(comb)

    final_queries = [subject] + ["\"" + "\" OR \"".join(query) + "\"" for query in queries]
    #final_queries.ins(subject)

    print("* query generated")

    return final_queries, log
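
Because create_queries builds every non-empty subset of the keyword list, the number of OR-queries grows as 2^n - 1: with KeyBERT's default of five keywords that is 31 queries, plus the raw subject, which scrub_web then iterates over. An illustrative check, with made-up keywords:

from itertools import combinations

keywords = ["wood", "natural frequencies", "elastic properties", "sensitivity analysis", "identification"]
queries = []
for r in range(1, len(keywords) + 1):
    queries.extend(combinations(keywords, r))
print(len(queries))  # 2**5 - 1 = 31 OR-queries, plus the raw subject = 32 requests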

44
src/scrub.py Normal file
@@ -0,0 +1,44 @@
import requests
import progressbar

searxng_url = "https://search.penwing.org/search"

def scrub_web(queries) :
    print("\n### Fetching Web data ###\n")

    web_bar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), progressbar.Bar()],
                                      maxval=len(queries)).start()
    progress = 0

    results = []
    log = "## Queries\n\n"

    for query in queries :
        params = {
            "q": query,  # Your search query
            "format": "json",  # Requesting JSON format
            "categories": "science",  # You can specify categories (optional)
        }

        response = requests.get(searxng_url, params=params)

        if response.status_code == 200:
            data = response.json()

            # List to store results with similarity scores
            scored_results = []
            results.extend(data.get("results", []))
            log += f"{query};\n"
        else:
            print(f"Error: {response.status_code}")

        progress += 1
        web_bar.update(progress)

    print("")

    return results, log
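
Note that scrub_web simply concatenates the result lists of all queries, so the same paper can appear several times; deduplication by URL happens later in score_results. A minimal standalone request against the same instance (assuming it has the JSON output format enabled) would look like:

import requests

response = requests.get("https://search.penwing.org/search", params={
    "q": "\"wood\" OR \"natural frequencies\"",  # one generated OR-query
    "format": "json",
    "categories": "science",
})
if response.status_code == 200:
    for hit in response.json().get("results", [])[:3]:
        print(hit["title"], hit["url"])  # each hit also carries a 'content' snippet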

25
src/test.py Normal file
@@ -0,0 +1,25 @@
import requests

subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"

searxng_url = "https://search.penwing.org/search"
params = {
    "q": subject,  # Your search query
    "format": "json",  # Requesting JSON format
    "categories": "science",  # You can specify categories (optional)
}

response = requests.get(searxng_url, params=params)

if response.status_code == 200:
    data = response.json()

    # List to store results with similarity scores
    scored_results = []

    for result in data.get("results", []):
        print(result['title'])
        print("---")
else:
    print(f"Error: {response.status_code}")

19
subjects Normal file
@@ -0,0 +1,19 @@
# Me
subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
query = "composite viscoelastic damping"

# Anne
subject = "State of the art on the identification of wood structure natural frequencies. Influence of the mechanical properties and interest in sensitivity analysis as prospects for reverse identification method of wood elastic properties."
query = "wood frequency analysis mechanical properties"

# Axel
subject = "Characterization of SiC MOSFET using double pulse test method."
query = "SiC MOSFET double pulse test"

# Paul
subject = "Thermo-Mechanical Impact of temperature oscillations on bonding and metallization for SiC MOSFETs soldered on ceramic substrate"
query = "thermo mechanical model discrete bonding SiC MOSFET"

# Jam
subject = "tig welding of inconel 625 and influences on micro structures"
query = "tig welding inconel 625"