From 2539b919c21e779311701c0b15c1b9e0061de586 Mon Sep 17 00:00:00 2001
From: WanderingPenwing
Date: Fri, 27 Sep 2024 12:11:59 +0200
Subject: [PATCH] keyword extraction

---
 Pipfile               |   5 ++
 key-scrub-evaluate.py | 163 ++++++++++++++++++++++++++++++++++++++++++
 key.py                |  36 +++++++---
 key2.py               |   8 +++
 key3.py               |   5 ++
 key4.py               |   5 ++
 shell.nix             |   2 +
 7 files changed, 215 insertions(+), 9 deletions(-)
 create mode 100644 key-scrub-evaluate.py
 create mode 100644 key2.py
 create mode 100644 key3.py
 create mode 100644 key4.py

diff --git a/Pipfile b/Pipfile
index edb203f..ac7641a 100644
--- a/Pipfile
+++ b/Pipfile
@@ -9,6 +9,11 @@ torch = "*"
 requests = "*"
 keybert = "*"
 progressbar = "*"
+rake-nltk = "*"
+nltk = "*"
+spacy = "*"
+numpy = "*"
+gensim = "*"
 
 [dev-packages]
 
diff --git a/key-scrub-evaluate.py b/key-scrub-evaluate.py
new file mode 100644
index 0000000..1fba0dc
--- /dev/null
+++ b/key-scrub-evaluate.py
@@ -0,0 +1,163 @@
+import warnings
+from transformers import AutoTokenizer, AutoModel
+from keybert import KeyBERT
+import torch
+import torch.nn.functional as F
+import requests
+import progressbar
+from itertools import combinations
+
+
+# Me
+#subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
+#query = "composite viscoelastic damping"
+
+# Anne
+#subject = "State of the art on the identification of wood structure natural frequencies. Influence of the mechanical properties and interest in sensitivity analysis as prospects for reverse identification method of wood elastic properties."
+#query = "wood frequency analysis mechanical properties"
+
+# Axel
+#subject = "Characterization of SiC MOSFET using double pulse test method."
+#query = "SiC MOSFET double pulse test"
+
+# Paul
+#subject = "Thermo-Mechanical Impact of temperature oscillations on bonding and metallization for SiC MOSFETs soldered on ceramic substrate"
+#query = "thermo mechanical model discrete bonding SiC MOSFET"
+
+# Jam
+#subject = "tig welding of inconel 625 and influences on micro structures"
+#query = "tig welding inconel 625"
+
+subject = "artificial intelligence for satellite detection"
+
+widgets = [' [',
+           progressbar.Timer(format= 'elapsed time: %(elapsed)s'),
+           '] ',
+           progressbar.Bar('*'),' (',
+           progressbar.ETA(), ') ',
+           ]
+
+# Suppress FutureWarnings only (other warning categories stay visible)
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+print("\n### Fetching Data ###\n")
+
+# Load the tokenizer and the model
+tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
+
+print("* Got tokenizer")
+
+model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
+
+print("* Got model")
+
+kw_model = KeyBERT()
+
+print("* Got Keybert")
+
+def get_sentence_embedding(text):
+    """Return the SciBERT [CLS] hidden state for ``text`` (input truncated to 512 tokens)."""
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    # Pooling strategy: Use the hidden state of the [CLS] token as the sentence embedding
+    cls_embedding = outputs.last_hidden_state[:, 0, :] # Shape: (batch_size, hidden_size)
+    return cls_embedding
+
+def compute_similarity(embedding1, embedding2):
+    """Return the cosine similarity of two single-row embeddings as a Python float."""
+    similarity = F.cosine_similarity(embedding1, embedding2)
+    return similarity.item()
+
+print("\n### Getting Keywords ###\n")
+
+keywords = kw_model.extract_keywords(subject, keyphrase_ngram_range=(1, 2), stop_words='english', use_mmr=True, diversity=0.7)
+
+print("* keywords extracted")
+sorted_keywords = sorted(keywords, key=lambda x: -x[1])
+text_keywords = [x[0] for x in sorted_keywords]
+
+queries = []
+
+for r in range(1, len(text_keywords) + 1):
+    comb = combinations(text_keywords, r)
+    queries.extend(comb)
+
+final_query = [" OR ".join(query) for query in queries]
+
+final_query.append(subject)
+
+print("* query generated")
+
+print("\n### Fetching Web data ###\n")
+
+# Define the SearxNG instance URL and search query
+searxng_url = "https://search.penwing.org/search" # Replace with your instance URL
+params = {
+    "q": " OR ".join(final_query), # One string: a list would be sent as repeated q= parameters
+    "format": "json", # Requesting JSON format
+    "categories": "science", # You can specify categories (optional)
+}
+
+# Send the request to SearxNG API (bounded wait so a dead instance cannot hang the script)
+response = requests.get(searxng_url, params=params, timeout=30)
+
+# Check if the request was successful
+if response.status_code == 200:
+    print("* Got response")
+    # Parse the JSON response
+    data = response.json()
+
+    subject_embedding = get_sentence_embedding(subject)
+
+    print("* Tokenized subject")
+
+    # List to store results with similarity scores
+    scored_results = []
+
+    results = data.get("results", [])
+    progress = 0
+
+
+    print("\n### Starting result processing (",len(results),") ###\n")
+
+    bar = progressbar.ProgressBar(widgets=[progressbar.Percentage(), progressbar.Bar()],
+                                  maxval=len(results)).start()
+
+    # Process each result
+    for result in results :
+        title = result['title']
+        url = result['url']
+        snippet = result.get('content') or '' # a result may come back without a snippet
+
+        # Get embedding for the snippet (abstract)
+        snippet_embedding = get_sentence_embedding(snippet)
+
+        # Compute similarity between subject and snippet
+        similarity = compute_similarity(subject_embedding, snippet_embedding)
+
+        # Store the result with its similarity score
+        scored_results.append({
+            'title': title,
+            'url': url,
+            'snippet': snippet,
+            'similarity': similarity
+        })
+
+        progress += 1
+        bar.update(progress)
+
+    # Sort the results by similarity (highest first)
+    top_results = sorted(scored_results, key=lambda x: x['similarity'], reverse=True)[:10]
+
+    print("\n\n### Done ###\n")
+    # Print the top 10 results
+    for idx, result in enumerate(top_results, 1):
+        print(f"Rank {idx} ({result['similarity']:.4f}):")
+        print(f"Title: {result['title']}")
+        print(f"URL: {result['url']}")
+        print(f"Snippet: {result['snippet']}")
+        print("-" * 40)
+else:
+    print(f"Error: {response.status_code}")
diff --git a/key.py b/key.py
index 3cb12a5..5b0f4cb 100644
--- a/key.py
+++ b/key.py
@@ -1,25 +1,43 @@
 from keybert import KeyBERT
 from transformers import AutoTokenizer, AutoModel
+from itertools import combinations
 # Load the SciBERT model and tokenizer
-tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
+#tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
+#print("* Tokenizer")
 
-print("* Tokenizer")
-
-model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
-
-print("* Scibert model")
+#model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
+#print("* Scibert model")
 
 # Define a KeyBERT model using SciBERT embeddings
-kw_model = KeyBERT(model=model)
+#kw_model = KeyBERT(model=model)
+kw_model = KeyBERT()
 
 print("* Keybert model")
 
 # Define the subject from which to extract keywords
-subject = "tig welding of inconel 625 and influences on micro structures"
+subject = "Thermo-Mechanical Impact of temperature oscillations on bonding and metallization for SiC MOSFETs soldered on ceramic substrate"
 
 # Extract keywords from the subject
-keywords = kw_model.extract_keywords(subject, keyphrase_ngram_range=(1, 2), stop_words='english', use_maxsum=True)
+keywords = kw_model.extract_keywords(subject, keyphrase_ngram_range=(1, 2), stop_words='english', use_mmr=True, diversity=0.7)
 
 # Print extracted keywords
 for keyword, score in keywords:
     print(f"Keyword: {keyword}, Score: {score:.4f}")
+
+print("-"*40)
+sorted_keywords = sorted(keywords, key=lambda x: -x[1])
+text_keywords = [x[0] for x in sorted_keywords]
+
+queries = []
+
+for r in range(1, len(text_keywords) + 1): # r is the length of combinations
+    comb = combinations(text_keywords, r)
+    queries.extend(comb)
+
+#print([" OR ".join(query) for query in queries])
+
+text_queries = [" OR ".join(query) for query in queries]
+
+text_queries.append(subject)
+
+print(text_queries)
diff --git a/key2.py b/key2.py
new file mode 100644
index 0000000..19fb8d1
--- /dev/null
+++ b/key2.py
@@ -0,0 +1,8 @@
+from rake_nltk import Rake
+import nltk
+
+rake_nltk_var = Rake()
+text = "Characterization of SiC MOSFET using double pulse test method."
+rake_nltk_var.extract_keywords_from_text(text)
+keyword_extracted = rake_nltk_var.get_ranked_phrases()
+print(keyword_extracted)
diff --git a/key3.py b/key3.py
new file mode 100644
index 0000000..a8959d5
--- /dev/null
+++ b/key3.py
@@ -0,0 +1,5 @@
+import spacy
+nlp = spacy.load("en_core_sci_lg")
+text = "Characterization of SiC MOSFET using double pulse test method"
+doc = nlp(text)
+print(doc.ents)
diff --git a/key4.py b/key4.py
new file mode 100644
index 0000000..1b32ce3
--- /dev/null
+++ b/key4.py
@@ -0,0 +1,5 @@
+from gensim.summarization import keywords
+
+text_en = ('Characterization of SiC MOSFET using double pulse test method.')
+
+print(keywords(text_en,words = 5,scores = True, lemmatize = True))
diff --git a/shell.nix b/shell.nix
index e6d76fe..ca6cd38 100644
--- a/shell.nix
+++ b/shell.nix
@@ -6,9 +6,11 @@ mkShell {
     pipenv
     python3
     stdenv.cc.cc.lib
+    zlib
   ];
 
   shellHook = ''
     export LD_LIBRARY_PATH="${pkgs.stdenv.cc.cc.lib}/lib";
+    export LD_LIBRARY_PATH="${pkgs.zlib}/lib:$LD_LIBRARY_PATH";
   '';
 }