# hin/key.py — KeyBERT keyword extraction and OR-query generation
"""Extract keywords from a research subject with KeyBERT and build
"a OR b"-style search queries from every non-empty keyword combination."""
from itertools import combinations
from operator import itemgetter

from keybert import KeyBERT
from transformers import AutoTokenizer, AutoModel  # kept for the SciBERT variant below

# To use SciBERT embeddings instead of KeyBERT's default model, uncomment:
# tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
# model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
# kw_model = KeyBERT(model=model)
kw_model = KeyBERT()
print("* Keybert model")

# Subject from which to extract keywords.
subject = "Thermo-Mechanical Impact of temperature oscillations on bonding and metallization for SiC MOSFETs soldered on ceramic substrate"

# Up to 2-gram keyphrases; MMR with high diversity reduces redundant phrases.
keywords = kw_model.extract_keywords(subject, keyphrase_ngram_range=(1, 2), stop_words='english', use_mmr=True, diversity=0.7)

# Print extracted keywords with their relevance scores.
for keyword, score in keywords:
    print(f"Keyword: {keyword}, Score: {score:.4f}")
print("-" * 40)


def _build_queries(scored_keywords, full_subject):
    """Return "kw1 OR kw2 ..." strings for every non-empty combination of
    the keywords (ranked by descending score), with *full_subject*
    appended as the final query.

    NOTE: the number of combinations grows as 2**n - 1 in the number of
    keywords — keep the list small (KeyBERT's default top_n=5 yields 31
    queries plus the subject).
    """
    ranked = [kw for kw, _ in sorted(scored_keywords, key=itemgetter(1), reverse=True)]
    queries = []
    for size in range(1, len(ranked) + 1):  # size = length of each combination
        queries.extend(" OR ".join(combo) for combo in combinations(ranked, size))
    queries.append(full_subject)
    return queries


text_queries = _build_queries(keywords, subject)
print(text_queries)