diff --git a/.gitignore b/.gitignore index 483489a..8b4c829 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ /models /logs/* /web_data/* +api.py diff --git a/Pipfile b/Pipfile index ac7641a..2704303 100644 --- a/Pipfile +++ b/Pipfile @@ -14,6 +14,9 @@ nltk = "*" spacy = "*" numpy = "*" gensim = "*" +scikit-learn = "*" +pandas = "*" +groq = "*" [dev-packages] diff --git a/main.py b/main.py index 9dd2e6c..deb1fe3 100644 --- a/main.py +++ b/main.py @@ -1,16 +1,11 @@ -import warnings from datetime import datetime import json import os from src.scrub import scrub_web -from src.key import create_queries +from src.key3 import create_queries from src.evaluate import sort_results, CLS_POOLING, MEAN_POOLING, MAX_POOLING -# Suppress FutureWarnings and other warnings -warnings.simplefilter(action='ignore', category=FutureWarning) - - def hin_fetch(subject, weights, pooling): current_time = datetime.now().strftime("%m-%d_%H-%M") data_path = f"web_data/{hash(subject)}.json" @@ -55,13 +50,16 @@ def hin_fetch(subject, weights, pooling): file.write(log_content + report) #subject = input("Enter subject : ") -subject = "State of the art on the identification of wood structure natural frequencies. Influence of the mechanical properties and interest in sensitivity analysis as prospects for reverse identification method of wood elastic properties." +#subject = "State of the art on the identification of wood structure natural frequencies. Influence of the mechanical properties and interest in sensitivity analysis as prospects for reverse identification method of wood elastic properties." 
#subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer" +#subject = "Dynamic response of carbon-epoxy laminates including a perforated viscoelastic film" +subject = "tig welding of inconel 625 and influences on micro structures" +#subject = "Thermo-Mechanical Impact of temperature oscillations on bonding and metallization for SiC MOSFETs soldered on ceramic substrate" # hin_fetch(subject, [title_weight, snippet_weight], [title_pooling, snippet_pooling]) -hin_fetch(subject, [1,0], [CLS_POOLING, MAX_POOLING]) -hin_fetch(subject, [1,0], [MEAN_POOLING,MAX_POOLING]) -hin_fetch(subject, [1,0], [MAX_POOLING, MAX_POOLING]) -hin_fetch(subject, [0,1], [CLS_POOLING, CLS_POOLING]) -hin_fetch(subject, [0,1], [CLS_POOLING, MEAN_POOLING]) -hin_fetch(subject, [0,1], [CLS_POOLING, MAX_POOLING]) +hin_fetch(subject, [2,1], [CLS_POOLING, MAX_POOLING]) +#hin_fetch(subject, [1,0], [MEAN_POOLING,MAX_POOLING]) +#hin_fetch(subject, [1,0], [MAX_POOLING, MAX_POOLING]) +#hin_fetch(subject, [0,1], [CLS_POOLING, CLS_POOLING]) +#hin_fetch(subject, [0,1], [CLS_POOLING, MEAN_POOLING]) +#hin_fetch(subject, [0,1], [CLS_POOLING, MAX_POOLING]) diff --git a/shell.nix b/shell.nix index 8634f33..9918004 100644 --- a/shell.nix +++ b/shell.nix @@ -12,6 +12,6 @@ mkShell { shellHook = '' export LD_LIBRARY_PATH="${pkgs.stdenv.cc.cc.lib}/lib"; export LD_LIBRARY_PATH="${pkgs.zlib}/lib:$LD_LIBRARY_PATH"; - alias run="pipenv run python main.py; notify-send -u normal -a 'Hin' 'finished'" + alias hin="pipenv run python main.py; notify-send -u normal -a 'Hin' 'finished'" ''; } diff --git a/src/evaluate.py b/src/evaluate.py index d9b277d..c2c4185 100644 --- a/src/evaluate.py +++ b/src/evaluate.py @@ -3,6 +3,10 @@ import torch import torch.nn.functional as F import progressbar import math +import warnings + +# Suppress FutureWarning messages only; other warning categories are still shown +warnings.simplefilter(action='ignore', category=FutureWarning) CLS_POOLING = 
1 MEAN_POOLING = 2 @@ -66,7 +70,7 @@ def score_results(subject, results, weights, pooling): subject_model_output = get_subject_output(subject) print("* Tokenized subject\n") - scored_results_urls = [] + scored_results_titles = [] scored_results = [] print("* Started scoring results\n") @@ -89,18 +93,18 @@ def score_results(subject, results, weights, pooling): for result in results : progress += 1 bar.update(progress) + + if not ("content" in result) : + continue title = result['title'] url = result['url'] snippet = result['content'] - if title == subject : - found_original = True - - if url in scored_results_urls : + if title in scored_results_titles : continue - scored_results_urls.append(url) + scored_results_titles.append(title) # Compute similarity between subject and result diff --git a/src/key2.py b/src/key2.py new file mode 100644 index 0000000..77afcf9 --- /dev/null +++ b/src/key2.py @@ -0,0 +1,34 @@ +from sklearn.feature_extraction.text import TfidfVectorizer +from itertools import combinations + +vectorizer = TfidfVectorizer(stop_words='english') + +def create_queries(subject) : + """Return (queries, log): the subject itself plus quoted OR-combinations of its top TF-IDF keywords.""" + + print("\n### Getting Keywords ###\n") + + tfidf_matrix = vectorizer.fit_transform([subject]) + + feature_names = vectorizer.get_feature_names_out() + + print("* Preparation done") + + sorted_indices = tfidf_matrix[0].toarray()[0].argsort()[::-1] + + # Keep the 5 best-scoring terms as plain str (clean log output); slicing copes with fewer than 5 terms + text_keywords = [str(feature_names[i]) for i in sorted_indices[:5]] + + log = f"## Keywords\n\n{text_keywords}\n\n" + + queries = [] + + for r in range(1, len(text_keywords) + 1): + comb = combinations(text_keywords, r) + queries.extend(comb) + + final_queries = [subject] + ["\"" + "\" OR \"".join(query) + "\"" for query in queries] + + print("* query generated") + + return final_queries, log diff --git a/src/key3.py b/src/key3.py new file mode 100644 index 0000000..f3e7bd0 --- /dev/null +++ 
b/src/key3.py @@ -0,0 +1,26 @@ +from groq import Groq +from src.api import KEY + +client = Groq( + api_key=KEY, +) + +def create_queries(subject) : + """Ask the Groq chat model for Google Scholar queries about subject; returns (queries, log), where log is always empty.""" + + chat_completion = client.chat.completions.create( + messages=[ + { + "role": "user", + "content": f"Generate 15 google scholar queries from this subject : \"{subject}\" Your response should only contain the queries, no title, no quotation marks, no numbers, one per line.", + } + ], + model="llama3-8b-8192", + ) + + log = "" + + # The answer may contain blank or padded lines; trim each query and drop the empty ones + queries = [line.strip() for line in chat_completion.choices[0].message.content.splitlines() if line.strip()] + + return queries, log diff --git a/src/test.py b/src/test.py index a47d613..caebbbf 100644 --- a/src/test.py +++ b/src/test.py @@ -1,25 +1,25 @@ -import requests +from groq import Groq +from src.api import KEY -subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer" +client = Groq( + api_key=KEY, +) -searxng_url = "https://search.penwing.org/search" +def create_queries(subject) : -params = { - "q": subject, # Your search query - "format": "json", # Requesting JSON format - "categories": "science", # You can specify categories (optional) -} + chat_completion = client.chat.completions.create( + messages=[ + { + "role": "user", + "content": f"Generate 15 google scholar queries from this subject : \"{subject}\" Your response should only contain the queries, no title, no quotation marks, no numbers, one per line.", + } + ], + model="llama3-8b-8192", + ) -response = requests.get(searxng_url, params=params) + log = "" -if response.status_code == 200: - data = response.json() + queries = [line.strip() for line in chat_completion.choices[0].message.content.splitlines() if line.strip()] + print(queries) - # List to store results with similarity scores - scored_results = [] - - for result in data.get("results", []): - print(result['title']) - print("---") -else: - print(f"Error: {response.status_code}") + return queries, log