This commit is contained in:
WanderingPenwing 2024-09-30 12:24:26 +02:00
parent a266739b51
commit 5ddeb26891
8 changed files with 106 additions and 39 deletions

1
.gitignore vendored
View file

@@ -1,3 +1,4 @@
/models
/logs/*
/web_data/*
api.py

View file

@@ -14,6 +14,9 @@ nltk = "*"
spacy = "*"
numpy = "*"
gensim = "*"
scikit-learn = "*"
pandas = "*"
groq = "*"
[dev-packages]

24
main.py
View file

@@ -1,16 +1,11 @@
import warnings
from datetime import datetime
import json
import os
from src.scrub import scrub_web
from src.key import create_queries
from src.key3 import create_queries
from src.evaluate import sort_results, CLS_POOLING, MEAN_POOLING, MAX_POOLING
# Suppress FutureWarnings and other warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def hin_fetch(subject, weights, pooling):
current_time = datetime.now().strftime("%m-%d_%H-%M")
data_path = f"web_data/{hash(subject)}.json"
@@ -55,13 +50,16 @@ def hin_fetch(subject, weights, pooling):
file.write(log_content + report)
#subject = input("Enter subject : ")
subject = "State of the art on the identification of wood structure natural frequencies. Influence of the mechanical properties and interest in sensitivity analysis as prospects for reverse identification method of wood elastic properties."
#subject = "State of the art on the identification of wood structure natural frequencies. Influence of the mechanical properties and interest in sensitivity analysis as prospects for reverse identification method of wood elastic properties."
#subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
#subject = "Dynamic response of carbon-epoxy laminates including a perforated viscoelastic film"
subject = "tig welding of inconel 625 and influences on micro structures"
#subject = "Thermo-Mechanical Impact of temperature oscillations on bonding and metallization for SiC MOSFETs soldered on ceramic substrate"
# hin_fetch(subject, [title_weight, snippet_weight], [title_pooling, snippet_pooling])
hin_fetch(subject, [1,0], [CLS_POOLING, MAX_POOLING])
hin_fetch(subject, [1,0], [MEAN_POOLING,MAX_POOLING])
hin_fetch(subject, [1,0], [MAX_POOLING, MAX_POOLING])
hin_fetch(subject, [0,1], [CLS_POOLING, CLS_POOLING])
hin_fetch(subject, [0,1], [CLS_POOLING, MEAN_POOLING])
hin_fetch(subject, [0,1], [CLS_POOLING, MAX_POOLING])
hin_fetch(subject, [2,1], [CLS_POOLING, MAX_POOLING])
#hin_fetch(subject, [1,0], [MEAN_POOLING,MAX_POOLING])
#hin_fetch(subject, [1,0], [MAX_POOLING, MAX_POOLING])
#hin_fetch(subject, [0,1], [CLS_POOLING, CLS_POOLING])
#hin_fetch(subject, [0,1], [CLS_POOLING, MEAN_POOLING])
#hin_fetch(subject, [0,1], [CLS_POOLING, MAX_POOLING])

View file

@@ -12,6 +12,6 @@ mkShell {
shellHook = ''
export LD_LIBRARY_PATH="${pkgs.stdenv.cc.cc.lib}/lib";
export LD_LIBRARY_PATH="${pkgs.zlib}/lib:$LD_LIBRARY_PATH";
alias run="pipenv run python main.py; notify-send -u normal -a 'Hin' 'finished'"
alias hin="pipenv run python main.py; notify-send -u normal -a 'Hin' 'finished'"
'';
}

View file

@@ -3,6 +3,10 @@ import torch
import torch.nn.functional as F
import progressbar
import math
import warnings
# Suppress FutureWarnings and other warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
CLS_POOLING = 1
MEAN_POOLING = 2
@@ -66,7 +70,7 @@ def score_results(subject, results, weights, pooling):
subject_model_output = get_subject_output(subject)
print("* Tokenized subject\n")
scored_results_urls = []
scored_results_titles = []
scored_results = []
print("* Started scoring results\n")
@@ -89,18 +93,18 @@ def score_results(subject, results, weights, pooling):
for result in results :
progress += 1
bar.update(progress)
if not ("content" in result) :
continue
title = result['title']
url = result['url']
snippet = result['content']
if title == subject :
found_original = True
if url in scored_results_urls :
if title in scored_results_titles :
continue
scored_results_urls.append(url)
scored_results_titles.append(title)
# Compute similarity between subject and result

38
src/key2.py Normal file
View file

@@ -0,0 +1,38 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import combinations
vectorizer = TfidfVectorizer(stop_words='english')
def create_queries(subject) :
    """Generate search queries from *subject* using TF-IDF keyword extraction.

    Returns a tuple ``(final_queries, log)``:
    - final_queries: the subject itself followed by one OR-joined,
      quoted query per non-empty combination of the top keywords.
    - log: a markdown fragment listing the extracted keywords.
    """
    print("\n### Getting Keywords ###\n")

    # Fit on the single subject string; with only one document the idf term
    # is degenerate, so this effectively ranks terms by in-subject frequency.
    tfidf_matrix = vectorizer.fit_transform([subject])
    feature_names = vectorizer.get_feature_names_out()

    print("* Preparation done")

    # Term indices sorted by descending tf-idf score.
    sorted_indices = tfidf_matrix[0].toarray()[0].argsort()[::-1]

    keyword_count = 5  # number of top keywords to keep
    text_keywords = [feature_names[i] for i in sorted_indices[:keyword_count]]

    log = f"## Keywords\n\n{text_keywords}\n\n"

    # Every non-empty combination of keywords becomes one OR-query.
    queries = []
    for r in range(1, len(text_keywords) + 1):
        queries.extend(combinations(text_keywords, r))

    # Keep the raw subject first so the exact phrase is always searched.
    final_queries = [subject] + ["\"" + "\" OR \"".join(query) + "\"" for query in queries]

    print("* query generated")
    return final_queries, log

24
src/key3.py Normal file
View file

@@ -0,0 +1,24 @@
from groq import Groq
from src.api import KEY
client = Groq(
api_key=KEY,
)
def create_queries(subject) :
    """Ask the Groq LLM to generate Google Scholar queries for *subject*.

    Returns a tuple ``(queries, log)``:
    - queries: list of str, one query per line of the model's response
      (NOTE(review): may contain blank or preamble lines if the model
      deviates from the prompt — no post-filtering is done here).
    - log: always "" — presumably kept for interface parity with the
      other create_queries implementations; verify against callers.
    """
    # Network call to the Groq chat-completion API using the module-level
    # client (authenticated with KEY from src.api).
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"Generate 15 google scholar queries from this subject : \"{subject}\" Your response should only contain the queries, no title, no quotation marks, no numbers, one per line.",
            }
        ],
        model="llama3-8b-8192",
    )

    log = ""
    # The prompt requests one query per line, so split on newlines.
    queries = chat_completion.choices[0].message.content.split("\n")
    return queries, log

View file

@@ -1,25 +1,24 @@
import requests
from groq import Groq
subject = "Experiments, numerical models and optimization of carbon-epoxy plates damped by a frequency-dependent interleaved viscoelastic layer"
client = Groq(
api_key=KEY,
)
searxng_url = "https://search.penwing.org/search"
def create_queries(subject) :
params = {
"q": subject, # Your search query
"format": "json", # Requesting JSON format
"categories": "science", # You can specify categories (optional)
}
chat_completion = client.chat.completions.create(
messages=[
{
"role": "user",
"content": f"Generate 15 google scholar queries from this subject : \"{subject}\" Your response should only contain the queries, no title, no quotation marks, no numbers, one per line.",
}
],
model="llama3-8b-8192",
)
response = requests.get(searxng_url, params=params)
log = ""
if response.status_code == 200:
data = response.json()
queries = chat_completion.choices[0].message.content.split("\n")
print(queries)
# List to store results with similarity scores
scored_results = []
for result in data.get("results", []):
print(result['title'])
print("---")
else:
print(f"Error: {response.status_code}")
return queries, log