Spaces:

OrganizedProgrammers
/

Search-Technologies-API

Sleeping

Search-Technologies-API / src /processor.py

ALLOUNE

add faiss search

7d1249d 7 months ago

4.16 kB

	from fuzzywuzzy import fuzz
	from google.genai import Client, types
	from datasets import load_dataset
	import json
	import os


	def search_and_retrieve(user_input, config):
	    """Return the indexed entries nearest to ``user_input``.

	    The query is embedded with ``config["model"].encode`` and looked up in the
	    ``'embeddings'`` index of ``config["dataset"]`` with ``k=5``.

	    Args:
	        user_input: Query handed to ``model.encode``.
	        config: Mapping with a ``"dataset"`` entry exposing
	            ``get_nearest_examples`` and a ``"model"`` entry exposing
	            ``encode``.

	    Returns:
	        A dict with ``"title"``, ``"purpose"`` and ``"score"`` taken from the
	        first hit, plus ``"top5"``: the list of all hits. Each hit holds every
	        returned column except ``"embeddings"``, with a float ``"score"``
	        added.

	    Raises:
	        IndexError: If the lookup returns no examples.
	    """
	    dataset = config["dataset"]
	    model = config["model"]

	    query_embedding = model.encode(user_input)
	    nearest = dataset.get_nearest_examples('embeddings', query_embedding, k=5)
	    scores = nearest.scores
	    columns = nearest.examples

	    # The examples come back column-wise (one list per column); rebuild one
	    # dict per hit and attach its score in the same pass.
	    hits = []
	    for position in range(len(columns['name'])):
	        hit = {
	            key: values[position]
	            for key, values in columns.items()
	            if key != "embeddings"
	        }
	        hit["score"] = float(scores[position])
	        hits.append(hit)

	    if not hits:
	        # Same exception type as the former bare ``result[0]`` failure, but
	        # with a message that explains what went wrong.
	        raise IndexError("nearest-example lookup returned no results")

	    best = hits[0]
	    final_output = {
	        "title": best["name"],
	        "purpose": best["purpose"],
	        "score": best["score"],
	        "top5": hits,
	    }
	    print(final_output)

	    return final_output


	def generate_tech(user_input, user_instructions):
	    """Ask Gemini to describe the technology discussed in ``user_input``.

	    Builds a prompt embedding ``user_input`` and ``user_instructions``, sends
	    it to the ``gemini-2.5-flash`` model with the Google Search tool enabled,
	    and decodes the first JSON object found in the reply.

	    Args:
	        user_input: Text interpolated into the ``<USER_INPUT>`` section.
	        user_instructions: Extra guidance interpolated into the prompt.

	    Returns:
	        The decoded JSON object from the model reply.

	    Raises:
	        json.JSONDecodeError: If the reply holds no decodable JSON object.

	    The API key is read from the ``GEMINI_API_KEY`` environment variable.
	    """
	    # NOTE(review): the prompt wording mixes "JSON", "YAML" and "list", and
	    # its example object has no commas between keys, while the parsing below
	    # expects a single JSON object. The text is kept verbatim here; consider
	    # aligning it with the parser.
	    prompt = f"""
	# ROLE

	You are a meticulous senior technical analyst and technology scout. Your task is to generate a technology into a structured JSON object.

	# OBJECTIVE

	Analyze the provided `<USER_INPUT>`. Identify what is technology discussed, focus on the highest level of the technology.
	Create a complete JSON object according to the schema below.
	Your final output must be a single, valid JSON document containing a technology you created.
	The technology should be described with sentences.

	# INSTRUCTIONS & RULES

	1. JSON List Output: Your entire response MUST be a single JSON code block starting with a hyphen (`-`) to denote a list.
	Do not include any explanatory text before or after the JSON.
	2. Discover and Iterate: Your primary task is to understand the technology and create a JSON entry for it.
	3. Descriptive Sentences: You MUST write clear, full sentences that describe the technology's abilities and the issues it resolves.
	Do not use single keywords.
	4. Infer Where Necessary: The source material may not contain all details. Infer plausible information based on the context.

	# YAML SCHEMA & EXAMPLE

	Your output must be a list of YAML objects matching this structure. Note how `functional_capabilities` and `problem_types_solved` contain full sentences.

	{{"name": "Generative Watermarking"
	"purpose": "Add an invisible, machine-readable tags to content generated by AI models and enables the tracing and authentication of digital media to its source."
	"problem_types_solved": "Helps to combat digital misinformation by providing a method to verify content authenticity and addresses the erosion of trust in digital media caused by the proliferation of deepfakes."
	"advantages": "Way faster to generate by an AI"
	"limitations": "Takes a lot of computational time to generate"
	"domain_tags": "Present in the domains of : AI ethics, cybersecurity, digital media, content moderation"
	}}

	Take into account those additionnal informations if there is any:
	{user_instructions}
	---
	*NOW, BEGIN THE TASK.*

	<USER_INPUT>
	{user_input}
	</USER_INPUT>
	"""

	    # A single client is enough; it was previously constructed twice.
	    client = Client(api_key=os.getenv("GEMINI_API_KEY"))

	    # Define the grounding tool
	    grounding_tool = types.Tool(
	        google_search=types.GoogleSearch()
	    )

	    # Configure generation settings
	    config = types.GenerateContentConfig(
	        tools=[grounding_tool]
	    )

	    response = client.models.generate_content(
	        model="gemini-2.5-flash",
	        contents=prompt,
	        config=config,
	    )

	    return _extract_json_object(response.text)


	def _extract_json_object(text):
	    """Decode the first JSON object embedded in ``text``.

	    Newlines are stripped first (as the previous implementation did), then
	    decoding starts at the first ``{`` and stops at the brace that closes
	    that object. Nested objects and ``}`` characters inside string values
	    are therefore handled, and any text after the object is ignored.

	    Args:
	        text: Raw model reply; ``None`` is treated as an empty reply.

	    Returns:
	        The decoded object.

	    Raises:
	        json.JSONDecodeError: If no ``{`` is present or decoding fails.
	    """
	    flattened = (text or "").replace('\n', '')
	    start = flattened.find("{")
	    if start == -1:
	        raise json.JSONDecodeError(
	            "No JSON object found in model response", flattened, 0
	        )
	    parsed, _end = json.JSONDecoder().raw_decode(flattened, start)
	    return parsed


	def send_to_dataset(data, model):
	    """Embed ``data`` and append it to the hub dataset.

	    The embedding is computed from ``str(data)`` with ``model.encode``. The
	    ``train`` split of ``OrganizedProgrammers/Technologies`` is loaded, the
	    new record is added, and the result is pushed back to the same repo.

	    Args:
	        data: Mapping describing the entry to store. It is not modified; the
	            embedding is attached to a copy.
	        model: Object exposing ``encode``.
	    """
	    repo_id = "OrganizedProgrammers/Technologies"

	    # Work on a copy so the caller's dict does not silently gain an
	    # "embeddings" entry as a side effect of uploading.
	    record = dict(data)
	    record["embeddings"] = model.encode(str(data))

	    dataset = load_dataset(repo_id, split="train")
	    updated_dataset = dataset.add_item(record)
	    updated_dataset.push_to_hub(repo_id)