| | from fuzzywuzzy import fuzz |
| | from google.genai import Client, types |
| | from datasets import load_dataset |
| | import json |
| | import os |
| |
|
| |
|
def search_and_retrieve(user_input, config):
    """Look up the stored technologies nearest to ``user_input``.

    The text is embedded with ``config["model"]`` and searched against the
    ``'embeddings'`` index of ``config["dataset"]`` (5 neighbours requested).

    Args:
        user_input: Text to embed and search for.
        config: Mapping with a ``"dataset"`` entry exposing
            ``get_nearest_examples`` and a ``"model"`` entry exposing
            ``encode``.

    Returns:
        A dict with ``"title"``, ``"purpose"`` and ``"score"`` taken from the
        first returned row, plus ``"top5"``: every returned row as a dict,
        without its ``"embeddings"`` column and with a float ``"score"``.

    Raises:
        IndexError: If the search returns no rows.
    """
    dataset = config["dataset"]
    model = config["model"]

    query_embedding = model.encode(user_input)
    nearest = dataset.get_nearest_examples("embeddings", query_embedding, k=5)
    scores = nearest.scores
    # Column-oriented result: {column name: [one value per returned row]}.
    columns = nearest.examples

    rows = []
    for index in range(len(columns["name"])):
        row = {
            key: values[index]
            for key, values in columns.items()
            if key != "embeddings"  # vectors are not useful to callers
        }
        # Set last so it wins over any dataset column that is also named "score".
        row["score"] = float(scores[index])
        rows.append(row)

    if not rows:
        # Fail with an explicit message instead of an opaque "list index out
        # of range" from the lookup of the first row below.
        raise IndexError("nearest-neighbour search returned no examples")

    first = rows[0]
    final_output = {
        "title": first["name"],
        "purpose": first["purpose"],
        "score": first["score"],
        "top5": rows,
    }
    print(final_output)

    return final_output
| |
|
| |
|
def _extract_first_json_object(text):
    """Parse and return the first JSON object embedded in ``text``.

    Newlines are removed first, because a raw newline inside a JSON string is
    rejected by the strict decoder. Decoding then starts at the first ``{``
    and stops at the end of that object, so nested objects and ``}``
    characters inside strings are handled and trailing text is ignored.

    Raises:
        json.JSONDecodeError: If there is no ``{`` or the object is invalid.
    """
    flattened = text.replace("\n", "")
    start = flattened.find("{")
    if start == -1:
        raise json.JSONDecodeError(
            "no JSON object found in model response", flattened, 0
        )
    parsed, _end = json.JSONDecoder().raw_decode(flattened, start)
    return parsed


def generate_tech(user_input, user_instructions):
    """Ask Gemini to describe the technology discussed in ``user_input``.

    Builds the prompt below, sends it to the ``gemini-2.5-flash`` model with
    the Google Search tool enabled, and parses the first JSON object found in
    the text of the response.

    Args:
        user_input: Source text describing the technology.
        user_instructions: Extra guidance interpolated into the prompt.

    Returns:
        The decoded JSON object (a dict) produced by the model.

    Raises:
        json.JSONDecodeError: If the response contains no valid JSON object.
    """
    # NOTE(review): the prompt asks for JSON but also mentions a YAML schema
    # and a list starting with a hyphen, and its example object has no commas
    # between members. The wording is kept unchanged here; consider cleaning
    # it up separately.
    prompt = f"""
# ROLE

You are a meticulous senior technical analyst and technology scout. Your task is to generate a technology into a structured JSON object.

# OBJECTIVE

Analyze the provided `<USER_INPUT>`. Identify what is technology discussed, focus on the highest level of the technology.
Create a complete JSON object according to the schema below.
Your final output must be a single, valid JSON document containing a technology you created.
The technology should be described with sentences.

# INSTRUCTIONS & RULES

1. **JSON List Output**: Your entire response MUST be a single JSON code block starting with a hyphen (`-`) to denote a list.
Do not include any explanatory text before or after the JSON.
2. **Discover and Iterate**: Your primary task is to understand the technology and create a JSON entry for it.
3. **Descriptive Sentences**: You MUST write clear, full sentences that describe the technology's abilities and the issues it resolves.
Do not use single keywords.
4. **Infer Where Necessary**: The source material may not contain all details. Infer plausible information based on the context.

# YAML SCHEMA & EXAMPLE

Your output must be a list of YAML objects matching this structure. Note how `functional_capabilities` and `problem_types_solved` contain full sentences.

{{"name": "Generative Watermarking"
"purpose": "Add an invisible, machine-readable tags to content generated by AI models and enables the tracing and authentication of digital media to its source."
"problem_types_solved": "Helps to combat digital misinformation by providing a method to verify content authenticity and addresses the erosion of trust in digital media caused by the proliferation of deepfakes."
"advantages": "Way faster to generate by an AI"
"limitations": "Takes a lot of computational time to generate"
"domain_tags": "Present in the domains of : AI ethics, cybersecurity, digital media, content moderation"
}}

Take into account those additionnal informations if there is any:
{user_instructions}
---
***NOW, BEGIN THE TASK.***

<USER_INPUT>
{user_input}
</USER_INPUT>
"""

    # One client is enough; the API key is read from the environment.
    client = Client(api_key=os.getenv("GEMINI_API_KEY"))

    grounding_tool = types.Tool(google_search=types.GoogleSearch())
    generation_config = types.GenerateContentConfig(tools=[grounding_tool])

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt,
        config=generation_config,
    )

    # Presumably ``response.text`` can be None when the reply carries no text
    # part -- confirm against the SDK. Treat that like an empty reply so the
    # failure surfaces as a JSONDecodeError rather than an AttributeError.
    return _extract_first_json_object(response.text or "")
| |
|
| |
|
def send_to_dataset(data, model, repo_id="OrganizedProgrammers/Technologies"):
    """Embed ``data`` and append it as a new row of a Hub dataset.

    Args:
        data: Mapping describing one technology. It is left unmodified; the
            embedding is added to a copy.
        model: Object whose ``encode(text)`` returns the embedding to store.
        repo_id: Hub dataset repository to load from and push back to.

    Returns:
        None.
    """
    # Copy first so the caller's dict does not gain an "embeddings" entry.
    record = dict(data)
    # The embedding is computed from the string form of the original fields.
    record["embeddings"] = model.encode(str(data))

    # NOTE(review): the whole "train" split is loaded, extended and pushed
    # back; concurrent callers could presumably overwrite each other's rows.
    dataset = load_dataset(repo_id, split="train")
    updated_dataset = dataset.add_item(record)
    updated_dataset.push_to_hub(repo_id)