Spaces:
Sleeping
Sleeping
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| from typing import Sequence, List, Tuple | |
| from app.vectorizer import Vectorizer | |
| from app.scorer import cosine_similarity | |
class PromptSearchEngine:
    """Similarity search over a fixed corpus of prompts.

    The corpus is vectorized once in ``__init__``; every query is vectorized
    with the same ``Vectorizer`` and ranked against the stored corpus vectors
    using ``cosine_similarity``.
    """

    def __init__(self, prompts: Sequence[str]) -> None:
        """Vectorize the prompt corpus and keep it for later queries.

        Args:
            prompts: The sequence of raw prompts from the dataset. The
                sequence is stored as-is (not copied) so that results can be
                returned verbatim.
        """
        self.prompts = prompts
        model = SentenceTransformer("all-MiniLM-L6-v2")
        self.vectorizer = Vectorizer(model)
        self.corpus_vectors = self.vectorizer.transform(prompts)

    def most_similar(self, query: str, n: int = 5) -> List[Tuple[float, str]]:
        """Return the prompts from the corpus most similar to ``query``.

        Args:
            query: The raw query prompt input from the user.
            n: The maximum number of prompts to return. Values of zero or
                below yield an empty list.

        Returns:
            A list of ``(similarity, prompt)`` tuples ordered from most to
            least similar. It holds at most ``n`` entries (fewer when the
            corpus is smaller than ``n``); prompts are returned verbatim and
            scores are plain Python floats.
        """
        # Guard first: a slice of ``[-0:]`` would select the *entire* ranking
        # and a negative ``n`` would select an arbitrary tail, so non-positive
        # requests (and an empty corpus) short-circuit to "no results".
        if n <= 0 or len(self.prompts) == 0:
            return []

        # The query is vectorized as a one-item batch, mirroring how the
        # corpus was vectorized in ``__init__``.
        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.corpus_vectors)

        # NOTE(review): the scorer's return shape is not visible from here.
        # Flattening keeps the ranking correct whether it yields one score per
        # prompt as shape (N,) or as a single-row (1, N) array -- confirm
        # against app.scorer.
        scores = np.asarray(similarities).reshape(-1)

        # argsort is ascending: take the last ``n`` indices, then reverse so
        # the best match comes first.
        top_indices = np.argsort(scores)[-n:][::-1]

        # Convert NumPy scalars to built-in float/int for the caller.
        return [(float(scores[i]), self.prompts[int(i)]) for i in top_indices]