Spaces:
Sleeping
Sleeping
import numpy as np
# TODO: Move exception classes into a separate file
class DimensionalityMismatchError(ValueError):
    """Raised when the dimensions of query and corpus vectors don't match.

    Subclasses ValueError so callers that already catch ValueError keep
    working.
    """
class ZeroVectorError(ValueError):
    """Raised when a zero vector is encountered.

    Cosine similarity is undefined for a zero vector because its norm is 0.
    Subclasses ValueError so callers that already catch ValueError keep
    working.
    """
class EmptyInputError(ValueError):
    """Raised when the input arrays are empty.

    Subclasses ValueError so callers that already catch ValueError keep
    working.
    """
def cosine_similarity(
    query_vector: np.ndarray,
    corpus_vectors: np.ndarray
) -> np.ndarray:
    """
    Calculate cosine similarity between a query vector and each corpus vector.

    Args:
        query_vector: Vectorized prompt query of shape (1, D).
        corpus_vectors: Vectorized prompt corpus of shape (N, D).

    Returns:
        An array of shape (N,) with values in the range [-1, 1], where 1 means
        the corpus vector points in the same direction as the query, 0 means
        the two are orthogonal and -1 means they point in opposite directions.

    Raises:
        EmptyInputError: If query_vector or corpus_vectors contain no elements.
            This is checked first, so an empty input is always reported as
            empty regardless of its shape.
        DimensionalityMismatchError: If query_vector is not of shape (1, D),
            corpus_vectors is not 2-D, or their feature dimensions differ.
        ZeroVectorError: If query_vector is a zero vector or any corpus vector
            is a zero vector.

    Note:
        - This implementation assumes the use of SentenceTransformer with the
          "all-MiniLM-L6-v2" model.
        - SentenceTransformer embeddings are unlikely to produce zero vectors,
          even for empty or irrelevant inputs.
        - However, checks for zero vectors are included to handle potential
          edge cases and ensure robustness for future modifications or
          alternative embedding models.
    """
    # Accept array-likes (e.g. nested lists) as well as ndarrays; for an
    # ndarray input this is a no-op that returns the same object.
    query_vector = np.asarray(query_vector)
    corpus_vectors = np.asarray(corpus_vectors)

    # Check emptiness before touching shape[1]: an empty 1-D array such as
    # np.array([]) has no second axis and would otherwise fail with an
    # IndexError or a misleading dimensionality error.
    if query_vector.size == 0 or corpus_vectors.size == 0:
        raise EmptyInputError("query_vector and corpus_vectors must not be empty.")

    # Validate input shapes; the ndim guards keep shape[1] below from raising
    # a bare IndexError on 0-D/1-D inputs.
    if query_vector.ndim != 2 or query_vector.shape[0] != 1:
        raise DimensionalityMismatchError(
            f"query_vector must have shape (1, D), but got shape {query_vector.shape}."
        )
    if corpus_vectors.ndim != 2:
        raise DimensionalityMismatchError(
            f"corpus_vectors must have shape (N, D), but got shape {corpus_vectors.shape}."
        )
    if query_vector.shape[1] != corpus_vectors.shape[1]:
        raise DimensionalityMismatchError(
            f"query_vector shape {query_vector.shape} does not match corpus_vectors shape {corpus_vectors.shape}."
        )

    # Compute query norm and check for zero vector
    query_norm = np.linalg.norm(query_vector, axis=1)[0]
    if query_norm == 0:
        raise ZeroVectorError("query_vector must not be a zero vector.")

    # Check if any corpus vector is zero. An alternative policy would be to
    # filter out zero vectors and raise only if all corpus vectors are zero.
    corpus_norms = np.linalg.norm(corpus_vectors, axis=1)
    if np.any(corpus_norms == 0):
        raise ZeroVectorError("corpus_vectors must not contain zero vectors.")

    # Compute cosine similarity: (N, D) @ (D, 1) -> (N, 1), flattened to (N,)
    dot_products = np.dot(corpus_vectors, query_vector.T).flatten()
    similarities = dot_products / (query_norm * corpus_norms)

    # Floating-point rounding can push results marginally outside [-1, 1]
    # (e.g. 1.0000000000000002 for identical vectors); clamp so the documented
    # range holds and downstream calls such as arccos do not produce NaN.
    return np.clip(similarities, -1.0, 1.0)