| """ |
| Document Classification Schemas |
| |
| Pydantic models for document type classification and categorization. |
| """ |
|
|
| from enum import Enum |
| from typing import List, Dict, Any, Optional |
| from pydantic import BaseModel, Field |
|
|
| from .core import EvidenceRef |
|
|
|
|
class DocumentType(str, Enum):
    """
    Common document types for classification.
    Extensible for domain-specific types.
    """

    # Members subclass ``str``, so each one compares equal to its value.
    # Declaration order is the iteration order of ``list(DocumentType)``;
    # keep new members inside the matching group.

    # --- Legal / commercial ---
    CONTRACT = "contract"
    INVOICE = "invoice"
    RECEIPT = "receipt"
    PURCHASE_ORDER = "purchase_order"
    AGREEMENT = "agreement"
    NDA = "nda"
    TERMS_OF_SERVICE = "terms_of_service"

    # --- Technical / scientific ---
    PATENT = "patent"
    RESEARCH_PAPER = "research_paper"
    TECHNICAL_REPORT = "technical_report"
    SPECIFICATION = "specification"
    DATASHEET = "datasheet"
    USER_MANUAL = "user_manual"

    # --- Financial ---
    FINANCIAL_REPORT = "financial_report"
    BANK_STATEMENT = "bank_statement"
    TAX_FORM = "tax_form"
    BALANCE_SHEET = "balance_sheet"
    INCOME_STATEMENT = "income_statement"

    # --- Identity / forms ---
    ID_DOCUMENT = "id_document"
    PASSPORT = "passport"
    DRIVERS_LICENSE = "drivers_license"
    CERTIFICATE = "certificate"
    FORM = "form"
    APPLICATION = "application"

    # --- Medical / insurance ---
    MEDICAL_RECORD = "medical_record"
    PRESCRIPTION = "prescription"
    LAB_REPORT = "lab_report"
    INSURANCE_CLAIM = "insurance_claim"

    # --- General correspondence and publications ---
    LETTER = "letter"
    EMAIL = "email"
    MEMO = "memo"
    PRESENTATION = "presentation"
    SPREADSHEET = "spreadsheet"
    REPORT = "report"
    ARTICLE = "article"
    BOOK = "book"

    # --- Fallbacks ---
    OTHER = "other"
    UNKNOWN = "unknown"
|
|
|
|
class ClassificationScore(BaseModel):
    """Score for a single document type classification."""

    # Required: the candidate type this score applies to.
    document_type: DocumentType = Field(..., description="Document type")

    # Required: validated to lie in the closed range [0.0, 1.0].
    confidence: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="Classification confidence",
    )

    # Optional free-text justification; None when not supplied.
    reasoning: Optional[str] = Field(
        None,
        description="Reasoning for classification",
    )
|
|
|
|
class DocumentClassification(BaseModel):
    """
    Document classification result with confidence scores.
    """

    # Identifier of the classified document (required).
    document_id: str = Field(..., description="Document identifier")

    # --- Primary result ---
    # Both fields are required; the confidence is validated to [0.0, 1.0].
    primary_type: DocumentType = Field(..., description="Most likely document type")
    primary_confidence: float = Field(
        ...,
        ge=0.0,
        le=1.0,
        description="Confidence in primary classification"
    )

    # --- Per-type scores ---
    # Defaults to an empty list, so it may not contain the primary type.
    scores: List[ClassificationScore] = Field(
        default_factory=list,
        description="Scores for all considered types"
    )

    # --- Supporting evidence ---
    evidence: List[EvidenceRef] = Field(
        default_factory=list,
        description="Evidence supporting classification"
    )

    # --- Provenance ---
    method: str = Field(
        default="llm",
        description="Classification method used (llm/rule-based/hybrid)"
    )
    model_used: Optional[str] = Field(default=None, description="Model used for classification")

    # --- Quality flags ---
    # These are plain stored values; nothing in this class derives them from
    # the confidence fields.
    is_confident: bool = Field(
        default=True,
        description="Whether classification meets confidence threshold"
    )
    warnings: List[str] = Field(default_factory=list, description="Classification warnings")
    needs_human_review: bool = Field(
        default=False,
        description="Whether human review is recommended"
    )

    # --- Free-form extras ---
    attributes: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional detected attributes (language, domain, etc.)"
    )

    def get_top_k(self, k: int = 3) -> List[ClassificationScore]:
        """Return up to ``k`` entries of ``scores``, highest confidence first.

        Args:
            k: Maximum number of entries to return. Values less than or
                equal to zero yield an empty list.

        Returns:
            A new list; ``scores`` itself is not reordered. Entries with
            equal confidence keep their relative order from ``scores``.
        """
        if k <= 0:
            # A negative k would otherwise slice from the end of the ranking
            # and silently drop the lowest entries instead of limiting it.
            return []
        ranked = sorted(self.scores, key=lambda s: s.confidence, reverse=True)
        return ranked[:k]

    def is_type(self, doc_type: DocumentType, min_confidence: float = 0.5) -> bool:
        """Check if document is classified as a specific type with minimum confidence.

        Args:
            doc_type: The document type to look for.
            min_confidence: Inclusive lower bound a matching confidence
                must reach.

        Returns:
            True when the primary classification, or any entry in
            ``scores``, has type ``doc_type`` with a confidence of at least
            ``min_confidence``; False otherwise.
        """
        # The primary result counts on its own: ``scores`` defaults to empty
        # and is not guaranteed to repeat the primary type.
        if self.primary_type == doc_type and self.primary_confidence >= min_confidence:
            return True
        return any(
            entry.document_type == doc_type and entry.confidence >= min_confidence
            for entry in self.scores
        )
|
|
|
|
class DocumentCategoryRule(BaseModel):
    """
    Rule for rule-based document classification.
    """

    # --- Identity (both required) ---
    name: str = Field(..., description="Rule name")
    document_type: DocumentType = Field(..., description="Target document type")

    # --- Match criteria ---
    # Each list defaults to empty. How the entries are matched is decided by
    # whatever consumes this model, not by this class.
    title_keywords: List[str] = Field(
        default_factory=list, description="Keywords to match in title"
    )
    content_keywords: List[str] = Field(
        default_factory=list, description="Keywords to match in content"
    )
    required_sections: List[str] = Field(
        default_factory=list, description="Required section headings"
    )
    file_patterns: List[str] = Field(
        default_factory=list, description="Filename patterns (regex)"
    )

    # --- Confidence parameters ---
    # base_confidence is validated to [0.0, 1.0]; keyword_boost to [0.0, 0.2].
    base_confidence: float = Field(
        0.8, ge=0.0, le=1.0, description="Base confidence when rule matches"
    )
    keyword_boost: float = Field(
        0.05, ge=0.0, le=0.2, description="Confidence boost per matched keyword"
    )

    # --- Ordering ---
    # Unconstrained integer; defaults to 0.
    priority: int = Field(0, description="Rule priority (higher = checked first)")
|
|
|
|
class ClassificationConfig(BaseModel):
    """
    Configuration for document classification.
    """

    # --- Thresholds ---
    # Both are validated to [0.0, 1.0]. No ordering between the two values is
    # enforced by this model.
    min_confidence: float = Field(
        0.6, ge=0.0, le=1.0, description="Minimum confidence for classification"
    )
    human_review_threshold: float = Field(
        0.7, ge=0.0, le=1.0, description="Below this, flag for human review"
    )

    # --- Strategy selection ---
    use_llm: bool = Field(True, description="Use LLM for classification")
    use_rules: bool = Field(True, description="Use rule-based classification")
    # Free-form string: the accepted values are listed in the description
    # only and are not validated here.
    hybrid_mode: str = Field(
        "llm_primary",
        description="Hybrid mode: llm_primary, rules_primary, or ensemble",
    )

    # --- Rules ---
    custom_rules: List[DocumentCategoryRule] = Field(
        default_factory=list, description="Custom classification rules"
    )

    # --- Type filter ---
    # Default is a fresh list of every DocumentType member, built per instance.
    enabled_types: List[DocumentType] = Field(
        default_factory=lambda: list(DocumentType),
        description="Document types to consider",
    )

    # --- Evidence ---
    require_evidence: bool = Field(
        True, description="Require evidence for classification"
    )
    # Plain int with no bounds applied by this model.
    max_evidence_snippets: int = Field(
        3, description="Maximum evidence snippets to include"
    )
|
|