Instructions to use anthonym21/json-tokenizer-structured with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use anthonym21/json-tokenizer-structured with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="anthonym21/json-tokenizer-structured")

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("anthonym21/json-tokenizer-structured", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use anthonym21/json-tokenizer-structured with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "anthonym21/json-tokenizer-structured"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "anthonym21/json-tokenizer-structured",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker
docker model run hf.co/anthonym21/json-tokenizer-structured
- SGLang
How to use anthonym21/json-tokenizer-structured with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "anthonym21/json-tokenizer-structured" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "anthonym21/json-tokenizer-structured",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "anthonym21/json-tokenizer-structured" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "anthonym21/json-tokenizer-structured",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use anthonym21/json-tokenizer-structured with Docker Model Runner:
docker model run hf.co/anthonym21/json-tokenizer-structured
| """HuggingFace Transformers-compatible wrapper for JSONTokenizer. | |
| Provides JSONPreTrainedTokenizer, a PreTrainedTokenizer subclass that | |
| wraps JSONTokenizer for use with the HuggingFace ecosystem: | |
| - save_pretrained / from_pretrained | |
| - AutoTokenizer.from_pretrained (with trust_remote_code=True) | |
| - tokenizer(json_string) -> BatchEncoding | |
| - Padding, truncation, batch processing, return_tensors | |
| Requires: pip install json-tokenizer[huggingface] | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| from typing import Any, Dict, List, Optional, Tuple, Union | |
| try: | |
| from transformers import PreTrainedTokenizer | |
| except ImportError: | |
| raise ImportError( | |
| "The HuggingFace transformers library is required for this module. " | |
| "Install it with: pip install json-tokenizer[huggingface]" | |
| ) | |
| from json_tokenizer.tokenizer import JSONTokenizer, StructuralTokens | |
| from json_tokenizer.bpe import BPETrainer | |
# Maps the HF "vocab_file" slot to the on-disk file name used by this tokenizer.
VOCAB_FILES_NAMES = {"vocab_file": "json_tokenizer_vocab.json"}

# Structural token ID -> HF-compatible string name.
# Names use the <name> format, which cannot collide with BPE tokens because
# the BPE pre-tokenizer splits <, >, : into separate tokens.
_STRUCTURAL_TOKEN_NAMES = dict(
    (
        (StructuralTokens.PAD, "<pad>"),
        (StructuralTokens.START, "<s>"),
        (StructuralTokens.END, "</s>"),
        (StructuralTokens.OBJ_START, "<obj_start>"),
        (StructuralTokens.OBJ_END, "<obj_end>"),
        (StructuralTokens.ARR_START, "<arr_start>"),
        (StructuralTokens.ARR_END, "<arr_end>"),
        (StructuralTokens.COLON, "<colon>"),
        (StructuralTokens.COMMA, "<comma>"),
        (StructuralTokens.NULL, "<null>"),
        (StructuralTokens.TRUE, "<true>"),
        (StructuralTokens.FALSE, "<false>"),
        (StructuralTokens.STR_DELIM, "<str_delim>"),
        (StructuralTokens.NUM_PREFIX, "<num_prefix>"),
        (StructuralTokens.KEY_PREFIX, "<key_prefix>"),
        (StructuralTokens.UNK, "<unk>"),
    )
)

# Reverse lookup: HF string name -> structural token ID.
_STRUCTURAL_NAME_TO_ID = {
    token_name: token_id
    for token_id, token_name in _STRUCTURAL_TOKEN_NAMES.items()
}
class JSONPreTrainedTokenizer(PreTrainedTokenizer):
    """HuggingFace-compatible wrapper around JSONTokenizer.

    The wrapper keeps a unified ``{token_string: id}`` vocabulary covering
    structural tokens, reserved tokens, key-vocabulary tokens and BPE tokens,
    and delegates the actual encoding/decoding to the wrapped JSONTokenizer.

    Usage:
        # From a trained JSONTokenizer:
        tok = JSONTokenizer(bpe_vocab_size=4096)
        tok.train(data)
        hf_tok = JSONPreTrainedTokenizer.from_json_tokenizer(tok)

        # Encode/decode via HF API:
        output = hf_tok('{"name": "Alice", "age": 30}')
        print(output["input_ids"])
        print(hf_tok.decode(output["input_ids"]))

        # Save and reload:
        hf_tok.save_pretrained("./my_tokenizer")
        loaded = JSONPreTrainedTokenizer.from_pretrained("./my_tokenizer")
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file: Optional[str] = None,
        unk_token: str = "<unk>",
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        pad_token: str = "<pad>",
        **kwargs,
    ):
        """Initialise the wrapper, optionally loading state from ``vocab_file``.

        Args:
            vocab_file: Path to a file written by ``save_vocabulary``. It is
                only loaded when it is not None and exists on disk.
            unk_token: String used for the unknown token.
            bos_token: String used for the beginning-of-sequence token.
            eos_token: String used for the end-of-sequence token.
            pad_token: String used for the padding token.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.
        """
        # Internal state — populated from vocab_file or from_json_tokenizer.
        # The hasattr guards keep whatever from_json_tokenizer already placed
        # on the instance before it invoked __init__.
        if not hasattr(self, "_json_tokenizer"):
            self._json_tokenizer: Optional[JSONTokenizer] = None
        if not hasattr(self, "_hf_vocab"):
            self._hf_vocab: Dict[str, int] = {}
        if not hasattr(self, "_hf_id_to_token"):
            self._hf_id_to_token: Dict[int, str] = {}

        if vocab_file is not None and os.path.isfile(vocab_file):
            self._load_vocab_file(vocab_file)

        # NOTE(review): the vocabulary is set up before calling the base
        # initialiser, presumably because the base class looks tokens up
        # during its own __init__ — confirm against the transformers version
        # in use.
        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            **kwargs,
        )

    # -- Factory -----------------------------------------------------------

    @classmethod
    def from_json_tokenizer(
        cls, tokenizer: JSONTokenizer, **kwargs
    ) -> "JSONPreTrainedTokenizer":
        """Create from a trained JSONTokenizer instance.

        Args:
            tokenizer: A trained JSONTokenizer.
            **kwargs: Additional arguments passed to __init__.

        Returns:
            A new JSONPreTrainedTokenizer wrapping the provided tokenizer.

        Raises:
            ValueError: If ``tokenizer`` has not been trained.
        """
        if not tokenizer._trained:
            raise ValueError("JSONTokenizer must be trained before wrapping.")

        # Allocate without running __init__ so the wrapped tokenizer and the
        # unified vocabulary are in place before __init__ executes.
        instance = cls.__new__(cls)
        instance._json_tokenizer = tokenizer
        instance._hf_vocab = {}
        instance._hf_id_to_token = {}
        instance._build_hf_vocab()
        instance.__init__(vocab_file=None, **kwargs)
        return instance

    # -- Vocab building ----------------------------------------------------

    def _load_vocab_file(self, vocab_file: str) -> None:
        """Reconstruct a JSONTokenizer from our saved vocab file.

        Reads the JSON document produced by ``save_vocabulary`` (keys
        ``config``, ``key_vocab`` and ``bpe_model``), rebuilds the
        JSONTokenizer and its BPETrainer, then refreshes the unified
        HF vocabulary.

        Args:
            vocab_file: Path of the JSON vocabulary file to read.
        """
        with open(vocab_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        config = data["config"]
        tok = JSONTokenizer(
            bpe_vocab_size=config["bpe_vocab_size"],
            max_key_vocab=config["max_key_vocab"],
            min_key_freq=config["min_key_freq"],
            bpe_min_freq=config["bpe_min_freq"],
        )

        # IDs are coerced to int on load.
        tok._key_to_id = {k: int(v) for k, v in data["key_vocab"].items()}
        tok._id_to_key = {int(v): k for k, v in data["key_vocab"].items()}
        tok._key_offset = config["key_offset"]
        tok._bpe_offset = config["bpe_offset"]

        bpe_data = data["bpe_model"]
        bpe = BPETrainer(
            vocab_size=bpe_data["vocab_size"],
            min_frequency=bpe_data["min_frequency"],
        )
        # JSON stores merges as lists; restore them as tuples.
        bpe.merges = [tuple(m) for m in bpe_data["merges"]]
        bpe.vocab = bpe_data["vocab"]
        # NOTE(review): resetting _id_to_tok presumably invalidates a cached
        # reverse lookup inside BPETrainer — confirm against its source.
        bpe._id_to_tok = None

        tok._bpe = bpe
        tok._build_vocab_lookup()
        tok._trained = True

        self._json_tokenizer = tok
        self._build_hf_vocab()

    def _build_hf_vocab(self) -> None:
        """Build the unified {token_string: id} mapping across all tiers.

        Fills ``_hf_vocab`` and ``_hf_id_to_token`` from scratch with, in
        order: structural tokens, reserved tokens, key-vocabulary tokens and
        BPE tokens (shifted by the wrapped tokenizer's BPE offset).
        """
        tok = self._json_tokenizer
        self._hf_vocab = {}
        self._hf_id_to_token = {}

        # Structural tokens (0-15)
        for tid, name in _STRUCTURAL_TOKEN_NAMES.items():
            self._hf_vocab[name] = tid
            self._hf_id_to_token[tid] = name

        # Reserved tokens (16-31)
        for tid in range(16, StructuralTokens.RESERVED_END):
            name = f"<reserved_{tid}>"
            self._hf_vocab[name] = tid
            self._hf_id_to_token[tid] = name

        # Key vocabulary tokens
        for key_str, tid in tok._key_to_id.items():
            name = f"<key:{key_str}>"
            self._hf_vocab[name] = tid
            self._hf_id_to_token[tid] = name

        # BPE tokens
        for bpe_token, bpe_local_id in tok._bpe.vocab.items():
            full_id = tok._bpe_offset + bpe_local_id
            # Collision guard (only <UNK> from BPE could theoretically collide):
            # a clashing BPE token is stored under a "bpe:" prefixed name.
            if bpe_token in self._hf_vocab:
                bpe_token_name = f"bpe:{bpe_token}"
            else:
                bpe_token_name = bpe_token
            self._hf_vocab[bpe_token_name] = full_id
            self._hf_id_to_token[full_id] = bpe_token_name

    # -- Required PreTrainedTokenizer overrides ----------------------------

    @property
    def vocab_size(self) -> int:
        """Size of the wrapped tokenizer's vocabulary.

        Falls back to the number of structural tokens when no JSONTokenizer
        has been loaded yet.
        """
        if self._json_tokenizer is None:
            return len(_STRUCTURAL_TOKEN_NAMES)
        return self._json_tokenizer.vocab_size

    def get_vocab(self) -> Dict[str, int]:
        """Return a copy of the unified vocabulary, including added tokens."""
        vocab = dict(self._hf_vocab)
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        """Tokenize a JSON string into HF token strings.

        The HF pipeline calls: tokenize(text) -> _tokenize -> list[str]
        then convert_tokens_to_ids maps those to IDs.

        We encode via JSONTokenizer, drop the leading START / trailing END
        (HF adds special tokens via build_inputs_with_special_tokens), then
        convert IDs to our HF token string names.

        Args:
            text: The text to tokenize; expected to be a JSON document.
            **kwargs: Accepted for HF API compatibility; unused here.

        Returns:
            Token strings; ``[unk_token]`` when no tokenizer is loaded.
        """
        if self._json_tokenizer is None:
            return [self.unk_token]

        try:
            ids = self._json_tokenizer.encode(text)
        except ValueError:
            # Not valid JSON — encode as raw string via BPE.
            # json.JSONDecodeError is a ValueError subclass, so this single
            # clause covers both.
            ids = [StructuralTokens.START]
            self._json_tokenizer._encode_string(text, ids)
            ids.append(StructuralTokens.END)

        # Strip START/END — HF adds them via build_inputs_with_special_tokens
        if ids and ids[0] == StructuralTokens.START:
            ids = ids[1:]
        if ids and ids[-1] == StructuralTokens.END:
            ids = ids[:-1]

        return [self._hf_id_to_token.get(tid, self.unk_token) for tid in ids]

    def _convert_token_to_id(self, token: str) -> int:
        """Map a token string to its ID, falling back to the unknown token."""
        return self._hf_vocab.get(
            token, self._hf_vocab.get(self.unk_token, StructuralTokens.UNK)
        )

    def _convert_id_to_token(self, index: int) -> str:
        """Map an ID to its token string, falling back to ``unk_token``."""
        return self._hf_id_to_token.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Reconstruct a JSON string from token strings.

        Converts token strings -> IDs, wraps with START/END,
        and delegates to JSONTokenizer.decode().

        Args:
            tokens: Token strings as produced by ``_tokenize``.

        Returns:
            The decoded string; an empty string when no tokenizer is loaded;
            the tokens joined by single spaces when decoding fails.
        """
        if self._json_tokenizer is None:
            return ""

        ids = [StructuralTokens.START]
        ids.extend(self._convert_token_to_id(token) for token in tokens)
        ids.append(StructuralTokens.END)

        try:
            return self._json_tokenizer.decode(ids)
        except Exception:
            # Deliberate best-effort: partial or malformed token sequences
            # still yield a readable string instead of raising.
            return " ".join(tokens)

    # -- Special tokens ----------------------------------------------------

    def build_inputs_with_special_tokens(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
    ) -> List[int]:
        """Wrap with START (bos) and END (eos) tokens.

        Args:
            token_ids_0: IDs of the first sequence.
            token_ids_1: Optional IDs of a second sequence.

        Returns:
            ``bos + ids_0 + eos`` or, for a pair,
            ``bos + ids_0 + eos + bos + ids_1 + eos``.
        """
        bos = [self.bos_token_id]
        eos = [self.eos_token_id]
        if token_ids_1 is None:
            return bos + token_ids_0 + eos
        return bos + token_ids_0 + eos + bos + token_ids_1 + eos

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """1 for special tokens (START/END), 0 for content tokens.

        Args:
            token_ids_0: IDs of the first sequence.
            token_ids_1: Optional IDs of a second sequence.
            already_has_special_tokens: When True the computation is
                delegated to the base class implementation.

        Returns:
            A mask laid out like ``build_inputs_with_special_tokens`` output.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )
        if token_ids_1 is None:
            return [1] + [0] * len(token_ids_0) + [1]
        return (
            [1] + [0] * len(token_ids_0) + [1]
            + [1] + [0] * len(token_ids_1) + [1]
        )

    def create_token_type_ids_from_sequences(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
    ) -> List[int]:
        """Segment IDs: 0 for first sequence, 1 for second.

        Each segment's length includes its own bos and eos tokens.
        """
        bos_eos = 2  # one bos + one eos
        if token_ids_1 is None:
            return [0] * (len(token_ids_0) + bos_eos)
        return [0] * (len(token_ids_0) + bos_eos) + [1] * (len(token_ids_1) + bos_eos)

    # -- Persistence -------------------------------------------------------

    def save_vocabulary(
        self,
        save_directory: str,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        """Save the vocabulary to a single JSON file.

        This file contains everything needed to reconstruct the
        JSONTokenizer: config, key vocab, and BPE model.

        Args:
            save_directory: Existing directory to write into.
            filename_prefix: Optional prefix; joined to the file name
                with a ``-``.

        Returns:
            A one-element tuple holding the path of the written file.

        Raises:
            ValueError: If ``save_directory`` is not a directory, or if no
                JSONTokenizer has been loaded into this wrapper.
        """
        if not os.path.isdir(save_directory):
            raise ValueError(f"Not a directory: {save_directory}")

        tok = self._json_tokenizer
        if tok is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                "Cannot save vocabulary: no JSONTokenizer has been loaded."
            )

        vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "")
            + VOCAB_FILES_NAMES["vocab_file"],
        )

        data = {
            "version": "json-tokenizer-hf-v1",
            "config": {
                "bpe_vocab_size": tok.bpe_vocab_size,
                "max_key_vocab": tok.max_key_vocab,
                "min_key_freq": tok.min_key_freq,
                "bpe_min_freq": tok.bpe_min_freq,
                "key_offset": tok._key_offset,
                "bpe_offset": tok._bpe_offset,
            },
            "key_vocab": tok._key_to_id,
            "bpe_model": {
                "vocab_size": tok._bpe.vocab_size,
                "min_frequency": tok._bpe.min_frequency,
                "merges": [list(m) for m in tok._bpe.merges],
                "vocab": tok._bpe.vocab,
            },
        }

        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        return (vocab_file,)