import torch
|
|
|
class SimpleTokenizer:
    """Character-level tokenizer backed by a torch-serialized vocabulary.

    The vocabulary file is expected to deserialize to a dict mapping
    single characters to integer indices.  An ``'<unk>'`` entry is
    guaranteed to exist after construction so out-of-vocabulary
    characters always have a fallback index.
    """

    def __init__(self, vocab_path):
        """Load the char->index vocabulary from *vocab_path*.

        Args:
            vocab_path: Path to a torch-serialized ``dict[str, int]``.
        """
        # BUG FIX: original referenced the undefined name `vocab_pth`,
        # which raised NameError on every construction.
        self.char_to_idx = torch.load(vocab_path)

        # Ensure an <unk> fallback exists.  `default=-1` makes this safe
        # for an empty vocabulary too (first index becomes 0).
        if '<unk>' not in self.char_to_idx:
            self.char_to_idx['<unk>'] = max(self.char_to_idx.values(), default=-1) + 1

        # Inverse mapping used by decode().
        self.idx_to_char = {i: c for c, i in self.char_to_idx.items()}

    def encode(self, text):
        """Return the list of indices for each character of *text*.

        Unknown characters map to the ``'<unk>'`` index (or 0 if, on a
        hand-built instance, no ``'<unk>'`` entry exists).
        """
        # Hoist the invariant fallback lookup out of the per-char loop.
        unk = self.char_to_idx.get('<unk>', 0)
        return [self.char_to_idx.get(c, unk) for c in text]

    def decode(self, indices):
        """Return the string for *indices*, silently skipping unknown ones."""
        return ''.join(self.idx_to_char.get(i, '') for i in indices)
|
|
|
|
|
if __name__ == "__main__":
    # Demo: round-trip a sample string through the tokenizer.
    # Guarded so importing this module does not trigger file I/O or prints.
    vocab_path = 'vocab.pth'
    tokenizer = SimpleTokenizer(vocab_path)

    text = "Hello, world!"
    tokens = tokenizer.encode(text)
    print(tokens)

    decoded_text = tokenizer.decode(tokens)
    print(decoded_text)
|
|
|