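# This script creates a tiny randomly-initialized VOPT model (an OPT language
# model with a CLIP vision tower) whose checkpoint is only meant for tests:
# it exercises the full machinery without caring about generation quality.
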
from pathlib import Path
from types import SimpleNamespace

import torchvision.transforms as transforms
from PIL import Image

from m4.models.vopt.modeling_vopt import VOPTConfig, VOPTForCausalLM
from m4.training.packing import image_attention_mask_for_packed_input_ids, incremental_to_binary_attention_mask
from m4.training.utils import get_tokenizer

mname_tiny = "tiny-random-vopt-clip"

path = Path(mname_tiny)
path.mkdir(parents=True, exist_ok=True)

# account for the two extra tokens (<fake_token_around_image> and <image>) added to the tokenizer below
additional_vocab_size = 2

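# shrink every model dimension so the resulting checkpoint stays tiny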
config = VOPTConfig()
config.update(
    dict(
        ffn_dim=64,
        hidden_size=16,
        max_position_embeddings=128,
        num_attention_heads=4,
        num_hidden_layers=2,
        word_embed_proj_dim=16,
        max_new_tokens=100,
        use_resampler=True,
        resampler_depth=2,
        resampler_head_dim=8,
        resampler_n_heads=2,
        resampler_n_latents=16,
        vision_embed_dim=32,
        vision_image_size=30,
        vision_model_name="hf-internal-testing/tiny-random-clip",
        vision_model_params="{}",
        vocab_size=50265,
        additional_vocab_size=additional_vocab_size,
    )
)

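# instantiate a randomly initialized model from the tiny config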
model = VOPTForCausalLM.from_config(config)

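# tokenizer settings in the shape m4's get_tokenizer expects: the OPT
# tokenizer extended with the two special image tokens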
tokenizer_config = dict(
    tokenizer_add_special_tokens="{}",
    tokenizer_add_tokens=(
        '[AddedToken("<fake_token_around_image>", rstrip=False, lstrip=False), AddedToken("<image>", rstrip=False,'
        " lstrip=False)]"
    ),
    tokenizer_name="facebook/opt-13b",
    tokenizer_params='{"use_fast":True}',
)
tokenizer_config = SimpleNamespace(**tokenizer_config)

tokenizer = get_tokenizer(
    tokenizer_name=tokenizer_config.tokenizer_name,
    tokenizer_add_tokens=tokenizer_config.tokenizer_add_tokens,
    tokenizer_add_special_tokens=tokenizer_config.tokenizer_add_special_tokens,
    tokenizer_params=tokenizer_config.tokenizer_params,
    additional_vocab_size=model.config.additional_vocab_size,
    model_vocab_size=model.config.vocab_size,
)
assert "<image>" in tokenizer.get_vocab()

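# smoke-test the model on a single image + text query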
query = "<fake_token_around_image><image><fake_token_around_image>This is a picture of a cat."
query_tokens = tokenizer(query, return_tensors="pt")

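# dummy inputs: one blank 30x30 RGB image per example, plus a mask that tells
# each text token which image it may attend to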
num_images_per_ex = 1
pixel_values = transforms.ToTensor()(Image.new("RGB", (30, 30))).repeat(1, 1, 1, 1).unsqueeze(0)
image_attention_mask, _ = image_attention_mask_for_packed_input_ids(query_tokens["input_ids"], tokenizer)
image_attention_mask = incremental_to_binary_attention_mask(image_attention_mask, num_classes=num_images_per_ex)

inputs = {
    "input_ids": query_tokens["input_ids"],
    "attention_mask": query_tokens["attention_mask"],
    "pixel_values": pixel_values,
    "image_attention_mask": image_attention_mask,
}

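# run a short generation to check the whole pipeline end to end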
out_gen = model.generate(**inputs)
text = tokenizer.batch_decode(out_gen)

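# fp16 halves the size of the saved checkpoint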
model.half()
model.save_pretrained(path)
tokenizer.save_pretrained(path)

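# sanity check: the saved checkpoint can be loaded back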
model = VOPTForCausalLM.from_pretrained(path)

print(f"Generated {mname_tiny} - Upload the generated folder to the hub")