Commit b3d45f6
Parent(s): 8a9e9ed

feat: merged checkpoint, modified qwen, readme

Signed-off-by: jupyterjazz <saba.sturua@jina.ai>
- README.md +5 -9
- adapters/{retrieval/adapter_config.json → adapter_config.json} +0 -0
- adapters/{text-matching/adapter_model.safetensors → adapter_model.safetensors} +2 -2
- adapters/code/adapter_config.json +0 -26
- adapters/code/adapter_model.safetensors +0 -3
- adapters/retrieval/adapter_model.safetensors +0 -3
- adapters/text-matching/adapter_config.json +0 -26
- modeling_jina_embeddings_v4.py +5 -2
- qwen2_5_vl.py +10 -9
README.md
CHANGED
@@ -22,11 +22,9 @@ image_paths = ['/<path_to_image>']
 images = [Image.open(path) for path in image_paths]
 
 # Example 1: Text matching task with single vector embeddings
-model.set_task(task='text-matching')
-
 # Generate embeddings with dimension truncation (256), decrease max_pixels
-img_embeddings = model.encode_images(images=images, truncate_dim=256, max_pixels=602112)
-text_embeddings = model.encode_texts(texts=texts, truncate_dim=256, max_length=512)
+img_embeddings = model.encode_images(images=images, truncate_dim=256, max_pixels=602112, task='text-matching')
+text_embeddings = model.encode_texts(texts=texts, truncate_dim=256, max_length=512, task='text-matching')
 
 # Example 2: Retrieval task with multi-vector embeddings
 model.set_task(task='retrieval')
@@ -36,10 +34,8 @@ img_embeddings = model.encode_images(images=images, vector_type='multi_vector')
 text_embeddings = model.encode_texts(texts=texts, vector_type='multi_vector', prompt_name='passage')
 
 # Example 3: Code task with single vector embeddings
-model.set_task(task='code')
-
 code = ["def hello_world():\n print('Hello, World!')"]
-code_embeddings = model.encode_texts(texts=code)
+code_embeddings = model.encode_texts(texts=code, task='code')
 
 ```
@@ -75,8 +71,8 @@ with torch.no_grad():
 
 with torch.autocast(device_type='cuda' if torch.cuda.is_available() else 'cpu'):
     # Get embeddings
-    text_embeddings = model.model(**text_batch).single_vec_emb
-    img_embeddings = model.model(**image_batch).single_vec_emb
+    text_embeddings = model.model(**text_batch, task_label='retrieval').single_vec_emb
+    img_embeddings = model.model(**image_batch, task_label='retrieval').single_vec_emb
 
 
 ```
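Read as a whole, the README diff drops the explicit `model.set_task(...)` calls for the text-matching and code examples in favor of a `task` argument passed directly to `encode_images` / `encode_texts`, and threads a `task_label` into the raw `model.model(...)` calls. The snippet below is a consolidated sketch of the updated usage; it assumes `model` has already been loaded as in the unchanged top of the README, and the `texts` list here is a placeholder.

```python
# Sketch of the post-commit README usage (assumes `model` is an already-loaded
# JinaEmbeddingsV4Model; `texts` is placeholder example data).
from PIL import Image

image_paths = ['/<path_to_image>']
images = [Image.open(path) for path in image_paths]
texts = ["example query", "example passage"]  # placeholder texts

# Text matching: the task is now passed per call instead of via model.set_task(...)
img_embeddings = model.encode_images(
    images=images, truncate_dim=256, max_pixels=602112, task='text-matching'
)
text_embeddings = model.encode_texts(
    texts=texts, truncate_dim=256, max_length=512, task='text-matching'
)

# Code embeddings follow the same pattern
code = ["def hello_world():\n    print('Hello, World!')"]
code_embeddings = model.encode_texts(texts=code, task='code')
```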
adapters/{retrieval/adapter_config.json → adapter_config.json}
RENAMED
File without changes
adapters/{text-matching/adapter_model.safetensors → adapter_model.safetensors}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:7a5cb8cc0f4e10f184ccc10f8864999098b887dbc4107221ec0e400d927f4555
+size 360095344
adapters/code/adapter_config.json
DELETED
@@ -1,26 +0,0 @@
-{
-  "alpha_pattern": {},
-  "auto_mapping": null,
-  "base_model_name_or_path": "jinaai/colqwen25-duo-base",
-  "bias": "none",
-  "fan_in_fan_out": false,
-  "inference_mode": false,
-  "init_lora_weights": "gaussian",
-  "layer_replication": null,
-  "layers_pattern": null,
-  "layers_to_transform": null,
-  "loftq_config": {},
-  "lora_alpha": 32,
-  "lora_dropout": 0.1,
-  "megatron_config": null,
-  "megatron_core": "megatron.core",
-  "modules_to_save": null,
-  "peft_type": "LORA",
-  "r": 32,
-  "rank_pattern": {},
-  "revision": null,
-  "target_modules": "(.*(model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(single_vector_projector|multi_vector_projector).*$)",
-  "task_type": "FEATURE_EXTRACTION",
-  "use_dora": false,
-  "use_rslora": false
-}
adapters/code/adapter_model.safetensors
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:510d017efc64c97e2db985ed1a96b17477ac97e1a5470996209041ad35beeee7
-size 119802032
adapters/retrieval/adapter_model.safetensors
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0c2b1d85506d01bd29a942975cb0abbd8c4af3487fb80b5ad408ae0e55f8bb3a
-size 120138416
adapters/text-matching/adapter_config.json
DELETED
@@ -1,26 +0,0 @@
-{
-  "alpha_pattern": {},
-  "auto_mapping": null,
-  "base_model_name_or_path": "jinaai/colqwen25-duo-base",
-  "bias": "none",
-  "fan_in_fan_out": false,
-  "inference_mode": true,
-  "init_lora_weights": "gaussian",
-  "layer_replication": null,
-  "layers_pattern": null,
-  "layers_to_transform": null,
-  "loftq_config": {},
-  "lora_alpha": 32,
-  "lora_dropout": 0.1,
-  "megatron_config": null,
-  "megatron_core": "megatron.core",
-  "modules_to_save": null,
-  "peft_type": "LORA",
-  "r": 32,
-  "rank_pattern": {},
-  "revision": null,
-  "target_modules": "(.*(model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(single_vector_projector|multi_vector_projector).*$)",
-  "task_type": "FEATURE_EXTRACTION",
-  "use_dora": false,
-  "use_rslora": false
-}
modeling_jina_embeddings_v4.py
CHANGED
@@ -522,6 +522,9 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
         """
         if "torch_dtype" not in kwargs:
            kwargs["torch_dtype"] = "auto"
+
+        if torch.cuda.is_available() and "attn_implementation" not in kwargs:
+            kwargs["attn_implementation"] = "flash_attention_2"
 
         base_model = super().from_pretrained(
             pretrained_model_name_or_path, *args, **kwargs
@@ -536,7 +539,7 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
         )
         adapter_dir = os.path.join(adapter_cache_path, "adapters")
 
-        lora_config = LoraConfig.from_pretrained(
+        lora_config = LoraConfig.from_pretrained(adapter_dir)
         lora_config._custom_modules = {
             torch.nn.modules.linear.Linear: partial(
                 MultiAdapterLinear,
@@ -545,7 +548,7 @@ class JinaEmbeddingsV4Model(Qwen2_5_VLForConditionalGeneration):
         }
         peft_model = PeftModel.from_pretrained(
             model=base_model,
-            model_id=
+            model_id=adapter_dir,
             config=lora_config,
         )
 
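The `from_pretrained` change only injects `attn_implementation="flash_attention_2"` when CUDA is available and the caller has not set the key, so it remains overridable. Below is a hedged, caller-side sketch of that behavior; the checkpoint path is a placeholder and the import path is assumed from this file's name, not confirmed by the diff.

```python
# Caller-side sketch of the new attn_implementation default
# (placeholder model path; import path assumed for illustration).
import torch
from modeling_jina_embeddings_v4 import JinaEmbeddingsV4Model

# On a CUDA machine this now implicitly adds attn_implementation="flash_attention_2".
model = JinaEmbeddingsV4Model.from_pretrained("<model_name_or_path>")

# The default is applied only when the key is absent, so environments without
# flash-attn can still choose another implementation explicitly.
model = JinaEmbeddingsV4Model.from_pretrained(
    "<model_name_or_path>", attn_implementation="sdpa"
)
```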
qwen2_5_vl.py
CHANGED
@@ -945,6 +945,7 @@ class Qwen2_5_VLAttention(nn.Module):
 
     def forward(
         self,
+        task_label: Union[str, List[str]],
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
@@ -956,9 +957,9 @@ class Qwen2_5_VLAttention(nn.Module):
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         bsz, q_len, _ = hidden_states.size()
 
-        query_states = self.q_proj(hidden_states)
-        key_states = self.k_proj(hidden_states)
-        value_states = self.v_proj(hidden_states)
+        query_states = self.q_proj(hidden_states, task_label=task_label)
+        key_states = self.k_proj(hidden_states, task_label=task_label)
+        value_states = self.v_proj(hidden_states, task_label=task_label)
 
         query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
         key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
@@ -1002,7 +1003,7 @@ class Qwen2_5_VLAttention(nn.Module):
         attn_output = attn_output.transpose(1, 2).contiguous()
         attn_output = attn_output.reshape(bsz, q_len, -1)
 
-        attn_output = self.o_proj(attn_output)
+        attn_output = self.o_proj(attn_output, task_label=task_label)
 
         if not output_attentions:
             attn_weights = None
@@ -1021,7 +1022,6 @@ class Qwen2_5_VLFlashAttention2(Qwen2_5_VLAttention):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-
         # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
         # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
         # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
@@ -1029,6 +1029,7 @@ class Qwen2_5_VLFlashAttention2(Qwen2_5_VLAttention):
 
     def forward(
         self,
+        task_label: Union[str, List[str]],
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
@@ -1040,9 +1041,9 @@ class Qwen2_5_VLFlashAttention2(Qwen2_5_VLAttention):
     ):
         bsz, q_len, _ = hidden_states.size()
 
-        query_states = self.q_proj(hidden_states)
-        key_states = self.k_proj(hidden_states)
-        value_states = self.v_proj(hidden_states)
+        query_states = self.q_proj(hidden_states, task_label=task_label)
+        key_states = self.k_proj(hidden_states, task_label=task_label)
+        value_states = self.v_proj(hidden_states, task_label=task_label)
 
         query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
         key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
@@ -1113,7 +1114,7 @@ class Qwen2_5_VLFlashAttention2(Qwen2_5_VLAttention):
         )
 
         attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
-        attn_output = self.o_proj(attn_output)
+        attn_output = self.o_proj(attn_output, task_label=task_label)
 
         if not output_attentions:
             attn_weights = None
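The attention changes thread `task_label` through `q_proj`, `k_proj`, `v_proj`, and `o_proj`, which only makes sense because those Linears are wrapped by the `MultiAdapterLinear` custom module registered in `modeling_jina_embeddings_v4.py`: with the three task adapters merged into a single checkpoint, each projection must select the right LoRA weights at call time. The sketch below is an illustrative stand-in, not the repository's `MultiAdapterLinear`; the class name, rank, and task list are assumptions.

```python
# Illustrative sketch of a task-routed Linear: a frozen base projection plus
# one low-rank (LoRA-style) update per task, selected by task_label at call time.
# This is NOT the repository's MultiAdapterLinear implementation.
import torch
import torch.nn as nn


class TaskRoutedLinear(nn.Module):
    """Base Linear plus one low-rank delta per task, chosen per forward call."""

    def __init__(self, base: nn.Linear, tasks, rank: int = 32):
        super().__init__()
        self.base = base
        self.lora_A = nn.ModuleDict(
            {t: nn.Linear(base.in_features, rank, bias=False) for t in tasks}
        )
        self.lora_B = nn.ModuleDict(
            {t: nn.Linear(rank, base.out_features, bias=False) for t in tasks}
        )

    def forward(self, x: torch.Tensor, task_label: str) -> torch.Tensor:
        # Frozen base projection plus the task-specific low-rank update.
        return self.base(x) + self.lora_B[task_label](self.lora_A[task_label](x))


# Usage mirroring the attention change: the projection is told which task it serves.
proj = TaskRoutedLinear(nn.Linear(16, 16), tasks=["retrieval", "text-matching", "code"])
hidden_states = torch.randn(2, 4, 16)
query_states = proj(hidden_states, task_label="retrieval")
```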