Overview
OneProt is a multimodal model that integrates protein sequence, protein structure (both in the form of an augmented sequence and in the form of a graph), protein binding sites and protein text annotations. Contrastive learning is used to align each modality to the central one, which is the protein sequence. In the pre-training phase the InfoNCE loss is computed between pairs (protein sequence, other modality).
Model architecture
Protein sequence encoder: esm2_t33_650M_UR50D
Protein structure encoder: esm2_t12_35M_UR50D
Protein structure encoder GNN: ProNet
Pocket (binding sites encoder) GNN: ProNet
Text encoder: BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
Below is example code showing how to obtain the embeddings (requires cloning our repo first). Note that the example data for the transformer models are read from .txt files and in principle can be passed as strings, whilst the data for the GNN models are contained in the example .h5 file and need to be subsequently converted to graphs.
import torch
import hydra
from omegaconf import OmegaConf
from huggingface_hub import HfApi, hf_hub_download
import sys
import os
import h5py
from torch_geometric.data import Batch
from transformers import AutoTokenizer
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) # assuming that you are running this script from the oneprot repo, can be any other path
from src.models.oneprot_module import OneProtLitModule
from src.data.utils.struct_graph_utils import protein_to_graph
# If you are not running on the supercomputer, you may need to set these
# distributed-environment variables manually:
# os.environ['RANK'] = '0'
# os.environ['WORLD_SIZE'] = '1'

# Fetch the training configuration from the Hub and parse it.
config_path = hf_hub_download(
    repo_id="HelmholtzAI-FZJ/oneprot",
    filename="config.yaml",
)
cfg = OmegaConf.load(config_path)
# Instantiate one encoder per modality from its section of the config.
modality_names = ('sequence', 'struct_token', 'struct_graph', 'pocket', 'text')
components = {
    name: hydra.utils.instantiate(cfg.model.components[name])
    for name in modality_names
}
# Download the pretrained weights from the Hub.
checkpoint_path = hf_hub_download(
    repo_id="HelmholtzAI-FZJ/oneprot",
    filename="pytorch_model.bin",
    repo_type="model",
)

# Create the model instance; no optimizer is needed for inference.
model = OneProtLitModule(
    components=components,
    optimizer=None,
    loss_fn=cfg.model.loss_fn,
    local_loss=cfg.model.local_loss,
    gather_with_grad=cfg.model.gather_with_grad,
    use_l1_regularization=cfg.model.use_l1_regularization,
    train_on_all_modalities_after_step=cfg.model.train_on_all_modalities_after_step,
    use_seqsim=cfg.model.use_seqsim,
)

# map_location='cpu' lets the checkpoint load on machines without a GPU;
# weights_only=True restricts unpickling to tensors (safer than full pickle).
state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
model.load_state_dict(state_dict, strict=True)
# Hugging Face tokenizer checkpoints for the transformer-based encoders.
tokenizers = {
    'sequence': "facebook/esm2_t33_650M_UR50D",
    'struct_token': "facebook/esm2_t33_650M_UR50D",
    'text': "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"
}
loaded_tokenizers = {}
for modality, tokenizer_name in tokenizers.items():
    # Use the loop variable directly instead of re-indexing the dict.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    if modality == 'struct_token':
        # The lowercase structure tokens and '#' are not part of the
        # stock ESM-2 vocabulary, so they must be registered explicitly.
        new_tokens = ['p', 'y', 'n', 'w', 'r', 'q', 'h', 'g', 'd', 'l',
                      'v', 't', 'm', 'f', 's', 'a', 'e', 'i', 'k', 'c', '#']
        tokenizer.add_tokens(new_tokens)
    loaded_tokenizers[modality] = tokenizer
# Get example embeddings for each modality
########################## sequence ##############################
modality = "sequence"
file_path = hf_hub_download(
    repo_id="HelmholtzAI-FZJ/oneprot",
    filename="data_examples/sequence_example.txt",
    repo_type="model",  # or "dataset"
)
with open(file_path, 'r') as fh:
    input_sequence = fh.read().strip()
input_tensor = loaded_tokenizers[modality](input_sequence, return_tensors="pt")["input_ids"]
output = model.network[modality](input_tensor)
print(f"Output for modality '{modality}': {output}")
########################### text #################################
modality = "text"
file_path = hf_hub_download(
    repo_id="HelmholtzAI-FZJ/oneprot",
    filename="data_examples/text_example.txt",
    repo_type="model",  # or "dataset"
)
with open(file_path, 'r') as fh:
    input_text = fh.read().strip()
input_tensor = loaded_tokenizers[modality](input_text, return_tensors="pt")["input_ids"]
output = model.network[modality](input_tensor)
print(f"Output for modality '{modality}': {output}")
##################### tokenized structure ########################
modality = "struct_token"
file_path = hf_hub_download(
    repo_id="HelmholtzAI-FZJ/oneprot",
    filename="data_examples/struct_token_example.txt",
    repo_type="model",  # or "dataset"
)
with open(file_path, 'r') as file:
    input_struct_token = file.read().strip()
# Remove every '#' in one pass; the original rebuilt the string
# character by character with a list comprehension plus join.
input_struct_token = input_struct_token.replace("#", "")
input_tensor = loaded_tokenizers[modality](input_struct_token, return_tensors="pt")["input_ids"]
output = model.network[modality](input_tensor)
print(f"Output for modality '{modality}': {output}")
##################### graph structure ############################
modality = "struct_graph"
file_path = hf_hub_download(
    repo_id="HelmholtzAI-FZJ/oneprot",
    filename="data_examples/seqstruc_example.h5",
    repo_type="model",  # or "dataset"
)
# protein_to_graph is given the path itself; the h5py.File handle opened in
# the original example was never used, so the redundant open is dropped.
input_struct_graph = [protein_to_graph('E6Y2X0', file_path, 'non_pdb', 'A', pockets=False)]
input_struct_graph = Batch.from_data_list(input_struct_graph)
output = model.network[modality](input_struct_graph)
print(f"Output for modality '{modality}': {output}")
########################## pocket ################################
modality = "pocket"
file_path = hf_hub_download(
    repo_id="HelmholtzAI-FZJ/oneprot",
    filename="data_examples/pocket_example.h5",
    repo_type="model",  # or "dataset"
)
# protein_to_graph is given the path itself; the h5py.File handle opened in
# the original example was never used, so the redundant open is dropped.
input_pocket = [protein_to_graph('E6Y2X0', file_path, 'non_pdb', 'A', pockets=True)]
input_pocket = Batch.from_data_list(input_pocket)
output = model.network[modality](input_pocket)
print(f"Output for modality '{modality}': {output}")
Citation
@article{10.1371/journal.pcbi.1013679,
doi = {10.1371/journal.pcbi.1013679},
author = {Flöge, Klemens AND Udayakumar, Srisruthi AND Sommer, Johanna AND Piraud, Marie AND Kesselheim, Stefan AND Fortuin, Vincent AND Günnemann, Stephan AND van der Weg, Karel J. AND Gohlke, Holger AND Merdivan, Erinc AND Bazarova, Alina},
journal = {PLOS Computational Biology},
publisher = {Public Library of Science},
title = {OneProt: Towards multi-modal protein foundation models via latent space alignment of sequence, structure, binding sites and text encoders},
year = {2025},
month = {11},
volume = {21},
url = {https://doi.org/10.1371/journal.pcbi.1013679},
pages = {1-27},
abstract = {Recent advances in Artificial Intelligence have enabled multi-modal systems to model and translate diverse information spaces. Extending beyond text and vision, we introduce OneProt, a multi-modal Deep Learning model for proteins that integrates structural, sequence, text, and binding site data. Using the ImageBind framework, OneProt aligns the latent spaces of protein modality encoders in a lightweight fine-tuning scheme that focuses on pairwise alignment with sequence data, rather than requiring full matches. This novel approach comprises a mix of Graph Neural Networks and transformer architectures. It demonstrates good performance in retrieval tasks and showcases the efficacy of multi-modal systems in Protein Machine Learning through a broad spectrum of downstream baselines, including enzyme function prediction and binding site analysis. Furthermore, OneProt enables the transfer of representational information from specialized encoders to the sequence encoder, enhancing capabilities for distinguishing evolutionarily related and unrelated sequences and exhibiting representational properties where evolutionarily related proteins align in similar directions within the latent space. In addition, we extensively investigate modality ablations to identify the encoders that contribute the most to predictive performance, highlighting the significance of the binding site encoder, which has not been used in similar models previously. This work expands the horizons of multi-modal protein models, paving the way for transformative applications in drug discovery, biocatalytic reaction planning, and protein engineering.},
number = {11},
}
- Downloads last month
- 5