# ProTalk: a minimal command-line chatbot that streams phi-2 completions token by token.
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
import threading


model_name = "microsoft/phi-2"
device = "cuda" if torch.cuda.is_available() else "cpu"


# Load the tokenizer and model once at startup; half precision on GPU halves memory use.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    low_cpu_mem_usage=True,
).to(device)


# Persona instructions prepended to every prompt.
system_prompt = (
    "You are ProTalk, a professional and intelligent AI. "
    "You answer clearly, politely, and with insight. "
    "Be professional, witty, and helpful in all responses."
)
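# Note: phi-2 is a base model with no instruction tuning, so it follows this
# persona only loosely and, lacking a stop string, may run past the intended
# reply (e.g. by inventing the next "User:" turn).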


def chat_loop():
    history = []
    print("ProTalk Online — type 'exit' to quit.\n")
    while True:
        user_input = input("User: ")
        if user_input.lower() == "exit":
            break
        # Rebuild the full prompt from the persona plus the running history
        # (which grows without bound; see the trimming sketch below chat_loop).
        prompt = system_prompt + "\n" + "\n".join(history) + f"\nUser: {user_input}\nProTalk:"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        # TextIteratorStreamer yields decoded text chunks as they are produced;
        # skip_prompt=True keeps the echoed prompt out of the stream.
        streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
        # generate() blocks until finished, so run it on a background thread
        # and consume the streamer on the main thread.
        thread = threading.Thread(target=model.generate, kwargs={
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "max_new_tokens": 200,
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
            "pad_token_id": tokenizer.eos_token_id,  # phi-2's tokenizer has no pad token
            "streamer": streamer,
        })
        thread.start()
        output_text = ""
        for chunk in streamer:
            print(chunk, end="", flush=True)
            output_text += chunk
        thread.join()
        print()
        history.append(f"User: {user_input}")
        history.append(f"ProTalk: {output_text}")
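

# The history list above grows without bound, while phi-2's context window is
# 2048 tokens. A minimal sketch of one way to cap it, keeping only the most
# recent turns that fit a token budget; trim_history and max_tokens are
# illustrative names, not part of the original script, and the budget of 1500
# is an assumption chosen to leave room for the reply.
def trim_history(history, max_tokens=1500):
    kept, total = [], 0
    for line in reversed(history):  # walk backwards so the newest turns survive
        n = len(tokenizer(line)["input_ids"])
        if total + n > max_tokens:
            break
        kept.append(line)
        total += n
    return list(reversed(kept))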


if __name__ == "__main__":
    chat_loop()
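# Usage: run the file directly (e.g. `python protalk.py` if saved under that
# name); the first run downloads roughly 5 GB of weights from the Hugging Face Hub.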