| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | """ |
| | Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa). |
| | GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned |
| | using a masked language modeling (MLM) loss. |
| | """ |
| |
|
| | from __future__ import absolute_import |
| | import os |
| | import sys |
| | from bleu import _bleu |
| | import pickle |
| | import torch |
| | import json |
| | import random |
| | import logging |
| | import argparse |
| | import numpy as np |
| | from io import open |
| | from itertools import cycle |
| | import torch.nn as nn |
| | from model import Seq2Seq |
| | from tqdm import tqdm, trange |
| | from fuzzywuzzy import fuzz |
| | from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset |
| | from torch.utils.data.distributed import DistributedSampler |
| |
|
| | from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, |
| | RobertaConfig, RobertaModel, RobertaTokenizer) |
# Configure root logging once at import time so every module logger shares
# the same timestamped format.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
)

# Module-level logger used throughout this script.
logger = logging.getLogger(__name__)

# NOTE(review): not referenced anywhere in the visible code; kept in case
# another module imports it.
divide_number = 3
| |
|
| |
|
class Example(object):
    """One raw training/test record, stored as plain strings.

    Attributes are assigned verbatim from the constructor arguments:
    ``idx``, ``source``, ``ts_v`` and ``target``.
    """

    def __init__(self, idx, source, ts_v, target):
        # Plain record: no validation or conversion is performed.
        self.idx, self.source = idx, source
        self.ts_v, self.target = ts_v, target
| |
|
def read_examples(filename):
    """Read examples from a JSON-lines file.

    Each non-blank line must be a JSON object with the keys
    ``natrual_language`` (list of tokens, joined with spaces),
    ``TS_V_token`` (list of tokens, joined with commas) and
    ``ground_truth`` (list of tokens; the first and last entries are
    dropped and the rest joined with spaces).

    Args:
        filename: Path to the UTF-8 encoded ``.jsonl`` file.

    Returns:
        A list of ``Example`` objects in file order. ``Example.idx`` is the
        zero-based physical line number, so it is unaffected by blank lines.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the expected keys.
    """
    examples = []
    with open(filename, encoding="utf-8") as f:
        for idx, line in enumerate(f):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no record;
                # feeding them to json.loads would raise.
                continue
            js = json.loads(line)
            examples.append(
                Example(
                    idx=idx,
                    # The key spelling 'natrual_language' is what this reader
                    # has always looked up; it must match the data files.
                    source=" ".join(js['natrual_language']),
                    ts_v=",".join(js['TS_V_token']),
                    # Drop the first and last ground-truth tokens
                    # (presumably begin/end markers -- TODO confirm).
                    target=" ".join(js["ground_truth"][1:-1]),
                )
            )
    return examples
| |
|
| |
|
class InputFeatures(object):
    """Model-ready features for one example.

    Holds the example's position (``example_id``) together with its source
    and target token-id lists (``source_ids`` / ``target_ids``).
    """

    def __init__(self, example_id, source_ids, target_ids):
        # Values are stored as given; padding/truncation is the caller's job.
        self.example_id = example_id
        self.source_ids, self.target_ids = source_ids, target_ids
| | |
def convert_examples_to_features(examples, tokenizer, args,stage=None):
    """Tokenize examples and turn them into padded id sequences.

    The source sequence is ``[CLS] source [SEP] ts_v [SEP]``, cut to
    ``args.max_source_length - 5`` tokens and padded with the tokenizer's pad
    id up to ``args.max_source_length``. The target sequence is
    ``[CLS] target [SEP]`` (target tokens cut to
    ``args.max_target_length - 2``) padded up to ``args.max_target_length``.
    When ``stage == "test"`` the literal string ``"None"`` is tokenized in
    place of the real target.

    Args:
        examples: Iterable of objects exposing ``source``, ``ts_v``, ``target``.
        tokenizer: Tokenizer providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token``, ``sep_token`` and ``pad_token_id``.
        args: Namespace with ``max_source_length`` and ``max_target_length``.
        stage: ``"test"`` to use the placeholder target; any other value uses
            the example's own target.

    Returns:
        A list of ``InputFeatures``, one per example, whose ``example_id`` is
        the example's position in ``examples``.
    """

    def pad_to(ids, width):
        # A negative shortfall multiplies to an empty list, so sequences that
        # are already long enough are returned unpadded (not truncated).
        return ids + [tokenizer.pad_token_id] * (width - len(ids))

    features = []
    for position, example in enumerate(examples):
        encoder_tokens = [
            tokenizer.cls_token,
            *tokenizer.tokenize(example.source),
            tokenizer.sep_token,
            *tokenizer.tokenize(example.ts_v),
            tokenizer.sep_token,
        ]
        # NOTE(review): truncation happens after the special tokens are added,
        # so for long inputs the trailing separator (and ts_v tokens) can be
        # cut off. Kept as-is because changing it alters the model inputs.
        encoder_ids = tokenizer.convert_tokens_to_ids(
            encoder_tokens[:args.max_source_length - 5]
        )
        encoder_ids = pad_to(encoder_ids, args.max_source_length)

        if stage == "test":
            # No gold target is used at test time; a placeholder is encoded.
            body_tokens = tokenizer.tokenize("None")
        else:
            body_tokens = tokenizer.tokenize(example.target)[:args.max_target_length - 2]
        decoder_tokens = [tokenizer.cls_token] + body_tokens + [tokenizer.sep_token]
        decoder_ids = pad_to(
            tokenizer.convert_tokens_to_ids(decoder_tokens),
            args.max_target_length,
        )

        features.append(
            InputFeatures(
                example_id=position,
                source_ids=encoder_ids,
                target_ids=decoder_ids,
            )
        )
    return features
| |
|
| |
|
| |
|
def set_seed(seed=20240124):
    """Seed every random number generator this script relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and asks cuDNN for deterministic kernels.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Fixed spelling (was 'PYHTONHASHSEED', which Python never reads). The
    # variable only influences interpreters started after this point, e.g.
    # worker subprocesses.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one, since the script may
    # wrap the model in DataParallel. Safe to call without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
| | |
def _generate_predictions(model, dataloader, tokenizer, device, show_progress=False):
    """Run the model in generation mode and decode one string per example.

    The model is switched to eval mode and called with ``source_ids`` only.
    For every example the first returned hypothesis (``pred[0]``) is used,
    cut at the first token id ``0`` and decoded with the tokenizer.

    Args:
        model: Seq2Seq model (optionally wrapped in ``DataParallel``).
        dataloader: Yields batches whose first tensor holds the source ids.
        tokenizer: Tokenizer used to decode id sequences into text.
        device: Device the source ids are moved to.
        show_progress: Wrap the loader in a ``tqdm`` progress bar.

    Returns:
        List of decoded prediction strings in dataloader order. The model is
        left in eval mode.
    """
    model.eval()
    batches = tqdm(dataloader, total=len(dataloader)) if show_progress else dataloader
    texts = []
    for batch in batches:
        source_ids = batch[0].to(device)
        with torch.no_grad():
            preds = model(source_ids=source_ids)
        for pred in preds:
            token_ids = list(pred[0].cpu().numpy())
            if 0 in token_ids:
                # Everything from the first id 0 onwards is discarded
                # (presumably padding -- TODO confirm against model.py).
                token_ids = token_ids[:token_ids.index(0)]
            texts.append(tokenizer.decode(token_ids, clean_up_tokenization_spaces=False))
    return texts


def _score_predictions(predictions, examples, output_path, gold_path):
    """Write prediction/gold files and compute exact match and similarity.

    Args:
        predictions: Decoded prediction strings.
        examples: Examples whose ``target`` is the gold string.
        output_path: File receiving one prediction per line.
        gold_path: File receiving one gold target per line.

    Returns:
        Tuple ``(pairs, em_percent, avg_similarity, any_generated)`` where
        ``pairs`` is a list of ``[prediction, gold]``, ``em_percent`` is the
        whitespace-token exact-match rate in percent, ``avg_similarity`` is
        the mean ``fuzz.ratio`` score and ``any_generated`` tells whether at
        least one prediction is non-empty. Both averages are ``0.0`` when
        there is nothing to score.
    """
    pairs, exact_matches = [], []
    similarity_total = 0
    any_generated = False
    # NOTE(review): files are written with the platform default encoding
    # because the BLEU helper that reads them back is not visible here.
    with open(output_path, 'w') as pred_file, open(gold_path, 'w') as gold_file:
        for pred, example in zip(predictions, examples):
            if len(pred) > 0:
                any_generated = True
            pred_file.write(pred + '\n')
            gold_file.write(example.target + '\n')
            exact_matches.append(pred.split() == example.target.split())
            similarity_total += fuzz.ratio(pred, example.target)
            pairs.append([pred, example.target])

    count = len(pairs)
    if count == 0:
        # Avoid ZeroDivisionError / NaN on an empty evaluation set.
        return pairs, 0.0, 0.0, any_generated
    return pairs, np.mean(exact_matches) * 100, float(similarity_total) / count, any_generated


def _dump_prediction_pairs(path, pairs):
    """Write ``[prediction, gold]`` pairs to ``path`` as JSON lines."""
    with open(path, 'w') as wf:
        for pred, gold in pairs:
            wf.write(json.dumps({"Pred": pred, "GT": gold}))
            wf.write("\n")


def main():
    """Command-line entry point: fine-tune and/or evaluate the Seq2Seq model.

    Depending on the flags this will
      * ``--do_train``: train on ``--train_filename``; with ``--do_eval`` the
        dev set is scored after every epoch (perplexity, BLEU, exact match,
        fuzzy similarity), the best model by ``(BLEU + similarity) / 2`` is
        saved to ``<output_dir>/pytorch_model.bin`` and training stops after
        three consecutive epochs without improvement;
      * ``--do_test``: generate predictions for ``--test_filename`` and report
        the same generation metrics.

    Raises:
        ValueError: If the gradient-accumulation settings yield no usable
            per-step batch size.
    """
    parser = argparse.ArgumentParser()

    # Model / checkpoint locations.
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model: e.g. roberta-base")
    parser.add_argument("--load_model_path", default=None, type=str,
                        help="Path to trained model")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    # Data and run-mode options.
    parser.add_argument("--train_filename", default=None, type=str,
                        help="The train filename. Should contain the .jsonl files for this task.")
    parser.add_argument("--dev_filename", default=None, type=str,
                        help="The dev filename. Should contain the .jsonl files for this task.")
    parser.add_argument("--test_filename", default=None, type=str,
                        help="The test filename. Should contain the .jsonl files for this task.")
    parser.add_argument("--max_source_length", default=256, type=int,
                        help="The maximum total source sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--max_target_length", default=256, type=int,
                        help="The maximum total target sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test", action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")

    # Optimisation hyper-parameters.
    parser.add_argument("--train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--beam_size", default=10, type=int,
                        help="beam size for beam search")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    # NOTE(review): --max_grad_norm is parsed but no gradient clipping is
    # applied anywhere below; left untouched to keep training unchanged.
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument('--seed', type=int, default=20240124,
                        help="random seed for initialization")

    args = parser.parse_args()

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO)

    # Honour --no_cuda: previously the flag was parsed but ignored.
    use_cuda = torch.cuda.is_available() and not args.no_cuda
    device = torch.device("cuda" if use_cuda else "cpu")
    args.n_gpu = torch.cuda.device_count() if use_cuda else 0
    args.device = device
    logger.info("device: %s, n_gpu: %s", device, args.n_gpu)

    set_seed(args.seed)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    config = RobertaConfig.from_pretrained(args.model_name_or_path)

    config.is_decoder = True
    encoder = RobertaModel.from_pretrained(args.model_name_or_path, config=config)

    # The same RobertaModel instance is passed as both encoder and decoder.
    model = Seq2Seq(encoder=encoder, decoder=encoder, config=config,
                    beam_size=args.beam_size, max_length=args.max_target_length,
                    sos_id=tokenizer.convert_tokens_to_ids(["<mask0>"])[0], eos_id=tokenizer.sep_token_id)

    logger.info("Training/evaluation parameters %s", args)
    if args.load_model_path is not None:
        logger.info("reload model from {}".format(args.load_model_path + "/pytorch_model.bin"))
        # map_location lets a checkpoint saved on GPU load on a CPU-only host.
        model.load_state_dict(torch.load(args.load_model_path + "/pytorch_model.bin",
                                         map_location=device))
    model.to(args.device)

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_train:
        if args.gradient_accumulation_steps < 1:
            raise ValueError("--gradient_accumulation_steps must be >= 1")
        # Each forward/backward pass sees a slice of the requested batch.
        micro_batch_size = args.train_batch_size // args.gradient_accumulation_steps
        if micro_batch_size < 1:
            raise ValueError("--train_batch_size must be >= --gradient_accumulation_steps")

        train_examples = read_examples(args.train_filename)
        train_features = convert_examples_to_features(train_examples, tokenizer, args, stage='train')
        all_source_ids = torch.tensor([f.source_ids for f in train_features], dtype=torch.long)
        all_target_ids = torch.tensor([f.target_ids for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_source_ids, all_target_ids)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=micro_batch_size)

        # Biases and LayerNorm weights are excluded from weight decay.
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)

        # scheduler.step() runs once per optimizer update, i.e. once every
        # gradient_accumulation_steps micro-batches, so the schedule length
        # must be counted in updates rather than micro-batches.
        total_micro_batches = len(train_dataloader) * args.num_train_epochs
        total_updates = max(1, total_micro_batches // args.gradient_accumulation_steps)
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=int(total_updates * 0.1),
                                                    num_training_steps=total_updates)

        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        # Effective batch size per optimizer update.
        logger.info(" Batch size = %d", micro_batch_size * args.gradient_accumulation_steps)
        logger.info(" Num epoch = %d", args.num_train_epochs)

        model.train()
        patience, best_score, losses, dev_dataset = 0, 0, [], {}
        res_list = []
        for epoch in range(args.num_train_epochs):
            for batch in train_dataloader:
                source_ids, target_ids = (t.to(device) for t in batch)
                loss, _, _ = model(source_ids=source_ids, target_ids=target_ids)

                if args.n_gpu > 1:
                    loss = loss.mean()  # average the per-GPU losses
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                losses.append(loss.item())
                loss.backward()
                # len(losses) counts micro-batches across all epochs.
                if len(losses) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                    if len(losses) // args.gradient_accumulation_steps % 100 == 0:
                        logger.info("epoch {} step {} loss {}".format(
                            epoch,
                            len(losses) // args.gradient_accumulation_steps,
                            round(np.mean(losses[-100 * args.gradient_accumulation_steps:]), 4)))

            if args.do_eval:
                # ---- Dev perplexity (teacher-forced loss) ----
                if 'dev_loss' in dev_dataset:
                    eval_examples, eval_data = dev_dataset['dev_loss']
                else:
                    eval_examples = read_examples(args.dev_filename)
                    eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='dev')
                    all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
                    all_target_ids = torch.tensor([f.target_ids for f in eval_features], dtype=torch.long)
                    eval_data = TensorDataset(all_source_ids, all_target_ids)
                    # Cache so the dev set is only tokenized once.
                    dev_dataset['dev_loss'] = eval_examples, eval_data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

                logger.info("\n***** Running evaluation *****")
                logger.info(" Num examples = %d", len(eval_examples))
                logger.info(" Batch size = %d", args.eval_batch_size)

                model.eval()
                eval_loss, tokens_num = 0, 0
                for batch in eval_dataloader:
                    source_ids, target_ids = (t.to(device) for t in batch)
                    with torch.no_grad():
                        _, loss, num = model(source_ids=source_ids, target_ids=target_ids)
                    eval_loss += loss.sum().item()
                    tokens_num += num.sum().item()

                model.train()
                eval_loss = eval_loss / tokens_num
                result = {'eval_ppl': round(np.exp(eval_loss), 5)}
                for key in sorted(result.keys()):
                    logger.info(" %s = %s", key, str(result[key]))
                logger.info(" " + "*" * 20)

                # ---- Dev generation metrics ----
                if 'dev_bleu' in dev_dataset:
                    eval_examples, eval_data = dev_dataset['dev_bleu']
                else:
                    eval_examples = read_examples(args.dev_filename)
                    eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
                    all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
                    eval_data = TensorDataset(all_source_ids)
                    dev_dataset['dev_bleu'] = eval_examples, eval_data

                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

                p = _generate_predictions(model, eval_dataloader, tokenizer, device)
                model.train()

                res_list, em_percent, avg_edit_dis, is_gened = _score_predictions(
                    p, eval_examples,
                    args.output_dir + "/dev.output", args.output_dir + "/dev.gold")

                # BLEU is skipped when every prediction is empty.
                if is_gened:
                    dev_bleu = _bleu(args.output_dir + "/dev.gold", args.output_dir + "/dev.output")
                else:
                    dev_bleu = 0
                logger.info(" %s = %s " % ("Epoch", str(epoch)))
                logger.info(" %s = %s " % ("bleu-4", str(dev_bleu)))
                # "Edit Distance" is the mean fuzz.ratio score of prediction vs gold.
                logger.info(" %s = %s " % ("Edit Distance", str(round(avg_edit_dis, 2))))
                logger.info(" %s = %s " % ("EM", str(round(em_percent, 2))))
                logger.info(" " + "*" * 20)

                dev_score = (dev_bleu + avg_edit_dis) / 2.0
                if dev_score > best_score:
                    best_score = dev_score
                    output_dir = args.output_dir
                    os.makedirs(output_dir, exist_ok=True)
                    # Unwrap DataParallel so the checkpoint loads without it.
                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = os.path.join(output_dir, "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)
                    patience = 0
                else:
                    patience += 1
                    if patience == 3:
                        # Early stopping: three epochs without improvement.
                        break

        if args.do_eval:
            # Persist the predictions of the most recent dev evaluation.
            # Guarded by do_eval: without evaluation there is nothing to dump.
            _dump_prediction_pairs(args.output_dir + "/last_training_result.jsonl", res_list)
            logger.info(" Best score:%s", best_score)
            logger.info(" " + "*" * 20)

    if args.do_test:
        # NOTE(review): the condition checks --load_model_path but the weights
        # are read from <output_dir>/pytorch_model.bin; kept as found, confirm
        # whether this should depend on --do_train instead.
        if args.load_model_path is not None:
            checkpoint_file = os.path.join(args.output_dir, 'pytorch_model.bin')
            model_to_load = model.module if hasattr(model, 'module') else model
            model_to_load.load_state_dict(torch.load(checkpoint_file, map_location=device))

        eval_examples = read_examples(args.test_filename)
        eval_features = convert_examples_to_features(eval_examples, tokenizer, args, stage='test')
        all_source_ids = torch.tensor([f.source_ids for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_source_ids)

        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        p = _generate_predictions(model, eval_dataloader, tokenizer, device, show_progress=True)

        res_list, em_percent, avg_edit_dis, _ = _score_predictions(
            p, eval_examples,
            args.output_dir + "/test.output", args.output_dir + "/test.gold")

        dev_bleu = _bleu(args.output_dir + "/test.gold", args.output_dir + "/test.output")
        logger.info(" %s = %s " % ("bleu-4", str(dev_bleu)))
        logger.info(" %s = %s " % ("EM", str(round(em_percent, 2))))
        logger.info(" %s = %s " % ("Edit Distance", str(round(avg_edit_dis, 2))))
        logger.info(" " + "*" * 20)

        # NOTE(review): test results reuse the "last_training_result.jsonl"
        # name and overwrite the file written after training; the name is
        # kept because downstream tooling may depend on it.
        _dump_prediction_pairs(args.output_dir + "/last_training_result.jsonl", res_list)


if __name__ == "__main__":
    main()
| |
|
| |
|
| |
|