diff --git a/convlab2/base_models/gpt/create_data.py b/convlab2/base_models/gpt/create_data.py index 3186fd3374b64aae7461c7883607f773c8d5924c..94e88f3e61f36d4cf8ff531ee7ba3917d6dba9f5 100644 --- a/convlab2/base_models/gpt/create_data.py +++ b/convlab2/base_models/gpt/create_data.py @@ -14,7 +14,7 @@ def create_lm_data(dataset, data_dir, args): data = [] for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False): if args.model_type == 'dialogpt': - dialogue = ' <|endoftext|> '.join([turn['utterance'] for turn in sample['turns']]) + dialogue = ' <|endoftext|> '.join([turn['utterance'] for turn in sample['turns']]) + ' <|endoftext|>' else: dialogue = ' '.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['turns']]) data.append(json.dumps({'dialogue': dialogue}, ensure_ascii=False)+'\n')