Skip to content
Snippets Groups Projects
Commit 42f8dec8 authored by zqwerty
Browse files

fix data generation for dialogpt

parent 3a38a9b5
Branches
No related tags found
No related merge requests found
...@@ -14,7 +14,7 @@ def create_lm_data(dataset, data_dir, args): ...@@ -14,7 +14,7 @@ def create_lm_data(dataset, data_dir, args):
data = [] data = []
for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False): for sample in tqdm(data_by_split[data_split], desc=f'{data_split} sample', leave=False):
if args.model_type == 'dialogpt': if args.model_type == 'dialogpt':
dialogue = ' <|endoftext|> '.join([turn['utterance'] for turn in sample['turns']]) dialogue = ' <|endoftext|> '.join([turn['utterance'] for turn in sample['turns']]) + ' <|endoftext|>'
else: else:
dialogue = ' '.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['turns']]) dialogue = ' '.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['turns']])
data.append(json.dumps({'dialogue': dialogue}, ensure_ascii=False)+'\n') data.append(json.dumps({'dialogue': dialogue}, ensure_ascii=False)+'\n')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment