Commit 653b6238 authored by zqwerty

update base model scripts

parent efeb62b9
@@ -14,6 +14,8 @@ __pycache__
 .vscode
 # data
+data/unified_datasets/multiwoz21/MultiWOZ_2.1.zip
+data/unified_datasets/tm1/master.zip
 data/**/train.json
 data/**/val.json
 data/**/test.json
@@ -25,6 +27,9 @@ data/schema/dstc8-schema-guided-dialogue-master
 data/mdbt/data
 data/mdbt/models
 data/mdbt/word-vectors
+convlab2/**/data
+convlab2/**/output
+convlab2/**/cache
 convlab2/nlg/sclstm/**/resource/*
 convlab2/nlg/sclstm/**/resource_usr/*
 convlab2/nlg/sclstm/**/sclstm.pt
...
@@ -4,8 +4,10 @@ from tqdm import tqdm
 from convlab2.util import load_dataset, load_nlu_data, load_dst_data, load_policy_data, load_nlg_data, load_e2e_data, load_rg_data
 from nltk.tokenize import TreebankWordTokenizer, PunktSentenceTokenizer
 from collections import Counter
+import json_lines
+from convlab2.util.unified_datasets_util import create_delex_data
 
-def create_bio_data(dataset, data_dir):
+def create_bio_data(dataset, data_dir, args):
     data_by_split = load_nlu_data(dataset, speaker='all')
     os.makedirs(data_dir, exist_ok=True)
@@ -47,7 +49,7 @@ def create_bio_data(dataset, data_dir):
         f.writelines(data)
     print('num of spans in utterances', cnt)
 
-def create_dialogBIO_data(dataset, data_dir):
+def create_dialogBIO_data(dataset, data_dir, args):
     data_by_split = load_nlu_data(dataset, split_to_turn=False)
     os.makedirs(data_dir, exist_ok=True)
@@ -94,16 +96,83 @@ def create_dialogBIO_data(dataset, data_dir):
         f.writelines(data)
     print('num of spans in utterances', cnt)
 
+def create_revert_dialogBIO_data(dataset, data_dir, args):
+    def tag2da(tokens, tags):
+        assert len(tokens) == len(tags)
+        triples = []
+        i = 0
+        utt = ''
+        while i < len(tags):
+            tag = tags[i]
+            if tag == 'B':
+                value = tokens[i]
+                j = i + 1
+                while j < len(tags):
+                    next_tag = tags[j]
+                    if next_tag == 'I':
+                        value += ' ' + tokens[j]
+                        i += 1
+                        j += 1
+                    else:
+                        break
+                triples.append({'intent': '', 'domain': '', 'slot': '', 'value': value, 'start': len(utt), 'end': len(utt)+len(value)})
+                utt += value + ' '
+                assert utt[triples[-1]['start']:triples[-1]['end']] == value, print(utt[triples[-1]['start']:triples[-1]['end']], triples[-1])
+            else:
+                utt += tokens[i] + ' '
+            i += 1
+        utt = utt[:-1]
+        assert utt == ' '.join(tokens), print(utt, '\n', ' '.join(tokens))
+        return triples
+
+    def dialog2turn(tokens, labels):
+        turns = []
+        turn = {'tokens': [], 'tags': []}
+        i = 0
+        while i < len(tokens):
+            token = tokens[i]
+            if i < len(tokens) - 1 and token in ['user', 'system'] and tokens[i+1] == ':':
+                turns.append(turn)
+                turn = {'tokens': [], 'tags': []}
+                i += 2
+                continue
+            turn['tokens'].append(token)
+            turn['tags'].append(labels[i])
+            i += 1
+        turns.pop(0)
+        for turn in turns:
+            da = {'binary': [], 'categorical': [], 'non-categorical': []}
+            da['non-categorical'] = tag2da(turn['tokens'], turn['tags'])
+            turn['utterance'] = ' '.join(turn['tokens'])
+            turn['dialogue_acts'] = da
+        return turns
+
+    for data_split in dataset:
+        infer_output_data_path = os.path.join(args.infer_data_dir, f'{data_split}.json')
+        for original_dial, bio_dial in zip(dataset[data_split], json_lines.reader(open(infer_output_data_path))):
+            bio_turns = dialog2turn(bio_dial['tokens'], bio_dial['labels'])
+            original_dial['turns'] = original_dial['turns'][:len(bio_turns)]
+            assert len(bio_turns) == len(original_dial['turns']), print(len(bio_turns), len(original_dial['turns']))
+            for ori_turn, new_turn in zip(original_dial['turns'], bio_turns):
+                ori_turn['original_utterance'] = ori_turn['utterance']
+                ori_turn['utterance'] = new_turn['utterance']
+                ori_turn['original_dialogue_acts'] = ori_turn['dialogue_acts']
+                ori_turn['dialogue_acts'] = new_turn['dialogue_acts']
+
+    dataset, _ = create_delex_data(dataset, delex_func=lambda d, s, v: f'<v>{v}</v>')
+    os.makedirs(data_dir, exist_ok=True)
+    json.dump(dataset, open(os.path.join(data_dir, 'data.json'), 'w', encoding='utf-8'), ensure_ascii=False, indent=2)
+
 if __name__ == '__main__':
     from argparse import ArgumentParser
     parser = ArgumentParser(description="create data for seq2seq training")
-    parser.add_argument('--tasks', metavar='task_name', nargs='*', choices=['bio', 'dialogBIO'], help='names of tasks')
+    parser.add_argument('--tasks', metavar='task_name', nargs='*', choices=['bio', 'dialogBIO', 'revert_dialogBIO'], help='names of tasks')
     parser.add_argument('--datasets', metavar='dataset_name', nargs='*', help='names of unified datasets')
     parser.add_argument('--save_dir', metavar='save_directory', type=str, default='data', help='directory to save the data, default: data/$task_name/$dataset_name')
+    parser.add_argument('--infer_data_dir', metavar='infer_data_dir', type=str, default=None, help='directory of inference output data, default: None')
     args = parser.parse_args()
     print(args)
     for dataset_name in tqdm(args.datasets, desc='datasets'):
         dataset = load_dataset(dataset_name)
         for task_name in tqdm(args.tasks, desc='tasks', leave=False):
             data_dir = os.path.join(args.save_dir, task_name, dataset_name)
-            eval(f"create_{task_name}_data")(dataset, data_dir)
+            eval(f"create_{task_name}_data")(dataset, data_dir, args)
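The new revert_dialogBIO task reads the tagger's token/label output, rebuilds per-turn utterances and non-categorical dialogue acts, and then delexicalizes the recovered values with create_delex_data. For illustration only, a minimal standalone sketch of the BIO-to-span step performed by tag2da above (bio_to_spans is a hypothetical name; this is a simplified re-implementation, not code from the commit):

def bio_to_spans(tokens, tags):
    """Merge contiguous B/I tokens into values with character offsets into the space-joined utterance."""
    spans, utt, i = [], '', 0
    while i < len(tags):
        if tags[i] == 'B':
            j = i + 1
            while j < len(tags) and tags[j] == 'I':
                j += 1
            value = ' '.join(tokens[i:j])
            spans.append({'value': value, 'start': len(utt), 'end': len(utt) + len(value)})
            utt += value + ' '
            i = j
        else:
            utt += tokens[i] + ' '
            i += 1
    return spans

tokens = ['i', 'want', 'a', 'cheap', 'hotel', 'in', 'the', 'north']
tags   = ['O', 'O', 'O', 'B', 'I', 'O', 'O', 'B']
print(bio_to_spans(tokens, tags))
# [{'value': 'cheap hotel', 'start': 9, 'end': 20}, {'value': 'north', 'start': 28, 'end': 33}]

The commit's version additionally carries empty intent/domain/slot fields on each span so the values can be wrapped by create_delex_data's <v>...</v> delexicalization.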
 set -e
-n_gpus=3
+n_gpus=2
 task_name="dialogBIO"
 dataset_name="multiwoz21"
 data_dir="data/${task_name}/${dataset_name}"
@@ -8,14 +8,14 @@ cache_dir="cache"
 logging_dir="${output_dir}/runs"
 source_column="tokens"
 target_column="labels"
-model_name_or_path="output/dialogBIO/sgd"
+model_name_or_path="output/dialogBIO/sgd+tm1+tm2+tm3"
-per_device_eval_batch_size=16
+per_device_eval_batch_size=32
 
 python create_data.py --tasks ${task_name} --datasets ${dataset_name} --save_dir "data"
 
 for split in test validation train
 do
-python -m torch.distributed.launch \
+CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch \
     --nproc_per_node ${n_gpus} run_token_classification.py \
     --task_name ${task_name} \
    --train_file ${data_dir}/${split}.json \
...
@@ -135,7 +135,7 @@ class DataTrainingArguments:
         metadata={"help": "The number of processes to use for the preprocessing."},
     )
     max_seq_length: int = field(
-        default=None,
+        default=512,
         metadata={
            "help": "The maximum total input sequence length after tokenization. If set, sequences longer "
            "than this will be truncated, sequences shorter will be padded."
...
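With the default changed to 512, dialogue-level token sequences are truncated or padded to a fixed length instead of relying on a user-supplied value. As a hedged sketch of where such a value typically takes effect (assuming the standard Hugging Face fast-tokenizer preprocessing for token classification; the helper name, model, and label handling here are placeholders, not taken from run_token_classification.py):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_example(words, word_labels, max_seq_length=512):
    # inputs are already word tokens (as in the dialogBIO data)
    enc = tokenizer(
        words,
        is_split_into_words=True,
        truncation=True,          # sequences longer than max_seq_length are cut
        padding="max_length",     # shorter ones are padded up to it
        max_length=max_seq_length,
    )
    # map word-level BIO labels onto sub-word pieces; special/padding tokens get -100 (ignored by the loss)
    enc["labels"] = [-100 if wi is None else word_labels[wi] for wi in enc.word_ids()]
    return enc

example = tokenize_example(["user", ":", "book", "a", "cheap", "hotel"], [0, 0, 0, 0, 1, 2])
print(len(example["input_ids"]), len(example["labels"]))  # 512 512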
-n_gpus=3
+set -e
+n_gpus=2
 task_name="dialogBIO"
-dataset_name="sgd"
+dataset_name="sgd+tm1+tm2+tm3"
 data_dir="data/${task_name}/${dataset_name}"
 output_dir="output/${task_name}/${dataset_name}"
 cache_dir="cache"
@@ -18,9 +19,21 @@ lr=2e-5
 num_train_epochs=1
 metric_for_best_model="f1"
 
-python create_data.py --tasks ${task_name} --datasets ${dataset_name} --save_dir "data"
+names=$(echo ${dataset_name} | tr "+" "\n")
+mkdir -p ${data_dir}
+for name in ${names};
+do
+    echo "preprocessing ${name}"
+    python create_data.py --tasks ${task_name} --datasets ${name} --save_dir "data"
+    if [ "${name}" != "${dataset_name}" ]; then
+        cat "data/${task_name}/${name}/train.json" >> ${train_file}
+        cat "data/${task_name}/${name}/validation.json" >> ${validation_file}
+        cat "data/${task_name}/${name}/test.json" >> ${test_file}
+    fi
+done
 
-python -m torch.distributed.launch \
+CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch \
     --nproc_per_node ${n_gpus} run_token_classification.py \
     --task_name ${task_name} \
     --train_file ${train_file} \
@@ -39,7 +52,6 @@ python -m torch.distributed.launch \
     --cache_dir ${cache_dir} \
     --output_dir ${output_dir} \
     --logging_dir ${logging_dir} \
-    --overwrite_output_dir \
     --preprocessing_num_workers 4 \
     --per_device_train_batch_size ${per_device_train_batch_size} \
     --per_device_eval_batch_size ${per_device_eval_batch_size} \
...
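The training script now builds its corpus from several unified datasets at once: the "+"-joined dataset name is split, create_data.py is run per dataset, and the per-dataset JSON-lines splits are concatenated into the combined data directory. A hedged Python equivalent of that concatenation step (illustrative only; merge_splits is a hypothetical helper, and the script itself just uses cat, which works because the splits are JSON-lines files):

import os

def merge_splits(dataset_name="sgd+tm1+tm2+tm3", task_name="dialogBIO", data_root="data"):
    names = dataset_name.split("+")
    out_dir = os.path.join(data_root, task_name, dataset_name)
    os.makedirs(out_dir, exist_ok=True)
    for split in ["train", "validation", "test"]:
        with open(os.path.join(out_dir, f"{split}.json"), "w", encoding="utf-8") as fout:
            for name in names:
                # each per-dataset file is JSON lines, so plain concatenation is enough
                with open(os.path.join(data_root, task_name, name, f"{split}.json"), encoding="utf-8") as fin:
                    fout.writelines(fin)

merge_splits()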
 import json
 import os
+import random
+from tqdm import tqdm
 
 def main(args):
+    random.seed(45)
     os.makedirs(args.output_dir, exist_ok=True)
     filenames = [f for (_, _, fs) in os.walk(args.input_dir) for f in fs if 'keywords' in f]
     for filename in filenames:
         data = json.load(open(os.path.join(args.input_dir, filename)))
         fout = open(os.path.join(args.output_dir, f"{filename.split('/')[-1].split('_')[1]}.json"), 'w', encoding='utf-8')
-        for dial in data:
+        turn_keywords = [turn['keywords'] for dial in data for turn in dial]
+        random.shuffle(turn_keywords)
+        cnt = 0
+        # keywords_set = {keyword for keywords in turn_keywords_set for keyword in keywords}
+        for dial in tqdm(data):
             context = []
             for i, turn in enumerate(dial):
                 speaker = 'user' if i%2 == 0 else 'system'
-                keywords = ', '.join(turn['keywords'])
+                random.shuffle(turn['keywords'])
+                keywords = ' | '.join(turn['keywords'])
                 utt = turn['utterance']
-                input_seq = '\n'.join([f"{turn['speaker']}: {turn['utt']}" for turn in context]+[f'{speaker}: '])
-                input_seq = f'{keywords}\n{input_seq}'
+                context_seq = '\n'.join([f"{turn['speaker']}: {turn['utt']}" for turn in context]+[f'{speaker}: '])
+                input_seq = f'keywords: {keywords}\n\ncontext: {context_seq}'
                 context.append({'speaker': speaker, 'utt':utt})
                 fout.write(json.dumps({'keywords+context': input_seq, 'response': utt}, ensure_ascii=False)+'\n')
+                # min_neg = len(turn['keywords'])
+                # max_neg = 4 * min_neg
+                # negative_keywords = random.sample(keywords_set, random.randint(min_neg, max_neg))
+                # negative_keywords = random.sample(turn_keywords_set, 1)[0]
+                negative_keywords = turn_keywords[cnt]
+                cnt += 1
+                possible_keywords = turn['keywords'] + list(negative_keywords)
+                random.shuffle(possible_keywords)
+                possible_keywords = ' | '.join(possible_keywords)
+                input_seq = f'possible keywords: {possible_keywords}\n\ncontext: {context_seq}'
+                fout.write(json.dumps({'keywords+context': input_seq, 'response': utt}, ensure_ascii=False)+'\n')
 
 if __name__ == '__main__':
     from argparse import ArgumentParser
...
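After this change every turn yields two pretraining examples: one whose prompt lists the turn's own (shuffled) keywords, and one whose prompt mixes in "noisy" keywords drawn from a randomly chosen other turn, matching the key2gen_shuffle_noisy directories used below. A hedged sketch of the two resulting JSON lines, with toy values (the real keywords come from the keyword files under data/lm/...; nothing here is taken verbatim from the commit):

import json, random

random.seed(45)
gold_keywords = ['cheap', 'hotel', 'north']
noisy_keywords = ['train', 'tuesday']          # keywords borrowed from some other turn
context_seq = 'user: i need a place to stay\nsystem: '
response = 'there are several cheap hotels in the north .'

clean = {'keywords+context': f"keywords: {' | '.join(gold_keywords)}\n\ncontext: {context_seq}",
         'response': response}

possible = gold_keywords + noisy_keywords
random.shuffle(possible)
noisy = {'keywords+context': f"possible keywords: {' | '.join(possible)}\n\ncontext: {context_seq}",
         'response': response}

for example in (clean, noisy):
    print(json.dumps(example, ensure_ascii=False))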
dataset_name="sgd+metalwoz+tm1+tm2+tm3" dataset_name="metalwoz+sgd+tm1+tm2+tm3"
names=$(echo ${dataset_name} | tr "+" "\n") names=$(echo ${dataset_name} | tr "+" "\n")
model_type="gpt" model_type="gpt"
data_dir=data/key2gen/${model_type}/${name}/${dataset_name} data_dir=data/key2gen_shuffle_noisy/${model_type}/${name}/${dataset_name}
mkdir -p ${data_dir} mkdir -p ${data_dir}
train_file="${data_dir}/train.json" train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json" validation_file="${data_dir}/validation.json"
...@@ -9,10 +9,11 @@ test_file="${data_dir}/test.json" ...@@ -9,10 +9,11 @@ test_file="${data_dir}/test.json"
for name in ${names} for name in ${names}
do do
echo "preprocessing ${name}" echo "preprocessing ${name}"
python gen_pretraining_data.py -i data/lm/${name}/${model_type} -o data/keygen/${model_type}/${name} python gen_pretraining_data.py -i data/lm/${name}/${model_type} -o data/key2gen_shuffle_noisy/${model_type}/${name}
if [ "${name}" != "${dataset_name}" ]; then if [ "${name}" != "${dataset_name}" ]; then
cat "data/keygen/gpt/${name}/train.json" >> ${train_file} cat "data/key2gen_shuffle_noisy/gpt/${name}/train.json" >> ${train_file}
cat "data/keygen/gpt/${name}/validation.json" >> ${validation_file} cat "data/key2gen_shuffle_noisy/gpt/${name}/validation.json" >> ${validation_file}
cat "data/keygen/gpt/${name}/test.json" >> ${test_file} cat "data/key2gen_shuffle_noisy/gpt/${name}/test.json" >> ${test_file}
fi fi
done done
python gen_pretraining_data.py -i data/lm/multiwoz21/${model_type} -o data/key2gen_shuffle_noisy/${model_type}/multiwoz21
\ No newline at end of file
 set -e
-n_gpus=2
+n_gpus=1
-task_name="key2gen"
+task_name="key2gen_shuffle_noisy"
 dataset_name="metalwoz+sgd+tm1+tm2+tm3"
 speaker="all"
 model_type="gpt"
@@ -19,7 +19,7 @@ max_target_length=128
 model_name_or_path="t5-small"
 per_device_train_batch_size=128
 per_device_eval_batch_size=128
-gradient_accumulation_steps=4
+gradient_accumulation_steps=8
 lr=1e-3
 num_train_epochs=1
...
@@ -20,7 +20,7 @@ per_device_train_batch_size=128
 per_device_eval_batch_size=128
 gradient_accumulation_steps=4
 lr=1e-3
-num_train_epochs=5
+num_train_epochs=1
 names=$(echo ${dataset_name} | tr "+" "\n")
 mkdir -p ${data_dir}
...