Commit 7bcb6096 authored by zqwerty

add dailydialog

parents 052b8c5f 69ba479e
Showing 639 additions and 108 deletions
@@ -28,6 +28,9 @@ data/schema/dstc8-schema-guided-dialogue-master
data/mdbt/data
data/mdbt/models
data/mdbt/word-vectors
convlab2/**/data
convlab2/**/output
convlab2/**/cache
convlab2/nlg/sclstm/**/resource/*
convlab2/nlg/sclstm/**/resource_usr/*
convlab2/nlg/sclstm/**/sclstm.pt
......
@@ -4,8 +4,10 @@ from tqdm import tqdm
from convlab2.util import load_dataset, load_nlu_data, load_dst_data, load_policy_data, load_nlg_data, load_e2e_data, load_rg_data
from nltk.tokenize import TreebankWordTokenizer, PunktSentenceTokenizer
from collections import Counter
import json_lines
from convlab2.util.unified_datasets_util import create_delex_data
def create_bio_data(dataset, data_dir):
def create_bio_data(dataset, data_dir, args):
data_by_split = load_nlu_data(dataset, speaker='all')
os.makedirs(data_dir, exist_ok=True)
@@ -47,7 +49,7 @@ def create_bio_data(dataset, data_dir):
f.writelines(data)
print('num of spans in utterances', cnt)
def create_dialogBIO_data(dataset, data_dir):
def create_dialogBIO_data(dataset, data_dir, args):
data_by_split = load_nlu_data(dataset, split_to_turn=False)
os.makedirs(data_dir, exist_ok=True)
@@ -94,16 +96,83 @@ def create_dialogBIO_data(dataset, data_dir):
f.writelines(data)
print('num of spans in utterances', cnt)
def create_revert_dialogBIO_data(dataset, data_dir, args):
def tag2da(tokens, tags):
assert len(tokens)==len(tags)
triples = []
i = 0
utt = ''
while i < len(tags):
tag = tags[i]
if tag == 'B':
value = tokens[i]
j = i + 1
while j < len(tags):
next_tag = tags[j]
if next_tag == 'I':
value += ' ' + tokens[j]
i += 1
j += 1
else:
break
triples.append({'intent':'', 'domain':'', 'slot':'', 'value': value, 'start': len(utt), 'end': len(utt)+len(value)})
utt += value + ' '
assert utt[triples[-1]['start']:triples[-1]['end']] == value, print(utt[triples[-1]['start']:triples[-1]['end']],triples[-1])
else:
utt += tokens[i] + ' '
i += 1
utt = utt[:-1]
assert utt == ' '.join(tokens), print(utt, '\n', ' '.join(tokens))
return triples
def dialog2turn(tokens, labels):
turns = []
turn = {'tokens': [], 'tags': []}
i = 0
while i < len(tokens):
token = tokens[i]
if i < len(tokens) - 1 and token in ['user', 'system'] and tokens[i+1] == ':':
turns.append(turn)
turn = {'tokens': [], 'tags': []}
i += 2
continue
turn['tokens'].append(token)
turn['tags'].append(labels[i])
i += 1
turns.pop(0)
for turn in turns:
da = {'binary': [], 'categorical': [], 'non-categorical': []}
da['non-categorical'] = tag2da(turn['tokens'], turn['tags'])
turn['utterance'] = ' '.join(turn['tokens'])
turn['dialogue_acts'] = da
return turns
for data_split in dataset:
infer_output_data_path = os.path.join(args.infer_data_dir, f'{data_split}.json')
for original_dial, bio_dial in zip(dataset[data_split], json_lines.reader(open(infer_output_data_path))):
bio_turns = dialog2turn(bio_dial['tokens'], bio_dial['labels'])
original_dial['turns'] = original_dial['turns'][:len(bio_turns)]
assert len(bio_turns) == len(original_dial['turns']), print(len(bio_turns), len(original_dial['turns']))
for ori_turn, new_turn in zip(original_dial['turns'], bio_turns):
ori_turn['original_utterance'] = ori_turn['utterance']
ori_turn['utterance'] = new_turn['utterance']
ori_turn['original_dialogue_acts'] = ori_turn['dialogue_acts']
ori_turn['dialogue_acts'] = new_turn['dialogue_acts']
dataset, _ = create_delex_data(dataset, delex_func=lambda d,s,v: f'<v>{v}</v>')
os.makedirs(data_dir, exist_ok=True)
json.dump(dataset, open(os.path.join(data_dir, 'data.json'), 'w', encoding='utf-8'), ensure_ascii=False, indent=2)
if __name__ == '__main__':
from argparse import ArgumentParser
parser = ArgumentParser(description="create data for seq2seq training")
parser.add_argument('--tasks', metavar='task_name', nargs='*', choices=['bio', 'dialogBIO'], help='names of tasks')
parser.add_argument('--tasks', metavar='task_name', nargs='*', choices=['bio', 'dialogBIO', 'revert_dialogBIO'], help='names of tasks')
parser.add_argument('--datasets', metavar='dataset_name', nargs='*', help='names of unified datasets')
parser.add_argument('--save_dir', metavar='save_directory', type=str, default='data', help='directory to save the data, default: data/$task_name/$dataset_name')
parser.add_argument('--infer_data_dir', metavar='infer_data_dir', type=str, default=None, help='directory of inference output data, default: None')
args = parser.parse_args()
print(args)
for dataset_name in tqdm(args.datasets, desc='datasets'):
dataset = load_dataset(dataset_name)
for task_name in tqdm(args.tasks, desc='tasks', leave=False):
data_dir = os.path.join(args.save_dir, task_name, dataset_name)
eval(f"create_{task_name}_data")(dataset, data_dir)
eval(f"create_{task_name}_data")(dataset, data_dir, args)
set -e
n_gpus=3
n_gpus=2
task_name="dialogBIO"
dataset_name="multiwoz21"
data_dir="data/${task_name}/${dataset_name}"
@@ -8,14 +8,14 @@ cache_dir="cache"
logging_dir="${output_dir}/runs"
source_column="tokens"
target_column="labels"
model_name_or_path="output/dialogBIO/sgd"
per_device_eval_batch_size=16
model_name_or_path="output/dialogBIO/sgd+tm1+tm2+tm3"
per_device_eval_batch_size=32
python create_data.py --tasks ${task_name} --datasets ${dataset_name} --save_dir "data"
for split in test validation train
do
python -m torch.distributed.launch \
CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} run_token_classification.py \
--task_name ${task_name} \
--train_file ${data_dir}/${split}.json \
......
@@ -135,7 +135,7 @@ class DataTrainingArguments:
metadata={"help": "The number of processes to use for the preprocessing."},
)
max_seq_length: int = field(
default=None,
default=512,
metadata={
"help": "The maximum total input sequence length after tokenization. If set, sequences longer "
"than this will be truncated, sequences shorter will be padded."
......
n_gpus=3
set -e
n_gpus=2
task_name="dialogBIO"
dataset_name="sgd"
dataset_name="sgd+tm1+tm2+tm3"
data_dir="data/${task_name}/${dataset_name}"
output_dir="output/${task_name}/${dataset_name}"
cache_dir="cache"
@@ -18,9 +19,21 @@ lr=2e-5
num_train_epochs=1
metric_for_best_model="f1"
python create_data.py --tasks ${task_name} --datasets ${dataset_name} --save_dir "data"
names=$(echo ${dataset_name} | tr "+" "\n")
mkdir -p ${data_dir}
for name in ${names};
do
echo "preprocessing ${name}"
python create_data.py --tasks ${task_name} --datasets ${name} --save_dir "data"
if [ "${name}" != "${dataset_name}" ]; then
cat "data/${task_name}/${name}/train.json" >> ${train_file}
cat "data/${task_name}/${name}/validation.json" >> ${validation_file}
cat "data/${task_name}/${name}/test.json" >> ${test_file}
fi
done
python -m torch.distributed.launch \
CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} run_token_classification.py \
--task_name ${task_name} \
--train_file ${train_file} \
@@ -39,7 +52,6 @@ python -m torch.distributed.launch \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
......
import json
import os
import random
from tqdm import tqdm
def main(args):
random.seed(45)
os.makedirs(args.output_dir, exist_ok=True)
filenames = [f for (_, _, fs) in os.walk(args.input_dir) for f in fs if 'keywords' in f]
for filename in filenames:
data = json.load(open(os.path.join(args.input_dir, filename)))
fout = open(os.path.join(args.output_dir, f"{filename.split('/')[-1].split('_')[1]}.json"), 'w', encoding='utf-8')
turn_keywords = [turn['keywords'] for dial in data for turn in dial]
random.shuffle(turn_keywords)
cnt = 0
# keywords_set = {keyword for keywords in turn_keywords_set for keyword in keywords}
for dial in tqdm(data):
context = []
for i, turn in enumerate(dial):
speaker = 'user' if i%2 == 0 else 'system'
random.shuffle(turn['keywords'])
keywords = ' | '.join(turn['keywords'])
utt = turn['utterance']
context_seq = '\n'.join([f"{turn['speaker']}: {turn['utt']}" for turn in context]+[f'{speaker}: '])
input_seq = f'keywords: {keywords}\n\ncontext: {context_seq}'
context.append({'speaker': speaker, 'utt':utt})
fout.write(json.dumps({'keywords+context': input_seq, 'response': utt}, ensure_ascii=False)+'\n')
# min_neg = len(turn['keywords'])
# max_neg = 4 * min_neg
# negative_keywords = random.sample(keywords_set, random.randint(min_neg, max_neg))
# negative_keywords = random.sample(turn_keywords_set, 1)[0]
negative_keywords = turn_keywords[cnt]
cnt += 1
possible_keywords = turn['keywords'] + list(negative_keywords)
random.shuffle(possible_keywords)
possible_keywords = ' | '.join(possible_keywords)
input_seq = f'possible keywords: {possible_keywords}\n\ncontext: {context_seq}'
fout.write(json.dumps({'keywords+context': input_seq, 'response': utt}, ensure_ascii=False)+'\n')
if __name__ == '__main__':
from argparse import ArgumentParser
parser = ArgumentParser(description="calculate NLU metrics for unified datasets")
parser.add_argument('--input_dir', '-i', type=str, help='path to the input files')
parser.add_argument('--output_dir', '-o', type=str, help='path to the output files')
args = parser.parse_args()
print(args)
main(args)
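Each turn therefore yields two JSON-lines records: one conditioned on the turn's own keywords, and one "noisy" variant whose candidate list mixes in negative keywords sampled from a random other turn. A hypothetical illustration of the emitted records (keywords and utterance invented for this sketch):

import json

# Hedged illustration of the two records written per turn (invented content).
response = 'I need a cheap train to Leicester.'
clean = {
    'keywords+context': 'keywords: cheap train | leicester\n\ncontext: user: ',
    'response': response,
}
noisy = {
    'keywords+context': 'possible keywords: hotel | leicester | cheap train\n\ncontext: user: ',
    'response': response,
}
for record in (clean, noisy):
    print(json.dumps(record, ensure_ascii=False))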
dataset_name="metalwoz+sgd+tm1+tm2+tm3"
names=$(echo ${dataset_name} | tr "+" "\n")
model_type="gpt"
data_dir=data/key2gen_shuffle_noisy/${model_type}/${name}/${dataset_name}
mkdir -p ${data_dir}
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
for name in ${names}
do
echo "preprocessing ${name}"
python gen_pretraining_data.py -i data/lm/${name}/${model_type} -o data/key2gen_shuffle_noisy/${model_type}/${name}
if [ "${name}" != "${dataset_name}" ]; then
cat "data/key2gen_shuffle_noisy/gpt/${name}/train.json" >> ${train_file}
cat "data/key2gen_shuffle_noisy/gpt/${name}/validation.json" >> ${validation_file}
cat "data/key2gen_shuffle_noisy/gpt/${name}/test.json" >> ${test_file}
fi
done
python gen_pretraining_data.py -i data/lm/multiwoz21/${model_type} -o data/key2gen_shuffle_noisy/${model_type}/multiwoz21
\ No newline at end of file
model_type=dialogpt
dataset_name=multiwoz21
model_name=dialogpt-large
data_dir="data/lm/${dataset_name}/${model_type}"
word_loss_file="${data_dir}/${model_name}_${dataset_name}_word_loss.json"
keywords_num=5
keywords_ratio=1
keywords_th=0
task_name="lm"
dataset_name=$1
model_type="gpt"
data_dir="data/${task_name}/${dataset_name}/${model_type}"
model_name_or_path="gpt2-large"
keywords_num=100
keywords_ratio=0.3
keywords_th_ratio=0
stopwords=True
output_file="${data_dir}/${dataset_name}_keywords_${model_name}_topk_${keywords_num}_ratio_${keywords_ratio}_th_${keywords_th}_stopwords_${stopwords}.json"
for data_split in validation test train
do
word_loss_file="${data_dir}/${model_name_or_path}_${dataset_name}_${data_split}_word_loss.json"
output_file="${data_dir}/${dataset_name}_${data_split}_keywords_${model_name_or_path}_topk_${keywords_num}_ratio_${keywords_ratio}_th_${keywords_th_ratio}_stopwords_${stopwords}.json"
python lmloss2keywords.py \
--model_type ${model_type} \
--word_loss_file ${word_loss_file} \
--keywords_num ${keywords_num} \
--keywords_ratio ${keywords_ratio} \
--keywords_th ${keywords_th} \
--keywords_th_ratio ${keywords_th_ratio} \
--stopwords ${stopwords} \
--output_file ${output_file}
\ No newline at end of file
done
\ No newline at end of file
set -e
n_gpus=1
task_name="lm"
dataset_name="multiwoz21"
model_type="dialogpt"
dataset_name=$1
model_type="gpt"
data_dir="data/${task_name}/${dataset_name}/${model_type}"
output_dir="output/${task_name}/${dataset_name}/${model_type}"
cache_dir="../cache"
validation_file="${data_dir}/validation.json"
source_column="dialogue"
max_length=512
model_name_or_path="microsoft/DialoGPT-large"
per_device_eval_batch_size=4
dump_eval_loss_to="${data_dir}/dialogpt-large_${dataset_name}_token_loss.json"
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type dialogpt
python ../run_clm.py \
--dump_eval_loss_to ${dump_eval_loss_to}\
--model_name_or_path ${model_name_or_path} \
--output_dir ${data_dir} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--max_length ${max_length} \
--do_eval \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python lmloss2keywords.py --token_loss_file ${dump_eval_loss_to} --model_type ${model_type}
dump_eval_loss_to="${data_dir}/dialogpt-large-mwoz_${dataset_name}_token_loss.json"
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type dialogpt
python ../run_clm.py \
--dump_eval_loss_to ${dump_eval_loss_to}\
--model_name_or_path ${output_dir} \
--output_dir ${data_dir} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--max_length ${max_length} \
--do_eval \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python lmloss2keywords.py --token_loss_file ${dump_eval_loss_to} --model_type ${model_type}
model_type="gpt"
data_dir="data/${task_name}/${dataset_name}/${model_type}"
validation_file="${data_dir}/validation.json"
model_name_or_path="gpt2-large"
dump_eval_loss_to="${data_dir}/gpt2-large_${dataset_name}_token_loss.json"
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type gpt
per_device_eval_batch_size=16
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type ${model_type}
for data_split in validation test train
do
validation_file="${data_dir}/${data_split}.json"
dump_eval_loss_to="${data_dir}/${model_name_or_path}_${dataset_name}_${data_split}_token_loss.json"
python ../run_clm.py \
--dump_eval_loss_to ${dump_eval_loss_to}\
--model_name_or_path ${model_name_or_path} \
@@ -63,3 +30,4 @@ python ../run_clm.py \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
python lmloss2keywords.py --token_loss_file ${dump_eval_loss_to} --model_type ${model_type}
done
@@ -80,8 +80,36 @@ def main(args):
stop_words = set(stopwords.words('english'))
if args.keywords_th_ratio > 0:
losses = [loss for x in word_loss_list for word, loss in zip(x['words'], x['losses']) if not any([w.lower() in stop_words for w in word_tokenize(word)])]
loss_th = sorted(losses, reverse=True)[round(args.keywords_th_ratio*len(losses))]
print(f'loss th for top {args.keywords_th_ratio*100}%: {loss_th}')
else:
loss_th = 0
def keywords_filter(word_loss_pairs):
candidate_indexes = []
for i, word_loss_pair in enumerate(word_loss_pairs):
if args.stopwords and any([w.lower() in stop_words for w in word_tokenize(word_loss_pair[0])]):
continue
if word_loss_pair[1] <= loss_th:
continue
candidate_indexes.append(i)
topk = min(round(args.keywords_ratio*len(word_loss_pairs)), args.keywords_num)
topk_indexes = sorted(candidate_indexes, key=lambda x: word_loss_pairs[x][1], reverse=True)[:topk]
topk_indexes = sorted(topk_indexes)
keywords = []
for i, index in enumerate(topk_indexes):
if i > 0 and index == topk_indexes[i-1] + 1:
keywords[-1]+= ' '+word_loss_pairs[index][0]
else:
keywords.append(word_loss_pairs[index][0])
return keywords
dialogs = []
for item in word_loss_list:
for item in tqdm(word_loss_list):
words = item['words']
losses = item['losses']
turns = []
@@ -90,11 +118,9 @@
if word == '<|endoftext|>':
# switch turn
turn['utterance'] = ' '.join(turn['words'])
turn['keywords'] = list(zip(turn['words'], turn['losses']))
if args.stopwords:
turn['keywords'] = [x for x in turn['keywords'] if not any([w.lower() in stop_words for w in word_tokenize(x[0])])]
turn['keywords'] = sorted(turn['keywords'], key=lambda x: x[1], reverse=True)
turn['keywords'] = [x for x in turn['keywords'] if x[1] > args.keywords_th][:min(round(args.keywords_ratio*len(turn['keywords'])), args.keywords_num)]
keywords = keywords_filter(list(zip(turn['words'], turn['losses'])))
turn['keywords'] = keywords
# turn['keywords'] = ' | '.join([x[0] for x in keywords])
turn.pop('words')
turn.pop('losses')
turns.append(turn)
@@ -116,7 +142,7 @@ if __name__ == '__main__':
parser.add_argument('--output_file', '-o', type=str, help='path to the output file')
parser.add_argument('--keywords_num', '-n', type=int, default=100, help='how many words in an utterance serve as keywords')
parser.add_argument('--keywords_ratio', '-r', type=float, default=1.0, help='how many words (in ratio) in an utterance serve as keywords')
parser.add_argument('--keywords_th', '-th', type=float, default=0., help='loss threshold for the keywords')
parser.add_argument('--keywords_th_ratio', '-th', type=float, default=0., help='loss threshold for the keywords, ratio of all word losses')
parser.add_argument('--stopwords', '-s', type=lambda x: bool(eval(x)), default=True, help='filter out stopwords')
args = parser.parse_args()
print(args)
......
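The refactored keywords_filter replaces the old per-turn sort-and-threshold: it skips stopwords and words at or below loss_th, keeps the top-k highest-loss words (k bounded by both keywords_num and keywords_ratio), and merges adjacent survivors into multi-word keywords. A self-contained sketch with toy word/loss pairs (stopword filtering omitted; the variable names stand in for the script's arguments):

# Hedged sketch of the keywords_filter selection (toy data, no NLTK).
pairs = [('i', 0.1), ('need', 0.4), ('cheap', 2.1), ('thai', 2.5), ('food', 1.8)]
loss_th, keywords_num, keywords_ratio = 0.5, 3, 1.0

candidates = [i for i, (word, loss) in enumerate(pairs) if loss > loss_th]
topk = min(round(keywords_ratio * len(pairs)), keywords_num)
kept = sorted(sorted(candidates, key=lambda i: pairs[i][1], reverse=True)[:topk])

keywords = []
for n, i in enumerate(kept):
    if n > 0 and i == kept[n - 1] + 1:
        keywords[-1] += ' ' + pairs[i][0]  # adjacent indexes merge into one phrase
    else:
        keywords.append(pairs[i][0])
print(keywords)  # ['cheap thai food']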
set -e
n_gpus=1
task_name="key2gen"
dataset_name="multiwoz21"
speaker="all"
model_type="gpt"
data_dir="data/${task_name}/${model_type}/${dataset_name}"
output_dir="output/${task_name}/${model_type}/${dataset_name}"
cache_dir="../../t5/cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
source_column="keywords+context"
target_column="response"
truncation_side="left"
max_source_length=512
max_target_length=128
model_name_or_path="output/key2gen/gpt/metalwoz+sgd+tm1+tm2+tm3"
per_device_train_batch_size=128
per_device_eval_batch_size=128
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=1
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../../t5/run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_predict \
--predict_with_generate \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
@@ -19,7 +19,7 @@ gradient_accumulation_steps=4
lr=5e-5
num_train_epochs=3
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type dialogpt
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type ${model_type}
python ../run_clm.py \
--model_name_or_path ${model_name_or_path} \
......
set -e
n_gpus=1
task_name="lm"
dataset_name="multiwoz21"
model_type="gpt"
data_dir="data/${task_name}/${dataset_name}/${model_type}"
output_dir="output/${task_name}/${dataset_name}/${model_type}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
source_column="dialogue"
max_length=512
model_name_or_path="gpt2-large"
per_device_train_batch_size=16
per_device_eval_batch_size=16
gradient_accumulation_steps=4
lr=5e-5
num_train_epochs=3
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type ${model_type}
python ../run_clm.py \
--model_name_or_path ${model_name_or_path} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--max_length ${max_length} \
--do_train \
--do_eval \
--save_strategy epoch \
--evaluation_strategy epoch \
--load_best_model_at_end \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--gradient_checkpointing
set -e
n_gpus=1
task_name="key2gen_shuffle_noisy"
dataset_name="metalwoz+sgd+tm1+tm2+tm3"
speaker="all"
model_type="gpt"
data_dir="data/${task_name}/${model_type}/${dataset_name}"
output_dir="output/${task_name}/${model_type}/${dataset_name}"
cache_dir="../../t5/cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
source_column="keywords+context"
target_column="response"
truncation_side="left"
max_source_length=512
max_target_length=128
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=128
gradient_accumulation_steps=8
lr=1e-3
num_train_epochs=1
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../../t5/run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--do_eval \
--do_predict \
--save_strategy epoch \
--evaluation_strategy epoch \
--load_best_model_at_end \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
@@ -87,9 +87,9 @@ def create_nlg_data(dataset, data_dir, args):
dialogue_acts_seq = serialize_dialogue_acts(sample['dialogue_acts'])
if args.context_window_size>0:
context = '\n'.join([f"{turn['speaker']}: {turn['utterance']}" for turn in sample['context']]+[f'{sample["speaker"]}: '])
context = f'{dialogue_acts_seq}\n{context}'
context = f'{dialogue_acts_seq}\n\n{context}'
else:
context = f'{dialogue_acts_seq}\n{sample["speaker"]}: '
context = f'{dialogue_acts_seq}\n\n{sample["speaker"]}: '
assert equal_da_seq(sample['dialogue_acts'], dialogue_acts_seq), print(sample['dialogue_acts'], dialogue_acts_seq, deserialize_dialogue_acts(dialogue_acts_seq))
data.append(json.dumps({'context+da': context, 'response': sample['utterance']}, ensure_ascii=False)+'\n')
@@ -138,14 +138,19 @@ if __name__ == '__main__':
parser.add_argument('--speaker', '-s', type=str, choices=['user', 'system', 'all'], help='speaker(s)')
parser.add_argument('--context_window_size', '-c', type=int, default=0, help='how many contextual utterances are considered')
parser.add_argument('--len_tokenizer', '-l', type=str, default=None, help='name or path of tokenizer that used to get seq len')
parser.add_argument('--ratio', '-r', type=float, default=None, help='how much data is used for training and evaluation')
parser.add_argument('--dial_ids_order', '-o', type=int, default=None, help='which data order is used for experiments')
args = parser.parse_args()
print(args)
if args.len_tokenizer:
tokenizer = AutoTokenizer.from_pretrained(args.len_tokenizer)
for dataset_name in tqdm(args.datasets, desc='datasets'):
dataset = load_dataset(dataset_name)
dataset = load_dataset(dataset_name, args.dial_ids_order)
if args.ratio:
dataset['train'] = dataset['train'][:round(len(dataset['train'])*args.ratio)]
dataset['validation'] = dataset['validation'][:round(len(dataset['validation'])*args.ratio)]
for task_name in tqdm(args.tasks, desc='tasks', leave=False):
data_dir = os.path.join('data', task_name, dataset_name)
data_dir = os.path.join('data', task_name, (dataset_name if not args.ratio else f'{dataset_name}_{args.ratio}_order{args.dial_ids_order}'))
data_by_split = eval(f"create_{task_name}_data")(dataset, data_dir, args)
if args.len_tokenizer:
get_max_len(data_by_split, tokenizer)
import logging
import os
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig
from convlab2.dst.dst import DST
from convlab2.base_models.t5.dst.serialization import deserialize_dialogue_state
from convlab2.util.custom_util import model_downloader
class T5DST(DST):
def __init__(self, speaker, context_window_size, model_name_or_path, model_file=None, device='cuda'):
assert speaker in ['user', 'system']
assert context_window_size > 0
self.speaker = speaker
self.opponent = 'system' if speaker == 'user' else 'user'
self.context_window_size = context_window_size
model_dir = os.path.dirname(os.path.abspath(__file__))
if not os.path.exists(model_name_or_path):
model_downloader(model_dir, model_file)
self.config = AutoConfig.from_pretrained(model_name_or_path)
self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path, config=self.config)
self.model.eval()
self.device = device if torch.cuda.is_available() else "cpu"
self.model.to(self.device)
logging.info("T5DST loaded")
def update(self, context):
if len(context) > 0 and type(context[0]) is list and len(context[0]) > 1:
context = [item[1] for item in context]
context = context[-self.context_window_size:]
input_seq = '\n'.join([f"{self.opponent if (i % 2) == (len(context) % 2) else self.speaker}: {utt}" for i, utt in enumerate(context)])
# print(input_seq)
input_seq = self.tokenizer(input_seq, return_tensors="pt").to(self.device)
# print(input_seq)
output_seq = self.model.generate(**input_seq, max_length=256)
# print(output_seq)
output_seq = self.tokenizer.decode(output_seq[0], skip_special_tokens=True)
# print(output_seq)
state = deserialize_dialogue_state(output_seq.strip())
return state
if __name__ == '__main__':
contexts = [
["I would like a taxi from Saint John's college to Pizza Hut Fen Ditton."],
["I would like a taxi from Saint John's college to Pizza Hut Fen Ditton.",
"What time do you want to leave and what time do you want to arrive by?",
"I want to leave after 17:15."],
["I would like a taxi from Saint John's college to Pizza Hut Fen Ditton.",
"What time do you want to leave and what time do you want to arrive by?",
"I want to leave after 17:15.",
"Booking completed! your taxi will be blue honda Contact number is 07218068540",
"Thank you for all the help! I appreciate it."],
["I would like a taxi from Saint John's college to Pizza Hut Fen Ditton.",
"What time do you want to leave and what time do you want to arrive by?",
"I want to leave after 17:15.",
"Booking completed! your taxi will be blue honda Contact number is 07218068540",
"Thank you for all the help! I appreciate it.",
"You are welcome. Is there anything else I can help you with today?",
"No, I am all set. Have a nice day. Bye."],
]
dst = T5DST(speaker='user', context_window_size=100, model_name_or_path='output/dst/multiwoz21/user/context_100')
for context in contexts:
print(dst.update(context))
print()
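The speaker labels in update are derived purely from position: whichever parity the last context utterance has is assigned to the tracked speaker, so the prompt always ends with that side. A quick standalone check of the indexing expression (toy utterances, not from the commit):

# Hedged check of the parity-based speaker labelling used in T5DST.update.
speaker, opponent = 'user', 'system'
context = ['I need a taxi.', 'Where to?', 'To the station.']
labels = [opponent if (i % 2) == (len(context) % 2) else speaker
          for i in range(len(context))]
print(list(zip(labels, context)))
# [('user', 'I need a taxi.'), ('system', 'Where to?'), ('user', 'To the station.')]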
@@ -6,7 +6,7 @@ from convlab2.base_models.t5.dst.serialization import deserialize_dialogue_state
def merge(dataset_name, speaker, save_dir, context_window_size, predict_result):
assert os.path.exists(predict_result)
dataset = load_dataset(dataset_name)
dataset = load_dataset(dataset_name, args.dial_ids_order)
data = load_dst_data(dataset, data_split='test', speaker=speaker, use_context=context_window_size>0, context_window_size=context_window_size)['test']
if save_dir is None:
@@ -29,6 +29,7 @@ if __name__ == '__main__':
parser.add_argument('--save_dir', type=str, help='merged data will be saved as $save_dir/predictions.json. default: on the same directory as predict_result')
parser.add_argument('--context_window_size', '-c', type=int, default=0, help='how many contextual utterances are considered')
parser.add_argument('--predict_result', '-p', type=str, required=True, help='path to the output file generated_predictions.json')
parser.add_argument('--dial_ids_order', '-o', type=int, default=None, help='which data order is used for experiments')
args = parser.parse_args()
print(args)
merge(args.dataset, args.speaker, args.save_dir, args.context_window_size, args.predict_result)
@@ -24,14 +24,12 @@ gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=10
python ../create_data.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} -l t5-small
python ../create_data.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size}
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
python ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
@@ -40,9 +38,9 @@ python -m torch.distributed.launch \
--model_name_or_path ${model_name_or_path} \
--do_train \
--do_eval \
--do_predict \
--save_strategy epoch \
--evaluation_strategy epoch \
--save_total_limit 3 \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
@@ -58,8 +56,7 @@ python -m torch.distributed.launch \
--adafactor \
--gradient_checkpointing
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../run_seq2seq.py \
python ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
@@ -76,7 +73,14 @@ python -m torch.distributed.launch \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json
......
n_gpus=1
task_name="dst"
dataset_name=$1
speaker="user"
context_window_size=100
data_dir="data/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}_${ratio}_order${dial_ids_order}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="dst_metric.py"
metric_for_best_model="accuracy"
source_column="context"
target_column="state_seq"
truncation_side="left"
max_source_length=1024
max_target_length=512
model_name_or_path="t5-small"
per_device_train_batch_size=64
per_device_eval_batch_size=64
gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=100
python ../create_data.py -t ${task_name} -d ${dataset_name} -s ${speaker} -c ${context_window_size} -r ${ratio} -o ${dial_ids_order}
python ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--do_eval \
--save_strategy epoch \
--evaluation_strategy epoch \
--save_total_limit 3 \
--early_stopping_patience 10 \
--prediction_loss_only \
--load_best_model_at_end \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
python ../run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${output_dir} \
--do_predict \
--predict_with_generate \
--metric_name_or_path ${metric_name_or_path} \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
python merge_predict_res.py -d ${dataset_name} -s ${speaker} -c ${context_window_size} -p ${output_dir}/generated_predictions.json -o ${dial_ids_order}
python ../../../dst/evaluate_unified_datasets.py -p ${output_dir}/predictions.json
n_gpus=1
task_name="dst"
dataset_name="sgd+tm1+tm2+tm3"
speaker="user"
context_window_size=100
data_dir="data/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
output_dir="output/${task_name}/${dataset_name}/${speaker}/context_${context_window_size}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
metric_name_or_path="dst_metric.py"
metric_for_best_model="accuracy"
source_column="context"
target_column="state_seq"
truncation_side="left"
max_source_length=1024
max_target_length=512
model_name_or_path="t5-small"
per_device_train_batch_size=64
per_device_eval_batch_size=64
gradient_accumulation_steps=2
lr=1e-3
num_train_epochs=1
names=$(echo ${dataset_name} | tr "+" "\n")
mkdir -p ${data_dir}
for name in ${names};
do
echo "preprocessing ${name}"
python ../create_data.py -t ${task_name} -d ${name} -s ${speaker} -c ${context_window_size}
if [ "${name}" != "${dataset_name}" ]; then
cat "data/${task_name}/${name}/${speaker}/context_${context_window_size}/train.json" >> ${train_file}
cat "data/${task_name}/${name}/${speaker}/context_${context_window_size}/validation.json" >> ${validation_file}
cat "data/${task_name}/${name}/${speaker}/context_${context_window_size}/test.json" >> ${test_file}
fi
done
python ../run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--do_eval \
--save_strategy epoch \
--evaluation_strategy epoch \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing