Commit 653b6238 authored by zqwerty

update base model scripts

parent efeb62b9
@@ -14,6 +14,8 @@ __pycache__
.vscode
# data
data/unified_datasets/multiwoz21/MultiWOZ_2.1.zip
data/unified_datasets/tm1/master.zip
data/**/train.json
data/**/val.json
data/**/test.json
@@ -25,6 +27,9 @@ data/schema/dstc8-schema-guided-dialogue-master
data/mdbt/data
data/mdbt/models
data/mdbt/word-vectors
convlab2/**/data
convlab2/**/output
convlab2/**/cache
convlab2/nlg/sclstm/**/resource/*
convlab2/nlg/sclstm/**/resource_usr/*
convlab2/nlg/sclstm/**/sclstm.pt
......
@@ -4,8 +4,10 @@ from tqdm import tqdm
from convlab2.util import load_dataset, load_nlu_data, load_dst_data, load_policy_data, load_nlg_data, load_e2e_data, load_rg_data
from nltk.tokenize import TreebankWordTokenizer, PunktSentenceTokenizer
from collections import Counter
import json_lines
from convlab2.util.unified_datasets_util import create_delex_data
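# json_lines is used below to stream the dialogBIO tagger's JSON-lines inference output,
# and create_delex_data delexicalizes the values recovered by create_revert_dialogBIO_data.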
def create_bio_data(dataset, data_dir):
def create_bio_data(dataset, data_dir, args):
data_by_split = load_nlu_data(dataset, speaker='all')
os.makedirs(data_dir, exist_ok=True)
@@ -47,7 +49,7 @@ def create_bio_data(dataset, data_dir):
f.writelines(data)
print('num of spans in utterances', cnt)
def create_dialogBIO_data(dataset, data_dir):
def create_dialogBIO_data(dataset, data_dir, args):
data_by_split = load_nlu_data(dataset, split_to_turn=False)
os.makedirs(data_dir, exist_ok=True)
@@ -94,16 +96,83 @@ def create_dialogBIO_data(dataset, data_dir):
f.writelines(data)
print('num of spans in utterances', cnt)
def create_revert_dialogBIO_data(dataset, data_dir, args):
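    # Rebuild per-turn utterances and span-only (non-categorical) dialogue acts from the
    # token/BIO-label output of a trained dialogBIO tagger, then delexicalize the recovered
    # values and dump the result to data.json.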
    def tag2da(tokens, tags):
        assert len(tokens)==len(tags)
        triples = []
        i = 0
        utt = ''
        while i < len(tags):
            tag = tags[i]
            if tag == 'B':
                value = tokens[i]
                j = i + 1
                while j < len(tags):
                    next_tag = tags[j]
                    if next_tag == 'I':
                        value += ' ' + tokens[j]
                        i += 1
                        j += 1
                    else:
                        break
                triples.append({'intent':'', 'domain':'', 'slot':'', 'value': value, 'start': len(utt), 'end': len(utt)+len(value)})
                utt += value + ' '
                assert utt[triples[-1]['start']:triples[-1]['end']] == value, print(utt[triples[-1]['start']:triples[-1]['end']],triples[-1])
            else:
                utt += tokens[i] + ' '
            i += 1
        utt = utt[:-1]
        assert utt == ' '.join(tokens), print(utt, '\n', ' '.join(tokens))
        return triples
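    # Illustrative example (not from the repo): tokens ['i', 'want', 'cheap', 'food'] with
    # tags ['O', 'O', 'B', 'I'] yield one triple {'value': 'cheap food', 'start': 7, 'end': 17},
    # where start/end are character offsets into the space-joined utterance 'i want cheap food'.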
    def dialog2turn(tokens, labels):
        turns = []
        turn = {'tokens': [], 'tags': []}
        i = 0
        while i < len(tokens):
            token = tokens[i]
            if i < len(tokens) - 1 and token in ['user', 'system'] and tokens[i+1] == ':':
                turns.append(turn)
                turn = {'tokens': [], 'tags': []}
                i += 2
                continue
            turn['tokens'].append(token)
            turn['tags'].append(labels[i])
            i += 1
        turns.pop(0)
        for turn in turns:
            da = {'binary': [], 'categorical': [], 'non-categorical': []}
            da['non-categorical'] = tag2da(turn['tokens'], turn['tags'])
            turn['utterance'] = ' '.join(turn['tokens'])
            turn['dialogue_acts'] = da
        return turns
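    # dialog2turn splits the flattened "user : ... system : ..." token sequence on the speaker
    # markers and decodes each turn's tags with tag2da. Note that tokens after the final speaker
    # marker are never appended, so the last turn is dropped; the caller below truncates the
    # original turns to match.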
    for data_split in dataset:
        infer_output_data_path = os.path.join(args.infer_data_dir, f'{data_split}.json')
        for original_dial, bio_dial in zip(dataset[data_split], json_lines.reader(open(infer_output_data_path))):
            bio_turns = dialog2turn(bio_dial['tokens'], bio_dial['labels'])
            original_dial['turns'] = original_dial['turns'][:len(bio_turns)]
            assert len(bio_turns) == len(original_dial['turns']), print(len(bio_turns), len(original_dial['turns']))
            for ori_turn, new_turn in zip(original_dial['turns'], bio_turns):
                ori_turn['original_utterance'] = ori_turn['utterance']
                ori_turn['utterance'] = new_turn['utterance']
                ori_turn['original_dialogue_acts'] = ori_turn['dialogue_acts']
                ori_turn['dialogue_acts'] = new_turn['dialogue_acts']

    dataset, _ = create_delex_data(dataset, delex_func=lambda d,s,v: f'<v>{v}</v>')
    os.makedirs(data_dir, exist_ok=True)
    json.dump(dataset, open(os.path.join(data_dir, 'data.json'), 'w', encoding='utf-8'), ensure_ascii=False, indent=2)
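# Example invocation (illustrative; --infer_data_dir must point at the directory holding the
# dialogBIO inference output, one {split}.json JSON-lines file per data split):
#   python create_data.py --tasks revert_dialogBIO --datasets multiwoz21 --save_dir data \
#       --infer_data_dir <dialogBIO_inference_output_dir>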
if __name__ == '__main__':
    from argparse import ArgumentParser
    parser = ArgumentParser(description="create data for seq2seq training")
    parser.add_argument('--tasks', metavar='task_name', nargs='*', choices=['bio', 'dialogBIO'], help='names of tasks')
    parser.add_argument('--tasks', metavar='task_name', nargs='*', choices=['bio', 'dialogBIO', 'revert_dialogBIO'], help='names of tasks')
    parser.add_argument('--datasets', metavar='dataset_name', nargs='*', help='names of unified datasets')
    parser.add_argument('--save_dir', metavar='save_directory', type=str, default='data', help='directory to save the data, default: data/$task_name/$dataset_name')
    parser.add_argument('--infer_data_dir', metavar='infer_data_dir', type=str, default=None, help='directory of inference output data, default: None')
    args = parser.parse_args()
    print(args)
    for dataset_name in tqdm(args.datasets, desc='datasets'):
        dataset = load_dataset(dataset_name)
        for task_name in tqdm(args.tasks, desc='tasks', leave=False):
            data_dir = os.path.join(args.save_dir, task_name, dataset_name)
            eval(f"create_{task_name}_data")(dataset, data_dir)
            eval(f"create_{task_name}_data")(dataset, data_dir, args)
set -e
n_gpus=3
n_gpus=2
task_name="dialogBIO"
dataset_name="multiwoz21"
data_dir="data/${task_name}/${dataset_name}"
@@ -8,14 +8,14 @@ cache_dir="cache"
logging_dir="${output_dir}/runs"
source_column="tokens"
target_column="labels"
model_name_or_path="output/dialogBIO/sgd"
per_device_eval_batch_size=16
model_name_or_path="output/dialogBIO/sgd+tm1+tm2+tm3"
per_device_eval_batch_size=32
python create_data.py --tasks ${task_name} --datasets ${dataset_name} --save_dir "data"
for split in test validation train
do
python -m torch.distributed.launch \
CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} run_token_classification.py \
--task_name ${task_name} \
--train_file ${data_dir}/${split}.json \
......
@@ -135,7 +135,7 @@ class DataTrainingArguments:
metadata={"help": "The number of processes to use for the preprocessing."},
)
max_seq_length: int = field(
default=None,
default=512,
metadata={
"help": "The maximum total input sequence length after tokenization. If set, sequences longer "
"than this will be truncated, sequences shorter will be padded."
......
n_gpus=3
set -e
n_gpus=2
task_name="dialogBIO"
dataset_name="sgd"
dataset_name="sgd+tm1+tm2+tm3"
data_dir="data/${task_name}/${dataset_name}"
output_dir="output/${task_name}/${dataset_name}"
cache_dir="cache"
@@ -18,9 +19,21 @@ lr=2e-5
num_train_epochs=1
metric_for_best_model="f1"
python create_data.py --tasks ${task_name} --datasets ${dataset_name} --save_dir "data"
names=$(echo ${dataset_name} | tr "+" "\n")
mkdir -p ${data_dir}
for name in ${names};
do
echo "preprocessing ${name}"
python create_data.py --tasks ${task_name} --datasets ${name} --save_dir "data"
if [ "${name}" != "${dataset_name}" ]; then
cat "data/${task_name}/${name}/train.json" >> ${train_file}
cat "data/${task_name}/${name}/validation.json" >> ${validation_file}
cat "data/${task_name}/${name}/test.json" >> ${test_file}
fi
done
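# Note: concatenating the per-dataset splits with `cat` assumes each file produced by
# create_data.py is JSON Lines (one example per line), so the combined file stays valid.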
python -m torch.distributed.launch \
CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} run_token_classification.py \
--task_name ${task_name} \
--train_file ${train_file} \
@@ -39,7 +52,6 @@ python -m torch.distributed.launch \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
......
import json
import os
import random
from tqdm import tqdm
def main(args):
    random.seed(45)
    os.makedirs(args.output_dir, exist_ok=True)
    filenames = [f for (_, _, fs) in os.walk(args.input_dir) for f in fs if 'keywords' in f]
    for filename in filenames:
        data = json.load(open(os.path.join(args.input_dir, filename)))
        fout = open(os.path.join(args.output_dir, f"{filename.split('/')[-1].split('_')[1]}.json"), 'w', encoding='utf-8')
        for dial in data:
        turn_keywords = [turn['keywords'] for dial in data for turn in dial]
        random.shuffle(turn_keywords)
        cnt = 0
        # keywords_set = {keyword for keywords in turn_keywords_set for keyword in keywords}
        for dial in tqdm(data):
            context = []
            for i, turn in enumerate(dial):
                speaker = 'user' if i%2 == 0 else 'system'
                keywords = ', '.join(turn['keywords'])
                random.shuffle(turn['keywords'])
                keywords = ' | '.join(turn['keywords'])
                utt = turn['utterance']
                input_seq = '\n'.join([f"{turn['speaker']}: {turn['utt']}" for turn in context]+[f'{speaker}: '])
                input_seq = f'{keywords}\n{input_seq}'
                context_seq = '\n'.join([f"{turn['speaker']}: {turn['utt']}" for turn in context]+[f'{speaker}: '])
                input_seq = f'keywords: {keywords}\n\ncontext: {context_seq}'
                context.append({'speaker': speaker, 'utt':utt})
                fout.write(json.dumps({'keywords+context': input_seq, 'response': utt}, ensure_ascii=False)+'\n')
                # min_neg = len(turn['keywords'])
                # max_neg = 4 * min_neg
                # negative_keywords = random.sample(keywords_set, random.randint(min_neg, max_neg))
                # negative_keywords = random.sample(turn_keywords_set, 1)[0]
                negative_keywords = turn_keywords[cnt]
                cnt += 1
                possible_keywords = turn['keywords'] + list(negative_keywords)
                random.shuffle(possible_keywords)
                possible_keywords = ' | '.join(possible_keywords)
                input_seq = f'possible keywords: {possible_keywords}\n\ncontext: {context_seq}'
                fout.write(json.dumps({'keywords+context': input_seq, 'response': utt}, ensure_ascii=False)+'\n')
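                # Each turn thus yields two JSON-lines records: one conditioned on the (shuffled) gold
                # keywords only, and one on gold keywords mixed with keywords drawn from a shuffled pool
                # of all turns' keywords ("noisy"), e.g. (illustrative):
                #   {"keywords+context": "possible keywords: k1 | k2\n\ncontext: user: ...", "response": "..."}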
if __name__ == '__main__':
    from argparse import ArgumentParser
......
dataset_name="sgd+metalwoz+tm1+tm2+tm3"
dataset_name="metalwoz+sgd+tm1+tm2+tm3"
names=$(echo ${dataset_name} | tr "+" "\n")
model_type="gpt"
data_dir=data/key2gen/${model_type}/${name}/${dataset_name}
data_dir=data/key2gen_shuffle_noisy/${model_type}/${name}/${dataset_name}
mkdir -p ${data_dir}
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
@@ -9,10 +9,11 @@ test_file="${data_dir}/test.json"
for name in ${names}
do
echo "preprocessing ${name}"
python gen_pretraining_data.py -i data/lm/${name}/${model_type} -o data/keygen/${model_type}/${name}
python gen_pretraining_data.py -i data/lm/${name}/${model_type} -o data/key2gen_shuffle_noisy/${model_type}/${name}
if [ "${name}" != "${dataset_name}" ]; then
cat "data/keygen/gpt/${name}/train.json" >> ${train_file}
cat "data/keygen/gpt/${name}/validation.json" >> ${validation_file}
cat "data/keygen/gpt/${name}/test.json" >> ${test_file}
cat "data/key2gen_shuffle_noisy/gpt/${name}/train.json" >> ${train_file}
cat "data/key2gen_shuffle_noisy/gpt/${name}/validation.json" >> ${validation_file}
cat "data/key2gen_shuffle_noisy/gpt/${name}/test.json" >> ${test_file}
fi
done
python gen_pretraining_data.py -i data/lm/multiwoz21/${model_type} -o data/key2gen_shuffle_noisy/${model_type}/multiwoz21
\ No newline at end of file
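# gen_pretraining_data.py collects the *keywords* JSON files under -i (presumably input_dir in the
# truncated argparse block) and writes one JSON-lines file per split under -o (output_dir); the last
# line above additionally preprocesses multiwoz21 into the same key2gen_shuffle_noisy format.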
set -e
n_gpus=2
task_name="key2gen"
n_gpus=1
task_name="key2gen_shuffle_noisy"
dataset_name="metalwoz+sgd+tm1+tm2+tm3"
speaker="all"
model_type="gpt"
@@ -19,7 +19,7 @@ max_target_length=128
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=128
gradient_accumulation_steps=4
gradient_accumulation_steps=8
lr=1e-3
num_train_epochs=1
......
@@ -20,7 +20,7 @@ per_device_train_batch_size=128
per_device_eval_batch_size=128
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=5
num_train_epochs=1
names=$(echo ${dataset_name} | tr "+" "\n")
mkdir -p ${data_dir}
......