Commit aee0a26e authored by zqwerty

Extract keywords using gpt2-large and train t5-key2gen on metalwoz+sgd+tm1+tm2+tm3

parent df0c81b0
Showing with 272 additions and 81 deletions
import json
import os


def main(args):
    os.makedirs(args.output_dir, exist_ok=True)
    # collect every keyword file under the input directory
    filenames = [f for (_, _, fs) in os.walk(args.input_dir) for f in fs if 'keywords' in f]
    for filename in filenames:
        with open(os.path.join(args.input_dir, filename)) as fin:
            data = json.load(fin)
        # file names follow "{dataset}_{split}_keywords_...", so the second token
        # is the data split (train/validation/test)
        data_split = filename.split('_')[1]
        with open(os.path.join(args.output_dir, f'{data_split}.json'), 'w', encoding='utf-8') as fout:
            for dial in data:
                context = []
                for i, turn in enumerate(dial):
                    speaker = 'user' if i % 2 == 0 else 'system'
                    keywords = ', '.join(turn['keywords'])
                    utt = turn['utterance']
                    # input: keywords, then the dialog history, then a prompt for the current speaker
                    input_seq = '\n'.join([f"{t['speaker']}: {t['utt']}" for t in context] + [f'{speaker}: '])
                    input_seq = f'{keywords}\n{input_seq}'
                    context.append({'speaker': speaker, 'utt': utt})
                    fout.write(json.dumps({'keywords+context': input_seq, 'response': utt}, ensure_ascii=False) + '\n')


if __name__ == '__main__':
    from argparse import ArgumentParser
    parser = ArgumentParser(description='generate keywords-to-response (key2gen) training data')
    parser.add_argument('--input_dir', '-i', type=str, help='path to the input files')
    parser.add_argument('--output_dir', '-o', type=str, help='path to the output files')
    args = parser.parse_args()
    print(args)
    main(args)
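For reference, a minimal sketch of what this script emits: one JSON object per line, pairing the keywords-plus-context input with the gold response. The two-turn dialog below is invented; the field names and formatting match the script.

import json

# invented keyword-annotated dialog in the format the script expects:
# a list of turns with 'utterance' and 'keywords', user and system alternating
dial = [
    {'utterance': 'i need a cheap hotel in the north', 'keywords': ['cheap hotel', 'north']},
    {'utterance': 'the lovell lodge is a cheap hotel in the north area', 'keywords': ['lovell lodge']},
]

context = []
for i, turn in enumerate(dial):
    speaker = 'user' if i % 2 == 0 else 'system'
    keywords = ', '.join(turn['keywords'])
    utt = turn['utterance']
    # keywords first, then the dialog history, then a prompt for the current speaker
    input_seq = '\n'.join([f"{t['speaker']}: {t['utt']}" for t in context] + [f'{speaker}: '])
    input_seq = f'{keywords}\n{input_seq}'
    context.append({'speaker': speaker, 'utt': utt})
    print(json.dumps({'keywords+context': input_seq, 'response': utt}, ensure_ascii=False))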
dataset_name="sgd+metalwoz+tm1+tm2+tm3"
names=$(echo ${dataset_name} | tr "+" "\n")
model_type="gpt"
data_dir=data/key2gen/${model_type}/${name}/${dataset_name}
mkdir -p ${data_dir}
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
for name in ${names}
do
echo "preprocessing ${name}"
python gen_pretraining_data.py -i data/lm/${name}/${model_type} -o data/keygen/${model_type}/${name}
if [ "${name}" != "${dataset_name}" ]; then
cat "data/keygen/gpt/${name}/train.json" >> ${train_file}
cat "data/keygen/gpt/${name}/validation.json" >> ${validation_file}
cat "data/keygen/gpt/${name}/test.json" >> ${test_file}
fi
done
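The cat loop above is plain line-wise concatenation of per-dataset JSONL shards; a rough Python equivalent, assuming the data/key2gen/gpt/{dataset}/{split}.json layout used in this script:

import os

datasets = ['metalwoz', 'sgd', 'tm1', 'tm2', 'tm3']
merged_dir = 'data/key2gen/gpt/metalwoz+sgd+tm1+tm2+tm3'
os.makedirs(merged_dir, exist_ok=True)
for split in ['train', 'validation', 'test']:
    # mirror `cat shard >> merged`: append every shard's lines in dataset order
    with open(os.path.join(merged_dir, f'{split}.json'), 'w', encoding='utf-8') as fout:
        for name in datasets:
            with open(os.path.join('data/key2gen/gpt', name, f'{split}.json'), encoding='utf-8') as fin:
                fout.writelines(fin)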
task_name="lm"
dataset_name=$1
model_type="gpt"
data_dir="data/${task_name}/${dataset_name}/${model_type}"
model_name_or_path="gpt2-large"
keywords_num=100
keywords_ratio=0.3
keywords_th_ratio=0
stopwords=True
for data_split in validation test train
do
    word_loss_file="${data_dir}/${model_name_or_path}_${dataset_name}_${data_split}_word_loss.json"
    output_file="${data_dir}/${dataset_name}_${data_split}_keywords_${model_name_or_path}_topk_${keywords_num}_ratio_${keywords_ratio}_th_${keywords_th_ratio}_stopwords_${stopwords}.json"
    python lmloss2keywords.py \
        --model_type ${model_type} \
        --word_loss_file ${word_loss_file} \
        --keywords_num ${keywords_num} \
        --keywords_ratio ${keywords_ratio} \
        --keywords_th_ratio ${keywords_th_ratio} \
        --stopwords ${stopwords} \
        --output_file ${output_file}
done
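With dataset_name=multiwoz21 and the defaults above, the per-split output files expand to names like the one below; note that gen_pretraining_data.py relies on this convention when it takes filename.split('_')[1] as the data split. A quick sketch of the expansion:

# illustrative expansion of the output_file pattern for dataset_name="multiwoz21"
model_name_or_path, dataset_name = 'gpt2-large', 'multiwoz21'
keywords_num, keywords_ratio, keywords_th_ratio, stopwords = 100, 0.3, 0, True
for data_split in ['validation', 'test', 'train']:
    print(f"data/lm/{dataset_name}/gpt/{dataset_name}_{data_split}_keywords_"
          f"{model_name_or_path}_topk_{keywords_num}_ratio_{keywords_ratio}_"
          f"th_{keywords_th_ratio}_stopwords_{stopwords}.json")
# e.g. data/lm/multiwoz21/gpt/multiwoz21_validation_keywords_gpt2-large_topk_100_ratio_0.3_th_0_stopwords_True.json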
set -e
n_gpus=1
task_name="lm"
dataset_name=$1
model_type="gpt"
data_dir="data/${task_name}/${dataset_name}/${model_type}"
cache_dir="../cache"
source_column="dialogue"
max_length=512
model_name_or_path="gpt2-large"
per_device_eval_batch_size=16

python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type ${model_type}

# dump the per-token LM loss of gpt2-large on every data split, then merge
# the token losses into word losses
for data_split in validation test train
do
    validation_file="${data_dir}/${data_split}.json"
    dump_eval_loss_to="${data_dir}/${model_name_or_path}_${dataset_name}_${data_split}_token_loss.json"
    python ../run_clm.py \
        --dump_eval_loss_to ${dump_eval_loss_to} \
        --model_name_or_path ${model_name_or_path} \
        --output_dir ${data_dir} \
        --validation_file ${validation_file} \
        --source_column ${source_column} \
        --max_length ${max_length} \
        --do_eval \
        --prediction_loss_only \
        --cache_dir ${cache_dir} \
        --preprocessing_num_workers 4 \
        --per_device_eval_batch_size ${per_device_eval_batch_size}
    python lmloss2keywords.py --token_loss_file ${dump_eval_loss_to} --model_type ${model_type}
done
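lmloss2keywords.py (diffed below) turns these token-loss dumps into per-word losses and then into keywords. Judging from how main() consumes the file, each entry holds parallel 'words' and 'losses' lists, with '<|endoftext|>' marking turn boundaries; the shape below is an assumed, invented illustration:

import json

# assumed (illustrative) shape of one entry in the word-loss file
word_loss_list = [{
    'words':  ['i', 'need', 'a', 'cheap', 'hotel', '<|endoftext|>'],
    'losses': [1.2, 2.9, 0.4, 5.1, 4.3, 0.1],
}]
print(json.dumps(word_loss_list, indent=2))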
@@ -80,8 +80,36 @@ def main(args):
    stop_words = set(stopwords.words('english'))
    if args.keywords_th_ratio > 0:
        # loss threshold: the loss value at the top keywords_th_ratio percentile
        # of all non-stopword word losses
        losses = [loss for x in word_loss_list for word, loss in zip(x['words'], x['losses'])
                  if not any([w.lower() in stop_words for w in word_tokenize(word)])]
        loss_th = sorted(losses, reverse=True)[round(args.keywords_th_ratio * len(losses))]
        print(f'loss th for top {args.keywords_th_ratio * 100}%: {loss_th}')
    else:
        loss_th = 0
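    # Concretely (toy numbers): with keywords_th_ratio = 0.3 and word losses
    # [5.1, 4.3, 2.9, 2.2, 1.8, 1.2, 0.9, 0.6, 0.4, 0.1], the index is
    # round(0.3 * 10) = 3, so loss_th = 2.2 and only the words with losses
    # 5.1, 4.3 and 2.9 survive the `> loss_th` test in keywords_filter below.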
    def keywords_filter(word_loss_pairs):
        candidate_indexes = []
        for i, word_loss_pair in enumerate(word_loss_pairs):
            # skip stopwords and words whose loss does not exceed the threshold
            if args.stopwords and any([w.lower() in stop_words for w in word_tokenize(word_loss_pair[0])]):
                continue
            if word_loss_pair[1] <= loss_th:
                continue
            candidate_indexes.append(i)
        # keep at most keywords_num words, and at most keywords_ratio of the utterance
        topk = min(round(args.keywords_ratio * len(word_loss_pairs)), args.keywords_num)
        topk_indexes = sorted(candidate_indexes, key=lambda x: word_loss_pairs[x][1], reverse=True)[:topk]
        topk_indexes = sorted(topk_indexes)
        keywords = []
        for i, index in enumerate(topk_indexes):
            # words adjacent in the utterance are merged into a single phrase
            if i > 0 and index == topk_indexes[i - 1] + 1:
                keywords[-1] += ' ' + word_loss_pairs[index][0]
            else:
                keywords.append(word_loss_pairs[index][0])
        return keywords
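    # Toy example (invented losses): given word_loss_pairs
    # [('i', 0.4), ('need', 2.9), ('a', 0.3), ('cheap', 5.1), ('hotel', 4.3), ('tonight', 2.2)]
    # with keywords_ratio=0.3, keywords_num=100 and loss_th=0, the stopwords 'i' and 'a'
    # are skipped, topk = min(round(0.3 * 6), 100) = 2 keeps 'cheap' (5.1) and 'hotel' (4.3),
    # and since they are adjacent in the utterance they merge into ['cheap hotel'].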
    dialogs = []
    for item in tqdm(word_loss_list):
        words = item['words']
        losses = item['losses']
        turns = []
@@ -90,11 +118,9 @@ def main(args):
            if word == '<|endoftext|>':
                # end of turn: assemble the utterance and pick its keywords
                turn['utterance'] = ' '.join(turn['words'])
                turn['keywords'] = keywords_filter(list(zip(turn['words'], turn['losses'])))
                turn.pop('words')
                turn.pop('losses')
                turns.append(turn)
@@ -116,7 +142,7 @@ if __name__ == '__main__':
    parser.add_argument('--output_file', '-o', type=str, help='path to the output file')
    parser.add_argument('--keywords_num', '-n', type=int, default=100, help='maximum number of keywords per utterance')
    parser.add_argument('--keywords_ratio', '-r', type=float, default=1.0, help='maximum fraction of words in an utterance kept as keywords')
    parser.add_argument('--keywords_th_ratio', '-th', type=float, default=0., help='keep only words whose loss lies in this top ratio of all word losses')
    parser.add_argument('--stopwords', '-s', type=lambda x: bool(eval(x)), default=True, help='filter out stopwords')
    args = parser.parse_args()
    print(args)
set -e
n_gpus=1
task_name="key2gen"
dataset_name="multiwoz21"
speaker="all"
model_type="gpt"
data_dir="data/${task_name}/${model_type}/${dataset_name}"
output_dir="output/${task_name}/${model_type}/${dataset_name}"
cache_dir="../../t5/cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
source_column="keywords+context"
target_column="response"
truncation_side="left"
max_source_length=512
max_target_length=128
model_name_or_path="output/key2gen/gpt/metalwoz+sgd+tm1+tm2+tm3"
per_device_train_batch_size=128
per_device_eval_batch_size=128
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=1
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../../t5/run_seq2seq.py \
--task_name ${task_name} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_predict \
--predict_with_generate \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_eval_batch_size ${per_device_eval_batch_size}
@@ -19,7 +19,7 @@ gradient_accumulation_steps=4
lr=5e-5
num_train_epochs=3

python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type ${model_type}

python ../run_clm.py \
    --model_name_or_path ${model_name_or_path} \
set -e
n_gpus=1
task_name="lm"
dataset_name="multiwoz21"
model_type="gpt"
data_dir="data/${task_name}/${dataset_name}/${model_type}"
output_dir="output/${task_name}/${dataset_name}/${model_type}"
cache_dir="../cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
source_column="dialogue"
max_length=512
model_name_or_path="gpt2-large"
per_device_train_batch_size=16
per_device_eval_batch_size=16
gradient_accumulation_steps=4
lr=5e-5
num_train_epochs=3
python ../create_data.py --tasks ${task_name} --datasets ${dataset_name} --model_type ${model_type}
python ../run_clm.py \
--model_name_or_path ${model_name_or_path} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--source_column ${source_column} \
--max_length ${max_length} \
--do_train \
--do_eval \
--save_strategy epoch \
--evaluation_strategy epoch \
--load_best_model_at_end \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--gradient_checkpointing
set -e
n_gpus=2
task_name="key2gen"
dataset_name="metalwoz+sgd+tm1+tm2+tm3"
speaker="all"
model_type="gpt"
data_dir="data/${task_name}/${model_type}/${dataset_name}"
output_dir="output/${task_name}/${model_type}/${dataset_name}"
cache_dir="../../t5/cache"
logging_dir="${output_dir}/runs"
train_file="${data_dir}/train.json"
validation_file="${data_dir}/validation.json"
test_file="${data_dir}/test.json"
source_column="keywords+context"
target_column="response"
truncation_side="left"
max_source_length=512
max_target_length=128
model_name_or_path="t5-small"
per_device_train_batch_size=128
per_device_eval_batch_size=128
gradient_accumulation_steps=4
lr=1e-3
num_train_epochs=1
python -m torch.distributed.launch \
--nproc_per_node ${n_gpus} ../../t5/run_seq2seq.py \
--task_name ${task_name} \
--train_file ${train_file} \
--validation_file ${validation_file} \
--test_file ${test_file} \
--source_column ${source_column} \
--target_column ${target_column} \
--max_source_length ${max_source_length} \
--max_target_length ${max_target_length} \
--truncation_side ${truncation_side} \
--model_name_or_path ${model_name_or_path} \
--do_train \
--do_eval \
--do_predict \
--save_strategy epoch \
--evaluation_strategy epoch \
--load_best_model_at_end \
--prediction_loss_only \
--cache_dir ${cache_dir} \
--output_dir ${output_dir} \
--logging_dir ${logging_dir} \
--overwrite_output_dir \
--preprocessing_num_workers 4 \
--per_device_train_batch_size ${per_device_train_batch_size} \
--per_device_eval_batch_size ${per_device_eval_batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--learning_rate ${lr} \
--num_train_epochs ${num_train_epochs} \
--debug underflow_overflow \
--adafactor \
--gradient_checkpointing
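For reference, a quick check of the effective batch size implied by the settings above (per-device batch size times GPUs times gradient accumulation steps):

# examples consumed per optimizer update in this launch
per_device_train_batch_size, n_gpus, gradient_accumulation_steps = 128, 2, 4
print(per_device_train_batch_size * n_gpus * gradient_accumulation_steps)  # 1024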