From 3e185b768a6cb714ebddd515520e4e9b8d1f9ab6 Mon Sep 17 00:00:00 2001 From: Benjamin Ruppik <benjamin.ruppik@hhu.de> Date: Tue, 14 Jun 2022 16:37:32 +0200 Subject: [PATCH] Updated sbert embeddings script; added .pkl file with neighborhoods --- ...0000_sbert_size_50_remove_0_normalize_False.pkl | 3 +++ tda/sbert_create_static_embeddings.py | 10 ++++++---- tda/sbert_static_embeddings_config_50_0.yaml | 14 +++++++------- 3 files changed, 16 insertions(+), 11 deletions(-) create mode 100644 data/neighborhoods/neighborhoods_ambient_static_50000_sbert_size_50_remove_0_normalize_False.pkl diff --git a/data/neighborhoods/neighborhoods_ambient_static_50000_sbert_size_50_remove_0_normalize_False.pkl b/data/neighborhoods/neighborhoods_ambient_static_50000_sbert_size_50_remove_0_normalize_False.pkl new file mode 100644 index 0000000..7073b95 --- /dev/null +++ b/data/neighborhoods/neighborhoods_ambient_static_50000_sbert_size_50_remove_0_normalize_False.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9537e717977c4f26569228852441deec160d9156084635453825650e3d49c90b +size 2591340055 diff --git a/tda/sbert_create_static_embeddings.py b/tda/sbert_create_static_embeddings.py index 9e22d0a..2cb2873 100644 --- a/tda/sbert_create_static_embeddings.py +++ b/tda/sbert_create_static_embeddings.py @@ -69,7 +69,11 @@ LOAD_EMBEDDINGS = False data_desc_list = ['paraphrase-MiniLM-L6-v2'] DEBUG_DATA_DESC = 'paraphrase-MiniLM-L6-v2' -VOCAB_DESC = 'pretrained_cc_en' + +VOCAB_PATH = embeddings_config['data']['multiwoz_and_sgd_joint_vocabulary_path'] +VOCAB_DESC = 'multiwoz_and_sgd' +# VOCAB_PATH = embeddings_config['data']['pretrained_cc_en_vocabulary_path'] +# VOCAB_DESC = 'pretrained_cc_en' # Set up logging logger = logging.getLogger() @@ -109,10 +113,8 @@ models['paraphrase-MiniLM-L6-v2'] = SentenceTransformer('paraphrase-MiniLM-L6-v2 logging.info('Loading data ...') -with open(embeddings_config['data']['multiwoz_and_sgd_joint_vocabulary_path'], "r") as file: +with open(VOCAB_PATH, "r") as file: vocabulary = json.load(file) -# with open(embeddings_config['data']['pretrained_cc_en_vocabulary_path'], "r") as file: -# vocabulary = json.load(file) # Testing tokenization of certain sentences # sentences = \ diff --git a/tda/sbert_static_embeddings_config_50_0.yaml b/tda/sbert_static_embeddings_config_50_0.yaml index 4eceb42..ba39656 100644 --- a/tda/sbert_static_embeddings_config_50_0.yaml +++ b/tda/sbert_static_embeddings_config_50_0.yaml @@ -1,16 +1,16 @@ data: - data_folder_path: '../../data' - multiwoz_and_sgd_joint_vocabulary_path: '../multiwoz_and_sgd_joint_vocabulary.json' - pretrained_cc_en_vocabulary_path: '../pretrained_cc_en_vocabulary.json' + data_folder_path: '../data' + multiwoz_and_sgd_joint_vocabulary_path: '../data/multiwoz_and_sgd_joint_vocabulary.json' + pretrained_cc_en_vocabulary_path: '../data/pretrained_cc_en_vocabulary.json' embeddings: - embeddings_dict_path: '../../embeddings' - embeddings_dataframes_path: '../../embeddings' + embeddings_dict_path: '../data' + embeddings_dataframes_path: '../data' context: 'word' pooling_method: 'mean' special_tokens: 'ignore' neighborhoods: nbhd_size: 50 nbhd_remove: 0 - neighborhoods_path: '../../neighborhoods' - persistence_features_path: '../../persistence_features/sbert_persistence' + neighborhoods_path: '../data/neighborhoods' + persistence_features_path: '../data' normalize: False -- GitLab