From 3e185b768a6cb714ebddd515520e4e9b8d1f9ab6 Mon Sep 17 00:00:00 2001
From: Benjamin Ruppik <benjamin.ruppik@hhu.de>
Date: Tue, 14 Jun 2022 16:37:32 +0200
Subject: [PATCH] Updated sbert embeddings script; added .pkl file with
 neighborhoods

---
 ...0000_sbert_size_50_remove_0_normalize_False.pkl |  3 +++
 tda/sbert_create_static_embeddings.py              | 10 ++++++----
 tda/sbert_static_embeddings_config_50_0.yaml       | 14 +++++++-------
 3 files changed, 16 insertions(+), 11 deletions(-)
 create mode 100644 data/neighborhoods/neighborhoods_ambient_static_50000_sbert_size_50_remove_0_normalize_False.pkl

diff --git a/data/neighborhoods/neighborhoods_ambient_static_50000_sbert_size_50_remove_0_normalize_False.pkl b/data/neighborhoods/neighborhoods_ambient_static_50000_sbert_size_50_remove_0_normalize_False.pkl
new file mode 100644
index 0000000..7073b95
--- /dev/null
+++ b/data/neighborhoods/neighborhoods_ambient_static_50000_sbert_size_50_remove_0_normalize_False.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9537e717977c4f26569228852441deec160d9156084635453825650e3d49c90b
+size 2591340055
diff --git a/tda/sbert_create_static_embeddings.py b/tda/sbert_create_static_embeddings.py
index 9e22d0a..2cb2873 100644
--- a/tda/sbert_create_static_embeddings.py
+++ b/tda/sbert_create_static_embeddings.py
@@ -69,7 +69,11 @@ LOAD_EMBEDDINGS = False
 data_desc_list = ['paraphrase-MiniLM-L6-v2']
 
 DEBUG_DATA_DESC = 'paraphrase-MiniLM-L6-v2'
-VOCAB_DESC = 'pretrained_cc_en'
+
+VOCAB_PATH = embeddings_config['data']['multiwoz_and_sgd_joint_vocabulary_path']
+VOCAB_DESC = 'multiwoz_and_sgd'
+# VOCAB_PATH = embeddings_config['data']['pretrained_cc_en_vocabulary_path']
+# VOCAB_DESC = 'pretrained_cc_en'
 
 # Set up logging
 logger = logging.getLogger()
@@ -109,10 +113,8 @@ models['paraphrase-MiniLM-L6-v2'] = SentenceTransformer('paraphrase-MiniLM-L6-v2
 
 logging.info('Loading data ...')
 
-with open(embeddings_config['data']['multiwoz_and_sgd_joint_vocabulary_path'], "r") as file:
+with open(VOCAB_PATH, "r") as file:
     vocabulary = json.load(file)
-# with open(embeddings_config['data']['pretrained_cc_en_vocabulary_path'], "r") as file:
-#     vocabulary = json.load(file)
 
 # Testing tokenization of certain sentences
 # sentences = \
diff --git a/tda/sbert_static_embeddings_config_50_0.yaml b/tda/sbert_static_embeddings_config_50_0.yaml
index 4eceb42..ba39656 100644
--- a/tda/sbert_static_embeddings_config_50_0.yaml
+++ b/tda/sbert_static_embeddings_config_50_0.yaml
@@ -1,16 +1,16 @@
 data:
-  data_folder_path: '../../data'
-  multiwoz_and_sgd_joint_vocabulary_path: '../multiwoz_and_sgd_joint_vocabulary.json'
-  pretrained_cc_en_vocabulary_path: '../pretrained_cc_en_vocabulary.json'
+  data_folder_path: '../data'
+  multiwoz_and_sgd_joint_vocabulary_path: '../data/multiwoz_and_sgd_joint_vocabulary.json'
+  pretrained_cc_en_vocabulary_path: '../data/pretrained_cc_en_vocabulary.json'
 embeddings:
-  embeddings_dict_path: '../../embeddings'
-  embeddings_dataframes_path: '../../embeddings'
+  embeddings_dict_path: '../data'
+  embeddings_dataframes_path: '../data'
   context: 'word'
   pooling_method: 'mean'
   special_tokens: 'ignore'
 neighborhoods:
   nbhd_size: 50
   nbhd_remove: 0
-  neighborhoods_path: '../../neighborhoods'
-  persistence_features_path: '../../persistence_features/sbert_persistence'
+  neighborhoods_path: '../data/neighborhoods'
+  persistence_features_path: '../data'
   normalize: False
-- 
GitLab