Skip to content
Snippets Groups Projects
Commit 3e185b76 authored by Benjamin Ruppik's avatar Benjamin Ruppik
Browse files

Updated sbert embeddings script; added .pkl file with neighborhoods

parent bc605929
No related branches found
No related tags found
No related merge requests found
Source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
@@ -69,7 +69,11 @@ LOAD_EMBEDDINGS = False
 data_desc_list = ['paraphrase-MiniLM-L6-v2']
 DEBUG_DATA_DESC = 'paraphrase-MiniLM-L6-v2'
-VOCAB_DESC = 'pretrained_cc_en'
+VOCAB_PATH = embeddings_config['data']['multiwoz_and_sgd_joint_vocabulary_path']
+VOCAB_DESC = 'multiwoz_and_sgd'
+# VOCAB_PATH = embeddings_config['data']['pretrained_cc_en_vocabulary_path']
+# VOCAB_DESC = 'pretrained_cc_en'

 # Set up logging
 logger = logging.getLogger()
@@ -109,10 +113,8 @@ models['paraphrase-MiniLM-L6-v2'] = SentenceTransformer('paraphrase-MiniLM-L6-v2')
 logging.info('Loading data ...')
-with open(embeddings_config['data']['multiwoz_and_sgd_joint_vocabulary_path'], "r") as file:
+with open(VOCAB_PATH, "r") as file:
     vocabulary = json.load(file)
-# with open(embeddings_config['data']['pretrained_cc_en_vocabulary_path'], "r") as file:
-#     vocabulary = json.load(file)

 # Testing tokenization of certain sentences
 # sentences = \
...
 data:
-  data_folder_path: '../../data'
+  data_folder_path: '../data'
-  multiwoz_and_sgd_joint_vocabulary_path: '../multiwoz_and_sgd_joint_vocabulary.json'
+  multiwoz_and_sgd_joint_vocabulary_path: '../data/multiwoz_and_sgd_joint_vocabulary.json'
-  pretrained_cc_en_vocabulary_path: '../pretrained_cc_en_vocabulary.json'
+  pretrained_cc_en_vocabulary_path: '../data/pretrained_cc_en_vocabulary.json'
 embeddings:
-  embeddings_dict_path: '../../embeddings'
+  embeddings_dict_path: '../data'
-  embeddings_dataframes_path: '../../embeddings'
+  embeddings_dataframes_path: '../data'
   context: 'word'
   pooling_method: 'mean'
   special_tokens: 'ignore'
 neighborhoods:
   nbhd_size: 50
   nbhd_remove: 0
-  neighborhoods_path: '../../neighborhoods'
+  neighborhoods_path: '../data/neighborhoods'
-  persistence_features_path: '../../persistence_features/sbert_persistence'
+  persistence_features_path: '../data'
   normalize: False
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment