tdacontextual
Overview
This public repository contains the code for our paper “Local Topology Measures of Contextual Language Model Latent Spaces with Applications to Dialogue Term Extraction”, published at the 25th Annual Meeting of the Special Interest Group on Discourse and Dialogue, Kyoto, Japan (SIGDIAL 2024).
In this project, we demonstrate that contextual topological features derived from neighborhoods in a language model embedding space can improve performance on a dialogue term extraction task over a baseline that uses only the language model embedding vectors as input.
We also compare with our earlier work, "Dialogue Term Extraction using Transfer Learning and Topological Data Analysis", which is based on topological features derived from static word embeddings.
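As a rough illustration of the idea, here is a minimal sketch: it retrieves the nearest neighbors of a query embedding from a datastore with faiss (which this repository uses for neighborhood computation) and vectorizes the H0 persistence diagram of that neighborhood as a persistence image. The ripser and persim packages are stand-ins chosen for the sketch, and the random data and all parameter values are illustrative; this is not the repository's actual feature pipeline.
import numpy as np
import faiss  # nearest-neighbor search over the datastore
from ripser import ripser  # persistent homology of a point cloud
from persim import PersistenceImager  # persistence-image vectorization

rng = np.random.default_rng(0)
d = 768  # embedding dimension (e.g. a BERT-sized model)

# Datastore: contextual embedding vectors collected from a corpus.
datastore = rng.standard_normal((10_000, d)).astype("float32")
index = faiss.IndexFlatL2(d)
index.add(datastore)

# Query: the embedding of a single token occurrence.
query = rng.standard_normal((1, d)).astype("float32")
_, neighbor_ids = index.search(query, 50)
neighborhood = datastore[neighbor_ids[0]]

# Persistent homology of the local point cloud; H0 tracks how the
# neighborhood merges into connected components across scales.
dgms = ripser(neighborhood, maxdim=0)["dgms"]
h0 = dgms[0][np.isfinite(dgms[0]).all(axis=1)]  # drop the infinite bar

# Vectorize the H0 diagram as a persistence image; ranges are set
# explicitly here only to keep the toy example deterministic.
pimgr = PersistenceImager(pixel_size=0.5, birth_range=(0.0, 0.5),
                          pers_range=(0.0, 50.0))
features = pimgr.transform(h0).flatten()
print(features.shape)
The resulting vector can then be concatenated with the raw embedding as classifier input, in the spirit of the lm_c_pis_h0 feature type used in the pipeline commands below.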
Installation
Prerequisites
- Prepare the environment. The required Python version is 3.10.
  On macOS, you can install pyenv with Homebrew: brew install pyenv.
  You can install poetry with pipx, for example: pipx install poetry.
- Install the Python version with pyenv and set the local Python version for the project:
pyenv install 3.10
pyenv local 3.10
Installation instructions with poetry
- Tell poetry to use the local Python version.
# Optional:
# Tell poetry to create a virtual environment for the project
# inside the project directory.
poetry config virtualenvs.in-project true
poetry env use 3.10
You can manage the poetry environments with the following commands:
poetry env list --full-path # List all the environments
poetry env remove <path> # Remove an environment
poetry env remove --all # Remove all the environments
- Install the project with dependencies. Select the appropriate dependency groups for your system.
poetry install --with gpu,dev --without cpu # For GPU
poetry install --with cpu,dev --without gpu # For CPU
NOTE:
You might need to install faiss manually using conda, since pip installs can lead to problems.
Please see the official documentation for more information: Faiss Documentation.
In that case, you can try installing into a fresh conda environment with the following commands, which create the environment, install faiss via conda, and then install tdacontextual (this module) with poetry.
The resulting environment can be used for the neighborhood computation scripts.
conda create -n "python3.10_for_tdacontextual" python=3.10
conda activate "python3.10_for_tdacontextual"
conda install -c pytorch faiss-cpu
cd $TDACONTEXTUAL_REPOSITORY_BASE_PATH # Change to the repository root directory
poetry install --with dev --without cpu,gpu
NOTE:
Some of the data preparation scripts require the convlab package.
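If convlab is missing from your environment, one way to add it (a suggestion on our part, not a pinned requirement of this repository; check pyproject.toml for the exact dependency specification) is to install it into the active poetry environment via pip:
poetry run pip install convlab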
If you run the Python scripts from the command line, make sure to activate the poetry environment:
poetry shell
Project-specific setup
- Set the correct environment variables used in the project config.
  Edit the script tdacontextual/scripts/setup_environment.sh with the correct paths and run it once.
./tdacontextual/scripts/setup_environment.sh
- If required, e.g. when running jobs on the HHU Hilbert cluster, set the correct environment variables in the .env file in the project root directory (see the sketch below).
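As a purely illustrative sketch, a minimal .env might look like the following; TDACONTEXTUAL_REPOSITORY_BASE_PATH appears in the commands above, while the value and any additional variables depend on your setup:
# .env -- illustrative values, adapt to your setup
TDACONTEXTUAL_REPOSITORY_BASE_PATH=/path/to/tdacontextual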
Usage
The main entry points for the project are specified as poetry run commands; see the pyproject.toml file for more information.
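For orientation, poetry declares such entry points in a [tool.poetry.scripts] table. The following is a sketch in which the module path is hypothetical; consult pyproject.toml for the actual mapping:
[tool.poetry.scripts]
# Hypothetical module path; see pyproject.toml for the real entry point.
full_training_prediction_evaluation_pipeline_for_single_setup = "tdacontextual.pipeline:main"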
To create the data for the term extraction task, follow the instructions given in the tdacontextual/term_extraction directory.
The following is an example call to run the full term extraction training-prediction-evaluation pipeline for a single setup with default parameters.
Note that for this to work, you need to have the correctly prepared data in the data/ directory.
For testing the pipeline in --toy_dataset_mode, we provide preprocessed debug data, which needs to be placed into the repository's data/ directory.
Run one of the following commands to start the pipeline on the toy data, with different feature types:
# Run with input just the language model embeddings
poetry run full_training_prediction_evaluation_pipeline_for_single_setup --feature_type="lm" --toy_dataset_mode
# Run with input the language model embeddings and the contextual topological features
poetry run full_training_prediction_evaluation_pipeline_for_single_setup --feature_type="lm_c_pis_h0" --toy_dataset_mode
Run the following command to start the pipeline on the full data:
poetry run full_training_prediction_evaluation_pipeline_for_single_setup --feature_type="lm"
The resulting artifacts (training and evaluation metrics, predictions, logs, etc.) will be saved in the data/experiments directory.
Running tests
Run the following command from the repository root:
poetry run python3 -m pytest -m "not slow" tests/ --cov=tdacontextual/ --cov-report=html:tests/temp_files/coverage_report
We also provide a script to run the tests; execute the following command from the repository root:
./tdacontextual/scripts/run_tests_with_coverage.sh
Citation
doi:10.18653/v1/2024.sigdial-1.31
@inproceedings{ruppik-etal-2024-local,
title = "Local Topology Measures of Contextual Language Model Latent Spaces with Applications to Dialogue Term Extraction",
author = "Ruppik, Benjamin Matthias and
Heck, Michael and
van Niekerk, Carel and
Vukovic, Renato and
Lin, Hsien-chin and
Feng, Shutong and
Zibrowius, Marcus and
Gasic, Milica",
editor = "Kawahara, Tatsuya and
Demberg, Vera and
Ultes, Stefan and
Inoue, Koji and
Mehri, Shikib and
Howcroft, David and
Komatani, Kazunori",
booktitle = "Proceedings of the 25th Annual Meeting of the Special Interest Group on Discourse and Dialogue",
month = sep,
year = "2024",
address = "Kyoto, Japan",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.sigdial-1.31",
doi = "10.18653/v1/2024.sigdial-1.31",
pages = "344--356",
abstract = "A common approach for sequence tagging tasks based on contextual word representations is to train a machine learning classifier directly on these embedding vectors. This approach has two shortcomings. First, such methods consider single input sequences in isolation and are unable to put an individual embedding vector in relation to vectors outside the current local context of use. Second, the high performance of these models relies on fine-tuning the embedding model in conjunction with the classifier, which may not always be feasible due to the size or inaccessibility of the underlying feature-generation model. It is thus desirable, given a collection of embedding vectors of a corpus, i.e. a datastore, to find features of each vector that describe its relation to other, similar vectors in the datastore. With this in mind, we introduce complexity measures of the local topology of the latent space of a contextual language model with respect to a given datastore. The effectiveness of our features is demonstrated through their application to dialogue term extraction. Our work continues a line of research that explores the manifold hypothesis for word embeddings, demonstrating that local structure in the space carved out by word embeddings can be exploited to infer semantic properties.",
}