from typing import List

import spacy

from code.helper import FileWriter, FileReader


def get_tokens(model, sentences: List[str]) -> List[List[str]]:
    """
    Tokenize each sentence with the given spaCy language model.

    :param model: A callable spaCy language model (e.g. the result of
        ``spacy.load(...)``); applying it to a string yields a ``Doc``
        whose tokens expose a ``.text`` attribute.
    :param sentences: List of sentences to tokenize.
    :return: One list of token strings per input sentence, in order.
        An empty input list yields an empty result.
    """
    # One inner list per sentence; Doc objects iterate over their tokens.
    return [[token.text for token in model(sentence)] for sentence in sentences]


if __name__ == '__main__':
    # Don't forget to install the German model with:
    #   $ python3 -m spacy download de_core_news_sm
    nlp = spacy.load('de_core_news_sm')

    # Read the corpus, tokenize every sentence, and persist the token lists.
    file_sentences = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
    file_tokens = get_tokens(nlp, file_sentences)
    FileWriter.write(file_tokens, './tokens/spacy_tokens.txt')