Skip to content
Snippets Groups Projects
Commit 8d2a9be1 authored by Marc Feger's avatar Marc Feger
Browse files

First refactoring :tada:

parent fd496231
Branches
No related tags found
No related merge requests found
from typing import List
import nltk
from code.helper import FileReader, FileWriter
def get_splits(sentences: List) -> List[List]:
    '''
    Split each input text into its sentences.

    Uses NLTK's pre-trained German Punkt model, so ``nltk.download('punkt')``
    must have been run before calling this function.

    :param sentences: List of texts to be split into sentences
    :return: One list of sentence strings per input text
    '''
    # sent_tokenize returns the list of sentences found in one text.
    return [nltk.sent_tokenize(text, language='german') for text in sentences]
if __name__ == '__main__':
    # NOTE: the Punkt tokenizer models are stored in the current directory.
    nltk.download('punkt', download_dir='.')
    # Read the corpus, split every text into sentences, and persist the result.
    file_sentences = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
    file_splits = get_splits(file_sentences)
    FileWriter.write(file_splits, './results/nltk_sentences.txt')
......@@ -7,9 +7,8 @@ from code.helper import FileReader, FileWriter
def get_tokens(sentences: List) -> List[List]:
'''
This method get the tokens of a sentence.
This method get the results of a sentence.
:param model: Which model should be used
:param sentences: List containing the sentences
:return: List of token-lists
'''
......@@ -27,4 +26,4 @@ if __name__ == '__main__':
nltk.download('punkt', download_dir='.')
file_sentences = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
file_tokens = get_tokens(file_sentences)
FileWriter.write(file_tokens, './tokens/nltk_tokens.txt')
FileWriter.write(file_tokens, './results/nltk_tokens.txt')
This diff is collapsed.
File moved
File moved
File moved
......@@ -7,7 +7,7 @@ from code.helper import FileReader, FileWriter
def get_splits(model, sentences: List) -> List[List]:
'''
This method get the tokens of a sentence.
This method get the results of a sentence.
:param model: Which model should be used
:param sentences: List containing the sentences
......@@ -28,4 +28,4 @@ if __name__ == '__main__':
nlp = spacy.load('de_core_news_sm')
file_sentences = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
file_splits = get_splits(nlp, file_sentences)
FileWriter.write(file_splits, './tokens/spacy_sentences.txt')
FileWriter.write(file_splits, './results/spacy_sentences.txt')
......@@ -7,7 +7,7 @@ from code.helper import FileWriter, FileReader
def get_tokens(model, sentences: List) -> List[List]:
'''
This method get the tokens of a sentence.
This method get the results of a sentence.
:param model: Which model should be used
:param sentences: List containing the sentences
......@@ -28,4 +28,4 @@ if __name__ == '__main__':
nlp = spacy.load('de_core_news_sm')
file_sentences = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
file_tokens = get_tokens(nlp, file_sentences)
FileWriter.write(file_tokens, './tokens/spacy_tokens.txt')
FileWriter.write(file_tokens, './results/spacy_tokens.txt')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment