Skip to content
Snippets Groups Projects
Commit 8d2a9be1 authored by Marc Feger's avatar Marc Feger
Browse files

First refactoring :tada:

parent fd496231
Branches
No related tags found
No related merge requests found
from typing import List
import nltk
from code.helper import FileReader, FileWriter
def get_splits(sentences: List) -> List[List]:
    '''
    Split each input text into its sentences.

    Uses NLTK's German ``punkt`` sentence tokenizer; the ``punkt`` model
    must already be downloaded (see the ``nltk.download`` call in
    ``__main__``).

    :param sentences: List of raw text strings (each may contain one or
        more sentences).
    :return: One list per input string, containing its sentence strings.
    '''
    # One tokenizer call per input text; comprehension replaces the
    # manual append loop of the original.
    return [nltk.sent_tokenize(text, language='german') for text in sentences]
if __name__ == '__main__':
    # NOTE: the punkt tokenizer models are stored in the current directory.
    nltk.download('punkt', download_dir='.')
    corpus_lines = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
    sentence_splits = get_splits(corpus_lines)
    FileWriter.write(sentence_splits, './results/nltk_sentences.txt')
...@@ -7,9 +7,8 @@ from code.helper import FileReader, FileWriter ...@@ -7,9 +7,8 @@ from code.helper import FileReader, FileWriter
def get_tokens(sentences: List) -> List[List]: def get_tokens(sentences: List) -> List[List]:
''' '''
This method get the tokens of a sentence. This method get the results of a sentence.
:param model: Which model should be used
:param sentences: List containing the sentences :param sentences: List containing the sentences
:return: List of token-lists :return: List of token-lists
''' '''
...@@ -27,4 +26,4 @@ if __name__ == '__main__': ...@@ -27,4 +26,4 @@ if __name__ == '__main__':
nltk.download('punkt', download_dir='.') nltk.download('punkt', download_dir='.')
file_sentences = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt') file_sentences = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
file_tokens = get_tokens(file_sentences) file_tokens = get_tokens(file_sentences)
FileWriter.write(file_tokens, './tokens/nltk_tokens.txt') FileWriter.write(file_tokens, './results/nltk_tokens.txt')
This diff is collapsed.
File moved
File moved
File moved
...@@ -7,7 +7,7 @@ from code.helper import FileReader, FileWriter ...@@ -7,7 +7,7 @@ from code.helper import FileReader, FileWriter
def get_splits(model, sentences: List) -> List[List]: def get_splits(model, sentences: List) -> List[List]:
''' '''
This method get the tokens of a sentence. This method get the results of a sentence.
:param model: Which model should be used :param model: Which model should be used
:param sentences: List containing the sentences :param sentences: List containing the sentences
...@@ -28,4 +28,4 @@ if __name__ == '__main__': ...@@ -28,4 +28,4 @@ if __name__ == '__main__':
nlp = spacy.load('de_core_news_sm') nlp = spacy.load('de_core_news_sm')
file_sentences = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt') file_sentences = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
file_splits = get_splits(nlp, file_sentences) file_splits = get_splits(nlp, file_sentences)
FileWriter.write(file_splits, './tokens/spacy_sentences.txt') FileWriter.write(file_splits, './results/spacy_sentences.txt')
...@@ -7,7 +7,7 @@ from code.helper import FileWriter, FileReader ...@@ -7,7 +7,7 @@ from code.helper import FileWriter, FileReader
def get_tokens(model, sentences: List) -> List[List]: def get_tokens(model, sentences: List) -> List[List]:
''' '''
This method get the tokens of a sentence. This method get the results of a sentence.
:param model: Which model should be used :param model: Which model should be used
:param sentences: List containing the sentences :param sentences: List containing the sentences
...@@ -28,4 +28,4 @@ if __name__ == '__main__': ...@@ -28,4 +28,4 @@ if __name__ == '__main__':
nlp = spacy.load('de_core_news_sm') nlp = spacy.load('de_core_news_sm')
file_sentences = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt') file_sentences = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
file_tokens = get_tokens(nlp, file_sentences) file_tokens = get_tokens(nlp, file_sentences)
FileWriter.write(file_tokens, './tokens/spacy_tokens.txt') FileWriter.write(file_tokens, './results/spacy_tokens.txt')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment