Skip to content
Snippets Groups Projects
Commit 8d2a9be1 authored by Marc Feger's avatar Marc Feger
Browse files

First refactoring :tada:

parent fd496231
Branches
No related tags found
No related merge requests found
from typing import List
import nltk
from code.helper import FileReader, FileWriter
def get_splits(sentences: List) -> List[List]:
    '''
    Split each input text into its sentences.

    Uses NLTK's pre-trained German Punkt model, so ``nltk.download('punkt')``
    must have been run before calling this function.

    :param sentences: List of texts to be split into sentences
    :return: One list of sentence strings per input text
    '''
    # sent_tokenize returns the list of sentences found in one text.
    return [nltk.sent_tokenize(text, language='german') for text in sentences]
if __name__ == '__main__':
    # NOTE: the Punkt tokenizer models are stored in the current directory.
    nltk.download('punkt', download_dir='.')
    # Read the corpus, split every text into sentences, and persist the result.
    file_sentences = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
    file_splits = get_splits(file_sentences)
    FileWriter.write(file_splits, './results/nltk_sentences.txt')
......@@ -7,9 +7,8 @@ from code.helper import FileReader, FileWriter
def get_tokens(sentences: List) -> List[List]:
'''
This method get the tokens of a sentence.
This method get the results of a sentence.
:param model: Which model should be used
:param sentences: List containing the sentences
:return: List of token-lists
'''
......@@ -27,4 +26,4 @@ if __name__ == '__main__':
nltk.download('punkt', download_dir='.')
file_sentences = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
file_tokens = get_tokens(file_sentences)
FileWriter.write(file_tokens, './tokens/nltk_tokens.txt')
FileWriter.write(file_tokens, './results/nltk_tokens.txt')
This diff is collapsed.
File moved
File moved
File moved
......@@ -7,7 +7,7 @@ from code.helper import FileReader, FileWriter
def get_splits(model, sentences: List) -> List[List]:
'''
This method get the tokens of a sentence.
This method get the results of a sentence.
:param model: Which model should be used
:param sentences: List containing the sentences
......@@ -28,4 +28,4 @@ if __name__ == '__main__':
nlp = spacy.load('de_core_news_sm')
file_sentences = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
file_splits = get_splits(nlp, file_sentences)
FileWriter.write(file_splits, './tokens/spacy_sentences.txt')
FileWriter.write(file_splits, './results/spacy_sentences.txt')
......@@ -7,7 +7,7 @@ from code.helper import FileWriter, FileReader
def get_tokens(model, sentences: List) -> List[List]:
'''
This method get the tokens of a sentence.
This method get the results of a sentence.
:param model: Which model should be used
:param sentences: List containing the sentences
......@@ -28,4 +28,4 @@ if __name__ == '__main__':
nlp = spacy.load('de_core_news_sm')
file_sentences = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
file_tokens = get_tokens(nlp, file_sentences)
FileWriter.write(file_tokens, './tokens/spacy_tokens.txt')
FileWriter.write(file_tokens, './results/spacy_tokens.txt')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment