Skip to content
Snippets Groups Projects
Commit a18e73a0 authored by Marc Feger's avatar Marc Feger
Browse files

Add coreNLP sentence-splitter and improve tokens :tada:

parent 8d2a9be1
No related branches found
No related tags found
No related merge requests found
from typing import List
from pycorenlp import StanfordCoreNLP
from code.helper import FileReader, FileWriter
def get_splits(sentences: List) -> List[List]:
    """
    Split each input text into its component sentences using CoreNLP's
    ``ssplit`` annotator, reconstructing every sentence from its tokens.

    :param sentences: List of raw text strings to split.
    :return: One list per input text, each containing the reconstructed
             sentence strings.
    """
    # NOTE(review): assumes a CoreNLP server is running locally — confirm.
    nlp = StanfordCoreNLP('http://localhost:9000')
    properties = {
        'annotators': 'ssplit',
        'outputFormat': 'json'
    }
    splitted_sentences = []
    for sentence in sentences:
        result = nlp.annotate(text=sentence, properties=properties)
        splits = []
        for res in result.get('sentences'):
            tokens = res.get('tokens')
            # Guard: an empty token list would crash on tokens[-1] below.
            if not tokens:
                continue
            text = ""
            # Re-join word + trailing whitespace for all but the last token...
            for token in tokens[:-1]:
                text += token.get('word') + token.get('after')
            # ...then append the final word without its trailing whitespace.
            # (Reuse the already-fetched list instead of res.get('tokens') again.)
            text += tokens[-1].get('word')
            splits.append(text)
        splitted_sentences.append(splits)
    return splitted_sentences
if __name__ == '__main__':
    # Note: the tokenizers are stored in the current directory.
    corpus = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
    FileWriter.write(get_splits(corpus), './results/corenlp_sentences.txt')
from typing import List
from pycorenlp import StanfordCoreNLP
# NOTE(review): importing the file helpers only when run as a script —
# presumably to keep the package-relative import off the module-import path;
# confirm this guard is intentional.
if __name__ == '__main__':
from code.helper import FileReader, FileWriter
def get_tokens(sentences: List) -> List[List]:
    """
    Tokenize each input text using CoreNLP's ``tokenize`` annotator.

    :param sentences: List of raw text strings to tokenize.
    :return: One list of token strings per input text.
    """
    # NOTE(review): assumes a CoreNLP server is running locally — confirm.
    nlp = StanfordCoreNLP('http://localhost:9000')
    # Bugfix: the original body contained leftover experimental code (a
    # hard-coded sample text, a second annotate() call with a dict nested
    # inside its properties, a tokensregex() call, and a stray '}') that
    # made the function a syntax error. Only the working tokenize loop
    # is kept.
    properties = {
        'annotators': 'tokenize',
        'outputFormat': 'json'
    }
    tokens = []
    for sentence in sentences:
        result = nlp.annotate(text=sentence, properties=properties)
        current_tokens = [token.get('word') for token in result.get('tokens')]
        tokens.append(current_tokens)
    return tokens
if __name__ == '__main__':
    # Tokenize the project corpus and persist the token lists.
    corpus = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
    FileWriter.write(get_tokens(corpus), './results/corenlp_tokens.txt')
This diff is collapsed.
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment