Skip to content
Snippets Groups Projects
Commit a18e73a0 authored by Marc Feger's avatar Marc Feger
Browse files

Add coreNLP sentence-splitter and improve tokens :tada:

parent 8d2a9be1
No related branches found
No related tags found
No related merge requests found
from typing import List
from pycorenlp import StanfordCoreNLP
from code.helper import FileReader, FileWriter
def get_splits(sentences: List) -> List[List]:
    """
    Split each input text into its component sentences using CoreNLP's
    ``ssplit`` annotator, reconstructing every sentence from its tokens.

    :param sentences: List of raw text strings to split.
    :return: One list per input text, each containing the reconstructed
             sentence strings.
    """
    # NOTE(review): assumes a CoreNLP server is running locally — confirm.
    nlp = StanfordCoreNLP('http://localhost:9000')
    properties = {
        'annotators': 'ssplit',
        'outputFormat': 'json'
    }
    splitted_sentences = []
    for sentence in sentences:
        result = nlp.annotate(text=sentence, properties=properties)
        splits = []
        for res in result.get('sentences'):
            tokens = res.get('tokens')
            # Guard: an empty token list would crash on tokens[-1] below.
            if not tokens:
                continue
            text = ""
            # Re-join word + trailing whitespace for all but the last token...
            for token in tokens[:-1]:
                text += token.get('word') + token.get('after')
            # ...then append the final word without its trailing whitespace.
            # (Reuse the already-fetched list instead of res.get('tokens') again.)
            text += tokens[-1].get('word')
            splits.append(text)
        splitted_sentences.append(splits)
    return splitted_sentences
if __name__ == '__main__':
    # Note: the tokenizers are stored in the current directory.
    corpus = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
    FileWriter.write(get_splits(corpus), './results/corenlp_sentences.txt')
from typing import List
from pycorenlp import StanfordCoreNLP
# NOTE(review): importing the file helpers only when run as a script —
# presumably to keep the package-relative import off the module-import path;
# confirm this guard is intentional.
if __name__ == '__main__':
from code.helper import FileReader, FileWriter
def get_tokens(sentences: List) -> List[List]:
    """
    Tokenize each input text using CoreNLP's ``tokenize`` annotator.

    :param sentences: List of raw text strings to tokenize.
    :return: One list of token strings per input text.
    """
    # NOTE(review): assumes a CoreNLP server is running locally — confirm.
    nlp = StanfordCoreNLP('http://localhost:9000')
    # Bugfix: the original body contained leftover experimental code (a
    # hard-coded sample text, a second annotate() call with a dict nested
    # inside its properties, a tokensregex() call, and a stray '}') that
    # made the function a syntax error. Only the working tokenize loop
    # is kept.
    properties = {
        'annotators': 'tokenize',
        'outputFormat': 'json'
    }
    tokens = []
    for sentence in sentences:
        result = nlp.annotate(text=sentence, properties=properties)
        current_tokens = [token.get('word') for token in result.get('tokens')]
        tokens.append(current_tokens)
    return tokens
if __name__ == '__main__':
    # Tokenize the project corpus and persist the token lists.
    corpus = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
    FileWriter.write(get_tokens(corpus), './results/corenlp_tokens.txt')
This diff is collapsed.
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment