Commit fd496231 authored by Marc Feger

First commit :tada:

Showing 133 additions and 0 deletions
.gitignore:

stanford-corenlp-full-2018-10-05
code/tokenizers
.DS_Store
.idea
from pycorenlp import StanfordCoreNLP

if __name__ == '__main__':
    # Connect to a CoreNLP server that is expected to run locally on port 9000.
    nlp = StanfordCoreNLP('http://localhost:9000')
    # German sample text from the project corpus: a reply about graffiti near the Hans-Baluschek-Park.
    text = (
        'Nee sorry! Obwohl ich Graffiti liebe, finde ich nicht, dass sie zum Feld passen. Mag außerdem die lösemittelhaltigen Farben nicht vertragen. Wenn ich durch den Hans-Baluschek-Park radle, riecht es immer stark vom angrenzenden Südgelände nach Farbdünsten und das passt nicht wirklich zum Naturschutz. Aber eine Stelle, an der ich mir unbedingt mal etwas Farbe wünsche würde, ist die trübsinnige Autobahnbrücke in der Boelkestraße. Dort würde der Geruch m. E. auch nicht stören, weil man sich eh nur durchgängig aufhält...')
    output = nlp.annotate(text, properties={
        'annotators': 'tokenize,ssplit,pos,depparse,parse',
        'outputFormat': 'json'
    })
    # The annotate result is overwritten here: the empty TokensRegex
    # pattern '[]' matches every single token of the text.
    output = nlp.tokensregex(text, pattern='[]', filter=False)
    print(output)
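The snippet assumes a CoreNLP server is already listening on localhost:9000. With the bundled stanford-corenlp-full-2018-10-05 distribution (the folder the .gitignore excludes), the server can typically be started from inside that folder with the command from the CoreNLP docs:

java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000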
code/helper/FileReader.py:

from typing import List


def get_file_as_list(route: str) -> List[str]:
    """
    This method splits the file at line breaks and returns it as a list of strings.

    :param route: Path to the file to read
    :return: File content as a list of lines
    """
    with open(route) as f:
        lines = f.read().splitlines()
    return lines
code/helper/FileWriter.py:

from typing import List


def write(source: List[List], target: str):
    """Write each entry of source to target, one entry per line (in str() form)."""
    with open(target, 'w') as f:
        for line in source:
            f.write("%s\n" % line)
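As a quick sanity check, the two helpers combine into a round trip. Note the asymmetry: write() stores each token list via its str() form, so get_file_as_list() returns those reprs back as plain strings. The path ./tmp_tokens.txt is just an illustrative placeholder:

from code.helper import FileReader, FileWriter

tokens = [['Nee', 'sorry', '!'], ['Obwohl', 'ich', 'Graffiti', 'liebe']]
FileWriter.write(tokens, './tmp_tokens.txt')
print(FileReader.get_file_as_list('./tmp_tokens.txt'))
# ["['Nee', 'sorry', '!']", "['Obwohl', 'ich', 'Graffiti', 'liebe']"]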
from typing import List

import nltk

from code.helper import FileReader, FileWriter


def get_tokens(sentences: List) -> List[List]:
    '''
    This method gets the tokens of each sentence.

    :param sentences: List containing the sentences
    :return: List of token-lists
    '''
    tokens = []
    for sentence in sentences:
        doc = nltk.word_tokenize(sentence, language='german')
        tokens += [doc]
    return tokens


if __name__ == '__main__':
    '''
    Notice that the tokenizers are stored in the current dir
    (which is why code/tokenizers is listed in the .gitignore).
    '''
    nltk.download('punkt', download_dir='.')
    # Make the locally downloaded punkt model findable for word_tokenize.
    nltk.data.path.append('.')
    file_sentences = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
    file_tokens = get_tokens(file_sentences)
    FileWriter.write(file_tokens, './tokens/nltk_tokens.txt')
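For a feel of the output format, a minimal sketch using the opening of the corpus text above (output as expected from NLTK's punkt/treebank pipeline):

import nltk

print(nltk.word_tokenize('Nee sorry!', language='german'))
# ['Nee', 'sorry', '!']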
from typing import List

import spacy

from code.helper import FileReader, FileWriter


def get_splits(model, sentences: List) -> List[List]:
    '''
    This method splits each text into its sentences.

    :param model: Which model should be used
    :param sentences: List containing the texts
    :return: List of sentence-lists
    '''
    tokens = []
    for sentence in sentences:
        doc = model(sentence)
        tokens += [list(doc.sents)]
    return tokens


if __name__ == '__main__':
    '''
    Don't forget to install the German model with:
    $ python3 -m spacy download de_core_news_sm
    '''
    nlp = spacy.load('de_core_news_sm')
    file_sentences = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
    file_splits = get_splits(nlp, file_sentences)
    FileWriter.write(file_splits, './tokens/spacy_sentences.txt')
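Note that list(doc.sents) collects spaCy Span objects, which FileWriter then serialises via str(). A variant (not in the commit) that stores plain strings instead would swap in the Span's .text attribute:

def get_splits_as_text(model, sentences):
    # Hypothetical variant: keep only the raw sentence text of each Span.
    splits = []
    for text in sentences:
        doc = model(text)
        splits += [[sent.text for sent in doc.sents]]
    return splits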
from typing import List

import spacy

from code.helper import FileWriter, FileReader


def get_tokens(model, sentences: List) -> List[List]:
    '''
    This method gets the tokens of each sentence.

    :param model: Which model should be used
    :param sentences: List containing the sentences
    :return: List of token-lists
    '''
    tokens = []
    for sentence in sentences:
        doc = model(sentence)
        tokens += [[token.text for token in doc]]
    return tokens


if __name__ == '__main__':
    '''
    Don't forget to install the German model with:
    $ python3 -m spacy download de_core_news_sm
    '''
    nlp = spacy.load('de_core_news_sm')
    file_sentences = FileReader.get_file_as_list('./corpus/projekt1_corpus.txt')
    file_tokens = get_tokens(nlp, file_sentences)
    FileWriter.write(file_tokens, './tokens/spacy_tokens.txt')
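A minimal sketch of what this get_tokens variant yields for the first corpus sentence (token boundaries may vary slightly with the model version):

import spacy

nlp = spacy.load('de_core_news_sm')
print(get_tokens(nlp, ['Nee sorry!']))
# Expected shape: [['Nee', 'sorry', '!']]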
requirements.txt:

spacy==2.2.1
nltk==3.4.5
pycorenlp==0.3.0
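The pinned dependencies install in the usual way:

pip install -r requirements.txt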