Commit 7a4a6c00 authored by Marc Feger

Restructure project

parent 4f9b8467
File moved
File moved
File moved
File moved
import copy
import sys
from collections import Counter
from typing import List, Dict
import nltk
@@ -22,9 +23,10 @@ def prepare_documents(documents: List[Dict]) -> None:
:return: None
"""
for document in documents:
text_tokens = nltk.word_tokenize(document.get('text'), LANGUAGE)
document['filtered_text'] = sorted([w for w in text_tokens if w not in STOP_WORDS])
document['filtered_stemmed_text'] = [nltk.SnowballStemmer(LANGUAGE).stem(w) for w in document['filtered_text']]
text_tokens: List = nltk.word_tokenize(document.get('text'), LANGUAGE)
document['filtered_text']: List = sorted([w for w in text_tokens if w not in STOP_WORDS])
document['filtered_stemmed_text']: List = [nltk.SnowballStemmer(LANGUAGE).stem(w) for w in
document['filtered_text']]
def prepare_query(query: Dict) -> None:
@@ -34,9 +36,9 @@ def prepare_query(query: Dict) -> None:
:param query: Dict with the query information.
:return: None
"""
text_tokens = nltk.word_tokenize(query.get('text'), LANGUAGE)
query['filtered_query'] = sorted([w for w in text_tokens if w not in STOP_WORDS])
query['filtered_stemmed_query'] = [nltk.SnowballStemmer(LANGUAGE).stem(w) for w in query['filtered_query']]
text_tokens: List = nltk.word_tokenize(query.get('text'), LANGUAGE)
query['filtered_query']: List = sorted([w for w in text_tokens if w not in STOP_WORDS])
query['filtered_stemmed_query']: List = [nltk.SnowballStemmer(LANGUAGE).stem(w) for w in query['filtered_query']]
def assemble_index_term_vocabulary_from_(documents: List[Dict], query: Dict) -> List:
@@ -48,7 +50,7 @@ def assemble_index_term_vocabulary_from_(documents: List[Dict], query: Dict) ->
:param query: Dict of the query.
:return: The index-term-vocabulary.
"""
vocabulary = []
vocabulary: List = []
for document in documents:
vocabulary += document.get('filtered_stemmed_text')
vocabulary += query.get('filtered_stemmed_query')
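Side note on this hunk: the lines shown only concatenate the stemmed terms of all documents and the query; the deduplication and sorting presumably happen in the return statement outside the hunk (the scratch file at the bottom of this commit builds its vocabulary with sorted(set(...))). A minimal standalone sketch of the same idea, with made-up data and an illustrative name build_vocabulary:

from typing import Dict, List

def build_vocabulary(documents: List[Dict], query: Dict) -> List[str]:
    # Gather every stemmed term from all documents and the query,
    # then deduplicate and sort into a stable index-term vocabulary.
    terms: List[str] = []
    for document in documents:
        terms += document.get('filtered_stemmed_text', [])
    terms += query.get('filtered_stemmed_query', [])
    return sorted(set(terms))

docs = [{'filtered_stemmed_text': ['hund', 'lauf']},
        {'filtered_stemmed_text': ['katz', 'lauf']}]
qry = {'filtered_stemmed_query': ['lauf', 'schnell']}
print(build_vocabulary(docs, qry))  # ['hund', 'katz', 'lauf', 'schnell']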
@@ -66,9 +68,9 @@ def add_document_vectors(target: List[Dict], vocabulary: List, key: str, example
:return:
"""
for element in target:
counted_document_vector = {}
relative_counted_document_vector = {}
counted_term = dict(Counter(element.get(key)))
counted_document_vector: Dict = {}
relative_counted_document_vector: Dict = {}
counted_term: Dict = dict(Counter(element.get(key)))
for term in vocabulary:
counted_document_vector[term] = 0
relative_counted_document_vector[term] = 0
@@ -79,8 +81,8 @@ def add_document_vectors(target: List[Dict], vocabulary: List, key: str, example
else:
# Maybe the relative count of all terms per document and not all terms in the vocabulary ?
relative_counted_document_vector[term] = counted_term.get(term) / len(vocabulary)
element['counted_' + key] = counted_document_vector
element['relative_' + key] = relative_counted_document_vector
element['counted_' + key]: Dict = counted_document_vector
element['relative_' + key]: Dict = relative_counted_document_vector
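On the open question in the comment above (normalise by the vocabulary size or by the document length): a small sketch contrasting the two choices, built with collections.Counter as in the function; all names and numbers here are illustrative:

from collections import Counter

vocabulary = ['hund', 'katz', 'lauf', 'schnell']
tokens = ['lauf', 'lauf', 'hund']          # one already-stemmed document
counts = Counter(tokens)

# Normalised by vocabulary size, as the code above currently does:
per_vocabulary = {t: counts.get(t, 0) / len(vocabulary) for t in vocabulary}
# Normalised by document length, as the comment suggests:
per_document = {t: counts.get(t, 0) / len(tokens) for t in vocabulary}

print(per_vocabulary)  # lauf: 2/4 = 0.5,  hund: 1/4 = 0.25
print(per_document)    # lauf: 2/3 ~ 0.67, hund: 1/3 ~ 0.33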
def get_df(documents: List[Dict], vocabulary: List) -> Dict:
@@ -93,7 +95,7 @@ def get_df(documents: List[Dict], vocabulary: List) -> Dict:
:return: df
"""
df = {}
df: Dict = {}
for term in vocabulary:
df[term] = 0
for document in documents:
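For orientation: df is the document frequency, i.e. for each vocabulary term the number of documents whose stemmed token list contains it; the increment itself lies outside the shown hunk. A self-contained sketch with made-up data:

from typing import Dict, List

documents: List[Dict] = [{'filtered_stemmed_text': ['hund', 'lauf']},
                         {'filtered_stemmed_text': ['katz', 'lauf']}]
vocabulary: List[str] = ['hund', 'katz', 'lauf']

df: Dict[str, int] = {term: 0 for term in vocabulary}
for document in documents:
    for term in vocabulary:
        if term in document['filtered_stemmed_text']:
            df[term] += 1

print(df)  # {'hund': 1, 'katz': 1, 'lauf': 2}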
@@ -113,7 +115,7 @@ def add_tf_idf_as_weight(target: List[Dict], df: Dict, key: str, example=EXAMPLE
:return: None
"""
for element in target:
document_vector = copy.deepcopy(element.get('relative_filtered_stemmed_' + key))
document_vector: Dict = copy.deepcopy(element.get('relative_filtered_stemmed_' + key))
for term in element.get('filtered_stemmed_' + key):
if df.get(term) == 0:
@@ -121,14 +123,14 @@ def add_tf_idf_as_weight(target: List[Dict], df: Dict, key: str, example=EXAMPLE
else:
# Be careful with log10
if example:
document_vector[term] = (element.get('counted_filtered_stemmed_' + key)[term] *
document_vector[term]: float = (element.get('counted_filtered_stemmed_' + key)[term] *
np.log10(len(df.keys()) / df.get(term)))
else:
document_vector[term] = element.get('relative_filtered_stemmed_' + key)[term] * \
document_vector[term]: float = element.get('relative_filtered_stemmed_' + key)[term] * \
(element.get('counted_filtered_stemmed_' + key)[term] *
np.log10(len(df.keys()) / df.get(term)))
element['document_vector'] = document_vector
element['document_vector']: Dict = document_vector
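The weight computed in the example branch is the raw term count times log10(len(df.keys()) / df[term]). Note that len(df.keys()) is the vocabulary size; classical tf-idf would use the number of documents in that position instead. A small numeric sketch of that branch with made-up frequencies:

import numpy as np

df = {'hund': 1, 'katz': 1, 'lauf': 2}   # document frequency per term
tf = {'hund': 1, 'katz': 0, 'lauf': 2}   # raw counts within one document

weights = {term: tf[term] * np.log10(len(df.keys()) / df[term]) for term in df}
print(weights)  # hund: 1 * log10(3/1) ~ 0.477, katz: 0.0, lauf: 2 * log10(3/2) ~ 0.352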
def cosine(v1: List, v2: List):
@@ -139,8 +141,8 @@ def cosine(v1: List, v2: List):
:param v2: Vector 2.
:return: Cosine-Similarity of v1 and v2.
"""
v1 = np.array(v1)
v2 = np.array(v2)
v1: np.ndarray = np.array(v1)
v2: np.ndarray = np.array(v2)
counter = np.dot(v1, v2)
divider = np.linalg.norm(v1) * np.linalg.norm(v2)
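cosine() implements the usual cosine similarity, the dot product divided by the product of the Euclidean norms. A quick NumPy check with two made-up vectors (if a zero vector can occur, the denominator becomes 0; any guard for that lies outside the lines shown here):

import numpy as np

v1 = np.array([1.0, 0.0, 2.0])
v2 = np.array([1.0, 1.0, 0.0])
similarity = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print(round(float(similarity), 3))  # 1 / (sqrt(5) * sqrt(2)) ~ 0.316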
@@ -158,10 +160,10 @@ def add_cosine_similarity(documents: List[Dict], query: Dict) -> None:
:param query: The query to be looked for.
:return: None
"""
v2 = [value for _, value in query.get('document_vector').items()]
v2: List = [value for _, value in query.get('document_vector').items()]
for document in documents:
v1 = [value for _, value in document.get('document_vector').items()]
document['cosine'] = round(cosine(v1, v2), 3)
v1: List = [value for _, value in document.get('document_vector').items()]
document['cosine']: float = round(cosine(v1, v2), 3)
def print_results(documents: List[Dict]) -> None:
@@ -172,8 +174,8 @@ def print_results(documents: List[Dict]) -> None:
:return: None
"""
print('###############################################################')
documents = sorted(documents, key=lambda k: k['cosine'], reverse=True)
top5_documents = documents[:5]
documents: List[Dict] = sorted(documents, key=lambda k: k['cosine'], reverse=True)
top5_documents: List[Dict] = documents[:5]
for document in top5_documents:
print('Document: ' + document.get('filename') + ' has a cosine similarity of: ' + str(document.get('cosine')))
print('###############################################################')
......
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('german')
text = "Ich bin der der das hier gerne macht und wir gingen weg"
text_tokens = word_tokenize(text, 'german')
filtered_text = sorted([w for w in text_tokens if w not in stop_words])
filtered_stemmed_text = [SnowballStemmer('german').stem(w) for w in filtered_text]
print(filtered_text)
print(filtered_stemmed_text)
query = "Ich mag das sehr gerne, da ich gerne gehe"
query_tokens = word_tokenize(query, 'german')
filtered_query = sorted([w for w in query_tokens if w not in stop_words])
filtered_stemmed_query = [SnowballStemmer('german').stem(w) for w in filtered_query]
print(filtered_query)
print(filtered_stemmed_query)
index_term_vocabulary = sorted(set(filtered_stemmed_text + filtered_stemmed_query))
print(index_term_vocabulary)
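One detail in this scratch file: stopwords.words('german') contains only lowercase entries, so capitalised tokens such as 'Ich' pass the filter untouched. Continuing the snippet above, a purely illustrative variant that lowercases before filtering:

text_tokens_lower = [w.lower() for w in word_tokenize(text, 'german')]
filtered_text_lower = sorted(w for w in text_tokens_lower if w not in stop_words)
print(filtered_text_lower)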