From 7a4a6c00d412e5f5f95adcb21d03603540b0f80f Mon Sep 17 00:00:00 2001
From: feger <marc.feger@hhu.de>
Date: Sun, 14 Apr 2019 11:38:30 +0200
Subject: [PATCH] Restructure project

---
 {Aufgabe2a => Aufgabe2}/FileReader.py     |  0
 {Aufgabe2a => Aufgabe2}/__init__.py       |  0
 {Aufgabe2a => Aufgabe2/files}/meta.txt    |  0
 {Aufgabe2a => Aufgabe2/files}/meta_a1.txt |  0
 {Aufgabe2a => Aufgabe2}/program.py        | 56 ++++++++++++-----------
 Aufgabe2a/test.py                         | 30 ------------
 6 files changed, 29 insertions(+), 57 deletions(-)
 rename {Aufgabe2a => Aufgabe2}/FileReader.py (100%)
 rename {Aufgabe2a => Aufgabe2}/__init__.py (100%)
 rename {Aufgabe2a => Aufgabe2/files}/meta.txt (100%)
 rename {Aufgabe2a => Aufgabe2/files}/meta_a1.txt (100%)
 rename {Aufgabe2a => Aufgabe2}/program.py (76%)
 delete mode 100644 Aufgabe2a/test.py

diff --git a/Aufgabe2a/FileReader.py b/Aufgabe2/FileReader.py
similarity index 100%
rename from Aufgabe2a/FileReader.py
rename to Aufgabe2/FileReader.py
diff --git a/Aufgabe2a/__init__.py b/Aufgabe2/__init__.py
similarity index 100%
rename from Aufgabe2a/__init__.py
rename to Aufgabe2/__init__.py
diff --git a/Aufgabe2a/meta.txt b/Aufgabe2/files/meta.txt
similarity index 100%
rename from Aufgabe2a/meta.txt
rename to Aufgabe2/files/meta.txt
diff --git a/Aufgabe2a/meta_a1.txt b/Aufgabe2/files/meta_a1.txt
similarity index 100%
rename from Aufgabe2a/meta_a1.txt
rename to Aufgabe2/files/meta_a1.txt
diff --git a/Aufgabe2a/program.py b/Aufgabe2/program.py
similarity index 76%
rename from Aufgabe2a/program.py
rename to Aufgabe2/program.py
index 8bdd0e7..cfe71a9 100644
--- a/Aufgabe2a/program.py
+++ b/Aufgabe2/program.py
@@ -1,6 +1,7 @@
 import copy
 import sys
 from collections import Counter
+from tokenize import Double
 from typing import List, Dict
 
 import nltk
@@ -22,9 +23,10 @@ def prepare_documents(documents: List[Dict]) -> None:
     :return: None
     """
     for document in documents:
-        text_tokens = nltk.word_tokenize(document.get('text'), LANGUAGE)
-        document['filtered_text'] = sorted([w for w in text_tokens if w not in STOP_WORDS])
-        document['filtered_stemmed_text'] = [nltk.SnowballStemmer(LANGUAGE).stem(w) for w in document['filtered_text']]
+        text_tokens: List = nltk.word_tokenize(document.get('text'), LANGUAGE)
+        document['filtered_text']: List = sorted([w for w in text_tokens if w not in STOP_WORDS])
+        document['filtered_stemmed_text']: List = [nltk.SnowballStemmer(LANGUAGE).stem(w) for w in
+                                                   document['filtered_text']]
 
 
 def prepare_query(query: Dict) -> None:
@@ -34,9 +36,9 @@ def prepare_query(query: Dict) -> None:
     :param query: Dict with the query information.
     :return: None
     """
-    text_tokens = nltk.word_tokenize(query.get('text'), LANGUAGE)
-    query['filtered_query'] = sorted([w for w in text_tokens if w not in STOP_WORDS])
-    query['filtered_stemmed_query'] = [nltk.SnowballStemmer(LANGUAGE).stem(w) for w in query['filtered_query']]
+    text_tokens: List = nltk.word_tokenize(query.get('text'), LANGUAGE)
+    query['filtered_query']: List = sorted([w for w in text_tokens if w not in STOP_WORDS])
+    query['filtered_stemmed_query']: List = [nltk.SnowballStemmer(LANGUAGE).stem(w) for w in query['filtered_query']]
 
 
 def assemble_index_term_vocabulary_from_(documents: List[Dict], query: Dict) -> List:
@@ -48,7 +50,7 @@ def assemble_index_term_vocabulary_from_(documents: List[Dict], query: Dict) ->
     :param query: Dict of the query.
     :return: The index-term-vocabulary.
     """
-    vocabulary = []
+    vocabulary: List = []
     for document in documents:
         vocabulary += document.get('filtered_stemmed_text')
     vocabulary += query.get('filtered_stemmed_query')
@@ -66,9 +68,9 @@ def add_document_vectors(target: List[Dict], vocabulary: List, key: str, example
     :return:
     """
     for element in target:
-        counted_document_vector = {}
-        relative_counted_document_vector = {}
-        counted_term = dict(Counter(element.get(key)))
+        counted_document_vector: Dict = {}
+        relative_counted_document_vector: Dict = {}
+        counted_term: Dict = dict(Counter(element.get(key)))
         for term in vocabulary:
             counted_document_vector[term] = 0
             relative_counted_document_vector[term] = 0
@@ -79,8 +81,8 @@ def add_document_vectors(target: List[Dict], vocabulary: List, key: str, example
             else:
                 # Maybe the relative count of all terms per document and not all terms in the vocabulary ?
                 relative_counted_document_vector[term] = counted_term.get(term) / len(vocabulary)
-        element['counted_' + key] = counted_document_vector
-        element['relative_' + key] = relative_counted_document_vector
+        element['counted_' + key]: Dict = counted_document_vector
+        element['relative_' + key]: Dict = relative_counted_document_vector
 
 
 def get_df(documents: List[Dict], vocabulary: List) -> Dict:
@@ -93,7 +95,7 @@ def get_df(documents: List[Dict], vocabulary: List) -> Dict:
 
     :return: df
     """
-    df = {}
+    df: Dict = {}
     for term in vocabulary:
         df[term] = 0
         for document in documents:
@@ -113,7 +115,7 @@ def add_tf_idf_as_weight(target: List[Dict], df: Dict, key: str, example=EXAMPLE
     :return: None
     """
     for element in target:
-        document_vector = copy.deepcopy(element.get('relative_filtered_stemmed_' + key))
+        document_vector: Dict = copy.deepcopy(element.get('relative_filtered_stemmed_' + key))
 
         for term in element.get('filtered_stemmed_' + key):
             if df.get(term) == 0:
@@ -121,14 +123,14 @@ def add_tf_idf_as_weight(target: List[Dict], df: Dict, key: str, example=EXAMPLE
             else:
                 # Be careful with log10
                 if example:
-                    document_vector[term] = (element.get('counted_filtered_stemmed_' + key)[term] *
-                                             np.log10(len(df.keys()) / df.get(term)))
+                    document_vector[term]: Double = (element.get('counted_filtered_stemmed_' + key)[term] *
+                                                     np.log10(len(df.keys()) / df.get(term)))
                 else:
-                    document_vector[term] = element.get('relative_filtered_stemmed_' + key)[term] * \
-                                            (element.get('counted_filtered_stemmed_' + key)[term] *
-                                             np.log10(len(df.keys()) / df.get(term)))
+                    document_vector[term]: Double = element.get('relative_filtered_stemmed_' + key)[term] * \
+                                                    (element.get('counted_filtered_stemmed_' + key)[term] *
+                                                     np.log10(len(df.keys()) / df.get(term)))
 
-        element['document_vector'] = document_vector
+        element['document_vector']: Dict = document_vector
 
 
 def cosine(v1: List, v2: List):
@@ -139,8 +141,8 @@ def cosine(v1: List, v2: List):
     :param v2: Vector 2.
     :return: Cosine-Similarity of v1 and v2.
     """
-    v1 = np.array(v1)
-    v2 = np.array(v2)
+    v1: List = np.array(v1)
+    v2: List = np.array(v2)
 
     counter = np.dot(v1, v2)
     divider = np.linalg.norm(v1) * np.linalg.norm(v2)
@@ -158,10 +160,10 @@ def add_cosine_similarity(documents: List[Dict], query: Dict) -> None:
     :param query: The query to be looked for.
     :return: None
     """
-    v2 = [value for _, value in query.get('document_vector').items()]
+    v2: List = [value for _, value in query.get('document_vector').items()]
     for document in documents:
-        v1 = [value for _, value in document.get('document_vector').items()]
-        document['cosine'] = round(cosine(v1, v2), 3)
+        v1: List = [value for _, value in document.get('document_vector').items()]
+        document['cosine']: Double = round(cosine(v1, v2), 3)
 
 
 def print_results(documents: List[Dict]) -> None:
@@ -172,8 +174,8 @@ def print_results(documents: List[Dict]) -> None:
     :return: None
     """
     print('###############################################################')
-    documents = sorted(documents, key=lambda k: k['cosine'], reverse=True)
-    top5_documents = documents[:5]
+    documents: List[Dict] = sorted(documents, key=lambda k: k['cosine'], reverse=True)
+    top5_documents: List[Dict] = documents[:5]
     for document in top5_documents:
         print('Document: ' + document.get('filename') + ' has a cosine similarity of: ' + str(document.get('cosine')))
     print('###############################################################')
diff --git a/Aufgabe2a/test.py b/Aufgabe2a/test.py
deleted file mode 100644
index 3f20781..0000000
--- a/Aufgabe2a/test.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from nltk.stem import SnowballStemmer
-from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize
-import nltk
-
-nltk.download('punkt')
-nltk.download('stopwords')
-
-stop_words = stopwords.words('german')
-
-
-text = "Ich bin der der das hier gerne macht und wir gingen weg"
-text_tokens = word_tokenize(text, 'german')
-filtered_text = sorted([w for w in text_tokens if w not in stop_words])
-filtered_stemmed_text = [SnowballStemmer('german').stem(w) for w in filtered_text]
-
-print(filtered_text)
-print(filtered_stemmed_text)
-
-
-query = "Ich mag das sehr gerne, da ich gerne gehe"
-query_tokens = word_tokenize(query)
-filtered_query = word_tokenize(query, 'german')
-filtered_stemmed_query = sorted([w for w in query_tokens if w not in stop_words])
-
-print(filtered_query)
-print(filtered_stemmed_query)
-
-index_term_vocabulary = sorted(set(filtered_stemmed_text + filtered_stemmed_query))
-print(index_term_vocabulary)
-- 
GitLab
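For reference, and not part of the patch itself: Aufgabe2/program.py ranks documents by the cosine similarity between their tf-idf vectors and the query vector. Below is a minimal standalone sketch of that similarity measure, mirroring the np.dot / np.linalg.norm computation visible in the cosine() hunk above; the toy vectors are invented purely for illustration.

import numpy as np


def cosine(v1, v2):
    # Cosine similarity: dot product divided by the product of the Euclidean norms.
    v1 = np.array(v1)
    v2 = np.array(v2)
    counter = np.dot(v1, v2)
    divider = np.linalg.norm(v1) * np.linalg.norm(v2)
    return counter / divider


# Toy term-weight vectors, invented for illustration.
print(round(cosine([1.0, 2.0, 0.0], [2.0, 4.0, 0.0]), 3))  # 1.0 (same direction)
print(round(cosine([1.0, 0.0, 0.0], [0.0, 1.0, 0.0]), 3))  # 0.0 (no shared terms)

Rounding to three places matches the document['cosine'] = round(cosine(v1, v2), 3) assignment that add_cosine_similarity performs for each document in the patched file.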