From 7a4a6c00d412e5f5f95adcb21d03603540b0f80f Mon Sep 17 00:00:00 2001
From: feger <marc.feger@hhu.de>
Date: Sun, 14 Apr 2019 11:38:30 +0200
Subject: [PATCH] Restructure project

---
 {Aufgabe2a => Aufgabe2}/FileReader.py     |  0
 {Aufgabe2a => Aufgabe2}/__init__.py       |  0
 {Aufgabe2a => Aufgabe2/files}/meta.txt    |  0
 {Aufgabe2a => Aufgabe2/files}/meta_a1.txt |  0
 {Aufgabe2a => Aufgabe2}/program.py        | 56 ++++++++++++-----------
 Aufgabe2a/test.py                         | 30 ------------
 6 files changed, 29 insertions(+), 57 deletions(-)
 rename {Aufgabe2a => Aufgabe2}/FileReader.py (100%)
 rename {Aufgabe2a => Aufgabe2}/__init__.py (100%)
 rename {Aufgabe2a => Aufgabe2/files}/meta.txt (100%)
 rename {Aufgabe2a => Aufgabe2/files}/meta_a1.txt (100%)
 rename {Aufgabe2a => Aufgabe2}/program.py (76%)
 delete mode 100644 Aufgabe2a/test.py

diff --git a/Aufgabe2a/FileReader.py b/Aufgabe2/FileReader.py
similarity index 100%
rename from Aufgabe2a/FileReader.py
rename to Aufgabe2/FileReader.py
diff --git a/Aufgabe2a/__init__.py b/Aufgabe2/__init__.py
similarity index 100%
rename from Aufgabe2a/__init__.py
rename to Aufgabe2/__init__.py
diff --git a/Aufgabe2a/meta.txt b/Aufgabe2/files/meta.txt
similarity index 100%
rename from Aufgabe2a/meta.txt
rename to Aufgabe2/files/meta.txt
diff --git a/Aufgabe2a/meta_a1.txt b/Aufgabe2/files/meta_a1.txt
similarity index 100%
rename from Aufgabe2a/meta_a1.txt
rename to Aufgabe2/files/meta_a1.txt
diff --git a/Aufgabe2a/program.py b/Aufgabe2/program.py
similarity index 76%
rename from Aufgabe2a/program.py
rename to Aufgabe2/program.py
index 8bdd0e7..cfe71a9 100644
--- a/Aufgabe2a/program.py
+++ b/Aufgabe2/program.py
@@ -1,6 +1,7 @@
 import copy
 import sys
 from collections import Counter
+from tokenize import Double
 from typing import List, Dict
 
 import nltk
@@ -22,9 +23,10 @@ def prepare_documents(documents: List[Dict]) -> None:
     :return: None
     """
     for document in documents:
-        text_tokens = nltk.word_tokenize(document.get('text'), LANGUAGE)
-        document['filtered_text'] = sorted([w for w in text_tokens if w not in STOP_WORDS])
-        document['filtered_stemmed_text'] = [nltk.SnowballStemmer(LANGUAGE).stem(w) for w in document['filtered_text']]
+        text_tokens: List = nltk.word_tokenize(document.get('text'), LANGUAGE)
+        document['filtered_text']: List = sorted([w for w in text_tokens if w not in STOP_WORDS])
+        document['filtered_stemmed_text']: List = [nltk.SnowballStemmer(LANGUAGE).stem(w) for w in
+                                                   document['filtered_text']]
 
 
 def prepare_query(query: Dict) -> None:
@@ -34,9 +36,9 @@ def prepare_query(query: Dict) -> None:
     :param query: Dict with the query information.
     :return: None
     """
-    text_tokens = nltk.word_tokenize(query.get('text'), LANGUAGE)
-    query['filtered_query'] = sorted([w for w in text_tokens if w not in STOP_WORDS])
-    query['filtered_stemmed_query'] = [nltk.SnowballStemmer(LANGUAGE).stem(w) for w in query['filtered_query']]
+    text_tokens: List = nltk.word_tokenize(query.get('text'), LANGUAGE)
+    query['filtered_query']: List = sorted([w for w in text_tokens if w not in STOP_WORDS])
+    query['filtered_stemmed_query']: List = [nltk.SnowballStemmer(LANGUAGE).stem(w) for w in query['filtered_query']]
 
 
 def assemble_index_term_vocabulary_from_(documents: List[Dict], query: Dict) -> List:
@@ -48,7 +50,7 @@ def assemble_index_term_vocabulary_from_(documents: List[Dict], query: Dict) ->
     :param query: Dict of the query.
     :return: The index-term-vocabulary.
     """
-    vocabulary = []
+    vocabulary: List = []
     for document in documents:
         vocabulary += document.get('filtered_stemmed_text')
     vocabulary += query.get('filtered_stemmed_query')
@@ -66,9 +68,9 @@ def add_document_vectors(target: List[Dict], vocabulary: List, key: str, example
     :return:
     """
     for element in target:
-        counted_document_vector = {}
-        relative_counted_document_vector = {}
-        counted_term = dict(Counter(element.get(key)))
+        counted_document_vector: Dict = {}
+        relative_counted_document_vector: Dict = {}
+        counted_term: Dict = dict(Counter(element.get(key)))
         for term in vocabulary:
             counted_document_vector[term] = 0
             relative_counted_document_vector[term] = 0
@@ -79,8 +81,8 @@ def add_document_vectors(target: List[Dict], vocabulary: List, key: str, example
             else:
                 # Maybe the relative count of all terms per document and not all terms in the vocabulary ?
                 relative_counted_document_vector[term] = counted_term.get(term) / len(vocabulary)
-        element['counted_' + key] = counted_document_vector
-        element['relative_' + key] = relative_counted_document_vector
+        element['counted_' + key]: Dict = counted_document_vector
+        element['relative_' + key]: Dict = relative_counted_document_vector
 
 
 def get_df(documents: List[Dict], vocabulary: List) -> Dict:
@@ -93,7 +95,7 @@ def get_df(documents: List[Dict], vocabulary: List) -> Dict:
 
     :return: df
     """
-    df = {}
+    df: Dict = {}
     for term in vocabulary:
         df[term] = 0
         for document in documents:
@@ -113,7 +115,7 @@ def add_tf_idf_as_weight(target: List[Dict], df: Dict, key: str, example=EXAMPLE
     :return: None
     """
     for element in target:
-        document_vector = copy.deepcopy(element.get('relative_filtered_stemmed_' + key))
+        document_vector: Dict = copy.deepcopy(element.get('relative_filtered_stemmed_' + key))
 
         for term in element.get('filtered_stemmed_' + key):
             if df.get(term) == 0:
@@ -121,14 +123,14 @@ def add_tf_idf_as_weight(target: List[Dict], df: Dict, key: str, example=EXAMPLE
             else:
                 # Be careful with log10
                 if example:
-                    document_vector[term] = (element.get('counted_filtered_stemmed_' + key)[term] *
-                                             np.log10(len(df.keys()) / df.get(term)))
+                    document_vector[term]: Double = (element.get('counted_filtered_stemmed_' + key)[term] *
+                                                     np.log10(len(df.keys()) / df.get(term)))
                 else:
-                    document_vector[term] = element.get('relative_filtered_stemmed_' + key)[term] * \
-                                            (element.get('counted_filtered_stemmed_' + key)[term] *
-                                             np.log10(len(df.keys()) / df.get(term)))
+                    document_vector[term]: Double = element.get('relative_filtered_stemmed_' + key)[term] * \
+                                                    (element.get('counted_filtered_stemmed_' + key)[term] *
+                                                     np.log10(len(df.keys()) / df.get(term)))
 
-        element['document_vector'] = document_vector
+        element['document_vector']: Dict = document_vector
 
 
 def cosine(v1: List, v2: List):
@@ -139,8 +141,8 @@ def cosine(v1: List, v2: List):
     :param v2: Vector 2.
     :return: Cosine-Similarity of v1 and v2.
     """
-    v1 = np.array(v1)
-    v2 = np.array(v2)
+    v1: List = np.array(v1)
+    v2: List = np.array(v2)
 
     counter = np.dot(v1, v2)
     divider = np.linalg.norm(v1) * np.linalg.norm(v2)
@@ -158,10 +160,10 @@ def add_cosine_similarity(documents: List[Dict], query: Dict) -> None:
     :param query: The query to be looked for.
     :return: None
     """
-    v2 = [value for _, value in query.get('document_vector').items()]
+    v2: List = [value for _, value in query.get('document_vector').items()]
     for document in documents:
-        v1 = [value for _, value in document.get('document_vector').items()]
-        document['cosine'] = round(cosine(v1, v2), 3)
+        v1: List = [value for _, value in document.get('document_vector').items()]
+        document['cosine']: Double = round(cosine(v1, v2), 3)
 
 
 def print_results(documents: List[Dict]) -> None:
@@ -172,8 +174,8 @@ def print_results(documents: List[Dict]) -> None:
     :return: None
     """
     print('###############################################################')
-    documents = sorted(documents, key=lambda k: k['cosine'], reverse=True)
-    top5_documents = documents[:5]
+    documents: List[Dict] = sorted(documents, key=lambda k: k['cosine'], reverse=True)
+    top5_documents: List[Dict] = documents[:5]
     for document in top5_documents:
         print('Document: ' + document.get('filename') + ' has a cosine similarity of: ' + str(document.get('cosine')))
     print('###############################################################')
diff --git a/Aufgabe2a/test.py b/Aufgabe2a/test.py
deleted file mode 100644
index 3f20781..0000000
--- a/Aufgabe2a/test.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from nltk.stem import SnowballStemmer
-from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize
-import nltk
-
-nltk.download('punkt')
-nltk.download('stopwords')
-
-stop_words = stopwords.words('german')
-
-
-text = "Ich bin der der das hier gerne macht und wir gingen weg"
-text_tokens = word_tokenize(text, 'german')
-filtered_text = sorted([w for w in text_tokens if w not in stop_words])
-filtered_stemmed_text = [SnowballStemmer('german').stem(w) for w in filtered_text]
-
-print(filtered_text)
-print(filtered_stemmed_text)
-
-
-query = "Ich mag das sehr gerne, da ich gerne gehe"
-query_tokens = word_tokenize(query)
-filtered_query = word_tokenize(query, 'german')
-filtered_stemmed_query = sorted([w for w in query_tokens if w not in stop_words])
-
-print(filtered_query)
-print(filtered_stemmed_query)
-
-index_term_vocabulary = sorted(set(filtered_stemmed_text + filtered_stemmed_query))
-print(index_term_vocabulary)
-- 
GitLab
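For reference, and not part of the patch itself: Aufgabe2/program.py ranks documents by the cosine similarity between their tf-idf vectors and the query vector. Below is a minimal standalone sketch of that similarity measure, mirroring the np.dot / np.linalg.norm computation visible in the cosine() hunk above; the toy vectors are invented purely for illustration.

import numpy as np


def cosine(v1, v2):
    # Cosine similarity: dot product divided by the product of the Euclidean norms.
    v1 = np.array(v1)
    v2 = np.array(v2)
    counter = np.dot(v1, v2)
    divider = np.linalg.norm(v1) * np.linalg.norm(v2)
    return counter / divider


# Toy term-weight vectors, invented for illustration.
print(round(cosine([1.0, 2.0, 0.0], [2.0, 4.0, 0.0]), 3))  # 1.0 (same direction)
print(round(cosine([1.0, 0.0, 0.0], [0.0, 1.0, 0.0]), 3))  # 0.0 (no shared terms)

Rounding to three places matches the document['cosine'] = round(cosine(v1, v2), 3) assignment that add_cosine_similarity performs for each document in the patched file.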