Skip to content
Snippets Groups Projects
Commit 28342786 authored by Marc Feger's avatar Marc Feger
Browse files

Add A2.2 and add more doc-strings

parent 7a4a6c00
Branches
No related tags found
No related merge requests found
import ast
from typing import List, Dict
......@@ -24,3 +25,12 @@ class FileReader(object):
"text": line[1]
})
return file_as_list
def read_document_and_eval_type_of_content(self):
    """
    Reads the file at ``self.filename`` and parses its whole content into a Python object.

    The content is parsed as JSON first, because the files are produced with ``json.dump``;
    this way ``true``, ``false`` and ``null`` are understood. Content that is not valid JSON
    is parsed as a Python literal with ``ast.literal_eval`` instead.
    :return: The parsed content, e.g. a list of dicts or a single dict.
    :raises ValueError: If the content is neither valid JSON nor a valid Python literal.
    :raises SyntaxError: If the content is neither valid JSON nor a valid Python literal.
    """
    import json  # Local import so the method does not depend on a module-level json import.

    with open(self.filename, encoding='utf-8') as f:
        content = f.read()
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        # Not JSON (e.g. single-quoted strings or tuples): fall back to Python literal syntax.
        return ast.literal_eval(content)
import json
class FileWriter(object):
    """
    Serializes a document structure as JSON into a file.
    """

    def __init__(self, filename: str, documents):
        """
        Stores the target path and the data to be written; nothing is written yet.
        :param filename: Path of the file to be written.
        :param documents: The JSON-serializable data to write, e.g. a list of dicts or a dict.
        """
        self.filename: str = filename
        self.documents = documents

    def write_to_file(self):
        """
        Writes the documents as JSON to the file, replacing any existing content.
        :return: None
        """
        # json.dump escapes non-ASCII characters by default, so the output is plain ASCII.
        # The encoding is pinned anyway so the result never depends on the platform locale.
        with open(self.filename, 'w', encoding='utf-8') as file:
            json.dump(self.documents, file)
[{"filename": "1", "text": "drei Raben auf einem Baum", "filtered_text": ["Baum", "Raben", "drei"], "filtered_stemmed_text": ["baum", "rab", "drei"], "counted_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 1, "beim": 0, "drei": 1, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 1, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "relative_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0.3333333333333333, "beim": 0, "drei": 0.3333333333333333, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0.3333333333333333, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "document_vector": {"angegriff": 0, "apfelkuch": 0, "baum": 0.3656366710026855, "beim": 0, "drei": 0.4659800028906792, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0.4659800028906792, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "cosine": 0.0}, {"filename": "2", "text": "Tiger in Indien", "filtered_text": ["Indien", "Tiger"], "filtered_stemmed_text": ["indi", "tig"], "counted_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 1, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 1, "wildbahn": 0, "wood": 0}, "relative_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0.5, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 
0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.5, "wildbahn": 0, "wood": 0}, "document_vector": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0.6989700043360189, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.309894379144197, "wildbahn": 0, "wood": 0}, "cosine": 0.101}, {"filename": "3", "text": "Sonnenblummenfelder im Sommer", "filtered_text": ["Sommer", "Sonnenblummenfelder"], "filtered_stemmed_text": ["somm", "sonnenblummenfeld"], "counted_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 1, "sonnenblummenfeld": 1, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "relative_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0.5, "sonnenblummenfeld": 0.5, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "document_vector": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0.6989700043360189, "sonnenblummenfeld": 0.6989700043360189, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "cosine": 0.0}, {"filename": "4", "text": "s\u00fc\u00dfe kleine Tiger spielen an Baum", "filtered_text": ["Baum", "Tiger", "kleine", "spielen", "s\u00fc\u00dfe"], "filtered_stemmed_text": ["baum", "tig", "klein", "spiel", "suss"], 
"counted_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 1, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 1, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 1, "suss": 1, "tig": 1, "wildbahn": 0, "wood": 0}, "relative_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0.2, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0.2, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0.2, "suss": 0.2, "tig": 0.2, "wildbahn": 0, "wood": 0}, "document_vector": {"angegriff": 0, "apfelkuch": 0, "baum": 0.2193820026016113, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0.27958800173440757, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0.27958800173440757, "suss": 0.27958800173440757, "tig": 0.1239577516576788, "wildbahn": 0, "wood": 0}, "cosine": 0.056}, {"filename": "5", "text": "Mann von Tigern angegriffen", "filtered_text": ["Mann", "Tigern", "angegriffen"], "filtered_stemmed_text": ["mann", "tig", "angegriff"], "counted_filtered_stemmed_text": {"angegriff": 1, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 1, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 1, "wildbahn": 0, "wood": 0}, "relative_filtered_stemmed_text": {"angegriff": 0.3333333333333333, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0.3333333333333333, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, 
"suss": 0, "tig": 0.3333333333333333, "wildbahn": 0, "wood": 0}, "document_vector": {"angegriff": 0.4659800028906792, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0.4659800028906792, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.20659625276279797, "wildbahn": 0, "wood": 0}, "cosine": 0.074}, {"filename": "6", "text": "Tiger II Panzer", "filtered_text": ["II", "Panzer", "Tiger"], "filtered_stemmed_text": ["ii", "panz", "tig"], "counted_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 1, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 1, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 1, "wildbahn": 0, "wood": 0}, "relative_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0.3333333333333333, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0.3333333333333333, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.3333333333333333, "wildbahn": 0, "wood": 0}, "document_vector": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0.4659800028906792, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0.4659800028906792, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.20659625276279797, "wildbahn": 0, "wood": 0}, "cosine": 0.074}, {"filename": "7", "text": "Apfelkuchen mit Sahne", "filtered_text": ["Apfelkuchen", "Sahne"], "filtered_stemmed_text": ["apfelkuch", "sahn"], "counted_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 1, "baum": 0, "beim": 0, "drei": 
0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 1, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "relative_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0.5, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0.5, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "document_vector": {"angegriff": 0, "apfelkuch": 0.6989700043360189, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0.6989700043360189, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "cosine": 0.0}, {"filename": "8", "text": "der gestiefelte Kater Kost\u00fcm", "filtered_text": ["Kater", "Kost\u00fcm", "gestiefelte"], "filtered_stemmed_text": ["kat", "kostum", "gestiefelt"], "counted_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 1, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 1, "klein": 0, "kostum": 1, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "relative_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0, "gestiefelt": 0.3333333333333333, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0.3333333333333333, "klein": 0, "kostum": 0.3333333333333333, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "document_vector": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, 
"drei": 0, "freier": 0, "gestiefelt": 0.4659800028906792, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0.4659800028906792, "klein": 0, "kostum": 0.4659800028906792, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0, "wildbahn": 0, "wood": 0}, "cosine": 0.0}, {"filename": "9", "text": "Tiger Woods beim Golfen in Kalifornien", "filtered_text": ["Golfen", "Kalifornien", "Tiger", "Woods", "beim"], "filtered_stemmed_text": ["golf", "kaliforni", "tig", "wood", "beim"], "counted_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 1, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 1, "ii": 0, "indi": 0, "kaliforni": 1, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 1, "wildbahn": 0, "wood": 1}, "relative_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0.2, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0.2, "ii": 0, "indi": 0, "kaliforni": 0.2, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.2, "wildbahn": 0, "wood": 0.2}, "document_vector": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0.27958800173440757, "drei": 0, "freier": 0, "gestiefelt": 0, "golf": 0.27958800173440757, "ii": 0, "indi": 0, "kaliforni": 0.27958800173440757, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.1239577516576788, "wildbahn": 0, "wood": 0.27958800173440757}, "cosine": 0.327}, {"filename": "10", "text": "Tiger in freier Wildbahn", "filtered_text": ["Tiger", "Wildbahn", "freier"], "filtered_stemmed_text": ["tig", "wildbahn", "freier"], "counted_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 1, "gestiefelt": 0, "golf": 0, "ii": 0, 
"indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 1, "wildbahn": 1, "wood": 0}, "relative_filtered_stemmed_text": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0.3333333333333333, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.3333333333333333, "wildbahn": 0.3333333333333333, "wood": 0}, "document_vector": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0.4659800028906792, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.20659625276279797, "wildbahn": 0.4659800028906792, "wood": 0}, "cosine": 0.829}]
\ No newline at end of file
{"text": "Tiger Woods in freier Wildbahn", "filtered_query": ["Tiger", "Wildbahn", "Woods", "freier"], "filtered_stemmed_query": ["tig", "wildbahn", "wood", "freier"], "counted_filtered_stemmed_query": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 1, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 1, "wildbahn": 1, "wood": 1}, "relative_filtered_stemmed_query": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0.25, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.25, "wildbahn": 0.25, "wood": 0.25}, "document_vector": {"angegriff": 0, "apfelkuch": 0, "baum": 0, "beim": 0, "drei": 0, "freier": 0.34948500216800943, "gestiefelt": 0, "golf": 0, "ii": 0, "indi": 0, "kaliforni": 0, "kat": 0, "klein": 0, "kostum": 0, "mann": 0, "panz": 0, "rab": 0, "sahn": 0, "somm": 0, "sonnenblummenfeld": 0, "spiel": 0, "suss": 0, "tig": 0.1549471895720985, "wildbahn": 0.34948500216800943, "wood": 0.34948500216800943}}
\ No newline at end of file
......@@ -9,6 +9,7 @@ import numpy as np
from nltk.corpus import stopwords
from FileReader import FileReader
from FileWriter import FileWriter
# Switch for the A1 example run; the usage notes of the main block say to set it to True for that.
EXAMPLE = False
# Language name; presumably handed to the nltk stopwords corpus - TODO confirm against its usage.
LANGUAGE = 'german'
......@@ -80,7 +81,8 @@ def add_document_vectors(target: List[Dict], vocabulary: List, key: str, example
relative_counted_document_vector[term] = counted_term.get(term)
else:
# Maybe the relative count of all terms per document and not all terms in the vocabulary ?
relative_counted_document_vector[term] = counted_term.get(term) / len(vocabulary)
# Maybe len(vocabulary)
relative_counted_document_vector[term] = counted_term.get(term) / len(element.get(key))
element['counted_' + key]: Dict = counted_document_vector
element['relative_' + key]: Dict = relative_counted_document_vector
......@@ -152,32 +154,46 @@ def cosine(v1: List, v2: List):
return counter / divider
def add_cosine_similarity(documents: List[Dict], query: Dict, tag='') -> None:
    """
    Adds the cosine-similarity between each document and the query to the document.

    The similarity is rounded to three decimals and stored in the document under the key
    ``'cosine'`` or, if a tag is given, under ``'cosine_<tag>'``.
    :param documents: A list of documents, each holding its vector under ``'document_vector'``.
    :param query: The query to be looked for. Its vector is read from ``'document_vector'`` or,
                  if a tag is given, from ``'document_vector_<tag>'``.
    :param tag: This can be used to add a new cosine similarity with a new document vector of the query.
    :return: None
    """
    suffix: str = ('_' + tag) if tag else ''
    # The vectors are compared position by position, so the query and the documents are
    # expected to list the same terms in the same order.
    v2: List = list(query.get('document_vector' + suffix).values())
    for document in documents:
        v1: List = list(document.get('document_vector').values())
        document['cosine' + suffix] = round(cosine(v1, v2), 3)
def print_results(documents: List[Dict], query: Dict, tag='') -> None:
    """
    Pretty print the query and the five documents with the highest cosine similarity.

    The passed list is not reordered; a sorted copy is used for printing.
    :param documents: List of documents. Each needs ``'filename'``, ``'text'`` and the cosine
                      value under ``'cosine'`` or, if a tag is given, ``'cosine_<tag>'``.
    :param query: The search query; its ``'text'`` is printed.
    :param tag: This can be used to use a new cosine similarity with a new document vector of the query.
    :return: None
    """
    cosine_key: str = 'cosine' + (('_' + tag) if tag else '')
    separator: str = '###############################################################'

    print(separator)
    # sorted() is stable, so documents with an equal similarity keep their input order.
    ranked: List[Dict] = sorted(documents, key=lambda k: k[cosine_key], reverse=True)
    print(f"Query: {query.get('text')}")
    for document in ranked[:5]:
        similarity = round(document.get(cosine_key), 3)
        print(f"Document: {document.get('filename')}, Text: {document.get('text')}"
              f", has a cosine similarity of: {similarity}")
    print(separator)
......@@ -190,10 +206,10 @@ if __name__ == '__main__':
To run the example of A1:
-> Change example to True
run $ python program.py meta_a1.txt "Tasse Kanne"
run $ python program.py ./files/meta_a1.txt "Tasse Kanne"
Else
-> Make sure example is False.
run $ python program.py meta.txt "<query>"
run $ python program.py ./files/meta.txt "<query>"
:arg 1: File which contains all documents (Two columns separated with \t and ends with \n)
:arg 2: Query to be looked for (must be surrounded with "")
......@@ -224,4 +240,7 @@ if __name__ == '__main__':
# print('All Documents: ' + json.dumps(all_documents, indent=3))
# print('Search Query: ' + json.dumps(search_query, indent=3))
#
print_results(all_documents)
print_results(all_documents, search_query)
FileWriter('./files/output.documents', all_documents).write_to_file()
FileWriter('./files/output.query', search_query).write_to_file()
from tokenize import Double
from typing import List, Dict, Tuple
import numpy as np
from FileReader import FileReader
from program import print_results, add_cosine_similarity
def assemble_dr_dnr(feedback: List, documents: List[Dict]) -> Tuple[List, List]:
    """
    This method assembles the Dr and Dnr for Rocchio.

    Dr holds every document named in the feedback, Dnr holds all other documents.
    Therefore: Dr U Dnr = D (all documents). Both lists contain the document names as int,
    without duplicates and in ascending order.
    :param feedback: List of relevant document names, given as int or as numeric strings.
                     Entries that do not name a document of the system are ignored.
    :param documents: List of all documents in the system; ``'filename'`` must be numeric.
    :return: Dr, Dnr
    """
    document_ids = {int(document.get('filename')) for document in documents}

    relevant_ids = set()
    for name in feedback:
        try:
            relevant_ids.add(int(name) if isinstance(name, str) else name)
        except (TypeError, ValueError):
            # Not a usable document name (non-numeric text or unhashable value): no match.
            continue

    dr = sorted(d for d in document_ids if d in relevant_ids)
    dnr = sorted(d for d in document_ids if d not in relevant_ids)
    return dr, dnr
def dict_to_list(target: Dict) -> List:
    """
    Turns a dict into a list of its values.
    :param target: Dict to be turned into a list.
    :return: A new list containing all values of the dict, in the dict's iteration order.
    """
    return list(target.values())
def q_m(query: Dict, documents: List[Dict], feedback: List, alpha: float, beta: float, gamma: float):
    """
    Calculates the new query vector as defined by Rocchio.

    new = alpha * query + beta * mean(relevant vectors) - gamma * mean(not relevant vectors)

    The result is stored in the query under ``'document_vector_new'``, using the same terms
    (keys) as the query's ``'document_vector'``. A mean over an empty set counts as zero.
    :param query: The search query; it is modified in place.
    :param documents: The list of all documents
    :param feedback: The list with the relevant document names
    :param alpha: Weight of the original query vector
    :param beta: Weight of the mean vector of the relevant documents
    :param gamma: Weight of the mean vector of the not relevant documents
    :return: None
    """
    relevant_documents, not_relevant_documents = assemble_dr_dnr(feedback, documents)
    # Sets give constant-time membership tests inside the loop below.
    relevant_ids = set(relevant_documents)
    not_relevant_ids = set(not_relevant_documents)

    query_vector: Dict = query.get('document_vector')
    relevant_sum = np.zeros(len(query_vector))
    not_relevant_sum = np.zeros(len(query_vector))
    for document in documents:
        document_id = int(document.get('filename'))
        if document_id in relevant_ids:
            relevant_sum = np.add(relevant_sum, dict_to_list(document.get('document_vector')))
        if document_id in not_relevant_ids:
            not_relevant_sum = np.add(not_relevant_sum, dict_to_list(document.get('document_vector')))

    addend_1 = np.multiply(alpha, dict_to_list(query_vector))
    # Guard the divisions: an empty group contributes a zero vector.
    addend_2 = relevant_sum
    if relevant_documents:
        addend_2 = np.multiply(beta / len(relevant_documents), relevant_sum)
    addend_3 = not_relevant_sum
    if not_relevant_documents:
        addend_3 = np.multiply(gamma / len(not_relevant_documents), not_relevant_sum)

    qm = np.subtract(np.add(addend_1, addend_2), addend_3)
    # tolist() yields plain Python floats instead of numpy scalars.
    query['document_vector_new'] = dict(zip(query_vector.keys(), qm.tolist()))
if __name__ == '__main__':
    """
    This program does a one step rocchio improvement for the search query.
    The values for the search query along with the values for the documents are taken from the
    corresponding files in /files. Then the user is asked to give his feedback.
    Notice: Input only valid document ids which are also in the Top 5.
    Before running this code make sure you have generated all files in /files.
    To do so run e.g:
    $ python program.py ./files/meta.txt "Tiger Woods in freier Wildbahn"
    To run the code use:
    $ python rocchio.py
    """
    all_documents = FileReader('./files/output.documents').read_document_and_eval_type_of_content()
    search_query = FileReader('./files/output.query').read_document_and_eval_type_of_content()

    print('Top-5 results:')
    print_results(all_documents, search_query)

    print('Attention: Input must be int! And please use only documents in the Top-5!')
    entered_names = input("Please enter the names of the documents you find relevant: ").split()
    try:
        input_of_relevant_documents = [int(x) for x in entered_names]
    except ValueError:
        # Stop with a readable message instead of a traceback when a name is not an int.
        raise SystemExit('Invalid input: document names must be integers separated by spaces.') from None

    # Rocchio weights: alpha = 1 (original query), beta = 0.8 (relevant), gamma = 0.1 (not relevant).
    q_m(search_query, all_documents, input_of_relevant_documents, 1, 0.8, 0.1)
    add_cosine_similarity(all_documents, search_query, 'new')

    print('Top-5 results (new):')
    print_results(all_documents, search_query, 'new')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment